diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 000000000..4bb7b3074 --- /dev/null +++ b/.tool-versions @@ -0,0 +1,3 @@ +java temurin-11.0.21+9 +sbt 1.9.7 +python 3.11.6 diff --git a/README.md b/README.md index 81ba43649..e0cd3eac9 100644 --- a/README.md +++ b/README.md @@ -309,9 +309,68 @@ $ dx run bam_chrom_counter -istage-common.bam=project-BQbJpBj0bvygyQxgQ1800Jkk:f * `SoftwareRequirement` and `InplaceUpdateRequirement` are not yet supported * Publishing a dxCompiler-generated workflow as a global workflow is not supported +## Authenticated HTTP Imports + +dxCompiler supports importing WDL files from private HTTP sources that require authentication, such as private GitHub repositories. + +### Configuration + +Set the `WDL_IMPORT_TOKEN` environment variable with your access token: + +```bash +# For GitHub, use a Personal Access Token (PAT) +export WDL_IMPORT_TOKEN="ghp_xxxxxxxxxxxxxxxxxxxx" + +# Then run dxCompiler as usual +java -jar dxCompiler.jar compile workflow.wdl -project project-xxxx -folder /my/workflows/ +``` + +### Supported Domains + +By default, the token is only sent to these domains (for security): +- `github.com` +- `raw.githubusercontent.com` + +To send the token to other domains, set the `WDL_IMPORT_TOKEN_DOMAINS` environment variable. It replaces the default list, so include the default domains if you still need them: + +```bash +# Full list of allowed domains (comma-separated); replaces the defaults +export WDL_IMPORT_TOKEN_DOMAINS="github.com,raw.githubusercontent.com,gitlab.com,my-private-server.com" +``` + +### Example Usage + +In your WDL file, import from a private repository: + +```wdl +version 1.0 + +import "https://raw.githubusercontent.com/myorg/private-repo/main/tasks/my_task.wdl" as private_tasks + +workflow my_workflow { + call private_tasks.my_task +} +``` + +### Getting a GitHub Token + +1. Go to https://github.com/settings/tokens +2. Click "Generate new token (classic)" +3. Select the `repo` scope for private repository access +4. 
Copy the generated token and set it as `WDL_IMPORT_TOKEN` + +### Security Notes + +- The token is only sent to explicitly allowed domains +- The token is never logged +- If the token is not set, imports work as before (for public URLs only) + +For more details, see [Authenticated Imports documentation](doc/AUTHENTICATED_IMPORTS.md). + ## Additional information - [Advanced options](doc/ExpertOptions.md) explains additional compiler options +- [Authenticated Imports](doc/AUTHENTICATED_IMPORTS.md) how to import WDL from private GitHub repositories - [Internals](doc/Internals.md) describes current compiler structure (_work in progress_) - [Tips](doc/Tips.md) examples for how to write good WDL code - [Debugging](doc/Debugging.md) recommendations how to debug the workflows on DNAnexus platform diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index a9e4ddd46..e81793d07 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,6 +2,10 @@ ## Unreleased +### New Features + +* **Authenticated HTTP Imports**: Added support for importing WDL files from private HTTP sources that require authentication (e.g., private GitHub repositories). Set the `WDL_IMPORT_TOKEN` environment variable with a Bearer token to enable authenticated imports. By default, tokens are only sent to `github.com` and `raw.githubusercontent.com` domains. Additional domains can be configured via `WDL_IMPORT_TOKEN_DOMAINS`. See [Authenticated Imports documentation](doc/AUTHENTICATED_IMPORTS.md) for details. 
+ ## 2.15.0 2025-09-29 * Added support for new region in OCI Ashburn diff --git a/compiler/src/main/scala/dxCompiler/Main.scala b/compiler/src/main/scala/dxCompiler/Main.scala index 9a6430f88..1cb4f844a 100644 --- a/compiler/src/main/scala/dxCompiler/Main.scala +++ b/compiler/src/main/scala/dxCompiler/Main.scala @@ -14,7 +14,8 @@ import dx.core.languages.wdl.WdlOptions import dx.dxni.DxNativeInterface import dx.translator.{Extras, TranslatorFactory} import dx.util.protocols.DxFileAccessProtocol -import dx.util.{Enum, FileSourceResolver, FileUtils, Logger, TraceLevel} +import dx.util.{Enum, FileAccessProtocol, FileSourceResolver, FileUtils, LocalFileAccessProtocol, Logger, TraceLevel} +import dx.core.io.AuthenticatedHttpFileAccessProtocol import spray.json.{JsNull, JsValue} import wdlTools.types.TypeCheckingRegime @@ -67,17 +68,25 @@ object Main { * - creates a FileSourceResolver that looks for local files in any configured -imports * directories and has a DxFileAccessProtocol * - initializes a Logger + * - configures authenticated HTTP imports if WDL_IMPORT_TOKEN is set * @param options parsed options * @return (FileSourceResolver, Logger) */ private def initCommon(options: Options): (FileSourceResolver, Logger) = { val logger = initLogger(options) val imports: Vector[Path] = options.getList[Path]("imports") - val fileResolver = FileSourceResolver.create( - imports, - Vector(DxFileAccessProtocol()), - logger + + // Create authenticated HTTP protocol for importing from private repositories + val httpProtocol = AuthenticatedHttpFileAccessProtocol.fromEnvironment(logger) + + // Build protocol list - order matters, first matching protocol wins + val protocols: Vector[FileAccessProtocol] = Vector( + LocalFileAccessProtocol(imports, logger), + httpProtocol, + DxFileAccessProtocol() ) + + val fileResolver = FileSourceResolver(protocols) FileSourceResolver.set(fileResolver) (fileResolver, logger) } @@ -877,7 +886,8 @@ object Main { | input values may only be specified for 
the top-level workflow. | -leaveWorkflowsOpen Leave created workflows open (otherwise they are closed). | -p | -imports Directory to search for imported WDL or CWL files. May be specified - | multiple times. + | multiple times. For HTTP imports from private repositories, + | set the WDL_IMPORT_TOKEN environment variable (see below). | -projectWideReuse Look for existing applets/workflows in the entire project | before generating new ones. The default search scope is the | target folder only. @@ -926,6 +936,12 @@ object Main { | -verboseKey Print verbose output only for a specific module. May be | specified multiple times. | -logFile File to use for logging output; defaults to stderr. + | + |Environment variables + | WDL_IMPORT_TOKEN Bearer token for authenticated HTTP imports (e.g., GitHub PAT + | for private repositories). Token is sent only to allowed domains. + | WDL_IMPORT_TOKEN_DOMAINS Comma-separated list of domains to send the token to. + | Defaults to: github.com,raw.githubusercontent.com |""".stripMargin def main(args: Vector[String]): Unit = { diff --git a/core/src/main/scala/dx/core/io/AuthenticatedHttpFileAccessProtocol.scala b/core/src/main/scala/dx/core/io/AuthenticatedHttpFileAccessProtocol.scala new file mode 100644 index 000000000..d3800ac1f --- /dev/null +++ b/core/src/main/scala/dx/core/io/AuthenticatedHttpFileAccessProtocol.scala @@ -0,0 +1,104 @@ +package dx.core.io + +import dx.util.{FileAccessProtocol, FileUtils, Logger} +import java.net.URI +import java.nio.charset.Charset + +/** + * HTTP file access protocol with Bearer token authentication support. + * + * Reads authentication token from WDL_IMPORT_TOKEN environment variable. + * Only sends tokens to allowed domains (configurable via WDL_IMPORT_TOKEN_DOMAINS) + * to prevent credential leakage to untrusted servers. 
+ *
+ * @param token Optional Bearer token; None disables authentication (see fromEnvironment)
+ * @param allowedDomains Set of domains to send auth token to
+ * @param encoding Character encoding for file content
+ * @param logger Logger for trace/debug output
+ */
+case class AuthenticatedHttpFileAccessProtocol(
+    token: Option[String] = None,
+    allowedDomains: Set[String] = AuthenticatedHttpFileAccessProtocol.defaultAllowedDomains,
+    encoding: Charset = FileUtils.DefaultEncoding,
+    logger: Logger = Logger.Quiet
+) extends FileAccessProtocol {
+
+  override val schemes: Vector[String] = Vector(FileUtils.HttpScheme, FileUtils.HttpsScheme)
+  override val supportsDirectories: Boolean = true
+
+  /**
+   * Determines if authentication should be used for the given URI.
+   * Only returns true if a token is configured AND the domain is in the allowed list.
+   */
+  private def shouldAuthenticate(uri: URI): Boolean = {
+    token.isDefined && Option(uri.getHost).exists(host =>
+      allowedDomains.exists(_.equalsIgnoreCase(host))
+    )
+  }
+
+  override def resolve(address: String): AuthenticatedHttpFileSource = {
+    val uri = URI.create(address)
+    val useAuth = shouldAuthenticate(uri)
+    if (useAuth) {
+      logger.trace(s"Using authenticated HTTP for import from: ${uri.getHost}")
+    }
+    AuthenticatedHttpFileSource(uri, encoding, isDirectory = false, if (useAuth) token else None)(address)
+  }
+
+  override def resolveDirectory(address: String): AuthenticatedHttpFileSource = {
+    val uri = URI.create(address)
+    val useAuth = shouldAuthenticate(uri)
+    if (useAuth) {
+      logger.trace(s"Using authenticated HTTP for directory import from: ${uri.getHost}")
+    }
+    AuthenticatedHttpFileSource(uri, encoding, isDirectory = true, if (useAuth) token else None)(address)
+  }
+}
+
+object AuthenticatedHttpFileAccessProtocol {
+
+  /** Environment variable name for the Bearer token */
+  val TokenEnvVar: String = "WDL_IMPORT_TOKEN"
+
+  /** Environment variable name for custom allowed domains */
+  val DomainsEnvVar: String = "WDL_IMPORT_TOKEN_DOMAINS"
+
+  /** Default allowed domains that will receive the auth token */
+  val defaultDomains: Set[String] = Set(
+      "github.com",
+      "raw.githubusercontent.com"
+  )
+
+  /**
+   * Gets the allowed domains from environment variable or defaults.
+   * WDL_IMPORT_TOKEN_DOMAINS should be a comma-separated list of domains.
+   */
+  lazy val defaultAllowedDomains: Set[String] = {
+    sys.env.get(DomainsEnvVar) match {
+      case Some(domains) =>
+        domains.split(",").map(_.trim.toLowerCase).filter(_.nonEmpty).toSet
+      case None =>
+        defaultDomains
+    }
+  }
+
+  /**
+   * Creates an instance with configuration from environment variables.
+   *
+   * @param logger Logger for trace output (token values are never logged)
+   * @return AuthenticatedHttpFileAccessProtocol configured from environment
+   */
+  def fromEnvironment(logger: Logger = Logger.Quiet): AuthenticatedHttpFileAccessProtocol = {
+    val tokenOpt = sys.env.get(TokenEnvVar).map(_.trim).filter(_.nonEmpty)
+    if (tokenOpt.isDefined) {
+      logger.trace(
+          s"${TokenEnvVar} found; authenticated HTTP imports enabled for domains: ${defaultAllowedDomains.mkString(", ")}"
+      )
+    }
+    AuthenticatedHttpFileAccessProtocol(
+        token = tokenOpt,
+        allowedDomains = defaultAllowedDomains,
+        logger = logger
+    )
+  }
+}
diff --git a/core/src/main/scala/dx/core/io/AuthenticatedHttpFileSource.scala b/core/src/main/scala/dx/core/io/AuthenticatedHttpFileSource.scala
new file mode 100644
index 000000000..bce61cad1
--- /dev/null
+++ b/core/src/main/scala/dx/core/io/AuthenticatedHttpFileSource.scala
@@ -0,0 +1,240 @@
+package dx.core.io
+
+import dx.util.{AbstractAddressableFileNode, AddressableFileSource, FileUtils, PosixPath}
+import java.io.{ByteArrayOutputStream, FileOutputStream}
+import java.net.{HttpURLConnection, URI}
+import java.nio.charset.Charset
+import java.nio.file.{Files, Path}
+import scala.util.control.NonFatal
+
+/**
+ * An HTTP file source that supports Bearer token authentication.
+ *
+ * This class mirrors HttpFileSource from dxScala but adds support for
+ * Authorization headers when accessing protected resources.
+ *
+ * The token is only attached to requests for this source's own URI. Sources
+ * derived via `resolve`/`resolveDirectory` keep the token only when the derived
+ * URI has the same scheme, host and port as this one.
+ *
+ * @param uri The URI to fetch
+ * @param encoding Character encoding for reading content
+ * @param isDirectory Whether this represents a directory (archive)
+ * @param token Optional Bearer token for authentication
+ * @param address The original address string
+ */
+case class AuthenticatedHttpFileSource(
+    override val uri: URI,
+    override val encoding: Charset,
+    override val isDirectory: Boolean,
+    token: Option[String]
+)(override val address: String)
+    extends AbstractAddressableFileNode(address, encoding) {
+
+  private lazy val path = PosixPath(uri.getPath)
+
+  override lazy val name: String =
+    path.getName.getOrElse(throw new Exception(s"${path} is not a file"))
+
+  override lazy val folder: String = path.getParent.map(_.toString).getOrElse("")
+
+  override def container: String = s"${uri.getScheme}:${uri.getHost}:${folder}"
+
+  // Set once readBytes has fetched the content, so localizeToFile can reuse it.
+  private var hasBytes: Boolean = false
+
+  /**
+   * Execute a function with an HTTP connection, ensuring proper cleanup.
+   * Adds Authorization header if a token is configured.
+   */
+  private def withConnection[T](fn: HttpURLConnection => T): T = {
+    val conn = uri.toURL.openConnection() match {
+      case http: HttpURLConnection => http
+      case other =>
+        throw new Exception(s"Unsupported connection type for ${uri}: ${other.getClass.getName}")
+    }
+    try {
+      token.foreach(t => conn.setRequestProperty("Authorization", s"Bearer $t"))
+      fn(conn)
+    } finally {
+      conn.disconnect()
+    }
+  }
+
+  /**
+   * Returns the token only if `target` has the same scheme, host and port as this
+   * source, so that an absolute or protocol-relative reference cannot carry the
+   * credential to a different server.
+   */
+  private def tokenFor(target: URI): Option[String] = {
+    def same(a: String, b: String): Boolean = Option(a).exists(_.equalsIgnoreCase(b))
+    token.filter { _ =>
+      same(uri.getScheme, target.getScheme) &&
+      same(uri.getHost, target.getHost) &&
+      uri.getPort == target.getPort
+    }
+  }
+
+  /**
+   * Issues a HEAD request for the URI.
+   * @return true on HTTP 200; false on any other status except 401/403, or if the host
+   *         cannot be resolved
+   * @throws Exception on HTTP 401 or 403, with guidance on configuring the token
+   */
+  override def exists: Boolean = {
+    try {
+      val rc = withConnection { conn =>
+        conn.setRequestMethod("HEAD")
+        conn.getResponseCode
+      }
+      rc match {
+        case HttpURLConnection.HTTP_OK => true
+        case HttpURLConnection.HTTP_UNAUTHORIZED =>
+          throw new Exception(
+              s"""HTTP 401 Unauthorized when accessing ${uri}.
+                 |If this is a private repository, ensure WDL_IMPORT_TOKEN is set with a valid access token.
+                 |For GitHub: generate a token at https://github.com/settings/tokens with 'repo' scope.
+                 |Current allowed domains can be configured via WDL_IMPORT_TOKEN_DOMAINS.""".stripMargin
+          )
+        case HttpURLConnection.HTTP_FORBIDDEN =>
+          throw new Exception(
+              s"""HTTP 403 Forbidden when accessing ${uri}.
+                 |The token may be invalid or lack the required permissions.
+                 |For GitHub: ensure the token has 'repo' scope for private repositories.""".stripMargin
+          )
+        case _ => false
+      }
+    } catch {
+      // An unresolvable host means the resource does not exist; other errors propagate.
+      case _: java.net.UnknownHostException => false
+    }
+  }
+
+  override def getParent: Option[AuthenticatedHttpFileSource] = {
+    // path.getParent is an Option (see `folder`), so map over it rather than comparing to null.
+    path.getParent.map { _ =>
+      val newUri = if (isDirectory) uri.resolve("..") else uri.resolve(".")
+      AuthenticatedHttpFileSource(newUri, encoding, isDirectory = true, token)(newUri.toString)
+    }
+  }
+
+  private def resolve(path: String, isDir: Boolean): AuthenticatedHttpFileSource = {
+    val newUri = if (isDirectory) uri.resolve(path) else uri.resolve(".").resolve(path)
+    AuthenticatedHttpFileSource(newUri, encoding, isDir, tokenFor(newUri))(newUri.toString)
+  }
+
+  override def resolve(path: String): AuthenticatedHttpFileSource = resolve(path, isDir = false)
+
+  override def resolveDirectory(path: String): AuthenticatedHttpFileSource = resolve(path, isDir = true)
+
+  override def relativize(fileSource: AddressableFileSource): String = {
+    fileSource match {
+      case fs: AuthenticatedHttpFileSource if isDirectory =>
+        path.relativize(PosixPath(fs.uri.getPath)).toString
+      case fs: AuthenticatedHttpFileSource =>
+        path.getParent
+          .getOrElse(throw new Exception(s"${path} has no parent directory"))
+          .relativize(PosixPath(fs.uri.getPath))
+          .toString
+      case _ =>
+        throw new Exception(s"not an AuthenticatedHttpFileSource: ${fileSource}")
+    }
+  }
+
+  override lazy val size: Long = {
+    try {
+      withConnection(conn => conn.getContentLengthLong)
+    } catch {
+      // NonFatal lets fatal JVM errors propagate; the original exception is kept as the cause.
+      case NonFatal(t) =>
+        throw new Exception(s"Error getting size of URL ${uri}: ${t.getMessage}", t)
+    }
+  }
+
+  /**
+   * Streams the response body into `buffer`.
+   * @return the number of bytes written
+   * @throws Exception if the response status is not HTTP 200
+   */
+  private def fetchUri(buffer: java.io.OutputStream, chunkSize: Int = 16384): Int = {
+    withConnection { conn =>
+      conn.getResponseCode match {
+        case HttpURLConnection.HTTP_OK => ()
+        case HttpURLConnection.HTTP_UNAUTHORIZED =>
+          throw new Exception(
+              s"""HTTP 401 Unauthorized when fetching ${uri}.
+                 |If this is a private repository, ensure WDL_IMPORT_TOKEN is set with a valid access token.
+                 |For GitHub: generate a token at https://github.com/settings/tokens with 'repo' scope.""".stripMargin
+          )
+        case HttpURLConnection.HTTP_FORBIDDEN =>
+          throw new Exception(
+              s"""HTTP 403 Forbidden when fetching ${uri}.
+                 |The token may be invalid or lack the required permissions.""".stripMargin
+          )
+        case responseCode =>
+          throw new Exception(s"HTTP ${responseCode} when fetching ${uri}")
+      }
+
+      val is = conn.getInputStream
+      try {
+        val data = new Array[Byte](chunkSize)
+        Iterator
+          .continually(is.read(data, 0, chunkSize))
+          .takeWhile(_ > 0)
+          .foldLeft(0) { (totalRead, nRead) =>
+            buffer.write(data, 0, nRead)
+            totalRead + nRead
+          }
+      } finally {
+        is.close()
+      }
+    }
+  }
+
+  override lazy val readBytes: Array[Byte] = {
+    checkFileSize()
+    val buffer = new ByteArrayOutputStream()
+    try {
+      fetchUri(buffer)
+      hasBytes = true
+      buffer.toByteArray
+    } finally {
+      buffer.close()
+    }
+  }
+
+  private def localizeToFile(path: Path): Unit = {
+    if (hasBytes) {
+      // Write the cached bytes verbatim: decoding them to a String first would corrupt
+      // binary content such as the archives used for directory sources.
+      Files.write(path, readBytes)
+      ()
+    } else {
+      val out = new FileOutputStream(path.toFile)
+      try {
+        fetchUri(out)
+      } finally {
+        out.close()
+      }
+    }
+  }
+
+  override protected def localizeTo(file: Path): Unit = {
+    if (isDirectory) {
+      val dest = Files.createTempFile("temp", name)
+      try {
+        localizeToFile(dest)
+        if (Files.exists(file)) {
+          FileUtils.deleteRecursive(file)
+        }
+        FileUtils.unpackArchive(dest, file)
+      } finally {
+        FileUtils.deleteRecursive(dest)
+      }
+    } else {
+      localizeToFile(file)
+    }
+  }
+
+  override def isListable: Boolean = false
+}
diff --git a/core/src/test/scala/dx/core/io/AuthenticatedHttpFileAccessProtocolTest.scala b/core/src/test/scala/dx/core/io/AuthenticatedHttpFileAccessProtocolTest.scala new file mode 100644 index 000000000..acc537ecb --- /dev/null +++ b/core/src/test/scala/dx/core/io/AuthenticatedHttpFileAccessProtocolTest.scala @@ -0,0 +1,234 @@ +package dx.core.io + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import java.nio.charset.StandardCharsets + +/** + * Unit tests for AuthenticatedHttpFileAccessProtocol. + * These tests verify the protocol's behavior without making actual HTTP requests. + */ +class AuthenticatedHttpFileAccessProtocolTest extends AnyFlatSpec with Matchers { + + private val testToken = "test-token-12345" + + "AuthenticatedHttpFileAccessProtocol" should "read token from constructor" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set("github.com") + ) + protocol.token shouldBe Some(testToken) + } + + it should "only authenticate to allowed domains" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set("github.com", "raw.githubusercontent.com") + ) + + // GitHub should get auth + val githubSource = protocol.resolve("https://raw.githubusercontent.com/org/repo/main/file.wdl") + githubSource.token shouldBe Some(testToken) + + // Other domains should NOT get auth + val otherSource = protocol.resolve("https://example.com/file.wdl") + otherSource.token shouldBe None + } + + it should "work without a token (backward compatible)" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = None, + allowedDomains = Set("github.com") + ) + + val source = protocol.resolve("https://github.com/org/repo/file.wdl") + source.token shouldBe None + } + + it should "be case-insensitive for domain matching" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = 
Some(testToken), + allowedDomains = Set("github.com") + ) + + val source = protocol.resolve("https://GitHub.COM/org/repo/file.wdl") + source.token shouldBe Some(testToken) + } + + it should "support directory resolution" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set("github.com") + ) + + val dirSource = protocol.resolveDirectory("https://github.com/org/repo/archive.tar.gz") + dirSource.isDirectory shouldBe true + dirSource.token shouldBe Some(testToken) + } + + it should "handle HTTP scheme" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set("example.com") + ) + + protocol.schemes should contain("http") + protocol.schemes should contain("https") + } + + it should "support directories" in { + val protocol = AuthenticatedHttpFileAccessProtocol() + protocol.supportsDirectories shouldBe true + } + + it should "not send token to unlisted domains even with token configured" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set("github.com") + ) + + val source = protocol.resolve("https://gitlab.com/org/repo/file.wdl") + source.token shouldBe None + } + + it should "handle empty allowed domains set" in { + val protocol = AuthenticatedHttpFileAccessProtocol( + token = Some(testToken), + allowedDomains = Set.empty + ) + + val source = protocol.resolve("https://github.com/org/repo/file.wdl") + source.token shouldBe None + } + + "AuthenticatedHttpFileAccessProtocol.defaultDomains" should "include github.com" in { + AuthenticatedHttpFileAccessProtocol.defaultDomains should contain("github.com") + } + + it should "include raw.githubusercontent.com" in { + AuthenticatedHttpFileAccessProtocol.defaultDomains should contain("raw.githubusercontent.com") + } + + "Domain parsing" should "parse comma-separated domains correctly" in { + val domainsString = "gitlab.com, bitbucket.org, custom.example.com" + val 
parsed = domainsString.split(",").map(_.trim.toLowerCase).filter(_.nonEmpty).toSet + + parsed should contain("gitlab.com") + parsed should contain("bitbucket.org") + parsed should contain("custom.example.com") + parsed.size shouldBe 3 + } + + it should "handle extra whitespace in domain list" in { + val domainsString = " github.com , gitlab.com , " + val parsed = domainsString.split(",").map(_.trim.toLowerCase).filter(_.nonEmpty).toSet + + parsed should contain("github.com") + parsed should contain("gitlab.com") + parsed.size shouldBe 2 + } + + "AuthenticatedHttpFileSource" should "resolve relative paths" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/"), + StandardCharsets.UTF_8, + isDirectory = true, + token = Some(testToken) + )("https://github.com/org/repo/main/") + + val resolved = source.resolve("subdir/file.wdl") + resolved.address should include("subdir/file.wdl") + resolved.token shouldBe Some(testToken) + } + + it should "propagate token to resolved files" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/"), + StandardCharsets.UTF_8, + isDirectory = true, + token = Some(testToken) + )("https://github.com/org/repo/main/") + + val resolved = source.resolve("another_file.wdl") + resolved.token shouldBe Some(testToken) + } + + it should "propagate token to resolved directories" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/"), + StandardCharsets.UTF_8, + isDirectory = true, + token = Some(testToken) + )("https://github.com/org/repo/main/") + + val resolved = source.resolveDirectory("subdir") + resolved.isDirectory shouldBe true + resolved.token shouldBe Some(testToken) + } + + it should "get parent directory with token" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/file.wdl"), + StandardCharsets.UTF_8, + isDirectory = false, + 
token = Some(testToken) + )("https://github.com/org/repo/main/file.wdl") + + val parent = source.getParent + parent shouldBe defined + parent.get.isDirectory shouldBe true + parent.get.token shouldBe Some(testToken) + } + + it should "extract name from URI" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/file.wdl"), + StandardCharsets.UTF_8, + isDirectory = false, + token = None + )("https://github.com/org/repo/main/file.wdl") + + source.name shouldBe "file.wdl" + } + + it should "extract folder from URI" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/file.wdl"), + StandardCharsets.UTF_8, + isDirectory = false, + token = None + )("https://github.com/org/repo/main/file.wdl") + + source.folder shouldBe "/org/repo/main" + } + + it should "not be listable" in { + val source = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/"), + StandardCharsets.UTF_8, + isDirectory = true, + token = None + )("https://github.com/org/repo/main/") + + source.isListable shouldBe false + } + + it should "relativize paths correctly" in { + val dirSource = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/"), + StandardCharsets.UTF_8, + isDirectory = true, + token = None + )("https://github.com/org/repo/main/") + + val fileSource = AuthenticatedHttpFileSource( + java.net.URI.create("https://github.com/org/repo/main/subdir/file.wdl"), + StandardCharsets.UTF_8, + isDirectory = false, + token = None + )("https://github.com/org/repo/main/subdir/file.wdl") + + val relativePath = dirSource.relativize(fileSource) + relativePath shouldBe "subdir/file.wdl" + } +} diff --git a/core/src/test/scala/dx/core/io/AuthenticatedHttpIntegrationTest.scala b/core/src/test/scala/dx/core/io/AuthenticatedHttpIntegrationTest.scala new file mode 100644 index 000000000..b15ffa261 --- /dev/null +++ 
b/core/src/test/scala/dx/core/io/AuthenticatedHttpIntegrationTest.scala @@ -0,0 +1,229 @@ +package dx.core.io + +import com.sun.net.httpserver.{HttpExchange, HttpHandler, HttpServer} +import dx.util.{FileSourceResolver, LocalFileAccessProtocol, Logger} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import java.net.InetSocketAddress +import java.nio.charset.StandardCharsets + +/** + * Integration tests using an embedded HTTP server to verify + * authenticated imports work correctly end-to-end. + */ +class AuthenticatedHttpIntegrationTest extends AnyFlatSpec with Matchers with BeforeAndAfterAll { + + private var server: HttpServer = _ + private var serverPort: Int = _ + private val testToken = "test-bearer-token-xyz" + + private val wdlContent = + """version 1.0 + | + |task hello { + | command { echo "Hello, World!" } + | output { String message = read_string(stdout()) } + |} + |""".stripMargin + + override def beforeAll(): Unit = { + super.beforeAll() + + // Create embedded HTTP server + server = HttpServer.create(new InetSocketAddress(0), 0) + serverPort = server.getAddress.getPort + + // Public endpoint - no auth required + server.createContext("/public/file.wdl", new HttpHandler { + override def handle(exchange: HttpExchange): Unit = { + val response = wdlContent.getBytes(StandardCharsets.UTF_8) + exchange.sendResponseHeaders(200, response.length) + val os = exchange.getResponseBody + os.write(response) + os.close() + } + }) + + // Private endpoint - requires Bearer token + server.createContext("/private/file.wdl", new HttpHandler { + override def handle(exchange: HttpExchange): Unit = { + val authHeader = exchange.getRequestHeaders.getFirst("Authorization") + + if (authHeader == s"Bearer $testToken") { + val response = wdlContent.getBytes(StandardCharsets.UTF_8) + exchange.sendResponseHeaders(200, response.length) + val os = exchange.getResponseBody + os.write(response) + 
os.close() + } else if (authHeader == null) { + exchange.sendResponseHeaders(401, -1) + exchange.close() + } else { + exchange.sendResponseHeaders(403, -1) + exchange.close() + } + } + }) + + // Endpoint that checks for token and returns different content + server.createContext("/conditional/file.wdl", new HttpHandler { + override def handle(exchange: HttpExchange): Unit = { + val authHeader = exchange.getRequestHeaders.getFirst("Authorization") + val content = if (authHeader == s"Bearer $testToken") { + "authenticated content" + } else { + "public content" + } + val response = content.getBytes(StandardCharsets.UTF_8) + exchange.sendResponseHeaders(200, response.length) + val os = exchange.getResponseBody + os.write(response) + os.close() + } + }) + + // HEAD endpoint for exists check + server.createContext("/head-test/file.wdl", new HttpHandler { + override def handle(exchange: HttpExchange): Unit = { + val authHeader = exchange.getRequestHeaders.getFirst("Authorization") + + if (authHeader == s"Bearer $testToken") { + exchange.sendResponseHeaders(200, -1) + } else if (authHeader == null) { + exchange.sendResponseHeaders(401, -1) + } else { + exchange.sendResponseHeaders(403, -1) + } + exchange.close() + } + }) + + server.setExecutor(null) + server.start() + } + + override def afterAll(): Unit = { + if (server != null) { + server.stop(0) + } + super.afterAll() + } + + private def createProtocol(token: Option[String], allowedDomains: Set[String]): AuthenticatedHttpFileAccessProtocol = { + AuthenticatedHttpFileAccessProtocol( + token = token, + allowedDomains = allowedDomains, + logger = Logger.Quiet + ) + } + + "Authenticated HTTP imports" should "access public endpoints without token" in { + val protocol = createProtocol(None, Set.empty) + val source = protocol.resolve(s"http://localhost:$serverPort/public/file.wdl") + + source.readString should include("version 1.0") + } + + it should "access private endpoints with valid token" in { + val protocol = 
createProtocol(Some(testToken), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/private/file.wdl") + + source.readString should include("version 1.0") + } + + it should "fail on private endpoints without token" in { + val protocol = createProtocol(None, Set.empty) + val source = protocol.resolve(s"http://localhost:$serverPort/private/file.wdl") + + val exception = intercept[Exception] { + source.readString + } + exception.getMessage should include("401") + exception.getMessage should include("WDL_IMPORT_TOKEN") + } + + it should "fail on private endpoints with wrong token" in { + val protocol = createProtocol(Some("wrong-token"), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/private/file.wdl") + + val exception = intercept[Exception] { + source.readString + } + exception.getMessage should include("403") + } + + it should "not send token to non-allowed domains" in { + // Token configured but localhost not in allowed domains + val protocol = createProtocol(Some(testToken), Set("github.com")) + val source = protocol.resolve(s"http://localhost:$serverPort/conditional/file.wdl") + + // Should get public content since token wasn't sent + source.readString shouldBe "public content" + } + + it should "send token only to allowed domains" in { + val protocol = createProtocol(Some(testToken), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/conditional/file.wdl") + + // Should get authenticated content since token was sent + source.readString shouldBe "authenticated content" + } + + it should "read file content correctly" in { + val protocol = createProtocol(Some(testToken), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/private/file.wdl") + + val content = source.readString + content should include("version 1.0") + content should include("task hello") + content should include("Hello, World!") + } + + it should "check exists with 
authentication" in { + val protocol = createProtocol(Some(testToken), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/head-test/file.wdl") + + source.exists shouldBe true + } + + it should "fail exists check without required token" in { + val protocol = createProtocol(None, Set.empty) + val source = protocol.resolve(s"http://localhost:$serverPort/head-test/file.wdl") + + val exception = intercept[Exception] { + source.exists + } + exception.getMessage should include("401") + } + + it should "fail exists check with wrong token" in { + val protocol = createProtocol(Some("wrong-token"), Set("localhost")) + val source = protocol.resolve(s"http://localhost:$serverPort/head-test/file.wdl") + + val exception = intercept[Exception] { + source.exists + } + exception.getMessage should include("403") + } + + it should "work with FileSourceResolver" in { + val httpProtocol = createProtocol(Some(testToken), Set("localhost")) + val resolver = FileSourceResolver(Vector( + LocalFileAccessProtocol(), + httpProtocol + )) + + val source = resolver.resolve(s"http://localhost:$serverPort/private/file.wdl") + source.readString should include("version 1.0") + } + + it should "resolve relative imports with authentication" in { + val protocol = createProtocol(Some(testToken), Set("localhost")) + val baseSource = protocol.resolveDirectory(s"http://localhost:$serverPort/private/") + + val resolvedSource = baseSource.resolve("file.wdl") + resolvedSource.token shouldBe Some(testToken) + } +} diff --git a/doc/AUTHENTICATED_IMPORTS.md b/doc/AUTHENTICATED_IMPORTS.md new file mode 100644 index 000000000..f49b4d8ca --- /dev/null +++ b/doc/AUTHENTICATED_IMPORTS.md @@ -0,0 +1,136 @@ +# Authenticated HTTP Imports for WDL + +This document describes how to import WDL files from HTTP sources that require authentication. 
+ +## Overview + +dxCompiler can import WDL files from URLs that require authentication, such as: +- Private GitHub repositories +- Private GitLab repositories (when configured) +- Internal corporate servers (when configured) + +Authentication is provided via Bearer tokens set in environment variables. + +## Environment Variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `WDL_IMPORT_TOKEN` | No | Bearer token for HTTP authentication | +| `WDL_IMPORT_TOKEN_DOMAINS` | No | Comma-separated list of domains to send token to | + +## Default Allowed Domains + +When `WDL_IMPORT_TOKEN_DOMAINS` is not set, tokens are only sent to: + +- `github.com` +- `raw.githubusercontent.com` + +This prevents accidentally leaking tokens to untrusted servers. + +## Configuration Examples + +### Basic GitHub Private Repository Access + +```bash +# Generate a token at https://github.com/settings/tokens +# Required scope: repo (for private repositories) +export WDL_IMPORT_TOKEN="ghp_xxxxxxxxxxxxxxxxxxxx" + +java -jar dxCompiler.jar compile workflow.wdl -project project-xxxx -folder /my/workflows/ +``` + +### Multiple Private Sources + +```bash +export WDL_IMPORT_TOKEN="your-token-here" +export WDL_IMPORT_TOKEN_DOMAINS="github.com,raw.githubusercontent.com,gitlab.com,internal.company.com" + +java -jar dxCompiler.jar compile workflow.wdl -project project-xxxx -folder /my/workflows/ +``` + +## WDL Import Syntax + +### GitHub Raw Content URL + +```wdl +import "https://raw.githubusercontent.com/owner/repo/branch/path/to/file.wdl" +``` + +### GitHub Blob URL (Not Recommended) + +GitHub blob URLs (`github.com/owner/repo/blob/...`) do not return raw content. +Use raw.githubusercontent.com URLs instead. + +## Error Messages + +### 401 Unauthorized + +``` +HTTP 401 Unauthorized when accessing https://raw.githubusercontent.com/... +If this is a private repository, ensure WDL_IMPORT_TOKEN is set with a valid access token. 
+For GitHub: generate a token at https://github.com/settings/tokens with 'repo' scope. +``` + +**Solution:** Set the `WDL_IMPORT_TOKEN` environment variable with a valid token. + +### 403 Forbidden + +``` +HTTP 403 Forbidden when accessing https://raw.githubusercontent.com/... +The token may be invalid or lack the required permissions. +``` + +**Solution:** The token may be invalid or lack the required permissions. For GitHub, ensure the token has the `repo` scope. + +## Security Considerations + +1. **Token Scope**: Only grant the minimum required permissions to your token +2. **Domain Allowlist**: Tokens are only sent to explicitly allowed domains +3. **No Logging**: Token values are never logged; only usage is traced +4. **HTTPS Recommended**: Always use HTTPS URLs for private imports + +## Debugging + +Enable trace logging to see when authenticated imports are used: + +```bash +java -jar dxCompiler.jar compile workflow.wdl -verbose -verboseKey FileSourceResolver +``` + +Look for log entries like: +``` +[TRACE] WDL_IMPORT_TOKEN found; authenticated HTTP imports enabled for domains: github.com, raw.githubusercontent.com +[TRACE] Using authenticated HTTP for import from: raw.githubusercontent.com +``` + +## Troubleshooting + +### Token not being sent + +1. Verify `WDL_IMPORT_TOKEN` is set: `echo $WDL_IMPORT_TOKEN` +2. Check if the domain is in the allowed list +3. Enable verbose logging to see authentication attempts + +### Token rejected + +1. Verify the token is still valid (not expired) +2. Check token permissions/scopes +3. For GitHub, ensure the token has access to the specific repository + +### Public imports stopped working + +The authenticated HTTP protocol is backward compatible. If no token is set, it works like the standard HTTP protocol. Verify no token is set if you're testing public access. + +## How It Works + +dxCompiler uses a custom `AuthenticatedHttpFileAccessProtocol` that: + +1. Checks if `WDL_IMPORT_TOKEN` is set +2. 
For each HTTP import, checks if the domain is in the allowed list +3. If both conditions are met, adds an `Authorization: Bearer <token>` header to the request +4. If either condition is not met, the request proceeds without authentication (backward compatible) + +This ensures that: +- Tokens are never sent to untrusted domains +- Existing workflows continue to work without modification +- Authentication failures produce clear, actionable error messages diff --git a/doc/ExpertOptions.md b/doc/ExpertOptions.md index c364afb94..8b9a75c30 100644 --- a/doc/ExpertOptions.md +++ b/doc/ExpertOptions.md @@ -103,7 +103,7 @@ Compilation can be controlled with several parameters. | extras \<string\> | JSON file with extra options | | inputs \<string\> | JSON file with standard-formatted input values. May be specified multiple times. A DNAnexus JSON input file is generated for each standard input file. | | instanceTypeSelection \[static,dynamic\] | Whether to select instance types at compile time for tasks with runtime requirements that can all be statically evaluated (the default "static" option), or to defer instance type selection in such cases to runtime (the "dynamic" option). Using static instance type selection can save time, but it requires the same set of instances to be accessible during WDL/CWL compilation and during the runtime of the generated applets and workflows. Use the "dynamic" option if you plan on creating global DNAnexus workflows or cloning the generated workflows between DNAnexus organizations with different available instance types. | -| imports \<string\> | Directory to search for imported WDL or CWL files. May be specified multiple times. | +| imports \<string\> | Directory to search for imported WDL or CWL files. May be specified multiple times. For importing from private HTTP sources (e.g., private GitHub repos), see [Authenticated Imports](AUTHENTICATED_IMPORTS.md). | | locked | Create a locked workflow.
When running a locked workflow, input values may only be specified for the top-level workflow. | | leaveWorkflowsOpen | Leave created workflows open (otherwise they are closed). | | projectWideReuse | Look for existing applets/workflows in the entire project before generating new ones. The default search scope is the target folder only. |