Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,42 @@ class AzFileAttributes implements BasicFileAttributes {
objectId = "/${client.containerName}/${client.blobName}"
creationTime = time(props.getCreationTime())
updateTime = time(props.getLastModified())
directory = client.blobName.endsWith('/')
size = props.getBlobSize()

// Support for Azure Data Lake Storage Gen2 with hierarchical namespace enabled

// Determine if this is a directory using metadata only (most reliable):
final meta = props.getMetadata()
if( meta.containsKey("hdi_isfolder") && size == 0 ){
directory = meta.get("hdi_isfolder")
if( meta != null && meta.containsKey("hdi_isfolder") && meta.get("hdi_isfolder") == "true" ){
directory = true
size = 0
}
else {
// Without metadata, default to treating as file
// This aligns with Azure SDK's approach where explicit directory markers are required
directory = false
size = props.getBlobSize()
}
}

AzFileAttributes(String containerName, BlobItem item) {
objectId = "/${containerName}/${item.name}"
directory = item.name.endsWith('/')
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bentsherman what will break if we incorrectly classify something as a file when it's a pseudo-directory? What's the impact of getting it wrong?

if( !directory ) {
creationTime = time(item.properties.getCreationTime())
updateTime = time(item.properties.getLastModified())
size = item.properties.getContentLength()

// Determine if this is a directory using reliable methods only:
// 1. Check if it's marked as a prefix (virtual directory) - Most reliable
if( item.isPrefix() != null && item.isPrefix() ) {
directory = true
// Virtual directories don't have properties like creation time
size = 0
}
// 2. Check metadata for hierarchical namespace (ADLS Gen2)
else if( item.getMetadata() != null && item.getMetadata().containsKey("hdi_isfolder") && item.getMetadata().get("hdi_isfolder") == "true" ) {
directory = true
size = 0
}
// 3. Default: treat as file
else {
directory = false
creationTime = time(item.getProperties().getCreationTime())
updateTime = time(item.getProperties().getLastModified())
size = item.getProperties().getContentLength()
}
}

Expand All @@ -92,7 +111,41 @@ class AzFileAttributes implements BasicFileAttributes {

/**
 * Builds the attributes of the blob {@code blobName} in the given container,
 * classifying it either as a directory or as a regular file.
 *
 * Resolution order:
 *  1. a name ending with '/' is a directory and the blob itself is not inspected;
 *  2. an existing blob is a directory only when its {@code hdi_isfolder}
 *     metadata entry equals "true"; otherwise it is a file with the size
 *     reported by its properties;
 *  3. a missing blob is a directory when at least one blob is listed under
 *     the prefix {@code blobName + '/'}; otherwise it is reported as a
 *     zero-length non-directory.
 *
 * Creation and update times are only populated in case 2.
 *
 * @param client container client used to look up the blob
 * @param blobName blob name relative to the container
 */
protected AzFileAttributes(BlobContainerClient client, String blobName) {
    objectId = "/$client.blobContainerName/$blobName"

    // An explicit trailing slash is taken at face value.
    if (blobName.endsWith('/')) {
        directory = true
        size = 0
        return
    }

    def blobClient = client.getBlobClient(blobName)
    if (blobClient.exists()) {
        def props = blobClient.getProperties()
        def metadata = props.getMetadata()

        creationTime = time(props.getCreationTime())
        updateTime = time(props.getLastModified())

        // Map.get yields null for a missing key, so a separate containsKey check is unnecessary.
        final isFolder = metadata != null && metadata.get("hdi_isfolder") == "true"
        directory = isFolder
        if (isFolder) {
            size = 0
        }
        else {
            size = props.getBlobSize()
        }
    }
    else {
        // blobName cannot end with '/' here (handled above), so the listing prefix is always blobName + '/'.
        def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(blobName + '/').setMaxResultsPerPage(1)
        // No blob with this exact name: it is a (virtual) directory only if something lives under it.
        directory = client.listBlobs(opts, null).stream().findFirst().isPresent()
        size = 0
    }
}


Expand Down
29 changes: 28 additions & 1 deletion plugins/nf-azure/src/main/nextflow/cloud/azure/nio/AzPath.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,34 @@ class AzPath implements Path {
}

boolean isDirectory() {
return directory
if (directory) {
return true
}

def blobNameStr = blobName()
if (blobNameStr) {
def containerClient = containerClient()
def blobClient = containerClient.getBlobClient(blobNameStr)

if (blobClient.exists()) {
def props = blobClient.getProperties()
def metadata = props.getMetadata()

if (metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true") {
return true
}
} else {
def prefix = blobNameStr.endsWith('/') ? blobNameStr : blobNameStr + '/'
def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1)
def hasChildren = containerClient.listBlobs(opts, null).stream().findFirst().isPresent()

if (hasChildren) {
return true
}
}
}

return false
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Azure API Calls Cause Performance Issues

The isDirectory() method now makes multiple uncached Azure API calls (exists(), getProperties(), listBlobs()) on every invocation when the directory field is false. This causes a significant performance regression. It also has a logic flaw that can lead to incorrect directory detection and lacks exception handling for these network calls.

Fix in Cursor Fix in Web

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Trailing Slash Bug Affects Directory Detection

The AzPath constructor's handling of trailing slashes causes isDirectory() to return early, bypassing its new Azure-querying logic for directory detection. This also creates unreachable code in the AzFileAttributes constructor and leads to conflicting test expectations regarding paths ending with a slash.

Additional Locations (4)

Fix in Cursor Fix in Web

}

String checkContainerName() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package nextflow.cloud.azure.nio

import com.azure.storage.blob.BlobContainerClient
import spock.lang.Specification
import spock.lang.Unroll

/**
 * Spock specification for {@link AzFileAttributes}.
 *
 * Covers the root attributes factory, the directory/file classification done by
 * the (container client, blob name) constructor, and equals/hashCode of root attributes.
 *
 * NOTE(review): the container client mock only stubs getBlobContainerName();
 * blob names without a trailing slash may reach other client calls in the
 * constructor - confirm the mock's default answers are sufficient for those cases.
 */
class AzFileAttributesTest extends Specification {

    def 'should create root attributes correctly'() {
        given:
        def root = AzFileAttributes.root()

        expect:
        root.isDirectory()
        !root.isRegularFile()
        root.size() == 0
        root.fileKey() == '/'
    }

    @Unroll
    def 'should validate directory detection with blobName: #blobName'() {
        given: 'a container client that only knows its own name'
        def container = GroovyMock(BlobContainerClient) {
            getBlobContainerName() >> 'test-container'
        }

        when:
        def attributes = new AzFileAttributes(container, blobName)

        then: 'directory and regular-file flags are mutually exclusive and the key ends with the blob name'
        attributes.isDirectory() == expectedDirectory
        attributes.isRegularFile() != expectedDirectory
        attributes.fileKey().endsWith("/$blobName")

        where:
        blobName                | expectedDirectory | comment
        'normal-file.txt'       | false             | 'Regular file without slash'
        'normal-file'           | false             | 'Regular file without slash'
        'problematic-file.txt/' | true              | 'Path with trailing slash is directory'
        'directory/'            | true              | 'Path with trailing slash is directory'
        'file.log/'             | true              | 'Path with trailing slash is directory'
        'path/to/file.dat/'     | true              | 'Path with trailing slash is directory'
        '/'                     | true              | 'Root slash is directory'
        'multiple///'           | true              | 'Path ending with slashes is directory'
        'has.extension.txt/'    | true              | 'Path with slash is directory regardless of extension'
        'log.2024-01-01.txt/'   | true              | 'Path with slash is directory regardless of extension'
    }

    def 'should validate directory detection for paths without slash'() {
        given:
        def container = GroovyMock(BlobContainerClient) {
            getBlobContainerName() >> 'my-container'
        }

        when:
        def attributes = new AzFileAttributes(container, 'some-directory-without-slash')

        then: 'a name without a trailing slash is reported as a regular file'
        !attributes.isDirectory()
        attributes.isRegularFile()
        attributes.fileKey().endsWith('/some-directory-without-slash')
    }

    def 'should handle edge cases in directory detection'() {
        given:
        def container = GroovyMock(BlobContainerClient) {
            getBlobContainerName() >> 'test-container'
        }
        // Shorthand: build the attributes for a blob name and report its directory flag.
        def isDir = { String name -> new AzFileAttributes(container, name).isDirectory() }

        expect:
        isDir('regular-file/')
        isDir('file.txt/')
        isDir('/')
        isDir('multiple///')
        !isDir('no-slash')
    }

    def 'should verify equality and hashCode methods work correctly'() {
        given: 'two independently created root attributes'
        def first = AzFileAttributes.root()
        def second = AzFileAttributes.root()

        expect: 'they compare equal and hash identically'
        first.equals(second)
        first.hashCode() == second.hashCode()
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -928,4 +928,51 @@ class AzNioTest extends Specification implements AzBaseSpec {
deleteBucket(bucket1)
}

/*
 * Uploads an empty blob named "test-dir/", tags it with hdi_isfolder=true and
 * checks that reading its attributes through the az:// path reports a directory.
 */
def 'should detect directory with hdi_isfolder metadata' () {
    given:
    def bucket = createBucket()

    when: 'an empty directory marker carrying the hdi_isfolder metadata is created'
    def marker = storageClient
            .getBlobContainerClient(bucket)
            .getBlobClient("test-dir/")
    marker.upload(new ByteArrayInputStream(new byte[0]), 0)
    marker.setMetadata([hdi_isfolder: 'true'])

    and: 'its attributes are read back through the NIO file system'
    def target = Paths.get(new URI("az://$bucket/test-dir/"))
    def attributes = Files.readAttributes(target, BasicFileAttributes)

    then:
    attributes.isDirectory()
    !attributes.isRegularFile()

    cleanup:
    deleteBucket(bucket)
}

/*
 * Uploads a non-empty blob named "test-file/" without any directory metadata and
 * checks that reading its attributes through the az:// path reports a regular file.
 */
def 'should not treat file with trailing slash as directory without metadata' () {
    given:
    def bucket = createBucket()

    when: 'a blob whose name ends with a slash is uploaded with content and no metadata'
    def payload = "content".bytes
    def blob = storageClient
            .getBlobContainerClient(bucket)
            .getBlobClient("test-file/")
    blob.upload(new ByteArrayInputStream(payload), payload.length)

    and: 'its attributes are read back through the NIO file system'
    def target = Paths.get(new URI("az://$bucket/test-file/"))
    def attributes = Files.readAttributes(target, BasicFileAttributes)

    then: 'it is expected to be classified as a file'
    attributes.isRegularFile()
    !attributes.isDirectory()

    cleanup:
    deleteBucket(bucket)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class AzPathTest extends Specification {
objectName | expected | dir
'/bucket/file.txt' | '/bucket/file.txt' | false
'/bucket/a/b/c' | '/bucket/a/b/c' | false
'/bucket/a/b/c/' | '/bucket/a/b/c' | true
'/bucket/a/b/c/' | '/bucket/a/b/c' | false
'/bucket' | '/bucket' | true
'/bucket/' | '/bucket' | true

Expand All @@ -63,7 +63,7 @@ class AzPathTest extends Specification {
where:
PATH | CONTAINER | BLOB | IS_DIRECTORY | IS_CONTAINER
'/alpha/beta/delta' | 'alpha' | 'beta/delta' | false | false
'/alpha/beta/delta/' | 'alpha' | 'beta/delta/' | true | false
'/alpha/beta/delta/' | 'alpha' | 'beta/delta/' | false | false
'/alpha/' | 'alpha' | null | true | true
'/alpha' | 'alpha' | null | true | true

Expand Down Expand Up @@ -114,6 +114,76 @@ class AzPathTest extends Specification {
path2.root == null
}

/*
 * Data-driven check that file-like object names are not reported as directories
 * by isDirectory(), with or without a trailing slash.
 */
@Unroll
def 'should demonstrate trailing slash directory bug in AzPath constructor: #testPath'() {
    expect:
    azpath(testPath).isDirectory() == expectedDirectory

    where:
    testPath                        | expectedDirectory | comment
    '/container/regular-file.txt'   | false             | 'File without slash is a file'
    '/container/regular-file.txt/'  | false             | 'File with slash is a file'
    '/container/data.log'           | false             | 'Log file without slash'
    '/container/data.log/'          | false             | 'Log file with slash is a file'
    '/container/important.json'     | false             | 'JSON file without slash is a file'
    '/container/important.json/'    | false             | 'JSON file with slash is a file'
    '/container/backup.tar.gz'      | false             | 'Archive file without slash is a file'
    '/container/backup.tar.gz/'     | false             | 'Archive file with slash is a file'
    '/container/script.sh'          | false             | 'Script file without slash is a file'
    '/container/script.sh/'         | false             | 'Script file with slash is a file'
}

/*
 * Checks that the same object name, written with and without a trailing slash,
 * is not reported as a directory in either form.
 */
def 'should demonstrate the specific Nextflow workflow issue'() {
    given: 'the same scratch object addressed without and with a trailing slash'
    def plain = azpath('/bucket/scratch/some-file')
    def slashed = azpath('/bucket/scratch/some-file/')

    when:
    def plainIsDir = plain.isDirectory()
    def slashedIsDir = slashed.isDirectory()

    then: 'neither form is a directory'
    !plainIsDir
    !slashedIsDir

    and: 'the same holds for a log file'
    !azpath('/bucket/data.log').isDirectory()
    !azpath('/bucket/data.log/').isDirectory()
}

/*
 * Expects a path without a trailing slash to be a non-directory and the same
 * path with a trailing slash to be a directory.
 *
 * NOTE(review): other features in this spec expect trailing-slash paths to
 * report isDirectory() == false - confirm which expectation is intended.
 */
def 'should validate directory detection with real-world paths'() {
    given:
    def withoutSlash = azpath("/seqeralabs-showcase/scratch")
    def withSlash = azpath("/seqeralabs-showcase/scratch/")

    expect:
    !withoutSlash.isDirectory()
    withSlash.isDirectory()
}

/*
 * Builds a list of paths (each name with and without a trailing slash), maps
 * them to isDirectory() and expects only the trailing-slash entries to be directories.
 *
 * NOTE(review): other features in this spec expect trailing-slash paths to
 * report isDirectory() == false - confirm which expectation is intended.
 */
def 'should validate directory detection in Channel-like operations'() {
    given: 'every path is created first, then queried in order'
    def candidates = [
        "/container/file1",
        "/container/file1/",
        "/container/file2.txt",
        "/container/file2.txt/"
    ].collect { azpath(it) }

    when:
    def flags = candidates*.isDirectory()

    then: 'entries alternate between non-directory (no slash) and directory (slash)'
    flags == [false, true, false, true]
}

@Unroll
def 'should return bucket name and id' () {
when:
Expand Down
Loading