-
Notifications
You must be signed in to change notification settings - Fork 744
Determine Azure directories more accurately using metadata and blob properties #6428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 8 commits
6cc7cc5
5e0709c
78ff466
7b93b04
4b3971a
3c931af
eb16a3a
4dfec66
b90fab7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,23 +58,42 @@ class AzFileAttributes implements BasicFileAttributes { | |
| objectId = "/${client.containerName}/${client.blobName}" | ||
| creationTime = time(props.getCreationTime()) | ||
| updateTime = time(props.getLastModified()) | ||
| directory = client.blobName.endsWith('/') | ||
| size = props.getBlobSize() | ||
|
|
||
| // Support for Azure Data Lake Storage Gen2 with hierarchical namespace enabled | ||
|
|
||
| // Determine if this is a directory using metadata only (most reliable): | ||
| final meta = props.getMetadata() | ||
| if( meta.containsKey("hdi_isfolder") && size == 0 ){ | ||
| directory = meta.get("hdi_isfolder") | ||
| if( meta != null && meta.containsKey("hdi_isfolder") && meta.get("hdi_isfolder") == "true" ){ | ||
| directory = true | ||
| size = 0 | ||
| } | ||
| else { | ||
| // Without metadata, default to treating as file | ||
| // This aligns with Azure SDK's approach where explicit directory markers are required | ||
| directory = false | ||
| size = props.getBlobSize() | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| AzFileAttributes(String containerName, BlobItem item) { | ||
| objectId = "/${containerName}/${item.name}" | ||
| directory = item.name.endsWith('/') | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bentsherman what will break if we incorrectly classify something as a file when it's a pseudo-directory? What's the impact of getting it wrong? |
||
| if( !directory ) { | ||
| creationTime = time(item.properties.getCreationTime()) | ||
| updateTime = time(item.properties.getLastModified()) | ||
| size = item.properties.getContentLength() | ||
|
|
||
| // Determine if this is a directory using reliable methods only: | ||
| // 1. Check if it's marked as a prefix (virtual directory) - Most reliable | ||
| if( item.isPrefix() != null && item.isPrefix() ) { | ||
| directory = true | ||
| // Virtual directories don't have properties like creation time | ||
| size = 0 | ||
| } | ||
| // 2. Check metadata for hierarchical namespace (ADLS Gen2) | ||
| else if( item.getMetadata() != null && item.getMetadata().containsKey("hdi_isfolder") && item.getMetadata().get("hdi_isfolder") == "true" ) { | ||
| directory = true | ||
| size = 0 | ||
| } | ||
| // 3. Default: treat as file | ||
| else { | ||
| directory = false | ||
| creationTime = time(item.getProperties().getCreationTime()) | ||
| updateTime = time(item.getProperties().getLastModified()) | ||
| size = item.getProperties().getContentLength() | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
|
|
@@ -92,7 +111,41 @@ class AzFileAttributes implements BasicFileAttributes { | |
|
|
||
| protected AzFileAttributes(BlobContainerClient client, String blobName) { | ||
| objectId = "/$client.blobContainerName/$blobName" | ||
| directory = blobName.endsWith('/') | ||
|
|
||
| if (blobName.endsWith('/')) { | ||
| directory = true | ||
| size = 0 | ||
| return | ||
| } | ||
|
|
||
| def blobClient = client.getBlobClient(blobName) | ||
| if (blobClient.exists()) { | ||
| def props = blobClient.getProperties() | ||
| def metadata = props.getMetadata() | ||
|
|
||
| creationTime = time(props.getCreationTime()) | ||
| updateTime = time(props.getLastModified()) | ||
|
|
||
| if (metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true") { | ||
| directory = true | ||
| size = 0 | ||
| } else { | ||
| directory = false | ||
| size = props.getBlobSize() | ||
| } | ||
| } else { | ||
| def prefix = blobName.endsWith('/') ? blobName : blobName + '/' | ||
| def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1) | ||
| def hasChildren = client.listBlobs(opts, null).stream().findFirst().isPresent() | ||
|
|
||
| if (hasChildren) { | ||
| directory = true | ||
| size = 0 | ||
| } else { | ||
| directory = false | ||
| size = 0 | ||
| } | ||
| } | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -94,7 +94,34 @@ class AzPath implements Path { | |
| } | ||
|
|
||
| boolean isDirectory() { | ||
| return directory | ||
| if (directory) { | ||
| return true | ||
| } | ||
|
|
||
| def blobNameStr = blobName() | ||
| if (blobNameStr) { | ||
| def containerClient = containerClient() | ||
| def blobClient = containerClient.getBlobClient(blobNameStr) | ||
|
|
||
| if (blobClient.exists()) { | ||
| def props = blobClient.getProperties() | ||
| def metadata = props.getMetadata() | ||
|
|
||
| if (metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true") { | ||
| return true | ||
| } | ||
| } else { | ||
| def prefix = blobNameStr.endsWith('/') ? blobNameStr : blobNameStr + '/' | ||
| def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1) | ||
| def hasChildren = containerClient.listBlobs(opts, null).stream().findFirst().isPresent() | ||
|
|
||
| if (hasChildren) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return false | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Azure API Calls Cause Performance IssuesThe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Trailing Slash Bug Affects Directory DetectionThe Additional Locations (4)
|
||
| } | ||
|
|
||
| String checkContainerName() { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| package nextflow.cloud.azure.nio | ||
|
|
||
| import com.azure.storage.blob.BlobContainerClient | ||
| import spock.lang.Specification | ||
| import spock.lang.Unroll | ||
|
|
||
| /** | ||
| * Unit tests for AzFileAttributes class | ||
| */ | ||
| class AzFileAttributesTest extends Specification { | ||
|
|
||
| def 'should create root attributes correctly'() { | ||
| when: | ||
| def attrs = AzFileAttributes.root() | ||
|
|
||
| then: | ||
| attrs.isDirectory() | ||
| !attrs.isRegularFile() | ||
| attrs.size() == 0 | ||
| attrs.fileKey() == '/' | ||
| } | ||
|
|
||
| @Unroll | ||
| def 'should validate directory detection with blobName: #blobName'() { | ||
| given: | ||
| def mockClient = GroovyMock(BlobContainerClient) { | ||
| getBlobContainerName() >> 'test-container' | ||
| } | ||
|
|
||
| when: | ||
| def attrs = new AzFileAttributes(mockClient, blobName) | ||
|
|
||
| then: | ||
| attrs.isDirectory() == expectedDirectory | ||
| attrs.isRegularFile() != expectedDirectory | ||
| attrs.fileKey().endsWith("/$blobName") | ||
|
|
||
| where: | ||
| blobName | expectedDirectory | comment | ||
| 'normal-file.txt' | false | 'Regular file without slash' | ||
| 'normal-file' | false | 'Regular file without slash' | ||
| 'problematic-file.txt/' | true | 'Path with trailing slash is directory' | ||
| 'directory/' | true | 'Path with trailing slash is directory' | ||
| 'file.log/' | true | 'Path with trailing slash is directory' | ||
| 'path/to/file.dat/' | true | 'Path with trailing slash is directory' | ||
| '/' | true | 'Root slash is directory' | ||
| 'multiple///' | true | 'Path ending with slashes is directory' | ||
| 'has.extension.txt/' | true | 'Path with slash is directory regardless of extension' | ||
| 'log.2024-01-01.txt/' | true | 'Path with slash is directory regardless of extension' | ||
| } | ||
|
|
||
| def 'should validate directory detection for paths without slash'() { | ||
| given: | ||
| def mockClient = GroovyMock(BlobContainerClient) { | ||
| getBlobContainerName() >> 'my-container' | ||
| } | ||
|
|
||
| when: | ||
| def attrs = new AzFileAttributes(mockClient, 'some-directory-without-slash') | ||
|
|
||
| then: | ||
| attrs.isDirectory() == false | ||
| attrs.isRegularFile() | ||
| attrs.fileKey().endsWith('/some-directory-without-slash') | ||
| } | ||
|
|
||
| def 'should handle edge cases in directory detection'() { | ||
| given: | ||
| def mockClient = GroovyMock(BlobContainerClient) { | ||
| getBlobContainerName() >> 'test-container' | ||
| } | ||
|
|
||
| expect: | ||
| new AzFileAttributes(mockClient, 'regular-file/').isDirectory() == true | ||
| new AzFileAttributes(mockClient, 'file.txt/').isDirectory() == true | ||
| new AzFileAttributes(mockClient, '/').isDirectory() == true | ||
| new AzFileAttributes(mockClient, 'multiple///').isDirectory() == true | ||
| new AzFileAttributes(mockClient, 'no-slash').isDirectory() == false | ||
| } | ||
|
|
||
| def 'should verify equality and hashCode methods work correctly'() { | ||
| given: | ||
| def attrs1 = AzFileAttributes.root() | ||
| def attrs2 = AzFileAttributes.root() | ||
|
|
||
| when: | ||
| def equals1 = attrs1.equals(attrs2) | ||
| def hash1 = attrs1.hashCode() | ||
| def hash2 = attrs2.hashCode() | ||
|
|
||
| then: | ||
| equals1 == true | ||
| hash1 == hash2 | ||
| } | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.