
Commit a3186f6

Merge pull request #665 from clearlydefined/fix/azure-blob-upload-race-condition
Fix Azure blob storage race condition with size-based upload
2 parents 6ee3f9b + 59163a2

1 file changed: +16 -5 lines changed

ghcrawler/providers/storage/storageDocStore.js

Lines changed: 16 additions & 5 deletions
@@ -4,7 +4,6 @@
 // eslint-disable-next-line no-unused-vars
 const { ContainerClient } = require('@azure/storage-blob')
 const memoryCache = require('memory-cache')
-const { Readable } = require('stream')
 const URL = require('url')
 
 class AzureStorageDocStore {
@@ -41,11 +40,23 @@ class AzureStorageDocStore {
       blobMetadata.extra = JSON.stringify(document._metadata.extra)
     }
     const options = { metadata: blobMetadata, blobHTTPHeaders: { blobContentType: 'application/json' } }
-    const dataStream = new Readable()
-    dataStream.push(JSON.stringify(document))
-    dataStream.push(null)
+    const data = JSON.stringify(document)
     const blockBlobClient = this.containerClient.getBlockBlobClient(blobName)
-    await blockBlobClient.uploadStream(dataStream, 8 << 20, 5, options)
+
+    // Use streaming for large documents (>100MB), direct upload for small
+    const SIZE_THRESHOLD = 100 * 1024 * 1024
+
+    if (data.length > SIZE_THRESHOLD) {
+      // Large documents: use streaming (note: still has multi-instance race condition risk)
+      const { Readable } = require('stream')
+      const dataStream = new Readable()
+      dataStream.push(data)
+      dataStream.push(null)
+      await blockBlobClient.uploadStream(dataStream, 8 << 20, 5, options)
+    } else {
+      // Small documents: atomic upload (eliminates race conditions)
+      await blockBlobClient.upload(data, data.length, options)
+    }
     return blobName
   }
 
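For context, the resulting upload path can be sketched as a standalone script against @azure/storage-blob v12. This is a minimal sketch, not ClearlyDefined's actual module: the storeDocument helper, the container name, the blob name, and the connection-string lookup are invented for illustration; only the 100MB threshold, the 8MB/5-concurrency streaming parameters, and the upload/uploadStream split mirror the commit.

// Minimal sketch of the size-based upload strategy, assuming @azure/storage-blob v12.
// Helper name, container name, and usage are illustrative, not ClearlyDefined code.
const { BlobServiceClient } = require('@azure/storage-blob')
const { Readable } = require('stream')

const SIZE_THRESHOLD = 100 * 1024 * 1024 // 100MB, as in the commit

async function storeDocument(containerClient, blobName, document) {
  const data = JSON.stringify(document)
  const options = { blobHTTPHeaders: { blobContentType: 'application/json' } }
  const blockBlobClient = containerClient.getBlockBlobClient(blobName)
  if (data.length > SIZE_THRESHOLD) {
    // Streamed upload: 8MB blocks, up to 5 in flight. Blocks are staged and
    // committed in separate requests, so two writers racing on the same blob
    // can still interleave; this is the residual risk the commit comment notes.
    await blockBlobClient.uploadStream(Readable.from([data]), 8 << 20, 5, options)
  } else {
    // Single-shot upload: one Put Blob request that the service applies
    // atomically, so concurrent writers last-write-win instead of corrupting
    // the blob. Buffer.byteLength gives the byte count, which can exceed
    // data.length for non-ASCII JSON.
    await blockBlobClient.upload(data, Buffer.byteLength(data), options)
  }
  return blobName
}

// Hypothetical usage; AZURE_STORAGE_CONNECTION_STRING is assumed to be set.
async function main() {
  const service = BlobServiceClient.fromConnectionString(process.env.AZURE_STORAGE_CONNECTION_STRING)
  const container = service.getContainerClient('documents')
  await storeDocument(container, 'example-document.json', { _metadata: {}, hello: 'world' })
}

main().catch(console.error)

The tradeoff behind the split: upload issues a single request, which is atomic but subject to a per-request size cap on the service side, while uploadStream stages and commits blocks across multiple requests, scaling to larger payloads at the cost of a non-atomic commit window.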
