Support concurrent multipart uploads in Azure #128449
Changes from 4 commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 128449 | ||
| summary: "[Draft] Support concurrent multipart uploads in Azure" | ||
| area: Snapshot/Restore | ||
| type: enhancement | ||
| issues: [] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,8 @@ | |
| import com.azure.core.http.HttpMethod; | ||
| import com.azure.core.http.rest.ResponseBase; | ||
| import com.azure.core.util.BinaryData; | ||
| import com.azure.core.util.FluxUtil; | ||
| import com.azure.core.util.logging.ClientLogger; | ||
| import com.azure.storage.blob.BlobAsyncClient; | ||
| import com.azure.storage.blob.BlobClient; | ||
| import com.azure.storage.blob.BlobContainerAsyncClient; | ||
|
|
@@ -48,6 +50,7 @@ | |
| import org.apache.logging.log4j.Logger; | ||
| import org.apache.logging.log4j.core.util.Throwables; | ||
| import org.elasticsearch.cluster.metadata.RepositoryMetadata; | ||
| import org.elasticsearch.common.CheckedBiFunction; | ||
| import org.elasticsearch.common.UUIDs; | ||
| import org.elasticsearch.common.blobstore.BlobContainer; | ||
| import org.elasticsearch.common.blobstore.BlobPath; | ||
|
|
@@ -64,6 +67,7 @@ | |
| import org.elasticsearch.common.unit.ByteSizeValue; | ||
| import org.elasticsearch.common.util.BigArrays; | ||
| import org.elasticsearch.core.CheckedConsumer; | ||
| import org.elasticsearch.core.IOUtils; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.core.Tuple; | ||
| import org.elasticsearch.repositories.RepositoriesMetrics; | ||
|
|
@@ -121,6 +125,7 @@ public class AzureBlobStore implements BlobStore { | |
| private final ByteSizeValue maxSinglePartUploadSize; | ||
| private final int deletionBatchSize; | ||
| private final int maxConcurrentBatchDeletes; | ||
| private final int multipartUploadMaxConcurrency; | ||
|
|
||
| private final RequestMetricsRecorder requestMetricsRecorder; | ||
| private final AzureClientProvider.RequestMetricsHandler requestMetricsHandler; | ||
|
|
@@ -142,6 +147,7 @@ public AzureBlobStore( | |
| this.maxSinglePartUploadSize = Repository.MAX_SINGLE_PART_UPLOAD_SIZE_SETTING.get(metadata.settings()); | ||
| this.deletionBatchSize = Repository.DELETION_BATCH_SIZE_SETTING.get(metadata.settings()); | ||
| this.maxConcurrentBatchDeletes = Repository.MAX_CONCURRENT_BATCH_DELETES_SETTING.get(metadata.settings()); | ||
| this.multipartUploadMaxConcurrency = service.getMultipartUploadMaxConcurrency(); | ||
|
|
||
| List<RequestMatcher> requestMatchers = List.of( | ||
| new RequestMatcher((httpMethod, url) -> httpMethod == HttpMethod.HEAD, Operation.GET_BLOB_PROPERTIES), | ||
|
|
@@ -464,6 +470,136 @@ protected void onFailure() { | |
| } | ||
| } | ||
|
|
||
| void writeBlobAtomic( | ||
| final OperationPurpose purpose, | ||
| final String blobName, | ||
| final long blobSize, | ||
| final CheckedBiFunction<Long, Long, InputStream, IOException> provider, | ||
| final boolean failIfAlreadyExists | ||
| ) throws IOException { | ||
| try { | ||
| final List<MultiPart> multiParts; | ||
| if (blobSize <= getLargeBlobThresholdInBytes()) { | ||
| multiParts = null; | ||
| } else { | ||
| multiParts = computeMultiParts(blobSize, getUploadBlockSize()); | ||
| } | ||
| if (multiParts == null || multiParts.size() == 1) { | ||
| logger.debug("{}: uploading blob of size [{}] as single upload", blobName, blobSize); | ||
| try (var stream = provider.apply(0L, blobSize)) { | ||
| var flux = convertStreamToByteBuffer(stream, blobSize, DEFAULT_UPLOAD_BUFFERS_SIZE); | ||
| executeSingleUpload(purpose, blobName, flux, blobSize, failIfAlreadyExists); | ||
| } | ||
| } else { | ||
| logger.debug("{}: uploading blob of size [{}] using [{}] parts", blobName, blobSize, multiParts.size()); | ||
| assert blobSize == ((multiParts.size() - 1) * getUploadBlockSize()) + multiParts.getLast().blockSize(); | ||
| assert multiParts.size() > 1; | ||
|
|
||
| final var asyncClient = asyncClient(purpose).getBlobContainerAsyncClient(container) | ||
| .getBlobAsyncClient(blobName) | ||
| .getBlockBlobAsyncClient(); | ||
|
|
||
| Flux.fromIterable(multiParts) | ||
| .flatMapSequential(multipart -> stageBlock(asyncClient, blobName, multipart, provider), multipartUploadMaxConcurrency) | ||
| .collect(Collectors.toList()) | ||
| .flatMap(blockIds -> { | ||
| logger.debug("{}: all {} parts uploaded, now committing", blobName, multiParts.size()); | ||
| var response = asyncClient.commitBlockList( | ||
| multiParts.stream().map(MultiPart::blockId).toList(), | ||
| failIfAlreadyExists == false | ||
| ); | ||
| logger.debug("{}: all {} parts committed", blobName, multiParts.size()); | ||
| return response; | ||
| }) | ||
| .block(); | ||
| } | ||
| } catch (final BlobStorageException e) { | ||
| if (failIfAlreadyExists | ||
| && e.getStatusCode() == HttpURLConnection.HTTP_CONFLICT | ||
| && BlobErrorCode.BLOB_ALREADY_EXISTS.equals(e.getErrorCode())) { | ||
| throw new FileAlreadyExistsException(blobName, null, e.getMessage()); | ||
| } | ||
| throw new IOException("Unable to write blob " + blobName, e); | ||
| } catch (Exception e) { | ||
| throw new IOException("Unable to write blob " + blobName, e); | ||
| } | ||
| } | ||
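For readers unfamiliar with the operator used above: `flatMapSequential` runs up to `multipartUploadMaxConcurrency` stage operations at once but re-emits their results in source order, so the downstream collect sees the block ids in part order even when later parts finish first. A minimal standalone sketch of that behaviour (not part of this change; it only assumes reactor-core on the classpath):

```java
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Schedulers;

import java.util.List;

public class FlatMapSequentialDemo {
    public static void main(String[] args) {
        int concurrency = 4; // plays the role of multipartUploadMaxConcurrency
        List<Integer> parts = Flux.range(0, 10)
            .flatMapSequential(
                part -> Mono.fromCallable(() -> {
                    Thread.sleep((10 - part) * 10L); // later parts finish first
                    return part;
                }).subscribeOn(Schedulers.boundedElastic()),
                concurrency)
            .collectList()
            .block();
        System.out.println(parts); // [0, 1, 2, ..., 9] despite out-of-order completion
    }
}
```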
|
|
||
| private record MultiPart(int part, String blockId, long blockOffset, long blockSize, boolean isLast) {} | ||
|
|
||
| private static List<MultiPart> computeMultiParts(long totalSize, long partSize) { | ||
| if (partSize <= 0) { | ||
| throw new IllegalArgumentException("Part size must be greater than zero"); | ||
| } | ||
| if ((totalSize == 0L) || (totalSize <= partSize)) { | ||
| return List.of(new MultiPart(0, makeMultipartBlockId(), 0L, totalSize, true)); | ||
| } | ||
|
|
||
| long lastPartSize = totalSize % partSize; | ||
| int parts = Math.toIntExact(totalSize / partSize) + (0L < lastPartSize ? 1 : 0); | ||
|
|
||
| long blockOffset = 0L; | ||
| var list = new ArrayList<MultiPart>(parts); | ||
| for (int p = 0; p < parts; p++) { | ||
| boolean isLast = (p == parts - 1); | ||
| var multipart = new MultiPart(p, makeMultipartBlockId(), blockOffset, isLast ? lastPartSize : partSize, isLast); | ||
|
||
| blockOffset += multipart.blockSize(); | ||
| list.add(multipart); | ||
| } | ||
| return List.copyOf(list); | ||
| } | ||
|
|
||
| private static Mono<String> stageBlock( | ||
| BlockBlobAsyncClient asyncClient, | ||
| String blobName, | ||
| MultiPart multiPart, | ||
| CheckedBiFunction<Long, Long, InputStream, IOException> provider | ||
| ) { | ||
| logger.debug( | ||
| "{}: staging part [{}] of size [{}] from offset [{}]", | ||
| blobName, | ||
| multiPart.part(), | ||
| multiPart.blockSize(), | ||
| multiPart.blockOffset() | ||
| ); | ||
| try { | ||
| var stream = toSynchronizedInputStream(blobName, provider.apply(multiPart.blockOffset(), multiPart.blockSize()), multiPart); | ||
| boolean success = false; | ||
| try { | ||
| var stageBlock = asyncClient.stageBlock( | ||
| multiPart.blockId(), | ||
| toFlux(stream, multiPart.blockSize(), DEFAULT_UPLOAD_BUFFERS_SIZE), | ||
| multiPart.blockSize() | ||
| ).doOnSuccess(unused -> { | ||
| logger.debug(() -> format("%s: part [%s] of size [%s] uploaded", blobName, multiPart.part(), multiPart.blockSize())); | ||
| IOUtils.closeWhileHandlingException(stream); | ||
| }).doOnCancel(() -> { | ||
| logger.warn(() -> format("%s: part [%s] of size [%s] cancelled", blobName, multiPart.part(), multiPart.blockSize())); | ||
| IOUtils.closeWhileHandlingException(stream); | ||
| }).doOnError(t -> { | ||
| logger.error(() -> format("%s: part [%s] of size [%s] failed", blobName, multiPart.part(), multiPart.blockSize()), t); | ||
| IOUtils.closeWhileHandlingException(stream); | ||
| }); | ||
| logger.debug( | ||
| "{}: part [{}] of size [{}] from offset [{}] staged", | ||
| blobName, | ||
| multiPart.part(), | ||
| multiPart.blockSize(), | ||
| multiPart.blockOffset() | ||
| ); | ||
| success = true; | ||
| return stageBlock.map(unused -> multiPart.blockId()); | ||
| } finally { | ||
| if (success != true) { | ||
| IOUtils.close(stream); | ||
| } | ||
| } | ||
| } catch (IOException e) { | ||
| logger.error(() -> format("%s: failed to stage part [%s] of size [%s]", blobName, multiPart.part(), multiPart.blockSize()), e); | ||
| return FluxUtil.monoError(new ClientLogger(AzureBlobStore.class), new UncheckedIOException(e)); | ||
| } | ||
| } | ||
|
|
||
| public void writeBlob(OperationPurpose purpose, String blobName, InputStream inputStream, long blobSize, boolean failIfAlreadyExists) | ||
| throws IOException { | ||
| assert inputStream.markSupported() | ||
|
|
@@ -625,6 +761,118 @@ public synchronized int read() throws IOException { | |
| // we read the input stream (i.e. when it's rate limited) | ||
| } | ||
|
|
||
| private static InputStream toSynchronizedInputStream(String blobName, InputStream delegate, MultiPart multipart) { | ||
| assert delegate.markSupported() : "An InputStream with mark support was expected"; | ||
| // We need to introduce a read barrier in order to provide visibility for the underlying | ||
|
Contributor
Let us leave this for now, but it seems strange to need this synchronized part really, since we expect it to be used serially only anyway - and this must require a happens-before relationship established in reactor code even if this is used across threads.
Member
Author
I commented #128449 (comment) |
||
| // input stream state as the input stream can be read from different threads. | ||
| // TODO See if this is still needed | ||
| return new FilterInputStream(delegate) { | ||
|
|
||
| private final boolean isTraceEnabled = logger.isTraceEnabled(); | ||
|
|
||
| @Override | ||
| public synchronized int read(byte[] b, int off, int len) throws IOException { | ||
| var result = super.read(b, off, len); | ||
| if (isTraceEnabled) { | ||
| logger.trace("{} reads {} bytes from {} part {}", Thread.currentThread(), result, blobName, multipart.part()); | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| @Override | ||
| public synchronized int read() throws IOException { | ||
| var result = super.read(); | ||
| if (isTraceEnabled) { | ||
| logger.trace("{} reads {} byte from {} part {}", Thread.currentThread(), result, blobName, multipart.part()); | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| @Override | ||
| public synchronized void mark(int readlimit) { | ||
| if (isTraceEnabled) { | ||
| logger.trace("{} marks stream {} part {}", Thread.currentThread(), blobName, multipart.part()); | ||
| } | ||
| super.mark(readlimit); | ||
| } | ||
|
|
||
| @Override | ||
| public synchronized void reset() throws IOException { | ||
| if (isTraceEnabled) { | ||
| logger.trace("{} resets stream {} part {}", Thread.currentThread(), blobName, multipart.part()); | ||
| } | ||
| super.reset(); | ||
| } | ||
|
|
||
| @Override | ||
| public synchronized void close() throws IOException { | ||
| if (isTraceEnabled) { | ||
| logger.trace("{} closes stream {} part {}", Thread.currentThread(), blobName, multipart.part()); | ||
| } | ||
| super.close(); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return blobName + " part [" + multipart.part() + "] of size [" + multipart.blockSize() + ']'; | ||
| } | ||
| }; | ||
| } | ||
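Regarding the read barrier discussed above: the stream backing a part may be reset and re-read when a request is retried, and each (re)subscription can land on a different worker thread. A self-contained sketch of that thread hand-off (illustrative only, not the PR's actual retry path; it assumes reactor-core and prints which thread touches the shared stream on each attempt):

```java
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Schedulers;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicInteger;

public class RetryThreadHopDemo {
    public static void main(String[] args) {
        InputStream stream = new ByteArrayInputStream(new byte[16]);
        stream.mark(Integer.MAX_VALUE);
        AtomicInteger attempts = new AtomicInteger();
        String result = Flux.defer(() -> {
            int attempt = attempts.incrementAndGet();
            System.out.println("attempt " + attempt + " reads on " + Thread.currentThread().getName());
            try {
                stream.reset();            // shared, mutable stream state
                stream.read(new byte[16]); // read on whichever worker runs this attempt
            } catch (Exception e) {
                return Flux.<String>error(e);
            }
            return attempt < 3 ? Flux.<String>error(new RuntimeException("transient")) : Flux.just("done");
        })
            .subscribeOn(Schedulers.boundedElastic()) // each (re)subscription is scheduled here
            .retry(2)                                 // re-subscribes the source on failure
            .blockLast();
        System.out.println(result);
    }
}
```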
|
|
||
| private static Flux<ByteBuffer> toFlux(InputStream stream, long length, int chunkSize) { | ||
| assert stream.markSupported() : "An InputStream with mark support was expected"; | ||
| // We need to mark the InputStream as it's possible that we need to retry for the same chunk | ||
| stream.mark(Integer.MAX_VALUE); | ||
| return Flux.defer(() -> { | ||
| // TODO Code in this Flux.defer() can be concurrently executed by multiple threads? | ||
| try { | ||
| stream.reset(); | ||
| } catch (IOException e) { | ||
| throw new RuntimeException(e); | ||
| } | ||
| final var bytesRead = new AtomicLong(0L); | ||
| // This flux is subscribed by a downstream operator that finally queues the | ||
| // buffers into netty output queue. Sadly we are not able to get a signal once | ||
| // the buffer has been flushed, so we have to allocate those and let the GC | ||
| // reclaim them (see MonoSendMany). Additionally, that very same operator requests | ||
| // 128 elements (that's hardcoded) once it's subscribed (later on, it requests | ||
| // by 64 elements), that's why we provide 64kb buffers. | ||
|
|
||
| // length is at most 100MB so it's safe to cast back to an integer in this case | ||
| final int parts = (int) length / chunkSize; | ||
| final long remaining = length % chunkSize; | ||
| return Flux.range(0, remaining == 0 ? parts : parts + 1).map(i -> i * chunkSize).concatMap(pos -> Mono.fromCallable(() -> { | ||
| long count = pos + chunkSize > length ? length - pos : chunkSize; | ||
| int numOfBytesRead = 0; | ||
| int offset = 0; | ||
| int len = (int) count; | ||
| final byte[] buffer = new byte[len]; | ||
| while (numOfBytesRead != -1 && offset < count) { | ||
| numOfBytesRead = stream.read(buffer, offset, len); | ||
| offset += numOfBytesRead; | ||
| len -= numOfBytesRead; | ||
| if (numOfBytesRead != -1) { | ||
| bytesRead.addAndGet(numOfBytesRead); | ||
| } | ||
| } | ||
| if (numOfBytesRead == -1 && bytesRead.get() < length) { | ||
| throw new IllegalStateException( | ||
| format("Input stream [%s] emitted %d bytes, less than the expected %d bytes.", stream, bytesRead, length) | ||
| ); | ||
| } | ||
| return ByteBuffer.wrap(buffer); | ||
| })).doOnComplete(() -> { | ||
| if (bytesRead.get() > length) { | ||
| throw new IllegalStateException( | ||
| format("Input stream [%s] emitted %d bytes, more than the expected %d bytes.", stream, bytesRead, length) | ||
| ); | ||
| } | ||
| }); | ||
| // We need to subscribe on a different scheduler to avoid blocking the io threads when we read the input stream | ||
| }).subscribeOn(Schedulers.elastic()); | ||
|
|
||
| } | ||
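On the final `subscribeOn` above: it is what keeps the blocking `InputStream` reads off the subscribing (reactor-netty IO) thread. Note that newer reactor-core releases deprecate `Schedulers.elastic()` in favour of the capped `Schedulers.boundedElastic()`, depending on the Reactor version bundled with the Azure SDK. A tiny standalone equivalent of the hand-off (illustrative only):

```java
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Schedulers;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

public class OffloadBlockingReadDemo {
    public static void main(String[] args) {
        InputStream stream = new ByteArrayInputStream("payload".getBytes());
        System.out.println("subscribing on " + Thread.currentThread().getName());
        Flux.defer(() -> {
                try {
                    // the blocking read runs on the scheduler chosen below,
                    // not on the thread that subscribed
                    System.out.println("reading on " + Thread.currentThread().getName());
                    return Flux.just(ByteBuffer.wrap(stream.readAllBytes()));
                } catch (IOException e) {
                    return Flux.<ByteBuffer>error(e);
                }
            })
            .subscribeOn(Schedulers.boundedElastic())
            .blockLast();
    }
}
```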
|
|
||
| /** | ||
| * Returns the number of parts of size {@code partSize} needed to reach {@code totalSize}, | ||
| * along with the size of the last (or only) part. | ||
|
|
||
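A quick worked example of the sizing rule described in the javadoc above (the values are illustrative, not taken from the PR):

```java
public class MultipartSizingExample {
    public static void main(String[] args) {
        long totalSize = 10L * 1024 * 1024; // 10 MiB blob
        long partSize = 4L * 1024 * 1024;   // 4 MiB upload block size
        long lastPartSize = totalSize % partSize;                                        // 2 MiB
        int parts = Math.toIntExact(totalSize / partSize) + (lastPartSize > 0L ? 1 : 0); // 2 + 1 = 3
        System.out.println(parts + " parts, last part of " + lastPartSize + " bytes");
        // -> 3 parts at offsets 0, 4 MiB and 8 MiB, with sizes 4 MiB, 4 MiB and 2 MiB
    }
}
```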
Can be follow-up for sure, but I guess we may need some cleanup to deal with failures, deleting the staged blocks? I notice that the original version also does not seem to care so we can defer for now.
Yes, there is more work to do for handling failures. I'll create a task for this if we decide to go this route.
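On the cleanup question above, one possible follow-up shape (purely a sketch, not part of this PR) would be to hang an error hook off the existing pipeline from `writeBlobAtomic`. The `cleanupStagedBlocks` helper below is hypothetical; the Blob service exposes no call for deleting individual uncommitted blocks, and blocks that are never committed are eventually discarded by the service, so such a hook might only log the orphaned block ids and rethrow:

```java
// Sketch only: same pipeline as writeBlobAtomic above, with a hypothetical cleanup hook.
Flux.fromIterable(multiParts)
    .flatMapSequential(part -> stageBlock(asyncClient, blobName, part, provider), multipartUploadMaxConcurrency)
    .collect(Collectors.toList())
    .flatMap(blockIds -> asyncClient.commitBlockList(blockIds, failIfAlreadyExists == false))
    .onErrorResume(t -> cleanupStagedBlocks(blobName, multiParts) // hypothetical helper, e.g. log the staged block ids
        .then(Mono.error(t)))                                     // always propagate the original failure
    .block();
```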