Skip to content

Commit 06b9b48

Browse files
committed
GOSDK-32: System shutdown due to too many go routines when putting lots of files to BP. Removed done channel which was causing go routines in the consumer to be left open longer than necessary.
1 parent d98dc91 commit 06b9b48

File tree

9 files changed

+286
-198
lines changed

9 files changed

+286
-198
lines changed

ds3_integration/utils/testUtils.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ func DeleteBucketLogError(t *testing.T, client *ds3.Client, bucketName string) (
375375

376376
// Deletes the specified bucket and returns an error if one occurs
377377
func DeleteBucket(client *ds3.Client, bucketName string) (error) {
378-
deleteBucket, deleteErr := client.DeleteBucket(models.NewDeleteBucketRequest(bucketName))
378+
deleteBucket, deleteErr := client.DeleteBucketSpectraS3(models.NewDeleteBucketSpectraS3Request(bucketName).WithForce())
379379
if deleteErr != nil {
380380
return deleteErr
381381
}

helpers/conditionalBool.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package helpers
2+
3+
import "sync"
4+
5+
type NotifyBlobDone interface {
6+
// Waits for at least one done signal.
7+
Wait()
8+
9+
// Sends a done signal. Multiple signals have no additional effect.
10+
SignalDone()
11+
}
12+
13+
14+
func NewConditionalBool() *ConditionalBool {
15+
conditional :=sync.NewCond(&sync.Mutex{})
16+
return &ConditionalBool{
17+
conditional: *conditional,
18+
Done: false,
19+
}
20+
}
21+
22+
type ConditionalBool struct {
23+
conditional sync.Cond
24+
Done bool
25+
}
26+
27+
func (conditionalBool *ConditionalBool) Wait() {
28+
conditionalBool.conditional.L.Lock()
29+
// wait for a done signal to be received
30+
for !conditionalBool.Done {
31+
conditionalBool.conditional.Wait()
32+
}
33+
// reset done notifier
34+
conditionalBool.Done = false
35+
conditionalBool.conditional.L.Unlock()
36+
}
37+
38+
func (conditionalBool *ConditionalBool) SignalDone() {
39+
conditionalBool.conditional.L.Lock()
40+
conditionalBool.Done = true
41+
conditionalBool.conditional.Broadcast()
42+
conditionalBool.conditional.L.Unlock()
43+
}

helpers/consumer.go

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,50 +12,42 @@ type consumerImpl struct {
1212
queue *chan TransferOperation
1313
waitGroup *sync.WaitGroup
1414
maxConcurrentOperations uint
15-
blobDoneChannel chan<- struct{}
15+
16+
// Conditional value that gets triggered when a blob has finished being transferred
17+
doneNotifier NotifyBlobDone
1618
}
1719

18-
func newConsumer(queue *chan TransferOperation, blobDoneChannel chan<- struct{}, waitGroup *sync.WaitGroup, maxConcurrentOperations uint) Consumer {
20+
func newConsumer(queue *chan TransferOperation, waitGroup *sync.WaitGroup, maxConcurrentOperations uint, doneNotifier NotifyBlobDone) Consumer {
1921
return &consumerImpl{
2022
queue: queue,
2123
waitGroup: waitGroup,
2224
maxConcurrentOperations: maxConcurrentOperations,
23-
blobDoneChannel: blobDoneChannel,
25+
doneNotifier: doneNotifier,
2426
}
2527
}
2628

27-
func performTransfer(operation TransferOperation, semaphore *chan int, blobDoneChannel chan<- struct{}, jobWaitGroup *sync.WaitGroup, childWaitGroup *sync.WaitGroup) {
28-
defer func() {
29-
// per operation that finishes, send a done message to the producer
30-
blobDoneChannel <-struct {}{}
31-
jobWaitGroup.Done()
32-
childWaitGroup.Done()
33-
}()
29+
func performTransfer(operation TransferOperation, semaphore *chan int, waitGroup *sync.WaitGroup, doneNotifier NotifyBlobDone) {
30+
defer waitGroup.Done()
3431
operation()
32+
33+
// send done signal
34+
doneNotifier.SignalDone()
35+
3536
<- *semaphore
3637
}
3738

3839
func (consumer *consumerImpl) run() {
39-
// Defer closing the blob done channel. This will signal to the producer that it can shut down.
40-
defer func() {close(consumer.blobDoneChannel)}()
41-
4240
// semaphore for controlling max number of transfer operations in flight per job
4341
semaphore := make(chan int, consumer.maxConcurrentOperations + 1)
4442

45-
var childWaitGroup sync.WaitGroup
4643
for {
4744
nextOp, ok := <- *consumer.queue
4845
if ok {
4946
semaphore <- 1
50-
childWaitGroup.Add(1)
51-
go performTransfer(nextOp, &semaphore, consumer.blobDoneChannel, consumer.waitGroup, &childWaitGroup)
47+
go performTransfer(nextOp, &semaphore, consumer.waitGroup, consumer.doneNotifier)
5248
} else {
5349
consumer.waitGroup.Done()
54-
break
50+
return
5551
}
5652
}
57-
58-
// Wait for all child transfer operations to finish before shutting down.
59-
// This is to stop the done channel from being close prematurely
60-
childWaitGroup.Wait()
6153
}

helpers/consumer_test.go

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,24 +40,15 @@ func TestProducerConsumerModel(t *testing.T) {
4040

4141
queue := make(chan TransferOperation, 5)
4242

43-
// make the blob done channel larger than the number of transfer operations queued.
44-
blobDoneChannel := make(chan struct{}, numOperations+1)
43+
doneNotifier := NewConditionalBool()
4544

46-
consumer := newConsumer(&queue, blobDoneChannel, &wg, 5)
45+
consumer := newConsumer(&queue, &wg, 5, doneNotifier)
4746

4847
go producer(&queue)
4948
go consumer.run()
5049

5150
wg.Wait()
5251

5352
ds3Testing.AssertInt(t, "Executed Transfer Operations", numOperations, resultCount)
54-
55-
// verify that 10 done messages were sent
56-
ds3Testing.AssertInt(t, "Done signals sent", numOperations, len(blobDoneChannel))
57-
for len(blobDoneChannel) > 0 {
58-
_, ok := <-blobDoneChannel
59-
ds3Testing.AssertBool(t, "expected channel not to be closed", true, ok)
60-
}
61-
_, ok := <- blobDoneChannel
62-
ds3Testing.AssertBool(t, "expected channel to be closed", false, ok)
53+
ds3Testing.AssertBool(t, "received done notification", true, doneNotifier.Done)
6354
}

helpers/getProducer.go

Lines changed: 49 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,18 @@ type getProducer struct {
2424
rangeFinder ranges.BlobRangeFinder
2525
sdk_log.Logger
2626

27-
// Channel that represents blobs that have finished being process.
28-
// This will be written to once a get object operation has completed regardless of error or success.
29-
// This is used to notify the runner to re-check if any blobs are now ready to be retrieved.
30-
blobDoneChannel <-chan struct{}
31-
32-
// Used to track if we are done queuing blobs
33-
continueQueuingBlobs bool
27+
// Conditional value that gets triggered when a blob has finished being transferred
28+
doneNotifier NotifyBlobDone
3429
}
3530

3631
func newGetProducer(
3732
jobMasterObjectList *ds3Models.MasterObjectList,
3833
getObjects *[]helperModels.GetObject,
3934
queue *chan TransferOperation,
4035
strategy *ReadTransferStrategy,
41-
blobDoneChannel <-chan struct{},
4236
client *ds3.Client,
43-
waitGroup *sync.WaitGroup) *getProducer {
37+
waitGroup *sync.WaitGroup,
38+
doneNotifier NotifyBlobDone) *getProducer {
4439

4540
return &getProducer{
4641
JobMasterObjectList: jobMasterObjectList,
@@ -54,8 +49,7 @@ func newGetProducer(
5449
deferredBlobQueue: NewBlobDescriptionQueue(),
5550
rangeFinder: ranges.NewBlobRangeFinder(getObjects),
5651
Logger: client.Logger, //use the same logger as the client
57-
blobDoneChannel: blobDoneChannel,
58-
continueQueuingBlobs: true,
52+
doneNotifier: doneNotifier,
5953
}
6054
}
6155

@@ -75,15 +69,20 @@ func toReadObjectMap(getObjects *[]helperModels.GetObject) map[string]helperMode
7569
}
7670

7771
// Processes all the blobs in a chunk that are ready for transfer from BP
78-
func (producer *getProducer) processChunk(curChunk *ds3Models.Objects, bucketName string, jobId string) {
72+
// Returns the number of blobs queued for process
73+
func (producer *getProducer) processChunk(curChunk *ds3Models.Objects, bucketName string, jobId string) int {
7974
producer.Debugf("begin chunk processing %s", curChunk.ChunkId)
8075

76+
processedCount := 0
8177
// transfer blobs that are ready, and queue those that are waiting for channel
8278
for _, curObj := range curChunk.Objects {
8379
producer.Debugf("queuing object in waiting to be processed %s offset=%d length=%d", *curObj.Name, curObj.Offset, curObj.Length)
8480
blob := helperModels.NewBlobDescription(*curObj.Name, curObj.Offset, curObj.Length)
85-
producer.queueBlobForTransfer(&blob, bucketName, jobId)
81+
if producer.queueBlobForTransfer(&blob, bucketName, jobId) {
82+
processedCount++
83+
}
8684
}
85+
return processedCount
8786
}
8887

8988
// Information required to perform a get operation of a blob with BP as data source and channelBuilder as destination
@@ -174,9 +173,10 @@ func writeRangeToDestination(channelBuilder helperModels.WriteChannelBuilder, bl
174173

175174
// Attempts to transfer a single blob from the BP to the client. If the blob is not ready for transfer,
176175
// then it is added to the waiting to transfer queue
177-
func (producer *getProducer) queueBlobForTransfer(blob *helperModels.BlobDescription, bucketName string, jobId string) {
176+
// Returns whether or not the blob was queued for transfer
177+
func (producer *getProducer) queueBlobForTransfer(blob *helperModels.BlobDescription, bucketName string, jobId string) bool {
178178
if producer.processedBlobTracker.IsProcessed(*blob) {
179-
return
179+
return false // already been processed
180180
}
181181

182182
curReadObj := producer.readObjectMap[blob.Name()]
@@ -185,13 +185,13 @@ func (producer *getProducer) queueBlobForTransfer(blob *helperModels.BlobDescrip
185185
// a fatal error happened on a previous blob for this file, skip processing
186186
producer.Warningf("fatal error occurred while transferring previous blob on this file, skipping blob '%s' offset=%d length=%d", blob.Name(), blob.Offset(), blob.Length())
187187
producer.processedBlobTracker.MarkProcessed(*blob)
188-
return
188+
return false // not going to process
189189
}
190190

191191
if !curReadObj.ChannelBuilder.IsChannelAvailable(blob.Offset()) {
192192
producer.Debugf("channel is not currently available for getting blob '%s' offset=%d length=%d", blob.Name(), blob.Offset(), blob.Length())
193193
producer.deferredBlobQueue.Push(blob)
194-
return
194+
return false // not ready to be processed
195195
}
196196

197197
producer.Debugf("channel is available for getting blob '%s' offset=%d length=%d", blob.Name(), blob.Offset(), blob.Length())
@@ -212,11 +212,16 @@ func (producer *getProducer) queueBlobForTransfer(blob *helperModels.BlobDescrip
212212

213213
// Mark blob as processed
214214
producer.processedBlobTracker.MarkProcessed(*blob)
215+
216+
return true
215217
}
216218

217219
// Attempts to process all blobs whose channels were not available for transfer.
218220
// Blobs whose channels are still not available are placed back on the queue.
219-
func (producer *getProducer) processWaitingBlobs(bucketName string, jobId string) {
221+
// Returns the number of blobs queued for processing.
222+
func (producer *getProducer) processWaitingBlobs(bucketName string, jobId string) int {
223+
processedCount := 0
224+
220225
// attempt to process all blobs in waiting to be transferred
221226
waitingBlobs := producer.deferredBlobQueue.Size()
222227
for i := 0; i < waitingBlobs; i++ {
@@ -228,87 +233,54 @@ func (producer *getProducer) processWaitingBlobs(bucketName string, jobId string
228233
producer.Errorf("failure during blob transfer '%s' at offset %d: %s", curBlob.Name(), curBlob.Offset(), err.Error())
229234
break
230235
}
231-
producer.queueBlobForTransfer(curBlob, bucketName, jobId)
236+
if producer.queueBlobForTransfer(curBlob, bucketName, jobId) {
237+
processedCount++
238+
}
232239
}
240+
return processedCount
233241
}
234242

235243
// This initiates the production of the transfer operations which will be consumed by a consumer running in a separate go routine.
236244
// Each transfer operation will retrieve one blob of content from the BP.
237245
// Once all blobs have been queued to be transferred, the producer will finish, even if all operations have not been consumed yet.
238246
func (producer *getProducer) run() error {
247+
defer close(*producer.queue)
248+
239249
// determine number of blobs to be processed
240250
var totalBlobCount int64 = producer.totalBlobCount()
241251
producer.Debugf("job status totalBlobs=%d processedBlobs=%d", totalBlobCount, producer.processedBlobTracker.NumberOfProcessedBlobs())
242252

243-
// initiate first set of blob transfers
244-
err := producer.queueBlobsReadyForTransfer(totalBlobCount)
245-
if err != nil {
246-
return err
247-
}
253+
// process all chunks and make sure all blobs are queued for transfer
254+
for producer.hasMoreToProcess(totalBlobCount) {
255+
processedCount, err := producer.queueBlobsReadyForTransfer(totalBlobCount)
256+
if err != nil {
257+
return err
258+
}
248259

249-
// wait for either a timer or for at least one blob to finish before attempting to queue more items for transfer
250-
ticker := time.NewTicker(producer.strategy.BlobStrategy.delay())
251-
var fatalErr error
252-
for {
253-
select {
254-
case _, ok := <- producer.blobDoneChannel:
255-
if ok {
256-
// reset the timer
257-
ticker.Stop()
258-
ticker = time.NewTicker(producer.strategy.BlobStrategy.delay())
259-
260-
err = producer.queueBlobsReadyForTransfer(totalBlobCount)
261-
if err != nil {
262-
// A fatal error has occurred, stop queuing blobs for processing and
263-
// close processing queue to signal consumer we won't be sending any more blobs.
264-
producer.continueQueuingBlobs = false
265-
fatalErr = err
266-
close(*producer.queue)
267-
}
268-
} else {
269-
// The consumer closed the channel, signaling completion.
270-
return fatalErr
271-
}
272-
case <- ticker.C:
273-
err = producer.queueBlobsReadyForTransfer(totalBlobCount)
274-
if err != nil {
275-
// A fatal error has occurred, stop queuing blobs for processing and
276-
// close processing queue to signal consumer we won't be sending any more blobs.
277-
producer.continueQueuingBlobs = false
278-
fatalErr = err
279-
close(*producer.queue)
280-
}
260+
// If the last operation processed blobs, then wait for something to finish
261+
if processedCount > 0 {
262+
producer.doneNotifier.Wait()
263+
} else if producer.hasMoreToProcess(totalBlobCount) {
264+
// nothing could be processed, cache is probably full, wait a bit before trying again
265+
time.Sleep(producer.strategy.BlobStrategy.delay())
281266
}
282267
}
283-
return fatalErr
268+
return nil
284269
}
285270

286271
func (producer *getProducer) hasMoreToProcess(totalBlobCount int64) bool {
287272
return producer.processedBlobTracker.NumberOfProcessedBlobs() < totalBlobCount || producer.deferredBlobQueue.Size() > 0
288273
}
289274

290-
func (producer *getProducer) queueBlobsReadyForTransfer(totalBlobCount int64) error {
291-
if !producer.continueQueuingBlobs {
292-
// We've queued up all the blobs we are going to for this job.
293-
return nil
294-
}
295-
296-
// check if there is anything left to be queued
297-
if !producer.hasMoreToProcess(totalBlobCount) {
298-
// Everything has been queued for processing.
299-
producer.continueQueuingBlobs = false
300-
// close processing queue to signal consumer we won't be sending any more blobs.
301-
close(*producer.queue)
302-
return nil
303-
}
304-
275+
// Returns the number of blobs that have been queued for transfer
276+
func (producer *getProducer) queueBlobsReadyForTransfer(totalBlobCount int64) (int, error) {
305277
// Attempt to transfer waiting blobs
306-
producer.processWaitingBlobs(*producer.JobMasterObjectList.BucketName, producer.JobMasterObjectList.JobId)
278+
processedCount := producer.processWaitingBlobs(*producer.JobMasterObjectList.BucketName, producer.JobMasterObjectList.JobId)
307279

308280
// Check if we need to query the BP for allocated blobs, or if we already know everything is allocated.
309281
if int64(producer.deferredBlobQueue.Size()) + producer.processedBlobTracker.NumberOfProcessedBlobs() >= totalBlobCount {
310282
// Everything is already allocated, no need to query BP for allocated chunks
311-
return nil
283+
return processedCount, nil
312284
}
313285

314286
// Get the list of available chunks that the server can receive. The server may
@@ -318,7 +290,7 @@ func (producer *getProducer) queueBlobsReadyForTransfer(totalBlobCount int64) er
318290
chunksReadyResponse, err := producer.client.GetJobChunksReadyForClientProcessingSpectraS3(chunksReady)
319291
if err != nil {
320292
producer.Errorf("unrecoverable error: %v", err)
321-
return err
293+
return processedCount, err
322294
}
323295

324296
// Check to see if any chunks can be processed
@@ -327,14 +299,10 @@ func (producer *getProducer) queueBlobsReadyForTransfer(totalBlobCount int64) er
327299
// Loop through all the chunks that are available for processing, and send
328300
// the files that are contained within them.
329301
for _, curChunk := range chunksReadyResponse.MasterObjectList.Objects {
330-
producer.processChunk(&curChunk, *chunksReadyResponse.MasterObjectList.BucketName, chunksReadyResponse.MasterObjectList.JobId)
302+
processedCount += producer.processChunk(&curChunk, *chunksReadyResponse.MasterObjectList.BucketName, chunksReadyResponse.MasterObjectList.JobId)
331303
}
332-
} else {
333-
// When no chunks are returned we need to sleep to allow for cache space to
334-
// be freed.
335-
producer.strategy.BlobStrategy.delay()
336304
}
337-
return nil
305+
return processedCount, nil
338306
}
339307

340308
// Determines the number of blobs to be transferred.

0 commit comments

Comments
 (0)