15 | 15 |
16 | 16 | from ds3 import ds3 |
17 | 17 |
18 | | -client = ds3.createClientFromEnv() |
| 18 | +# This example retrieves all objects in the specified bucket and lands them in the specified destination.
| 19 | +# By default it looks for objects in the bucket 'books' and lands them in the system temporary directory.
| 20 | +# At the end of the run, the retrieved files are removed from the local system so the example can be re-run for testing.
| 21 | +#
| 22 | +# This example assumes that a bucket named "books" containing some objects exists on the server.
19 | 23 |
20 | | -bucketName = "books" |
21 | | -# this example assumes that a bucket named "books" containing some objects exist on the server |
| 24 | +bucketName = "books" # modify this value to match the BP bucket you wish to retrieve objects from |
22 | 25 |
| 26 | +destination = tempfile.gettempdir() # modify this value to match where the objects should be landed on your system
| 27 | + |
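| 28 | +# createClientFromEnv builds the client from environment variables; with the ds3_python_sdk
| 29 | +# these are typically DS3_ACCESS_KEY, DS3_SECRET_KEY, and DS3_ENDPOINT (check the README of
| 30 | +# your SDK version if the names differ).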
| 28 | +client = ds3.createClientFromEnv() |
| 29 | + |
| 30 | +# Retrieve a list of all objects in the bucket.
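| 31 | +# Note: like S3, GetBucket pages its results (commonly up to 1000 keys per response).
| 32 | +# For buckets with more objects than one page, you would loop while the response reports
| 33 | +# IsTruncated, passing the last returned key as the marker on the next GetBucketRequest;
| 34 | +# this simple example assumes a single page is enough.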
23 | 31 | bucketContents = client.get_bucket(ds3.GetBucketRequest(bucketName)) |
24 | 32 |
| 33 | +# Convert the bucket contents into a list of objects to retrieve.
| 34 | +# If you want to retrieve a subset of objects, or already know their names, then just build a list of ds3.Ds3GetObject
| 35 | +# instances, where each item describes one object you wish to retrieve from the BP.
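| 36 | +# For example, with hypothetical object names:
| 37 | +#   objectList = [ds3.Ds3GetObject("beowulf.txt"), ds3.Ds3GetObject("folder/ulysses.txt")]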
25 | 36 | objectList = list([ds3.Ds3GetObject(obj['Key']) for obj in bucketContents.result['ContentsList']]) |
| 37 | + |
| 38 | +# Create a dictionary to map each BP object name to the destination where you're landing the object.
| 39 | +# In this example, we are landing all objects to the path described in the destination variable.
| 40 | +# Also, if the object name contains path separators, this will normalize them for your OS and land
| 41 | +# that object in a sub-folder of the destination.
| 42 | +objectNameToDestinationPathMap = {} |
| 43 | +for obj in objectList:
| 44 | +    destinationPath = os.path.join(destination, os.path.normpath(obj.name))
| 45 | +    # Create any sub-folders implied by the object name so the file can be opened later.
| 46 | +    os.makedirs(os.path.dirname(destinationPath), exist_ok=True)
| 47 | +    objectNameToDestinationPathMap[obj.name] = destinationPath
| 45 | + |
| 48 | +# Create a bulk get job on the BP. This tells the BP which objects you're going to retrieve.
| 49 | +# This triggers the BP to start staging the objects in cache.
| 50 | +# Large objects may have been broken up into several pieces, i.e. blobs.
| 51 | +# The BP breaks up your retrieval job into "chunks".
| 52 | +# These chunks represent bundles of data that are ready to be retrieved.
| 53 | +# Each chunk will contain one or more pieces of your files (blobs).
| 54 | +# How the job will be broken up (chunked) is determined when you create the bulk get job.
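| 55 | +# By default chunks may become ready in any order, which is what this example handles; if your
| 56 | +# application needs in-order delivery, the bulk get request can also ask for a chunk ordering
| 57 | +# guarantee (see chunkClientProcessingOrderGuarantee in the DS3 API documentation).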
26 | 53 | bulkGetResult = client.get_bulk_job_spectra_s3(ds3.GetBulkJobSpectraS3Request(bucketName, objectList)) |
27 | 54 |
28 | | -# create a set of the chunk ids which will be used to track |
29 | | -# what chunks have not been retrieved |
| 55 | +# Create a set of the chunk ids that describe all units of work that make up the get job. |
| 56 | +# This will be used to track which chunks we still need to process. |
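| 57 | +# (Each entry in bulkGetResult.result['ObjectsList'] describes one chunk: its ChunkId plus an
| 58 | +# ObjectList naming the blobs it contains, each with its Name and Offset.)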
30 | 57 | chunkIds = set([x['ChunkId'] for x in bulkGetResult.result['ObjectsList']]) |
31 | 58 |
32 | | -# create a dictionary to map our retrieved objects to temporary files |
33 | | -# if you want to keep the retreived files on disk, this is not necessary |
34 | | -tempFiles = {} |
35 | | - |
36 | | -# while we still have chunks to retrieve |
| 59 | +# Attempt to retrieve data from the BP while there are still chunks that need to be processed. |
37 | 60 | while len(chunkIds) > 0: |
38 | | - # get a list of the available chunks that we can get |
| 61 | + # Get a list of chunks for this job that are ready to be retrieved. |
39 | 62 | availableChunks = client.get_job_chunks_ready_for_client_processing_spectra_s3( |
40 | 63 | ds3.GetJobChunksReadyForClientProcessingSpectraS3Request(bulkGetResult.result['JobId'])) |
41 | 64 |
42 | 65 | chunks = availableChunks.result['ObjectsList'] |
43 | 66 |
44 | | - # check to make sure we got some chunks, if we did not |
45 | | - # sleep and retry. This could mean that the cache is full |
| 67 | +    # Check to make sure we got some chunks; if we did not, sleep for the server-suggested retryAfter interval and retry.
| 68 | +    # Having no chunks ready may indicate that the BP cache is currently full.
46 | 69 | if len(chunks) == 0: |
47 | 70 | time.sleep(availableChunks.retryAfter) |
48 | 71 | continue |
49 | 72 |
50 | | - # for each chunk that is available, check to make sure |
51 | | - # we have not gotten it, and if not, get that object |
| 73 | + # For each chunk that is available, check to make sure we haven't processed it already. |
| 74 | + # If we have not processed this chunk yet, then retrieve all its objects. |
52 | 75 | for chunk in chunks: |
53 | 76 | if not chunk['ChunkId'] in chunkIds: |
| 77 | + # This chunk has already been processed |
54 | 78 | continue |
55 | | - chunkIds.remove(chunk['ChunkId']) |
| 79 | + |
| 80 | +        # For each blob within this chunk, retrieve the data and land it on the destination.
56 | 81 | for obj in chunk['ObjectList']: |
57 | | - # if we haven't create a temporary file for this object yet, create one |
58 | | - if obj['Name'] not in list(tempFiles.keys()): |
59 | | - tempFiles[obj['Name']] = tempfile.mkstemp() |
| 82 | +            # Open the destination file and seek to the offset corresponding with this blob.
| 83 | +            # Open in "r+b" once the file exists so a later blob does not truncate data
| 84 | +            # written by an earlier blob of the same object.
| 85 | +            destinationPath = objectNameToDestinationPathMap[obj['Name']]
| 86 | +            objectStream = open(destinationPath, "r+b" if os.path.isfile(destinationPath) else "wb")
| 87 | +            objectStream.seek(int(obj['Offset']))
60 | 85 |
61 | | - # get the object |
62 | | - objectStream = open(tempFiles[obj['Name']][1], "wb") |
| 86 | + # Get the blob for the current object and write it to the destination. |
63 | 87 | client.get_object(ds3.GetObjectRequest(bucketName, |
64 | 88 | obj['Name'], |
65 | 89 | objectStream, |
66 | 90 | offset=int(obj['Offset']), |
67 | 91 | job=bulkGetResult.result['JobId'])) |
68 | 92 |
69 | | -# iterate over the temporary files, printing out their names, then closing and and removing them |
70 | | -for objName in list(tempFiles.keys()): |
71 | | - print(objName) |
72 | | - os.close(tempFiles[objName][0]) |
73 | | - os.remove(tempFiles[objName][1]) |
| 93 | + # Close the file handle. |
| 94 | + objectStream.close() |
| 95 | + |
| 96 | +        # We've finished processing this chunk. Remove it from the set of chunks that still need processing.
| 97 | +        chunkIds.remove(chunk['ChunkId'])
| 98 | + |
| 99 | +# Go through all items that were landed and check that they were created. |
| 100 | +# This is not needed in production code. |
| 101 | +for objName, destinationPath in objectNameToDestinationPathMap.items():
| 103 | + if os.path.isfile(destinationPath): |
| 104 | + fileSize = os.path.getsize(destinationPath) |
| 105 | + print(f'Retrieved object={objName}, landed at destination={destinationPath}, has size={fileSize}') |
| 106 | + |
| 107 | +        # Remove the retrieved file from the destination. This keeps things clean when the
| 108 | +        # script is used only to test connectivity; drop this removal in production code.
| 109 | +        os.remove(destinationPath)
| 110 | + else: |
| 111 | + print(f'Failed to retrieve object={objName}') |