s3_mongo_inter.py
from __future__ import print_function

from settings import (
    BUCKET_NAME, MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION,
    HEADERS, COOKIES, MIN_SLEEP, MAX_SLEEP
)
from bson.binary import Binary
from io import BytesIO
import boto3
import random
import requests
import time
import pymongo
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3


class S3MongoInterface(object):

    def __init__(self, enable_frequency_checks=False, enable_s3_conn=True):
        self.enable_frequency_checks = enable_frequency_checks
        # MongoDB connection: the collection holding the scraped metadata docs.
        self.mongo_client = pymongo.MongoClient(MONGODB_SERVER, MONGODB_PORT)
        self.mongodb = self.mongo_client[MONGODB_DB]
        self.collection = self.mongodb[MONGODB_COLLECTION]
        self.cursor = None
        # S3 connection: files are written with bucket.put_object(...).
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(BUCKET_NAME)
        # Cache the keys already in the bucket so we can skip re-downloading them.
        self.already_downloaded_keys = [obj.key for obj in self.bucket.objects.all()]

    def _execute_download(self, url):
        """Returns a raw bytestream downloaded from `url`, or None on failure."""
        # Throttle requests so we don't hammer the remote site.
        time.sleep(random.randint(MIN_SLEEP, MAX_SLEEP))
        # Reuse the scraper headers/cookies defined in settings.
        response = requests.get(url, headers=HEADERS, cookies=COOKIES)
        if response.ok:
            return BytesIO(response.content)
        return None

    def _construct_obj_key(self, url):
        """Just using the filename for now."""
        return url.split('/')[-1]

    def _insert_new_midi(self, _id, download_url):
        """Downloads one MIDI file, stores it in S3, and flags the Mongo doc."""
        object_key = self._construct_obj_key(download_url)
        if object_key in self.already_downloaded_keys:
            print("Skipping already downloaded {}".format(object_key))
        else:
            data = self._execute_download(download_url)
            if data:
                self.bucket.put_object(Key=object_key, Body=data)
                self.collection.update_one(
                    {'_id': _id},
                    {"$set": {'s3_complete': 1, 's3_filename': object_key}},
                    upsert=False
                )
                print("Dumped file: {}".format(object_key))

    def execute_midi_dumps(self):
        """
        Scans the local MongoDB collection for documents with MIDI download
        URLs and makes a single pass over them (periodic re-checking keyed off
        enable_frequency_checks is not implemented here yet).
        Each file is downloaded into memory and sent to the S3 bucket under a
        key derived from the download URL (currently just the filename).
        Upon successful completion, the document attribute "s3_complete" is set
        to 1 and the string attribute "s3_filename" records the object key in
        the bucket.
        """
        # Get all docs with at least one midi file
        cursor = self.collection.find(
            {"download_urls": {'$elemMatch': {'$regex': r'.*\.mid'}}},
            {"_id": 1, "download_urls": 1}
        )
        for doc in cursor:
            for download_url in doc['download_urls']:
                if download_url[-4:] == '.mid':
                    self._insert_new_midi(doc['_id'], download_url)
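
    # For reference, a document touched by the dump/pull methods looks roughly
    # like this. The field names come from the queries in this class; the
    # values are illustrative only, not real data:
    #
    #   {
    #       "_id": ObjectId("..."),
    #       "download_urls": ["https://example.org/scores/sonata.mid"],
    #       "key": "C major",              # key signature suggested by the site
    #       "s3_complete": 1,              # set once the file is in the bucket
    #       "s3_filename": "sonata.mid",   # object key inside BUCKET_NAME
    #       "has_input_layer": 1,          # set by insert_input_array below
    #       "input_layer_array": Binary(b"...")  # pickled input array
    #   }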

    def pull_midi_data(self, mongo_query, limit=None):
        """
        Generator that yields the bytestream for every MIDI file matched by
        querying the DB with <mongo_query>.
        limit: None by default. If an int, only the first `limit` items are
        yielded.
        """
        # We only ever care about entries with a midi file associated with them.
        # We also want to be sure to only get data that has been downloaded and
        # added to the S3 bucket.
        mongo_query.update({
            "download_urls": {'$elemMatch': {'$regex': r'.*\.mid'}},
            "s3_complete": 1,
            "missing_key": {"$exists": 0},
            # "has_input_layer": {"$exists": 0}  # Doesn't already have a saved input layer
        })
        cursor = self.collection.find(
            mongo_query,
            {"_id": 1, "download_urls": 1, "key": 1}
        )
        i = 0
        for doc in cursor:
            web_suggested_key = doc['key']
            for download_url in doc['download_urls']:
                imslp_filename = self._construct_obj_key(download_url)
                if download_url[-4:] == '.mid':
                    i += 1
                    # Yields (id_, filename, data, key_sig_from_site).
                    yield (
                        doc['_id'],
                        imslp_filename,
                        self.bucket.Object(key=imslp_filename).get()['Body'].read(),
                        web_suggested_key
                    )
                if limit and i >= limit:
                    # Stop the generator entirely once the limit is reached
                    # (a bare `break` would only exit the inner loop).
                    return

    def insert_input_array(self, id_, arr):
        """Pickles `arr` into the document as a BSON Binary blob.

        If the pickled array exceeds MongoDB's document size limit, the
        document is flagged with has_input_layer = 0 instead.
        """
        try:
            self.collection.update_one(
                {"_id": id_},
                {"$set": {
                    "input_layer_array": Binary(pickle.dumps(arr, protocol=2)),
                    "has_input_layer": 1
                }}
            )
        except pymongo.errors.DocumentTooLarge:
            self.collection.update_one(
                {"_id": id_},
                {"$set": {
                    "has_input_layer": 0
                }}
            )

    def get_input_arrays(self, mongo_query, go_forever=True):
        """
        Generator that yields the unpickled input arrays matching <mongo_query>.
        If go_forever is True, the query is re-run and the results replayed
        indefinitely; otherwise a single pass is made.
        """
        mongo_query.update({
            "download_urls": {'$elemMatch': {'$regex': r'.*\.mid'}},
            "has_input_layer": 1
        })
        while True:
            if self.cursor is not None:
                self.cursor.close()
            self.cursor = self.collection.find(
                mongo_query,
                {"input_layer_array": 1},
                no_cursor_timeout=True
            )
            for doc in self.cursor:
                yield pickle.loads(doc["input_layer_array"])
            if not go_forever:
                break

    def close_connections(self):
        if self.cursor is not None:
            self.cursor.close()
        self.mongo_client.close()
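

if __name__ == '__main__':
    # Usage sketch: drives the class above end to end. It assumes settings.py
    # points at a reachable MongoDB instance and S3 bucket; the empty query and
    # the limit of 5 below are arbitrary example values, not required inputs.
    interface = S3MongoInterface()

    # Mirror any newly scraped .mid files from their download URLs into S3 and
    # mark the corresponding documents as s3_complete.
    interface.execute_midi_dumps()

    # Stream a handful of the stored MIDI files back out of the bucket.
    for id_, filename, midi_bytes, web_key in interface.pull_midi_data({}, limit=5):
        print(id_, filename, len(midi_bytes), web_key)

    interface.close_connections()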