diff --git a/.circleci/config.yml b/.circleci/config.yml index 99f7692..d475399 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -107,10 +107,16 @@ jobs: app-dir: ./onestop-python-client pkg-manager: pip - run: - name: "Run util tests" + name: "Run unit tests" command: > cd onestop-python-client/; - python -m unittest tests/util/*.py + python -m unittest discover -s test/unit +# This is commented out only because the OneStop we have running on cedardevs doesn't have its registry exposed. You can only reach it via sshing to another machine. +# - run: +# name: "Run integration tests" +# command: > +# cd onestop-python-client/; +# python -m unittest discover -s test/integration orbs: slack: circleci/slack@3.4.2 diff --git a/kubernetes/pyconsumer-pod.yaml b/kubernetes/pyconsumer-pod.yaml index fed2258..6943403 100644 --- a/kubernetes/pyconsumer-pod.yaml +++ b/kubernetes/pyconsumer-pod.yaml @@ -72,7 +72,7 @@ data: headers: UNIQUE_ID,FILE_UUID,LON,LAT,DEPTH,TIME,PLATFORM_NAME,PROVIDER type: COLLECTION collection_id: fdb56230-87f4-49f2-ab83-104cfd073177 - psi_registry_url: https://cedardevs.org/ + registry_base_url: https://cedardevs.org/ access_bucket: https://archive-testing-demo.s3-us-east-2.amazonaws.com #access_bucket: https://odp-noaa-nesdis-ncei-test.s3-us-west-2.amazonaws.com file_identifier_prefix: "gov.noaa.ncei.csb:" diff --git a/onestop-python-client/config/aws-util-config-dev.yml b/onestop-python-client/config/aws-util-config-dev.yml index ee1ad95..2fdb5c1 100644 --- a/onestop-python-client/config/aws-util-config-dev.yml +++ b/onestop-python-client/config/aws-util-config-dev.yml @@ -1,11 +1,12 @@ # Example config values for osim client -log_level: INFO # AWS config values sqs_url: https://sqs.us-east-2.amazonaws.com/798276211865/cloud-archive-client-sqs +sqs_name: 'foobar' sqs_max_polls: 2 s3_region: "us-east-2" s3_bucket: archive-testing-demo +s3_key: 'ABI-L1b-RadF/2019/298/15/OR_ABI-L1b-RadF-M6C15_G16_s20192981500369_e20192981510082_c20192981510166.nc' #AWS config values for 2nd vault in different region vault_name: archive-vault-new diff --git a/onestop-python-client/config/credentials-template.yml b/onestop-python-client/config/credentials-template.yml index 006e175..f94c70b 100644 --- a/onestop-python-client/config/credentials-template.yml +++ b/onestop-python-client/config/credentials-template.yml @@ -9,5 +9,4 @@ registry: username: rw_user password: rw_user_pwd - - +log_level: INFO \ No newline at end of file diff --git a/onestop-python-client/config/csb-data-stream-config-template.yml b/onestop-python-client/config/csb-data-stream-config-template.yml index 887c9be..8c2d4de 100644 --- a/onestop-python-client/config/csb-data-stream-config-template.yml +++ b/onestop-python-client/config/csb-data-stream-config-template.yml @@ -1,4 +1,3 @@ -log_level: INFO format: csv headers: UNIQUE_ID,FILE_UUID,LON,LAT,DEPTH,TIME,PLATFORM_NAME,PROVIDER type: COLLECTION @@ -9,7 +8,7 @@ registry_base_url: http://localhost/onestop/api/registry onestop_base_url: http://localhost/onestop/api/search/search access_bucket: https://archive-testing-demo.s3-us-east-2.amazonaws.com #access_bucket: https://odp-noaa-nesdis-ncei-test.s3-us-west-2.amazonaws.com -file_identifier_prefix: "gov.noaa.ncei.csb:" +file_id_prefix: "gov.noaa.ncei.csb:" prefixMap: NESDIS/CSB: 'fdb56230-87f4-49f2-ab83-104cfd073177' diff --git a/onestop-python-client/onestop/KafkaConsumer.py b/onestop-python-client/onestop/KafkaConsumer.py index e45d6cc..747b0e4 100644 --- a/onestop-python-client/onestop/KafkaConsumer.py +++ 
b/onestop-python-client/onestop/KafkaConsumer.py @@ -1,11 +1,9 @@ -import logging -import yaml - from confluent_kafka.schema_registry import SchemaRegistryClient from confluent_kafka.error import KafkaError from confluent_kafka import DeserializingConsumer from confluent_kafka.schema_registry.avro import AvroDeserializer from confluent_kafka.serialization import StringDeserializer +from onestop.util.ClientLogger import ClientLogger class KafkaConsumer: """ @@ -13,124 +11,113 @@ class KafkaConsumer: Attributes ---------- - conf: yaml file - kafka-publisher-config-dev.yml - logger: Logger object - utilizes python logger library and creates logging for our specific needs - logger.info: Logger object - logging statement that occurs when the class is instantiated - metadata_type: str - type of metadata (COLLECTION or GRANULE) - brokers: str - brokers (kubernetes service) - group_id: str - Client group id string. All clients sharing the same group.id belong to the same group - auto_offset_reset: str - Action to take when there is no initial offset in offset store or the desired offset is out of range (smallest, earliest, beginning, largest, latest, end, error) - schema_registry: str - schema registry (kubernetes service) - security: boolean - defines if security is in place - collection_topic: str - collection topic you want to consume - granule_topic: str - granule topic you want to consume + metadata_type: str + type of metadata (COLLECTION or GRANULE) + brokers: str + brokers (kubernetes service) + group_id: str + Client group id string. All clients sharing the same group.id belong to the same group + auto_offset_reset: str + Action to take when there is no initial offset in offset store or the desired offset is out of range (smallest, earliest, beginning, largest, latest, end, error) + schema_registry: str + schema registry (kubernetes service) + security_enabled: boolean + Whether to use security for the kafka schema registry client. + security_caLoc: str + Kafka schema registry certification authority (CA) file location. + security_keyLoc: str + Kafka schema registry client's private key file location. + security_certLoc: str + Kafka schema registry client's public key file location. 
+ collection_topic_consume: str + collection topic you want to consume + granule_topic_consume: str + granule topic you want to consume + logger: Logger object + utilizes python logger library and creates logging for our specific needs Methods ------- - get_logger(log_name, create_file) - creates logger file - - register_client() - registers to schema registry client based on configs + register_client() + registers to schema registry client based on configs - create_consumer(registry_client) - subscribes to topic defined in configs and creates a consumer to deserialize messages from topic + connect() + utilizes register_client() and create_consumer(registry_client) to connect to schema registry and allow for consumption of topics - connect() - utilizes register_client() and create_consumer(registry_client) to connect to schema registry and allow for consumption of topics + create_consumer(registry_client) + subscribes to topic defined in configs and creates a consumer to deserialize messages from topic - consume(metadata_consumer, handler) - asynchronously polls for messages in the connected topic, results vary depending on the handler function that is passed into it + consume(metadata_consumer, handler) + asynchronously polls for messages in the connected topic, results vary depending on the handler function that is passed into it """ - conf = None - - def __init__(self, conf_loc): - with open(conf_loc) as f: - self.conf = yaml.load(f, Loader=yaml.FullLoader) - - self.logger = self.get_logger(self.__class__.__name__, False) - self.logger.info("Initializing " + self.__class__.__name__) - self.metadata_type = self.conf['metadata_type'] - self.brokers = self.conf['brokers'] - self.group_id = self.conf['group_id'] - self.auto_offset_reset = self.conf['auto_offset_reset'] - self.schema_registry = self.conf['schema_registry'] - self.security = self.conf['security']['enabled'] - - self.collection_topic = self.conf['collection_topic_consume'] - self.granule_topic = self.conf['granule_topic_consume'] - if self.metadata_type not in ['COLLECTION', 'GRANULE']: - raise ValueError("metadata_type must be 'COLLECTION' or 'GRANULE'") - - def get_logger(self, log_name, create_file): + def __init__(self, metadata_type, brokers, group_id, auto_offset_reset, schema_registry, security, collection_topic_consume, granule_topic_consume, log_level = 'INFO', **wildargs): """ - Utilizes python logger library and creates logging - - :param log_name: str - name of log to be created - :param create_file: boolean - defines whether of not you want a logger file to be created - - :return: Logger object + Attributes + ---------- + metadata_type: str + type of metadata (COLLECTION or GRANULE) + brokers: str + brokers (kubernetes service) + group_id: str + Client group id string. All clients sharing the same group.id belong to the same group + auto_offset_reset: str + Action to take when there is no initial offset in offset store or the desired offset is out of range (smallest, earliest, beginning, largest, latest, end, error) + schema_registry: str + schema registry (kubernetes service) URL + security: dict + enabled boolean: Whether to use security for kafka schema registry client. + caLoc str: Kafka schema registry certification authority (CA) file location. + keyLoc str: Kafka schema registry client's private key file location. + certLoc str: Kafka schema registry client's public key file location. 
+ + collection_topic_consume: str + collection topic you want to consume + granule_topic_consume: str + granule topic you want to consume + log_level: str + What log level to use for this class """ - # create logger - log = logging.getLogger() + self.metadata_type = metadata_type + self.brokers = brokers + self.group_id = group_id + self.auto_offset_reset = auto_offset_reset + self.schema_registry = schema_registry + self.security_enabled = security['enabled'] - # create formatter and add it to the handlers - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + if self.security_enabled: + self.security_caLoc = security['caLoc'] + self.security_keyLoc = security['keyLoc'] + self.security_certLoc = security['certLoc'] - if self.conf['log_level'] == "DEBUG": - log.setLevel(level=logging.DEBUG) - else: - if self.conf['log_level'] == "INFO": - log.setLevel(level=logging.INFO) - else: - log.setLevel(level=logging.ERROR) + self.collection_topic = collection_topic_consume + self.granule_topic = granule_topic_consume - fh = None - if create_file: - # create file handler for logger. - fh = logging.FileHandler(log_name) - fh.setFormatter(formatter) - - # create console handler for logger. - ch = logging.StreamHandler() - ch.setFormatter(formatter) + if self.metadata_type not in ['COLLECTION', 'GRANULE']: + raise ValueError("metadata_type must be 'COLLECTION' or 'GRANULE'") - # add handlers to logger. - if create_file: - log.addHandler(fh) + self.logger = ClientLogger.get_logger(self.__class__.__name__, log_level, False) + self.logger.info("Initializing " + self.__class__.__name__) - log.addHandler(ch) - return log + if wildargs: + self.logger.warning("There were extra constructor arguments: " + str(wildargs)) def register_client(self): """ Registers to schema registry client based on configs :return: SchemaRegistryClient (confluent kafka library) - """ - reg_conf = {'url': self.schema_registry} + """ + conf = {'url': self.schema_registry} - if self.security: - reg_conf['ssl.ca.location'] = self.conf['security']['caLoc'] - reg_conf['ssl.key.location'] = self.conf['security']['keyLoc'] - reg_conf['ssl.certificate.location'] = self.conf['security']['certLoc'] + if self.security_enabled: + conf['ssl.ca.location'] = self.security_caLoc + conf['ssl.key.location'] = self.security_keyLoc + conf['ssl.certificate.location'] = self.security_certLoc - registry_client = SchemaRegistryClient(reg_conf) + self.logger.info("Creating SchemaRegistryClient with configuration:"+str(conf)) + registry_client = SchemaRegistryClient(conf) return registry_client def connect(self): @@ -152,33 +139,38 @@ def create_consumer(self, registry_client): :return: DeserializingConsumer object """ - metadata_schema = None topic = None if self.metadata_type == "COLLECTION": - metadata_schema = registry_client.get_latest_version(self.collection_topic + '-value').schema.schema_str topic = self.collection_topic if self.metadata_type == "GRANULE": - metadata_schema = registry_client.get_latest_version(self.granule_topic + '-value').schema.schema_str topic = self.granule_topic - metadata_deserializer = AvroDeserializer(metadata_schema, registry_client) - - consumer_conf = {'bootstrap.servers': self.brokers} - - if self.security: - consumer_conf['security.protocol'] = 'SSL' - consumer_conf['ssl.ca.location'] = self.conf['security']['caLoc'] - consumer_conf['ssl.key.location'] = self.conf['security']['keyLoc'] - consumer_conf['ssl.certificate.location'] = self.conf['security']['certLoc'] + 
self.logger.debug("topic: "+str(topic)) - meta_consumer_conf = consumer_conf - meta_consumer_conf['key.deserializer'] = StringDeserializer('utf-8') - meta_consumer_conf['value.deserializer'] = metadata_deserializer - meta_consumer_conf['group.id'] = self.group_id - meta_consumer_conf['auto.offset.reset'] = self.auto_offset_reset + # This topic naming scheme is how OneStop creates the topics. + latest_schema = registry_client.get_latest_version(topic + '-value') - metadata_consumer = DeserializingConsumer(meta_consumer_conf) + metadata_schema = latest_schema.schema.schema_str + self.logger.debug("metadata_schema: "+metadata_schema) + metadata_deserializer = AvroDeserializer(metadata_schema, registry_client) + conf = { + 'bootstrap.servers': self.brokers, + 'key.deserializer': StringDeserializer('utf-8'), + 'value.deserializer': metadata_deserializer, + 'group.id': self.group_id, + 'auto.offset.reset': self.auto_offset_reset + } + + if self.security_enabled: + conf['security.protocol'] = 'SSL' + conf['ssl.ca.location'] = self.security_caLoc + conf['ssl.key.location'] = self.security_keyLoc + conf['ssl.certificate.location'] = self.security_certLoc + + self.logger.debug("conf: "+str(conf)) + metadata_consumer = DeserializingConsumer(conf) + self.logger.debug("topic: "+str(topic)) metadata_consumer.subscribe([topic]) return metadata_consumer @@ -197,15 +189,16 @@ def consume(self, metadata_consumer, handler): while True: try: msg = metadata_consumer.poll(10) + self.logger.debug("Message received: "+str(msg)) if msg is None: - print('No Messages') + self.logger.info('No Messages') continue + self.logger.debug("Message key="+str(msg.key())+" value="+str(msg.value())) key = msg.key() value = msg.value() - except KafkaError: raise try: @@ -213,4 +206,5 @@ def consume(self, metadata_consumer, handler): except Exception as e: self.logger.error("Message handler failed: {}".format(e)) break + self.logger.debug("Closing metadata_consumer") metadata_consumer.close() diff --git a/onestop-python-client/onestop/KafkaPublisher.py b/onestop-python-client/onestop/KafkaPublisher.py index d357de8..047783c 100644 --- a/onestop-python-client/onestop/KafkaPublisher.py +++ b/onestop-python-client/onestop/KafkaPublisher.py @@ -1,13 +1,11 @@ -import logging -from uuid import UUID import json -import yaml +from uuid import UUID from confluent_kafka.schema_registry import SchemaRegistryClient from confluent_kafka.error import KafkaError from confluent_kafka import SerializingProducer from confluent_kafka.schema_registry.avro import AvroSerializer - +from onestop.util.ClientLogger import ClientLogger class KafkaPublisher: """ @@ -15,114 +13,98 @@ class KafkaPublisher: Attributes ---------- - conf: yaml file - config/kafka-publisher-config-dev.yml - logger: Logger object - utilizes python logger library and creates logging for our specific needs - logger.info: Logger object - logging statement that occurs when the class is instantiated - metadata_type: str - type of metadata (COLLECTION or GRANULE) - brokers: str - brokers (kubernetes service) - schema_registry: str - schema registry (kubernetes service) - security: boolean - defines if security is in place - collection_topic: str - collection topic you want to consume - granule_topic: str - granule topic you want to consume + metadata_type: str + type of metadata (COLLECTION or GRANULE) + brokers: str + brokers (kubernetes service) + schema_registry: str + schema registry (kubernetes service) + security_enabled: boolean + defines if security is in place + 
security_caLoc: str + Kafka schema registry certification authority (CA) file location. + security_keyLoc: str + Kafka schema registry client's private key file location. + security_certLoc: str + Kafka schema registry client's public key file location. + collection_topic: str + collection topic you want to produce to + granule_topic: str + granule topic you want to produce to + logger: Logger object + utilizes python logger library and creates logging for our specific needs Methods ------- - get_logger(log_name, create_file) - creates logger file - - register_client() - registers to schema registry client based on configs + register_client() + registers to schema registry client based on configs - create_producer(registry_client) - creates a SerializingProducer object to produce to kafka topic + create_producer(registry_client) + creates a SerializingProducer object to produce to kafka topic - connect() - utilizes register_client() and create_producer(registry_client) to connect to schema registry and allow for producing to kafka topics + connect() + utilizes register_client() and create_producer(registry_client) to connect to schema registry and allow for producing to kafka topics - publish_collection(collection_producer, collection_uuid, content_dict, method) - Publish collection to collection topic + publish_collection(collection_producer, collection_uuid, content_dict, method) + Publish collection to collection topic - publish_granule(granule_producer, record_uuid, collection_uuid, content_dict) - Publish granule to granule topic + publish_granule(granule_producer, collection_uuid, content_dict) + Publish granule to granule topic """ - conf = None - - def __init__(self, conf_loc): - - with open(conf_loc) as f: - self.conf = yaml.load(f, Loader=yaml.FullLoader) - - self.logger = self.get_logger(self.__class__.__name__, False) - self.logger.info("Initializing " + self.__class__.__name__) - self.metadata_type = self.conf['metadata_type'] - self.brokers = self.conf['brokers'] - self.schema_registry = self.conf['schema_registry'] - self.security = self.conf['security']['enabled'] - self.collection_topic = self.conf['collection_topic_produce'] - self.granule_topic = self.conf['granule_topic_produce'] - - if self.metadata_type not in ['COLLECTION', 'GRANULE']: - raise ValueError("metadata_type must be 'COLLECTION' or 'GRANULE'") - - def get_logger(self, log_name, create_file): + def __init__(self, metadata_type, brokers, schema_registry, security, collection_topic_publish, granule_topic_publish, log_level='INFO', **wildargs): """ - Utilizes python logger library and creates logging - - :param log_name: str - name of log to be created - :param create_file: boolean - defines whether of not you want a logger file to be created - - :return: Logger object + Attributes + ---------- + metadata_type: str + type of metadata (COLLECTION or GRANULE) + brokers: str + brokers (kubernetes service) + schema_registry: str + schema registry (kubernetes service) URL + security: dict + enabled boolean: Whether to use security for kafka schema registry client. + caLoc str: Kafka schema registry certification authority (CA) file location. + keyLoc str: Kafka schema registry client's private key file location.
+ certLoc str: Kafka schema registry client's public key file location. + + collection_topic: str + collection topic you want to produce to + granule_topic: str + granule topic you want to produce to """ + self.metadata_type = metadata_type + self.brokers = brokers + self.schema_registry = schema_registry + self.security_enabled = security['enabled'] - # create logger - log = logging.getLogger() + if self.security_enabled: + self.security_caLoc = security['caLoc'] + self.security_keyLoc = security['keyLoc'] + self.security_certLoc = security['certLoc'] - # create formatter and add it to the handlers - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - - if self.conf['log_level'] == "DEBUG": - log.setLevel(level=logging.DEBUG) - else: - if self.conf['log_level'] == "INFO": - log.setLevel(level=logging.INFO) - else: - log.setLevel(level=logging.ERROR) + self.collection_topic = collection_topic_publish + self.granule_topic = granule_topic_publish - fh = None - if create_file: - # create file handler for logger. - fh = logging.FileHandler(log_name) - fh.setFormatter(formatter) - - # create console handler for logger. - ch = logging.StreamHandler() - ch.setFormatter(formatter) + if self.metadata_type not in ['COLLECTION', 'GRANULE']: + raise ValueError("metadata_type must be 'COLLECTION' or 'GRANULE'") - # add handlers to logger. - if create_file: - log.addHandler(fh) + self.logger = ClientLogger.get_logger(self.__class__.__name__, log_level, False) + self.logger.info("Initializing " + self.__class__.__name__) - log.addHandler(ch) - return log + if wildargs: + self.logger.warning("There were extra constructor arguments: " + str(wildargs)) def connect(self): """ Utilizes register_client() and create_producer(registry_client) to connect to schema registry and allow for producing to kafka topics :return: SerializingProducer Object - based on config values + based on initial constructor values """ registry_client = self.register_client() metadata_producer = self.create_producer(registry_client) @@ -137,10 +119,10 @@ def register_client(self): reg_conf = {'url': self.schema_registry} - if self.security: - reg_conf['ssl.ca.location'] = self.conf['security']['caLoc'] - reg_conf['ssl.key.location'] = self.conf['security']['keyLoc'] - reg_conf['ssl.certificate.location'] = self.conf['security']['certLoc'] + if self.security_enabled: + reg_conf['ssl.ca.location'] = self.security_caLoc + reg_conf['ssl.key.location'] = self.security_keyLoc + reg_conf['ssl.certificate.location'] = self.security_certLoc registry_client = SchemaRegistryClient(reg_conf) return registry_client @@ -153,34 +135,34 @@ def create_producer(self, registry_client): get this from register_client() :return: SerializingProducer Object - based on config values + based on initial constructor values """ - metadata_schema = None + topic = None if self.metadata_type == "COLLECTION": - metadata_schema = registry_client.get_latest_version(self.collection_topic + '-value').schema.schema_str + topic = self.collection_topic if self.metadata_type == "GRANULE": - metadata_schema = registry_client.get_latest_version(self.granule_topic + '-value').schema.schema_str + topic = self.granule_topic + metadata_schema = registry_client.get_latest_version(topic + '-value').schema.schema_str metadata_serializer = AvroSerializer(metadata_schema, registry_client) - producer_conf = {'bootstrap.servers': self.brokers} + conf = {'bootstrap.servers': self.brokers} - if self.security: - producer_conf['security.protocol'] = 'SSL' - 
producer_conf['ssl.ca.location'] = self.conf['security']['caLoc'] - producer_conf['ssl.key.location'] = self.conf['security']['keyLoc'] - producer_conf['ssl.certificate.location'] = self.conf['security']['certLoc'] + if self.security_enabled: + conf['security.protocol'] = 'SSL' + conf['ssl.ca.location'] = self.security_caLoc + conf['ssl.key.location'] = self.security_keyLoc + conf['ssl.certificate.location'] = self.security_certLoc - meta_producer_conf = producer_conf - meta_producer_conf['value.serializer'] = metadata_serializer + conf['value.serializer'] = metadata_serializer - metadata_producer = SerializingProducer(meta_producer_conf) + metadata_producer = SerializingProducer(conf) return metadata_producer def delivery_report(self, err, msg): """ - Called once for each message produced to indicate delivery result. Triggered by poll() or flush(). + Called once for each message produced to indicate delivery of message. Triggered by poll() or flush(). :param err: str err produced after publishing, if there is one @@ -192,14 +174,27 @@ def delivery_report(self, err, msg): else: self.logger.error('Message delivered to {} [{}]'.format(msg.topic(), msg.partition())) + @staticmethod + def get_collection_key_from_uuid(collection_uuid): + """ + Create a key to use in a kafka message from the given string representation of the collection UUID. + :param collection_uuid: str + collection string to turn into a key. + :return: + """ + if type(collection_uuid) == bytes: + return str(UUID(bytes=collection_uuid)) + else: + return str(UUID(hex=collection_uuid)) + def publish_collection(self, collection_producer, collection_uuid, content_dict, method): """ - Publish collection to collection topic + Publish a collection to the collection topic :param collection_producer: SerializingProducer use connect() :param collection_uuid: str - collection uuid that you want colelction to have + collection uuid that you want the collection to have :param content_dict: dict dictionary containing information you want to publish :param method: str @@ -208,11 +203,9 @@ def publish_collection(self, collection_producer, collection_uuid, content_dict, :return: str returns msg if publish is successful, kafka error if it wasn't successful """ - self.logger.info('Publish collection') - if type(collection_uuid) == bytes: - key = str(UUID(bytes=collection_uuid)) - else: - key = str(UUID(hex=collection_uuid)) + self.logger.info('Publishing collection') + + key = self.get_collection_key_from_uuid(collection_uuid) value_dict = { 'type': 'collection', @@ -222,20 +215,22 @@ def publish_collection(self, collection_producer, collection_uuid, content_dict, 'source': 'unknown', } try: - collection_producer.produce(topic=self.collection_topic, value=value_dict, key=key, - on_delivery=self.delivery_report) + self.logger.debug('Publishing collection with topic='+self.collection_topic+' key='+key+' value='+str(value_dict)) + collection_producer.produce( + topic=self.collection_topic, + value=value_dict, + key=key, + on_delivery=self.delivery_report) except KafkaError: raise collection_producer.poll() - def publish_granule(self, granule_producer, record_uuid, collection_uuid, content_dict): + def publish_granule(self, granule_producer, collection_uuid, content_dict): """ - Publishes granule to granule topic + Publish a granule to the granule topic :param granule_producer: SerializingProducer use connect() - :param record_uuid: str - record uuid associated with the granule :param collection_uuid: str collection uuid associated with the granule 
:param content_dict: dict @@ -246,10 +241,8 @@ def publish_granule(self, granule_producer, record_uuid, collection_uuid, conten """ self.logger.info('Publish granule') - if type(record_uuid) == bytes: - key = str(UUID(bytes=collection_uuid)) - else: - key = str(UUID(hex=collection_uuid)) + key = self.get_collection_key_from_uuid(collection_uuid) + """ if type(collection_uuid) == bytes: content_dict['relationships'] = [{"type": "COLLECTION", "id": collection_uuid.hex()}] @@ -282,8 +275,12 @@ def publish_granule(self, granule_producer, record_uuid, collection_uuid, conten } try: - granule_producer.produce(topic=self.granule_topic, value=value_dict, key=key, - on_delivery=self.delivery_report) + self.logger.debug('Publishing granule with topic='+self.granule_topic+' key='+key+' value='+str(value_dict)) + granule_producer.produce( + topic=self.granule_topic, + value=value_dict, + key=key, + on_delivery=self.delivery_report) except KafkaError: raise granule_producer.poll() diff --git a/onestop-python-client/onestop/WebPublisher.py b/onestop-python-client/onestop/WebPublisher.py index 55ca06c..7b1c6bd 100644 --- a/onestop-python-client/onestop/WebPublisher.py +++ b/onestop-python-client/onestop/WebPublisher.py @@ -7,31 +7,30 @@ class WebPublisher: Attributes ---------- - registry_base_url: str - url for registry endpoint - registry_username: str - username for posting metadata to registry - registry_password: str - password for posting metadata to registry - onestop_base_url: str - url for onestop endpoint - logger.info: str - logging level + registry_base_url: str + URL for registry endpoint + registry_username: str + Registry username where credentials needed + registry_password: str + Registry password where credentials needed + onestop_base_url: str + URL for OneStop endpoint + logger.info: str + logging level Methods ------- - publish_registry(metadata_type, uuid, payload, method) - Publish to registry with either POST,PUT, OR PATCH methods - delete_registry(metadata_type, uuid) - Deletes item from registry - search_registry(metadata_type, uuid) - Searches for an item in registry given its metadata type and uuid - search_onestop(metadata_type, payload) - Acquires the item, collection or granule, from OneStop - get_granules_onestop(self, uuid) - Acquires granules from OneStop given the uuid + publish_registry(metadata_type, uuid, payload, method) + Publish an item to registry with either POST, PUT, OR PATCH methods + delete_registry(metadata_type, uuid) + Delete an item from registry + search_registry(metadata_type, uuid) + Search for an item in registry given its metadata type and uuid + search_onestop(metadata_type, payload) + Search for an item in OneStop given its metadata type and payload search criteria + get_granules_onestop(self, uuid) + Search for a granule in OneStop given its uuid """ - conf = None def __init__(self, registry_base_url, registry_username, registry_password, onestop_base_url, log_level="INFO", **kwargs): self.registry_base_url = registry_base_url @@ -43,7 +42,7 @@ def __init__(self, registry_base_url, registry_username, registry_password, ones self.logger.info("Initializing " + self.__class__.__name__) if kwargs: - self.logger.info("There were extra constructor arguments: " + str(kwargs)) + self.logger.warning("There were extra constructor arguments: " + str(kwargs)) def publish_registry(self, metadata_type, uuid, payload, method): """ @@ -84,12 +83,12 @@ def publish_registry(self, metadata_type, uuid, payload, method): def delete_registry(self, metadata_type, 
uuid): """ - Deletes item from registry + Delete an item from registry :param metadata_type: str metadata type (GRANULE/COLLECTION) :param uuid: str - uuid you want to publish with + uuid you want to delete :return: str response message indicating if delete was successful @@ -105,7 +104,7 @@ def delete_registry(self, metadata_type, uuid): def search_registry(self, metadata_type, uuid): """ - Searches for an item in registry given its metadata type and uuid + Search for an item in registry given its metadata type and uuid :param metadata_type: str metadata type (GRANULE/COLLECTION) @@ -126,7 +125,7 @@ def search_registry(self, metadata_type, uuid): def search_onestop(self, metadata_type, payload): """ - Searches for an item in OneStop given its metadata type and payload search criteria. + Search for an item in OneStop given its metadata type and payload search criteria. :param metadata_type: str metadata type (GRANULE/COLLECTION) @@ -147,7 +146,7 @@ def search_onestop(self, metadata_type, payload): def get_granules_onestop(self, uuid): """ - Searches for a granule in OneStop given its uuid + Search for a granule in OneStop given its uuid :param uuid: str uuid you want search for diff --git a/onestop-python-client/onestop/extract/CsbExtractor.py b/onestop-python-client/onestop/extract/CsbExtractor.py index e79cddc..b1006cb 100644 --- a/onestop-python-client/onestop/extract/CsbExtractor.py +++ b/onestop-python-client/onestop/extract/CsbExtractor.py @@ -2,61 +2,33 @@ from datetime import datetime class CsbExtractor: + """ A class used to extract geospatial data from csv files in an s3 bucket - Attributes - ---------- - su : S3 Utils object - an instance of the s3 utils class used to connect to the corresponding s3 bucket to get access to the csv file for extraction - boto_client: boto3 client - specific boto3 client type (s3, s3_resource, glacier, session) used to access aws resources - bucket: str - the name of the s3 bucket in which you want to access - key: str - the name of key path for the specific item you want to access in the bucket - - Methods ------- is_csv(file_name) - checks to see if the given file is of type csv + Verifies a file name ends with '.csv' get_spatial_temporal_bounds(lon_column_name, lat_column_name, date_column_name) - extracts min/max longitude and latitude values as well as beginning and ending dates from specified csv file + Gets the spacial bounding box for the open file. This seeks to the start of the file at start and the end. extract_coords(max_lon, max_lat, min_lon, min_lat) - extracts specific coordinates corresponding to min/max longitude and latitude values given from get_spatial_temporal_bounds(....) method + Given the max/min lon and lat, the function will parse the csv file to extract the coordinates within the given bounding box. 
""" - def __init__(self, su, key): - """ - :param su: S3 Utils object - an instance of the s3 utils class used to connect to the corresponding s3 bucket to get access to the csv file for extraction - :param key: str - the name of key path for the specific item you want to access in the bucket - - Other Attributes - ________________ - boto_client: boto3 client - specific boto3 client type (s3, s3_resource, glacier, session) used to access aws resources - bucket: str - the name of the s3 bucket in which you want to access + @staticmethod + def is_csv(file_name): """ - self.su = su - boto_client = self.su.connect("session", None) - bucket = self.su.conf['s3_bucket'] - self.key = key - - def is_csv(self, file_name): - """ - Checks to see if the given file is of type csv + Verifies a file name ends with '.csv' :param file_name: str - the name of the file in the s3 bucket i.e. file1.csv + File name with extension on the end. - :return: boolean - True if the file name contains .csv and False otherwise + :return: str + True if ends with csv + False if doesn't end with csv """ csv_str = '.csv' if file_name.endswith(csv_str): @@ -64,28 +36,22 @@ def is_csv(self, file_name): return False - # def smart_open_read(self, key): - # boto_client = self.su.connect("session", None) - # bucket = self.su.conf['s3_bucket'] - # self.su.read_csv_s3(boto_client, bucket, key) - - - def get_spatial_temporal_bounds(self, lon_column_name, lat_column_name, date_column_name): + @staticmethod + def get_spatial_temporal_bounds(sm_open_file, lon_column_name, lat_column_name, date_column_name): """ - Extracts min/max longitude and latitude values as well as beginning and ending dates from specified csv file + Gets the spacial bounding box for the open file. This seeks to the start of the file at start and the end. + :param sm_open_file: file-like object + A file-like object that is open, say from smart_open's sm_open. :param lon_column_name: str - name of longitude column in the csv file + Longitude column name :param lat_column_name: str - name of the latitude column in the csv file + Latitude column name :param date_column_name: str - name of the date column in the csv file + Date column name :return: dict - Key : Value - geospatial (str) -> List[float] containing min/max longitude and latitude values - temporal (str) -> List[str] containing beginning and end dates - + geospatial and temporal fields of the bounding box for given constraints. """ lon_min_val = None lon_max_val = None @@ -99,9 +65,7 @@ def get_spatial_temporal_bounds(self, lon_column_name, lat_column_name, date_col # variable to be returned in string format begin_date_str = '' - boto_client = self.su.connect("session", None) - bucket = self.su.conf['s3_bucket'] - sm_open_file = self.su.get_csv_s3(boto_client, bucket, self.key) + sm_open_file.seek(0) csv_reader = csv.DictReader(sm_open_file) for row in csv_reader: @@ -151,43 +115,40 @@ def get_spatial_temporal_bounds(self, lon_column_name, lat_column_name, date_col "temporal": [begin_date_str, end_date_str] } + sm_open_file.seek(0) return geospatial_temporal_bounds - - def extract_coords(self, max_lon, max_lat, min_lon, min_lat): + @staticmethod + def extract_coords(sm_open_file, max_lon, max_lat, min_lon, min_lat): """ - Extracts specific coordinates corresponding to min/max longitude and latitude values given from get_spatial_temporal_bounds(....) 
method - - :param max_lon: float - maximum longitude value - :param max_lat: float - maximum latitude value - :param min_lon: float - minimum longitude value - :param min_lat: float - minimum latitude value - - :return: List[ List[Float] ] - Returns a list of lists. Each list contains floats (longitude and latitude ) value pairs corresponding to - one of the min/max latitude and longitude values that were extracted previously from get_spatial_temporal_bounds (...) + Given the max/min lon and lat, the function will parse the csv file to extract the coordinates within the given bounding box. + + :param sm_open_file: file-like object + A file-like object that is open, say from smart_open's sm_open. + :param max_lon: float + Maximum longitude + :param max_lat: float + Maximum latitude + :param min_lon: float + Minimum longitude + :param min_lat: float + Minimum latitude + + :return: list + List of the coordinates (no duplicates) within the file that fall within the given bounding box. """ - # Keeps track of all coordinates that needs to be added to json payload coords = [] - boto_client = self.su.connect("session", None) - bucket = self.su.conf['s3_bucket'] - sm_open_file = self.su.get_csv_s3(boto_client, bucket, self.key) + sm_open_file.seek(0) csv_reader = csv.DictReader(sm_open_file) - for row in csv_reader: - if float( row['LAT'] ) == min_lat or float( row['LAT'] ) == max_lat or float( - row['LON'] ) == min_lon or float( row['LON'] ) == max_lon: + if float( row['LAT'] ) == min_lat or float( row['LAT'] ) == max_lat or \ + float( row['LON'] ) == min_lon or float( row['LON'] ) == max_lon: coord = [float( row['LON'] ), float( row['LAT'] )] - - # check to see if that coordinate has already been appended to the list that is keeping track of our coordinates + # only append this coordinate if it has not already been added (no duplicates) if coord not in coords: coords.append( coord ) + sm_open_file.seek(0) return coords - diff --git a/onestop-python-client/onestop/util/S3MessageAdapter.py b/onestop-python-client/onestop/util/S3MessageAdapter.py index d640b77..6bd832d 100644 --- a/onestop-python-client/onestop/util/S3MessageAdapter.py +++ b/onestop-python-client/onestop/util/S3MessageAdapter.py @@ -1,10 +1,4 @@ -import yaml from onestop.util.ClientLogger import ClientLogger -""" -from onestop.info.ImMessage import ImMessage -from onestop.info.FileMessage import FileMessage -from onestop.info.Link import Link -""" from onestop.schemas.psiSchemaClasses.org.cedar.schemas.avro.psi.parsed_record import ParsedRecord, Publishing, ErrorEvent from onestop.schemas.psiSchemaClasses.org.cedar.schemas.avro.psi.file_location import FileLocation,FileLocationType @@ -14,81 +8,67 @@ from onestop.schemas.psiSchemaClasses.org.cedar.schemas.avro.psi.discovery import Discovery, Link - class S3MessageAdapter: """ A class used to extract information from sqs messages that have been triggered by s3 events and transform it into correct format for publishing to IM Registry Attributes ---------- - conf: yaml file - csb-data-stream-config.yml - s3_utils: S3Utils object - used to access objects inside of s3 buckets - logger: ClientLogger object - utilizes python logger library and creates logging for our specific needs - logger.info: ClientLogger object - logging statement that occurs when the class is instantiated - prefix_mapping: Dict - contains mapping of various line offices and their associated collection id + access_bucket: str + Cloud bucket to put in the links field when transformed.
+ type: str + COLLECTION or GRANULE + file_id_prefix: str + File prefix returned as fileIdentifier + collection_id: str + Collection this data belongs to. Returned as parent identifier. + log_level: str + The log level to use for this class (Defaults to 'INFO') - Methods - ------- - collection_id_map(s3_key) - given an s3 key that contains one of the NESDIS line offices in its path, it will provide the corresponding collection id - - transform(recs) - transforms sqs message triggered by s3 event to correct format for publishing to IM registry - """ - def __init__(self, conf_loc, s3_utils): - """ - - :param conf_loc: yaml file - csb-data-stream-config.yml - :param s3_utils: S3Utils object - used to access objects inside of s3 buckets - - Other Attributes - ---------------- logger: ClientLogger object utilizes python logger library and creates logging for our specific needs logger.info: ClientLogger object logging statement that occurs when the class is instantiated - prefix_mapping: Dict - contains mapping of various line offices and their associated collection id - - """ - with open(conf_loc) as f: - self.conf = yaml.load(f, Loader=yaml.FullLoader) - - self.logger = ClientLogger.get_logger(self.__class__.__name__, self.conf['log_level'], False) - self.logger.info("Initializing " + self.__class__.__name__) - self.s3_utils = s3_utils - self.prefix_mapping = self.conf['prefixMap'] - - def collection_id_map(self, s3_key): + Methods + ------- + transform(recs) + transforms sqs message triggered by s3 event to correct format for publishing to IM registry + """ + def __init__(self, access_bucket, type, file_id_prefix, collection_id, log_level = 'INFO', **wildargs): """ - Given an s3 key that contains one of the NESDIS line offices in its path, it will provide the corresponding collection id + Parameters + ---------- + access_bucket: str + access bucket to put in the links field when transformed. + type: str + COLLECTION or GRANULE + file_id_prefix: str + File prefix returned as fileIdentifier + collection_id: str + Collection this data belongs to. Returned as parent identifier. + log_level: str + Log level for when logging in class. 
- :param s3_key: str - key path of object in s3 bucket - - :return: str - associated line office collection id """ - # Looks through our prefix map and returns appropriate collection id - for key in self.prefix_mapping: - if key in s3_key: - return self.prefix_mapping[key] + self.access_bucket = access_bucket + self.type = type + self.file_id_prefix = file_id_prefix + self.collection_id = collection_id + self.logger = ClientLogger.get_logger(self.__class__.__name__, log_level, False) + self.logger.info("Initializing " + self.__class__.__name__) + if wildargs: + self.logger.warning("There were extra constructor arguments: " + str(wildargs)) def transform(self, recs): """ Transforms sqs message triggered by s3 event to correct format for publishing to IM registry - :param recs: dict - sqs event message + Parameters: + ---------- + recs: dict + sqs event message to transform :return: ParsedRecord Object The Parsed Record class is an avro schema generated class @@ -111,8 +91,8 @@ def transform(self, recs): fileInformation = FileInformation(name=file_name, size=file_size, checksums=[checkSum], optionalAttributes={}) # Relationship - relationshipType = RelationshipType(type=self.conf['type']) - relationship = Relationship(id=self.conf['collection_id'], type=relationshipType) + relationshipType = RelationshipType(type=self.type) + relationship = Relationship(id=self.collection_id, type=relationshipType) # File Location fileLocationType = FileLocationType(type='ARCHIVE') @@ -127,12 +107,12 @@ def transform(self, recs): publishing = Publishing(isPrivate=True) # Discovery - access_obj_uri = self.conf['access_bucket'] + "/" + s3_key + access_obj_uri = self.access_bucket + "/" + s3_key link1 = Link(linkName="Amazon S3", linkUrl=access_obj_uri, linkProtocol="HTTPS", linkFunction="download") link2 = Link(linkName="Amazon S3", linkUrl=s3_obj_uri, linkProtocol="Amazon:AWS:S3", linkFunction="download") # To Change? 
Come back to this later - parent_identifier = self.conf['collection_id'] - file_identifier = self.conf['file_identifier_prefix'] + file_name[:-4] + parent_identifier = self.collection_id + file_identifier = self.file_id_prefix + file_name[:-4] # Initializing most fields to their default values in the avro schema so that it doesn't cause an error in Kafka discovery = Discovery(links=[link1, link2], title=file_name, parentIdentifier=parent_identifier, diff --git a/onestop-python-client/onestop/util/S3Utils.py b/onestop-python-client/onestop/util/S3Utils.py index 7bb0fbe..d63e654 100644 --- a/onestop-python-client/onestop/util/S3Utils.py +++ b/onestop-python-client/onestop/util/S3Utils.py @@ -1,5 +1,5 @@ import logging -import yaml + import uuid import boto3 import botocore @@ -15,100 +15,106 @@ class S3Utils: Attributes ---------- - conf: yaml file - aws-util-config-dev.yml - cred: yaml file - credentials.yml - logger: ClientLogger object - utilizes python logger library and creates logging for our specific needs - logger.info: ClientLogger object - logging statement that occurs when the class is instantiated + access_key: str + Cloud access key - Methods - ------- - connect(client_type, region) - connects to a boto3 client + secret_key: str + Cloud secret key - objectkey_exists(bucket, s3_key) - checks to see if a s3 key path exists in a particular bucket + log_level: str + The log level to use for this class (Defaults to 'INFO') - get_uuid_metadata(boto_client, bucket, s3_key) - returns metadata uuid of an s3 object if it has one, otherwise prints that one does not exist + logger: ClientLogger object + Creates logging for us to log to. - add_uuid_metadata(boto_client, bucket, s3_key) - adds metadata uuid to an s3 object + Methods + ------- + connect(type, service_name, region) + connects to a boto3 service - upload_s3(boto_client, local_file, bucket, s3_key, overwrite) - uploads a file to s3 bucket + objectkey_exists(bucket, s3_key) + checks to see if a s3 key path exists in a particular bucket - get_csv_s3(boto_client, bucket, key) - gets a csv file from s3 bucket using smart open library + get_uuid_metadata(boto_client, bucket, s3_key) + returns metadata uuid of an s3 object if it has one, otherwise prints that one does not exist - read_bytes_s3(boto_client, bucket, key) - returns raw information of s3 object + add_uuid_metadata(boto_client, bucket, s3_key) + adds metadata uuid to an s3 object - upload_archive(boto_client, vault_name, src_data) - Add an archive to an Amazon S3 Glacier vault. The upload occurs synchronously. + upload_s3(boto_client, local_file, bucket, s3_key, overwrite) + uploads a file to s3 bucket - s3_to_glacier(boto_client, bucket_name, key) - Changes storage class of s3 object from s3 -> glacier. Utilizes s3 client type + get_csv_s3(boto_client, bucket, key) + gets a csv file from s3 bucket using smart open library - s3_to_glacier_object_lock(boto_client, bucket_name, key, object_lock_mode, object_lock_retention) - Changes storage class of s3 object from s3 -> glacier and places it in object lock mode. Utilizes s3 client type + read_bytes_s3(boto_client, bucket, key) + returns raw information of s3 object - s3_restore(boto_client, bucket_name, key, days) - Restores an object in S3 glacier back to S3 for specified amount of days + upload_archive(boto_client, vault_name, src_data) + Add an archive to an Amazon S3 Glacier vault. The upload occurs synchronously.
- retrieve_inventory(boto_client, vault_name) - Initiate an Amazon Glacier inventory-retrieval job + s3_to_glacier(boto_client, bucket_name, key) + Changes storage class of s3 object from s3 -> glacier. Utilizes s3 client type - retrieve_inventory_results(vault_name, boto_client, job_id) - Retrieve the results of an Amazon Glacier inventory-retrieval job - """ - conf = None + s3_to_glacier_object_lock(boto_client, bucket_name, key, object_lock_mode, object_lock_retention) + Changes storage class of s3 object from s3 -> glacier and places it in object lock mode. Utilizes s3 client type - def __init__(self, conf_loc, cred_loc): + s3_restore(boto_client, bucket_name, key, days) + Restores an object in S3 glacier back to S3 for specified amount of days - with open(conf_loc) as f: - self.conf = yaml.load(f, Loader=yaml.FullLoader) + retrieve_inventory(boto_client, vault_name) + Initiate an Amazon Glacier inventory-retrieval job - with open(cred_loc) as f: - self.cred = yaml.load(f, Loader=yaml.FullLoader) + retrieve_inventory_results(vault_name, boto_client, job_id) + Retrieve the results of an Amazon Glacier inventory-retrieval job + """ - self.logger = ClientLogger.get_logger(self.__class__.__name__, self.conf['log_level'], False) + def __init__(self, access_key, secret_key, log_level = 'INFO', **wildargs): + self.access_key = access_key + self.secret_key = secret_key + self.logger = ClientLogger.get_logger(self.__class__.__name__, log_level, False) self.logger.info("Initializing " + self.__class__.__name__) - def connect(self, client_type, region): + if wildargs: + self.logger.warning("There were extra constructor arguments: " + str(wildargs)) + + def connect(self, type, service_name, region): """ - Connects to a boto3 client + Connects to a boto3 of specified type using the credentials provided in the constructor. - :param client_type: str - boto client type in which you want to access + :param type: str + boto object type to return, see return type. + :param service_name: str + (Optional for session type) boto service name in which you want to access :param region: str - name of aws region you want to access + (Optional for session type) name of aws region you want to access - :return: boto3 client - dependent on the client_type parameter + :return: boto3 connection object + A boto3 connection object; Client, Session, or Resource. 
""" - - if client_type == "s3": - boto = boto3.client("s3", aws_access_key_id=self.cred['sandbox']['access_key'], - aws_secret_access_key=self.cred['sandbox']['secret_key'], region_name=region) - - if client_type == "s3_resource": - boto = boto3.resource("s3", region_name=region, aws_access_key_id=self.cred['sandbox']['access_key'], - aws_secret_access_key=self.cred['sandbox']['secret_key'] ) - - if client_type == "glacier": - boto = boto3.client("glacier", region_name=region, aws_access_key_id=self.cred['sandbox']['access_key'], - aws_secret_access_key=self.cred['sandbox']['secret_key']) - - if client_type == "session": - boto = boto3.Session( - aws_access_key_id=self.cred['sandbox']['access_key'], - aws_secret_access_key=self.cred['sandbox']['secret_key'], + type = type.lower() + if type == 'session': + return boto3.Session( + aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key, + region_name=region + ) + elif type == 'client': + return boto3.client( + service_name, + aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key, + region_name=region + ) + elif type == 'resource': + return boto3.resource( + service_name, + region_name=region, + aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key ) - return boto + else: + raise Exception('Unknown boto3 type of "%s"'%(type)) def objectkey_exists(self, bucket, s3_key): """ @@ -226,11 +232,11 @@ def upload_s3(self, boto_client, local_file, bucket, s3_key, overwrite): self.logger.error("File to upload was not found. Path: "+local_file) return False - def get_csv_s3(self, boto_client, bucket, key): + def get_csv_s3(self, boto_session, bucket, key): """ gets a csv file from s3 bucket using smart open library - :param boto_client: session + :param boto_session: session utilizes boto session type :param bucket: str name of bucket @@ -240,7 +246,7 @@ def get_csv_s3(self, boto_client, bucket, key): :return: smart open file """ url = "s3://" + bucket + "/" + key - sm_open_file = sm_open(url, 'r', transport_params={'session': boto_client}) + sm_open_file = sm_open(url, 'r', transport_params={'session': boto_session}) return sm_open_file def read_bytes_s3(self, boto_client, bucket, key): @@ -386,7 +392,6 @@ def s3_restore(self, boto_client, bucket_name, key, days): # returns status of object retrieval return obj.restore - def retrieve_inventory(self, boto_client, vault_name): """ Initiate an Amazon Glacier inventory-retrieval job diff --git a/onestop-python-client/onestop/util/SqsConsumer.py b/onestop-python-client/onestop/util/SqsConsumer.py index f782cc5..1972cc6 100644 --- a/onestop-python-client/onestop/util/SqsConsumer.py +++ b/onestop-python-client/onestop/util/SqsConsumer.py @@ -1,10 +1,7 @@ -import logging -from datetime import datetime, timezone -import yaml -import boto3 import json -from onestop.util.ClientLogger import ClientLogger +from datetime import datetime, timezone +from onestop.util.ClientLogger import ClientLogger class SqsConsumer: """ @@ -12,116 +9,102 @@ class SqsConsumer: Attributes ---------- - conf: yaml file - aws-util-config-dev.yml - cred: yaml file - credentials.yml - logger: ClientLogger object - utilizes python logger library and creates logging for our specific needs - logger.info: ClientLogger object - logging statement that occurs when the class is instantiated + logger: ClientLogger object + utilizes python logger library and creates logging for our specific needs Methods ------- - connect() - connects a boto sqs instance based on configurations in conf and 
cred yml files - - receive_messages(queue, sqs_max_polls, cb) - polls for messages in the queue + receive_messages(sqs_queue, sqs_max_polls, cb) + polls for messages in the queue """ - conf = None - def __init__(self, conf_loc, cred_loc): + def __init__(self, log_level = 'INFO', **wildargs): """ - - :param conf_loc: yaml file - aws-util-config-dev.yml - :param cred_loc: yaml file - credentials.yml - - Other Attributes - ---------------- - logger: ClientLogger object - utilizes python logger library and creates logging for our specific needs - logger.info: ClientLogger object - logging statement that occurs when the class is instantiated - + Attributes + ---------- + log_level: str + The log level to use for this class (Defaults to 'INFO') """ - with open(conf_loc) as f: - self.conf = yaml.load(f, Loader=yaml.FullLoader) - - with open(cred_loc) as f: - self.cred = yaml.load(f, Loader=yaml.FullLoader) - - self.logger = ClientLogger.get_logger(self.__class__.__name__, self.conf['log_level'], False) + self.log_level = log_level + self.logger = ClientLogger.get_logger(self.__class__.__name__, log_level, False) self.logger.info("Initializing " + self.__class__.__name__) - def connect(self): - """ - Connects a boto sqs instance based on configurations in conf and cred yml files + if wildargs: + self.logger.warning("There were extra constructor arguments: " + str(wildargs)) - :return: boto sqs - returns instance of boto sqs resource + def connect(self, sqs_resource, sqs_queue_name): + """ + Gets a boto SQS.Queue resource. + :param sqs_resource: boto SQS.Resource + SQS resource to create the queue from. + :param sqs_queue_name: str + SQS queue name to create and return a boto SQS.Queue object to. + :return: SQS.Queue + An SQS.Queue resource to use for Queue operations. """ - boto_session = boto3.Session(aws_access_key_id=self.cred['sandbox']['access_key'], - aws_secret_access_key=self.cred['sandbox']['secret_key']) - # Get the queue. This returns an SQS.Queue instance - sqs_session = boto_session.resource('sqs', region_name=self.conf['s3_region']) - sqs_queue = sqs_session.Queue(self.conf['sqs_url']) - self.logger.info("Connecting to " + self.conf['sqs_url']) - return sqs_queue - - def receive_messages(self, queue, sqs_max_polls, cb): + return sqs_resource.create_queue(QueueName=sqs_queue_name) + + def receive_messages(self, sqs_queue, sqs_max_polls, cb): """ - Polls for messages in the queue + Polls for messages from an sqs queue - :param queue: boto sqs resource - instance of boto sqs resource given from connect() + :param sqs_queue: boto SQS.Queue object + boto SQS Queue object. Can be generated by the method in this class. :param sqs_max_polls: int number of polls :param cb: function call back function - :return: Dependent on the call back function + :return: If the Message has a Records key then the call back function gets called on the Message. """ self.logger.info("Receive messages") + self.logger.info("Polling %d time(s) for SQS messages" % sqs_max_polls) - i = 1 - while i <= sqs_max_polls: + if sqs_max_polls < 1: + raise ValueError('Max polling value should be greater than 0.') + + for i in range(1, sqs_max_polls+1): self.logger.info("Polling attempt: " + str(i)) - i = i + 1 - sqs_messages = queue.receive_messages(MaxNumberOfMessages=10, WaitTimeSeconds=10) + # boto3 SQS.Queue appears to have a subset of SQS.Client methods plus a few management queue ones. + # The ones they do share seem to have different return types.
+ # The message method names are different and return types different: + # Client.send_message and Queue.send_message and Queue.send_messages + # Client.receive_message and Queue.receive_messages + sqs_messages = sqs_queue.receive_messages( + MaxNumberOfMessages=10, + WaitTimeSeconds=10 + ) self.logger.info("Received %d messages." % len(sqs_messages)) + self.logger.debug("Messages: %s" % sqs_messages) for sqs_message in sqs_messages: try: # Log start time dt_start = datetime.now(tz=timezone.utc) - self.logger.info("Started processing message") + self.logger.info("Starting processing message") + self.logger.debug("Message: %s" % sqs_message) + self.logger.debug("Message body: %s" % sqs_message.body) message_body = json.loads(sqs_message.body) + self.logger.debug("Message body message: %s" % message_body['Message']) message_content = json.loads(message_body['Message']) if 'Records' in message_content: recs = message_content['Records'] - self.logger.info("Received message") - self.logger.debug('Records: ' + str(recs)) + self.logger.debug('Message "Records": %s' % recs) + cb(recs, self.log_level) else: - self.logger.info("s3 event without records content received.") - - sqs_message.delete() - - self.logger.info("The SQS message has been deleted.") + self.logger.info("s3 event message without 'Records' content received.") dt_end = datetime.now(tz=timezone.utc) processing_time = dt_end - dt_start + self.logger.info("Completed processing the message in %s seconds."%(processing_time.microseconds / 1000000)) - self.logger.info("Completed processing message (s):" + str(processing_time.microseconds * 1000)) - cb(recs) - + sqs_message.delete() + self.logger.info("The SQS message has been deleted.") except: self.logger.exception( "An exception was thrown while processing a message, but this program will continue. The " - "message will not be deleted from the SQS queue. The message was: %s" % sqs_message.body) + "message will not be deleted from the SQS queue. The message was: %s" % sqs_message) diff --git a/onestop-python-client/onestop/util/SqsHandlers.py b/onestop-python-client/onestop/util/SqsHandlers.py index 57be8da..894f8b5 100644 --- a/onestop-python-client/onestop/util/SqsHandlers.py +++ b/onestop-python-client/onestop/util/SqsHandlers.py @@ -1,3 +1,5 @@ +from onestop.util.ClientLogger import ClientLogger + def create_delete_handler(web_publisher): """ Creates a delete function handler to be used with SqsConsumer.receive_messages. 
@@ -7,21 +9,89 @@ def create_delete_handler(web_publisher): :param: web_publisher: WebPublisher object """ - def delete(records): - if records is None: + def delete(records, log_level='INFO'): + + logger = ClientLogger.get_logger('SqsHandlers.create_delete_handler.delete', log_level, False) + logger.info("In create_delete_handler.delete() handler") + logger.debug("Records: %s"%records) + + if not records or records is None: + logger.info("Ending handler, records empty, records=%s"%records) return + record = records[0] if record['eventName'] != 'ObjectRemoved:Delete': + logger.info("Ending handler, eventName=%s"%record['eventName']) return + bucket = record['s3']['bucket']['name'] s3_key = record['s3']['object']['key'] s3_url = "s3://" + bucket + "/" + s3_key payload = '{"queries":[{"type": "fieldQuery", "field": "links.linkUrl", "value": "' + s3_url + '"}] }' search_response = web_publisher.search_onestop('granule', payload) + logger.debug('OneStop search response=%s'%search_response) response_json = search_response.json() + logger.debug('OneStop search response json=%s'%response_json) + logger.debug('OneStop search response data=%s'%response_json['data']) if len(response_json['data']) != 0: granule_uuid = response_json['data'][0]['id'] response = web_publisher.delete_registry('granule', granule_uuid) + logger.debug('web_publisher.delete_registry response: %s'%response) return response + logger.warning("OneStop search response has no 'data' field. Response=%s"%response_json) + return delete + +def create_upload_handler(web_publisher, s3_utils, s3_message_adapter): + """ + Creates a upload function handler to be used with SqsConsumer.receive_messages. + + The upload handler function checks the object for a UUID and if one is not found, it will create one for it. 
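# (Editorial aside, not part of the patch.) A sketch of how these handler factories are
# expected to be combined with the reworked SqsConsumer. The configuration values, the
# region, and the queue name are placeholder assumptions lifted from the test fixtures
# elsewhere in this change, not a prescribed deployment.
from onestop.WebPublisher import WebPublisher
from onestop.util.S3Utils import S3Utils
from onestop.util.S3MessageAdapter import S3MessageAdapter
from onestop.util.SqsConsumer import SqsConsumer
from onestop.util.SqsHandlers import create_delete_handler, create_upload_handler

config = {
    'access_key': 'an_access_key',
    'secret_key': 'a_secret_key',
    'access_bucket': 'https://archive-testing-demo.s3-us-east-2.amazonaws.com',
    'type': 'COLLECTION',
    'file_id_prefix': 'gov.noaa.ncei.csb:',
    'collection_id': 'fdb56230-87f4-49f2-ab83-104cfd073177',
    'registry_base_url': 'http://localhost/onestop/api/registry',
    'registry_username': 'a_user',
    'registry_password': 'a_password',
    'onestop_base_url': 'http://localhost/onestop/api/search/search',
    'log_level': 'INFO',
}

web_publisher = WebPublisher(**config)
s3_utils = S3Utils(**config)
s3_message_adapter = S3MessageAdapter(**config)

# Either handler can be handed to receive_messages as the callback.
delete_handler = create_delete_handler(web_publisher)
upload_handler = create_upload_handler(web_publisher, s3_utils, s3_message_adapter)

# The consumer no longer builds its own boto3 session; it is given an SQS resource.
consumer = SqsConsumer(log_level=config['log_level'])
queue = consumer.connect(s3_utils.connect('resource', 'sqs', 'us-east-2'), 'example-queue-name')
consumer.receive_messages(queue, sqs_max_polls=2, cb=delete_handler)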
+ + :param: web_publisher: WebPublisher object + :param: s3_utils: S3Utils object + :param: s3ma: S3MessageAdapter object + + """ + def upload(records, log_level='INFO'): + logger = ClientLogger.get_logger('SqsHandlers.create_upload_handler.upload', log_level, False) + logger.info("In create_upload_handler.upload() handler") + logger.debug("Records: %s"%records) + + rec = records[0] + s3_key = rec['s3']['object']['key'] + logger.info("Received message for " + s3_key) + logger.info("Event type: " + rec['eventName']) + bucket = rec['s3']['bucket']['name'] + logger.info("BUCKET: %s"%bucket) + s3_resource = s3_utils.connect("s3_resource", None) + + # Fetch the object to get the uuid + object_uuid = s3_utils.get_uuid_metadata(s3_resource, bucket, s3_key) + if object_uuid is not None: + logger.info("Retrieved object-uuid: %s"%object_uuid) + else: + logger.info("Adding uuid") + # Can't add uuid to glacier and should be copied over + if "backup" not in bucket: + object_uuid = s3_utils.add_uuid_metadata(s3_resource, bucket, s3_key) + + # Convert s3 message to IM message + json_payload = s3_message_adapter.transform(records) + logger.debug('transformed message, json_payload: %s'%json_payload) + + # Send the message to registry + payload = json_payload.serialize() + method = 'PATCH' # Backup location should be patched if not backup within bucket name + if "backup" not in bucket: + method = 'POST' + + logger.debug('web_publisher.publish_registry method using "%s" with payload %s'%(method,payload)) + registry_response = web_publisher.publish_registry("granule", object_uuid, payload, method) + logger.debug('web_publisher.publish_registry response=%s'%registry_response) + logger.debug('web_publisher.publish_registry response json=%s'%registry_response.json()) + + return registry_response + + return upload \ No newline at end of file diff --git a/onestop-python-client/requirements.txt b/onestop-python-client/requirements.txt index 735dad7..036e217 100644 --- a/onestop-python-client/requirements.txt +++ b/onestop-python-client/requirements.txt @@ -5,8 +5,9 @@ smart-open PyYAML~=5.3.1 setuptools~=49.2.0 argparse~=1.4.0 -boto3~=1.15.11 +boto~=2.49.0 +boto3~=1.17.71 requests~=2.24.0 -botocore~=1.18.11 -moto==1.3.16.dev122 +botocore~=1.20.71 +moto[all]==2.0.5 undictify diff --git a/onestop-python-client/test/__init__.py b/onestop-python-client/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/onestop-python-client/tests/data/file1.csv b/onestop-python-client/test/data/file1.csv similarity index 100% rename from onestop-python-client/tests/data/file1.csv rename to onestop-python-client/test/data/file1.csv diff --git a/onestop-python-client/tests/data/file1_s3.csv b/onestop-python-client/test/data/file1_s3.csv similarity index 100% rename from onestop-python-client/tests/data/file1_s3.csv rename to onestop-python-client/test/data/file1_s3.csv diff --git a/onestop-python-client/tests/data/file2.csv b/onestop-python-client/test/data/file2.csv similarity index 100% rename from onestop-python-client/tests/data/file2.csv rename to onestop-python-client/test/data/file2.csv diff --git a/onestop-python-client/tests/data/file3.csv b/onestop-python-client/test/data/file3.csv similarity index 100% rename from onestop-python-client/tests/data/file3.csv rename to onestop-python-client/test/data/file3.csv diff --git a/onestop-python-client/tests/data/file4.csv b/onestop-python-client/test/data/file4.csv similarity index 100% rename from onestop-python-client/tests/data/file4.csv rename to 
onestop-python-client/test/data/file4.csv diff --git a/onestop-python-client/test/integration/__init__.py b/onestop-python-client/test/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/onestop-python-client/tests/test_WebPublisher.py b/onestop-python-client/test/integration/test_WebPublisher.py similarity index 83% rename from onestop-python-client/tests/test_WebPublisher.py rename to onestop-python-client/test/integration/test_WebPublisher.py index c81a7de..5c7935a 100644 --- a/onestop-python-client/tests/test_WebPublisher.py +++ b/onestop-python-client/test/integration/test_WebPublisher.py @@ -2,8 +2,10 @@ import json import unittest import time +import os.path from onestop.WebPublisher import WebPublisher +from os import path class WebPublisherTest(unittest.TestCase): wp = None @@ -56,16 +58,26 @@ class WebPublisherTest(unittest.TestCase): def setUpClass(cls): print("Set it up!") - cred_loc = "../config/credentials.yml" - conf_loc = "../config/csb-data-stream-config-template.yml" - - with open(cred_loc) as f: - creds = yaml.load(f, Loader=yaml.FullLoader) - - registry_username = creds['registry']['username'] - registry_password = creds['registry']['password'] - access_key = creds['sandbox']['access_key'] - access_secret = creds['sandbox']['secret_key'] + cred_loc = "config/credentials.yml" + conf_loc = "config/csb-data-stream-config-template.yml" + + if path.exists(cred_loc): + with open(cred_loc) as f: + creds = yaml.load(f, Loader=yaml.FullLoader) + + registry_username = creds['registry']['username'] + registry_password = creds['registry']['password'] + access_key = creds['sandbox']['access_key'] + access_secret = creds['sandbox']['secret_key'] + else: + print("Credentials file doesn't exist at '%s', using environment variables."%cred_loc) + registry_username = os.environ.get('REGISTRY_USERNAME') + registry_password = os.environ.get("REGISTRY_PASSWORD") + access_key = os.environ.get("ACCESS_KEY") + access_secret = os.environ.get("SECRET_KEY") + if registry_username == None: + msg = "REGISTRY_USERNAME not defined as env variable. Credentials file at '" + cred_loc + "' doesn't exist." + raise Exception(msg) with open(conf_loc) as f: conf = yaml.load(f, Loader=yaml.FullLoader) diff --git a/onestop-python-client/test/unit/__init__.py b/onestop-python-client/test/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/onestop-python-client/test/unit/extractor/__init__.py b/onestop-python-client/test/unit/extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/onestop-python-client/test/unit/extractor/test_CsbExtractor.py b/onestop-python-client/test/unit/extractor/test_CsbExtractor.py new file mode 100644 index 0000000..cba1bf7 --- /dev/null +++ b/onestop-python-client/test/unit/extractor/test_CsbExtractor.py @@ -0,0 +1,119 @@ +import unittest +import os + +from moto import mock_s3 +from onestop.extract.CsbExtractor import CsbExtractor +from onestop.util.S3Utils import S3Utils + +class CsbExtractorTest(unittest.TestCase): + + def setUp(self): + print("Set it up!") + self.root_proj_path = os.getcwd() + self.assertIsNotNone(self.root_proj_path) + self.data_file_path = os.getcwd() + '/test/data/file4.csv' + self.key = "file4.csv" + # Use open instead of our method because we aren't testing our code here. 
+ self.file_obj = open(self.data_file_path) + + config_dict = { + "access_key": "test_access_key", + "secret_key": "test_secret_key", + "log_level": "DEBUG" + } + + self.s3_utils = S3Utils(**config_dict) + self.bucket = "bucket" + self.region = "region" + + def tearDown(self): + print("Tear it down!") + self.file_obj.close() + + def test_is_csv(self): + self.assertTrue(CsbExtractor.is_csv("test/blah/file.csv"), "Failed to determine a csv file name was a csv file.") + + def test_is_not_csv(self): + self.assertFalse(CsbExtractor.is_csv("test/blah/file.txt"), "Failed to determine a csv file name was not a csv file.") + + @mock_s3 + def test_csb_SME_user_path(self): + # Setup bucket and file to read + s3 = self.s3_utils.connect('client', 's3', self.region) + s3.create_bucket(Bucket=self.bucket, CreateBucketConfiguration={'LocationConstraint': self.region}) + self.s3_utils.upload_s3(s3, self.data_file_path, self.bucket, self.key, True) + self.assertTrue(self.s3_utils.read_bytes_s3(s3, self.bucket, self.key)) + + # This is how we would expect an external user to get the file. + sm_open_file = self.s3_utils.get_csv_s3(self.s3_utils.connect('session', None, self.region), self.bucket, self.key) + + bounds_dict = CsbExtractor.get_spatial_temporal_bounds(sm_open_file, 'LON', 'LAT', 'TIME') + coords = bounds_dict["geospatial"] + self.assertEqual(coords[0], -96.847995) + self.assertEqual(coords[1], 29.373065) + self.assertEqual(coords[2], -92.747995) + self.assertEqual(coords[3], 33.373065) + + date_rng = bounds_dict["temporal"] + self.assertEqual(date_rng[0], '2018-04-10T14:00:06.000Z' ) + self.assertEqual(date_rng[1], '2020-04-10T14:00:06.000Z' ) + + def test_get_geospatial_temporal_bounds(self): + bounds_dict = CsbExtractor.get_spatial_temporal_bounds(self.file_obj, 'LON', 'LAT', 'TIME') + + coords = bounds_dict["geospatial"] + self.assertEqual(coords[0], -96.847995) + self.assertEqual(coords[1], 29.373065) + self.assertEqual(coords[2], -92.747995) + self.assertEqual(coords[3], 33.373065) + + date_rng = bounds_dict["temporal"] + self.assertEqual(date_rng[0], '2018-04-10T14:00:06.000Z' ) + self.assertEqual(date_rng[1], '2020-04-10T14:00:06.000Z' ) + + def test_get_min_lon(self): + bounds_dict = CsbExtractor.get_spatial_temporal_bounds(self.file_obj, 'LON', 'LAT', 'TIME') + + coords = bounds_dict["geospatial"] + min_lon = coords[0] + self.assertEqual(min_lon, -96.847995) + + def test_get_max_datetime(self): + + bounds_dict = CsbExtractor.get_spatial_temporal_bounds(self.file_obj, 'LON', 'LAT', 'TIME') + + date_rng = bounds_dict["temporal"] + end_date = date_rng[1] + self.assertEqual(end_date, '2020-04-10T14:00:06.000Z') + + def test_get_min_datetime(self): + bounds_dict = CsbExtractor.get_spatial_temporal_bounds(self.file_obj, 'LON', 'LAT', 'TIME') + + date_rng = bounds_dict["temporal"] + begin_date = date_rng[0] + self.assertEqual(begin_date, '2018-04-10T14:00:06.000Z') + + def test_extract_coords(self): + coords = CsbExtractor.extract_coords(self.file_obj, -92.747995, 33.373065, -96.847995, 29.373065) + result = [[ + -94.847995, + 29.373065 + ], + [ + -96.847995, + 29.373065 + ], + [ + -94.847995, + 33.373065 + ], + [ + -92.747995, + 29.383065 + ] + ] + self.assertEqual(coords, result) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/test_KafkaConsumer.py b/onestop-python-client/test/unit/test_KafkaConsumer.py new file mode 100644 index 0000000..b119e9a --- /dev/null +++ b/onestop-python-client/test/unit/test_KafkaConsumer.py 
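# (Editorial aside, not part of the patch.) The new unit tests that follow mock out the
# schema registry and the deserializing consumer; for orientation, this is the call flow
# they exercise. The broker, registry, and topic values are copied from the test fixtures
# and are placeholders, not working endpoints.
from onestop.KafkaConsumer import KafkaConsumer

conf = {
    'metadata_type': 'GRANULE',
    'brokers': 'onestop-dev-cp-kafka:9092',
    'group_id': 'sme-test',
    'auto_offset_reset': 'earliest',
    'schema_registry': 'http://onestop-dev-cp-schema-registry:8081',
    'security': {'enabled': False},
    'collection_topic_consume': 'psi-granule-input-unknown',
    'granule_topic_consume': 'psi-granule-input-unknown',
    'log_level': 'INFO',
}

def handle_message(key, value):
    # Invoked once per successfully deserialized Kafka message.
    print(key, value)

consumer = KafkaConsumer(**conf)
registry_client = consumer.register_client()
metadata_consumer = consumer.create_consumer(registry_client)
consumer.consume(metadata_consumer, handle_message)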
@@ -0,0 +1,287 @@ +import unittest + +from unittest.mock import ANY, patch, MagicMock, call +from onestop.KafkaConsumer import KafkaConsumer +from confluent_kafka.schema_registry import SchemaRegistryClient + +class test_KafkaConsumer(unittest.TestCase): + kp = None + conf_w_security = None + conf_wo_security = None + + @classmethod + def setUp(cls): + print("Set it up!") + cls.conf_w_security = { + "metadata_type" : "GRANULE", + "brokers" : "onestop-dev-cp-kafka:9092", + "group_id" : "sme-test", + "auto_offset_reset" : "earliest", + "schema_registry" : "http://onestop-dev-cp-schema-registry:8081", + "security" : { + "enabled" : True, + "caLoc" : "/etc/pki/tls/cert.pem", + "keyLoc" : "/etc/pki/tls/private/kafka-user.key", + "certLoc" : "/etc/pki/tls/certs/kafka-user.crt" + }, + "collection_topic_consume" : "psi-collection-input-unknown", + "granule_topic_consume" : "psi-granule-input-unknown", + "log_level" : "DEBUG" + } + cls.conf_wo_security = dict(cls.conf_w_security) + # Remove security credential section. + cls.conf_wo_security['security'] = { + "enabled":False + } + + @classmethod + def tearDown(self): + print("Tear it down!") + + def test_init_happy_nonconditional_params(self): + consumer = KafkaConsumer(**self.conf_w_security) + + self.assertEqual(consumer.metadata_type, self.conf_w_security['metadata_type']) + self.assertEqual(consumer.brokers, self.conf_w_security['brokers']) + self.assertEqual(consumer.group_id, self.conf_w_security['group_id']) + self.assertEqual(consumer.auto_offset_reset, self.conf_w_security['auto_offset_reset']) + self.assertEqual(consumer.schema_registry, self.conf_w_security['schema_registry']) + self.assertEqual(consumer.security_enabled, self.conf_w_security['security']['enabled']) + self.assertEqual(consumer.collection_topic, self.conf_w_security['collection_topic_consume']) + self.assertEqual(consumer.granule_topic, self.conf_w_security['granule_topic_consume']) + + def test_init_security_enabled(self): + consumer = KafkaConsumer(**self.conf_w_security) + + self.assertEqual(consumer.security_caLoc, self.conf_w_security['security']['caLoc']) + self.assertEqual(consumer.security_keyLoc, self.conf_w_security['security']['keyLoc']) + self.assertEqual(consumer.security_certLoc, self.conf_w_security['security']['certLoc']) + + def test_init_security_disabled(self): + consumer = KafkaConsumer(**self.conf_wo_security) + + self.assertRaises(AttributeError, getattr, consumer, "security_caLoc") + self.assertRaises(AttributeError, getattr, consumer, "security_keyLoc") + self.assertRaises(AttributeError, getattr, consumer, "security_certLoc") + + def test_init_metadata_type_valid(self): + consumer = KafkaConsumer(**self.conf_w_security) + + self.assertEqual(consumer.metadata_type, self.conf_w_security['metadata_type']) + + def test_init_metadata_type_invalid(self): + wrong_metadata_type_config = dict(self.conf_w_security) + wrong_metadata_type_config['metadata_type'] = "invalid_type" + + self.assertRaises(ValueError, KafkaConsumer, **wrong_metadata_type_config) + + def test_init_extra_params(self): + conf = dict(self.conf_wo_security) + conf['junk_key'] = 'junk_value' + KafkaConsumer(**conf) + + @patch.object(SchemaRegistryClient, '__init__', autospec=True) + def test_register_client_w_security(self, mock_client): + exp_security_conf = { + 'url':self.conf_w_security['schema_registry'], + 'ssl.ca.location': self.conf_w_security['security']['caLoc'], + 'ssl.key.location': self.conf_w_security['security']['keyLoc'], + 'ssl.certificate.location': 
self.conf_w_security['security']['certLoc'] + } + mock_client.return_value = None + + consumer = KafkaConsumer(**self.conf_w_security) + consumer.register_client() + + mock_client.assert_called() + mock_client.assert_called_with(ANY, exp_security_conf) + + @patch.object(SchemaRegistryClient, '__init__', autospec=True) + def test_register_client_wo_security(self, mock_client): + exp_security_conf = { + 'url':self.conf_w_security['schema_registry'], + 'ssl.ca.location': self.conf_w_security['security']['caLoc'], + 'ssl.key.location': self.conf_w_security['security']['keyLoc'], + 'ssl.certificate.location': self.conf_w_security['security']['certLoc'] + } + mock_client.return_value = None + + consumer = KafkaConsumer(**self.conf_wo_security) + consumer.register_client() + try: + mock_client.assert_called_with(ANY, exp_security_conf) + except: + return + raise AssertionError('Expected register_client() to not have been called with security arguments.') + + @patch('onestop.KafkaConsumer.AvroDeserializer') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_create_consumer_calls_AvroDeserializer(self, mock_deserializing_consumer, mock_avro_deserializer): + conf_w_security_collection = dict(self.conf_w_security) + conf_w_security_collection['metadata_type'] = "COLLECTION" + + consumer = KafkaConsumer(**conf_w_security_collection) + reg_client = consumer.register_client() + reg_client.get_latest_version = MagicMock() + deser_consumer = consumer.create_consumer(reg_client) + + # Verify AvroDeserializer called with expected registry client + mock_avro_deserializer.assert_called_with(ANY, reg_client) + + self.assertIsNotNone(deser_consumer) + + @patch('onestop.KafkaConsumer.AvroDeserializer') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_create_consumer_collection_w_security(self, mock_deserializing_consumer, mock_avro_deserializer): + conf_w_security_collection = dict(self.conf_w_security) + topic = conf_w_security_collection['collection_topic_consume'] + conf_w_security_collection['metadata_type'] = 'COLLECTION' + + consumer = KafkaConsumer(**conf_w_security_collection) + reg_client = MagicMock() + deser_consumer = consumer.create_consumer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify security passed into DeserializingConsumer + mock_deserializing_consumer.assert_called_with( + { + 'bootstrap.servers': conf_w_security_collection['brokers'], + 'security.protocol': 'SSL', + 'ssl.ca.location': conf_w_security_collection['security']['caLoc'], + 'ssl.key.location': conf_w_security_collection['security']['keyLoc'], + 'ssl.certificate.location': conf_w_security_collection['security']['certLoc'], + 'key.deserializer': ANY, + 'value.deserializer': ANY, + 'group.id': conf_w_security_collection['group_id'], + 'auto.offset.reset': conf_w_security_collection['auto_offset_reset'] + }) + mock_deserializing_consumer.return_value.subscribe.assert_called_with([topic]) + + self.assertIsNotNone(deser_consumer) + + @patch('onestop.KafkaConsumer.AvroDeserializer') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_create_consumer_collection_wo_security(self, mock_deserializing_consumer, mock_avro_deserializer): + conf_wo_security_collection = dict(self.conf_wo_security) + topic = conf_wo_security_collection['collection_topic_consume'] + conf_wo_security_collection['metadata_type'] = 'COLLECTION' + + consumer = 
KafkaConsumer(**conf_wo_security_collection) + reg_client = MagicMock() + deser_consumer = consumer.create_consumer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify no security passed into DeserializingConsumer + mock_deserializing_consumer.assert_called_with( + { + 'bootstrap.servers': conf_wo_security_collection['brokers'], + 'key.deserializer': ANY, + 'value.deserializer': ANY, + 'group.id': conf_wo_security_collection['group_id'], + 'auto.offset.reset': conf_wo_security_collection['auto_offset_reset'] + }) + mock_deserializing_consumer.return_value.subscribe.assert_called_with([topic]) + + self.assertIsNotNone(deser_consumer) + + @patch('onestop.KafkaConsumer.AvroDeserializer') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_create_consumer_granule_w_security(self, mock_deserializing_consumer, mock_avro_deserializer): + conf_w_security_granule = dict(self.conf_w_security) + topic = conf_w_security_granule['granule_topic_consume'] + conf_w_security_granule['metadata_type'] = 'GRANULE' + + consumer = KafkaConsumer(**conf_w_security_granule) + reg_client = MagicMock() + deser_consumer = consumer.create_consumer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify security passed into DeserializingConsumer + mock_deserializing_consumer.assert_called_with( + { + 'bootstrap.servers': conf_w_security_granule['brokers'], + 'security.protocol': 'SSL', + 'ssl.ca.location': conf_w_security_granule['security']['caLoc'], + 'ssl.key.location': conf_w_security_granule['security']['keyLoc'], + 'ssl.certificate.location': conf_w_security_granule['security']['certLoc'], + 'key.deserializer': ANY, + 'value.deserializer': ANY, + 'group.id': conf_w_security_granule['group_id'], + 'auto.offset.reset': conf_w_security_granule['auto_offset_reset'] + }) + mock_deserializing_consumer.return_value.subscribe.assert_called_with([topic]) + + self.assertIsNotNone(deser_consumer) + + @patch('onestop.KafkaConsumer.AvroDeserializer') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_create_consumer_granule_wo_security(self, mock_deserializing_consumer, mock_avro_deserializer): + conf_wo_security_granule = dict(self.conf_wo_security) + exp_topic = conf_wo_security_granule['granule_topic_consume'] + conf_wo_security_granule['metadata_type'] = 'GRANULE' + + consumer = KafkaConsumer(**conf_wo_security_granule) + reg_client = MagicMock() + deser_consumer = consumer.create_consumer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(exp_topic + '-value') + + # Verify no security passed into DeserializingConsumer called with expected configuration + mock_deserializing_consumer.assert_called_with( + { + 'bootstrap.servers': conf_wo_security_granule['brokers'], + 'key.deserializer': ANY, + 'value.deserializer': ANY, + 'group.id': conf_wo_security_granule['group_id'], + 'auto.offset.reset': conf_wo_security_granule['auto_offset_reset'] + }) + mock_deserializing_consumer.return_value.subscribe.assert_called_with([exp_topic]) + + self.assertIsNotNone(deser_consumer) + + def test_connect(self): + mock_client = MagicMock() + + consumer = KafkaConsumer(**self.conf_w_security) + consumer.register_client = MagicMock(return_value=mock_client) + 
consumer.create_consumer = MagicMock(return_value=MagicMock(mock_client)) + consumer.connect() + + consumer.register_client.assert_called_once() + consumer.create_consumer.assert_called_with(mock_client) + + @patch('confluent_kafka.cimpl.Message') + @patch('onestop.KafkaConsumer.DeserializingConsumer') + def test_consume(self, mock_metadata_consumer, mock_message): + mock_message_key = 'key1' + mock_message_value = 'value1' + consumer = KafkaConsumer(**self.conf_w_security) + consumer.register_client = MagicMock(return_value=MagicMock()) + mock_message.key.return_value = mock_message_key + mock_message.value.return_value = mock_message_value + mock_metadata_consumer.poll.side_effect = [None, mock_message, Exception] + mock_handler = MagicMock() + + # Would have liked not having the try/catch but it wasn't ignoring the exception. Just need to not fail due to end of loop. + try: + self.assertRaises(Exception, consumer.consume(mock_metadata_consumer, mock_handler)) + except Exception as e: + print("Ignoring exception: {}".format(e)) + + # Verify kafka consumer poll called expected number of times + self.assertEqual(mock_metadata_consumer.poll.call_count, 3) + mock_metadata_consumer.poll.assert_has_calls([call(10), call(10), call(10)]) + + # Verify callback function was called once with expected message attributes + mock_handler.assert_called_once() + mock_handler.assert_called_with(mock_message_key, mock_message_value) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/test_KafkaPublisher.py b/onestop-python-client/test/unit/test_KafkaPublisher.py new file mode 100644 index 0000000..1c9497b --- /dev/null +++ b/onestop-python-client/test/unit/test_KafkaPublisher.py @@ -0,0 +1,335 @@ +import unittest +import json + +from onestop.KafkaPublisher import KafkaPublisher +from unittest.mock import ANY, patch, MagicMock +from confluent_kafka.schema_registry import SchemaRegistryClient + +class test_KafkaPublisher(unittest.TestCase): + kp = None + conf_w_security = None + conf_wo_security = None + + @classmethod + def setUp(cls): + print("Set it up!") + cls.conf_w_security = { + "metadata_type" : "GRANULE", + "brokers" : "onestop-dev-cp-kafka:9092", + "schema_registry" : "http://onestop-dev-cp-schema-registry:8081", + "security" : { + "enabled" : True, + "caLoc" : "/etc/pki/tls/cert.pem", + "keyLoc" : "/etc/pki/tls/private/kafka-user.key", + "certLoc" : "/etc/pki/tls/certs/kafka-user.crt" + }, + "collection_topic_publish" : "psi-collection-input-unknown", + "granule_topic_publish" : "psi-granule-input-unknown", + "log_level" : "DEBUG" + } + cls.conf_wo_security = dict(cls.conf_w_security) + # Remove security credential section. 
+ cls.conf_wo_security['security'] = { + "enabled":False + } + + @classmethod + def tearDown(self): + print("Tear it down!") + + def test_init_happy_nonconditional_params(self): + publisher = KafkaPublisher(**self.conf_w_security) + + self.assertEqual(publisher.metadata_type, self.conf_w_security['metadata_type']) + self.assertEqual(publisher.brokers, self.conf_w_security['brokers']) + self.assertEqual(publisher.schema_registry, self.conf_w_security['schema_registry']) + self.assertEqual(publisher.security_enabled, self.conf_w_security['security']['enabled']) + self.assertEqual(publisher.collection_topic, self.conf_w_security['collection_topic_publish']) + self.assertEqual(publisher.granule_topic, self.conf_w_security['granule_topic_publish']) + + def test_init_security_enabled(self): + publisher = KafkaPublisher(**self.conf_w_security) + + self.assertEqual(publisher.security_caLoc, self.conf_w_security['security']['caLoc']) + self.assertEqual(publisher.security_keyLoc, self.conf_w_security['security']['keyLoc']) + self.assertEqual(publisher.security_certLoc, self.conf_w_security['security']['certLoc']) + + def test_init_security_disabled(self): + publisher = KafkaPublisher(**self.conf_wo_security) + + self.assertRaises(AttributeError, getattr, publisher, "security_caLoc") + self.assertRaises(AttributeError, getattr, publisher, "security_keyLoc") + self.assertRaises(AttributeError, getattr, publisher, "security_certLoc") + + def test_init_metadata_type_valid(self): + publisher = KafkaPublisher(**self.conf_w_security) + + self.assertEqual(publisher.metadata_type, self.conf_w_security['metadata_type']) + + def test_init_metadata_type_invalid(self): + wrong_metadata_type_config = dict(self.conf_w_security) + wrong_metadata_type_config['metadata_type'] = "invalid_type" + + self.assertRaises(ValueError, KafkaPublisher, **wrong_metadata_type_config) + + def test_init_extra_params(self): + conf = dict(self.conf_wo_security) + conf['junk_key'] = 'junk_value' + KafkaPublisher(**conf) + + @patch.object(SchemaRegistryClient, '__init__', autospec=True) + def test_register_client_w_security(self, mock_client): + exp_security_conf = { + 'url':self.conf_w_security['schema_registry'], + 'ssl.ca.location': self.conf_w_security['security']['caLoc'], + 'ssl.key.location': self.conf_w_security['security']['keyLoc'], + 'ssl.certificate.location': self.conf_w_security['security']['certLoc'] + } + mock_client.return_value = None + + publisher = KafkaPublisher(**self.conf_w_security) + publisher.register_client() + + mock_client.assert_called() + mock_client.assert_called_with(ANY, exp_security_conf) + + @patch.object(SchemaRegistryClient, '__init__', autospec=True) + def test_register_client_wo_security(self, mock_client): + exp_security_conf = { + 'url':self.conf_w_security['schema_registry'], + 'ssl.ca.location': self.conf_w_security['security']['caLoc'], + 'ssl.key.location': self.conf_w_security['security']['keyLoc'], + 'ssl.certificate.location': self.conf_w_security['security']['certLoc'] + } + mock_client.return_value = None + + publisher = KafkaPublisher(**self.conf_wo_security) + publisher.register_client() + try: + mock_client.assert_called_with(ANY, exp_security_conf) + except: + return + raise AssertionError('Expected register_client() to not have been called with security arguments.') + + @patch('onestop.KafkaPublisher.AvroSerializer') + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_create_producer_calls_AvroSerializer(self, mock_serializing_publisher, mock_avro_serializer): + 
conf_w_security_collection = dict(self.conf_w_security) + conf_w_security_collection['metadata_type'] = "COLLECTION" + + publisher = KafkaPublisher(**conf_w_security_collection) + reg_client = publisher.register_client() + reg_client.get_latest_version = MagicMock() + publisher.create_producer(reg_client) + + # Verify AvroSerializer called with expected registry client + mock_avro_serializer.assert_called_with(ANY, reg_client) + + @patch('onestop.KafkaPublisher.AvroSerializer') + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_create_producer_collection_w_security(self, mock_serializing_producer, mock_avro_serializer): + conf_w_security_collection = dict(self.conf_w_security) + topic = conf_w_security_collection['collection_topic_publish'] + conf_w_security_collection['metadata_type'] = 'COLLECTION' + + publisher = KafkaPublisher(**conf_w_security_collection) + reg_client = MagicMock() + prod = publisher.create_producer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify security passed into SerializingProducer + mock_serializing_producer.assert_called_with( + { + 'bootstrap.servers': conf_w_security_collection['brokers'], + 'security.protocol': 'SSL', + 'ssl.ca.location': conf_w_security_collection['security']['caLoc'], + 'ssl.key.location': conf_w_security_collection['security']['keyLoc'], + 'ssl.certificate.location': conf_w_security_collection['security']['certLoc'], + 'value.serializer': ANY, + }) + + self.assertIsNotNone(prod) + + @patch('onestop.KafkaPublisher.AvroSerializer') + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_create_producer_collection_wo_security(self, mock_serializing_producer, mock_avro_serializer): + conf_wo_security_collection = dict(self.conf_wo_security) + topic = conf_wo_security_collection['collection_topic_publish'] + conf_wo_security_collection['metadata_type'] = 'COLLECTION' + + publisher = KafkaPublisher(**conf_wo_security_collection) + reg_client = MagicMock() + prod = publisher.create_producer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify no security passed into SerializingProducer + mock_serializing_producer.assert_called_with( + { + 'bootstrap.servers': conf_wo_security_collection['brokers'], + 'value.serializer': ANY, + }) + + self.assertIsNotNone(prod) + + @patch('onestop.KafkaPublisher.AvroSerializer') + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_create_producer_granule_w_security(self, mock_serializing_producer, mock_avro_serializer): + conf_w_security_granule = dict(self.conf_w_security) + topic = conf_w_security_granule['granule_topic_publish'] + conf_w_security_granule['metadata_type'] = 'GRANULE' + + publisher = KafkaPublisher(**conf_w_security_granule) + reg_client = MagicMock() + prod = publisher.create_producer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(topic + '-value') + + # Verify security passed into SerializingProducer + mock_serializing_producer.assert_called_with( + { + 'bootstrap.servers': conf_w_security_granule['brokers'], + 'security.protocol': 'SSL', + 'ssl.ca.location': conf_w_security_granule['security']['caLoc'], + 'ssl.key.location': conf_w_security_granule['security']['keyLoc'], + 'ssl.certificate.location': 
conf_w_security_granule['security']['certLoc'], + 'value.serializer': ANY, + }) + + self.assertIsNotNone(prod) + + @patch('onestop.KafkaPublisher.AvroSerializer') + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_create_producer_granule_wo_security(self, mock_serializing_producer, mock_avro_serializer): + conf_wo_security_granule = dict(self.conf_wo_security) + exp_topic = conf_wo_security_granule['granule_topic_publish'] + conf_wo_security_granule['metadata_type'] = 'GRANULE' + + publisher = KafkaPublisher(**conf_wo_security_granule) + reg_client = MagicMock() + prod = publisher.create_producer(reg_client) + + # Verify metadata type was taken into consideration for getting topic information + reg_client.get_latest_version.assert_called_with(exp_topic + '-value') + + # Verify no security passed into SerializingProducer called with expected configuration + mock_serializing_producer.assert_called_with( + { + 'bootstrap.servers': conf_wo_security_granule['brokers'], + 'value.serializer': ANY, + }) + + self.assertIsNotNone(prod) + + def test_connect(self): + mock_client = MagicMock() + + publisher = KafkaPublisher(**self.conf_w_security) + publisher.register_client = MagicMock(return_value=mock_client) + publisher.create_producer = MagicMock(return_value=MagicMock(mock_client)) + publisher.connect() + + publisher.register_client.assert_called_once() + publisher.create_producer.assert_called_with(mock_client) + + def test_get_collection_key_from_uuid(self): + expKey = '12345678-1234-5678-1234-567812345678' + for uuid in [ + '{12345678-1234-5678-1234-567812345678}', + '12345678123456781234567812345678', + 'urn:uuid:12345678-1234-5678-1234-567812345678', + b'\x12\x34\x56\x78'*4, +# b'\x78\x56\x34\x12\x34\x12\x78\x56' + b'\x12\x34\x56\x78\x12\x34\x56\x78', +# {0x12345678, 0x1234, 0x5678, 0x12, 0x34, 0x567812345678}, +# 0x12345678123456781234567812345678, + ]: + with self.subTest(uuid=uuid): + print ("Testing uuid "+str(uuid)) + key = KafkaPublisher.get_collection_key_from_uuid(uuid) + print("Acquired uuid="+str(key)) + self.assertEqual(key, expKey) + + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_publish_collection(self, mock_collection_producer): + uuid = '{12345678-1234-5678-1234-567812345678}' + content_dict = { + 'title': 'this is a test', + 'location': 'somewhere in space' + } + method = 'PUT' + publisher = KafkaPublisher(**self.conf_w_security) + publisher.register_client = MagicMock(return_value=MagicMock()) + mock_collection_producer.produce = MagicMock() + mock_collection_producer.poll.side_effect = [1] + + publisher.publish_collection(mock_collection_producer, uuid, content_dict, method) + + # Verify kafka produce called once + mock_collection_producer.produce.assert_called_with( + topic=self.conf_w_security['collection_topic_publish'], + value={ + 'type': 'collection', + 'content': json.dumps(content_dict), + 'contentType': 'application/json', + 'method': method, + 'source': 'unknown', + }, + key=publisher.get_collection_key_from_uuid(uuid), + on_delivery=publisher.delivery_report + ) + + # Verify kafka produce poll called once + mock_collection_producer.poll.assert_called_once() + + + @patch('onestop.KafkaPublisher.SerializingProducer') + def test_publish_granule(self, mock_collection_producer): + uuid = '{12345678-1234-5678-1234-567812345678}' + content_dict = { + 'title': 'this is a test', + 'location': 'somewhere in space', + 'relationships': [{"type": "COLLECTION", + "id": '{12345678-1234-5678-1234-567812345678}'}], + 'errors': [], + 'analysis': 
'No analysis', + 'fileLocations': 'archived', + 'fileInformation': 'no information', + 'discovery': 'AWS' + } + publisher = KafkaPublisher(**self.conf_w_security) + publisher.register_client = MagicMock(return_value=MagicMock()) + mock_collection_producer.produce = MagicMock() + mock_collection_producer.poll.side_effect = [1] + + publisher.publish_granule(mock_collection_producer, uuid, content_dict) + + # Verify kafka produce called once + mock_collection_producer.produce.assert_called_with( + topic=self.conf_w_security['granule_topic_publish'], + value={ + 'type': 'granule', + 'content': json.dumps(content_dict), + #'contentType': 'application/json', + 'method': 'PUT', + 'source': 'unknown', + 'operation': None, + 'relationships': content_dict['relationships'], + 'errors': content_dict['errors'], + 'analysis': content_dict['analysis'], + 'fileLocations': {'fileLocation': content_dict['fileLocations']}, + 'fileInformation': content_dict['fileInformation'], + 'discovery': content_dict['discovery'] + }, + key=publisher.get_collection_key_from_uuid(uuid), + on_delivery=publisher.delivery_report + ) + + # Verify kafka produce poll called once + mock_collection_producer.poll.assert_called_once() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/test_SqsHandlers.py b/onestop-python-client/test/unit/test_SqsHandlers.py new file mode 100644 index 0000000..c17b972 --- /dev/null +++ b/onestop-python-client/test/unit/test_SqsHandlers.py @@ -0,0 +1,328 @@ +import json +import unittest + +from unittest import mock +from unittest.mock import patch +from moto import mock_sqs +from test.utils import abspath_from_relative, create_delete_message +from onestop.WebPublisher import WebPublisher +from onestop.util.S3Utils import S3Utils +from onestop.util.S3MessageAdapter import S3MessageAdapter +from onestop.util.SqsConsumer import SqsConsumer +from onestop.util.SqsHandlers import create_delete_handler +from onestop.util.SqsHandlers import create_upload_handler + +class test_SqsHandler(unittest.TestCase): + + def setUp(self): + print("Set it up!") + + self.config_dict = { + 'access_key': 'test_access_key', + 'secret_key': 'test_secret_key', + 'access_bucket': 'https://archive-testing-demo.s3-us-east-2.amazonaws.com', + 'type': 'COLLECTION', + 'file_id_prefix': 'gov.noaa.ncei.csb:', + 'collection_id': 'fdb56230-87f4-49f2-ab83-104cfd073177', + 'registry_base_url': 'http://localhost/onestop/api/registry', + 'registry_username': 'admin', + 'registry_password': 'whoknows', + 'onestop_base_url': 'http://localhost/onestop/api/search/search', + 'log_level': 'DEBUG' + } + + self.wp = WebPublisher(**self.config_dict) + self.s3_utils = S3Utils(**self.config_dict) + self.s3_message_adapter = S3MessageAdapter(**self.config_dict) + self.sqs_consumer = SqsConsumer(**self.config_dict) + + self.sqs_max_polls = 3 + self.region = 'us-east-2' + self.bucket = 'archive-testing-demo' + self.key = 'ABI-L1b-RadF/2019/298/15/OR_ABI-L1b-RadF-M6C15_G16_s20192981500369_e20192981510082_c20192981510166.nc' + + def tearDown(self): + print("Tear it down!") + + def mocked_search_response_data(*args, **kwargs): + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data + + print ("args: "+str(args)+" kwargs: "+str(kwargs)) + onestop_search_response = { + "data":[ + { + "attributes":{ + "serviceLinks":[ + + ], + "citeAsStatements":[ + + ], + "links":[ + { + 
"linkFunction":"download", + "linkUrl":"s3://archive-testing-demo-backup/public/NESDIS/CSB/csv/2019/12/01/20191201_08d5538c6f8dbefd7d82929623a34385_pointData.csv", + "linkName":"Amazon S3", + "linkProtocol":"Amazon:AWS:S3" + }, + { + "linkFunction":"download", + "linkUrl":"https://archive-testing-demo.s3-us-east-2.amazonaws.com/public/NESDIS/CSB/csv/2019/12/01/20191201_08d5538c6f8dbefd7d82929623a34385_pointData.csv", + "linkName":"Amazon S3", + "linkProtocol":"HTTPS" + } + ], + "internalParentIdentifier":"fdb56230-87f4-49f2-ab83-104cfd073177", + "filesize":63751, + "title":"20191201_08d5538c6f8dbefd7d82929623a34385_pointData.csv" + }, + "id":"77b11a1e-1b75-46e1-b7d6-99b5022ed113", + "type":"granule" + } + ], + "meta":{ + "took":1, + "total":6, + "exactCount":True + } + } + return MockResponse(onestop_search_response, 200) + + def mocked_search_response_data_empty(*args, **kwargs): + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data + + print ("args: "+str(args)+" kwargs: "+str(kwargs)) + onestop_search_response = { + "data":[], + "meta":{ + "took":1, + "total":6, + "exactCount":True + } + } + return MockResponse(onestop_search_response, 200) + + @mock_sqs + @mock.patch('requests.get', side_effect=mocked_search_response_data, autospec=True) + @patch('onestop.WebPublisher') + def test_delete_handler_happy(self, mock_wp, mock_response): + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + message = create_delete_message(self.region, self.bucket, self.key) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps(message) + ) + + mock_wp.search_onestop.side_effect = mock_response + cb = create_delete_handler(mock_wp) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify search and delete called once. + mock_wp.search_onestop.assert_called_once() + mock_wp.delete_registry.assert_called_once() + + @mock_sqs + @mock.patch('requests.get', side_effect=mocked_search_response_data_empty, autospec=True) + @patch('onestop.WebPublisher') + def test_delete_handler_data_empty_ends_cb(self, mock_wp, mock_response): + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + message = create_delete_message(self.region, self.bucket, self.key) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps(message) + ) + + mock_wp.search_onestop.side_effect = mock_response + cb = create_delete_handler(mock_wp) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify search and delete called once. 
+ mock_wp.search_onestop.assert_called_once() + mock_wp.delete_registry.assert_not_called() + + @mock_sqs + @mock.patch('requests.get', side_effect=mocked_search_response_data, autospec=True) + @patch('onestop.WebPublisher') + def test_delete_handler_no_records_ends_cb(self, mock_wp, mock_response): + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps({"Message":'''{"Records":[]}'''}) + ) + + mock_wp.search_onestop.side_effect = mock_response + cb = create_delete_handler(mock_wp) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify search and delete called once. + mock_wp.search_onestop.assert_not_called() + mock_wp.delete_registry.assert_not_called() + + @mock_sqs + @mock.patch('requests.get', side_effect=mocked_search_response_data, autospec=True) + @patch('onestop.WebPublisher') + def test_delete_handler_eventName_not_delete_ends_cb(self, mock_wp, mock_response): + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps({"Message":'''{"Records":[{"eventName":"Unknown"}]}'''}) + ) + + mock_wp.search_onestop.side_effect = mock_response + cb = create_delete_handler(mock_wp) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify search and delete called once. 
+ mock_wp.search_onestop.assert_not_called() + mock_wp.delete_registry.assert_not_called() + + @mock_sqs + @patch('onestop.WebPublisher') + @patch('onestop.util.S3Utils') + @patch('onestop.util.S3MessageAdapter') + def test_upload_handler_happy(self, mock_s3_utils, mock_s3_msg_adapter, mock_wp): + bucket = self.bucket + key = self.key + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + message = create_delete_message(self.region, bucket, key) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps(message) + ) + + records = json.loads(message['Message'])['Records'] + records_transformed = mock_s3_msg_adapter.transform(records) + cb = create_upload_handler(mock_wp, mock_s3_utils, mock_s3_msg_adapter) + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify get uuid called + mock_s3_utils.get_uuid_metadata.assert_called_with( + mock_s3_utils.connect('s3_resource', None), + bucket, + key) + # Verify uuid not added + mock_s3_utils.add_uuid_metadata.assert_not_called() + # Verify transform called + mock_s3_msg_adapter.transform.assert_called_with(records) + # Verify publish called + mock_wp.publish_registry.assert_called_with( + 'granule', + mock_s3_utils.get_uuid_metadata(mock_s3_utils.connect('s3_resource', None), bucket, key), + records_transformed.serialize(), + 'POST' + ) + + @mock_sqs + @patch('onestop.WebPublisher') + @patch('onestop.util.S3Utils') + @patch('onestop.util.S3MessageAdapter') + def test_upload_handler_adds_uuid(self, mock_s3_utils, mock_s3_msg_adapter, mock_wp): + bucket = self.bucket + key = self.key + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + message = create_delete_message(self.region, bucket, key) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps(message) + ) + + mock_s3_utils.get_uuid_metadata.return_value = None + cb = create_upload_handler(mock_wp, mock_s3_utils, mock_s3_msg_adapter) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify add uuid called + mock_s3_utils.add_uuid_metadata.assert_called_with( + mock_s3_utils.connect('s3_resource', None), + bucket, + key) + + @mock_sqs + @patch('onestop.WebPublisher') + @patch('onestop.util.S3Utils') + @patch('onestop.util.S3MessageAdapter') + def test_upload_handler_bucket_as_backup_PATCH(self, mock_s3_utils, mock_s3_msg_adapter, mock_wp): + bucket = "testing_backup_bucket" + key = self.key + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.region) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + sqs_queue = sqs_resource.Queue(queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.region) + message = create_delete_message(self.region, bucket, key) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody=json.dumps(message) + ) + + mock_s3_utils.get_uuid_metadata.return_value = None + records = json.loads(message['Message'])['Records'] + records_transformed = mock_s3_msg_adapter.transform(records) + cb = 
create_upload_handler(mock_wp, mock_s3_utils, mock_s3_msg_adapter) + + self.sqs_consumer.receive_messages(sqs_queue, 1, cb) + + # Verify publish called + mock_wp.publish_registry.assert_called_with( + 'granule', + mock_s3_utils.get_uuid_metadata(mock_s3_utils.connect('s3_resource', None), bucket, key), + records_transformed.serialize(), + 'PATCH' + ) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/test_WebPublisher.py b/onestop-python-client/test/unit/test_WebPublisher.py new file mode 100644 index 0000000..af0802f --- /dev/null +++ b/onestop-python-client/test/unit/test_WebPublisher.py @@ -0,0 +1,145 @@ +import json +import unittest + +from unittest.mock import ANY +from unittest import mock +from moto import mock_s3 +from onestop.WebPublisher import WebPublisher + +class test_WebPublisher(unittest.TestCase): + username="admin" + password="a_password" + uuid = "9f0a5ff2-fcc0-5bcb-a225-024b669c9bba" + registry_base_url = "https://localhost/onestop/api/registry" + registry_full_url_granule = registry_base_url + "/metadata/granule/" + uuid + registry_full_url_collection = registry_base_url + "/metadata/collection/" + uuid + onestop_base_url = "https://localhost/onestop/api/search" + + payloadDict = { + "fileInformation": { + "name": "file2.csv", + "size": 1385, + "checksums": [{ + "algorithm": "MD5", + "value": "44d2452e8bc2c8013e9c673086fbab7a" + }] + }, + "relationships": [ + {"type": "COLLECTION", + "id": "fdb56230-87f4-49f2-ab83-104cfd073177" + } + ], + "fileLocations": { + "nesdis-ncei-csb-dev/csv/file2.csv": { + "uri": "https://odp-noaa-nesdis-ncei-test.s3-us-west-2.amazonaws.com/csv/file2.csv", + "type": "ACCESS", + "restricted": False, + "serviceType": "HTTPS", + "asynchronous": False + } + }, + "discovery": { + "title": "file2.csv", + "parentIdentifier": "fdb56230-87f4-49f2-ab83-104cfd073177", + "fileIdentifier": "gov.noaa.ncei.csb:file2" + } + } + + addlocDict = { + "fileLocations": { + "Crt3a-Hq2SGUp8n8QSRNpFIf59kmMONqaKlJ_7-Igd8ijMM62deLdtVkiYwlaePbC4JNCsfeg5i-DWDmwxLIx9V-OGgiQp_CZ0rEFXIZxM_ZPyGu7TTv8wwos5SvAI6xDURhzoCH-w": { + "uri": "/282856304593/vaults/noaa-nesdis-ncei-vault-test/archives/Crt3a-Hq2SGUp8n8QSRNpFIf59kmMONqaKlJ_7-Igd8ijMM62deLdtVkiYwlaePbC4JNCsfeg5i-DWDmwxLIx9V-OGgiQp_CZ0rEFXIZxM_ZPyGu7TTv8wwos5SvAI6xDURhzoCH-w", + "type": "ACCESS", + "restricted": True, + "serviceType": "Amazon:AWS:Glacier", + "asynchronous": True + } + } + } + + + def setUp(self): + print("Set it up!") + + self.wp = WebPublisher(self.registry_base_url, + self.username, + self.password, + self.onestop_base_url, + 'DEBUG') + + def tearDown(self): + print("Tear it down!") + + def mocked_requests_patch(*args, **kwargs): + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data + + print ("args: "+str(args)+" kwargs: "+str(kwargs)) + + return MockResponse({"key1":"value1"}, 200) + + @mock_s3 + @mock.patch('requests.post', side_effect=mocked_requests_patch, autospec=True) + def test_publish(self, mock_get): + payload = json.dumps(self.payloadDict) + self.wp.publish_registry("granule", self.uuid, payload, "POST") + + mock_get.assert_called_with(url = self.registry_full_url_granule, auth = ANY, data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = (self.username, self.password), data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = 
payload, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = False, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = ANY, headers = {'Content-Type': 'application/json'}) + + @mock_s3 + @mock.patch('requests.put', side_effect=mocked_requests_patch, autospec=True) + def test_publish(self, mock_get): + payload = json.dumps(self.payloadDict) + self.wp.publish_registry("granule", self.uuid, payload, "PUT") + + mock_get.assert_called_with(url = self.registry_full_url_granule, auth = ANY, data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = (self.username, self.password), data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = payload, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = False, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = ANY, headers = {'Content-Type': 'application/json'}) + + @mock_s3 + @mock.patch('requests.patch', side_effect=mocked_requests_patch, autospec=True) + def test_add_glacier_location(self, mock_get): + payload = json.dumps(self.addlocDict) + self.wp.publish_registry("granule", self.uuid, payload, "PATCH") + + mock_get.assert_called_with(url = self.registry_full_url_granule, auth = ANY, data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = (self.username, self.password), data = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = payload, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = False, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, data = ANY, verify = ANY, headers = {'Content-Type': 'application/json'}) + + @mock_s3 + @mock.patch('requests.delete', side_effect=mocked_requests_patch, autospec=True) + def test_delete_registry_granule(self, mock_get): + self.wp.delete_registry("granule", self.uuid) + + mock_get.assert_called_with(url = self.registry_full_url_granule, headers = ANY, auth = ANY, verify = ANY) + mock_get.assert_called_with(url = ANY, auth = (self.username, self.password), verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = False, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = ANY, headers = {'Content-Type': 'application/json'}) + + @mock_s3 + @mock.patch('requests.delete', side_effect=mocked_requests_patch, autospec=True) + def test_delete_registry_collection(self, mock_get): + self.wp.delete_registry("collection", self.uuid) + + mock_get.assert_called_with(url = self.registry_full_url_collection, headers = ANY, auth = ANY, verify = ANY) + mock_get.assert_called_with(url = ANY, auth = (self.username, self.password), verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = ANY, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = False, headers = ANY) + mock_get.assert_called_with(url = ANY, auth = ANY, verify = ANY, headers = {'Content-Type': 'application/json'}) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/util/__init__.py b/onestop-python-client/test/unit/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/onestop-python-client/tests/util/S3MessageAdapterTest.py b/onestop-python-client/test/unit/util/test_S3MessageAdapter.py similarity index 73% rename from onestop-python-client/tests/util/S3MessageAdapterTest.py rename to onestop-python-client/test/unit/util/test_S3MessageAdapter.py index 41a8f9d..925be2e 100644 --- a/onestop-python-client/tests/util/S3MessageAdapterTest.py +++ b/onestop-python-client/test/unit/util/test_S3MessageAdapter.py @@ -1,6 +1,6 @@ import unittest + from moto import mock_s3 -from tests.utils import abspath_from_relative from onestop.util.S3Utils import S3Utils from onestop.util.S3MessageAdapter import S3MessageAdapter @@ -51,22 +51,29 @@ class S3MessageAdapterTest(unittest.TestCase): def setUp(self): print("Set it up!") -        self.s3_utils = S3Utils(abspath_from_relative(__file__, "../../config/aws-util-config-dev.yml"), -                                abspath_from_relative(__file__, "../../config/credentials-template.yml")) -        self.s3ma = S3MessageAdapter(abspath_from_relative(__file__, "../../config/csb-data-stream-config-template.yml"), -                                     self.s3_utils) -    def tearDown(self): -        print("Tear it down!") +        config_dict = { +            'access_key': 'test_access_key', +            'secret_key': 'test_secret_key', +            'access_bucket': 'https://archive-testing-demo.s3-us-east-2.amazonaws.com', +            'type': 'COLLECTION', +            'file_id_prefix': 'gov.noaa.ncei.csb:', +            'collection_id': 'fdb56230-87f4-49f2-ab83-104cfd073177', +            'log_level': 'DEBUG' +        } -    def test_parse_config(self): -        self.assertFalse(self.s3ma.conf['collection_id']==None) +        self.s3_utils = S3Utils(**config_dict) +        self.s3ma = S3MessageAdapter(**config_dict) +        self.region = 'us-east-2' + +    def tearDown(self): +        print("Tear it down!") @mock_s3 def test_transform(self): -        s3 = self.s3_utils.connect('s3', self.s3_utils.conf['s3_region']) -        location = {'LocationConstraint': self.s3_utils.conf['s3_region']} +        s3 = self.s3_utils.connect('client', 's3', self.region) +        location = {'LocationConstraint': self.region} bucket = 'nesdis-ncei-csb-dev' key = 'csv/file1.csv' key2 = 'csv/file2.csv' @@ -81,4 +88,14 @@ def test_transform(self): print(payload) self.assertTrue(payload!=None) +    @mock_s3 +    def test_extra_parameters_constructor(self): +        testParams = {"access_bucket": "blah1", +                      "type": "blah2", +                      "file_id_prefix": "blah3", +                      "collection_id": "blah4", +                      "extra": "extra value"} +        self.assertRaises(Exception, S3MessageAdapter, **testParams) +if __name__ == '__main__': +    unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/util/test_S3Utils.py b/onestop-python-client/test/unit/util/test_S3Utils.py new file mode 100644 index 0000000..91b90a3 --- /dev/null +++ b/onestop-python-client/test/unit/util/test_S3Utils.py @@ -0,0 +1,259 @@ +import csv +import unittest +import uuid +import json + +from unittest import mock +from moto import mock_s3, mock_sqs +from moto import mock_glacier +from test.utils import abspath_from_relative +from onestop.util.S3Utils import S3Utils +from boto.glacier.layer1 import Layer1 +from botocore.response import StreamingBody +from io import StringIO + +class S3UtilsTest(unittest.TestCase): + +    def setUp(self): +        print("Set it up!") + +        config_dict = { +            'access_key': 'test_access_key', +            'secret_key': 'test_secret_key', +            'access_bucket': 'https://archive-testing-demo.s3-us-east-2.amazonaws.com', +            'type': 'COLLECTION', +            'file_id_prefix': 'gov.noaa.ncei.csb:', +            'collection_id': 'fdb56230-87f4-49f2-ab83-104cfd073177', +            'log_level': 'DEBUG' +        } + +        self.s3_utils = S3Utils(**config_dict) + +        self.region = 'us-east-2' + 
self.region2 = 'eu-north-1' + self.bucket = 'archive-testing-demo' + + @mock_sqs + def test_connect_session(self): + session = self.s3_utils.connect('Session', None, self.region) + + # No exception is called for unique method call + session.client('sqs') + session.resource('s3') + + @mock_sqs + def test_connect_client(self): + client = self.s3_utils.connect('Client', 'sqs', self.region) + + # No exception is called for unique method call + client.list_queues() + + @mock_sqs + def test_connect_resource(self): + resource = self.s3_utils.connect('Resource', 'sqs', self.region) + + # No exception is called for unique method call + resource.Queue(url='test') + + @mock_sqs + def test_connect_exception_for_invalid_connection_type(self): + with self.assertRaises(Exception): + self.s3_utils.connect('junk', 'sqs', self.region) + + @mock_s3 + def test_get_uuid_metadata(self): + boto_client = self.s3_utils.connect('resource', 's3', None) + s3_key = "csv/file1.csv" + + location = {'LocationConstraint': self.region} + boto_client.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + obj_uuid = str(uuid.uuid4()) + boto_client.Object(self.bucket, s3_key).put(Bucket=self.bucket, Key=s3_key, Body="my_body", Metadata={'object-uuid': obj_uuid}) + + self.assertFalse(self.s3_utils.get_uuid_metadata(boto_client, self.bucket, s3_key) == None) + + @mock_s3 + def test_add_uuid_metadata(self): + boto_client = self.s3_utils.connect('resource', 's3', self.region) + + s3_key = "csv/file1.csv" + + location = {'LocationConstraint': self.region} + boto_client.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + boto_client.Object(self.bucket, s3_key).put(Bucket=self.bucket, Key=s3_key, Body="my_body") + + self.assertTrue(self.s3_utils.add_uuid_metadata(boto_client, self.bucket, s3_key)) + + @mock_s3 + def test_add_file_s3(self): + boto_client = self.s3_utils.connect('client', 's3', None) + local_file = abspath_from_relative(__file__, "../../data/file4.csv") + s3_key = "csv/file4.csv" + location = {'LocationConstraint': self.region} + boto_client.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + overwrite = True + + self.assertTrue(self.s3_utils.upload_s3(boto_client, local_file, self.bucket, s3_key, overwrite)) + + @mock_s3 + def test_get_csv_s3(self): + boto_session = self.s3_utils.connect('session', None, self.region) + s3 = self.s3_utils.connect('client', 's3', self.region) + location = {'LocationConstraint': self.region} + s3_key = "csv/file1.csv" + s3.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + s3.put_object(Bucket=self.bucket, Key=s3_key, Body="body") + + sm_open_file = self.s3_utils.get_csv_s3(boto_session, self.bucket, s3_key) + + # print("reading csv:" + line.decode('utf-8')) + csv_reader = csv.DictReader(sm_open_file) + for row in csv_reader: + print(str(row["LON"])) + + @mock_s3 + def test_read_bytes_s3(self): + boto_client = self.s3_utils.connect('client', 's3', None) + s3_key = "csv/file1.csv" + boto_client.create_bucket(Bucket=self.bucket, CreateBucketConfiguration={'LocationConstraint': self.region}) + boto_client.put_object(Bucket=self.bucket, Key=s3_key, Body="body") + + self.assertTrue(self.s3_utils.read_bytes_s3(boto_client, self.bucket, s3_key)) + + @mock_s3 + def test_add_files(self): + boto_client = self.s3_utils.connect('client', 's3', None) + local_files = ["file1_s3.csv", "file2.csv", "file3.csv"] + location = {'LocationConstraint': self.region} + boto_client.create_bucket(Bucket=self.bucket, 
CreateBucketConfiguration=location) + overwrite = True + + for file in local_files: + local_file = abspath_from_relative(__file__, "../../data/" + file) + s3_file = "csv/" + file + self.assertTrue(self.s3_utils.upload_s3(boto_client, local_file, self.bucket, s3_file, overwrite)) + + @mock_s3 + @mock_glacier + def test_s3_cross_region(self): + print('Cross Region Vault Upload ------------- ') + key = "csv/file1.csv" + + # makes connection to low level s3 client + s3 = self.s3_utils.connect('client', 's3', self.region) + location = {'LocationConstraint': self.region} + s3.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + s3.put_object(Bucket=self.bucket, Key=key, Body="body") + + # Reads object data and stores it into a variable + file_data = self.s3_utils.read_bytes_s3(s3, self.bucket, key) + + # Redirecting upload to vault in second region + glacier = self.s3_utils.connect('client', 'glacier', self.region2) + vault_name = 'archive-vault-new' + glacier.create_vault(vaultName=vault_name) + print('vault name: ' + str(vault_name)) + print('region name: ' + str(self.region2)) + print('-------file data---------') + print(file_data) + response = self.s3_utils.upload_archive(glacier, vault_name, file_data) + + self.assertTrue(response['archiveId']!=None) + + @mock_s3 + @mock_glacier + def test_s3_to_glacier(self): + """ + Changes the storage class of an object from S3 to Glacier + Requires the configure and credential locations as parameters as well as the key of the object + """ + + print("S3 to Glacier---------") + key = "csv/file1_s3.csv" + + # Create boto3 low level api connection + s3 = self.s3_utils.connect('client', 's3', self.region) + location = {'LocationConstraint': self.region} + s3.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + s3.put_object(Bucket=self.bucket, Key=key, Body="body") + + # Using the S3 util class invoke the change of storage class + response = self.s3_utils.s3_to_glacier(s3, self.bucket, key) + print(response['ResponseMetadata']['HTTPHeaders']['x-amz-storage-class']) + # Assert 'x-amz-storage-class': 'GLACIER' + + self.assertTrue(response['ResponseMetadata']['HTTPHeaders']['x-amz-storage-class'] == "GLACIER") + + @mock_s3 + def test_s3_restore(self): + """ + Uses high level api to restore object from glacier to s3 + """ + + key = "csv/file1_s3.csv" + days = 3 + + # use high level api + s3 = self.s3_utils.connect('resource', 's3' , self.region2) + location = {'LocationConstraint': self.region2} + s3.create_bucket(Bucket=self.bucket, CreateBucketConfiguration=location) + s3.Object(self.bucket, key).put(Bucket=self.bucket, Key=key, Body="body") + + self.assertTrue(self.s3_utils.s3_restore(s3, self.bucket, key, days) != None) + + @mock_glacier + def test_retrieve_inventory(self): + """ + Initiates job for archive retrieval. Takes 3-5 hours to complete if not mocked. 
+ """ + + # Using glacier api initiates job and returns archive results + # Connect to your glacier vault for retrieval + glacier = self.s3_utils.connect('client', 'glacier', self.region2) + vault_name = 'archive-vault-new' + glacier.create_vault(vaultName=vault_name) + + response = self.s3_utils.retrieve_inventory(glacier, vault_name) + print('jobid %s'%response['jobId']) + self.assertTrue(response['jobId'] != None) + + @mock_glacier + @mock_s3 + def test_retrieve_inventory_results(self): + """ + Once the job has been completed, use the job id to retrieve archive results + """ + + # Connect to your glacier vault for retrieval + glacier = mock.Mock(spec=Layer1)#self.s3_utils.connect('client', 'glacier', self.region) + vault_name = 'archive-vault-new' + glacier.create_vault(vaultName=vault_name) + + body_json = {'Body': [{'test':'value'}]} + body_encoded = json.dumps(body_json)#.encode("utf-16") + + body = StreamingBody( + StringIO(str(body_encoded)), + len(str(body_encoded)) + ) + + mocked_response = { + 'body': body + } + glacier.get_job_output.return_value = mocked_response + with mock.patch('boto.glacier.job.tree_hash_from_str') as t: + t.return_value = 'tree_hash' + inventory = self.s3_utils.retrieve_inventory_results(vault_name, glacier, 'ASDF78') + + self.assertEqual(body_json, inventory) + + @mock_s3 + def test_extra_parameters_constructor(self): + testParams = {"access_key": "blah", + "secret_key": "blah", + "log_level": "DEBUG", + "extra": "extra value"} + self.assertRaises(Exception, S3Utils(**testParams)) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/test/unit/util/test_SqsConsumer.py b/onestop-python-client/test/unit/util/test_SqsConsumer.py new file mode 100644 index 0000000..ef50b20 --- /dev/null +++ b/onestop-python-client/test/unit/util/test_SqsConsumer.py @@ -0,0 +1,178 @@ +import unittest +import json + +from moto import mock_sqs +from unittest.mock import MagicMock, ANY +from onestop.util.S3Utils import S3Utils +from onestop.util.SqsConsumer import SqsConsumer + +class SqsConsumerTest(unittest.TestCase): + config_dict = { + 'access_key': 'test_access_key', + 'secret_key': 'test_secret_key', + 's3_region': 'us-east-2', + 's3_bucket': 'archive-testing-demo', + 'sqs_url': 'https://sqs.us-east-2.amazonaws.com/798276211865/cloud-archive-client-sqs', + 'type': 'COLLECTION', + 'file_id_prefix': 'gov.noaa.ncei.csb:', + 'collection_id': 'fdb56230-87f4-49f2-ab83-104cfd073177', + 'registry_base_url': 'http://localhost/onestop/api/registry', + 'registry_username': 'admin', + 'registry_password': 'whoknows', + 'onestop_base_url': 'http://localhost/onestop/api/search/search', + 'log_level': 'DEBUG' + } + + records = [{"eventVersion":"2.1"}] + message = json.dumps( + {"Type": "Notification", + "MessageId": "9d0691d2-ae9c-58f9-a9f4-c8dcf05d87be", + "TopicArn": "arn:aws:sns:us-east-1:798276211865:archive-testing-demo-backup-use-1", + "Subject": "Amazon S3 Notification", + "Message": json.dumps({"Records": records}), + "Timestamp": "2021-05-06T21:15:45.427Z", + "SignatureVersion": "1", + "Signature": "Ui5s4uVgcMr5fjGmePCMgmi14Dx9oS8hIpjXXiQo+xZPgsHkUayz7dEeGmMGGt45l8blmZTZEbxJG+HVGfIUmQGRqoimwiLm+mIAaNIN/BV76FVFcQUIkORX8gYN0a4RS3HU8/ElrKFK8Iz0zpxJdjwxa3xPCDwu+dTotiLTJxSouvg8MmkkDnq758a8vZ9WK2PaOlZiZ3m8Mv2ZvLrozZ/DAAz48HSad6Mymhit82RpGCUxy4SDwXVlP/nLB01AS11Gp2HowJR8NXyStrZYzzQEc+PebITaExyikgTMiVhRHkmb7JrtZPpgZu2daQsSooqpwyIzb6pvgwu9W54jkw==", + "SigningCertURL": 
"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-010a507c1833636cd94bdb98bd93083a.pem", + "UnsubscribeURL": "https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:798276211865:archive-testing-demo-backup-use-1:e7a9a9f5-792e-48a6-9ec8-40f7f5a8f600" + }) + + message_wo_records = json.dumps( + {"Type": "Notification", + "MessageId": "9d0691d2-ae9c-58f9-a9f4-c8dcf05d87be", + "TopicArn": "arn:aws:sns:us-east-1:798276211865:archive-testing-demo-backup-use-1", + "Subject": "Amazon S3 Notification", + "Message": "{}", + "Timestamp": "2021-05-06T21:15:45.427Z", + "SignatureVersion": "1", + "Signature": "Ui5s4uVgcMr5fjGmePCMgmi14Dx9oS8hIpjXXiQo+xZPgsHkUayz7dEeGmMGGt45l8blmZTZEbxJG+HVGfIUmQGRqoimwiLm+mIAaNIN/BV76FVFcQUIkORX8gYN0a4RS3HU8/ElrKFK8Iz0zpxJdjwxa3xPCDwu+dTotiLTJxSouvg8MmkkDnq758a8vZ9WK2PaOlZiZ3m8Mv2ZvLrozZ/DAAz48HSad6Mymhit82RpGCUxy4SDwXVlP/nLB01AS11Gp2HowJR8NXyStrZYzzQEc+PebITaExyikgTMiVhRHkmb7JrtZPpgZu2daQsSooqpwyIzb6pvgwu9W54jkw==", + "SigningCertURL": "https://sns.us-east-1.amazonaws.com/SimpleNotificationService-010a507c1833636cd94bdb98bd93083a.pem", + "UnsubscribeURL": "https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:798276211865:archive-testing-demo-backup-use-1:e7a9a9f5-792e-48a6-9ec8-40f7f5a8f600" + }) + + @mock_sqs + def setUp(self): + print("Set it up!") + + self.s3_utils = S3Utils(**self.config_dict) + self.sqs_consumer = SqsConsumer(**self.config_dict) + + def tearDown(self): + print("Tear it down!") + + @mock_sqs + def test_connect(self): + queue_name = 'test' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.config_dict['s3_region']) + expQueue = sqs_resource.create_queue(QueueName=queue_name) + queue = self.sqs_consumer.connect(sqs_resource, queue_name) + + self.assertEqual(expQueue.url, queue.url) + + # Kind of pointless since we catch every exception this doesn't fail when it should.... 
+ @mock_sqs + def test_receive_messages_no_records(self): + mock_cb = MagicMock() + + # Create the mock queue beforehand and set SqsConsumer's 'sqs_url' to the mock's URL + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.config_dict['s3_region']) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + + # Send a test message lacking Records field + sqs_client = self.s3_utils.connect('client', 'sqs' , self.config_dict['s3_region']) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody= self.message_wo_records + ) + queue = sqs_resource.Queue(queue_name) + + self.sqs_consumer.receive_messages(queue, 1, mock_cb) + + # Verify callback function was called once with expected message attributes + mock_cb.assert_not_called() + + @mock_sqs + def test_receive_messages_fails_invalid_sqs_max_polls(self): + with self.assertRaises(ValueError): + self.sqs_consumer.receive_messages(MagicMock(), 0, MagicMock()) + + @mock_sqs + def test_receive_messages_polls_msgs_expected_times(self): + mock_cb = MagicMock() + queue = MagicMock() + + sqs_max_polls = 2 + self.sqs_consumer.receive_messages(queue, sqs_max_polls, mock_cb) + + # Verify polling called expected times + self.assertEqual(queue.receive_messages.call_count, sqs_max_polls) + + @mock_sqs + def test_receive_messages_callback_occurs(self): + mock_cb = MagicMock() + + # Create the mock queue beforehand and set SqsConsumer's 'sqs_url' to the mock's URL + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.config_dict['s3_region']) + sqs_queue_url = sqs_resource.create_queue(QueueName=queue_name).url + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.config_dict['s3_region']) + sqs_client.send_message( + QueueUrl=sqs_queue_url, + MessageBody= self.message + ) + queue = sqs_resource.Queue(queue_name) + + self.sqs_consumer.receive_messages(queue, 1, mock_cb) + + # Verify callback function was called once with expected message attributes + mock_cb.assert_called_with(self.records, ANY) + + @mock_sqs + def test_happy_path(self): + mock_cb = MagicMock() + + # Create the mock queue beforehand and set SqsConsumer's 'sqs_url' to the mock's URL + queue_name = 'test_queue' + sqs_resource = self.s3_utils.connect('resource', 'sqs', self.config_dict['s3_region']) + queue = self.sqs_consumer.connect(sqs_resource, queue_name) #sqs_resource.create_queue(QueueName=queue_name) + + # Send a test message + sqs_client = self.s3_utils.connect('client', 'sqs' , self.config_dict['s3_region']) + sqs_client.send_message( + QueueUrl=queue.url, + MessageBody= self.message + ) + + self.sqs_consumer.receive_messages(queue, 1, mock_cb) + + # Verify callback function was called once with expected message attributes + mock_cb.assert_called_with(self.records, ANY) + + # An example using external send/receive methods + @unittest.skip + @mock_sqs + def test_write_message_valid(self): + "Test the write_message method with a valid message" + sqs_client = self.s3_utils.connect('client', 'sqs' , self.config_dict['s3_region']) + sqs = self.s3_utils.connect('resource', 'sqs', self.config_dict['s3_region']) + queue = sqs.create_queue(QueueName='test-skype-sender') + self.sqs_consumer.sqs_url = queue.url + skype_message = 'Testing with a valid message' + channel = 'test' + expected_message = str({'msg':f'{skype_message}', 'channel':channel}) + message = str({'msg':f'{skype_message}', 'channel':channel}) + queue.send_message(MessageBody=(message)) + + 
sqs_messages = queue.receive_messages() + print('Message: %s'%sqs_messages) + print('Message0: %s'%sqs_messages[0]) + assert sqs_messages[0].body == expected_message, 'Message in skype-sender does not match expected' + print(f'The message in skype-sender SQS matches what we sent') + assert len(sqs_messages) == 1, 'Expected exactly one message in SQS' + print(f'\nExactly one message in skype-sender SQS') + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/onestop-python-client/tests/utils.py b/onestop-python-client/test/utils.py similarity index 95% rename from onestop-python-client/tests/utils.py rename to onestop-python-client/test/utils.py index 2f1e6d5..9cb7913 100644 --- a/onestop-python-client/tests/utils.py +++ b/onestop-python-client/test/utils.py @@ -15,7 +15,8 @@ def create_delete_message(region, bucket, key): "Message": '''{ "Records": [{ "eventVersion": "2.1", "eventSource": "aws:s3", "awsRegion": "''' + region + '''", - "eventTime": "2020-12-14T20:56:08.725Z", "eventName": "ObjectRemoved:Delete", + "eventTime": "2020-12-14T20:56:08.725Z", + "eventName": "ObjectRemoved:Delete", "userIdentity": {"principalId": "AX8TWPQYA8JEM"}, "requestParameters": {"sourceIPAddress": "65.113.158.185"}, "responseElements": {"x-amz-request-id": "D8059E6A1D53597A", diff --git a/onestop-python-client/tests/KafkaPublisherTest.py b/onestop-python-client/tests/KafkaPublisherTest.py deleted file mode 100644 index 7d992ae..0000000 --- a/onestop-python-client/tests/KafkaPublisherTest.py +++ /dev/null @@ -1,25 +0,0 @@ -import unittest - -import json - -from onestop.KafkaPublisher import KafkaPublisher - -class KafkaPublisherTest(unittest.TestCase): - kp = None - - def setUp(self): - print("Set it up!") - self.kp = KafkaPublisher("../config/kafka-publisher-config-dev.yml") - - def tearDown(self): - print("Tear it down!") - - def test_parse_config(self): - self.assertFalse(self.kp.conf['brokers']==None) - - def test_publish_collection(self): - print("Publish collection") - # Integration test TBD - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/onestop-python-client/tests/SqsHandlersTest.py b/onestop-python-client/tests/SqsHandlersTest.py deleted file mode 100644 index 12323ef..0000000 --- a/onestop-python-client/tests/SqsHandlersTest.py +++ /dev/null @@ -1,89 +0,0 @@ -import json -import unittest -import boto3 - -from moto import mock_s3 -from moto import mock_sqs -from tests.utils import abspath_from_relative, create_delete_message -from onestop.WebPublisher import WebPublisher -from onestop.util.S3Utils import S3Utils -from onestop.util.S3MessageAdapter import S3MessageAdapter -from onestop.util.SqsConsumer import SqsConsumer -from onestop.util.SqsHandlers import create_delete_handler - - -class SqsHandlerTest(unittest.TestCase): - wp = None - su = None - s3ma = None - sqs = None - wp_config = abspath_from_relative(__file__, "../config/web-publisher-config-local.yml") - aws_config = abspath_from_relative(__file__, "../config/aws-util-config-dev.yml") - cred_config = abspath_from_relative(__file__, "../config/credentials-template.yml") - csb_config = abspath_from_relative(__file__, "../config/csb-data-stream-config.yml") - - collection_uuid = '5b58de08-afef-49fb-99a1-9c5d5c003bde' - payloadDict = { - "fileInformation": { - "name": "OR_ABI-L1b-RadF-M6C13_G16_s20192981730367_e20192981740087_c20192981740157.nc", - "size": 30551050, - "checksums": [{ - "algorithm": "SHA1", - "value": "bf4c5b58f8d5f9445f7b277f988e5861184f775a" - }], - 
"format": "NetCDF" - }, - "relationships": [{ - "type": "COLLECTION", - "id": collection_uuid - }], - "fileLocations": { - "s3://noaa-goes16/ABI-L1b-RadF/2019/298/17/OR_ABI-L1b-RadF-M6C13_G16_s20192981730367_e20192981740087_c20192981740157.nc": { - "uri": "s3://noaa-goes16/ABI-L1b-RadF/2019/298/17/OR_ABI-L1b-RadF-M6C13_G16_s20192981730367_e20192981740087_c20192981740157.nc", - "type": "ACCESS", - "deleted": "false", - "restricted": "false", - "asynchronous": "false", - "locality": "us-east-2", - "lastModified": 1572025823000, - "serviceType": "Amazon:AWS:S3", - "optionalAttributes": {} - } - } - } - - def setUp(self): - print("Set it up!") - self.wp = WebPublisher(self.wp_config, self.cred_config) - self.su = S3Utils(self.aws_config, self.cred_config) - self.s3ma = S3MessageAdapter(self.csb_config, self.su) - - def tearDown(self): - print("Tear it down!") - - @mock_s3 - @mock_sqs - def init_s3(self): - bucket = self.su.conf['s3_bucket'] - key = self.su.conf['s3_key'] - boto_client = self.su.connect("s3", None) - boto_client.create_bucket(Bucket=bucket) - boto_client.put_object(Bucket=bucket, Key=key, Body="foobar") - - sqs_client = boto3.client('sqs', region_name=self.su.conf['s3_region']) - sqs_queue = sqs_client.create_queue(QueueName=self.su.conf['sqs_name']) - self.sqs = SqsConsumer(self.aws_config, self.cred_config) - message = create_delete_message(self.su.conf['s3_region'], bucket, key) - sqs_client.send_message(QueueUrl=sqs_queue['QueueUrl'], MessageBody=json.dumps(message)) - return sqs_queue['QueueUrl'] - - def delete_handler_wrapper(self, recs): - handler = create_delete_handler(self.wp) - result = handler(recs) - self.assertTrue(result) - - @mock_sqs - def test_delete_handler(self): - mock_queue_url = self.init_s3() - sqs_queue = boto3.resource('sqs', region_name=self.su.conf['s3_region']).Queue(mock_queue_url) - self.sqs.receive_messages(sqs_queue, self.su.conf['sqs_max_polls'], self.delete_handler_wrapper) diff --git a/onestop-python-client/tests/extractor/CsbExtractorTest.py b/onestop-python-client/tests/extractor/CsbExtractorTest.py deleted file mode 100644 index 7dbbc9e..0000000 --- a/onestop-python-client/tests/extractor/CsbExtractorTest.py +++ /dev/null @@ -1,95 +0,0 @@ -import unittest -from onestop.extract.CsbExtractor import CsbExtractor -from onestop.util.S3Utils import S3Utils -from tests.utils import abspath_from_relative - - -class CsbExtractorTest(unittest.TestCase): - - # def setUp(self): - # print("Set it up!") - # file_name = '../data/file4.csv' - # self.csb_extractor = CsbExtractor(file_name) - - def setUp(self): - print("Set it up!") - key = "public/NESDIS/CSB/file4.csv" - self.su = S3Utils( abspath_from_relative( __file__, "../../config/aws-util-config-dev.yml" ), - abspath_from_relative(__file__, "../../config/credentials.yml") ) - self.csb_extractor = CsbExtractor(self.su, key) - - def tearDown(self): - print("Tear it down!") - - def test_is_csv(self): - csv_str = '.csv' - self.assertTrue(self.csb_extractor.is_csv(self.csb_extractor.file_name)) - - - def test_get_geospatial_temporal_bounds(self): - bounds_dict = self.csb_extractor.get_spatial_temporal_bounds('LON', 'LAT', 'TIME') - coords = bounds_dict["geospatial"] - print(str(coords)) - self.assertEqual(coords[0], -96.847995) - self.assertEqual(coords[1], 29.373065) - self.assertEqual(coords[2], -92.747995) - self.assertEqual(coords[3], 33.373065) - - date_rng = bounds_dict["temporal"] - self.assertEqual(date_rng[0], '2018-04-10T14:00:06.000Z' ) - self.assertEqual(date_rng[1], '2020-04-10T14:00:06.000Z' 
) - - - def test_get_min_lon(self): - bounds_dict = self.csb_extractor.get_spatial_temporal_bounds('LON', 'LAT', 'TIME') - coords = bounds_dict["geospatial"] - min_lon = coords[0] - self.assertEqual(min_lon, -96.847995) - - - def test_get_max_datetime(self): - bounds_dict = self.csb_extractor.get_spatial_temporal_bounds('LON', 'LAT', 'TIME') - date_rng = bounds_dict["temporal"] - end_date = date_rng[1] - self.assertEqual(end_date, '2020-04-10T14:00:06.000Z') - - - def test_get_min_datetime(self): - bounds_dict = self.csb_extractor.get_spatial_temporal_bounds('LON', 'LAT', 'TIME') - date_rng = bounds_dict["temporal"] - begin_date = date_rng[0] - self.assertEqual(begin_date, '2018-04-10T14:00:06.000Z') - - - def test_extract_coords(self): - bounds_dict = self.csb_extractor.get_spatial_temporal_bounds('LON', 'LAT', 'TIME') - coords = bounds_dict["geospatial"] - - min_lon = coords[0] - min_lat = coords[1] - max_lon = coords[2] - max_lat = coords[3] - - coords = self.csb_extractor.extract_coords(max_lon, max_lat, min_lon, min_lat) - result = [[ - -94.847995, - 29.373065 - ], - [ - -96.847995, - 29.373065 - ], - [ - -94.847995, - 33.373065 - ], - [ - -92.747995, - 29.383065 - ] - ] - self.assertEqual(coords, result) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/onestop-python-client/tests/util/IntegrationTest.py b/onestop-python-client/tests/util/IntegrationTest.py deleted file mode 100644 index 381e4d7..0000000 --- a/onestop-python-client/tests/util/IntegrationTest.py +++ /dev/null @@ -1 +0,0 @@ -#TBD \ No newline at end of file diff --git a/onestop-python-client/tests/util/S3UtilsTest.py b/onestop-python-client/tests/util/S3UtilsTest.py deleted file mode 100644 index 34850ad..0000000 --- a/onestop-python-client/tests/util/S3UtilsTest.py +++ /dev/null @@ -1,209 +0,0 @@ -import csv -import unittest -import uuid -from moto import mock_s3 -from moto import mock_glacier - -from tests.utils import abspath_from_relative -from onestop.util.S3Utils import S3Utils - -class S3UtilsTest(unittest.TestCase): - su = None - - def setUp(self): - print("Set it up!") - self.su = S3Utils(abspath_from_relative(__file__, "../../config/aws-util-config-dev.yml"), - abspath_from_relative(__file__, "../../config/credentials.yml")) - - def tearDown(self): - print("Tear it down!") - # Remove files from bucket - - def test_parse_config(self): - self.assertFalse(self.su.conf['sqs_url']==None) - - @mock_s3 - def test_get_uuid_metadata(self): - boto_client = self.su.connect("s3_resource", None) - s3_key = "csv/file1.csv" - bucket = self.su.conf['s3_bucket'] - region = self.su.conf['s3_region'] - location = {'LocationConstraint': region} - boto_client.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - obj_uuid = str(uuid.uuid4()) - boto_client.Object(bucket, s3_key).put(Bucket=bucket, Key=s3_key, Body="my_body", Metadata={'object-uuid': obj_uuid}) - - self.assertFalse(self.su.get_uuid_metadata(boto_client, bucket, s3_key) == None) - - @mock_s3 - def test_add_uuid_metadata(self): - region = self.su.conf['s3_region'] - boto_client = self.su.connect("s3_resource", region) - - s3_key = "csv/file1.csv" - bucket = self.su.conf['s3_bucket'] - - location = {'LocationConstraint': region} - boto_client.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - boto_client.Object(bucket, s3_key).put(Bucket=bucket, Key=s3_key, Body="my_body") - - self.assertTrue(self.su.add_uuid_metadata(boto_client, bucket, s3_key)) - - @mock_s3 - def test_add_file_s3(self): - 
boto_client = self.su.connect("s3", None) - local_file = abspath_from_relative(__file__, "../data/file4.csv") - s3_key = "csv/file4.csv" - bucket = self.su.conf['s3_bucket'] - region = self.su.conf['s3_region'] - location = {'LocationConstraint': region} - boto_client.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - overwrite = True - - self.assertTrue(self.su.upload_s3(boto_client, local_file, bucket, s3_key, overwrite)) - - def test_get_csv_s3(self): - boto_client = self.su.connect("session", None) - s3_key = "csv/file1.csv" - bucket = self.su.conf['s3_bucket'] - sm_open_file = self.su.get_csv_s3(boto_client, bucket, s3_key) - - # print("reading csv:" + line.decode('utf-8')) - csv_reader = csv.DictReader(sm_open_file) - for row in csv_reader: - print(str(row["LON"])) - - def test_read_bytes_s3(self): - boto_client = self.su.connect("s3", None) - s3_key = "csv/file1.csv" - bucket = self.su.conf['s3_bucket'] - self.assertTrue(self.su.read_bytes_s3(boto_client, bucket, s3_key)) - - @mock_s3 - def test_add_files(self): - boto_client = self.su.connect("s3", None) - local_files = ["file1_s3.csv", "file2.csv", "file3.csv"] - bucket = self.su.conf['s3_bucket'] - region = self.su.conf['s3_region'] - location = {'LocationConstraint': region} - boto_client.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - overwrite = True - s3_file = None - for file in local_files: - local_file = abspath_from_relative(__file__, "../data/" + file) - s3_file = "csv/" + file - self.assertTrue(self.su.upload_s3(boto_client, local_file, bucket, s3_file, overwrite)) - - @mock_s3 - @mock_glacier - def test_s3_cross_region(self): - print('Cross Region Vault Upload ------------- ') - key = "csv/file1.csv" - # grabs te region and bucket name from the config file - region = self.su.conf['s3_region'] - bucket = self.su.conf['s3_bucket'] - - # makes connection to low level s3 client - s3 = self.su.connect('s3', region) - location = {'LocationConstraint': region} - s3.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - s3.put_object(Bucket=bucket, Key=key, Body="body") - - # Reads object data and stores it into a variable - file_data = self.su.read_bytes_s3(s3, bucket, key) - - # Redirecting upload to vault in second region - glacier = self.su.connect("glacier", self.su.conf['s3_region2']) - vault_name = self.su.conf['vault_name'] - glacier.create_vault(vaultName=vault_name) - print('vault name: ' + str(vault_name)) - print('region name: ' + str(self.su.conf['s3_region2'])) - print('-------file data---------') - print(file_data) - response = self.su.upload_archive(glacier, vault_name, file_data) - - self.assertTrue(response['archiveId']!=None) - - @mock_s3 - @mock_glacier - def test_s3_to_glacier(self): - """ - Changes the storage class of an object from S3 to Glacier - Requires the configure and credential locations as parameters as well as the key of the object - """ - - print("S3 to Glacier---------") - key = "csv/file1_s3.csv" - # grabs te region and bucket name from the config file - region = self.su.conf['s3_region'] - bucket = self.su.conf['s3_bucket'] - - # Create boto3 low level api connection - s3 = self.su.connect('s3', region) - location = {'LocationConstraint': region} - s3.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - s3.put_object(Bucket=bucket, Key=key, Body="body") - - # Using the S3 util class invoke the change of storage class - response = self.su.s3_to_glacier(s3, bucket, key) - 
print(response['ResponseMetadata']['HTTPHeaders']['x-amz-storage-class']) - # Assert 'x-amz-storage-class': 'GLACIER' - - self.assertTrue(response['ResponseMetadata']['HTTPHeaders']['x-amz-storage-class'] == "GLACIER") - - @mock_s3 - def test_s3_restore(self): - """ - Uses high level api to restore object from glacier to s3 - """ - - region = self.su.conf['s3_region2'] - bucket = self.su.conf['s3_bucket'] - key = "csv/file1_s3.csv" - days = 3 - - # use high level api - s3 = self.su.connect('s3_resource', region) - location = {'LocationConstraint': region} - s3.create_bucket(Bucket=bucket, CreateBucketConfiguration=location) - s3.Object(bucket, key).put(Bucket=bucket, Key=key, Body="body") - - self.assertTrue(self.su.s3_restore(s3, bucket, key, days) != None) - - @mock_glacier - def test_retrieve_inventory(self): - """ - Initiates job for archive retrieval. Takes 3-5 hours to complete - """ - - # Using glacier api initiates job and returns archive results - # Connect to your glacier vault for retrieval - glacier = self.su.connect("glacier", self.su.conf['s3_region2']) - vault_name = self.su.conf['vault_name'] - glacier.create_vault(vaultName=vault_name) - - - response = self.su.retrieve_inventory(glacier, vault_name) - self.assertTrue(response['jobId']!= None) - - ''' - Excluding for now because it's an asynchronous test - def test_retrieve_inventory_results(self, jobid): - """ - Once the job has been completed, use the job id to retrieve archive results - """ - - # Connect to your glacier vault for retrieval - glacier = self.su.connect("glacier", self.su.conf['region']) - vault_name = self.su.conf['vault_name'] - - # Retrieve the job results - inventory = self.su.retrieve_inventory_results(vault_name, glacier, jobid) - - self.assertTrue(inventory != None) - ''' - - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/onestop-python-client/tests/util/SqsConsumerTest.py b/onestop-python-client/tests/util/SqsConsumerTest.py deleted file mode 100644 index 4d6be77..0000000 --- a/onestop-python-client/tests/util/SqsConsumerTest.py +++ /dev/null @@ -1,34 +0,0 @@ -import unittest -import boto3 -from moto import mock_sqs -from tests.utils import abspath_from_relative -from onestop.util.SqsConsumer import SqsConsumer - -class SqsConsumerTest(unittest.TestCase): - sc = None - - def setUp(self): - print("Set it up!") - self.sc = SqsConsumer(abspath_from_relative(__file__, "../../config/aws-util-config-dev.yml"), - abspath_from_relative(__file__, "../../config/credentials-template.yml")) - - def tearDown(self): - print("Tear it down!") - - def test_parse_config(self): - self.assertFalse(self.sc.conf['sqs_url']==None) - - @mock_sqs - def test_poll_messages(self): - # Create the mock queue beforehand and set its mock URL as the 'sqs_url' config value for SqsConsumer - boto_session = boto3.Session(aws_access_key_id=self.sc.cred['sandbox']['access_key'], - aws_secret_access_key=self.sc.cred['sandbox']['secret_key']) - sqs_session = boto_session.resource('sqs', region_name=self.sc.conf['s3_region']) - res = sqs_session.create_queue(QueueName="test_queue") - self.sc.conf['sqs_url'] = res.url - queue = self.sc.connect() - self.sc.receive_messages(queue, self.sc.conf['sqs_max_polls'], lambda *args, **kwargs: None) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/scripts/config/aws-util-config-dev.yml b/scripts/config/aws-util-config-dev.yml index e054f49..9102be0 100644 --- a/scripts/config/aws-util-config-dev.yml +++ 
b/scripts/config/aws-util-config-dev.yml @@ -1,5 +1,4 @@ # Example config values for osim client -log_level: INFO # AWS config values sqs_url: https://sqs.us-east-2.amazonaws.com/798276211865/cloud-archive-client-sqs diff --git a/scripts/config/aws-util-config-test.yml b/scripts/config/aws-util-config-test.yml index 6aac07a..9de4618 100644 --- a/scripts/config/aws-util-config-test.yml +++ b/scripts/config/aws-util-config-test.yml @@ -1,5 +1,4 @@ # Example config values for osim client -log_level: DEBUG # AWS config values sqs_url: 'test-queue' diff --git a/scripts/config/csb-data-stream-config.yml b/scripts/config/csb-data-stream-config.yml index 1556ab9..2d25328 100644 --- a/scripts/config/csb-data-stream-config.yml +++ b/scripts/config/csb-data-stream-config.yml @@ -1,12 +1,11 @@ -log_level: INFO format: csv headers: UNIQUE_ID,FILE_UUID,LON,LAT,DEPTH,TIME,PLATFORM_NAME,PROVIDER type: COLLECTION collection_id: fdb56230-87f4-49f2-ab83-104cfd073177 -psi_registry_url: https://internal-a683c98a66fb011eaa4230e0d5e5657f-369075387.us-east-1.elb.amazonaws.com +registry_base_url: https://internal-a683c98a66fb011eaa4230e0d5e5657f-369075387.us-east-1.elb.amazonaws.com access_bucket: https://archive-testing-demo.s3-us-east-2.amazonaws.com #access_bucket: https://odp-noaa-nesdis-ncei-test.s3-us-west-2.amazonaws.com -file_identifier_prefix: "gov.noaa.ncei.csb:" +file_id_prefix: "gov.noaa.ncei.csb:" prefixMap: NESDIS/CSB: 'fdb56230-87f4-49f2-ab83-104cfd073177' diff --git a/scripts/config/kafka-publisher-config-dev.yml b/scripts/config/kafka-publisher-config-dev.yml index 85a66f3..8a94bf3 100644 --- a/scripts/config/kafka-publisher-config-dev.yml +++ b/scripts/config/kafka-publisher-config-dev.yml @@ -1,5 +1,4 @@ # Example config values for osim client -log_level: DEBUG # COLLECTION or GRANULE metadata_type: GRANULE @@ -7,8 +6,8 @@ metadata_type: GRANULE # Kafka config values brokers: onestop-dev-cp-kafka:9092 schema_registry: http://onestop-dev-cp-schema-registry:8081 -collection_topic_produce: psi-granules-by-collection -granule_topic_produce: psi-granule-parsed +collection_topic_publish: psi-granules-by-collection +granule_topic_publish: psi-granule-parsed collection_topic_consume: psi-collection-input-unknown granule_topic_consume: psi-granule-input-unknown group_id: sme-test diff --git a/scripts/config/web-publisher-config-dev.yml b/scripts/config/web-publisher-config-dev.yml index 9b08391..387d252 100644 --- a/scripts/config/web-publisher-config-dev.yml +++ b/scripts/config/web-publisher-config-dev.yml @@ -1,5 +1,4 @@ # Example config values for osim client -log_level: INFO # COLLECTION or GRANULE metadata_type: granule diff --git a/scripts/config/web-publisher-config-local.yml b/scripts/config/web-publisher-config-local.yml index 32db955..3ce7d88 100644 --- a/scripts/config/web-publisher-config-local.yml +++ b/scripts/config/web-publisher-config-local.yml @@ -1,5 +1,4 @@ # Example config values for osim client -log_level: INFO # COLLECTION or GRANULE metadata_type: granule diff --git a/scripts/launch_e2e.py b/scripts/launch_e2e.py index 2d5b79b..6d60b2c 100644 --- a/scripts/launch_e2e.py +++ b/scripts/launch_e2e.py @@ -1,6 +1,8 @@ import argparse import json import os +import yaml + from onestop.util.SqsConsumer import SqsConsumer from onestop.util.S3Utils import S3Utils from onestop.util.S3MessageAdapter import S3MessageAdapter @@ -55,8 +57,8 @@ def handler(recs): # Upload to archive file_data = s3_utils.read_bytes_s3(s3_client, bucket, s3_key) - glacier = s3_utils.connect("glacier", 
s3_utils.conf['s3_region']) - vault_name = s3_utils.conf['vault_name'] + glacier = s3_utils.connect("glacier", cloud_conf['s3_region']) + vault_name = cloud_conf['vault_name'] resp_dict = s3_utils.upload_archive(glacier, vault_name, file_data) @@ -106,9 +108,9 @@ def handler(recs): # High-level api s3_resource = s3_utils.connect("s3_resource", None) - bucket = s3_utils.conf['s3_bucket'] + bucket = cloud_conf['s3_bucket'] overwrite = True - sqs_max_polls = s3_utils.conf['sqs_max_polls'] + sqs_max_polls = cloud_conf['sqs_max_polls'] # Add 3 files to bucket local_files = ["file1.csv", "file4.csv"] s3_file = None @@ -141,18 +143,35 @@ def handler(recs): # Get configuration file path locations conf_loc = args.pop('conf') cred_loc = args.pop('cred') + stream_conf_loc = args.pop('cred') - # Upload a test file to s3 bucket - s3_utils = S3Utils(conf_loc, cred_loc) + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), cred_loc))) as f: + cred = yaml.load(f, Loader=yaml.FullLoader) + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), conf_loc))) as f: + cloud_conf = yaml.load(f, Loader=yaml.FullLoader) + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), stream_conf_loc))) as f: + stream_conf = yaml.load(f, Loader=yaml.FullLoader) - # Low-level api ? Can we just use high level revisit me! - s3_client = s3_utils.connect("s3", None) + s3_utils = S3Utils(cred['sandbox']['access_key'], + cred['sandbox']['secret_key'], + "DEBUG") - bucket = s3_utils.conf['s3_bucket'] + bucket = cloud_conf['s3_bucket'] + sqs_max_polls = cloud_conf['sqs_max_polls'] - sqs_max_polls = s3_utils.conf['sqs_max_polls'] + #Source + access_bucket = stream_conf['access_bucket'] - # Add 3 files to bucket + #Onestop related + file_id_prefix = stream_conf['file_identifier_prefix'] + file_format = stream_conf['format'] + headers = stream_conf['headers'] + type = stream_conf['type'] + + # Low-level api ? Can we just use high level revisit me! 
+ s3_client = s3_utils.connect("s3", None) + + # Upload test files to s3 bucket local_files = ["file1.csv", "file4.csv"] s3_file = None for file in local_files: @@ -162,9 +181,11 @@ def handler(recs): if not s3_utils.upload_s3(s3_client, local_file, bucket, s3_file, True): exit("Error setting up for e2e: The test files were not uploaded to the s3 bucket therefore the tests cannot continue.") + + # Receive s3 message and MVM from SQS queue sqs_consumer = SqsConsumer(conf_loc, cred_loc) - s3ma = S3MessageAdapter("config/csb-data-stream-config.yml", s3_utils) + s3ma = S3MessageAdapter(access_bucket, headers, type, file_id_prefix, "DEBUG") wp = WebPublisher("config/web-publisher-config-dev.yml", cred_loc) queue = sqs_consumer.connect() diff --git a/scripts/launch_pyconsumer.py b/scripts/launch_pyconsumer.py index f9dbcf6..7850f38 100644 --- a/scripts/launch_pyconsumer.py +++ b/scripts/launch_pyconsumer.py @@ -1,4 +1,6 @@ import os +import yaml + from onestop.util.SqsConsumer import SqsConsumer from onestop.util.S3Utils import S3Utils from onestop.util.S3MessageAdapter import S3MessageAdapter @@ -49,6 +51,10 @@ def handler(recs): if __name__ == '__main__': conf_loc = "/etc/config/config.yml" cred_loc = "creds.yml" + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "creds.yml"))) as f: + cred = yaml.load(f, Loader=yaml.FullLoader) + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "/etc/config/config.yml"))) as f: + conf = yaml.load(f, Loader=yaml.FullLoader) registry_user = os.environ.get("REGISTRY_USERNAME") registry_pwd = os.environ.get("REGISTRY_PASSWORD") @@ -71,8 +77,10 @@ def handler(recs): r = open(cred_loc, "r") # # Receive s3 message and MVM from SQS queue - s3_utils = S3Utils(conf_loc, cred_loc) - sqs_max_polls = s3_utils.conf['sqs_max_polls'] + s3_utils = S3Utils(cred['sandbox']['access_key'], + cred['sandbox']['secret_key'], + "DEBUG") + sqs_max_polls = conf['sqs_max_polls'] sqs_consumer = SqsConsumer(conf_loc, cred_loc) queue = sqs_consumer.connect() diff --git a/scripts/sme/smeFunc.py b/scripts/sme/smeFunc.py index 2e11d51..084e15b 100644 --- a/scripts/sme/smeFunc.py +++ b/scripts/sme/smeFunc.py @@ -27,7 +27,7 @@ def handler(key,value): if __name__ == '__main__': kafka_consumer = KafkaConsumer("scripts/config/kafka-publisher-config-dev.yml") - kafka_consumer.granule_topic = 'psi-granule-parsed' + kafka_consumer.granule_topic_consume = 'psi-granule-parsed' metadata_consumer = kafka_consumer.connect() kafka_consumer.consume(metadata_consumer, lambda k, v: handler(k, v)) """ diff --git a/serverless/conf.py b/serverless/conf.py index b41eb0b..26ef3cd 100644 --- a/serverless/conf.py +++ b/serverless/conf.py @@ -3,6 +3,6 @@ HEADERS = 'UNIQUE_ID,FILE_UUID,LON,LAT,DEPTH,TIME,PLATFORM_NAME,PROVIDER' TYPE = 'COLLECTION' COLLECTION_ID = 'fdb56230-87f4-49f2-ab83-104cfd073177' -PSI_REGISTRY_URL = 'http://internal-a683c98a66fb011eaa4230e0d5e5657f-369075387.us-east-1.elb.amazonaws.com' +REGISTRY_BASE_URL = 'http://internal-a683c98a66fb011eaa4230e0d5e5657f-369075387.us-east-1.elb.amazonaws.com' ACCESS_BUCKET = 'https://odp-noaa-nesdis-ncei-test.s3-us-west-2.amazonaws.com' FILE_IDENTIFIER_PREFIX = 'gov.noaa.ncei.csb:' diff --git a/serverless/lambda_function.py b/serverless/lambda_function.py index abe8fb7..3b6cd97 100644 --- a/serverless/lambda_function.py +++ b/serverless/lambda_function.py @@ -9,7 +9,7 @@ def lambda_handler(event, context): - registry_url = conf.PSI_REGISTRY_URL + "/metadata/granule" + registry_url = conf.REGISTRY_BASE_URL + "/metadata/granule" 
for rec in event['Records']: