Commit b3c0f31

Merge pull request #14 from ibm-watson-data-lab/PythonCOSSupport
Python Cloud Object Storage Support
2 parents 52ad8e6 + 6329427 commit b3c0f31

File tree

3 files changed: +111 −30 lines changed

- python/README.md
- python/ibmos2spark/__init__.py
- python/ibmos2spark/osconfig.py

python/README.md

Lines changed: 28 additions & 12 deletions
````diff
@@ -1,16 +1,16 @@
 # ibmos2spark
 
-The package sets Spark Hadoop configurations for connecting to 
+The package sets Spark Hadoop configurations for connecting to
 IBM Bluemix Object Storage and Softlayer Account Object Storage instances. This package uses the new [stocator](https://github.com/SparkTC/stocator) driver, which implements the `swift2d` protocol, and is available
-on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience). 
+on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience).
 
 
-Using the `stocator` driver connects your Spark executor nodes directly 
+Using the `stocator` driver connects your Spark executor nodes directly
 to your data in object storage.
 This is an optimized, high-performance method to connect Spark to your data. All IBM Apache Spark kernels
-are instantiated with the `stocator` driver in the Spark kernel's classpath. 
-You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator) 
-and adding it to your local Apache Spark kernel's classpath. 
+are instantiated with the `stocator` driver in the Spark kernel's classpath.
+You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator)
+and adding it to your local Apache Spark kernel's classpath.
 
 ## Installation
 
````
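The hunk above mentions running locally by putting the stocator jar on your Spark classpath. A minimal sketch of one way to do that from PySpark, assuming you have already built or downloaded the stocator jar (the path below is a hypothetical placeholder):

```python
# A minimal sketch of the "run locally" note above; the jar path is a
# hypothetical placeholder -- build or download stocator first.
from pyspark import SparkConf, SparkContext

conf = SparkConf() \
    .setAppName("ibmos2spark-local") \
    .set("spark.jars", "/path/to/stocator-jar-with-dependencies.jar")
sc = SparkContext(conf=conf)  # the ibmos2spark classes can then register
                              # their swift2d/s3d configs on this context
```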

````diff
@@ -21,22 +21,38 @@ pip install --user --upgrade ibmos2spark
 ## Usage
 
 The usage of this package depends on *from where* your Object Storage instance was created. This package
-is intended to connect to IBM's Object Storage instances obtained from Bluemix or Data Science Experience
-(DSX) or from a separate account on IBM Softlayer. The instructions below show how to connect to
-either type of instance.
+is intended to connect to IBM's Object Storage instances (Swift OS). This OS can be obtained from Bluemix or Data Science Experience (DSX) or from a separate account on IBM Softlayer. The package also supports IBM Cloud Object Storage (COS).
+The instructions below show how to connect to either type of instance.
 
 The connection setup is essentially the same. But the difference for you is how you deliver the
 credentials. If your Object Storage was created with Bluemix/DSX, with a few clicks on the side-tab
 within a DSX Jupyter notebook, you can obtain your account credentials in the form of a Python dictionary.
 If your Object Storage was created with a Softlayer account, each part of the credentials will
-be found as text that you can copy and paste into the example code below. 
+be found as text that you can copy and paste into the example code below.
+
+### CloudObjectStorage / Data Science Experience
+```python
+import ibmos2spark
+
+credentials = {
+    'endpoint': 'https://s3-api.objectstorage.softlayer.net/',  # just an example; your url might be different
+    'access_key': '',
+    'secret_key': ''
+}
+
+cos = ibmos2spark.CloudObjectStorage(sc, credentials)  # sc is the SparkContext instance
+
+bucket_name = 'some_bucket_name'
+object_name = 'file1'
+data = sc.textFile(cos.url(object_name, bucket_name))
+```
 
 ### Bluemix / Data Science Experience
 
 ```python
 import ibmos2spark
 
-#To obtain these credentials in IBM Spark, click the "insert to code" 
+#To obtain these credentials in IBM Spark, click the "insert to code"
 #button below your data source found on the panel to the right of your notebook.
 
 credentials = {
````
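The new COS example above loads the object into an RDD with `sc.textFile`. Under the same assumptions (a `cos` instance plus the example bucket and object names), a Spark 2.x DataFrame read through the same `s3d` URL is a one-liner; a hedged sketch, assuming `spark` is an existing SparkSession:

```python
# A hedged follow-on to the COS example above; assumes `spark` is an existing
# SparkSession (Spark 2.x) and that `cos`, bucket_name and object_name are
# defined exactly as in the README snippet.
df = spark.read.text(cos.url(object_name, bucket_name))
df.show(5)  # peek at the first few lines of the object
```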
````diff
@@ -78,7 +94,7 @@ data = sc.textFile(slos.url(container_name, object_name))
 ```
 
 
-## License 
+## License
 
 Copyright 2016 IBM Cloud Data Services
 
````

python/ibmos2spark/__init__.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -16,4 +16,4 @@
 Helper to connect to Softlayer and Bluemix ObjectStore from IBM Spark Service
 """
 from .__info__ import __version__
-from .osconfig import softlayer, bluemix
+from .osconfig import softlayer, bluemix, CloudObjectStorage
````
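With this one-line change the new class becomes importable from the package root. A quick smoke test, assuming the package is installed:

```python
# Quick smoke test of the new top-level export added by this commit.
import ibmos2spark
from ibmos2spark import softlayer, bluemix, CloudObjectStorage

print(ibmos2spark.__version__)  # the version re-exported from .__info__
```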

python/ibmos2spark/osconfig.py

Lines changed: 82 additions & 17 deletions
````diff
@@ -16,7 +16,7 @@
 and generate the swifturl.
 
 """
-
+
 import warnings
 
 def swifturl2d(name, container_name, object_name):
@@ -36,7 +36,7 @@ def __init__(self, sparkcontext, name, auth_url, tenant, username, password=None
         auth_url, tenant, username and password are string credentials for your
         Softlayer Object Store
 
-        Example: 
+        Example:
 
         slos = softlayer(sc, 'mySLOS', 'https://dal05.objectstorage.softlayer.net/auth/v1.0',
                          'IBMOS278685-10','[email protected]', 'password_234234ada')
@@ -49,19 +49,19 @@ def __init__(self, sparkcontext, name, auth_url, tenant, username, password=None
         this class should have failed when attempting to access data with swift.
 
         As of the version 0.0.7 update, support for the old protocol has been removed in
-        favor of the new swift2d/stocator protocol. 
+        favor of the new swift2d/stocator protocol.
 
-        Subsequently, the __init__ for this class has been changed! 
+        Subsequently, the __init__ for this class has been changed!
 
-        However, to support older code that may have been unused since this transition, 
+        However, to support older code that may have been unused since this transition,
         this __init__ function will check the arguments and attempt to determine
         the proper credentials. Specifically, if the <password> is None, then
-        the <tenant> argument will be interpreted as <tenant>:<username> and the 
+        the <tenant> argument will be interpreted as <tenant>:<username> and the
         <username> argument will be interpreted as the <password> value. This is because
-        the <username> for Softlayer keystone 1 authentication is equivalent to <tenant>:<username>. 
+        the <username> for Softlayer keystone 1 authentication is equivalent to <tenant>:<username>.
         For example, typical usernames look like 'IBMOS278685-10:<email>', as shown here
-        http://knowledgelayer.softlayer.com/procedure/how-do-i-access-object-storage-command-line. 
-
+        http://knowledgelayer.softlayer.com/procedure/how-do-i-access-object-storage-command-line.
+
 
         Therefore, this class will attempt to extract tenant, username and password from
         uses such as
@@ -75,22 +75,22 @@ def __init__(self, sparkcontext, name, auth_url, tenant, username, password=None
         '''
         if password is None:
             msg = '''
-            password was set to None! 
+            password was set to None!
             Attempting to interpret tenant = tenant:username and username=password.
             This is an attempt to support older code that may have missed the transition or
             errors using the old swift protocol connection to Softlayer Object Storage accounts.
             If you are seeing this warning, you should separate your tenant and username values,
-            as this support will be deprecated in the near future. 
+            as this support will be deprecated in the near future.
             '''
             warnings.warn(msg, UserWarning)
             password = username
             tenant, username = tenant.split(':')
             warnings.warn('Trying tenant {}, username {} and password {}'.format(tenant, username, password), UserWarning)
-
+
 
         self.name = name
 
-        prefix = "fs.swift2d.service." + name 
+        prefix = "fs.swift2d.service." + name
         hconf = sparkcontext._jsc.hadoopConfiguration()
         hconf.set("fs.swift2d.impl", swift2d_driver)
         hconf.set(prefix + ".auth.url", auth_url)
@@ -100,7 +100,7 @@ def __init__(self, sparkcontext, name, auth_url, tenant, username, password=None
         hconf.set(prefix + ".auth.method", "swiftauth")
         hconf.setInt(prefix + ".http.port", 8080)
         hconf.set(prefix + ".apikey", password)
-        hconf.setBoolean(prefix + ".public", public) 
+        hconf.setBoolean(prefix + ".public", public)
         hconf.set(prefix + ".use.get.auth", "true")
         hconf.setBoolean(prefix + ".location-aware", False)
         hconf.set(prefix + ".password", password)
````
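The fallback described in the docstring above means both of the following calls end up with the same credentials, with the second form emitting the deprecation UserWarning. A hedged sketch, with placeholder credential values and `sc` an existing SparkContext:

```python
# A hedged sketch of the legacy-credentials fallback described above;
# all credential values are placeholders.
import ibmos2spark

auth_url = 'https://dal05.objectstorage.softlayer.net/auth/v1.0'

# new-style call: tenant and username passed separately
slos = ibmos2spark.softlayer(sc, 'mySLOS', auth_url,
                             'IBMOS278685-10', 'user@example.com', 'password_234234ada')

# old-style call: password omitted, so <tenant> is parsed as tenant:username
# and the <username> argument is reused as the password (emits a UserWarning)
slos_old = ibmos2spark.softlayer(sc, 'mySLOS', auth_url,
                                 'IBMOS278685-10:user@example.com', 'password_234234ada')
```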
````diff
@@ -116,7 +116,7 @@ def __init__(self, sparkcontext, credentials, name=None, public=False, swift2d_d
         sparkcontext: a SparkContext object.
 
         credentials: a dictionary with the following required keys:
-
+
         auth_url
         project_id (or projectId)
         user_id (or userId)
@@ -148,12 +148,12 @@ def __init__(self, sparkcontext, credentials, name=None, public=False, swift2d_d
         try:
             user_id = credentials['user_id']
         except KeyError as e:
-            user_id = credentials['userId']
+            user_id = credentials['userId']
 
         try:
             tenant = credentials['project_id']
         except KeyError as e:
-            tenant = credentials['projectId']
+            tenant = credentials['projectId']
 
         prefix = "fs.swift2d.service." + self.name
         hconf = sparkcontext._jsc.hadoopConfiguration()
````
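The try/except pairs above are what let a Bluemix credentials dictionary use either snake_case or camelCase key spellings. A sketch with placeholder values (the remaining required keys are elided here):

```python
# Either spelling works for the keys handled by the try/except above;
# values are placeholders and the other required keys are elided.
creds_snake = {'auth_url': '...', 'project_id': '...', 'user_id': '...'}
creds_camel = {'auth_url': '...', 'projectId': '...', 'userId': '...'}
```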
````diff
@@ -170,3 +170,68 @@ def __init__(self, sparkcontext, credentials, name=None, public=False, swift2d_d
 
     def url(self, container_name, object_name):
         return swifturl2d(self.name, container_name, object_name)
+
+
+class CloudObjectStorage(object):
+
+    def __init__(self, sparkcontext, credentials, cos_id='', bucket_name=''):
+
+        '''
+        sparkcontext: a SparkContext object.
+
+        credentials: a dictionary with the following required keys:
+          * endpoint
+          * access_key
+          * secret_key
+
+        When using this on DSX, credentials and bucket_name can be obtained
+        in DSX notebooks by clicking on the data sources palette, choosing
+        the data source you want to access, and hitting "insert credentials".
+
+        cos_id [optional]: the cloud object storage unique id. It is useful
+        to keep in the class instance for further checks after initialization.
+        However, it is not mandatory for the class instance to work. This
+        value can be retrieved by calling the get_os_id function.
+
+        bucket_name (projectId in DSX) [optional]: string that identifies the
+        default bucket you want to access files from in the COS service
+        instance. In DSX, bucket_name is the same as projectId. One bucket is
+        associated with one project.
+        If this value is not specified, you need to pass it when
+        you use the url function.
+
+        Warning: creating a new instance of this class overwrites any
+        spark hadoop configs previously set on the same spark context
+        instance.
+        '''
+        self.bucket_name = bucket_name
+        self.cos_id = cos_id
+
+        # check that all required credential keys are available
+        credential_key_list = ["endpoint", "access_key", "secret_key"]
+
+        for key in credential_key_list:
+            if key not in credentials:
+                raise ValueError("Invalid input: credentials.{} is required!".format(key))
+
+        # set up the Hadoop config for the s3d (stocator) scheme
+        prefix = "fs.s3d.service"
+        hconf = sparkcontext._jsc.hadoopConfiguration()
+        hconf.set(prefix + ".endpoint", credentials['endpoint'])
+        hconf.set(prefix + ".access.key", credentials['access_key'])
+        hconf.set(prefix + ".secret.key", credentials['secret_key'])
+
+    def get_os_id(self):
+        return self.cos_id
+
+    def url(self, object_name, bucket_name=''):
+        bucket_name_var = ''
+        if bucket_name:
+            bucket_name_var = bucket_name
+        elif self.bucket_name:
+            bucket_name_var = self.bucket_name
+        else:
+            raise ValueError("Invalid input: bucket_name is required!")
+
+        return "s3d://{}.service/{}".format(bucket_name_var, object_name)
````
