
Commit 39ef477

Merge pull request #213 from JosepSampe/pywren-dev

Docs updated & fixes

2 parents 9f56e60 + 1143073

File tree: 18 files changed (+126 −113 lines)

README.md (1 addition, 1 deletion)

````diff
@@ -98,7 +98,7 @@ def add_seven(x):
 if __name__ == '__main__':
     ibmcf = pywren.ibm_cf_executor()
     ibmcf.call_async(add_seven, 3)
-    print (ibmcf.get_result())
+    print(ibmcf.get_result())
 ```

 ### Functions
````
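The README fix above only removes a stray space before the call's parentheses. To see why the flow still works, here is a minimal local stand-in for the executor; `StubExecutor` is hypothetical and only mimics the `call_async()`/`get_result()` sequence, whereas the real `pywren.ibm_cf_executor()` runs the function remotely on IBM Cloud Functions:

```python
def add_seven(x):
    return x + 7

class StubExecutor:
    """Hypothetical local stand-in for pywren.ibm_cf_executor()."""
    def __init__(self):
        self._result = None

    def call_async(self, func, data):
        # The real executor serializes func/data and invokes a cloud action;
        # here the call simply runs locally.
        self._result = func(data)

    def get_result(self):
        return self._result

ibmcf = StubExecutor()
ibmcf.call_async(add_seven, 3)
print(ibmcf.get_result())  # 10
```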

config/README.md (11 additions, 2 deletions)

````diff
@@ -116,7 +116,7 @@ pw = pywren.ibm_cf_executor(rabbitmq_monitor=True)

 |Group|Key|Default|Mandatory|Additional info|
 |---|---|---|---|---|
-|ibm_cf| endpoint | |yes | IBM Cloud Functions endpoint from [here](https://cloud.ibm.com/docs/openwhisk?topic=cloud-functions-cloudfunctions_regions#cloud-functions-endpoints). Make sure to use https:// prefix |
+|ibm_cf| endpoint | |yes | IBM Cloud Functions endpoint from [here](https://cloud.ibm.com/docs/openwhisk?topic=cloud-functions-cloudfunctions_regions#cloud-functions-endpoints). Make sure to use https:// prefix, for example: https://us-east.functions.cloud.ibm.com |
 |ibm_cf| namespace | |yes | Value of CURRENT NAMESPACE from [here](https://cloud.ibm.com/functions/namespace-settings) |
 |ibm_cf| api_key | | no | **Mandatory** if using Cloud Foundry-based namespace. Value of 'KEY' from [here](https://cloud.ibm.com/functions/namespace-settings)|
 |ibm_cf| namespace_id | |no | **Mandatory** if using IAM-based namespace with IAM API Key. Value of 'GUID' from [here](https://cloud.ibm.com/functions/namespace-settings)|
@@ -137,4 +137,13 @@ pw = pywren.ibm_cf_executor(rabbitmq_monitor=True)

 |Group|Key|Default|Mandatory|Additional info|
 |---|---|---|---|---|
-| rabbitmq |amqp_url | |no | AMQP URL |
+| rabbitmq |amqp_url | |no | AMQP URL from RabbitMQ service. Make sure to use amqp:// prefix |
+
+
+### Summary of configuration keys for Knative:
+
+|Group|Key|Default|Mandatory|Additional info|
+|---|---|---|---|---|
+|knative | endpoint | |no | Istio IngressGateway Endpoint. Make sure to use http:// prefix |
+|knative | docker_user | |yes | Docker hub username |
+|knative | docker_token | |yes | Login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)|
````
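The mandatory-key columns of the tables added above can be checked programmatically before handing a config to an executor. A sketch, assuming a plain dict config; `check_mandatory` and the `MANDATORY` map are illustrative helpers, not part of the PyWren API:

```python
# Mandatory keys mirroring the tables above (per-group; a group that is
# absent from the config is simply skipped, since backends are optional).
MANDATORY = {
    'ibm_cf': ['endpoint', 'namespace'],
    'knative': ['docker_user', 'docker_token'],
}

def check_mandatory(config):
    """Return the list of missing (group, key) pairs."""
    missing = []
    for group, keys in MANDATORY.items():
        if group not in config:
            continue
        for key in keys:
            if not config[group].get(key):
                missing.append((group, key))
    return missing

config = {'ibm_cf': {'endpoint': 'https://us-east.functions.cloud.ibm.com',
                     'namespace': 'my-namespace'}}
print(check_mandatory(config))  # []
```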

config/config_template.yaml (11 additions, 6 deletions)

````diff
@@ -21,17 +21,22 @@ ibm_cf:
    api_key   : <API_KEY>

 ibm_cos:
-   endpoint   : <REGION_ENDPOINT>
-   api_key    : <API_KEY>
+    endpoint   : <REGION_ENDPOINT>
+    api_key    : <API_KEY>
    #access_key : <ACCESS_KEY> # Optional
    #secret_key : <SECRET_KEY> # Optional
-
+
+#rabbitmq:
+    #amqp_url : <RABBIT_AMQP_URL> # amqp://
+
+#knative:
+    # endpoint    : <ISTIO_INGRESS_ENDPOINT>
+    # docker_user : <DOCKER_HUB_USERNAME>
+    # docker_token: <DOCKER_HUB_TOKEN>
+
 #swift:
    #auth_url   : <SWIFT_AUTH_URL>
    #region     : <SWIFT_REGION>
    #user_id    : <SWIFT_USER_ID>
    #project_id : <SWIFT_PROJECT_ID>
    #password   : <SWIFT_PASSWORD>
-
-
-#rabbitmq:
-    #amqp_url : <RABBIT_AMQP_URL> # amqp://
````
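The same settings can also be supplied programmatically as a Python dict through the executor's `config=` parameter (as the previous version of examples/knative.py in this commit did), which is convenient in notebooks and tests. A sketch mirroring the template; the placeholder values are illustrative:

```python
# Dict form of the YAML template above; fill in real values before use.
config = {
    'ibm_cf': {
        'endpoint': '<CF_ENDPOINT>',        # https:// prefix required
        'namespace': '<NAMESPACE>',
        'api_key': '<API_KEY>',
    },
    'ibm_cos': {
        'endpoint': '<REGION_ENDPOINT>',
        'api_key': '<API_KEY>',
    },
    'rabbitmq': {
        'amqp_url': '<RABBIT_AMQP_URL>',    # amqp:// prefix required
    },
    'knative': {
        'endpoint': '<ISTIO_INGRESS_ENDPOINT>',
        'docker_user': '<DOCKER_HUB_USERNAME>',
        'docker_token': '<DOCKER_HUB_TOKEN>',
    },
}
print(sorted(config))
```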

docs/data-processing.md (45 additions, 56 deletions)

````diff
@@ -6,80 +6,71 @@ Additionally, the built-in data-processing logic integrates a **data partitioner**

 ## Processing data from IBM Cloud Object Storage
-The input to the partitioner may be either a list of data objects, a list of URLs or the entire bucket itself. The partitioner is activated inside PyWren and it responsible to split the objects into smaller chunks. It executes one *`my_map_function`* for each object chunk and when all executions are completed, the partitioner executes the *`my_reduce_function`*. The reduce function will wait for all the partial results before processing them.
+This mode is activated when you write the parameter **obj** into the function arguments. The input to the partitioner may be either a list of buckets, a list of buckets with object prefix, or a list of data objects. If you set the *size of the chunk* or the *number of chunks*, the partitioner is activated inside PyWren and it is responsible for splitting the objects into smaller chunks, eventually running one function activation for each generated chunk. If *size of the chunk* and *number of chunks* are not set, the chunk is an entire object, so one function activation is executed for each individual object.

-#### Partitioner get a list of objects
+The *obj* parameter is a Python class from which you can access all the information related to the object (or chunk) that the function is processing. For example, consider the following function, which shows all the available attributes in *obj*:

 ```python
-import pywren_ibm_cloud as pywren
+def my_map_function(obj):
+    print(obj.bucket)
+    print(obj.key)
+    print(obj.data_stream.read())
+    print(obj.part)
+    print(obj.data_byte_range)
+    print(obj.chunk_size)
+```

-iterdata = ['cos://bucket1/object1', 'cos://bucket1/object2', 'cos://bucket1/object3']
+As stated above, the allowed inputs of the function can be:

-def my_map_function(obj):
-    for line in obj.data_stream:
-        # Do some process
-    return partial_intersting_data
+- Input data is a bucket or a list of buckets. See a complete example in [map_reduce_cos_bucket.py](../examples/map_reduce_cos_bucket.py):
+```python
+iterdata = 'cos://bucket1'
+```

-def my_reduce_function(results):
-    for partial_intersting_data in results:
-        # Do some process
-    return final_result
+- Input data is a bucket (or buckets) with an object prefix. See a complete example in [map_cos_prefix.py](../examples/map_cos_prefix.py):
+```python
+iterdata = ['cos://bucket1/images/', 'cos://bucket1/videos/']
+```
+Notice that you must write the trailing slash (/) to inform the partitioner that you are providing an object prefix.

-chunk_size = 4*1024**2 # 4MB
+- Input data is a list of object keys. See a complete example in [map_reduce_cos_key.py](../examples/map_reduce_cos_key.py):
+```python
+iterdata = ['cos://bucket1/object1', 'cos://bucket1/object2', 'cos://bucket1/object3']
+```
+
+Notice that *iterdata* must be only one of the previous 3 types. Intermingled types are not allowed. For example, you cannot put a bucket and some object keys in the same *iterdata* list:

-pw = pywren.ibm_cf_executor()
-pw.map_reduce(my_map_function, iterdata, my_reduce_function, chunk_size=chunk_size)
-result = pw.get_result()
+```python
+iterdata = ['cos://bucket1', 'cos://bucket1/object2', 'cos://bucket1/object3']
 ```

-| method | method signature |
-|---| ---|
-| `pw.map_reduce`(`my_map_function`, `iterdata`, `my_reduce_function`, `chunk_size`)| `iterdata` contains list of objects in the format of `bucket_name/object_name` |
-| `my_map_function`(`obj`) | `obj` is a Python class that contains the *bucket*, *key* and *data_stream* of the object assigned to the activation|
-
-#### Partitioner gets entire bucket
+Once iterdata is defined, you can execute PyWren as usual, either with *map()* or *map_reduce()* calls. If you need to split the files into smaller chunks, you can optionally set the *chunk_size* or *chunk_n* parameters.

-Commonly, a dataset may contains hundreds or thousands of files, so the previous approach where you have to specify each object one by one is not well suited in this case. With this new `map_reduce()` method you can specify, instead, the bucket name which contains all the object of the dataset.
-
 ```python
 import pywren_ibm_cloud as pywren

-bucket_name = 'cos://my_data_bucket'
-
-def my_map_function(obj, ibm_cos):
-    for line in obj.data_stream:
-        # Do some process
-    return partial_intersting_data
-
-def my_reduce_function(results):
-    for partial_intersting_data in results:
-        # Do some process
-    return final_result
-
 chunk_size = 4*1024**2 # 4MB

 pw = pywren.ibm_cf_executor()
-pw.map_reduce(my_map_function, bucket_name, my_reduce_function, chunk_size=chunk_size)
+pw.map_reduce(my_map_function, iterdata, chunk_size=chunk_size)
 result = pw.get_result()
 ```

-* If `chunk_size=None` then partitioner's granularity is a single object.
-
-| method | method signature |
-|---| ---|
-| `pw.map_reduce`(`my_map_function`, `bucket_name`, `my_reduce_function`, `chunk_size`)| `bucket_name` contains the name of the bucket |
-| `my_map_function`(`obj`, `ibm_cos`) | `obj` is a Python class that contains the *bucket*, *key* and *data_stream* of the object assigned to the activation. `ibm_cos` is an optional parameter which provides a `ibm_boto3.Client()`|
-
-
 ## Processing data from public URLs
+This mode is activated when you write the parameter **url** into the function arguments. The input to the partitioner must be a list of object URLs. As with COS data processing, if you set the *size of the chunk* or the *number of chunks*, the partitioner is activated inside PyWren and splits the objects into smaller chunks, as long as the remote storage server allows range requests. If range requests are not allowed, each URL is treated as a single object. For example, consider the following code, which shows all the available attributes in *url*:

 ```python
 import pywren_ibm_cloud as pywren

-iterdata = ['http://myurl/myobject1', 'http://myurl/myobject1']
-
 def my_map_function(url):
+    print(url.path)
+    print(url.data_stream.read())
+    print(url.part)
+    print(url.data_byte_range)
+    print(url.chunk_size)
+
     for line in url.data_stream:
         # Do some process
     return partial_intersting_data
@@ -89,24 +80,22 @@ def my_reduce_function(results):
         # Do some process
     return final_result

-chunk_size = 4*1024**2 # 4MB
+iterdata = ['http://myurl/myobject1', 'http://myurl/myobject1']
+chunk_n = 5

 pw = pywren.ibm_cf_executor()
-pw.map_reduce(my_map_function, iterdata, my_reduce_function, chunk_size=chunk_size)
+pw.map_reduce(my_map_function, iterdata, my_reduce_function, chunk_n=chunk_n)
 result = pw.get_result()
 ```

-| method | method signature |
-|---| ---|
-| `pw.map_reduce`(`my_map_function`, `iterdata`, `my_reduce_function`, `chunk_size`)| `iterdata` contains list of objects in the format of `http://myurl/myobject.data` |
-| `my_map_function`(`url`) | `url` is a Python class that contains the url *path* assigned to the activation (an entry of iterdata) and the *data_stream*|
+See a complete example in [map_reduce_url.py](../examples/map_reduce_url.py).

 ## Reducer granularity
-By default there will be one reducer for all the objects. If you need one reducer for each object, you must set the parameter
-`reducer_one_per_object=True` into the **map_reduce()** method.
+By default there will be one reducer for all the object chunks. If you need one reducer for each object, you must set the parameter
+`reducer_one_per_object=True` in the *map()* or *map_reduce()* methods.

 ```python
 pw.map_reduce(my_map_function, bucket_name, my_reduce_function,
               chunk_size=chunk_size, reducer_one_per_object=True)
 ```
````
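The chunk arithmetic the rewritten docs describe (`chunk_size` for fixed-size chunks, `chunk_n` for a fixed number of chunks, neither set meaning one activation per object) can be sketched as a pure function. This is an illustration of the idea, not PyWren's actual partitioner code; `split_object` is a hypothetical helper:

```python
def split_object(obj_size, chunk_size=None, chunk_n=None):
    """Return a list of (first_byte, last_byte) ranges covering obj_size."""
    if chunk_size is None and chunk_n is None:
        return [(0, obj_size - 1)]            # one activation per object
    if chunk_n is not None:
        chunk_size = -(-obj_size // chunk_n)  # ceil division
    ranges = []
    start = 0
    while start < obj_size:
        end = min(start + chunk_size, obj_size) - 1
        ranges.append((start, end))
        start = end + 1
    return ranges

# A 10MB object with chunk_size=4MB yields two 4MB chunks plus a 2MB tail,
# so three function activations would be spawned for it.
print(split_object(10 * 1024**2, chunk_size=4 * 1024**2))
```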

docs/knative.md (3 additions, 4 deletions)

````diff
@@ -4,14 +4,14 @@ The easiest way to make it working is to create an IBM Kubernetes (IKS) cluster
 - Install Kubernetes v1.15.3
 - Select a **single zone** to place the worker nodes
 - *Master service endpoint*: Public endpoint only
-- You must create a cluster with at least 3 worker nodes, each one with a minimum flavor of 4vCPU and 16GB RAM
+- Your cluster must have 3 or more worker nodes with at least 4 cores and 16GB RAM.
 - No need to encrypt local disk

 Once the cluster is running, follow the instructions of the "Access" tab to configure the *kubectl* client in your local machine. Then, follow one of this two options to install the PyWren environment:

 - Option 1 (IBM IKS):

-    1. In the Dashboard of your cluster, go to the "Add-ons" tab and install knative v0.8.0. It automatically installs Istio v1.2.5 and Tekton v0.3.1.
+    1. In the Dashboard of your cluster, go to the "Add-ons" tab and install knative v0.8.0. It automatically installs Istio v1.3.0 and Tekton v0.3.1.


 - Option 2 (IBM IKS or any other Kubernetes Cluster):
@@ -41,7 +41,7 @@ knative:
     docker_user: my-username
     docker_token: 12e9075f-6cd7-4147-a01e-8e34ffe9196e
 ```
-- **docker_token**: Login to your docker hub account and generate a new docker access token [here](https://hub.docker.com/settings/security)
+- **docker_token**: Login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)

@@ -63,6 +63,5 @@ if __name__ == '__main__':
 #### Check how pods and other resources are created:

 ```
-export KUBECONFIG=/home/... (Same as before in "Access" tab)
 watch kubectl get pod,revision,service,deployment -o wide
 ```
````

examples/knative.py (10 additions, 13 deletions)

````diff
@@ -1,23 +1,20 @@
 """
-Simple PyWren example using one single function invocation
+Simple PyWren example using the map method.
+In this example the map() method will launch one
+map function for each entry in 'iterdata'. Finally
+it will print the results for each invocation with
+pw.get_result()
 """
 import pywren_ibm_cloud as pywren


-#iterdata = [1, 2, 3, 4]
-iterdata = range(10)
-#iterdata = [2, 3, 4]
-
-def my_function(x):
+def my_function(id, x):
+    print("I'm activation number {}".format(id))
     return x + 7

-config = {'pywren': {'runtime': '<>','compute_backend': 'knative', 'storage_bucket': 'pywren-knative', 'storage_prefix': 'pywren.jobs'},
-          #'knative': {'docker_user': 'iamapikey', 'docker_password': '<iamkey>', 'docker_repo': 'uk.icr.io'},
-          'knative': {'docker_user': '<docker-hub user>', 'docker_password': 'docker-hub password', 'docker_repo': 'docker.io'},
-          'ibm_cos': {}}

 if __name__ == '__main__':
-    pw = pywren.ibm_cf_executor(config=config)
-    #pw.call_async(my_function, 3)
+    iterdata = [1, 2, 3, 4]
+    pw = pywren.knative_executor()
     pw.map(my_function, iterdata)
-    print (pw.get_result())
+    print(pw.get_result())
````
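The rewritten example's `my_function(id, x)` signature relies on PyWren feeding each activation its id alongside one `iterdata` entry. A local mimic of that behaviour, assuming `local_map` as an illustrative stand-in for the remote `pw.map()`:

```python
def my_function(id, x):
    print("I'm activation number {}".format(id))
    return x + 7

def local_map(func, iterdata):
    # Stand-in for pw.map(): one call per iterdata entry, with the
    # activation id injected as the first argument.
    return [func(i, x) for i, x in enumerate(iterdata)]

print(local_map(my_function, [1, 2, 3, 4]))  # [8, 9, 10, 11]
```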

examples/map.py (1 addition, 1 deletion)

````diff
@@ -3,7 +3,7 @@
 In this example the map() method will launch one
 map function for each entry in 'iterdata'. Finally
 it will print the results for each invocation with
-pw.get_all_result()
+pw.get_result()
 """
 import pywren_ibm_cloud as pywren
````

pywren_ibm_cloud/compute/backends/knative/knative.py (18 additions, 11 deletions)

````diff
@@ -14,6 +14,7 @@
 import urllib3
 urllib3.disable_warnings()
 logging.getLogger('kubernetes').setLevel(logging.CRITICAL)
+logging.getLogger('urllib3.connectionpool').setLevel(logging.CRITICAL)

 #Monkey patch for issue: https://github.com/kubernetes-client/python/issues/895
 from kubernetes.client.models.v1_container_image import V1ContainerImage
@@ -411,7 +412,7 @@ def list_runtimes(self, docker_image_name='all'):
         runtimes = [[docker_image_name, 256]]
         return runtimes

-    def invoke(self, docker_image_name, memory, payload):
+    def invoke(self, docker_image_name, memory, payload, return_result=False):
         """
         Invoke -- return information about this invocation
         """
@@ -425,12 +426,16 @@ def invoke(self, docker_image_name, memory, payload):
         route = payload.get("service_route", '/')

         try:
+            logger.debug('ExecutorID {} | JobID {} - Starting function invocation {}'
+                         .format(exec_id, job_id, call_id))
             start = time.time()
             parsed_url = urlparse(self.endpoint)
             conn = http.client.HTTPConnection(parsed_url.netloc, timeout=600)
             conn.request("POST", route,
                          body=json.dumps(payload),
                          headers=self.headers)
+            logger.debug('ExecutorID {} | JobID {} - Function invocation {} done. Waiting '
+                         'for a response'.format(exec_id, job_id, call_id))
             resp = conn.getresponse()
             resp_status = resp.status
             resp_data = resp.read().decode("utf-8")
@@ -440,28 +445,30 @@ def invoke(self, docker_image_name, memory, payload):

             if resp_status in [200, 202]:
                 data = json.loads(resp_data)
-                log_msg = ('ExecutorID {} - Function invocation {} done! ({}s) '
-                           .format(exec_id, call_id, resp_time))
+                log_msg = ('ExecutorID {} | JobID {} - Function activation {} finished! ({}s) '
+                           .format(exec_id, job_id, call_id, resp_time))
                 logger.debug(log_msg)
-                return exec_id + job_id + call_id, data
+                if return_result:
+                    return data
+                return data["activationId"]
             elif resp_status == 404:
                 raise Exception("PyWren runtime is not deployed in your k8s cluster")
             else:
-                log_msg = ('ExecutorID {} - Function invocation {} failed: {} {}'
-                           .format(exec_id, call_id, resp_status, resp_data))
+                log_msg = ('ExecutorID {} | JobID {} - Function invocation {} failed: {} {}'
+                           .format(exec_id, job_id, call_id, resp_status, resp_data))
                 logger.debug(log_msg)

         except Exception as e:
             conn.close()
-            log_msg = ('ExecutorID {} - Function invocation {} failed: {}'
-                       .format(exec_id, call_id, str(e)))
+            log_msg = ('ExecutorID {} | JobID {} - Function invocation {} failed: {}'
+                       .format(exec_id, job_id, call_id, str(e)))
             logger.debug(log_msg)

     def invoke_with_result(self, docker_image_name, memory, payload={}):
         """
         Invoke waiting for a result -- return information about this invocation
         """
-        return self.invoke(docker_image_name, memory, payload)
+        return self.invoke(docker_image_name, memory, payload, return_result=True)

     def get_runtime_key(self, docker_image_name, runtime_memory):
         """
@@ -484,11 +491,11 @@ def _generate_runtime_meta(self, docker_image_name, memory):
         payload['service_route'] = "/preinstalls"
         logger.debug("Extracting Python modules list from: {}".format(docker_image_name))
         try:
-            _, runtime_meta = self.invoke_with_result(docker_image_name, memory, payload)
+            runtime_meta = self.invoke_with_result(docker_image_name, memory, payload)
         except Exception as e:
             raise Exception("Unable to invoke 'modules' action {}".format(e))

         if not runtime_meta or 'preinstalls' not in runtime_meta:
-            raise Exception(runtime_meta)
+            raise Exception('Failed getting runtime metadata: {}'.format(runtime_meta))

         return runtime_meta
````
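The `return_result` change above lets one `invoke()` serve both kinds of caller: asynchronous callers get back only the activation id, while `invoke_with_result()` gets the full response payload. A stripped-down sketch of the pattern, with the HTTP call replaced by a canned response (illustrative only, not the backend's real code):

```python
def invoke(payload, return_result=False):
    # Stands in for the HTTP POST to the Knative service; the response
    # shape (an 'activationId' plus result data) mirrors the pattern above.
    data = {'activationId': 'abc123', 'result': payload.get('x', 0) + 7}
    if return_result:
        return data           # synchronous caller: full payload
    return data['activationId']  # asynchronous caller: id only

def invoke_with_result(payload):
    return invoke(payload, return_result=True)

print(invoke({'x': 3}))              # 'abc123'
print(invoke_with_result({'x': 3}))  # {'activationId': 'abc123', 'result': 10}
```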

pywren_ibm_cloud/config.py (1 addition, 1 deletion)

````diff
@@ -30,7 +30,7 @@
 DATA_CLEANER_DEFAULT = False
 MAX_AGG_DATA_SIZE = 4e6
 INVOCATION_RETRY_DEFAULT = True
-RETRY_SLEEPS_DEFAULT = [1, 2, 4, 8]
+RETRY_SLEEPS_DEFAULT = [4, 8, 16, 24]
 RETRIES_DEFAULT = 5

 CONFIG_DIR = os.path.expanduser('~/.pywren')
````
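The new `RETRY_SLEEPS_DEFAULT` backs off more conservatively between retries. One common way such a sleep list is consumed (a sketch of the idea, not PyWren's actual retry loop) is that attempt *k* sleeps `sleeps[k]` and attempts beyond the list reuse the last value:

```python
RETRY_SLEEPS_DEFAULT = [4, 8, 16, 24]
RETRIES_DEFAULT = 5

def sleep_schedule(sleeps, retries):
    """Seconds slept before each retry; the last value is reused once
    the list is exhausted (illustrative helper, not part of PyWren)."""
    return [sleeps[min(i, len(sleeps) - 1)] for i in range(retries)]

print(sleep_schedule(RETRY_SLEEPS_DEFAULT, RETRIES_DEFAULT))  # [4, 8, 16, 24, 24]
```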
