 from .blyss_lib import BlyssLib
 
 import json
+import base64
 import bz2
 import time
+import asyncio
 
 
 def _chunk_parser(raw_data: bytes) -> Iterator[bytes]:
@@ -71,23 +73,72 @@ def _check(self, uuid: str) -> bool:
             else:
                 raise e
 
-    def _private_read(self, keys: list[str]) -> list[tuple[bytes, Optional[dict[Any, Any]]]]:
-        """Performs the underlying private retrieval.
+    async def _async_check(self, uuid: str) -> bool:
+        try:
+            await self.api.async_check(uuid)
+            return True
+        except api.ApiException as e:
+            if e.code == 404:
+                return False
+            else:
+                raise e
 
-        Args:
-            keys (str): A list of keys to retrieve.
+    def _split_into_chunks(
+        self, kv_pairs: dict[str, bytes]
+    ) -> list[list[dict[str, str]]]:
+        _MAX_PAYLOAD = 5 * 2**20  # 5 MiB
+
+        # 1. Bin keys by row index
+        keys_by_index: dict[int, list[str]] = {}
+        for k in kv_pairs.keys():
+            i = self.lib.get_row(k)
+            if i in keys_by_index:
+                keys_by_index[i].append(k)
+            else:
+                keys_by_index[i] = [k]
+
+        # 2. Prepare chunks of items, where each is a JSON-ready structure.
+        # Each chunk is less than the maximum payload size, and guarantees
+        # zero overlap of rows across chunks.
+        kv_chunks: list[list[dict[str, str]]] = []
+        current_chunk: list[dict[str, str]] = []
+        current_chunk_size = 0
+        sorted_indices = sorted(keys_by_index.keys())
+        for i in sorted_indices:
+            keys = keys_by_index[i]
+            # prepare all keys in this row
+            row = []
+            row_size = 0
+            for key in keys:
+                value = kv_pairs[key]
+                value_str = base64.b64encode(value).decode("utf-8")
+                fmt = {
+                    "key": key,
+                    "value": value_str,
+                    "content-type": "application/octet-stream",
+                }
+                row.append(fmt)
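+                # approximate serialized size of this item: key + base64 value,
+                # plus ~72 bytes, presumably per-item JSON overhead (field
+                # names, quotes, punctuation, and the content-type value)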
+                row_size += int(72 + len(key) + len(value_str))
+
+            # if the new row doesn't fit into the current chunk, start a new one
+            if current_chunk_size + row_size > _MAX_PAYLOAD:
+                kv_chunks.append(current_chunk)
+                current_chunk = row
+                current_chunk_size = row_size
+            else:
+                current_chunk.extend(row)
+                current_chunk_size += row_size
 
-        Returns:
-            tuple[bytes, Optional[dict]]: Returns a tuple of (value, optional_metadata).
-        """
-        if not self.public_uuid or not self._check(self.public_uuid):
-            self.setup()
-        assert self.public_uuid
+        # add the last chunk
+        if len(current_chunk) > 0:
+            kv_chunks.append(current_chunk)
+
+        return kv_chunks
 
+    def _generate_query_stream(self, keys: list[str]) -> bytes:
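+        # Wire format: query count (u64 LE), then each encrypted query
+        # prefixed by its length (u64 LE).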
         # generate encrypted queries
         queries: list[bytes] = [
-            self.lib.generate_query(self.public_uuid, self.lib.get_row(k))
-            for k in keys
+            self.lib.generate_query(self.public_uuid, self.lib.get_row(k)) for k in keys
         ]
         # interleave the queries with their lengths (uint64_t)
         query_lengths = [len(q).to_bytes(8, "little") for q in queries]
@@ -96,18 +147,43 @@ def _private_read(self, keys: list[str]) -> list[tuple[bytes, Optional[dict[Any,
         lengths_and_queries.insert(0, len(queries).to_bytes(8, "little"))
         # serialize the queries
         multi_query = b"".join(lengths_and_queries)
-
-        start = time.perf_counter()
-        multi_result = self.api.private_read(self.name, multi_query)
-        self.exfil = time.perf_counter() - start
+        return multi_query
 
-        retrievals = []
-        for key, result in zip(keys, _chunk_parser(multi_result)):
+    def _unpack_query_result(
+        self, keys: list[str], raw_result: bytes, parse_metadata: bool = True
+    ) -> list[bytes]:
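+        # Each chunk is decrypted, bz2-decompressed, and the value for the
+        # corresponding key is extracted; metadata parsing is optional.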
+        retrievals = []
+        for key, result in zip(keys, _chunk_parser(raw_result)):
             decrypted_result = self.lib.decode_response(result)
             decompressed_result = bz2.decompress(decrypted_result)
             extracted_result = self.lib.extract_result(key, decompressed_result)
-            output = serializer.deserialize(extracted_result)
+            if parse_metadata:
+                output = serializer.deserialize(extracted_result)
+            else:
+                output = extracted_result
             retrievals.append(output)
+        return retrievals
+
+    def _private_read(self, keys: list[str]) -> list[bytes]:
+        """Performs the underlying private retrieval.
+
+        Args:
+            keys (list[str]): The keys to retrieve.
+
+        Returns:
+            list[bytes]: One result per key, in the same order as the input keys.
+        """
+        if not self.public_uuid or not self._check(self.public_uuid):
+            self.setup()
+        assert self.public_uuid
+
+        multi_query = self._generate_query_stream(keys)
+
+        start = time.perf_counter()
+        multi_result = self.api.private_read(self.name, multi_query)
+        self.exfil = time.perf_counter() - start
+
+        retrievals = self._unpack_query_result(keys, multi_result)
 
         return retrievals
 
@@ -184,11 +260,11 @@ def private_read(self, keys: Union[str, list[str]]) -> Union[bytes, list[bytes]]
 
         Args:
             keys (str): A key or list of keys to privately read.
-                If a list of keys is supplied,
+                If a list of keys is supplied,
                 results will be returned in the same order.
 
         Returns:
-            bytes: The value found for the key in the bucket,
+            bytes: The value found for the key in the bucket,
                 or None if the key was not found.
         """
         single_query = False
@@ -202,7 +278,6 @@ def private_read(self, keys: Union[str, list[str]]) -> Union[bytes, list[bytes]]
 
         return results
 
-
     def private_key_intersect(self, keys: list[str]) -> list[str]:
         """Privately intersects the given set of keys with the keys in this bucket,
         returning the keys that intersected. This is generally slower than a single
@@ -217,3 +292,36 @@ def private_key_intersect(self, keys: list[str]) -> list[str]:
         bloom_filter = self.api.bloom(self.name)
         present_keys = list(filter(bloom_filter.lookup, keys))
         return present_keys
+
+
+class AsyncBucket(Bucket):
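+    """Bucket variant whose writes and private reads go through the async API client."""
+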
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    async def write(self, kv_pairs: dict[str, bytes], MAX_CONCURRENCY=8):
+        # Split the key-value pairs into chunks not exceeding max payload size.
+        kv_chunks = self._split_into_chunks(kv_pairs)
+        # Make one write call per chunk, while respecting a max concurrency limit.
+        sem = asyncio.Semaphore(MAX_CONCURRENCY)
+
+        async def _paced_writer(chunk):
+            async with sem:
+                await self.api.async_write(self.name, json.dumps(chunk))
+
+        _tasks = [asyncio.create_task(_paced_writer(c)) for c in kv_chunks]
+        await asyncio.gather(*_tasks)
+
+    async def private_read(self, keys: list[str]) -> list[bytes]:
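+        # Mirrors Bucket._private_read, but awaits the async API client and
+        # skips metadata parsing, returning the raw extracted bytes.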
+        if not self.public_uuid or not await self._async_check(self.public_uuid):
+            self.setup()
+        assert self.public_uuid
+
+        multi_query = self._generate_query_stream(keys)
+
+        start = time.perf_counter()
+        multi_result = await self.api.async_private_read(self.name, multi_query)
+        self.exfil = time.perf_counter() - start
+
+        retrievals = self._unpack_query_result(keys, multi_result, parse_metadata=False)
+
+        return retrievals