Skip to content

Commit 901fcaf

Browse files
committed
support scaling/hard timeouts & graceful-fail for sync batch
1 parent 14d1ceb commit 901fcaf

File tree

4 files changed

+117
-34
lines changed

4 files changed

+117
-34
lines changed

ipinfo/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,9 @@ class RequestQuotaExceededError(Exception):
77
"""Error indicating that the user's monthly request quota has been passed."""
88

99
pass
10+
11+
12+
class TimeoutExceededError(Exception):
13+
"""Error indicating that some timeout has been exceeded."""
14+
15+
pass

ipinfo/handler.py

Lines changed: 82 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,22 @@
66
import json
77
import os
88
import sys
9+
import time
910

1011
import requests
1112

1213
from .cache.default import DefaultCache
1314
from .details import Details
14-
from .exceptions import RequestQuotaExceededError
15+
from .exceptions import RequestQuotaExceededError, TimeoutExceededError
16+
from .handler_utils import (
17+
API_URL,
18+
COUNTRY_FILE_DEFAULT,
19+
BATCH_MAX_SIZE,
20+
CACHE_MAXSIZE,
21+
CACHE_TTL,
22+
REQUEST_TIMEOUT_DEFAULT,
23+
BATCH_REQ_TIMEOUT_DEFAULT,
24+
)
1525
from . import handler_utils
1626

1727

@@ -21,10 +31,6 @@ class Handler:
2131
Instantiates and maintains access to cache.
2232
"""
2333

24-
CACHE_MAXSIZE = 4096
25-
CACHE_TTL = 60 * 60 * 24
26-
REQUEST_TIMEOUT_DEFAULT = 2
27-
2834
def __init__(self, access_token=None, **kwargs):
2935
"""
3036
Initialize the Handler object with country name list and the
@@ -40,21 +46,26 @@ def __init__(self, access_token=None, **kwargs):
4046
# setup req opts
4147
self.request_options = kwargs.get("request_options", {})
4248
if "timeout" not in self.request_options:
43-
self.request_options["timeout"] = self.REQUEST_TIMEOUT_DEFAULT
49+
self.request_options["timeout"] = REQUEST_TIMEOUT_DEFAULT
4450

4551
# setup cache
4652
if "cache" in kwargs:
4753
self.cache = kwargs["cache"]
4854
else:
4955
cache_options = kwargs.get("cache_options", {})
5056
if "maxsize" not in cache_options:
51-
cache_options["maxsize"] = self.CACHE_MAXSIZE
57+
cache_options["maxsize"] = CACHE_MAXSIZE
5258
if "ttl" not in cache_options:
53-
cache_options["ttl"] = self.CACHE_TTL
59+
cache_options["ttl"] = CACHE_TTL
5460
self.cache = DefaultCache(**cache_options)
5561

56-
def getDetails(self, ip_address=None):
57-
"""Get details for specified IP address as a Details object."""
62+
def getDetails(self, ip_address=None, timeout=None):
63+
"""
64+
Get details for specified IP address as a Details object.
65+
66+
If `timeout` is not `None`, it will override the client-level timeout
67+
just for this operation.
68+
"""
5869
# If the supplied IP address uses the objects defined in the built-in
5970
# module ipaddress extract the appropriate string notation before
6071
# formatting the URL.
@@ -66,12 +77,17 @@ def getDetails(self, ip_address=None):
6677
if ip_address in self.cache:
6778
return Details(self.cache[ip_address])
6879

80+
# prepare req http opts
81+
req_opts = {**self.request_options}
82+
if timeout is not None:
83+
req_opts["timeout"] = timeout
84+
6985
# not in cache; do http req
70-
url = handler_utils.API_URL
86+
url = API_URL
7187
if ip_address:
7288
url += "/" + ip_address
7389
headers = handler_utils.get_headers(self.access_token)
74-
response = requests.get(url, headers=headers, **self.request_options)
90+
response = requests.get(url, headers=headers, **req_opts)
7591
if response.status_code == 429:
7692
raise RequestQuotaExceededError()
7793
response.raise_for_status()
@@ -83,7 +99,14 @@ def getDetails(self, ip_address=None):
8399

84100
return Details(details)
85101

86-
def getBatchDetails(self, ip_addresses, batch_size=None):
102+
def getBatchDetails(
103+
self,
104+
ip_addresses,
105+
batch_size=None,
106+
timeout_per_batch=BATCH_REQ_TIMEOUT_DEFAULT,
107+
timeout_total=None,
108+
raise_on_fail=True,
109+
):
87110
"""
88111
Get details for a batch of IP addresses at once.
89112
@@ -92,11 +115,26 @@ def getBatchDetails(self, ip_addresses, batch_size=None):
92115
all of the response data, which is at least a magnitude larger than the
93116
input list).
94117
118+
The input list is broken up into batches to abide by API requirements.
95119
The batch size can be adjusted with `batch_size` but is clipped to (and
96-
also defaults to) `handler_utils.BATCH_MAX_SIZE`.
120+
also defaults to) `BATCH_MAX_SIZE`.
121+
122+
For each batch, `timeout_per_batch` indicates the maximum seconds to
123+
spend waiting for the HTTP request to complete. If any batch fails with
124+
this timeout, the whole operation fails.
125+
Defaults to `BATCH_REQ_TIMEOUT_DEFAULT` seconds.
126+
127+
`timeout_total` is a seconds-denominated hard-timeout for the time
128+
spent in HTTP operations; regardless of whether all batches have
129+
succeeded so far, if `timeout_total` is reached, the whole operation
130+
will fail. Defaults to being turned off.
131+
132+
`raise_on_fail`, if turned off, will return any result retrieved so far
133+
rather than raise an exception when errors occur, including timeout and
134+
quota errors. Defaults to on.
97135
"""
98136
if batch_size == None:
99-
batch_size = handler_utils.BATCH_MAX_SIZE
137+
batch_size = BATCH_MAX_SIZE
100138

101139
result = {}
102140

@@ -117,23 +155,44 @@ def getBatchDetails(self, ip_addresses, batch_size=None):
117155
else:
118156
lookup_addresses.append(ip_address)
119157

158+
# prepare req http options
159+
req_opts = {**self.request_options, "timeout": timeout_per_batch}
160+
161+
if timeout_total is not None:
162+
start_time = time.time()
163+
120164
# loop over batch chunks and do lookup for each.
121165
for i in range(0, len(ip_addresses), batch_size):
166+
# quit if total timeout is reached.
167+
if (
168+
timeout_total is not None
169+
and time.time() - start_time > timeout_total
170+
):
171+
if raise_on_fail:
172+
raise TimeoutExceededError()
173+
else:
174+
return result
175+
122176
chunk = ip_addresses[i : i + batch_size]
123177

124178
# lookup
125-
url = handler_utils.API_URL + "/batch"
179+
url = API_URL + "/batch"
126180
headers = handler_utils.get_headers(self.access_token)
127181
headers["content-type"] = "application/json"
128182
response = requests.post(
129-
url,
130-
json=lookup_addresses,
131-
headers=headers,
132-
**self.request_options
183+
url, json=lookup_addresses, headers=headers, **req_opts
133184
)
134-
if response.status_code == 429:
135-
raise RequestQuotaExceededError()
136-
response.raise_for_status()
185+
186+
# fail on bad status codes
187+
try:
188+
if response.status_code == 429:
189+
raise RequestQuotaExceededError()
190+
response.raise_for_status()
191+
except Exception as e:
192+
if raise_on_fail:
193+
raise e
194+
else:
195+
return result
137196

138197
# fill cache
139198
json_response = response.json()

ipinfo/handler_async.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,15 @@
1313
from .cache.default import DefaultCache
1414
from .details import Details
1515
from .exceptions import RequestQuotaExceededError
16+
from .handler_utils import (
17+
API_URL,
18+
COUNTRY_FILE_DEFAULT,
19+
BATCH_MAX_SIZE,
20+
CACHE_MAXSIZE,
21+
CACHE_TTL,
22+
REQUEST_TIMEOUT_DEFAULT,
23+
BATCH_REQ_TIMEOUT_DEFAULT,
24+
)
1625
from . import handler_utils
1726

1827

@@ -22,10 +31,6 @@ class AsyncHandler:
2231
Instantiates and maintains access to cache.
2332
"""
2433

25-
CACHE_MAXSIZE = 4096
26-
CACHE_TTL = 60 * 60 * 24
27-
REQUEST_TIMEOUT_DEFAULT = 2
28-
2934
def __init__(self, access_token=None, **kwargs):
3035
"""
3136
Initialize the Handler object with country name list and the
@@ -41,7 +46,7 @@ def __init__(self, access_token=None, **kwargs):
4146
# setup req opts
4247
self.request_options = kwargs.get("request_options", {})
4348
if "timeout" not in self.request_options:
44-
self.request_options["timeout"] = self.REQUEST_TIMEOUT_DEFAULT
49+
self.request_options["timeout"] = REQUEST_TIMEOUT_DEFAULT
4550

4651
# setup aiohttp
4752
self.httpsess = None
@@ -52,9 +57,9 @@ def __init__(self, access_token=None, **kwargs):
5257
else:
5358
cache_options = kwargs.get("cache_options", {})
5459
if "maxsize" not in cache_options:
55-
cache_options["maxsize"] = self.CACHE_MAXSIZE
60+
cache_options["maxsize"] = CACHE_MAXSIZE
5661
if "ttl" not in cache_options:
57-
cache_options["ttl"] = self.CACHE_TTL
62+
cache_options["ttl"] = CACHE_TTL
5863
self.cache = DefaultCache(**cache_options)
5964

6065
async def init(self):
@@ -97,7 +102,7 @@ async def getDetails(self, ip_address=None):
97102
return Details(self.cache[ip_address])
98103

99104
# not in cache; do http req
100-
url = handler_utils.API_URL
105+
url = API_URL
101106
if ip_address:
102107
url += "/" + ip_address
103108
headers = handler_utils.get_headers(self.access_token)
@@ -122,16 +127,17 @@ async def getBatchDetails(self, ip_addresses, batch_size=None):
122127
all of the response data, which is at least a magnitude larger than the
123128
input list).
124129
130+
The input list is broken up into batches to abide by API requirements.
125131
The batch size can be adjusted with `batch_size` but is clipped to (and
126-
also defaults to) `handler_utils.BATCH_MAX_SIZE`.
132+
also defaults to) `BATCH_MAX_SIZE`.
127133
128134
The concurrency level is currently unadjustable; coroutines will be
129135
created and consumed for all batches at once.
130136
"""
131137
self._ensure_aiohttp_ready()
132138

133139
if batch_size == None:
134-
batch_size = handler_utils.BATCH_MAX_SIZE
140+
batch_size = BATCH_MAX_SIZE
135141

136142
result = {}
137143

@@ -162,7 +168,7 @@ async def getBatchDetails(self, ip_addresses, batch_size=None):
162168
return result
163169

164170
# do http req
165-
url = handler_utils.API_URL + "/batch"
171+
url = API_URL + "/batch"
166172
headers = handler_utils.get_headers(self.access_token)
167173
headers["content-type"] = "application/json"
168174
reqs.append(

ipinfo/handler_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,18 @@
1818
# The max amount of IPs allowed by the API per batch request.
1919
BATCH_MAX_SIZE = 1000
2020

21+
# The default max size of the cache in terms of number of items.
22+
CACHE_MAXSIZE = 4096
23+
24+
# The default TTL of the cache in seconds.
25+
CACHE_TTL = 60 * 60 * 24
26+
27+
# The default request timeout for per-IP requests.
28+
REQUEST_TIMEOUT_DEFAULT = 2
29+
30+
# The default request timeout for batch requests.
31+
BATCH_REQ_TIMEOUT_DEFAULT = 5
32+
2133

2234
def get_headers(access_token):
2335
"""Build headers for request to IPinfo API."""

0 commit comments

Comments
 (0)