Merge pull request #208 from aperture-data/release-0.4.1

gsaluja9 · web-flow · commit f7b59ae8eda9 · 2023-02-09T09:49:06.000-08:00
Addresses connection handling on fork.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -19,7 +19,7 @@ jobs:
   # This workflow contains a single job called "build-test"
   build-test:
     # The type of runner that the job will run on Ubuntu 18.04 (latest)
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
@@ -61,7 +61,7 @@ jobs:
       - build-test
 
     # The type of runner that the job will run on Ubuntu 18.04 (latest)
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -15,7 +15,7 @@ on:
 # that can run sequentially or in parallel
 jobs:
   run_test:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
@@ -50,7 +50,7 @@ jobs:
           BRANCH_NAME=${{ github.event.pull_request.head.ref }} ./run_test_container.sh
 
   run_test_conda_gpu:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
@@ -88,7 +88,7 @@ jobs:
           ./run_test_container.sh aperturedata/aperturedb-pytorch-gpu
 
   build_images:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE,
       # so your job can access it
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -19,7 +19,7 @@ jobs:
   # This workflow contains a single job called "build-test"
   build-test:
     # The type of runner that the job will run on Ubuntu 18.04 (latest)
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
@@ -58,7 +58,7 @@ jobs:
   build_and_deploy_docs:
 
     # The type of runner that the job will run on Ubuntu 18.04 (latest)
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     # Steps represent a sequence of tasks that will be
     # executed as part of the job
diff --git a/aperturedb/Connector.py b/aperturedb/Connector.py
@@ -287,25 +287,27 @@ def _query(self, query, blob_array = []):
 
         tries = 0
         while tries < 3:
-            if self._send_msg(data):
-                response = self._recv_msg()
-                if response is not None:
-                    break
-
+            try:
+                if self._send_msg(data):
+                    response = self._recv_msg()
+                    if response is not None:
+                        querRes = queryMessage_pb2.queryMessage()
+                        querRes.ParseFromString(response)
+                        response_blob_array = [b for b in querRes.blobs]
+                        self.last_response = json.loads(querRes.json)
+                        break
+            except ssl.SSLError as e:
+                # This can happen when multiple processes share a single
+                # connection (e.g. after a fork): the copied connection
+                # is not usable, so we fall through and reconnect below.
+                logger.warning(f"Socket error on process {os.getpid()}")
             tries += 1
-            logger.error(
-                f"Connection broken. Reconnectng attempt [{tries}/3] ..")
-            time.sleep(5)
+            logger.warning(
+                f"Connection broken. Reconnectng attempt [{tries}/3] .. PID = {os.getpid()}")
+            time.sleep(1)
             self._connect()
             self._renew_session()
 
-        querRes = queryMessage_pb2.queryMessage()
-        querRes.ParseFromString(response)
-
-        response_blob_array = [b for b in querRes.blobs]
-
-        self.last_response = json.loads(querRes.json)
-
         return (self.last_response, response_blob_array)
 
     def query(self, q, blobs=[]):
diff --git a/aperturedb/__init__.py b/aperturedb/__init__.py
@@ -7,7 +7,7 @@
 
 logger = logging.getLogger(__name__)
 
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 
 # set log level
 logger.setLevel(logging.DEBUG)
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 
 setuptools.setup(
     name="aperturedb",
-    version="0.4.0",
+    version="0.4.1",
     description="ApertureDB Client Module",
     install_requires=install_requires,
     long_description=long_description,
diff --git a/test/test_torch_connector.py b/test/test_torch_connector.py
@@ -1,28 +1,34 @@
 import time
 import os
 import logging
+from typing import Union
 
 import torch
 import torch.distributed as dist
 from aperturedb import Images
 from aperturedb import PyTorchDataset
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Dataset
 
 logger = logging.getLogger(__name__)
 
 
 class TestTorchDatasets():
-    def validate_dataset(self, dataset):
+    def validate_dataset(self, dataset: Union[DataLoader, Dataset], expected_length):
         start = time.time()
 
+        count = 0
         # Iterate over dataset.
         for img in dataset:
             if len(img[0]) < 0:
                 logger.error("Empty image?")
                 assert True == False
+            count += len(img[1]) if isinstance(dataset, DataLoader) else 1
+        assert count == expected_length
 
-        logger.info("\n")
-        logger.info("Throughput (imgs/s):",
-                    len(dataset) / (time.time() - start))
+        time_taken = time.time() - start
+        if time_taken != 0:
+            logger.info(f"Throughput (imgs/s): {len(dataset) / time_taken}")
 
     def test_omConstraints(self, db, utils, images):
         assert len(images) > 0
@@ -31,8 +37,7 @@ def test_omConstraints(self, db, utils, images):
         dataset = PyTorchDataset.ApertureDBDatasetConstraints(
             db, constraints=const)
 
-        assert len(dataset) == utils.count_images()
-        self.validate_dataset(dataset)
+        self.validate_dataset(dataset, utils.count_images())
 
     def test_nativeContraints(self, db, utils, images):
         assert len(images) > 0
@@ -57,10 +62,10 @@ def test_nativeContraints(self, db, utils, images):
         dataset = PyTorchDataset.ApertureDBDataset(
             db, query, label_prop="license")
 
-        assert len(dataset) == utils.count_images()
-        self.validate_dataset(dataset)
+        self.validate_dataset(dataset, utils.count_images())
 
     def test_datasetWithMultiprocessing(self, db, utils):
+        len_limit = utils.count_images()
         query = [{
             "FindImage": {
                 "constraints": {
@@ -74,16 +79,16 @@ def test_datasetWithMultiprocessing(self, db, utils):
                     }
                 ],
                 "results": {
-                    "list": ["license"]
+                    "list": ["license"],
+                    "limit": len_limit
                 }
             }
         }]
 
         dataset = PyTorchDataset.ApertureDBDataset(
             db, query, label_prop="license")
 
-        assert len(dataset) == utils.count_images()
-        self.validate_dataset(dataset)
+        self.validate_dataset(dataset, len_limit)
         # Distributed Data Loader Setup
 
         # Needed for init_process_group
@@ -93,30 +98,30 @@ def test_datasetWithMultiprocessing(self, db, utils):
         dist.init_process_group("gloo", rank=0, world_size=1)
 
         # === Distributed Data Loader Sequential
-
-        data_loader = torch.utils.data.DataLoader(
+        batch_size = 10
+        data_loader = DataLoader(
             dataset,
-            batch_size=10,          # pick random values here to test
+            batch_size=batch_size,          # pick random values here to test
             num_workers=4,          # num_workers > 1 to test multiprocessing works
             pin_memory=True,
             drop_last=True,
         )
 
-        self.validate_dataset(data_loader)
+        self.validate_dataset(data_loader, len_limit)
         # === Distributed Data Loader Shuffler
 
         # This will generate a random sampler, which will make the use
         # of batching wasteful
         sampler     = torch.utils.data.DistributedSampler(
             dataset, shuffle=True)
 
-        data_loader = torch.utils.data.DataLoader(
+        data_loader = DataLoader(
             dataset,
             sampler=sampler,
-            batch_size=10,          # pick random values here to test
+            batch_size=batch_size,          # pick random values here to test
             num_workers=4,          # num_workers > 1 to test multiprocessing works
             pin_memory=True,
             drop_last=True,
         )
 
-        self.validate_dataset(data_loader)
+        self.validate_dataset(data_loader, len_limit)