Skip to content

Commit 052f333

Browse files
committed
Ingestion with an imbalanced producer / consumer.
1 parent 4867fc1 commit 052f333

File tree

5 files changed

+255
-8
lines changed

5 files changed

+255
-8
lines changed

apps/dataset-ingestion/app/app.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ build_coco() {
4141
date
4242
adb utils log --level INFO "${APP}: Loading begins"
4343
echo "loading data..."
44+
python3 ingest_streaming.py /app/input/val/val_images.adb.csv $BATCH_SIZE $NUM_WORKERS
45+
python3 ingest_streaming.py /app/input/val/val_pixelmaps.adb.csv $BATCH_SIZE $NUM_WORKERS
4446
python3 ingestion_demo_trial.py -R /app/input -C $CLEAN -B $BATCH_SIZE -W $NUM_WORKERS -S $SAMPLE_COUNT -T $INCLUDE_TRAIN
4547

4648
# Validation
@@ -64,6 +66,9 @@ build_faces() {
6466

6567
# Ingest the CSV files
6668
adb utils log --level INFO "${APP}: Loading faces dataset"
69+
python3 /app/build_faces/create_indexes.py
70+
python3 /app/build_faces/ingest_streaming.py /app/input/faces/pruned_celebA.csv $BATCH_SIZE $NUM_WORKERS
71+
python3 /app/build_faces/ingest_streaming.py /app/input/faces/hqimages.adb.csv $BATCH_SIZE $NUM_WORKERS
6772
bash /app/build_faces/load.sh
6873
adb utils log --level INFO "${APP}: Successful completion"
6974
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import time
2+
from concurrent.futures import ThreadPoolExecutor
3+
from enum import Enum
4+
from queue import Full, Queue
5+
6+
import pandas as pd
7+
import requests
8+
from aperturedb.CommonLibrary import create_connector
9+
from aperturedb.QueryGenerator import QueryGenerator
10+
from typer import Typer
11+
12+
13+
class HTTPStorageURLS():
    """Producer that streams blobs from plain HTTP(S) URLs into a bounded queue.

    Each row of *df* must carry a ``url`` column.  Downloads are fanned out on
    the shared *executor*; every finished download is pushed onto *q* as a
    ``(row, blob_bytes)`` tuple.  Because *q* is bounded, the blocking
    ``Queue.put`` throttles the producer to the consumer's pace.
    """

    def __init__(self, q: Queue, df: pd.DataFrame, executor: ThreadPoolExecutor):
        self.executor = executor
        self.q = q
        self.df = df
        # Kept for parity with GoogleCloudStorage; not read by this class.
        self.row = self.df.iloc[0]
        # A single Session reuses TCP connections across the many downloads.
        self.session = requests.Session()
        self.sync()

    def sync(self):
        """Submit one download task per row; returns immediately (downloads run async)."""
        def download_blob(i, row):
            url = row["url"]
            # NOTE(review): no status check — a 404/500 response body would be
            # queued as the image blob.  Consider r.raise_for_status() here;
            # exceptions raised in this worker are swallowed by the executor
            # since the returned Future is never inspected.
            r = self.session.get(url)
            # Blocking put provides back-pressure.  (The previous
            # ``try/except Full`` was dead code: a blocking put never raises
            # queue.Full — Full is only raised with block=False or a timeout.)
            self.q.put((row, r.content))

        for i, row in enumerate(self.df.to_dict("records")):
            self.executor.submit(download_blob, i, row)
        print(f"Synced to {self.q}")
35+
36+
class GoogleCloudStorage():
    """Producer that streams blobs from a public GCS bucket into a bounded queue.

    Each row of *df* must carry a ``gs_url`` column of the form
    ``gs://<bucket>/<object>``.  All rows are assumed to live in the same
    bucket (taken from the first row).  Downloads are fanned out on the shared
    *executor*; each blob is pushed onto *q* as a ``(row, blob_bytes)`` tuple,
    with the bounded queue providing back-pressure.
    """

    def __init__(self, q: Queue, df: pd.DataFrame, executor: ThreadPoolExecutor):
        self.executor = executor
        self.q = q
        self.df = df
        self.row = self.df.iloc[0]
        gs_url = self.row["gs_url"]
        # Imported lazily so this module loads without google-cloud-storage
        # when only the HTTP path is used.
        from google.cloud import storage
        # Anonymous client: bucket must be publicly readable.
        self.client = storage.Client.create_anonymous_client()
        # "gs://bucket/object" → split("/") = ["gs:", "", "bucket", ...]
        self.source_bucket_name = gs_url.split("/")[2]
        self.source_bucket = self.client.bucket(self.source_bucket_name)
        self.sync()

    def sync(self):
        """Submit one download task per row; returns immediately (downloads run async)."""
        def download_blob(i, row):
            # Strip the "gs://<bucket>/" prefix to get the object name.
            object_name = row["gs_url"].split("gs://" + self.source_bucket_name + "/")[-1]
            blob = self.source_bucket.blob(object_name).download_as_bytes()
            # Blocking put provides back-pressure.  (The previous
            # ``try/except Full`` was dead code: a blocking put never raises
            # queue.Full — Full is only raised with block=False or a timeout.)
            self.q.put((row, blob))

        for i, row in enumerate(self.df.to_dict("records")):
            self.executor.submit(download_blob, i, row)
        print(f"Synced to {self.q}")
62+
63+
class ObjectStorage(Enum):
    """Kind of object store the CSV's URLs point at.

    Chosen from the CSV's URL column name: ``gs_url`` → GCS, ``url`` → HTTP.
    """

    # Values are arbitrary identifiers; only identity/equality is used.
    GCS = 1
    HTTP = 2
66+
67+
class Sequence(QueryGenerator):
    """QueryGenerator that feeds AddImage queries from a producer queue.

    A pool of downloader threads (the producer) fills ``self.q`` with
    ``(row, blob)`` tuples while the parallel loader (the consumer) drains
    it; the bounded queue absorbs any imbalance between the two rates.
    """

    def __init__(self, input_csv: str):
        """Read *input_csv*, pick the storage backend, and start downloads.

        Raises ValueError if the CSV has neither a ``gs_url`` nor a ``url``
        column.
        """
        super().__init__()
        # Bounded so a fast producer cannot exhaust memory.
        self.q = Queue(maxsize=1000)

        self.df = pd.read_csv(input_csv)
        # Detect the backend from the URL column present in the CSV.
        # (Membership test instead of the old ``columns[0]`` check, so the
        # URL column no longer has to be the first column.)
        if "gs_url" in self.df.columns:
            self.storage = ObjectStorage.GCS
        elif "url" in self.df.columns:
            self.storage = ObjectStorage.HTTP
        else:
            raise ValueError("Invalid URL type")
        self.executor = ThreadPoolExecutor(max_workers=64)
        # NOTE(review): attribute is named ``gcs`` even for the HTTP backend;
        # kept for backward compatibility.
        if self.storage == ObjectStorage.GCS:
            self.gcs = GoogleCloudStorage(self.q, self.df, self.executor)
        elif self.storage == ObjectStorage.HTTP:
            self.gcs = HTTPStorageURLS(self.q, self.df, self.executor)
        # Hack to reuse extra 5 items on top of the queue
        # which are used to check if generator has implemented getitem
        # And what is commands per query, and blobs per query.
        self.inspect = 0

    def __del__(self):
        # Best-effort cleanup of the downloader pool; runs at GC time.
        self.executor.shutdown()

    def getitem(self, subscript):
        """Return ``(query, blobs)`` for the next downloaded item.

        *subscript* is ignored: items are served in download-completion
        order from the queue, not by index.
        """
        data = self.q.get()
        # The framework probes getitem a few times before the real run;
        # push the item back so those probe reads do not drop data.
        if self.inspect < 5:
            self.q.put(data)
            self.inspect += 1
        q = [
            {
                "AddImage": {
                    "properties": data[0]
                }
            }
        ]
        return q, [data[1]]

    def __len__(self):
        # One query per CSV row.
        return len(self.df)
110+
111+
112+
app = Typer()


@app.command()
def ingest(input_csv: str, batch_size: int, num_workers: int):
    """Stream-ingest the images listed in *input_csv* into ApertureDB.

    Builds a Sequence generator (which starts the background downloads)
    and drives it with a ParallelLoader.
    """
    from aperturedb.ParallelLoader import ParallelLoader

    generator = Sequence(input_csv)
    client = create_connector()
    loader = ParallelLoader(client=client)
    loader.ingest(generator, batch_size, num_workers, True)
    print("Done")


if __name__ == "__main__":
    app()

apps/dataset-ingestion/app/build_coco/ingestion_demo_trial.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import sys
32
from aperturedb.Utils import Utils
43
from aperturedb.CommonLibrary import create_connector
54

@@ -17,9 +16,7 @@ def ingest_coco(cli_args):
1716
dbutils.create_entity_index("_Descriptor", "yfcc_id")
1817

1918
args = {
20-
"images": "IMAGE",
2119
"bboxes": "BOUNDING_BOX",
22-
"pixelmaps": "IMAGE",
2320
"img_pixelmap_connections": "CONNECTION",
2421
"polygons": "POLYGON",
2522
"images.adb.csv_clip_pytorch_embeddings_metadata": "DESCRIPTOR",
@@ -29,10 +26,9 @@ def ingest_coco(cli_args):
2926
if cli_args.train == "true":
3027
stages.append("train")
3128

32-
objs = ["images",
29+
objs = [
3330
"bboxes",
3431
"polygons",
35-
"pixelmaps",
3632
"img_pixelmap_connections",
3733
"images.adb.csv_clip_pytorch_embeddings_metadata",
3834
"images.adb.csv_clip_pytorch_embeddings_connection"]
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import time
2+
from concurrent.futures import ThreadPoolExecutor
3+
from enum import Enum
4+
from queue import Full, Queue
5+
6+
import pandas as pd
7+
import requests
8+
from aperturedb.CommonLibrary import create_connector
9+
from aperturedb.QueryGenerator import QueryGenerator
10+
from typer import Typer
11+
12+
13+
class HTTPStorageURLS():
    """Producer that streams blobs from plain HTTP(S) URLs into a bounded queue.

    Each row of *df* must carry a ``url`` column.  Downloads are fanned out on
    the shared *executor*; every finished download is pushed onto *q* as a
    ``(row, blob_bytes)`` tuple.  Because *q* is bounded, the blocking
    ``Queue.put`` throttles the producer to the consumer's pace.
    """

    def __init__(self, q: Queue, df: pd.DataFrame, executor: ThreadPoolExecutor):
        self.executor = executor
        self.q = q
        self.df = df
        # Kept for parity with GoogleCloudStorage; not read by this class.
        self.row = self.df.iloc[0]
        # A single Session reuses TCP connections across the many downloads.
        self.session = requests.Session()
        self.sync()

    def sync(self):
        """Submit one download task per row; returns immediately (downloads run async)."""
        def download_blob(i, row):
            url = row["url"]
            # NOTE(review): no status check — a 404/500 response body would be
            # queued as the image blob.  Consider r.raise_for_status() here;
            # exceptions raised in this worker are swallowed by the executor
            # since the returned Future is never inspected.
            r = self.session.get(url)
            # Blocking put provides back-pressure.  (The previous
            # ``try/except Full`` was dead code: a blocking put never raises
            # queue.Full — Full is only raised with block=False or a timeout.)
            self.q.put((row, r.content))

        for i, row in enumerate(self.df.to_dict("records")):
            self.executor.submit(download_blob, i, row)
        print(f"Synced to {self.q}")
35+
36+
class GoogleCloudStorage():
    """Producer that streams blobs from a public GCS bucket into a bounded queue.

    Each row of *df* must carry a ``gs_url`` column of the form
    ``gs://<bucket>/<object>``.  All rows are assumed to live in the same
    bucket (taken from the first row).  Downloads are fanned out on the shared
    *executor*; each blob is pushed onto *q* as a ``(row, blob_bytes)`` tuple,
    with the bounded queue providing back-pressure.
    """

    def __init__(self, q: Queue, df: pd.DataFrame, executor: ThreadPoolExecutor):
        self.executor = executor
        self.q = q
        self.df = df
        self.row = self.df.iloc[0]
        gs_url = self.row["gs_url"]
        # Imported lazily so this module loads without google-cloud-storage
        # when only the HTTP path is used.
        from google.cloud import storage
        # Anonymous client: bucket must be publicly readable.
        self.client = storage.Client.create_anonymous_client()
        # "gs://bucket/object" → split("/") = ["gs:", "", "bucket", ...]
        self.source_bucket_name = gs_url.split("/")[2]
        self.source_bucket = self.client.bucket(self.source_bucket_name)
        self.sync()

    def sync(self):
        """Submit one download task per row; returns immediately (downloads run async)."""
        def download_blob(i, row):
            # Strip the "gs://<bucket>/" prefix to get the object name.
            object_name = row["gs_url"].split("gs://" + self.source_bucket_name + "/")[-1]
            blob = self.source_bucket.blob(object_name).download_as_bytes()
            # Blocking put provides back-pressure.  (The previous
            # ``try/except Full`` was dead code: a blocking put never raises
            # queue.Full — Full is only raised with block=False or a timeout.)
            self.q.put((row, blob))

        for i, row in enumerate(self.df.to_dict("records")):
            self.executor.submit(download_blob, i, row)
        print(f"Synced to {self.q}")
62+
63+
class ObjectStorage(Enum):
    """Kind of object store the CSV's URLs point at.

    Chosen from the CSV's URL column name: ``gs_url`` → GCS, ``url`` → HTTP.
    """

    # Values are arbitrary identifiers; only identity/equality is used.
    GCS = 1
    HTTP = 2
66+
67+
class Sequence(QueryGenerator):
    """QueryGenerator that feeds AddImage queries from a producer queue.

    A pool of downloader threads (the producer) fills ``self.q`` with
    ``(row, blob)`` tuples while the parallel loader (the consumer) drains
    it; the bounded queue absorbs any imbalance between the two rates.
    """

    def __init__(self, input_csv: str):
        """Read *input_csv*, pick the storage backend, and start downloads.

        Raises ValueError if the CSV has neither a ``gs_url`` nor a ``url``
        column.
        """
        super().__init__()
        # Bounded so a fast producer cannot exhaust memory.
        self.q = Queue(maxsize=1000)

        self.df = pd.read_csv(input_csv)
        # Detect the backend from the URL column present in the CSV.
        # (Membership test instead of the old ``columns[0]`` check, so the
        # URL column no longer has to be the first column.)
        if "gs_url" in self.df.columns:
            self.storage = ObjectStorage.GCS
        elif "url" in self.df.columns:
            self.storage = ObjectStorage.HTTP
        else:
            raise ValueError("Invalid URL type")
        self.executor = ThreadPoolExecutor(max_workers=64)
        # NOTE(review): attribute is named ``gcs`` even for the HTTP backend;
        # kept for backward compatibility.
        if self.storage == ObjectStorage.GCS:
            self.gcs = GoogleCloudStorage(self.q, self.df, self.executor)
        elif self.storage == ObjectStorage.HTTP:
            self.gcs = HTTPStorageURLS(self.q, self.df, self.executor)
        # Hack to reuse extra 5 items on top of the queue
        # which are used to check if generator has implemented getitem
        # And what is commands per query, and blobs per query.
        self.inspect = 0

    def __del__(self):
        # Best-effort cleanup of the downloader pool; runs at GC time.
        self.executor.shutdown()

    def getitem(self, subscript):
        """Return ``(query, blobs)`` for the next downloaded item.

        *subscript* is ignored: items are served in download-completion
        order from the queue, not by index.
        """
        data = self.q.get()
        # The framework probes getitem a few times before the real run;
        # push the item back so those probe reads do not drop data.
        if self.inspect < 5:
            self.q.put(data)
            self.inspect += 1
        q = [
            {
                "AddImage": {
                    "properties": data[0]
                }
            }
        ]
        return q, [data[1]]

    def __len__(self):
        # One query per CSV row.
        return len(self.df)
110+
111+
112+
app = Typer()


@app.command()
def ingest(input_csv: str, batch_size: int, num_workers: int):
    """Stream-ingest the images listed in *input_csv* into ApertureDB.

    Builds a Sequence generator (which starts the background downloads)
    and drives it with a ParallelLoader.
    """
    from aperturedb.ParallelLoader import ParallelLoader

    generator = Sequence(input_csv)
    client = create_connector()
    loader = ParallelLoader(client=client)
    loader.ingest(generator, batch_size, num_workers, True)
    print("Done")


if __name__ == "__main__":
    app()

apps/dataset-ingestion/app/build_faces/load.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,17 @@ if [[ ${CLEAN} == "true" ]]; then
1111
adb utils execute remove_all --force
1212
fi
1313
cd /app/build_faces
14-
python3 create_indexes.py
14+
1515
python3 create_descriptorsets.py
1616

1717
echo "Ingesting"
1818
cd /app/input/faces
19-
adb ingest from-csv pruned_celebA.csv --transformer image_properties --transformer common_properties --ingest-type IMAGE --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2019
adb ingest from-csv celebA.csv_clip_pytorch_embeddings_metadata.adb.csv --ingest-type DESCRIPTOR --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2120
adb ingest from-csv celebA.csv_clip_pytorch_embeddings_connection.adb.csv --ingest-type CONNECTION --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2221

2322
adb ingest from-csv celebA.csv_facenet_pytorch_embeddings_metadata.adb.csv --ingest-type DESCRIPTOR --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2423
adb ingest from-csv celebA.csv_facenet_pytorch_embeddings_connection.adb.csv --ingest-type CONNECTION --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2524

26-
adb ingest from-csv hqimages.adb.csv --ingest-type IMAGE --transformer common_properties --transformer image_properties --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2725
adb ingest from-csv hqpolygons.adb.csv --ingest-type POLYGON --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2826
adb ingest from-csv hqbboxes.adb.csv --ingest-type BOUNDING_BOX --batchsize ${BATCH_SIZE} --num-workers ${NUM_WORKERS} --sample-count ${SAMPLE_COUNT}
2927

0 commit comments

Comments
 (0)