diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
new file mode 100644
index 00000000..35c453cc
--- /dev/null
+++ b/.github/workflows/benchmarking.yml
@@ -0,0 +1,134 @@
+name: Build/Push Benchmark Images
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'v*'
+    paths:
+      - 'benchmarking/benchmarks/**'
+      - '.github/workflows/benchmarking.yml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'benchmarking/benchmarks/**'
+      - '.github/workflows/benchmarking.yml'
+
+# Least privilege: the jobs only read the repository; registry access uses a deploy token.
+permissions:
+  contents: read
+
+jobs:
+  discover-benchmarks:
+    runs-on: ubuntu-latest
+    outputs:
+      benchmarks: ${{ steps.set-matrix.outputs.benchmarks }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Discover Benchmarks
+        id: set-matrix
+        run: |
+          # Find all benchmark directories, excluding template
+          BENCHMARKS=$(find benchmarking/benchmarks -mindepth 1 -maxdepth 1 -type d -not -name 'template' -not -name '.git' -exec basename {} \; | jq -R -s -c 'split("\n")[:-1]')
+          echo "benchmarks=$BENCHMARKS" >> "$GITHUB_OUTPUT"
+          echo "Found benchmarks: $BENCHMARKS"
+
+  build-push-benchmarks:
+    needs: discover-benchmarks
+    if: needs.discover-benchmarks.outputs.benchmarks != '[]'
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        benchmark: ${{ fromJson(needs.discover-benchmarks.outputs.benchmarks) }}
+    env:
+      REGISTRY_URL: ${{ secrets.NRP_GITLAB_REGISTRY_URL }}
+      IMAGE_PATH: /ndp/sage/nrp-image-search
+      BENCHMARK_NAME: ${{ matrix.benchmark }}
+      # Secrets are not provided to pull_request runs from forks, so registry
+      # login/push is limited to pushes and PRs whose head is in this repository.
+      CAN_PUSH: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Image Tag
+        run: |
+          if [[ "$GITHUB_REF" == refs/heads/* ]]; then
+            IMAGE_TAG="${GITHUB_REF#refs/heads/}"
+          elif [[ "$GITHUB_REF" == refs/tags/* ]]; then
+            IMAGE_TAG="${GITHUB_REF#refs/tags/}"
+          elif [[ "$GITHUB_REF" == refs/pull/* ]]; then
+            PR_NUMBER=$(echo "$GITHUB_REF" | cut -d'/' -f3)
+            IMAGE_TAG="pr-${PR_NUMBER}"
+          else
+            # Fail fast rather than building an image with an empty tag.
+            echo "Unsupported ref: $GITHUB_REF" >&2
+            exit 1
+          fi
+          # '/' is not valid in a Docker tag, so flatten nested ref names.
+          IMAGE_TAG=$(echo "$IMAGE_TAG" | tr '/' '-')
+          echo "IMAGE_TAG=$IMAGE_TAG" >> "$GITHUB_ENV"
+
+      - name: Determine Dockerfile and Image Name
+        id: docker-config
+        run: |
+          # BENCHMARK_NAME comes from the job-level env; reading it as a shell variable
+          # keeps the matrix value (a directory name) out of the script text itself.
+          BENCHMARK_DIR="benchmarking/benchmarks/$BENCHMARK_NAME"
+
+          # Convert benchmark name to lowercase for Docker image name (Docker requires lowercase)
+          BENCHMARK_NAME_LOWER=$(printf '%s' "$BENCHMARK_NAME" | tr '[:upper:]' '[:lower:]')
+
+          # Use Dockerfile.job for the combined job image
+          DOCKERFILE="$BENCHMARK_DIR/Dockerfile.job"
+          IMAGE_NAME="benchmark-${BENCHMARK_NAME_LOWER}-job"
+
+          # Check if Dockerfile exists
+          if [ ! -f "$DOCKERFILE" ]; then
+            echo "Dockerfile not found: $DOCKERFILE"
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          echo "dockerfile=$DOCKERFILE" >> "$GITHUB_OUTPUT"
+          echo "image_name=$IMAGE_NAME" >> "$GITHUB_OUTPUT"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+          echo "Building $IMAGE_NAME from $DOCKERFILE"
+
+      - name: Log in to NRP GitLab Container Registry
+        if: steps.docker-config.outputs.skip != 'true' && env.CAN_PUSH == 'true'
+        env:
+          REGISTRY_USERNAME: ${{ secrets.NRP_GITLAB_DEPLOY_TOKEN_USERNAME }}
+          REGISTRY_TOKEN: ${{ secrets.NRP_GITLAB_DEPLOY_TOKEN }}
+        run: echo "$REGISTRY_TOKEN" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
+
+      - name: Build & Tag Benchmark Image
+        if: steps.docker-config.outputs.skip != 'true'
+        env:
+          IMAGE_NAME: ${{ steps.docker-config.outputs.image_name }}
+          DOCKERFILE: ${{ steps.docker-config.outputs.dockerfile }}
+        run: |
+          BENCHMARK_DIR="benchmarking/benchmarks/$BENCHMARK_NAME"
+          # REGISTRY_URL is empty when the secret is unavailable (e.g. fork PRs);
+          # fall back to a local-only name so the build itself is still exercised.
+          IMAGE_REF="${REGISTRY_URL:-localhost}$IMAGE_PATH/$IMAGE_NAME"
+
+          # Build from benchmark directory with Dockerfile path relative to repo root
+          docker build -f "$DOCKERFILE" -t "$IMAGE_REF:$IMAGE_TAG" "$BENCHMARK_DIR"
+
+          if [ "$GITHUB_REF" = "refs/heads/main" ]; then
+            docker tag "$IMAGE_REF:$IMAGE_TAG" "$IMAGE_REF:latest"
+          fi
+
+      - name: Push Benchmark Image to NRP GitLab Image Registry
+        if: steps.docker-config.outputs.skip != 'true' && env.CAN_PUSH == 'true'
+        env:
+          IMAGE_NAME: ${{ steps.docker-config.outputs.image_name }}
+        run: |
+          IMAGE_REF="$REGISTRY_URL$IMAGE_PATH/$IMAGE_NAME"
+          docker push "$IMAGE_REF:$IMAGE_TAG"
+          if [ "$GITHUB_REF" = "refs/heads/main" ]; then
+            docker push "$IMAGE_REF:latest"
+          fi
diff --git a/.gitignore b/.gitignore index 8c8558ec..f7650da9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__/ **.DS_Store **/flagged/ .env -settings.json \ No newline at end of file +settings.json +._* \ No newline at end of file diff --git a/Readme.md b/Readme.md index ff7648f8..1f988b89 100644 --- a/Readme.md +++ b/Readme.md @@ -107,20 +107,20 @@ kubectl kustomize nrp-dev -o sage-image-search-dev.yaml or kubectl kustomize nrp --- ## TODOs - +- [ ] Benchmark existing deployment using new framework + - using... + - https://huggingface.co/datasets/sagecontinuum/INQUIRE-Benchmark-small + - https://huggingface.co/datasets/sagecontinuum/FireBench + - ... - [ ] Bechmark Milvus@NRP + - using... + - https://huggingface.co/datasets/sagecontinuum/INQUIRE-Benchmark-small + - https://huggingface.co/datasets/sagecontinuum/FireBench + - ... +- [ ] add a heartbeat metric for Sage Object Storage (nrdstor) + - specifically here in the code: https://github.com/waggle-sensor/sage-nrp-image-search/blob/main/weavloader/processing.py#L159 - [ ] Use other benchmarks to test image retrieval in other domains (ex; Urban) & System-Level Performance - - General Image-Caption Retrieval Benchmarks - - **MS COCO Captions:** A widely used benchmark drawn from the MS-COCO dataset (Common Objects in Context). It contains **123,287 images** covering everyday scenes (including many urban street scenes with people, vehicles, buildings, etc.), each paired with 5 human-written captions. The standard split is \~82k images for training, 5k for validation, 5k for testing. *Relevance:* Although not exclusively urban, COCO features many city context images (e.g. street traffic, city parks, indoor scenes). 
*Evaluation:* Typically uses **Recall\@K** (K=1,5,10) as the primary metric – e.g. the percentage of queries for which the correct image is in the top K results. Some works also report mean average precision (mAP) on the 5K test set. **Access:** [COCO Dataset Page](https://cocodataset.org/#download) (captions and images are publicly downloadable). - - **Flickr30K:** Another popular benchmark with **31,000 photographs** from Flickr, each image paired with 5 crowd-sourced textual descriptions. It is split into 29k images for train, 1k for validation, 1k for test. *Relevance:* Images cover a broad range of everyday situations (some urban, some rural, people and objects in various settings). *Evaluation:* Uses the same **Recall\@K** metrics as COCO (often evaluating Recall\@1, 5, 10 for text→image retrieval). Models today achieve high performance (e.g. near 99% recall\@10 for top methods). **Access:** Available via [Kaggle dataset](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset) or the original authors’ webpage (University of Illinois). - *(**Note:** Flickr8K is an older, smaller dataset with 8,000 images and captions, now less commonly used in benchmarks.)* - - **NUS-WIDE:** A large-scale **web image dataset** (269,648 Flickr images) with associated **user tags and 81 high-level concepts** annotated. While not caption-based, it is a standard benchmark for text-to-image retrieval using tags or keywords. Many concepts are object or scene categories (e.g. *building, car, street, person*, etc.), making it relevant for urban imagery retrieval. *Evaluation:* Typically uses **mean Average Precision (mAP)** over all queries, since multiple images can be relevant for a given tag query. NUS-WIDE is often used for evaluating cross-modal retrieval and hashing methods. **Access:** [NUS-WIDE on Kaggle](https://www.kaggle.com/datasets/xinleili/nuswide) (contains the images and annotations). 
- - Urban-Focused - - **CityFlow-NL (Natural Language Vehicle Retrieval):** A benchmark introduced via the AI City Challenge for retrieving traffic camera images of vehicles based on descriptions. Built on the CityFlow surveillance dataset, it provides **5,000+ unique natural language descriptions** for **666 target vehicles** captured across **3,028 multi-camera tracks** in a city. Descriptions include vehicle attributes (color, type), motion (e.g. “turning right”), and surrounding context (other vehicles, road type). *Relevance:* Focused on **urban street scenes** – traffic surveillance footage from a city, featuring cars, trucks, intersections, etc. *Evaluation:* Uses ranking metrics similar to person search – the challenge reports **mAP** (mean average precision) over the top 100 retrieved results, as well as **Recall\@1,5,10** hit rates for each query. For instance, the baseline in one study achieved \~29.6% Recall\@1 and \~64.7% Recall\@10, illustrating the task difficulty. **Access:** Dataset introduced in the *AI City Challenge 2021 (Track 5)*. Available through the challenge organizers (download via the [AI City Challenge website](https://www.aicitychallenge.org/) – data request required) or the authors’ GitHub repository which provides code and data links for CityFlow-NL. - - Paper: https://arxiv.org/abs/2101.04741 - - code: https://github.com/fredfung007/cityflow-nl - - Compositional & Expert-Level Retrieval Benchmarks - - **Cola (Compositional Localized Attributes):** A **compositional text-to-image retrieval** benchmark (NeurIPS 2023) designed to test fine-grained understanding of object-attribute combinations. **Cola contains \~1,236 queries** composed of **168 objects and 197 attributes** (e.g. “red car next to blue car”, “person in yellow shirt riding a bike”) with target images drawn from about **30K images**. Each query has challenging confounders (distractor images that have the right objects but wrong attribute pairing). 
*Relevance:* Not specific to urban scenes, but many queries could involve everyday objects (cars, people, etc. in various configurations) – useful for evaluating **relational understanding in images**. *Evaluation:* Measures whether the system retrieves the correct image that satisfies the composed query. Metrics include **Recall\@1 (accuracy)** – human performance is \~83% on this benchmark. The goal is to push models to avoid retrieving images that have partial matches (only one attribute-object correct). **Access:** The authors provide a project page and data download (Boston University) – see the [Cola project page](https://cs-people.bu.edu/array/research/cola/) for dataset and instructions. + - see [imsearch_benchmarks](https://github.com/waggle-sensor/imsearch_benchmarks) for the existing benchmarks - Atmospheric Science Focused - Multimodal Ground‐based Cloud Dataset (MGCD) * **Description and purpose:** A dataset of 8,000 ground-based hemispheric sky images collected in Tianjin, China (2017–2018) for cloud classification research. It includes seven cloud categories (grouped per WMO classification) such as cumulus, altocumulus/cirrocumulus, cirrus/cirrostratus, clear sky, stratocumulus/stratus/altostratus, cumulonimbus/nimbostratus, and mixed cloud. The dataset was created to improve automated cloud-type recognition and is labeled by meteorologists, ensuring high-quality ground truth. @@ -157,36 +157,18 @@ kubectl kustomize nrp-dev -o sage-image-search-dev.yaml or kubectl kustomize nrp * **Type of annotations:** Each image is labeled with a **weather condition tag** corresponding to one of the 11 classes. These tags are human-readable descriptors (e.g., “rain” or “sandstorm”). In some cases, multiple phenomena might co-occur (like rain with lightning), but in this dataset each image is categorized by its primary phenomenon. 
The annotations are structured (one label per image) but effectively serve as short text descriptions of the image’s content (the weather event present). * **Relevance to retrieval:** This dataset directly supports text-to-image retrieval scenarios for weather events. For example, a query “lightning storm” or “dense fog” would correspond to the *lightning* or *fog/smog* categories, and relevant images can be retrieved and evaluated. Because it covers a wide array of weather phenomena (including hazardous events like hail and sandstorms), it’s valuable for testing retrieval across both common and relatively rare atmospheric conditions. The human-chosen labels act as ground truth keywords for evaluating retrieval accuracy. * **Download/access link:** Available on **Kaggle** (dataset titled “Weather Image Recognition”). Users can download it directly from the Kaggle page. Additionally, a GitHub repository by an author of the project provides the class breakdown and can be used as a reference for accessing the data. (Kaggle login may be required to access the files.) - - Fire Science/Ecologist Focused - - FLAME 2/3 (Fire Detection *Aerial Multi-spectral* Dataset) - * **Description & Context:** FLAME 2 is a UAV-captured dataset from a **prescribed burn experiment** in an open-canopy pine forest (Northern Arizona, 2021). It provides **synchronized aerial video frames in both infrared (IR) and visible light**. The data consist of side-by-side IR/RGB frame pairs recorded by drones flying over an active controlled fire. This unique multi-spectral imagery helps researchers analyze fire behavior that is visible in IR but obscured in RGB (e.g. through smoke). The dataset was created to advance high-precision, real-time wildfire monitoring using UAVs. - * **Camera Platform:** **Drone-based dual cameras** – one RGB camera and one thermal infrared camera rigidly mounted on a UAV, capturing the same scene simultaneously. 
The drone’s mobility allowed capturing different angles of the burn and up-close fire behavior not observable from satellites or fixed towers. - * **Size & Format:** Comprises **video frame pairs** (RGB + IR) extracted from the drone footage. Thousands of paired frames are included (over 8 GB of data) in image format. The frames in the public release are downsampled to 254×254 pixels for manageable size, but retain the alignment between color and thermal channels. Additionally, a **supplementary set** provides context data: a georeferenced pre-burn **3D point cloud** of the area, an **RGB orthomosaic** map, weather logs, and the burn plan. This extra data situates the images in a real-world scientific context (fuel conditions, topography, etc.). - * **Annotations:** Each RGB-IR frame pair has **two binary labels** indicating (a) whether active **fire/flame is present** in the frame, and (b) whether **smoke covers at least 50%** of the frame. These labels were assigned by human experts reviewing the imagery. In other words, every image pair is tagged with “Fire” vs “No Fire”, and “Heavy Smoke” vs “No Heavy Smoke” as textual metadata. This allows querying images by fire presence or smoke density. (No bounding boxes are provided – the labels apply to the whole frame, but fire pixels were segmented in a related study.) - * **Relevance to Fire Ecology:** FLAME 2 is used to develop and evaluate **fire detection algorithms in multi-modal imagery**, which is crucial for **operational wildfire drones**. The IR channel aids in seeing through smoke to detect hot spots, while the RGB channel captures smoke plumes – together they support research on early fire growth, smoke dynamics, and fire spread modeling. The included 3D pre-burn data can also support **post-burn ecological assessments** (e.g. mapping char and scorch in the canopy) by comparing conditions before and after the fire. - * **Access:** *FLAME 2/3 is publicly available* via IEEE DataPort (CC BY 4.0). 
DOI: 10.21227/krde-h753. Download page: **[IEEE DataPort – FLAME 2 Dataset](https://ieee-dataport.org/open-access/flame-2-fire-detection-and-modeling-aerial-multi-spectral-image-dataset)** or **[IEEE DataPort – FLAME 3 Dataset](https://ieee-dataport.org/open-access/flame-3-radiometric-thermal-uav-imagery-wildfire-management)**. - - The Wildfire Dataset (El-Madafri et al. 2023) - * **Description & Context:** *“The Wildfire Dataset”* is an **open-source image dataset for forest fire detection**, designed to be **diverse and evolving**. The authors curated a broad collection of wildfire-related images to capture varied **forest types, geographic regions, lighting and weather conditions**, and common false-alarm elements. Unlike many prior datasets, it focuses on representativeness: only public-domain images were included (e.g. from government archives, Flickr, Unsplash) to ensure legality and diversity. The dataset’s goal is to improve deep learning models by reducing false positives – it introduces challenging “confounding” scenes that often fool fire detectors (such as sun glare or fog that looks like smoke). - * **Camera Platform:** **Heterogeneous sources** – a mix of **ground-based photos and aerial images** taken from drones, planes, and helicopters. This means some images are on-the-ground wildfire photographs, while others are oblique aerial shots of smoke plumes or burning forests. Such variety exposes models to different scales and viewpoints of fires. - * **Size & Format:** Currently contains **approximately 2,700 color images**. The images are high-resolution on average (mean \~4057×3155 px) but with a wide size range (some thumbnails as small as 153×206, up to large photos \~19,699×8974). This reflects the mix of sources. The dataset is continuously expanding with new images and even video clips in updates. Data is provided in standard image files (JPEG/PNG) along with a CSV or folder structure for labels. 
- * **Annotations:** Each image is **labeled with a human-readable category describing its fire content**, following a multi-class scheme to differentiate real fires from look-alikes. In particular, images are grouped into classes such as: **“Fire – Smoke from fires”** (actual wildfire images), **“NoFire – with fire-like elements”** (e.g. bright sunset or flames from non-wildfire sources), **“NoFire – with smoke-like elements”** (e.g. fog, dust, or cloud that resembles smoke), and **“NoFire – no confounding elements”** (normal forest scene with no fire or smoke). These tags serve as descriptive metadata; for example, a query for “smoke plume” could retrieve images labeled *Fire – Smoke from fires*, while “cloudy forest with no fire” maps to *NoFire – smoke confounder*. The labels enable multi-task training (fire vs no-fire, and identifying the confounding factors). *(No pixel-level annotations are given, as the focus is on image-level classification.)* - * **Relevance to Fire Ecology:** This dataset is a \*\*benchmark for wildfire \*\*early detection algorithms, especially in distinguishing true fires from false alarms. By including confounding scenarios (hazy weather, sun rays, etc.), it directly tackles a key challenge in operational fire monitoring – high false positive rates. In a broader sense, it aids any **fire science application needing image-based recognition**, from automatic lookout tower systems to climate research (by providing a varied image set of fires around the world). Researchers can also study the visual features of wildfires across different ecosystems since the images span various forest types and regions. - * **Access:** *The Wildfire Dataset is publicly available.* It’s hosted on **Kaggle** as an open dataset (maintained by the authors). Access it here: **[Kaggle – The Wildfire Dataset](https://www.kaggle.com/datasets/elmadafri/the-wildfire-dataset)**. (No login fees; images are in the public domain with appropriate credits.) 
- - NEMO: Nevada Smoke Detection Dataset - * **Description & Context:** NEMO is a dataset devoted to \*\*early wildfire \*\*smoke detection from fixed cameras. It was created by researchers in collaboration with the AlertWildfire camera network to capture the **incipient stage of wildfires** – when only a faint smoke plume is visible. The authors extracted image frames from over 1,000 timelapse wildfire camera videos and hand-annotated smoke plumes in them. NEMO’s focus is on *real-world “in-the-wild” conditions*: small or distant smokes that are hard to distinguish from clouds, fog, or haze. This makes it a valuable dataset for developing robust smoke detection algorithms for wildfire alert systems. - * **Camera Platform:** **Ground-based PTZ wildfire cameras** – specifically the AlertWildfire/HPWREN network of pan-tilt-zoom cameras stationed on mountaintops in Nevada and California. These cameras continuously monitor remote wildland areas for smoke. The dataset frames are essentially **time-stamped photographs from these live cameras**, often showing vast landscapes or horizons where a tiny smoke column might appear. (The PTZ cameras can zoom and pivot, so perspectives vary.) - * **Size & Format:** The dataset contains **2,934 labeled images** (frames), all extracted from video footage. Images are high-definition (most around 1920×1080 pixels, as per the camera streams). **4,522 total smoke instances** are labeled across the images – meaning many images have multiple distinct smoke plumes annotated. Data is provided in **COCO-style format** (images plus a JSON of annotations) and also converted to other formats by contributors. - * **Annotations:** Each image comes with **bounding box annotations** around any visible smoke plumes, along with a **classification of the smoke density**. Specifically, smoke instances are categorized into **three classes: “low smoke”, “mid smoke”, and “high smoke”** depending on the plume’s size/opacity. 
For example, a very faint, small distant wisp might be labeled *low smoke*, whereas a large, billowing column would be *high smoke*. These textual labels allow filtering images by smoke severity. Images with no smoke were also included as negatives in some training configurations (to reduce false alarms). Overall, the annotations enable both object detection (find smoke in image) and image-level retrieval (e.g. find all images with “high smoke” plumes). - * **Relevance to Fire Science:** NEMO is highly relevant for **operational wildfire monitoring**. It mirrors the exact scenario of interest to fire agencies: detecting a **tiny smoke on the horizon** minutes after ignition. By training on NEMO, AI models can be deployed on camera feeds to automatically alert firefighters of new smokes faster than human spotters. For fire ecology research, NEMO’s real-time image data (with time series of smoke growth) can help in understanding **fire spread dynamics** at ignition, and improve early warning systems that mitigate large fires. The dataset also helps quantify false alarm sources (e.g. differentiating smoke vs. dust or cloud) which is crucial for reliable automated detection. - * **Access:** *NEMO is an open dataset.* It is hosted on GitHub by the creators under an Apache 2.0 license. The repository provides data and pretrained models: **[GitHub – SayBender/Nemo (Nevada Smoke Dataset)](https://github.com/SayBender/Nemo)**. (From the GitHub, one can download the image dataset and annotation files. The project is also described in an MDPI paper for further reference.) - - PyroNear 2024 Smoke Plume Dataset - * **Description & Context:** **PyroNear2024** is a large-scale, recently introduced dataset containing both **still images and video sequences of wildfire smoke plumes**. It was compiled by the PyroNear project (an open-source wildfire AI initiative) to enable training of next-generation smoke detection models, including temporal (video) models. 
PyroNear2024 significantly **surpasses prior datasets in size and diversity**: it covers **around 400 wildfire events** in multiple countries (France, Spain, USA), and includes **synthetic data** for rare scenarios. By combining data from different regions and camera networks, it ensures a wide variety of backgrounds, climates, and forest types. The emphasis is on **early detection**, so the dataset focuses on the first moments of ignition where only smoke (no flame) is visible. - * **Camera Platform:** Primarily **ground-based wildfire surveillance cameras** (both public networks like AlertWildfire and a PyroNear in-house camera network) and some **web-scraped videos**. These are augmented with **synthetic smoke images** generated via computer graphics (Blender) to simulate additional scenarios. The result is a mix of real camera footage and realistic simulated data. The video component consists of sequences from those cameras, capturing 15 minutes before/after smoke onset for temporal analysis. - * **Size & Format:** **Very large** – on the order of **50,000 images** (frames) in total, with roughly **150,000 manual annotations** (since many images contain multiple plumes). After quality filtering, about **24,000 high-quality labeled images** were retained. Additionally, **video data** is provided: thousands of short clips or frame sequences around each fire start. Annotations are in COCO format for images and a suitable format for video (with frame-by-frame labels). The images come from various cameras, typically HD resolution. - * **Annotations:** All smoke plumes in the images are **annotated with bounding boxes** and class labels. The primary label is simply “smoke” (since the dataset’s purpose is smoke vs. no-smoke detection). However, because of the dataset’s construction, images are often grouped by wildfire event and time, and there are **temporal annotations** as well – e.g. 
which frames belong to the same smoke plume over time, the timing of detection, etc. The dataset creators report **≈150k smoke region annotations on \~50k images**. In practical terms, a user can query, for example, “early stage smoke” to retrieve images where only a tiny smoke is present (the dataset includes many such examples, which are inherently labeled by virtue of being early in the sequence). The inclusion of **video** allows retrieval tasks like finding a sequence of images corresponding to “smoke developing over 10 minutes.” - * **Relevance to Fire Ecology:** PyroNear2024 is aimed squarely at **operational early fire detection**. It provides a benchmark to evaluate smoke detection algorithms in a realistic, **multi-regional setting**. Its large scale and inclusion of sequential data make it valuable for training more robust models (e.g. reducing false alarms across different environments, and using motion cues in video). For fire science, this dataset can help study the visual signatures of fires in their initial phase across ecosystems (e.g. how a chaparral fire’s smoke looks vs. a conifer forest’s). Ultimately, improvements in early detection directly benefit fire ecology by enabling quicker suppression and thus reducing the ecological impact of wildfires. PyroNear2024’s global scope also facilitates research into **smoke dynamics** under different atmospheric conditions. - * **Access:** *PyroNear2024 is expected to be released openly* by the PyroNear team. As of 2024, an arXiv preprint is available and the **data (images & videos) will be made public** via PyroNear’s platforms. For updates and downloads, refer to the PyroNear project page (pyronear.org) or the arXiv reference: **[PyroNear2024 Dataset – ArXiv Preprint](https://arxiv.org/abs/2402.05349)**. *(This link provides details and will point to the code/data release once live.)* - * seems like PyroNear2024 is not available yet, can't find it. 
But they do have this dataset available [pyronear/pyro-sdis](https://huggingface.co/datasets/pyronear/pyro-sdis) + - Sage focused + - get a sample of images and create queries based on the metadata. For example, "animals in W09E" + - this can also be just images from sage so it can truly test the image retrieval capabilities of the system on real data. + - Urban-Focused + - **CityFlow-NL (Natural Language Vehicle Retrieval):** A benchmark introduced via the AI City Challenge for retrieving traffic camera images of vehicles based on descriptions. Built on the CityFlow surveillance dataset, it provides **5,000+ unique natural language descriptions** for **666 target vehicles** captured across **3,028 multi-camera tracks** in a city. Descriptions include vehicle attributes (color, type), motion (e.g. “turning right”), and surrounding context (other vehicles, road type). *Relevance:* Focused on **urban street scenes** – traffic surveillance footage from a city, featuring cars, trucks, intersections, etc. *Evaluation:* Uses ranking metrics similar to person search – the challenge reports **mAP** (mean average precision) over the top 100 retrieved results, as well as **Recall\@1,5,10** hit rates for each query. For instance, the baseline in one study achieved \~29.6% Recall\@1 and \~64.7% Recall\@10, illustrating the task difficulty. **Access:** Dataset introduced in the *AI City Challenge 2021 (Track 5)*. Available through the challenge organizers (download via the [AI City Challenge website](https://www.aicitychallenge.org/) – data request required) or the authors’ GitHub repository which provides code and data links for CityFlow-NL. + - Paper: https://arxiv.org/abs/2101.04741 + - code: https://github.com/fredfung007/cityflow-nl + - text extraction benchmarks + - for example how good can the image search return images based on text found in the image + - to do this gather lots of images with text in the image and use imsearch_benchmaker to create the benchmark. 
+ - Compositional & Expert-Level Retrieval Benchmarks + - **Cola (Compositional Localized Attributes):** A **compositional text-to-image retrieval** benchmark (NeurIPS 2023) designed to test fine-grained understanding of object-attribute combinations. **Cola contains \~1,236 queries** composed of **168 objects and 197 attributes** (e.g. “red car next to blue car”, “person in yellow shirt riding a bike”) with target images drawn from about **30K images**. Each query has challenging confounders (distractor images that have the right objects but wrong attribute pairing). *Relevance:* Not specific to urban scenes, but many queries could involve everyday objects (cars, people, etc. in various configurations) – useful for evaluating **relational understanding in images**. *Evaluation:* Measures whether the system retrieves the correct image that satisfies the composed query. Metrics include **Recall\@1 (accuracy)** – human performance is \~83% on this benchmark. The goal is to push models to avoid retrieving images that have partial matches (only one attribute-object correct). **Access:** The authors provide a project page and data download (Boston University) – see the [Cola project page](https://cs-people.bu.edu/array/research/cola/) for dataset and instructions. - System-Level Performance Benchmarks - Latency - Time taken per query (cold start vs. 
warm cache) diff --git a/benchmarking/INQUIRE/Makefile b/benchmarking/INQUIRE/Makefile deleted file mode 100644 index 40c54195..00000000 --- a/benchmarking/INQUIRE/Makefile +++ /dev/null @@ -1,101 +0,0 @@ -.PHONY: down build calculate load get - -# vars -NETWORK_NAME=weaviate_network -app_image=inquire_benchmark -weavloader_image=inquire_weavloader -INQUIRE_DATASET=sagecontinuum/INQUIRE-Benchmark-small -IMAGE_BATCH_SIZE=25 -QUERY_BATCH_SIZE=5 -SAMPLE_SIZE=0 -WORKERS=5 -IMAGE_RESULTS_FILE=image_search_results.csv -QUERY_EVAL_METRICS_FILE=query_eval_metrics.csv - -down: - - # Stop and remove app container - @if docker ps -a --format '{{.Names}}' | grep -q "^$(app_image)$$"; then \ - echo "Stopping $(app_image)..."; \ - docker stop $(app_image); \ - echo "Removing $(app_image)..."; \ - docker rm $(app_image); \ - else \ - echo "$(app_image) does not exist. Skipping..."; \ - fi - - # Stop and remove app container - @if docker ps -a --format '{{.Names}}' | grep -q "^$(weavloader_image)$$"; then \ - echo "Stopping $(weavloader_image)..."; \ - docker stop $(weavloader_image); \ - echo "Removing $(weavloader_image)..."; \ - docker rm $(weavloader_image); \ - else \ - echo "$(weavloader_image) does not exist. Skipping..."; \ - fi - -# Build custom services -build: - # Build App image - docker build -t $(app_image) ./app - - # Build Weaviate Loader - docker build -t $(weavloader_image) ./weavloader - -#Calculate INQUIRE benchmark -calculate: - - # Stop and remove app container - @if docker ps -a --format '{{.Names}}' | grep -q "^$(app_image)$$"; then \ - echo "Stopping $(app_image)..."; \ - docker stop $(app_image); \ - echo "Removing $(app_image)..."; \ - docker rm $(app_image); \ - else \ - echo "$(app_image) does not exist. 
Skipping..."; \ - fi - - # Run app container with the network configuration - docker run --name $(app_image) --network $(NETWORK_NAME) -p 7862:7862 --restart on-failure \ - -e PYTHONUNBUFFERED=1 \ - -e WEAVIATE_HOST='weaviate' \ - -e WEAVIATE_PORT='8080' \ - -e WEAVIATE_GRPC_PORT='50051' \ - -e INQUIRE_DATASET='$(INQUIRE_DATASET)' \ - -e QUERY_BATCH_SIZE='$(QUERY_BATCH_SIZE)' \ - -e IMAGE_RESULTS_FILE='$(IMAGE_RESULTS_FILE)' \ - -e QUERY_EVAL_METRICS_FILE='$(QUERY_EVAL_METRICS_FILE)' \ - -e CLUSTER_FLAG='True' \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -d $(app_image) - -# load in INQUIRE dataset into weaviate -load: - - # Stop and remove app container - @if docker ps -a --format '{{.Names}}' | grep -q "^$(weavloader_image)$$"; then \ - echo "Stopping $(weavloader_image)..."; \ - docker stop $(weavloader_image); \ - echo "Removing $(weavloader_image)..."; \ - docker rm $(weavloader_image); \ - else \ - echo "$(weavloader_image) does not exist. Skipping..."; \ - fi - - # Run data loader - docker run --name $(weavloader_image) --network $(NETWORK_NAME) --restart on-failure \ - -e WEAVIATE_HOST='weaviate' \ - -e WEAVIATE_PORT='8080' \ - -e WEAVIATE_GRPC_PORT='50051' \ - -e INQUIRE_DATASET='$(INQUIRE_DATASET)' \ - -e IMAGE_BATCH_SIZE='$(IMAGE_BATCH_SIZE)' \ - -e SAMPLE_SIZE='$(SAMPLE_SIZE)' \ - -e WORKERS='$(WORKERS)' \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -d $(weavloader_image) - -# retrieve the results -get: - # get the result files - docker cp $(app_image):/app/$(IMAGE_RESULTS_FILE) . - docker cp $(app_image):/app/$(QUERY_EVAL_METRICS_FILE) . 
diff --git a/benchmarking/INQUIRE/Readme.md b/benchmarking/INQUIRE/Readme.md deleted file mode 100644 index 20c95cc3..00000000 --- a/benchmarking/INQUIRE/Readme.md +++ /dev/null @@ -1,71 +0,0 @@ -# INQUIRE Benchmark - -This project uses the same setup as [Hybrid Search](../HybridSearch_example/) so that we can benchmark [Hybrid Search](../HybridSearch_example/) using [INQUIRE](https://github.com/inquire-benchmark/INQUIRE). ->NOTE: The INQUIRE benchmark needs to be updated to work with sage-nrp-image-search. The current code is not compatible with this service. - -## Usage - -This benchmark is supposed to be used in conjuction with [Hybrid Search](../HybridSearch_example/). The Makefile references components that are deployed in [Hybrid Search](../HybridSearch_example/). The Makefile in here deploys additional containers that are used to run the INQUIRE Benchmark. - -## Running the Example - -### Prerequisites -To run this example, you'll need: -- **Docker** installed on your machine with GPU access - -### Step-by-Step Setup - -1. **Spin up your Hybrid Search Instance**: - - Navigate to the [Hybrid Search](../HybridSearch_example/) directory and follow those instructions to spin up a Hybrid Search Instance. - -2. **Load in the dataset**: - - Navigate back into this directory containing the `Makefile` file and run: - ```bash - make build && make load && docker logs inquire_weavloader -f - ``` - >NOTE: This loads in [INQUIRE-Benchmark-small](https://huggingface.co/datasets/sagecontinuum/INQUIRE-Benchmark-small) into Weaviate. - -3. **Calculate the Query Metrics**: - - After dataset is fully loaded into Weaviate, run: - ```bash - make build && make calculate && docker logs inquire_benchmark -f - ``` - >NOTE: inquire_weavloader's logs will indicate when the dataset is fully loaded into Weaviate. - -4. 
**Retrieve the Results**: - - After the metrics are calculated, run: - ```bash - make get - ``` - >NOTE: This will copy the csv files into your currect working directory - -### Results - -Once the benchmark is ran, two csv files will be generated: -- `image_search_results.csv` - - This file includes the metadata of all images returned by Weaviate when different queries were being ran. -- `query_eval_metrics.csv` - - This file includes the calculated metrics based on images returned by different queries. - -There is multiple results placed in version folders. Each folder has a evaluate.ipynb notebook that goes into more details what that version tested and the metrics. - -## References -- [Weaviate Blog: NDCG](https://weaviate.io/blog/retrieval-evaluation-metrics#normalized-discounted-cumulative-gain-ndcg) -- [RAG Evaluation](https://weaviate.io/blog/rag-evaluation) -- [Scikit-Learn NDCG](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html) -- [A Guide on NDCG](https://www.aporia.com/learn/a-practical-guide-to-normalized-discounted-cumulative-gain-ndcg/) -- [Weaviate: Batch import](https://weaviate.io/developers/weaviate/manage-data/import) -- [Weaviate: Imports in Detail](https://weaviate.io/developers/weaviate/tutorials/import#data-import---best-practices) -- [INQUIRE](https://inquire-benchmark.github.io/) -- [Hugginface: Fine-tuning Florence2](https://huggingface.co/blog/finetune-florence2) -- [Medium: Fine-tuning Florence2](https://medium.com/@amit25173/fine-tuning-florence-2-aa9c99b2a83d) - -## Citation -``` -@article{vendrow2024inquire, - title={INQUIRE: A Natural World Text-to-Image Retrieval Benchmark}, - author={Vendrow, Edward and Pantazis, Omiros and Shepard, Alexander and Brostow, Gabriel and Jones, Kate E and Mac Aodha, Oisin and Beery, Sara and Van Horn, Grant}, - journal={NeurIPS}, - year={2024}, -} -``` diff --git a/benchmarking/INQUIRE/app/Dockerfile b/benchmarking/INQUIRE/app/Dockerfile deleted file mode 100644 index 
1ada9ca1..00000000 --- a/benchmarking/INQUIRE/app/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM python:3.11-slim - -RUN apt-get update \ - && apt-get install -y \ - wget \ - curl - -# Set working directory -WORKDIR /app - -# Copy requirements.txt into the container -COPY requirements.txt . - -# Install dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application code into the container -COPY . . - -# Run upload.py -CMD ["python", "main.py"] \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/HyperParameters.py b/benchmarking/INQUIRE/app/HyperParameters.py deleted file mode 100644 index 92974b25..00000000 --- a/benchmarking/INQUIRE/app/HyperParameters.py +++ /dev/null @@ -1,24 +0,0 @@ -'''This file contains the hyper parameters that can be changed to fine tune -the system. -NOTE: Not all params have been added here. More in depth search must be -done to find more hyper params that can be altered''' - -# 1) Hybrid Search Query hyperparameters -response_limit=50 #Number of objects to return, switched from 0 to 50 to match how INQUIRE benchmarks -query_alpha=0.4 #An alpha of 1 is a pure vector search, An alpha of 0 is a pure keyword search. -max_vector_distance=0.4 #max accepted distance for the vector search component -near_text_certainty=0.7 #The minimum similarity score to return. If not specified, the default certainty specified by the server is used. 
-#NOTE: USE max_vector_distance OR near_text_certainty -concepts_to_avoid=["police", "gun"] # Concepts to avoid -avoid_concepts_force=0 #the strength to avoid the concepts -# autocut limits results based on discontinuities -# more info: https://weaviate.io/developers/weaviate/api/graphql/additional-operators#autocut -autocut_jumps=1 #To explicitly disable autocut, set the number of jumps to 0 or a negative value -#NOTE: USE autocut_jumps OR response_limit -hybrid_weight=0.7 #The weight of the hybrid search component in the unified score for hybrid colbert blend. -colbert_weight=0.3 #The weight of the colbert search component in the unified score for hybrid colbert blend. -hybrid_colbert_blend_top_k=50 #The number of top results to return from the hybrid colbert blend search. - -# 2) Experimental hyperparameters -align_alpha = 0.7 -clip_alpha = 0.7 \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/client.py b/benchmarking/INQUIRE/app/client.py deleted file mode 100644 index 403468b5..00000000 --- a/benchmarking/INQUIRE/app/client.py +++ /dev/null @@ -1,49 +0,0 @@ -'''This file contains the code to interact with the weaviate client''' -import logging -import argparse -import os -import weaviate -import time - -def initialize_weaviate_client(): - ''' - Intialize weaviate client based on arg or env var - ''' - parser = argparse.ArgumentParser() - parser.add_argument( - "--weaviate_host", - default=os.getenv("WEAVIATE_HOST","127.0.0.1"), - help="Weaviate host IP.", - ) - parser.add_argument( - "--weaviate_port", - default=os.getenv("WEAVIATE_PORT","8080"), - help="Weaviate REST port.", - ) - parser.add_argument( - "--weaviate_grpc_port", - default=os.getenv("WEAVIATE_GRPC_PORT","50051"), - help="Weaviate GRPC port.", - ) - args = parser.parse_args() - - weaviate_host = args.weaviate_host - weaviate_port = args.weaviate_port - weaviate_grpc_port = args.weaviate_grpc_port - - logging.debug(f"Attempting to connect to Weaviate at 
{weaviate_host}:{weaviate_port}") - - # Retry logic to connect to Weaviate - while True: - try: - client = weaviate.connect_to_local( - host=weaviate_host, - port=weaviate_port, - grpc_port=weaviate_grpc_port - ) - logging.debug("Successfully connected to Weaviate") - return client - except weaviate.exceptions.WeaviateConnectionError as e: - logging.error(f"Failed to connect to Weaviate: {e}") - logging.debug("Retrying in 10 seconds...") - time.sleep(10) \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/inquire_eval.py b/benchmarking/INQUIRE/app/inquire_eval.py deleted file mode 100644 index 9ed90d22..00000000 --- a/benchmarking/INQUIRE/app/inquire_eval.py +++ /dev/null @@ -1,175 +0,0 @@ -'''This file contains the code to run generate the results of the Benchmark.''' - -import os -import pandas as pd -import tritonclient.grpc as TritonClient -from query import Weav_query -from concurrent.futures import ThreadPoolExecutor -from datasets import load_dataset -from sklearn.metrics import ndcg_score -from itertools import islice -import logging - -# Load INQUIRE benchmark dataset from Hugging Face -INQUIRE_DATASET = os.environ.get("INQUIRE_DATASET", "sagecontinuum/INQUIRE-Benchmark-small") - -# Batch size for parallel processing -QUERY_BATCH_SIZE = int(os.environ.get("QUERY_BATCH_SIZE", 100)) - -def load_inquire_dataset(split="test"): - """ Load INQUIRE dataset from HuggingFace and return as pandas DataFrame. """ - dataset = load_dataset(INQUIRE_DATASET, split=split).to_pandas() - return dataset - -def compute_ndcg(df, sortby="rerank_score"): - """ - Compute Normalized Discounted Cumulative Gain (NDCG) using scikit-learn. - Args: - df (pd.DataFrame): DataFrame containing Weaviate results - sortby (str): Column to sort by (e.g., "rerank_score") - Returns: - float: NDCG score - """ - if df.empty or len(df) < 2: - return 0 # NDCG is not defined for a single document. 
- - # Ensure results are sorted (higher score = better ranking) - df_sorted = df.sort_values(sortby, ascending=False) - - # Extract true relevance labels (1 = relevant, 0 = irrelevant) - y_true = df_sorted["relevant"].values.reshape(1, -1) # Must be 2D array - - # Extract ranking scores (e.g., rerank_score or clip_score) - y_score = df_sorted[sortby].values.reshape(1, -1) # Must be 2D array - - # Compute NDCG using Scikit-Learn - return ndcg_score(y_true, y_score) - -def batched(iterable, batch_size): - """ - Yield successive batch_size chunks from iterable. - Args: - iterable: An iterable (e.g., list, DataFrame rows) - batch_size: Size of each batch - Yields: - list: A batch of items from the iterable - """ - it = iter(iterable) - while batch := list(islice(it, batch_size)): - yield batch - -def evaluate_query(query_row, wq: Weav_query, dataset): - """ Evaluates a single query by comparing retrieved results to ground truth dataset. """ - - query = str(query_row["query"]) - query_id = query_row["query_id"] - - # Log the query being evaluated - logging.debug(f"Evaluating query {query_id}: {query}") - - # Run search query on Weaviate - weav_df = wq.clip_hybrid_query(query) - weav_df["queried_on_query_id"] = query_id - weav_df["queried_on_query"] = query - - # Check if no results were returned - if weav_df.empty: - logging.debug(f"No results returned for query {query_id}") - - # Store per-query statistics with default values - query_stats = { - "query_id": query_id, - "query": query, - "total_images": 0, - "correctly_returned": 0, - "incorrectly_returned": 0, - "relevant_images": 0, - "non_relevant_images": 0, - "accuracy": 0, - "precision": 0, - "recall": 0, - "NDCG": 0, - "clip_NDCG": 0, - "category": query_row["category"], - "supercategory": query_row["supercategory"], - "iconic_group": query_row["iconic_group"], - } - - return weav_df, query_stats # Return empty DataFrame and stats with default values - - # Count total images returned - total_images = len(weav_df) 
- - # Check if image retrieval is correct and count relevant images - correct_retrieval = 0 - relevant_images = 0 - for _, row in weav_df.iterrows(): - if row["queried_on_query_id"] == row["query_id"]: - correct_retrieval += 1 - relevant_images += row["relevant"] - incorrect_retrieval = total_images - correct_retrieval - non_relevant_images = total_images - relevant_images - - # get number of relevant images in dataset - relevant_images_in_dataset = dataset[dataset["query_id"] == query_id]["relevant"].sum() - - # Comput NDCG to evaluate ranking - ndcg = compute_ndcg(weav_df, sortby="rerank_score") - clip_ndcg = compute_ndcg(weav_df, sortby="clip_score") - - # Store per-query statistics - query_stats = { - "query_id": query_id, - "query": query, - "total_images": total_images, - "correctly_returned": correct_retrieval, - "incorrectly_returned": incorrect_retrieval, - "relevant_images": relevant_images, - "non_relevant_images": non_relevant_images, - "accuracy": correct_retrieval / total_images if total_images else 0, # not rank-aware metric - "precision": relevant_images / total_images if total_images else 0, # not rank-aware metric - "recall": relevant_images / relevant_images_in_dataset if relevant_images_in_dataset else 0, # not rank-aware metric - "NDCG": ndcg, # https://www.aporia.com/learn/a-practical-guide-to-normalized-discounted-cumulative-gain-ndcg/ - "clip_NDCG": clip_ndcg, - "category": query_row["category"], - "supercategory": query_row["supercategory"], - "iconic_group": query_row["iconic_group"], - } - - return weav_df, query_stats - -def evaluate_queries(weaviate_client, dataset): - """ Evaluate unique queries in parallel using their full row data. 
""" - - logging.debug("Starting INQUIRE Benchmark...") - triton_client = TritonClient.InferenceServerClient(url="triton:8001") - wq = Weav_query(weaviate_client, triton_client) - - results = [] - query_stats = [] - - # Convert dataset to Pandas DataFrame if it's not already - if not isinstance(dataset, pd.DataFrame): - dataset = dataset.to_pandas() - - # Get unique queries along with their metadata (e.g., query_id, category) - unique_queries = dataset.drop_duplicates(subset=["query"]) - - with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: - for batch in batched(unique_queries.iterrows(), QUERY_BATCH_SIZE): - # Process in parallel - futures = { - executor.submit(evaluate_query, query_row, wq, dataset): query_row["query"] - for _, query_row in batch - } - - for future in futures: - df, stats = future.result() - results.append(df) - query_stats.append(stats) - - # Combine all results into a DataFrame - all_results_df = pd.concat(results, ignore_index=True) - query_stats_df = pd.DataFrame(query_stats) - - return all_results_df, query_stats_df diff --git a/benchmarking/INQUIRE/app/main.py b/benchmarking/INQUIRE/app/main.py deleted file mode 100644 index 9472cf16..00000000 --- a/benchmarking/INQUIRE/app/main.py +++ /dev/null @@ -1,50 +0,0 @@ -'''This file contains the code to run the Benchmark and save the results.''' - -import os -from inquire_eval import evaluate_queries -from datasets import load_dataset -from client import initialize_weaviate_client -import logging -import time - -# Load INQUIRE benchmark dataset from Hugging Face -INQUIRE_DATASET = os.environ.get("INQUIRE_DATASET", "sagecontinuum/INQUIRE-Benchmark-small") -IMAGE_RESULTS_FILE = os.environ.get("IMAGE_RESULTS_FILE", "image_search_results.csv") -QUERY_EVAL_METRICS_FILE = os.environ.get("QUERY_EVAL_METRICS_FILE", "query_eval_metrics.csv") - -def load_inquire_dataset(): - """ Load INQUIRE dataset from HuggingFace and return it as a pandas DataFrame. 
""" - dataset = load_dataset(INQUIRE_DATASET, split="test").to_pandas() - return dataset - -if __name__ == "__main__": - - # Configure logging - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s %(message)s", - datefmt="%Y/%m/%d %H:%M:%S", - ) - - # Load INQUIRE dataset - inquire_dataset = load_inquire_dataset() - - # Connect to Weaviate and Evaluate search system - with initialize_weaviate_client() as weaviate_client: - image_results, query_evaluation = evaluate_queries(weaviate_client, inquire_dataset) - - # Save results - image_results_location = os.path.join("/app", IMAGE_RESULTS_FILE) - query_evaluation_location = os.path.join("/app", QUERY_EVAL_METRICS_FILE) - - image_results.to_csv(image_results_location, index=False) - query_evaluation.to_csv(query_evaluation_location, index=False) - logging.debug(f"Evaluation is done, INQUIRE results saved to {image_results_location} and {query_evaluation_location}") - weaviate_client.close() - - # Keep the program running when the evaluation is done - try: - while True: - time.sleep(10) - except (KeyboardInterrupt, SystemExit): - exit() \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/model.py b/benchmarking/INQUIRE/app/model.py deleted file mode 100644 index c0cc9ac9..00000000 --- a/benchmarking/INQUIRE/app/model.py +++ /dev/null @@ -1,156 +0,0 @@ -'''This file contains the code to talk to Triton Inference Server''' - -import logging -import tritonclient.grpc as TritonClient -import numpy as np -import HyperParameters as hp - -def get_colbert_embedding(triton_client, text): - """ - Embed text using ColBERT encoder served via Triton Inference Server. 
- Returns token-level embeddings of shape [num_tokens, 128] - """ - # Prepare input - text_bytes = text.encode("utf-8") - input_tensor = np.array([text_bytes], dtype="object") # batch size = 1 - - # Prepare inputs & outputs for Triton - # NOTE: if you enable max_batch_size, leading number is batch size, example [1,1] 1 is batch size - inputs = [ - TritonClient.InferInput("text", input_tensor.shape, "BYTES") - ] - outputs = [ - TritonClient.InferRequestedOutput("embedding"), - TritonClient.InferRequestedOutput("token_lengths") - ] - - # Add tensors - inputs[0].set_data_from_numpy(input_tensor) - - # Run inference - try: - results = triton_client.infer(model_name="colbert", inputs=inputs, outputs=outputs) - - # Retrieve and reshape output - emb_flat = results.as_numpy("embedding") # shape: (1, max_len * 128) - token_lengths = results.as_numpy("token_lengths") # shape: (1,) - num_tokens = token_lengths[0] - - # Reshape and unpad - emb_3d = emb_flat.reshape(1, -1, 128) - token_embeddings = emb_3d[0, :num_tokens, :] # shape: [num_tokens, 128] - except Exception as e: - logging.error(f"Error during Colbert inference: {str(e)}") - return None - - return token_embeddings - -def fuse_embeddings( img_emb: np.ndarray, txt_emb: np.ndarray, alpha: float = 0.5) -> np.ndarray: - """ - Given two L2-normalized vectors img_emb and txt_emb (shape (D,)), - returns their weighted sum (alpha * img + (1-alpha) * txt), re-normalized to unit norm. 
- """ - if img_emb.shape != txt_emb.shape: - raise ValueError("img_emb and txt_emb must have the same dimension") - - # Weighted sum - combined = alpha * img_emb + (1.0 - alpha) * txt_emb - - # Re-normalize - norm = np.linalg.norm(combined) - if norm == 0.0: - # Edge case: if they cancel out exactly (unlikely), fall back to text alone - return txt_emb.copy() - return (combined / norm).astype(np.float32) - -def get_allign_embeddings(triton_client, text, image=None): - """ - Embed text and image using ALIGN encoder served via Triton Inference Server. - Returns one fused embedding created from both modalities. - """ - # --- 1. Prepare Inputs --- - text_bytes = text.encode("utf-8") - text_np = np.array([text_bytes], dtype="object") - - # Fallback image shape (e.g., placeholder 1x1 RGB) - if image is not None: - image_np = np.array(image).astype(np.float32) - else: - image_np = np.zeros((1, 1, 3), dtype=np.float32) - - # Create Triton input objects - inputs = [ - TritonClient.InferInput("text", [1], "BYTES"), - TritonClient.InferInput("image", list(image_np.shape), "FP32") - ] - - inputs[0].set_data_from_numpy(text_np) - inputs[1].set_data_from_numpy(image_np) - - outputs = [ - TritonClient.InferRequestedOutput("text_embedding"), - TritonClient.InferRequestedOutput("image_embedding") - ] - - # --- 2. Inference Call --- - try: - results = triton_client.infer(model_name="align", inputs=inputs, outputs=outputs) - text_embedding = results.as_numpy("text_embedding")[0] - image_embedding = results.as_numpy("image_embedding")[0] - except Exception as e: - logging.error(f"Error during ALIGN inference: {str(e)}") - return None - - # --- 3. Fuse Embeddings --- - if image is not None: - embedding = fuse_embeddings(image_embedding, text_embedding, alpha=hp.align_alpha) - else: - embedding = text_embedding - - return embedding - -def get_clip_embeddings(triton_client, text, image=None): - """ - Embed text and image using CLIP encoder served via Triton Inference Server. 
- Returns one fused embedding created from both modalities. - """ - # --- 1. Prepare Inputs --- - text_bytes = text.encode("utf-8") - text_np = np.array([text_bytes], dtype="object") - - # Fallback image shape (e.g., placeholder 1x1 RGB) - if image is not None: - image_np = np.array(image).astype(np.float32) - else: - image_np = np.zeros((1, 1, 3), dtype=np.float32) - - # Create Triton input objects - inputs = [ - TritonClient.InferInput("text", [1], "BYTES"), - TritonClient.InferInput("image", list(image_np.shape), "FP32") - ] - - inputs[0].set_data_from_numpy(text_np) - inputs[1].set_data_from_numpy(image_np) - - outputs = [ - TritonClient.InferRequestedOutput("text_embedding"), - TritonClient.InferRequestedOutput("image_embedding") - ] - - # --- 2. Inference Call --- - try: - results = triton_client.infer(model_name="clip", inputs=inputs, outputs=outputs) - text_embedding = results.as_numpy("text_embedding")[0] - image_embedding = results.as_numpy("image_embedding")[0] - except Exception as e: - logging.error(f"Error during CLIP inference: {str(e)}") - return None - - # --- 3. Fuse Embeddings --- - if image is not None: - embedding = fuse_embeddings(image_embedding, text_embedding, alpha=hp.clip_alpha) - else: - embedding = text_embedding - - return embedding \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/query.py b/benchmarking/INQUIRE/app/query.py deleted file mode 100644 index 180fab96..00000000 --- a/benchmarking/INQUIRE/app/query.py +++ /dev/null @@ -1,339 +0,0 @@ -'''This file implements functions that fetch results from weaviate for the query -entered by user.''' - -import HyperParameters as hp -from weaviate.classes.query import MetadataQuery, Move, HybridVector, Rerank, HybridFusion -from model import get_colbert_embedding, get_clip_embeddings -import logging -import pandas as pd - -class Weav_query: - """ - This class is used to query Weaviate. - It contains methods for multiple types of queries. 
- """ - - def __init__(self, weav_client, triton_client=None): - self.weav_client = weav_client - self.triton_client = triton_client - - def hybrid_query(self, nearText, collection_name="INQUIRE"): - """ - This method performs a hybrid vector and keyword search on a embedding space. - """ - # used this for hybrid search params https://weaviate.io/developers/weaviate/search/hybrid - - #get collection - collection = self.weav_client.collections.get(collection_name) - - # Perform the hybrid search - res = collection.query.hybrid( - query=nearText, # The model provider integration will automatically vectorize the query - target_vector="imagebind", # The name of the vector space to search in - fusion_type= HybridFusion.RELATIVE_SCORE, - # max_vector_distance=hp.max_vector_distance, - # auto_limit=hp.autocut_jumps, - limit=hp.response_limit, - alpha=hp.query_alpha, - return_metadata=MetadataQuery(score=True, explain_score=True), - query_properties=["caption"], #Keyword search properties - # bm25_operator=hp.keyword_search_params, - vector=HybridVector.near_text( - query=nearText, - move_away=Move(force=hp.avoid_concepts_force, concepts=hp.concepts_to_avoid), #can this be used as guardrails? 
- # distance=hp.max_vector_distance, - # certainty=hp.near_text_certainty, - ), - rerank=Rerank( - prop="caption", # The property to rerank on - query=nearText # If not provided, the original query will be used - ) - ) - - # init - objects = [] - - # Log the results - logging.debug("============hybrid_query RESULTS==================") - - # Extract results from QueryReturn object type - for obj in res.objects: - #log results - logging.debug("----------------%s----------------", obj.uuid) - logging.debug(f"Properties: {obj.properties}") - logging.debug(f"Score: {obj.metadata.score}") - logging.debug(f"Explain Score: {obj.metadata.explain_score}") - logging.debug(f"Rerank Score: {obj.metadata.rerank_score}") - - # Append the relevant object data into the list - objects.append({ - "uuid": str(obj.uuid), - "inat24_image_id": obj.properties.get("inat24_image_id", ""), - "inat24_file_name": obj.properties.get("inat24_file_name", ""), - "score": obj.metadata.score, - "explainScore": obj.metadata.explain_score, - "rerank_score": obj.metadata.rerank_score, - "query": obj.properties.get("query", ""), - "query_id": obj.properties.get("query_id", ""), - "caption": obj.properties.get("caption", ""), - "relevant": obj.properties.get("relevant", ""), - "clip_score": obj.properties.get("clip_score", ""), - "supercategory": obj.properties.get("supercategory", ""), - "category": obj.properties.get("category", ""), - "iconic_group": obj.properties.get("iconic_group", ""), - "inat24_species_id": obj.properties.get("inat24_species_id", ""), - "inat24_species_name": obj.properties.get("inat24_species_name", ""), - "location_uncertainty": obj.properties.get("location_uncertainty", ""), - "date": obj.properties.get("date", ""), - "location_lat": self.get_location_coordinate(obj, "latitude"), - "location_lon": self.get_location_coordinate(obj, "longitude"), - }) - - logging.debug("==============END========================") - - # Convert the list of dictionaries into a pandas DataFrame - 
df = pd.DataFrame(objects) - - # Return the DataFrame - return df - - def colbert_query(self, nearText, collection_name="INQUIRE"): - """ - This method performs a vector search on a ColBERT embedding space. - """ - #get collection - collection = self.weav_client.collections.get(collection_name) - - # Generate colbert embedding - colbert_embedding = get_colbert_embedding(self.triton_client, nearText) - - # Perform vector search on the "colbert" vector space - res = collection.query.near_vector( - near_vector=colbert_embedding, - target_vector="colbert", - auto_limit=hp.autocut_jumps, - limit=hp.response_limit, - # distance=hp.max_vector_distance, - return_metadata=MetadataQuery(distance=True), - rerank=Rerank( - prop="caption", # The property to rerank on - query=nearText # If not provided, the original query will be used - ) - ) - - # init - objects = [] - - # Log the results - logging.debug("============colberty_query RESULTS===============") - - # Extract results from QueryReturn object type - for obj in res.objects: - #log results - logging.debug("----------------%s----------------", obj.uuid) - logging.debug(f"Properties: {obj.properties}") - logging.debug(f"Distance: {obj.metadata.distance}") - logging.debug(f"Rerank Score: {obj.metadata.rerank_score}") - - # Append the relevant object data into the list - objects.append({ - "uuid": str(obj.uuid), - "inat24_image_id": obj.properties.get("inat24_image_id", ""), - "inat24_file_name": obj.properties.get("inat24_file_name", ""), - "distance": obj.metadata.distance, - "rerank_score": obj.metadata.rerank_score, - "query": obj.properties.get("query", ""), - "query_id": obj.properties.get("query_id", ""), - "caption": obj.properties.get("caption", ""), - "relevant": obj.properties.get("relevant", ""), - "clip_score": obj.properties.get("clip_score", ""), - "supercategory": obj.properties.get("supercategory", ""), - "category": obj.properties.get("category", ""), - "iconic_group": obj.properties.get("iconic_group", 
""), - "inat24_species_id": obj.properties.get("inat24_species_id", ""), - "inat24_species_name": obj.properties.get("inat24_species_name", ""), - "location_uncertainty": obj.properties.get("location_uncertainty", ""), - "date": obj.properties.get("date", ""), - "location_lat": self.get_location_coordinate(obj, "latitude"), - "location_lon": self.get_location_coordinate(obj, "longitude"), - }) - - logging.debug("==============END========================") - - # Convert the list of dictionaries into a pandas DataFrame - df = pd.DataFrame(objects) - - # Return the DataFrame - return df - - def colbert_hybrid_query(self, nearText, collection_name="INQUIRE"): - """ - This method performs both a hybrid query on and a colbert query on either the same or seperate embedding spaces. - Combines the results by normalizing their scores to [0, 1], - then summing them according to specified weights. Deduplicates by UUID, - and returns top results sorted by rerank_score. Weights must sum to 1.0. - Final unified_score is between 0 and 1, where 0 is least relevant and 1 is most relevant. - - Parameters: - nearText (str): The text query to search for. - hybrid_collection (str): Name of the collection for hybrid search. - vector_collection (str): Name of the collection for vector search. - - Returns: - pd.DataFrame: Top-k deduplicated and reranked results. 
- """ - # Ensure both weights add up to 1.0 - if hp.hybrid_weight + hp.colbert_weight != 1.0: - raise ValueError("Weights must sum to 1.0") - - # Perform queries - hybrid_df = self.hybrid_query(nearText, collection_name=collection_name) - colbert_df = self.colbert_query(nearText, collection_name=collection_name) - - #NOTE: hybrid score is already normalized to [0, 1] by Weaviate, - # Normalize vector 'distance' - if not colbert_df.empty: - min_score = colbert_df["distance"].min() - max_score = colbert_df["distance"].max() - colbert_df["normalized_vector_distance"] = (colbert_df["distance"] - min_score) / (max_score - min_score + 1e-10) - else: - colbert_df["normalized_vector_distance"] = [] - - # Merge by uuid (outer join to keep all results) - colbert_suffix = "_colbert" - merged_df = pd.merge( - hybrid_df, - colbert_df, - on="uuid", - how="outer", - suffixes=("", colbert_suffix) - ) - - # Dynamically identify and merge all shared columns - for col in hybrid_df.columns: - if col == "uuid": - continue - col_colbert = f"{col}{colbert_suffix}" - if col_colbert in merged_df.columns: - # Use hybrid value if present, fallback to colbert value - merged_df[col] = merged_df[col].combine_first(merged_df[col_colbert]) - # Drop the duplicate colbert column - merged_df.drop(columns=[col_colbert], inplace=True) - - # Fill missing scores if needed - merged_df["normalized_vector_distance"] = merged_df["normalized_vector_distance"].fillna(0) - merged_df["rerank_score"] = merged_df["rerank_score"].fillna(0) - merged_df["score"] = merged_df["score"].fillna(0) - - merged_df["unified_score"] = ( - hp.hybrid_weight * merged_df["score"] + - hp.colbert_weight * merged_df["normalized_vector_distance"] - ) - - # Sort and select top-k - final_df = merged_df.sort_values(by=["rerank_score", "unified_score"], ascending=False).head(hp.hybrid_colbert_blend_top_k).reset_index(drop=True) - - # Logging block - logging.debug("============colbert_hybrid_query RESULTS===============") - for _,row in 
final_df.iterrows(): - logging.debug("----------------%s----------------", row["uuid"]) - logging.debug(f"Properties: {row.to_dict()}") - logging.debug(f"Unified Score: {row.get('unified_score', 0):.4f}") - # logging.debug(f"Normalized Hybrid Score: {row.get('normalized_hybrid_score', 0):.4f}") - logging.debug(f"Hybrid Score: {row.get('score', 0):.4f}") - # logging.debug(f"Normalized Vector Certainty: {row.get('normalized_vector_certainty', 0):.4f}") - logging.debug(f"Normalized Vector Distance: {row.get('normalized_vector_distance', 0):.4f}") - logging.debug(f"Rerank Score: {row.get('rerank_score', 0):.4f}") - logging.debug("==============END========================") - - return final_df - - def get_location_coordinate(self, obj, coordinate_type): - """ Helper function to safely fetch latitude or longitude from the location property. """ - location = obj.properties.get("location", "") - if location: - try: - # Ensure the coordinate_type is valid and fetch the correct value - return float(getattr(location, coordinate_type, "0.0")) if coordinate_type in ["latitude", "longitude"] else "0.0" - except (AttributeError, ValueError): - logging.warning(f"Invalid {coordinate_type} value found for obj {obj.uuid}") - return "0.0" # Default fallback for invalid location - return "0.0" # Default fallback if location is missing - - def clip_hybrid_query(self, nearText, collection_name="INQUIRE"): - """ - This method performs a hybrid vector and keyword search on a clip embedding space. 
- """ - # used this for hybrid search params https://weaviate.io/developers/weaviate/search/hybrid - - #get collection - collection = self.weav_client.collections.get(collection_name) - - # get clip embedding - clip_embedding = get_clip_embeddings(self.triton_client, nearText) - - # Perform the hybrid search - res = collection.query.hybrid( - query=nearText, # The model provider integration will automatically vectorize the query - target_vector="clip", # The name of the vector space to search in - fusion_type= HybridFusion.RELATIVE_SCORE, - # max_vector_distance=hp.max_vector_distance, - # auto_limit=hp.autocut_jumps, - limit=hp.response_limit, - alpha=hp.query_alpha, - return_metadata=MetadataQuery(score=True, explain_score=True), - query_properties=["caption"], #Keyword search properties - # bm25_operator=hp.keyword_search_params, - vector=clip_embedding, # the custom vector - rerank=Rerank( - prop="caption", # The property to rerank on - query=nearText # If not provided, the original query will be used - ) - ) - - # init - objects = [] - - # Log the results - logging.debug("============clip_hybrid_query RESULTS==================") - - # Extract results from QueryReturn object type - for obj in res.objects: - #log results - logging.debug("----------------%s----------------", obj.uuid) - logging.debug(f"Properties: {obj.properties}") - logging.debug(f"Score: {obj.metadata.score}") - logging.debug(f"Explain Score: {obj.metadata.explain_score}") - logging.debug(f"Rerank Score: {obj.metadata.rerank_score}") - - # Append the relevant object data into the list - objects.append({ - "uuid": str(obj.uuid), - "inat24_image_id": obj.properties.get("inat24_image_id", ""), - "inat24_file_name": obj.properties.get("inat24_file_name", ""), - "score": obj.metadata.score, - "explainScore": obj.metadata.explain_score, - "rerank_score": obj.metadata.rerank_score, - "query": obj.properties.get("query", ""), - "query_id": obj.properties.get("query_id", ""), - "caption": 
obj.properties.get("caption", ""), - "relevant": obj.properties.get("relevant", ""), - "clip_score": obj.properties.get("clip_score", ""), - "supercategory": obj.properties.get("supercategory", ""), - "category": obj.properties.get("category", ""), - "iconic_group": obj.properties.get("iconic_group", ""), - "inat24_species_id": obj.properties.get("inat24_species_id", ""), - "inat24_species_name": obj.properties.get("inat24_species_name", ""), - "location_uncertainty": obj.properties.get("location_uncertainty", ""), - "date": obj.properties.get("date", ""), - "location_lat": self.get_location_coordinate(obj, "latitude"), - "location_lon": self.get_location_coordinate(obj, "longitude"), - }) - - logging.debug("==============END========================") - - # Convert the list of dictionaries into a pandas DataFrame - df = pd.DataFrame(objects) - - # Return the DataFrame - return df \ No newline at end of file diff --git a/benchmarking/INQUIRE/app/requirements.txt b/benchmarking/INQUIRE/app/requirements.txt deleted file mode 100644 index 39cf9da5..00000000 --- a/benchmarking/INQUIRE/app/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -weaviate_client==4.14.* #https://weaviate.io/developers/weaviate/release-notes#weaviate-core-and-client-releases -Pillow==10.4.* -plotly==6.0.* -scikit_learn==1.6.* -datasets==3.2.* -tritonclient[grpc]==2.53.* -Requests \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/Dockerfile b/benchmarking/INQUIRE/weavloader/Dockerfile deleted file mode 100644 index 1ada9ca1..00000000 --- a/benchmarking/INQUIRE/weavloader/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM python:3.11-slim - -RUN apt-get update \ - && apt-get install -y \ - wget \ - curl - -# Set working directory -WORKDIR /app - -# Copy requirements.txt into the container -COPY requirements.txt . - -# Install dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application code into the container -COPY . . 
- -# Run upload.py -CMD ["python", "main.py"] \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/HyperParameters.py b/benchmarking/INQUIRE/weavloader/HyperParameters.py deleted file mode 100644 index 8680f537..00000000 --- a/benchmarking/INQUIRE/weavloader/HyperParameters.py +++ /dev/null @@ -1,64 +0,0 @@ -'''This file contains the hyper parameters that can be changed to fine tune -the system. -NOTE: Not all params have been added here. More in depth search must be -done to find more hyper params that can be altered''' - -from weaviate.classes.config import VectorDistances, Configure -from weaviate.collections.classes.config_vector_index import VectorFilterStrategy - -# 1) Weaviate module multi2vec-bind (Imagebind) weights -textWeight = 0.3 -imageWeight = 0.7 # Increase the weighting here so that the embedding is more influenced by the image -audioWeight = 0 # Currently not being used -videoWeight = 0 # Currently not being used - -# 2) Hierarchical Navigable Small World (hnsw) for Approximate Nearest Neighbor (ANN) hyperparamaters -# used hsnw since it works well with bigger datasets -# more info: https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#hnsw-indexes -# configuration tips: https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#hnsw-configuration-tips -# helpful article: https://gagan-mehta.medium.com/efficient-resource-understanding-and-planning-in-weaviate-ec673f065e86 -hnsw_dist_metric=VectorDistances.COSINE -hnsw_ef=-1 #Balance search speed and recall, Weaviate automatically adjusts the ef value and creates a dynamic ef list when ef is set to -1 -hnsw_ef_construction=100 #Balance index search speed and build speed. Changed from 128 to 100 -hnsw_maxConnections=50 #Maximum number of connections per element. Changed from 32 to 50 -hsnw_dynamicEfMax=500 #Upper bound for dynamic ef -hsnw_dynamicEfMin=200 #Lower bound for dynamic ef. 
Changed from 100 to 200 -hnsw_ef_factor=20 #This setting is only used when hnsw_ef is -1, Sets the potential length of the search list. Changed from 8 to 20 -hsnw_filterStrategy=VectorFilterStrategy.ACORN #The filter strategy to use for filtering the search results. -hnsw_flatSearchCutoff=40000 #cutoff to automatically switch to a flat (brute-force) vector search when a filter becomes too restrictive -hnsw_vector_cache_max_objects=1e12 #Maximum number of objects in the memory cache -# Auto Product Quantization (PQ) -# https://weaviate.io/developers/weaviate/configuration/compression/pq-compression -hnsw_quantizer=Configure.VectorIndex.Quantizer.pq( - training_limit=500000 #threshold to begin training -) - -# 3) Experimental hyperparameters -align_alpha = 0.7 -clip_alpha = 0.7 -qwen2_5_prompt=""" -role: -You are a world-class Scientific Image Captioning Expert. - -context: -You will be shown a scientific image captured by edge devices. Your goal is to analyze its content and significance in detail. - -task: -Generate exactly one scientifically detailed caption that accurately describes what is visible in the image and its scientific relevance. -Make it as detailed as possible. Also extract text and numbers from the images. - -constraints: -- Only return: - 1. A single caption. - 2. a list of 15 keywords relevant to the image. -- Do not include any additional text, explanations, or formatting. - -format: - caption: - keywords: , , ... 
-""" -gemma3_prompt=qwen2_5_prompt - -# 3) Weaviate module reranker-transformers (ms-marco-MiniLM-L-6-v2 Reranker Model) -# Model info: https://huggingface.co/cross-encoder/ms-marco-TinyBERT-L-2 -# NOTE: there is no HPs I can change in this module \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/client.py b/benchmarking/INQUIRE/weavloader/client.py deleted file mode 100644 index 403468b5..00000000 --- a/benchmarking/INQUIRE/weavloader/client.py +++ /dev/null @@ -1,49 +0,0 @@ -'''This file contains the code to interact with the weaviate client''' -import logging -import argparse -import os -import weaviate -import time - -def initialize_weaviate_client(): - ''' - Intialize weaviate client based on arg or env var - ''' - parser = argparse.ArgumentParser() - parser.add_argument( - "--weaviate_host", - default=os.getenv("WEAVIATE_HOST","127.0.0.1"), - help="Weaviate host IP.", - ) - parser.add_argument( - "--weaviate_port", - default=os.getenv("WEAVIATE_PORT","8080"), - help="Weaviate REST port.", - ) - parser.add_argument( - "--weaviate_grpc_port", - default=os.getenv("WEAVIATE_GRPC_PORT","50051"), - help="Weaviate GRPC port.", - ) - args = parser.parse_args() - - weaviate_host = args.weaviate_host - weaviate_port = args.weaviate_port - weaviate_grpc_port = args.weaviate_grpc_port - - logging.debug(f"Attempting to connect to Weaviate at {weaviate_host}:{weaviate_port}") - - # Retry logic to connect to Weaviate - while True: - try: - client = weaviate.connect_to_local( - host=weaviate_host, - port=weaviate_port, - grpc_port=weaviate_grpc_port - ) - logging.debug("Successfully connected to Weaviate") - return client - except weaviate.exceptions.WeaviateConnectionError as e: - logging.error(f"Failed to connect to Weaviate: {e}") - logging.debug("Retrying in 10 seconds...") - time.sleep(10) \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/data.py b/benchmarking/INQUIRE/weavloader/data.py deleted file mode 100644 index 
a3697e39..00000000 --- a/benchmarking/INQUIRE/weavloader/data.py +++ /dev/null @@ -1,207 +0,0 @@ -'''This file contains code that adds data to weaviate using HuggingFace. -These images will be the ones with which the hybrid search will compare -the text query given by the user.''' -import weaviate -import os -import logging -import random -from dateutil.parser import parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from datasets import load_dataset -from io import BytesIO, BufferedReader -from PIL import Image -from model import get_clip_embeddings, gemma3_run_model -from weaviate.classes.data import GeoCoordinate -from itertools import islice - -# Load INQUIRE benchmark dataset from Hugging Face -INQUIRE_DATASET = os.environ.get("INQUIRE_DATASET", "sagecontinuum/INQUIRE-Benchmark-small") - -def process_batch(batch, triton_client): - """ - Process a batch of images and return formatted data for Weaviate. - """ - formatted_data = [] - - for item in batch: - try: - if not isinstance(item, dict): - raise TypeError(f"Expected dict, got {type(item)} - {item}") - - logging.debug(f"Processing item: {item['inat24_file_name']}") - - if not isinstance(item["image"], Image.Image): - raise TypeError(f"Expected PIL.Image, got {type(item['image'])}") - - # Extract metadata - image = item["image"] # PIL.Image object - query = item["query"] - query_id = item["query_id"] - relevant = item["relevant"] - clip_score = item["clip_score"] - inat_id = item["inat24_image_id"] - filename = item["inat24_file_name"] - supercategory = item["supercategory"] - category = item["category"] - iconic_group = item["iconic_group"] - species_id = item["inat24_species_id"] - species_name = item["inat24_species_name"] - location_uncertainty = item["location_uncertainty"] - lat, lon = item.get("latitude", None), item.get("longitude", None) - raw_date = item["date"] - - try: - # Convert the date string to a datetime object and then to RFC3339 format. 
- date_obj = parse(raw_date) - date_rfc3339 = date_obj.isoformat() - except Exception as e: - logging.error(f"Error parsing date for image {filename}: {e}") - date_rfc3339 = item["date"].replace(" ", "T") # Fallback conversion - - # Convert image to BytesIO for encoding - image_stream = BytesIO() - image.save(image_stream, format="JPEG") - image_stream.seek(0) - - # Encode image for Weaviate - buffered_stream = BufferedReader(image_stream) - encoded_image = weaviate.util.image_encoder_b64(buffered_stream) - - # Generate caption - caption = gemma3_run_model(triton_client, image) - - # Generate CLIP embeddings for the image - clip_embedding = get_clip_embeddings(triton_client, caption, image) - - # Construct data for Weaviate - data_properties = ({ - "inat24_file_name": filename, - "image": encoded_image, - "query": query, - "query_id": query_id, - "caption": caption, - "relevant": relevant, - "clip_score": clip_score, - "inat24_image_id": inat_id, - "supercategory": supercategory, - "category": category, - "iconic_group": iconic_group, - "inat24_species_id": species_id, - "inat24_species_name": species_name, - "location_uncertainty": location_uncertainty, - "date": date_rfc3339, - "location": GeoCoordinate(latitude=float(lat), longitude=float(lon)) if lat and lon else None, - }, - {"clip": clip_embedding}) - - formatted_data.append(data_properties) - - except Exception as e: - logging.error(f"Error processing image {filename}: {e}") - - return formatted_data - -def batched(iterable, batch_size): - """ - Yield successive batch_size chunks from iterable. - Args: - iterable: An iterable (e.g., list, DataFrame rows) - batch_size: Size of each batch - Yields: - list: A batch of items from the iterable - """ - it = iter(iterable) - while batch := list(islice(it, batch_size)): - yield batch - -def load_inquire_data(weaviate_client, triton_client, batch_size=0, sample_size=0, workers=-1): - """ - Load images from HuggingFace INQUIRE dataset into Weaviate using batch import. 
- Uses parallel processing to maximize CPU usage. - Args: - weaviate_client: Weaviate client instance. - triton_client: Triton client instance for image captioning. - batch_size: Size of each batch for processing. - sample_size: Number of samples to load from the dataset (0 for all). - workers: Number of parallel workers (0 for all available CPU cores, -1 for sequential). - Returns: - None - """ - - # Load dataset - dataset = load_dataset(INQUIRE_DATASET, split="test") - - # Sample the dataset if sample_size is provided - if sample_size > 0: - sampled_indices = random.sample(range(len(dataset)), sample_size) - dataset = dataset.select(sampled_indices) - logging.debug(f"Sampled {sample_size} records from the dataset.") - else: - logging.debug("Using the entire dataset.") - - # Get Weaviate collection - collection = weaviate_client.collections.get("INQUIRE") - - # If workers is set to -1, process batches sequentially - if workers == -1: - logging.debug("Processing sequentially (no parallelization).") - - for batch in batched(dataset, batch_size): - results = process_batch(batch, triton_client) - - # Batch insert into Weaviate - with collection.batch.fixed_size(batch_size=batch_size) as batch: - for properties, vector in results: - batch.add_object(properties=properties, vector=vector) - - # Stop batch import if too many errors occur - if batch.number_errors > 5: - logging.error("Batch import stopped due to excessive errors.") - break - else: - - if workers == 0: - workers = os.cpu_count() - - # Use parallel processing - logging.debug(f"Processing with {workers} parallel workers.") - - with ThreadPoolExecutor(max_workers=workers) as executor: - futures = [] - for batch in batched(dataset, batch_size): - futures.append(executor.submit(process_batch, batch, triton_client)) - - # Prepare a batch process for Weaviate - with collection.batch.fixed_size(batch_size=batch_size) as batch: - for future in as_completed(futures): - results = future.result() - if results: - for 
properties, vector in results: - batch.add_object(properties=properties, vector=vector) - - # Stop batch import if too many errors occur - if batch.number_errors > 5: - logging.error("Batch import stopped due to excessive errors.") - break - # Log failed imports - failed_objects = collection.batch.failed_objects - if failed_objects: - logging.debug(f"Number of failed imports: {len(failed_objects)}") - - logging.debug(f"{INQUIRE_DATASET} dataset successfully loaded into Weaviate") - -def reload_inquire_data(weaviate_client, triton_client, batch_size=0, sample_size=0, workers=-1): - """ - Reload INQUIRE collection as vectors into Weaviate using batch import. - Uses parallel processing to maximize CPU usage. - Args: - weaviate_client: Weaviate client instance. - triton_client: Triton client instance for image captioning. - batch_size: Size of each batch for processing. - sample_size: Number of samples to load from the dataset (0 for all). - workers: Number of parallel workers (0 for all available CPU cores, -1 for sequential). - Returns: - None - """ - #TODO: export the images from weaviate once they are load in so you can just load them again using this function - return None \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/init.py b/benchmarking/INQUIRE/weavloader/init.py deleted file mode 100644 index 8bd3b5a8..00000000 --- a/benchmarking/INQUIRE/weavloader/init.py +++ /dev/null @@ -1,133 +0,0 @@ -from weaviate.classes.config import Configure, Property, DataType, Multi2VecField -import HyperParameters as hp -import time -import logging - -def run(client): - """ - Create the initial schema after deleting the existing collection if it exists. - This allows for reloading the schema without needing to restart the server. - """ - - collection_name = "INQUIRE" - - # Check if the collection exists - if collection_name in client.collections.list_all(): - logging.debug(f"Collection '{collection_name}' exists. 
Deleting it first...") - client.collections.delete(collection_name) - - # Ensure deletion before proceeding - while collection_name in client.collections.list_all(): - time.sleep(1) # Wait until it's fully deleted - - logging.debug(f"Creating collection '{collection_name}'...") - - # Create a schema to add images, audio, etc. - client.collections.create( - name=collection_name, - description="A collection to test our set up using INQUIRE", - properties=[ - Property(name="inat24_image_id", data_type=DataType.NUMBER), - Property(name="inat24_file_name", data_type=DataType.TEXT), - Property(name="query", data_type=DataType.TEXT), - Property(name="query_id", data_type=DataType.NUMBER), - Property(name="image", data_type=DataType.BLOB), - Property(name="audio", data_type=DataType.BLOB), - Property(name="video", data_type=DataType.BLOB), - Property(name="caption", data_type=DataType.TEXT), # Caption for keyword search - Property(name="relevant", data_type=DataType.NUMBER), - Property(name="clip_score", data_type=DataType.NUMBER), - Property(name="supercategory", data_type=DataType.TEXT), - Property(name="category", data_type=DataType.TEXT), - Property(name="iconic_group", data_type=DataType.TEXT), - Property(name="inat24_species_id", data_type=DataType.NUMBER), - Property(name="inat24_species_name", data_type=DataType.TEXT), - Property(name="location_uncertainty", data_type=DataType.NUMBER), - Property(name="date", data_type=DataType.DATE), - Property(name="location", data_type=DataType.GEO_COORDINATES) - ], - vectorizer_config=[ - # Configure.NamedVectors.multi2vec_bind( - # name="imagebind", - # vectorize_collection_name=False, - # # Define fields for vectorization - # image_fields=[ - # Multi2VecField(name="image", weight=hp.imageWeight) - # ], - # text_fields=[ - # Multi2VecField(name="caption", weight=hp.textWeight) - # ], - # audio_fields=[ - # Multi2VecField(name="audio", weight=hp.audioWeight) - # ], - # video_fields=[ - # Multi2VecField(name="video", 
weight=hp.videoWeight) - # ], - # vector_index_config=Configure.VectorIndex.hnsw( - # distance_metric=hp.hnsw_dist_metric, - # dynamic_ef_factor=hp.hnsw_ef_factor, - # dynamic_ef_max=hp.hsnw_dynamicEfMax, - # dynamic_ef_min=hp.hsnw_dynamicEfMin, - # ef=hp.hnsw_ef, - # ef_construction=hp.hnsw_ef_construction, - # filter_strategy=hp.hsnw_filterStrategy, - # flat_search_cutoff=hp.hnsw_flatSearchCutoff, - # max_connections=hp.hnsw_maxConnections, - # vector_cache_max_objects=int(hp.hnsw_vector_cache_max_objects), - # quantizer=hp.hnsw_quantizer, - # ) - # ), - # Configure.NamedVectors.none( - # name="colbert", - # vector_index_config=Configure.VectorIndex.hnsw( #https://weaviate.io/developers/weaviate/concepts/vector-index , https://weaviate.io/developers/weaviate/config-refs/schema/vector-index - # distance_metric=hp.hnsw_dist_metric, #works well to compare images with different attributes such as brightness levels or sizes. - # dynamic_ef_factor=hp.hnsw_ef_factor, - # dynamic_ef_max=hp.hsnw_dynamicEfMax, - # dynamic_ef_min=hp.hsnw_dynamicEfMin, - # ef=hp.hnsw_ef, - # ef_construction=hp.hnsw_ef_construction, - # filter_strategy=hp.hsnw_filterStrategy, - # flat_search_cutoff=hp.hnsw_flatSearchCutoff, - # max_connections=hp.hnsw_maxConnections, - # vector_cache_max_objects=int(hp.hnsw_vector_cache_max_objects), - # quantizer=hp.hnsw_quantizer, - # multi_vector=Configure.VectorIndex.MultiVector.multi_vector() - # ) - # ), - # Configure.NamedVectors.none( - # name="align", - # vector_index_config=Configure.VectorIndex.hnsw( #https://weaviate.io/developers/weaviate/concepts/vector-index , https://weaviate.io/developers/weaviate/config-refs/schema/vector-index - # distance_metric=hp.hnsw_dist_metric, #works well to compare images with different attributes such as brightness levels or sizes. 
- # dynamic_ef_factor=hp.hnsw_ef_factor, - # dynamic_ef_max=hp.hsnw_dynamicEfMax, - # dynamic_ef_min=hp.hsnw_dynamicEfMin, - # ef=hp.hnsw_ef, - # ef_construction=hp.hnsw_ef_construction, - # filter_strategy=hp.hsnw_filterStrategy, - # flat_search_cutoff=hp.hnsw_flatSearchCutoff, - # max_connections=hp.hnsw_maxConnections, - # vector_cache_max_objects=int(hp.hnsw_vector_cache_max_objects), - # quantizer=hp.hnsw_quantizer, - # ) - # ), - Configure.NamedVectors.none( - name="clip", - vector_index_config=Configure.VectorIndex.hnsw( #https://weaviate.io/developers/weaviate/concepts/vector-index , https://weaviate.io/developers/weaviate/config-refs/schema/vector-index - distance_metric=hp.hnsw_dist_metric, #works well to compare images with different attributes such as brightness levels or sizes. - dynamic_ef_factor=hp.hnsw_ef_factor, - dynamic_ef_max=hp.hsnw_dynamicEfMax, - dynamic_ef_min=hp.hsnw_dynamicEfMin, - ef=hp.hnsw_ef, - ef_construction=hp.hnsw_ef_construction, - filter_strategy=hp.hsnw_filterStrategy, - flat_search_cutoff=hp.hnsw_flatSearchCutoff, - max_connections=hp.hnsw_maxConnections, - vector_cache_max_objects=int(hp.hnsw_vector_cache_max_objects), - quantizer=hp.hnsw_quantizer, - ) - ) - ], - reranker_config=Configure.Reranker.transformers() - ) - - logging.debug(f"Collection '{collection_name}' successfully created.") diff --git a/benchmarking/INQUIRE/weavloader/main.py b/benchmarking/INQUIRE/weavloader/main.py deleted file mode 100644 index 35b84286..00000000 --- a/benchmarking/INQUIRE/weavloader/main.py +++ /dev/null @@ -1,63 +0,0 @@ -'''Main File''' -#NOTE: This will be deployed in our cloud under k8s namespace beehive-sage -# maybe integrated with sage-data-loader. Keep in mind, I will have to -# somehow make the data loader not wait on creating an object in weaviate -# because this takes longer. 
- -import logging -import os -import time -from client import initialize_weaviate_client -import tritonclient.grpc as TritonClient -from data import load_inquire_data -from init import run - -SAMPLE_SIZE = int(os.environ.get("SAMPLE_SIZE", 0)) -WORKERS = int(os.environ.get("WORKERS", 0)) -IMAGE_BATCH_SIZE = int(os.environ.get("IMAGE_BATCH_SIZE", 100)) - -def run_load(): - ''' - Run the loading function - ''' - #init weaviate client - weaviate_client = initialize_weaviate_client() - - # Initiate Triton client - triton_client = TritonClient.InferenceServerClient(url="triton:8001") - - # create the schema - try: - run(weaviate_client) - except Exception as e: - logging.error(f"Error in run: {e}") - weaviate_client.close() - - # Start loading - try: - load_inquire_data(weaviate_client, triton_client, IMAGE_BATCH_SIZE, SAMPLE_SIZE, WORKERS) - except Exception as e: - logging.error(f"Error in load_inquire_data: {e}") - weaviate_client.close() - - #close the client - weaviate_client.close() - -if __name__ == "__main__": - - # Configure logging - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s %(message)s", - datefmt="%Y/%m/%d %H:%M:%S", - ) - - # load the data into weaviate - run_load() - - # Keep the program running when the loading is done - try: - while True: - time.sleep(10) - except (KeyboardInterrupt, SystemExit): - exit() \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/model.py b/benchmarking/INQUIRE/weavloader/model.py deleted file mode 100644 index 0a8220cc..00000000 --- a/benchmarking/INQUIRE/weavloader/model.py +++ /dev/null @@ -1,319 +0,0 @@ -'''This file contains the code to talk to Triton Inference Server''' - -import logging -from collections import OrderedDict -import tritonclient.grpc as TritonClient -import numpy as np -import HyperParameters as hp -import json - -def florence2_run_model(triton_client, task_prompt, image, text_input=""): - """ - takes in a task prompt and image, returns an answer using florence2 
base model - """ - # Prepare inputs for Triton - image_width, image_height = image.size - image_np = np.array(image).astype(np.float32) - task_prompt_bytes = task_prompt.encode("utf-8") - text_input_bytes = text_input.encode("utf-8") - - # Prepare inputs & outputs for Triton - # NOTE: if you enable max_batch_size, leading number is batch size, example [1,1] 1 is batch size - inputs = [ - TritonClient.InferInput("image", [image_height, image_width, 3], "FP32"), - TritonClient.InferInput("prompt", [1], "BYTES"), - TritonClient.InferInput("text_input", [1], "BYTES"), - TritonClient.InferInput("image_width", [1], "INT32"), - TritonClient.InferInput("image_height", [1], "INT32") - ] - outputs = [ - TritonClient.InferRequestedOutput("answer") - ] - - # Add tensors - inputs[0].set_data_from_numpy(image_np) - inputs[1].set_data_from_numpy(np.array([task_prompt_bytes], dtype="object")) - inputs[2].set_data_from_numpy(np.array([text_input_bytes], dtype="object")) - inputs[3].set_data_from_numpy(np.array([image_width], dtype="int32")) - inputs[4].set_data_from_numpy(np.array([image_height], dtype="int32")) - - # Perform inference - try: - response = triton_client.infer(model_name="florence2base", inputs=inputs, outputs=outputs) - - # Get the result - answer = response.as_numpy("answer")[0] - answer_str = answer.decode("utf-8") - - # Convert the JSON string to a dictionary - answer_dict = json.loads(answer_str) - - return answer_dict - except Exception as e: - logging.error(f"Error during Florence2 inference: {str(e)}") - return None - -def florence2_gen_caption(triton_client, image): - """ - Generate image caption using florence2 base model - """ - task_prompt = '' - - description_text = florence2_run_model(triton_client, task_prompt, image) - description_text = description_text[task_prompt] - - #takes those details from the setences and finds labels and boxes in the image - task_prompt = '' - boxed_descriptions = florence2_run_model(triton_client, task_prompt, image, 
description_text) - - #only prints out labels not bboxes - descriptions = boxed_descriptions[task_prompt]['labels'] - logging.debug(f'Labels Generated: {descriptions}') - - #finds other things in the image that the description did not explicitly say - task_prompt = '' - labels = florence2_run_model(triton_client, task_prompt, image) - - #only prints out labels not bboxes - printed_labels = labels[task_prompt]['labels'] - - # Join description_text into a single string - description_text_joined = "".join(description_text) - - #makes unique list of labels and adds commas - label_list = descriptions + printed_labels - unique_labels = list(OrderedDict.fromkeys(label_list)) - labels = ", ".join(unique_labels) - - # Combine all lists into one list - combined_list = ["DESCRIPTION:"] + [description_text_joined] + ["LABELS:"] + [labels] - - # Join the unique items into a single string with spaces between them - final_description = " ".join(combined_list) - - logging.debug(f'Final Generated Description: {final_description}') - return final_description - -def get_colbert_embedding(triton_client, text): - """ - Embed text using ColBERT encoder served via Triton Inference Server. 
- Returns token-level embeddings of shape [num_tokens, 128] - """ - # Prepare input - text_bytes = text.encode("utf-8") - input_tensor = np.array([text_bytes], dtype="object") # batch size = 1 - - # Prepare inputs & outputs for Triton - # NOTE: if you enable max_batch_size, leading number is batch size, example [1,1] 1 is batch size - inputs = [ - TritonClient.InferInput("text", input_tensor.shape, "BYTES") - ] - outputs = [ - TritonClient.InferRequestedOutput("embedding"), - TritonClient.InferRequestedOutput("token_lengths") - ] - - # Add tensors - inputs[0].set_data_from_numpy(input_tensor) - - # Run inference - try: - results = triton_client.infer(model_name="colbert", inputs=inputs, outputs=outputs) - - # Retrieve and reshape output - emb_flat = results.as_numpy("embedding") # shape: (1, max_len * 128) - token_lengths = results.as_numpy("token_lengths") # shape: (1,) - num_tokens = token_lengths[0] - - # Reshape and unpad - emb_3d = emb_flat.reshape(1, -1, 128) - token_embeddings = emb_3d[0, :num_tokens, :] # shape: [num_tokens, 128] - except Exception as e: - logging.error(f"Error during Colbert inference: {str(e)}") - return None - - return token_embeddings - -def fuse_embeddings( img_emb: np.ndarray, txt_emb: np.ndarray, alpha: float = 0.5) -> np.ndarray: - """ - Given two L2-normalized vectors img_emb and txt_emb (shape (D,)), - returns their weighted sum (alpha * img + (1-alpha) * txt), re-normalized to unit norm. 
- """ - if img_emb.shape != txt_emb.shape: - raise ValueError("img_emb and txt_emb must have the same dimension") - - # Weighted sum - combined = alpha * img_emb + (1.0 - alpha) * txt_emb - - # Re-normalize - norm = np.linalg.norm(combined) - if norm == 0.0: - # Edge case: if they cancel out exactly (unlikely), fall back to text alone - return txt_emb.copy() - return (combined / norm).astype(np.float32) - -def get_allign_embeddings(triton_client, text, image=None): - """ - Embed text and image using ALIGN encoder served via Triton Inference Server. - Returns one fused embedding created from both modalities. - """ - # --- 1. Prepare Inputs --- - text_bytes = text.encode("utf-8") - text_np = np.array([text_bytes], dtype="object") - - # Fallback image shape (e.g., placeholder 1x1 RGB) - if image is not None: - image_np = np.array(image).astype(np.float32) - else: - image_np = np.zeros((1, 1, 3), dtype=np.float32) - - # Create Triton input objects - inputs = [ - TritonClient.InferInput("text", [1], "BYTES"), - TritonClient.InferInput("image", list(image_np.shape), "FP32") - ] - - inputs[0].set_data_from_numpy(text_np) - inputs[1].set_data_from_numpy(image_np) - - outputs = [ - TritonClient.InferRequestedOutput("text_embedding"), - TritonClient.InferRequestedOutput("image_embedding") - ] - - # --- 2. Inference Call --- - try: - results = triton_client.infer(model_name="align", inputs=inputs, outputs=outputs) - text_embedding = results.as_numpy("text_embedding")[0] - image_embedding = results.as_numpy("image_embedding")[0] - except Exception as e: - logging.error(f"Error during ALIGN inference: {str(e)}") - return None - - # --- 3. Fuse Embeddings --- - if image is not None: - embedding = fuse_embeddings(image_embedding, text_embedding, alpha=hp.align_alpha) - else: - embedding = text_embedding - - return embedding - -def get_clip_embeddings(triton_client, text, image=None): - """ - Embed text and image using CLIP encoder served via Triton Inference Server. 
- Returns one fused embedding created from both modalities. - """ - # --- 1. Prepare Inputs --- - text_bytes = text.encode("utf-8") - text_np = np.array([text_bytes], dtype="object") - - # Fallback image shape (e.g., placeholder 1x1 RGB) - if image is not None: - image_np = np.array(image).astype(np.float32) - else: - image_np = np.zeros((1, 1, 3), dtype=np.float32) - - # Create Triton input objects - inputs = [ - TritonClient.InferInput("text", [1], "BYTES"), - TritonClient.InferInput("image", list(image_np.shape), "FP32") - ] - - inputs[0].set_data_from_numpy(text_np) - inputs[1].set_data_from_numpy(image_np) - - outputs = [ - TritonClient.InferRequestedOutput("text_embedding"), - TritonClient.InferRequestedOutput("image_embedding") - ] - - # --- 2. Inference Call --- - try: - results = triton_client.infer(model_name="clip", inputs=inputs, outputs=outputs) - text_embedding = results.as_numpy("text_embedding")[0] - image_embedding = results.as_numpy("image_embedding")[0] - except Exception as e: - logging.error(f"Error during CLIP inference: {str(e)}") - return None - - # --- 3. 
Fuse Embeddings --- - if image is not None: - embedding = fuse_embeddings(image_embedding, text_embedding, alpha=hp.clip_alpha) - else: - embedding = text_embedding - - return embedding - -def qwen2_5_run_model(triton_client, image, task_prompt=hp.qwen2_5_prompt): - """ - takes in a task prompt and image, returns an answer using Qwen2.5-VL model - """ - # Prepare inputs for Triton - image_width, image_height = image.size - image_np = np.array(image).astype(np.uint8) - task_prompt_bytes = task_prompt.encode("utf-8") - - # Prepare inputs & outputs for Triton - # NOTE: if you enable max_batch_size, leading number is batch size, example [1,1] 1 is batch size - inputs = [ - TritonClient.InferInput("image", [image_height, image_width, 3], "UINT8"), - TritonClient.InferInput("prompt", [1], "BYTES"), - ] - outputs = [ - TritonClient.InferRequestedOutput("answer") - ] - - # Add tensors - inputs[0].set_data_from_numpy(image_np) - inputs[1].set_data_from_numpy(np.array([task_prompt_bytes], dtype="object")) - - # Perform inference - try: - response = triton_client.infer(model_name="qwen2_5_vl", inputs=inputs, outputs=outputs) - - # Get the result - answer = response.as_numpy("answer")[0] - answer_str = answer.decode("utf-8") - - logging.debug(f'Final Generated Description: {answer_str}') - return answer_str - except Exception as e: - logging.error(f"Error during Qwen2.5-VL inference: {str(e)}") - return None - -def gemma3_run_model(triton_client, image, task_prompt=hp.gemma3_prompt): - """ - takes in a task prompt and image, returns an answer using gemma3 model - """ - # Prepare inputs for Triton - image_width, image_height = image.size - image_np = np.array(image).astype(np.uint8) - task_prompt_bytes = task_prompt.encode("utf-8") - - # Prepare inputs & outputs for Triton - # NOTE: if you enable max_batch_size, leading number is batch size, example [1,1] 1 is batch size - inputs = [ - TritonClient.InferInput("image", [image_height, image_width, 3], "UINT8"), - 
TritonClient.InferInput("prompt", [1], "BYTES"), - ] - outputs = [ - TritonClient.InferRequestedOutput("answer") - ] - - # Add tensors - inputs[0].set_data_from_numpy(image_np) - inputs[1].set_data_from_numpy(np.array([task_prompt_bytes], dtype="object")) - - # Perform inference - try: - response = triton_client.infer(model_name="gemma3", inputs=inputs, outputs=outputs) - - # Get the result - answer = response.as_numpy("answer")[0] - answer_str = answer.decode("utf-8") - - logging.debug(f'Final Generated Description: {answer_str}') - return answer_str - except Exception as e: - logging.error(f"Error during Gemma3 inference: {str(e)}") - return None \ No newline at end of file diff --git a/benchmarking/INQUIRE/weavloader/requirements.txt b/benchmarking/INQUIRE/weavloader/requirements.txt deleted file mode 100644 index 365e8eeb..00000000 --- a/benchmarking/INQUIRE/weavloader/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -weaviate_client==4.14.* #https://weaviate.io/developers/weaviate/release-notes#weaviate-core-and-client-releases -imageio==2.33.1 -sage_data_client==0.7.1 -matplotlib==3.8.2 -Pillow==10.4.* -tritonclient[grpc]==2.53.* -numpy==1.24.* -datasets==3.2.* -Requests \ No newline at end of file diff --git a/benchmarking/README.md b/benchmarking/README.md new file mode 100644 index 00000000..ed1c447c --- /dev/null +++ b/benchmarking/README.md @@ -0,0 +1,321 @@ +# Sage Image Search on NRP Benchmarks + +This repository contains benchmark implementations for evaluating vector databases and models using the [`imsearch_eval`](https://github.com/waggle-sensor/imsearch_eval) framework. 
+ +## What's in This Repository + +This repository provides: +- **Benchmark implementations** (e.g., INQUIRE) that use the `imsearch_eval` framework +- **Template system** for creating new benchmarks +- **Makefile system** for building, deploying, and managing benchmarks on NRP +- **Dockerfile templates** for containerizing benchmarks for NRP +- **Kubernetes configurations** for deploying benchmarks on NRP + +The framework code itself (interfaces, adapters, evaluator) is in the separate [`imsearch_eval`](https://github.com/waggle-sensor/imsearch_eval) package. + +The existing benchmarks are in the [imsearch_benchmarks](https://github.com/waggle-sensor/imsearch_benchmarks) repository. Some of them have been implemented here in this repository. + +## Quick Start: Creating a New Benchmark + +```bash +cd benchmarking +cp -r benchmarks/template benchmarks/MYBENCHMARK +cd benchmarks/MYBENCHMARK +# Follow instructions in README.md +``` + +The `benchmarks/template/` directory contains everything you need: +- ✅ Ready-to-use Makefile and Dockerfile.job +- ✅ Python templates for `run_benchmark.py`, `config.py`, `benchmark_dataset.py` +- ✅ Comprehensive documentation and quick start guide + +See `benchmarks/template/README.md` for detailed setup instructions, or `benchmarks/template/QUICKSTART.md` for a 5-minute guide. 
+ +## Repository Structure + +``` +benchmarking/ +├── benchmarks/ # Benchmark implementations +│ ├── template/ # Template for creating new benchmarks +│ ├── INQUIRE/ # INQUIRE benchmark implementation +│ ├── Makefile # Base Makefile (included by benchmarks) +│ ├── MAKEFILE.md # Makefile documentation +│ ├── Dockerfile.template # Base Dockerfile template +│ └── DOCKER.md # Dockerfile documentation +└── kubernetes/ # Kubernetes deployment configurations + ├── base/ # Base Kubernetes resources + └── INQUIRE/ # INQUIRE-specific Kubernetes configs +``` + +## Existing Benchmarks + +### INQUIRE + +- **Location**: `benchmarks/INQUIRE/` +- **Dataset**: INQUIRE benchmark for natural world image retrieval +- **Vector DB**: Weaviate +- **Models**: CLIP, ColBERT, ALIGN (embeddings); Gemma3, Qwen2.5-VL (captions) +- **Usage**: See `benchmarks/INQUIRE/Readme.md` + +## Creating a New Benchmark + +### Step 1: Copy Template + +```bash +cd benchmarking/benchmarks +cp -r template MYBENCHMARK +cd MYBENCHMARK +``` + +### Step 2: Implement BenchmarkDataset + +Create `benchmark_dataset.py` extending the `HuggingFaceDataset` adapter from `imsearch_eval`: + +```python +from imsearch_eval.adapters.huggingface import HuggingFaceDataset + +class MyBenchmarkDataset(HuggingFaceDataset): + """Benchmark dataset class for MYBENCHMARK.""" + + def get_query_column(self) -> str: + """Return the column name containing query text.""" + return "query" + + def get_query_id_column(self) -> str: + """Return the column name containing query IDs.""" + return "query_id" + + def get_relevance_column(self) -> str: + """Return the column name containing relevance labels (1 for relevant, 0 for not).""" + return "relevant" + + def get_metadata_columns(self) -> list: + """Return optional metadata columns to include in evaluation stats.""" + return ["category", "type"] +``` + +The `HuggingFaceDataset` adapter handles loading datasets from HuggingFace Hub. You only need to implement the column mapping methods. 
The dataset is loaded using `benchmark_dataset.load_as_dataset(split="test", sample_size=0, seed=42, token=config._hf_token)`. + +>NOTE: You can also implement new adapters for other vector databases and models. See the `imsearch_eval` repository for more information. + +### Step 3: Create config.py + +Create a `config.py` that implements the `Config` interface and loads all environment variables: + +```python +import os +from imsearch_eval.framework.interfaces import Config + +class MyConfig(Config): + def __init__(self): + self.MYBENCHMARK_DATASET = os.environ.get("MYBENCHMARK_DATASET", "your-dataset/name") + self.WEAVIATE_HOST = os.environ.get("WEAVIATE_HOST", "127.0.0.1") + # ... add more environment variables +``` + +See `benchmarks/template/config.py` and `benchmarks/INQUIRE/config.py` for examples. + +### Step 4: Create run_benchmark.py + +Create `run_benchmark.py` that combines data loading and evaluation. The script should have: + +1. A `load_data()` function that loads data into the vector database +2. A `run_evaluation()` function that runs the benchmark evaluation +3. An `upload_to_s3()` function for S3 uploads (optional) +4. 
A `main()` function that orchestrates the complete benchmark run + +```python +from config import MyConfig +from imsearch_eval import BenchmarkEvaluator, VectorDBAdapter +from imsearch_eval.adapters import WeaviateAdapter, TritonModelProvider, WeaviateQuery +from benchmark_dataset import MyBenchmarkDataset +from data_loader import MyDataLoader # Optional + +config = MyConfig() + +def load_data(data_loader, vector_db: VectorDBAdapter, hf_dataset): + """Load dataset into vector database.""" + # Create collection schema + schema_config = data_loader.get_schema_config() + vector_db.create_collection(schema_config) + + # Process and insert data + results = data_loader.process_batch(batch_size=config._image_batch_size, + dataset=hf_dataset, + workers=config._workers) + inserted = vector_db.insert_data(config._collection_name, results, + batch_size=config._image_batch_size) + +def run_evaluation(evaluator: BenchmarkEvaluator, hf_dataset): + """Run the benchmark evaluation.""" + image_results, query_evaluation = evaluator.evaluate_queries( + query_batch_size=config._query_batch_size, + dataset=hf_dataset, + workers=config._workers + ) + return image_results, query_evaluation + +def main(): + # Step 0: Set up clients and adapters + # Step 1: Call load_data(data_loader, vector_db, hf_dataset) + # Step 2: Call run_evaluation(evaluator, hf_dataset) + # Step 3: Save results (image_search_results.csv, query_eval_metrics.csv, config_values.csv) + # Step 4: Upload to S3 (optional) + pass +``` + +See `benchmarks/INQUIRE/run_benchmark.py` for a complete example. 
+ +### Step 5: Update Makefile + +Edit `Makefile` and set: +- `BENCHMARK_NAME` +- `DOCKERFILE_JOB` +- `KUSTOMIZE_DIR` +- `RESULTS_FILES` + +### Step 6: Update requirements.txt + +Add the required packages: + +```txt +# Core benchmarking framework (install with all extras needed) +imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 +imsearch_eval[triton] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 +imsearch_eval[huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 + +# S3 upload support (MinIO) +minio>=7.2.0 + +# Add other dependencies as needed (e.g., Pillow, python-dateutil) +``` + +### Step 7: Create Kubernetes Config + +```bash +cd ../../kubernetes +cp -r ../benchmarks/template/kubernetes MYBENCHMARK +cd MYBENCHMARK +# Update kustomization.yaml, env.yaml, etc. +``` + +See `benchmarks/template/README.md` for complete instructions. + +## Makefile System + +The Makefile system provides consistent commands across all benchmarks. + +### Base Makefile + +Located at `benchmarks/Makefile`, this contains reusable commands that all benchmarks inherit. + +### Benchmark Makefiles + +Each benchmark has its own `Makefile` that: +1. Sets benchmark-specific variables +2. Includes the base Makefile: `include ../Makefile` + +### Common Commands + +All benchmarks support: +- `make build` - Build Docker job image +- `make run` - Deploy and run benchmark job (loads data and evaluates) +- `make run-local` - Run benchmark locally with port-forwarding +- `make status` - Show deployment status +- `make logs` - View job logs +- `make down` - Remove deployments + +See `benchmarks/MAKEFILE.md` for detailed documentation. + +## Dockerfile System + +The Dockerfile system provides templates for consistent container builds. + +### Template Files + +- `benchmarks/Dockerfile.template` - Base template +- `benchmarks/template/Dockerfile.job` - Combined job template + +### Creating Benchmark Dockerfiles + +1. 
Copy from template: `cp benchmarks/template/Dockerfile.job benchmarks/MYBENCHMARK/` +2. Verify `CMD` line runs `run_benchmark.py` +3. Ensure `requirements.txt` includes `imsearch_eval` and `minio` packages + +See `benchmarks/DOCKER.md` for detailed documentation. + +## Kubernetes Deployment + +### Base Resources + +Located in `kubernetes/base/`, these provide common Kubernetes resources: +- `benchmark-job.yaml` - Combined job template (loads data and evaluates) +- `._s3-secret.yaml` - S3 credentials secret (use the template file as a guide) +- `._huggingface-secret.yaml` - HuggingFace token secret (use the template file as a guide) +- `kustomization.yaml` - Base kustomization config + +> **Important:** +> All secret files you actually use must be named with leading `._` per `.gitignore` and not checked into version control! Only commit the `*.template.yaml` files. + +### Benchmark-Specific Configs + +Each benchmark has its own directory under `kubernetes/` (e.g., `kubernetes/INQUIRE/`) with `nrp-dev/` and `nrp-prod/` overlays: +- `nrp-dev/` - Development environment overlay (default) + - `kustomization.yaml` - Extends base, sets images, patches + - `env.yaml` - Environment variables for dev environment +- `nrp-prod/` - Production environment overlay (optional) + - `kustomization.yaml` - Extends base, sets images, patches + - `env.yaml` - Environment variables for prod environment + +### Deployment Workflow + +1. **Build image**: `make build` (in benchmark directory) +2. **Run benchmark**: `make run` (deploys and runs the benchmark job) +3. **Monitor**: `make logs` +4. **Status**: `make status` + +See `kubernetes/README.md` for detailed Kubernetes documentation. 
+ +## Template Directory + +The `benchmarks/template/` directory provides a complete starting point for new benchmarks: + +- **README.md**: Comprehensive guide for creating new benchmarks +- **QUICKSTART.md**: 5-minute quick start guide +- **Makefile**: Template with all required variables +- **Dockerfile.job**: Ready-to-use combined job Dockerfile +- **Python Templates**: Template files for `run_benchmark.py`, `config.py`, `benchmark_dataset.py` +- **requirements.txt**: Base dependencies including `imsearch_eval` and `minio` +- **kubernetes/**: Complete Kubernetes template + +## Dependencies + +All benchmarks depend on the [`imsearch_eval`](https://github.com/waggle-sensor/imsearch_eval) package, which provides: +- Abstract interfaces (`VectorDBAdapter`, `ModelProvider`, `Query`, `BenchmarkDataset`, etc.) +- Evaluation logic (`BenchmarkEvaluator`) +- Shared adapters (`WeaviateAdapter`, `TritonModelProvider`, etc.) + +Install it via: +```bash +# Install with all extras needed for benchmarks +pip install "imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" +pip install "imsearch_eval[triton] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" +pip install "imsearch_eval[huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" +``` + +Or install all at once: +```bash +pip install "imsearch_eval[weaviate,triton,huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" +``` + +See the [`imsearch_eval` README](https://github.com/waggle-sensor/imsearch_eval) for framework documentation. 
+ +## Documentation + +- **Framework Documentation**: See [`imsearch_eval` repository](https://github.com/waggle-sensor/imsearch_eval) +- **Makefile System**: `benchmarks/MAKEFILE.md` +- **Dockerfile System**: `benchmarks/DOCKER.md` +- **Kubernetes**: `kubernetes/README.md` +- **Template Guide**: `benchmarks/template/README.md` +- **Quick Start**: `benchmarks/template/QUICKSTART.md` +- **INQUIRE Benchmark**: `benchmarks/INQUIRE/Readme.md` diff --git a/benchmarking/benchmarks/DOCKER.md b/benchmarking/benchmarks/DOCKER.md new file mode 100644 index 00000000..1f48f7ce --- /dev/null +++ b/benchmarking/benchmarks/DOCKER.md @@ -0,0 +1,160 @@ +# Benchmark Dockerfile Guide + +## Overview + +The benchmarking framework provides a Dockerfile template that can be reused across all benchmarks. Since Dockerfiles must be in the benchmark directory for the build context, each benchmark creates its own Dockerfile based on the template. + +## Template + +The base template is located at `benchmarks/template/Dockerfile.job`. This template provides: + +- Python 3.11 base image (configurable via `PYTHON_VERSION` ARG) +- System dependencies (build-essential, git, procps) +- Requirements installation +- Application code copying +- Entrypoint that runs `run_benchmark.py` + +## Creating Dockerfile for a New Benchmark + +### Step 1: Copy the Template + +Copy the template to your benchmark directory: + +```bash +cd benchmarks/MYBENCHMARK +cp ../template/Dockerfile.job Dockerfile.job +``` + +Or use the complete template directory: + +```bash +cd benchmarks +cp -r template MYBENCHMARK +cd MYBENCHMARK +# Customize the files as needed +``` + +### Step 2: Verify the Entrypoint + +The `Dockerfile.job` should run the combined benchmark script: + +```dockerfile +# Run combined benchmark script +CMD ["python", "run_benchmark.py"] +``` + +This is already set in the template, so usually no changes are needed. 
+ +### Step 3: Ensure requirements.txt is Complete + +Make sure your `requirements.txt` includes: +- `imsearch_eval[weaviate]` - Core benchmarking framework +- `minio>=7.2.0` - S3 upload support +- Any benchmark-specific dependencies + +## Dockerfile Structure + +A typical `Dockerfile.job` looks like: + +```dockerfile +# MYBENCHMARK Benchmark Job Dockerfile +# Combined Dockerfile for running both data loading and evaluation + +ARG PYTHON_VERSION=3.11-slim +FROM python:${PYTHON_VERSION} + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + procps \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Run combined benchmark script +CMD ["python", "run_benchmark.py"] +``` + +## Building Images + +### Local Build + +```bash +cd benchmarks/MYBENCHMARK +docker build -f Dockerfile.job -t benchmark-mybenchmark-job:latest . +``` + +### Using Makefile + +```bash +cd benchmarks/MYBENCHMARK +make build +``` + +This will build the image using the `DOCKERFILE_JOB` specified in the Makefile. + +### Using GitHub Actions + +The GitHub Actions workflow (`.github/workflows/benchmarking.yml`) automatically builds and pushes images when: +- Changes are pushed to `main` branch +- Tags are created +- Pull requests are opened + +The workflow builds `Dockerfile.job` for each benchmark. + +## Image Naming Convention + +Images should follow this naming pattern: +- `benchmark-{benchmark-name}-job:latest` + +For example: +- `benchmark-inquire-job:latest` +- `benchmark-mybenchmark-job:latest` + +## Registry + +Images are typically pushed to: +- `gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-{name}-job:latest` + +Update the `REGISTRY` variable in your Makefile if using a different registry. + +## Best Practices + +1. 
**Keep Dockerfiles Simple**: The template is already optimized, avoid unnecessary changes +2. **Pin Dependencies**: Use specific versions in `requirements.txt` when possible +3. **Multi-stage Builds**: Not needed for benchmarks, but can be used if image size is a concern +4. **Layer Caching**: The template is structured to maximize Docker layer caching +5. **Security**: Keep base images updated and avoid running as root if possible + +## Troubleshooting + +### Build Fails with "Module not found" + +Ensure all dependencies are in `requirements.txt` and the file is copied before `pip install`. + +### Build is Slow + +- Check if Docker layer caching is working +- Consider using a local Docker registry for faster builds +- Ensure `requirements.txt` is copied before application code (for better caching) + +### Image is Too Large + +- Use `python:3.11-slim` base image (already in template) +- Remove unnecessary system packages after installation +- Consider multi-stage builds if needed + +## See Also + +- `template/Dockerfile.job` - Complete template example +- `INQUIRE/Dockerfile.job` - Real-world example +- `../README.md` - Framework overview +- `../MAKEFILE.md` - Makefile documentation diff --git a/benchmarking/benchmarks/Dockerfile.template b/benchmarking/benchmarks/Dockerfile.template new file mode 100644 index 00000000..4b37d64b --- /dev/null +++ b/benchmarking/benchmarks/Dockerfile.template @@ -0,0 +1,41 @@ +# Benchmark Dockerfile Template +# Copy this file to your benchmark directory and customize as needed. +# +# Usage: +# 1. Copy this file to your benchmark directory as Dockerfile.benchmark or Dockerfile.data_loader +# 2. Update the CMD line with your entrypoint script (e.g., "main.py" or "load_data.py") +# 3. 
Ensure requirements.txt includes imsearch-eval package +# +# Example for Dockerfile.benchmark: +# CMD ["python", "main.py"] +# +# Example for Dockerfile.data_loader: +# CMD ["python", "load_data.py"] + +ARG PYTHON_VERSION=3.11-slim +FROM python:${PYTHON_VERSION} + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + procps \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Note: The imsearch-eval package is installed via requirements.txt +# No need to set PYTHONPATH as the package is installed in the Python environment + +# Set your entrypoint script here +# For evaluator: CMD ["python", "main.py"] +# For data loader: CMD ["python", "load_data.py"] +CMD ["python", "main.py"] + diff --git a/benchmarking/benchmarks/INQUIRE/Dockerfile.job b/benchmarking/benchmarks/INQUIRE/Dockerfile.job new file mode 100644 index 00000000..b40d7402 --- /dev/null +++ b/benchmarking/benchmarks/INQUIRE/Dockerfile.job @@ -0,0 +1,25 @@ +# INQUIRE Benchmark Job Dockerfile +# Combined Dockerfile for running both data loading and evaluation + +ARG PYTHON_VERSION=3.11-slim +FROM python:${PYTHON_VERSION} + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + procps \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . 
+ +# Run combined benchmark script +CMD ["python", "run_benchmark.py"] + diff --git a/benchmarking/benchmarks/INQUIRE/Makefile b/benchmarking/benchmarks/INQUIRE/Makefile new file mode 100644 index 00000000..6994f9f1 --- /dev/null +++ b/benchmarking/benchmarks/INQUIRE/Makefile @@ -0,0 +1,29 @@ +# INQUIRE Benchmark Makefile +# This file sets INQUIRE-specific variables and includes the base benchmarking Makefile + +# ============================================================================ +# Required Variables (must be set for base Makefile) +# ============================================================================ +BENCHMARK_NAME := inquire +DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/INQUIRE/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/INQUIRE/nrp-dev +endif + +# ============================================================================ +# Optional Variables (can be overridden) +# ============================================================================ +KUBECTL_NAMESPACE := sage +KUBECTL_CONTEXT ?= nautilus +REGISTRY := gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search +JOB_TAG ?= latest + +# Local run script +RUN_SCRIPT := run_benchmark.py + +# Include the base Makefile (after setting variables) +include ../Makefile diff --git a/benchmarking/benchmarks/INQUIRE/Readme.md b/benchmarking/benchmarks/INQUIRE/Readme.md new file mode 100644 index 00000000..2c0b652e --- /dev/null +++ b/benchmarking/benchmarks/INQUIRE/Readme.md @@ -0,0 +1,272 @@ +# INQUIRE Benchmark + +Here we use [INQUIRE](https://github.com/inquire-benchmark/INQUIRE) with Weaviate as the vector database for benchmarking. Different models were used to generate captions and keywords for the images. Also different models were used to generate the embeddings for the images. 
+ +## Usage + +This benchmark is intended to be used in conjunction with [Sage Image Search](../../../kubernetes/base/). The Makefile references components that are deployed in [Sage Image Search](../../../kubernetes/base/) and deploys additional containers that are used to run the INQUIRE Benchmark. + +## Running the Example + +### Prerequisites +To run this example, you'll need: +- **Kubernetes cluster** access with `kubectl` configured +- **kustomize** (or kubectl with kustomize support) +- **Docker** for building images +- **Weaviate and Triton** deployed (from `kubernetes/nrp-dev` or `kubernetes/nrp-prod` depending on the environment you want to use) + +### Step-by-Step Setup + +1. **Deploy Sage Image Search Infrastructure**: + - Navigate to the main [kubernetes](../../../kubernetes) directory and deploy base services: + ```bash + kubectl apply -k nrp-dev # or nrp-prod + ``` + +2. **Build and Push Images**: + - Build the benchmark image: + ```bash + cd benchmarking/benchmarks/INQUIRE + make build + ``` + - Push to registry (update registry in Makefile if needed): + ```bash + docker push gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-inquire-job:latest + ``` +>NOTE: You can also use GitHub Actions to build and push the images to the registry. See `.github/workflows/benchmarking.yml` for more details. + +3. **Run INQUIRE Benchmark**: + - Deploy and run the complete benchmark (loads data and evaluates): + ```bash + make run # defaults to dev environment + ``` + - Monitor progress: + ```bash + make logs + ``` + >NOTE: This loads [INQUIRE-Benchmark-small](https://huggingface.co/datasets/sagecontinuum/INQUIRE-Benchmark-small) into Weaviate, runs the evaluation, and saves results. + +4. **Run Locally (Development)**: + - For local development with port-forwarding: + ```bash + make run-local + ``` + - This will automatically set up port-forwarding and run the benchmark locally. 
+ +### Results + +Once the benchmark is run, three CSV files will be generated: +- **`image_search_results.csv`**: Metadata of all images returned by Weaviate when different queries were being run +- **`query_eval_metrics.csv`**: Calculated evaluation metrics (NDCG, precision, recall, etc.) based on images returned by different queries +- **`config_values.csv`**: Configuration values used for the benchmark run (generated via `config.to_csv()`) + +Results are saved to `/app/results` when running in Kubernetes (with volume mount), or to the current directory when running locally with `make run-local`. Results can also be automatically uploaded to S3 if configured (with timestamps: `{S3_PREFIX}/{timestamp}/{filename}`). + +## References +- [Weaviate Blog: NDCG](https://weaviate.io/blog/retrieval-evaluation-metrics#normalized-discounted-cumulative-gain-ndcg) +- [RAG Evaluation](https://weaviate.io/blog/rag-evaluation) +- [Scikit-Learn NDCG](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html) +- [A Guide on NDCG](https://www.aporia.com/learn/a-practical-guide-to-normalized-discounted-cumulative-gain-ndcg/) +- [Weaviate: Batch import](https://weaviate.io/developers/weaviate/manage-data/import) +- [Weaviate: Imports in Detail](https://weaviate.io/developers/weaviate/tutorials/import#data-import---best-practices) +- [INQUIRE](https://inquire-benchmark.github.io/) +- [Huggingface: Fine-tuning Florence2](https://huggingface.co/blog/finetune-florence2) +- [Medium: Fine-tuning Florence2](https://medium.com/@amit25173/fine-tuning-florence-2-aa9c99b2a83d) + +## Citation +``` +@article{vendrow2024inquire, + title={INQUIRE: A Natural World Text-to-Image Retrieval Benchmark}, + author={Vendrow, Edward and Pantazis, Omiros and Shepard, Alexander and Brostow, Gabriel and Jones, Kate E and Mac Aodha, Oisin and Beery, Sara and Van Horn, Grant}, + journal={NeurIPS}, + year={2024}, +} +``` + +# INQUIRE Benchmark Structure + +## Overview + +INQUIRE is a 
benchmark instance that uses the abstract benchmarking framework provided by the `imsearch-eval` Python package. The framework is installed from GitHub: https://github.com/waggle-sensor/imsearch_eval. This instance uses the INQUIRE dataset with Weaviate as the vector database. + +## Directory Structure + +``` +benchmarking/ +└── benchmarks/ + └── INQUIRE/ # INQUIRE benchmark instance + ├── benchmark_dataset.py # INQUIRE-specific benchmark dataset (BenchmarkDataset) + ├── data_loader.py # INQUIRE-specific data loader (DataLoader) + ├── config.py # INQUIRE-specific configuration (Config) + ├── run_benchmark.py # Main script (loads data and evaluates) + ├── requirements.txt # Dependencies including imsearch-eval package + ├── Dockerfile.job # Dockerfile for the combined job + ├── Makefile # Makefile for building and deploying + └── Readme.md # INQUIRE-specific instructions + +The framework and adapters are provided by the imsearch-eval package: +- Repository: https://github.com/waggle-sensor/imsearch_eval +- Package: imsearch_eval[weaviate] +- Installation: pip install "imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@main" +``` + +## Key Components + +### 1. Config Class (`config.py`) + +Implements `Config` interface for INQUIRE benchmark: +- Loads all environment variables (dataset, collection, S3 settings, etc.) +- Defines Weaviate HNSW hyperparameters +- Defines model and query hyperparameters +- Provides caption prompts for different models + +### 2. Benchmark Dataset Class (`benchmark_dataset.py`) + +Extends `HuggingFaceDataset` adapter for INQUIRE dataset: +- Extends `HuggingFaceDataset` from `imsearch_eval.adapters.huggingface` +- Loads from HuggingFace: `sagecontinuum/INQUIRE-Benchmark-small` (via `load_as_dataset()`) +- Defines column mappings: `query`, `query_id`, `relevant` +- Provides metadata columns: `category`, `supercategory`, `iconic_group` + +### 3. 
Data Loader Class (`data_loader.py`) + +Implements `DataLoader` interface for INQUIRE dataset: +- Processes INQUIRE dataset items +- Generates captions using model provider +- Generates CLIP embeddings +- Returns formatted data for Weaviate insertion +- Provides schema configuration for Weaviate collection + +### 4. Main Script (`run_benchmark.py`) + +Combined script that: +1. **Step 0**: Sets up benchmark environment (initializes clients and adapters) +2. **Step 1**: Loads data into vector database (calls `load_data()` function) +3. **Step 2**: Runs evaluation (calls `run_evaluation()` function) +4. **Step 3**: Saves results locally +5. **Step 4**: Optionally uploads results to S3 + +The script uses a `config` object (instance of `INQUIREConfig`) to access all configuration values. + +### 5. Shared Adapters (from `imsearch-eval` package) + +**WeaviateAdapter and WeaviateQuery**: +- Provided by `imsearch_eval.adapters.weaviate` +- `WeaviateQuery`: Independent implementation of Weaviate query methods +- `WeaviateAdapter`: Uses `WeaviateQuery` for search operations +- Supports query methods: `clip_hybrid_query`, `hybrid_query`, `colbert_query`, etc. 
+- Implements `init_client()` class method for client initialization +- Import: `from imsearch_eval.adapters import WeaviateAdapter, WeaviateQuery` + +**TritonModelProvider and TritonModelUtils**: +- Provided by `imsearch_eval.adapters.triton` +- `TritonModelUtils`: Implements `ModelUtils` interface +- `TritonModelProvider`: Uses `TritonModelUtils` for model operations +- Supports: CLIP, ColBERT, ALIGN embeddings +- Supports: Gemma3, Qwen2.5-VL captioning +- Import: `from imsearch_eval.adapters import TritonModelProvider, TritonModelUtils` + +## Usage + +### Running on Kubernetes + +```bash +cd benchmarking/benchmarks/INQUIRE +make build # Build Docker image +make run # Deploy and run benchmark job +make logs # Monitor logs +``` + +### Running Locally + +```bash +cd benchmarking/benchmarks/INQUIRE +make run-local # Runs with automatic port-forwarding +``` + +## Environment Variables + +All environment variables are loaded through the `INQUIREConfig` class in `config.py`: + +**Dataset Parameters:** +- `INQUIRE_DATASET`: HuggingFace dataset name (default: `sagecontinuum/INQUIRE-Benchmark-small`) +- `SAMPLE_SIZE`: Number of samples to use (0 = all, default: 0) +- `SEED`: Random seed for sampling (default: 42) +- `HF_TOKEN`: HuggingFace token (from secret, optional) + +**Vector DB Parameters:** +- `WEAVIATE_HOST`: Weaviate host (default: 127.0.0.1) +- `WEAVIATE_PORT`: Weaviate HTTP port (default: 8080) +- `WEAVIATE_GRPC_PORT`: Weaviate gRPC port (default: 50051) +- `COLLECTION_NAME`: Weaviate collection (default: INQUIRE) + +**Inference Server Parameters:** +- `TRITON_HOST`: Triton host (default: triton) +- `TRITON_PORT`: Triton port (default: 8001) + +**Processing Parameters:** +- `WORKERS`: Number of parallel workers (default: 5) +- `IMAGE_BATCH_SIZE`: Batch size for processing images (default: 25) +- `QUERY_BATCH_SIZE`: Batch size for parallel queries (default: 5) + +**Query Parameters:** +- `QUERY_METHOD`: Query method to use (default: clip_hybrid_query) +- 
`TARGET_VECTOR`: Target vector name (default: clip) +- `RESPONSE_LIMIT`: Maximum number of results to return (default: 50) +- `QUERY_ALPHA`: Hybrid query alpha parameter (default: 0.4) +- `CLIP_ALPHA`: CLIP alpha parameter (default: 0.7) +- `AUTOCUT_JUMPS`: Autocut jumps (default: 0) +- `RERANK_PROP`: Property to use for reranking (default: caption) + +**HNSW Hyperparameters:** +- `HNSW_DIST_METRIC`: Distance metric (default: COSINE) +- `HNSW_EF`: EF parameter (default: -1) +- `HNSW_EF_CONSTRUCTION`: EF construction (default: 100) +- `HNSW_MAX_CONNECTIONS`: Max connections (default: 50) +- And more... (see `config.py` for full list) + +**S3 Upload Parameters:** +- `UPLOAD_TO_S3`: Enable S3 upload (default: false) +- `S3_BUCKET`: S3 bucket name (default: sage_imsearch) +- `S3_PREFIX`: S3 prefix for uploaded files (default: dev-metrics) +- `S3_ENDPOINT`: S3 endpoint URL (default: http://rook-ceph-rgw-nautiluss3.rook) +- `S3_ACCESS_KEY`: S3 access key (from secret) +- `S3_SECRET_KEY`: S3 secret key (from secret) +- `S3_SECURE`: Use TLS for S3 (default: false) + +**Results Files:** +- `IMAGE_RESULTS_FILE`: Image results filename (default: image_search_results.csv) +- `QUERY_EVAL_METRICS_FILE`: Query metrics filename (default: query_eval_metrics.csv) +- `CONFIG_VALUES_FILE`: Config values filename (default: config_values.csv) + +**Logging:** +- `LOG_LEVEL`: Logging level (default: INFO) + +## Extending INQUIRE + +To add new components to INQUIRE: + +1. **New Vector DB**: Add adapter to the `imsearch-eval` package (contribute to the repository) +2. **New Model**: Add provider to the `imsearch-eval` package (contribute to the repository) +3. **New Query Method**: Add method to `WeaviateQuery` in the `imsearch-eval` package +4. **New Model Function**: Add method to `TritonModelUtils` in the `imsearch-eval` package or implement `ModelUtils` interface +5. 
**New Dataset**: Create a benchmark dataset class in the INQUIRE directory (benchmark-specific) + +## Framework Package + +The abstract framework and adapters are provided by the `imsearch-eval` Python package: + +- **Repository**: https://github.com/waggle-sensor/imsearch_eval +- **Package**: `imsearch_eval[weaviate,triton,huggingface]` +- **Installation**: + ```bash + pip install "imsearch_eval[weaviate,triton,huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" + ``` + +This allows: +- Multiple benchmark instances to share framework and adapter code +- Framework and adapter updates to benefit all benchmarks +- Clear separation between shared code and instance-specific code +- Easy reuse of adapters across different benchmarks +- **Independence from `app/`**: All functions live in the framework package, so benchmarks won't break when `app/` changes +- **Easy distribution**: Benchmarks can be used in any environment by installing the package diff --git a/benchmarking/benchmarks/INQUIRE/benchmark_dataset.py b/benchmarking/benchmarks/INQUIRE/benchmark_dataset.py new file mode 100644 index 00000000..77cf1001 --- /dev/null +++ b/benchmarking/benchmarks/INQUIRE/benchmark_dataset.py @@ -0,0 +1,22 @@ +"""INQUIRE benchmark dataset implementation.""" +from imsearch_eval.adapters.huggingface import HuggingFaceDataset + +class INQUIRE(HuggingFaceDataset): + """Benchmark dataset class for INQUIRE dataset.""" + + def get_query_column(self) -> str: + """Get the name of the column containing the query text.""" + return "query" + + def get_query_id_column(self) -> str: + """Get the name of the column containing the query ID.""" + return "query_id" + + def get_relevance_column(self) -> str: + """Get the name of the column containing relevance labels.""" + return "relevant" + + def get_metadata_columns(self) -> list: + """Get optional metadata columns to include in evaluation stats.""" + return ["category", "supercategory", "iconic_group"] + diff --git 
# ---- benchmarking/benchmarks/INQUIRE/config.py ----
"""INQUIRE-specific configuration/hyperparameters."""

import os
from weaviate.classes.config import VectorDistances, Configure
from weaviate.collections.classes.config_vector_index import VectorFilterStrategy

from imsearch_eval.framework.interfaces import Config


class INQUIREConfig(Config):
    """Configuration for the INQUIRE benchmark.

    Every setting is read from an environment variable once, when the object
    is constructed, and falls back to the default written next to it.

    The attribute names are read by ``data_loader.py`` and ``run_benchmark.py``
    and must not be renamed here. That includes the ``hsnw_*`` attributes
    (``hsnw_dynamicEfMax``, ``hsnw_dynamicEfMin``, ``hsnw_filterStrategy``):
    the spelling is a typo for ``hnsw`` but is kept for compatibility.
    """

    def __init__(self):
        """Read all settings from the environment.

        Raises:
            ValueError: if a numeric variable holds a non-numeric string.
            AttributeError: if ``HNSW_DIST_METRIC`` or ``HNSW_FILTER_STRATEGY``
                does not name a member of the corresponding Weaviate enum.
        """
        # dataset parameters
        self.inquire_dataset = os.environ.get("INQUIRE_DATASET", "sagecontinuum/INQUIRE-Benchmark-small")
        self.sample_size = int(os.environ.get("SAMPLE_SIZE", 0))
        self.seed = int(os.environ.get("SEED", 42))
        self._hf_token = os.environ.get("HF_TOKEN", "")

        # Upload parameters
        # Boolean flags are true only for the literal string "true" (any case).
        self._upload_to_s3 = os.environ.get("UPLOAD_TO_S3", "false").lower() == "true"
        self._s3_bucket = os.environ.get("S3_BUCKET", "sage_imsearch")
        self._s3_prefix = os.environ.get("S3_PREFIX", "dev-metrics")
        self._s3_endpoint = os.environ.get("S3_ENDPOINT", "http://rook-ceph-rgw-nautiluss3.rook")
        self._s3_access_key = os.environ.get("S3_ACCESS_KEY", "")
        self._s3_secret_key = os.environ.get("S3_SECRET_KEY", "")
        self._s3_secure = os.environ.get("S3_SECURE", "false").lower() == "true"
        self._image_results_file = os.environ.get("IMAGE_RESULTS_FILE", "image_search_results.csv")
        self._query_eval_metrics_file = os.environ.get("QUERY_EVAL_METRICS_FILE", "query_eval_metrics.csv")
        self._config_values_file = os.environ.get("CONFIG_VALUES_FILE", "config_values.csv")

        # Weaviate parameters (ports are kept as strings, exactly as read)
        self._weaviate_host = os.environ.get("WEAVIATE_HOST", "127.0.0.1")
        self._weaviate_port = os.environ.get("WEAVIATE_PORT", "8080")
        self._weaviate_grpc_port = os.environ.get("WEAVIATE_GRPC_PORT", "50051")
        self._collection_name = os.environ.get("COLLECTION_NAME", "INQUIRE")

        # Triton parameters
        self._triton_host = os.environ.get("TRITON_HOST", "triton")
        self._triton_port = os.environ.get("TRITON_PORT", "8001")

        # Workers parameters
        self._workers = int(os.environ.get("WORKERS", 5))
        self._image_batch_size = int(os.environ.get("IMAGE_BATCH_SIZE", 25))
        self._query_batch_size = int(os.environ.get("QUERY_BATCH_SIZE", 5))

        # Logging parameters
        self._log_level = os.environ.get("LOG_LEVEL", "INFO").upper()

        # Weaviate HNSW hyperparameters
        self.hnsw_dist_metric = getattr(VectorDistances, os.environ.get("HNSW_DIST_METRIC", "COSINE").upper())
        self.hnsw_ef = int(os.environ.get("HNSW_EF", -1))
        self.hnsw_ef_construction = int(os.environ.get("HNSW_EF_CONSTRUCTION", 100))
        self.hnsw_maxConnections = int(os.environ.get("HNSW_MAX_CONNECTIONS", 50))
        self.hsnw_dynamicEfMax = int(os.environ.get("HNSW_DYNAMIC_EF_MAX", 500))
        self.hsnw_dynamicEfMin = int(os.environ.get("HNSW_DYNAMIC_EF_MIN", 200))
        self.hnsw_ef_factor = int(os.environ.get("HNSW_EF_FACTOR", 20))
        self.hsnw_filterStrategy = getattr(VectorFilterStrategy, os.environ.get("HNSW_FILTER_STRATEGY", "ACORN").upper())
        self.hnsw_flatSearchCutoff = int(os.environ.get("HNSW_FLAT_SEARCH_CUTOFF", 40000))
        # The default is written in float notation, so accept the same notation
        # from the environment: int("1e12") raises, int(float("1e12")) does not.
        self.hnsw_vector_cache_max_objects = int(float(os.environ.get("HNSW_VECTOR_CACHE_MAX_OBJECTS", 1e12)))
        self.hnsw_quantizer = Configure.VectorIndex.Quantizer.pq(
            training_limit=int(os.environ.get("HNSW_QUANTIZER_TRAINING_LIMIT", 500000))
        )

        # Query parameters
        self.query_method = os.environ.get("QUERY_METHOD", "clip_hybrid_query")
        self.target_vector = os.environ.get("TARGET_VECTOR", "clip")
        self.response_limit = int(os.environ.get("RESPONSE_LIMIT", 50))
        self.advanced_query_parameters = {
            "alpha": float(os.environ.get("QUERY_ALPHA", 0.4)),
            "query_properties": ["caption"],
            "autocut_jumps": int(os.environ.get("AUTOCUT_JUMPS", 0)),
            "rerank_prop": os.environ.get("RERANK_PROP", "caption"),
            "clip_alpha": float(os.environ.get("CLIP_ALPHA", 0.7)),
        }

        # Caption prompts
        # NOTE(review): this text was reconstructed from a whitespace-flattened
        # diff; inner indentation and any <placeholder> tokens after "caption:"
        # and "keywords:" could not be recovered -- confirm against the repo.
        default_prompt = """
role:
You are a world-class Scientific Image Captioning Expert.

context:
You will be shown a scientific image captured by edge devices. Your goal is to analyze its content and significance in detail.

task:
Generate exactly one scientifically detailed caption that accurately describes what is visible in the image and its scientific relevance.
Make it as detailed as possible. Also extract text and numbers from the images.

constraints:
- Only return:
  1. A single caption.
  2. a list of 15 keywords relevant to the image.
- Do not include any additional text, explanations, or formatting.

format:
  caption:
  keywords: , , ...
"""
        self.gemma3_prompt = os.environ.get("GEMMA3_PROMPT", default_prompt)


# ---- benchmarking/benchmarks/INQUIRE/data_loader.py ----
"""INQUIRE-specific data loader for loading data into vector databases."""

import os
import logging
import random
from dateutil.parser import parse
from io import BytesIO, BufferedReader
from typing import Optional
from PIL import Image
import weaviate
from weaviate.classes.data import GeoCoordinate
from imsearch_eval.framework.interfaces import DataLoader


def _to_date_string(raw_date, filename):
    """Convert a dataset date value to an ISO-8601 string, or None if absent."""
    if not raw_date:
        # A missing date is stored as an absent property, not as "".
        return None
    if hasattr(raw_date, "isoformat"):
        # Already a date/datetime object; dateutil's parse() only takes text.
        return raw_date.isoformat()
    try:
        # NOTE(review): a value without a UTC offset stays offset-less here;
        # confirm the dataset's dates carry an offset, as RFC 3339 requires.
        return parse(raw_date).isoformat()
    except Exception as e:
        logging.error(f"Error parsing date for image {filename}: {e}")
        return str(raw_date).replace(" ", "T")


def _to_geo_coordinate(lat, lon):
    """Build a GeoCoordinate, or return None when either coordinate is absent."""
    # Test for absence explicitly: 0.0 is a valid latitude (equator) and a
    # valid longitude (prime meridian), so a truthiness test would drop it.
    if lat is None or lon is None or lat == "" or lon == "":
        return None
    latitude, longitude = float(lat), float(lon)
    if latitude != latitude or longitude != longitude:
        # NaN compares unequal to itself; treat it as "no location".
        return None
    return GeoCoordinate(latitude=latitude, longitude=longitude)


class INQUIREDataLoader(DataLoader):
    """Data loader for INQUIRE dataset."""

    def process_item(
        self,
        item: dict
    ) -> Optional[dict]:
        """
        Process a single INQUIRE dataset item.

        Args:
            item: Dictionary containing INQUIRE dataset item; ``item["image"]``
                must be a ``PIL.Image.Image``.
        Returns:
            Dictionary with 'properties' and 'vector' keys for Weaviate
            insertion, or None if the item could not be processed (the error
            is logged, not raised).
        """
        try:
            if not isinstance(item, dict):
                raise TypeError(f"Expected dict, got {type(item)}")

            if not isinstance(item.get("image"), Image.Image):
                raise TypeError(f"Expected PIL.Image, got {type(item.get('image'))}")

            image = item["image"]
            filename = item.get("inat24_file_name", "")

            logging.debug(f"Processing item: {filename}")

            # Extract metadata
            query = item.get("query", "")
            query_id = item.get("query_id", 0)
            relevant = item.get("relevant", 0)
            clip_score = item.get("clip_score", 0.0)
            inat_id = item.get("inat24_image_id", 0)
            supercategory = item.get("supercategory", "")
            category = item.get("category", "")
            iconic_group = item.get("iconic_group", "")
            species_id = item.get("inat24_species_id", 0)
            species_name = item.get("inat24_species_name", "")
            location_uncertainty = item.get("location_uncertainty", 0)
            lat = item.get("latitude", None)
            lon = item.get("longitude", None)
            raw_date = item.get("date", "")

            # Parse date
            date_rfc3339 = _to_date_string(raw_date, filename)

            # Convert image to BytesIO for encoding
            image_stream = BytesIO()
            image.save(image_stream, format="JPEG")
            image_stream.seek(0)

            # Encode image for Weaviate
            buffered_stream = BufferedReader(image_stream)
            encoded_image = weaviate.util.image_encoder_b64(buffered_stream)

            # Generate caption using model provider
            caption = self.model_provider.generate_caption(image, self.config.gemma3_prompt, model_name="gemma3")

            if not caption:
                caption = ""  # Fallback if caption generation fails

            # Generate CLIP embeddings
            clip_embedding = self.model_provider.get_embedding(caption, image=image, model_name="clip")
            if clip_embedding is None:
                raise ValueError("Failed to generate CLIP embedding")

            # Construct properties and vector
            properties = {
                "inat24_image_id": inat_id,
                "inat24_file_name": filename,
                "query": query,
                "query_id": query_id,
                "image": encoded_image,
                "caption": caption,
                "relevant": relevant,
                "clip_score": clip_score,
                "supercategory": supercategory,
                "category": category,
                "iconic_group": iconic_group,
                "inat24_species_id": species_id,
                "inat24_species_name": species_name,
                "location_uncertainty": location_uncertainty,
                "date": date_rfc3339,
                "location": _to_geo_coordinate(lat, lon),
            }

            return {
                "properties": properties,
                "vector": {"clip": clip_embedding}
            }

        except Exception as e:
            # `item` may not be a dict (that is one of the errors raised above),
            # so it cannot be assumed to have .get() here.
            item_name = item.get("inat24_file_name", "unknown") if isinstance(item, dict) else "unknown"
            logging.error(f"Error processing item {item_name}: {e}")
            return None

    def get_schema_config(self) -> dict:
        """
        Get Weaviate schema configuration for INQUIRE collection.

        The collection name, named-vector name and HNSW settings all come
        from ``self.config`` so the schema matches what the benchmark queries.

        Returns:
            Dictionary containing schema configuration
        """
        from weaviate.classes.config import Configure, Property, DataType

        cfg = self.config
        return {
            "name": cfg._collection_name,
            "description": "A collection to test our set up using INQUIRE with Weaviate",
            "properties": [
                Property(name="inat24_image_id", data_type=DataType.NUMBER),
                Property(name="inat24_file_name", data_type=DataType.TEXT),
                Property(name="query", data_type=DataType.TEXT),
                Property(name="query_id", data_type=DataType.NUMBER),
                Property(name="image", data_type=DataType.BLOB),
                Property(name="audio", data_type=DataType.BLOB),
                Property(name="video", data_type=DataType.BLOB),
                Property(name="caption", data_type=DataType.TEXT),
                Property(name="relevant", data_type=DataType.NUMBER),
                Property(name="clip_score", data_type=DataType.NUMBER),
                Property(name="supercategory", data_type=DataType.TEXT),
                Property(name="category", data_type=DataType.TEXT),
                Property(name="iconic_group", data_type=DataType.TEXT),
                Property(name="inat24_species_id", data_type=DataType.NUMBER),
                Property(name="inat24_species_name", data_type=DataType.TEXT),
                Property(name="location_uncertainty", data_type=DataType.NUMBER),
                Property(name="date", data_type=DataType.DATE),
                Property(name="location", data_type=DataType.GEO_COORDINATES)
            ],
            "vectorizer_config": [
                Configure.NamedVectors.none(
                    name=cfg.target_vector,
                    vector_index_config=Configure.VectorIndex.hnsw(
                        distance_metric=cfg.hnsw_dist_metric,
                        dynamic_ef_factor=cfg.hnsw_ef_factor,
                        dynamic_ef_max=cfg.hsnw_dynamicEfMax,
                        dynamic_ef_min=cfg.hsnw_dynamicEfMin,
                        ef=cfg.hnsw_ef,
                        ef_construction=cfg.hnsw_ef_construction,
                        filter_strategy=cfg.hsnw_filterStrategy,
                        flat_search_cutoff=cfg.hnsw_flatSearchCutoff,
                        max_connections=cfg.hnsw_maxConnections,
                        vector_cache_max_objects=int(cfg.hnsw_vector_cache_max_objects),
                        quantizer=cfg.hnsw_quantizer,
                    )
                )
            ],
            "reranker_config": Configure.Reranker.transformers()
        }


# ---- benchmarking/benchmarks/INQUIRE/requirements.txt ----
# (non-Python file from the same diff, kept verbatim as comments)
# # INQUIRE Benchmark Requirements
# # Core benchmarking framework
# imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0
# imsearch_eval[triton] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0
# imsearch_eval[huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0
#
# # Image processing
# Pillow>=10.0.0
#
# # Date parsing
# python-dateutil>=2.8.0
#
# # S3 upload support (MinIO)
# minio>=7.2.0
b/benchmarking/benchmarks/INQUIRE/results/images/inquire_categories.png similarity index 100% rename from benchmarking/INQUIRE/results/images/inquire_categories.png rename to benchmarking/benchmarks/INQUIRE/results/images/inquire_categories.png diff --git a/benchmarking/INQUIRE/results/images/inquire_iconic_groups.png b/benchmarking/benchmarks/INQUIRE/results/images/inquire_iconic_groups.png similarity index 100% rename from benchmarking/INQUIRE/results/images/inquire_iconic_groups.png rename to benchmarking/benchmarks/INQUIRE/results/images/inquire_iconic_groups.png diff --git a/benchmarking/INQUIRE/results/v1/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v1/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v1/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v1/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v1/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v1/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v1/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v1/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v1/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v1/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v1/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v1/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v2/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v2/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v2/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v2/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v2/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v2/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v2/image_search_results.csv rename to 
benchmarking/benchmarks/INQUIRE/results/v2/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v2/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v2/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v2/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v2/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v3/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v3/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v3/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v3/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v3/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v3/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v3/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v3/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v3/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v3/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v3/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v3/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v4/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v4/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v4/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v4/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v4/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v4/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v4/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v4/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v4/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v4/query_eval_metrics.csv similarity index 100% rename from 
benchmarking/INQUIRE/results/v4/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v4/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v5/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v5/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v5/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v5/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v5/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v5/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v5/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v5/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v5/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v5/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v5/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v5/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v6/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v6/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v6/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v6/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v6/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v6/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v6/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v6/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v6/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v6/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v6/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v6/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v7/evaluate.ipynb 
b/benchmarking/benchmarks/INQUIRE/results/v7/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v7/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v7/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v7/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v7/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v7/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v7/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v7/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v7/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v7/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v7/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v8/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v8/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v8/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v8/evaluate.ipynb diff --git a/benchmarking/INQUIRE/results/v8/image_search_results.csv b/benchmarking/benchmarks/INQUIRE/results/v8/image_search_results.csv similarity index 100% rename from benchmarking/INQUIRE/results/v8/image_search_results.csv rename to benchmarking/benchmarks/INQUIRE/results/v8/image_search_results.csv diff --git a/benchmarking/INQUIRE/results/v8/query_eval_metrics.csv b/benchmarking/benchmarks/INQUIRE/results/v8/query_eval_metrics.csv similarity index 100% rename from benchmarking/INQUIRE/results/v8/query_eval_metrics.csv rename to benchmarking/benchmarks/INQUIRE/results/v8/query_eval_metrics.csv diff --git a/benchmarking/INQUIRE/results/v9/evaluate.ipynb b/benchmarking/benchmarks/INQUIRE/results/v9/evaluate.ipynb similarity index 100% rename from benchmarking/INQUIRE/results/v9/evaluate.ipynb rename to benchmarking/benchmarks/INQUIRE/results/v9/evaluate.ipynb diff --git 
# ---- benchmarking/benchmarks/INQUIRE/run_benchmark.py ----
"""run INQUIRE benchmark: load data and evaluate queries."""

import os
import logging
import time
import sys
from pathlib import Path
from urllib.parse import urlsplit
import tritonclient.grpc as TritonClient
from datasets import Dataset
from imsearch_eval import BenchmarkEvaluator, VectorDBAdapter
from imsearch_eval.adapters import WeaviateAdapter, TritonModelProvider, WeaviateQuery
from benchmark_dataset import INQUIRE
from config import INQUIREConfig
from data_loader import INQUIREDataLoader

# Module-level configuration, read from the environment once at import time.
config = INQUIREConfig()


def load_data(data_loader: INQUIREDataLoader, vector_db: VectorDBAdapter, hf_dataset: Dataset):
    """Load INQUIRE dataset into Weaviate for INQUIRE benchmark.

    Creates the collection from the loader's schema, processes the dataset in
    batches and inserts the results. On any error the vector DB connection is
    closed and the exception is re-raised.
    """
    try:
        # Create collection schema
        logging.info("Creating collection schema...")
        schema_config = data_loader.get_schema_config()
        vector_db.create_collection(schema_config)

        # Process and insert data
        logging.info("Processing and inserting data...")
        results = data_loader.process_batch(batch_size=config._image_batch_size, dataset=hf_dataset, workers=config._workers)
        inserted = vector_db.insert_data(config._collection_name, results, batch_size=config._image_batch_size)
        logging.info(f"Inserted {inserted} items.")
        logging.info(f"Successfully loaded {config.inquire_dataset} into Weaviate collection '{config._collection_name}'")
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        vector_db.close()
        raise


def run_evaluation(evaluator: BenchmarkEvaluator, hf_dataset: Dataset):
    """Run the INQUIRE benchmark evaluation.

    Returns:
        The ``(image_results, query_evaluation)`` pair produced by
        ``evaluator.evaluate_queries``. On error the evaluator's vector DB
        connection is closed and the exception is re-raised.
    """
    # Run evaluation
    logging.info("Starting evaluation...")
    try:
        image_results, query_evaluation = evaluator.evaluate_queries(query_batch_size=config._query_batch_size, dataset=hf_dataset, workers=config._workers)
    except Exception as e:
        logging.error(f"Error running evaluation: {e}")
        evaluator.vector_db.close()
        raise

    return image_results, query_evaluation


def _endpoint_host(endpoint: str) -> str:
    """Return the ``host[:port]`` part of an endpoint given with or without a scheme."""
    # urlsplit only fills in netloc when the string has a "//" authority
    # marker, so add one for bare "host:port" values.
    parts = urlsplit(endpoint if "://" in endpoint else f"//{endpoint}")
    return parts.netloc


def upload_to_s3(local_file_path: str, s3_key: str):
    """Upload a file to S3-compatible storage using MinIO.

    Args:
        local_file_path: Path of the file to upload.
        s3_key: Object key to store it under in ``config._s3_bucket``.

    Raises:
        ValueError: if no S3 endpoint is configured.
        ImportError: if the ``minio`` package is not installed.
        Exception: any error raised by the MinIO client is logged and re-raised.
    """
    try:
        from minio import Minio
        from minio.error import S3Error

        if not config._s3_endpoint:
            raise ValueError("S3_ENDPOINT environment variable must be set")

        # Parse endpoint: keep only host[:port], dropping any scheme and any
        # trailing path (e.g. the "/" in "http://host/").
        endpoint = _endpoint_host(config._s3_endpoint)

        # Create MinIO client
        client = Minio(
            endpoint,
            access_key=config._s3_access_key,
            secret_key=config._s3_secret_key,
            secure=config._s3_secure
        )

        # Upload file
        logging.info(f"Uploading {local_file_path} to s3://{config._s3_bucket}/{s3_key}")
        client.fput_object(config._s3_bucket, s3_key, local_file_path)
        logging.info(f"Successfully uploaded to s3://{config._s3_bucket}/{s3_key}")

    except ImportError:
        logging.error("minio is not installed. Install it with: pip install minio")
        raise
    except S3Error as e:
        logging.error(f"Error uploading to S3: {e}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error uploading to S3: {e}")
        raise


def main():
    """Main entry point for running the complete benchmark.

    Steps: set up clients and adapters, load data, evaluate, save the result
    CSVs locally, then optionally upload them to S3. Exits with status 1 if
    loading or evaluation fails; an S3 upload failure is logged but does not
    fail the run.
    """

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, config._log_level, logging.INFO),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
    )

    # Step 0: load framework components
    logging.info("=" * 80)
    logging.info("Step 0: Setting up benchmark environment")
    logging.info("=" * 80)
    logging.info("Initializing Weaviate client...")
    weaviate_client = WeaviateAdapter.init_client(
        host=config._weaviate_host,
        port=config._weaviate_port,
        grpc_port=config._weaviate_grpc_port
    )

    logging.info("Initializing Triton client...")
    triton_client = TritonClient.InferenceServerClient(url=f"{config._triton_host}:{config._triton_port}")

    # Create query method
    query_method = WeaviateQuery(
        weaviate_client=weaviate_client,
        triton_client=triton_client
    )

    # Create adapters
    logging.info("Creating adapters...")
    vector_db = WeaviateAdapter(
        weaviate_client=weaviate_client,
        triton_client=triton_client,
        query_method=query_method
    )

    model_provider = TritonModelProvider(triton_client=triton_client)

    # Create benchmark dataset
    logging.info("Creating benchmark dataset class...")
    benchmark_dataset = INQUIRE(dataset_name=config.inquire_dataset)
    hf_dataset = benchmark_dataset.load_as_dataset(split="test", sample_size=config.sample_size, seed=config.seed, token=config._hf_token)

    # Create data loader
    logging.info("Creating data loader...")
    data_loader = INQUIREDataLoader(
        config=config,
        model_provider=model_provider,
        dataset=benchmark_dataset,
    )

    # Create evaluator
    logging.info("Creating benchmark evaluator...")
    evaluator = BenchmarkEvaluator(
        vector_db=vector_db,
        model_provider=model_provider,
        dataset=benchmark_dataset,
        collection_name=config._collection_name,
        limit=config.response_limit,
        # QUERY_METHOD names a method on the WeaviateQuery instance.
        query_method=getattr(query_method, config.query_method),
        query_parameters=config.advanced_query_parameters,
        score_columns=["rerank_score", "clip_score"],
        target_vector=config.target_vector
    )

    # Step 1: Load data
    logging.info("=" * 80)
    logging.info("Step 1: Loading data into vector database")
    logging.info("=" * 80)
    try:
        load_data(data_loader, vector_db, hf_dataset)
        logging.info("Data loading completed successfully.")
    except Exception as e:
        # load_data() has already closed the vector DB connection.
        logging.error(f"Error loading data: {e}")
        sys.exit(1)

    # Step 2: Run evaluation
    logging.info("=" * 80)
    logging.info("Step 2: Running benchmark evaluation")
    logging.info("=" * 80)
    try:
        image_results, query_evaluation = run_evaluation(evaluator, hf_dataset)
        logging.info("Evaluation completed successfully.")
    except Exception as e:
        # run_evaluation() has already closed the vector DB connection.
        logging.error(f"Error running evaluation: {e}")
        sys.exit(1)

    # From here on, close the connection even if saving the results raises.
    try:
        # Step 3: Save results locally
        logging.info("=" * 80)
        logging.info("Step 3: Saving results")
        logging.info("=" * 80)

        # Determine results directory (use /app/results if PVC is mounted, otherwise current directory)
        results_dir = Path("/app/results" if os.path.exists("/app/results") else ".")
        results_dir.mkdir(parents=True, exist_ok=True)

        image_results_path = results_dir / config._image_results_file
        query_evaluation_path = results_dir / config._query_eval_metrics_file
        config_csv_path = results_dir / config._config_values_file

        image_results.to_csv(image_results_path, index=False)
        query_evaluation.to_csv(query_evaluation_path, index=False)

        config_csv_str = config.to_csv()
        with open(config_csv_path, "w") as f:
            f.write(config_csv_str)

        logging.info("Results saved locally to:")
        logging.info(f"  - {image_results_path}")
        logging.info(f"  - {query_evaluation_path}")
        logging.info(f"  - {config_csv_path}")

        # Step 4: Upload to S3 if enabled
        if config._upload_to_s3:
            if not config._s3_bucket:
                logging.warning("UPLOAD_TO_S3 is true but S3_BUCKET is not set. Skipping S3 upload.")
            elif not config._s3_endpoint:
                logging.warning("UPLOAD_TO_S3 is true but S3_ENDPOINT is not set. Skipping S3 upload.")
            elif not config._s3_access_key or not config._s3_secret_key:
                logging.warning("UPLOAD_TO_S3 is true but S3 credentials are not set. Skipping S3 upload.")
            else:
                logging.info("=" * 80)
                logging.info("Step 4: Uploading results to S3")
                logging.info("=" * 80)
                try:
                    # Generate S3 keys with timestamp (local time of this process)
                    timestamp = time.strftime("%Y%m%dT%H%M%S")
                    s3_key_image = f"{config._s3_prefix}/{timestamp}/{config._image_results_file}"
                    s3_key_query = f"{config._s3_prefix}/{timestamp}/{config._query_eval_metrics_file}"
                    s3_key_config = f"{config._s3_prefix}/{timestamp}/{config._config_values_file}"

                    upload_to_s3(str(image_results_path), s3_key_image)
                    upload_to_s3(str(query_evaluation_path), s3_key_query)
                    upload_to_s3(str(config_csv_path), s3_key_config)

                    logging.info("S3 upload completed successfully.")
                except Exception as e:
                    # Results are already saved locally, so an upload failure
                    # is reported but does not fail the run.
                    logging.error(f"Error uploading to S3: {e}")
                    logging.warning("Continuing despite S3 upload error...")
        else:
            logging.info("S3 upload is disabled (UPLOAD_TO_S3=false or not set).")
    finally:
        vector_db.close()

    logging.info("=" * 80)
    logging.info("Benchmark run completed successfully!")
    logging.info("=" * 80)


if __name__ == "__main__":
    main()


# ---- benchmarking/benchmarks/MAKEFILE.md ----
# (start of the next file in the same diff, kept verbatim as comments)
# # Benchmarking Makefile Guide
#
# ## Overview
#
# The benchmarking framework provides a reusable Makefile system that allows any benchmark instance to leverage common build, deployment, and execution commands while customizing benchmark-specific settings.
+ +## Structure + +``` +benchmarking/ +├── Makefile # Base Makefile with generic commands +benchmarks/ +├── Makefile # Base Makefile (shared by all benchmarks) +├── MAKEFILE.md # Makefile documentation +├── Dockerfile.template # Base Dockerfile template +├── DOCKER.md # Dockerfile documentation + ├── INQUIRE/ +│ └── Makefile # INQUIRE-specific variables + includes base + └── template/ + └── Makefile # Template Makefile +``` + +## Base Makefile + +The base `benchmarks/Makefile` contains all generic commands that work for any benchmark: + +- **Build**: `make build` - Build Docker job image +- **Run**: `make run` - Deploy and run benchmark job (loads data and evaluates) +- **Run Local**: `make run-local` - Run benchmark locally with port-forwarding +- **Status**: `make status` - Show deployment status +- **Logs**: `make logs` - View job logs +- **Port Forward**: `make port-forward-start` / `make port-forward-stop` - Manage port-forwarding +- **Down**: `make down` - Remove deployments + +## Creating a New Benchmark Makefile + +To create a Makefile for a new benchmark (e.g., `MYBENCHMARK`): + +### 1. 
Create the Makefile + +Create `benchmarking/benchmarks/MYBENCHMARK/Makefile`: + +```makefile +# MYBENCHMARK Benchmark Makefile +# This file sets MYBENCHMARK-specific variables and includes the base benchmarking Makefile + +# ============================================================================ +# Required Variables (must be set for base Makefile) +# ============================================================================ +BENCHMARK_NAME := mybenchmark +DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif + +# ============================================================================ +# Optional Variables (can be overridden) +# ============================================================================ +KUBECTL_NAMESPACE := sage +KUBECTL_CONTEXT ?= nautilus +REGISTRY := gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search +JOB_TAG ?= latest + +# Local run script +RUN_SCRIPT := run_benchmark.py + +# Include the base Makefile (after setting variables) +include ../Makefile +``` + +### 2. Required Variables + +Each benchmark Makefile **must** define: + +- `BENCHMARK_NAME`: Unique identifier for the benchmark (used in labels, names, etc.) +- `DOCKERFILE_JOB`: Name of the job Dockerfile (typically `Dockerfile.job`) +- `RESULTS_FILES`: Space-separated list of result files to copy (e.g., `image_search_results.csv query_eval_metrics.csv`) +- `KUSTOMIZE_DIR`: Path to the kustomize directory (can be conditional based on `ENV`) + +### 3. 
Optional Variables + +These can be overridden but have defaults: + +- `KUBECTL_NAMESPACE`: Kubernetes namespace (default: `sage`) +- `KUBECTL_CONTEXT`: kubectl context (default: `nautilus`) +- `REGISTRY`: Docker registry (default: `gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search`) +- `JOB_TAG`: Job image tag (default: `latest`) +- `RUN_SCRIPT`: Script to run locally (default: `run_benchmark.py`) + +### 4. Environment Switching + +The Makefile supports switching between dev and prod environments: + +```makefile +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif +``` + +Then use: +```bash +make run ENV=prod # Run using prod resources +make run # Run using dev resources (default) +``` + +## Example: INQUIRE Makefile + +See `benchmarks/INQUIRE/Makefile` for a complete example: + +```makefile +BENCHMARK_NAME := inquire +DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/INQUIRE/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/INQUIRE/nrp-dev +endif + +KUBECTL_NAMESPACE := sage +KUBECTL_CONTEXT ?= nautilus +REGISTRY := gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search +JOB_TAG ?= latest + +RUN_SCRIPT := run_benchmark.py + +include ../Makefile +``` + +## Usage + +Once your Makefile is set up, use it from your benchmark directory: + +```bash +cd benchmarking/benchmarks/MYBENCHMARK + +# Build image +make build + +# Run benchmark (deploys and runs the job) +make run + +# Monitor logs +make logs + +# View status +make status + +# Run locally (with port-forwarding) +make run-local + +# Clean up +make down +``` + +## Local Development + +For local development, use port-forwarding: + +```bash +# Start port-forwarding manually +make port-forward-start + +# Run your benchmark script locally +python run_benchmark.py + +# Stop port-forwarding +make 
port-forward-stop + +# Or use the convenience command (does all of the above) +make run-local +``` + +## How It Works + +1. The benchmark-specific Makefile sets required variables +2. It includes the base Makefile using `include ../Makefile` +3. The base Makefile uses these variables to execute commands +4. All commands are generic and work with any benchmark that sets the required variables + +## Benefits + +- **DRY Principle**: No code duplication across benchmarks +- **Consistency**: All benchmarks use the same commands +- **Maintainability**: Fix bugs or add features once in the base Makefile +- **Flexibility**: Each benchmark can customize variables and add benchmark-specific logic + +## Adding New Commands + +To add a new command that all benchmarks can use: + +1. Add it to `benchmarks/Makefile` (the base) +2. Use the standard variables (`BENCHMARK_NAME`, `KUBECTL_NAMESPACE`, etc.) +3. All benchmarks will automatically inherit the new command + +## Overriding Commands + +If a benchmark needs to override a base command, it can define its own version after the `include` statement: + +```makefile +include ../Makefile + +# ... variables ... + +# Override the build command +build: + @echo "Custom build for MYBENCHMARK" + @# Custom build logic here +``` + +However, this should be rare - most customization should be done via variables. diff --git a/benchmarking/benchmarks/Makefile b/benchmarking/benchmarks/Makefile new file mode 100644 index 00000000..76ba11dd --- /dev/null +++ b/benchmarking/benchmarks/Makefile @@ -0,0 +1,183 @@ +# Base Makefile for benchmarking framework +# This file contains generic benchmark commands that can be reused by any benchmark instance. +# Each benchmark should define its specific variables and include this file. 
+ +.PHONY: down build run kubectl port-forward-start port-forward-stop run-local status logs + +# ============================================================================ +# Required Variables (must be set by benchmark-specific Makefile) +# ============================================================================ +# BENCHMARK_NAME - Name of the benchmark (e.g., "inquire") +# KUSTOMIZE_DIR - Path to kustomize directory (e.g., "../../kubernetes/INQUIRE") +# DOCKERFILE_JOB - Dockerfile for combined job (e.g., "Dockerfile.job") +# RESULTS_FILES - Space-separated list of result files to copy (e.g., "image_search_results.csv query_eval_metrics.csv"); not referenced by any target in this file + +# ============================================================================ +# Optional Variables (can be overridden) +# ============================================================================ +KUBECTL_NAMESPACE ?= sage +KUBECTL_CONTEXT ?= nautilus +REGISTRY ?= gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search +JOB_TAG ?= latest +ENV ?= dev + +# Local run configuration (service names are derived from ENV; LOCAL_* are the localhost ports used by port-forwarding) +WEAVIATE_SERVICE ?= $(ENV)-weaviate +TRITON_SERVICE ?= $(ENV)-triton +WEAVIATE_HTTP_PORT ?= 8080 +WEAVIATE_GRPC_PORT ?= 50051 +TRITON_GRPC_PORT ?= 8001 +LOCAL_WEAVIATE_HTTP_PORT ?= 8080 +LOCAL_WEAVIATE_GRPC_PORT ?= 50051 +LOCAL_TRITON_GRPC_PORT ?= 8001 +RUN_SCRIPT ?= run_benchmark.py + +# Derived variables +JOB_IMAGE ?= $(REGISTRY)/benchmark-$(BENCHMARK_NAME)-job + +# ============================================================================ +# Helper Targets +# ============================================================================ + +# Set kubectl context (runs `kubectl config use-context`, which switches the active context of the caller's kubeconfig; prerequisite of every cluster-facing target) +kubectl: + @kubectl config use-context $(KUBECTL_CONTEXT) + +# ============================================================================ +# Build Targets +# ============================================================================ + +# Build the job image JOB_IMAGE:JOB_TAG from DOCKERFILE_JOB with the current directory as build context (build only; the push command is printed, not run) +build: + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be 
set"; exit 1; \ + fi + @if [ -z "$(DOCKERFILE_JOB)" ]; then \ + echo "Error: DOCKERFILE_JOB must be set"; exit 1; \ + fi + @echo "Building benchmark job image..." + @docker build -t $(JOB_IMAGE):$(JOB_TAG) -f $(DOCKERFILE_JOB) . + @echo "Build complete. Push with:" + @echo " docker push $(JOB_IMAGE):$(JOB_TAG)" + +# ============================================================================ +# Cleanup Targets (down deletes the resources defined under KUSTOMIZE_DIR; delete errors are ignored) +# ============================================================================ +down: kubectl + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be set"; exit 1; \ + fi + @if [ -z "$(KUSTOMIZE_DIR)" ]; then \ + echo "Error: KUSTOMIZE_DIR must be set"; exit 1; \ + fi + @echo "Removing $(BENCHMARK_NAME) benchmark deployments..." + @kubectl delete -k $(KUSTOMIZE_DIR) --ignore-not-found=true || true + @echo "Cleanup complete." + +# ============================================================================ +# Execution Targets +# ============================================================================ + +# Run benchmark job (applies the kustomization in KUSTOMIZE_DIR, then prints the command for following the job logs) +run: kubectl + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be set"; exit 1; \ + fi + @if [ -z "$(KUSTOMIZE_DIR)" ]; then \ + echo "Error: KUSTOMIZE_DIR must be set"; exit 1; \ + fi + @echo "Deploying and running $(BENCHMARK_NAME) benchmark job..." + @kubectl apply -k $(KUSTOMIZE_DIR) + @echo "Job started. 
Monitor with:" + @echo " kubectl logs -n $(KUBECTL_NAMESPACE) -l app=benchmark-job,benchmark=$(BENCHMARK_NAME) -f" + + +# ============================================================================ +# Utility Targets +# ============================================================================ + +# Show status (lists the jobs and pods labelled benchmark=BENCHMARK_NAME in KUBECTL_NAMESPACE) +status: kubectl + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be set"; exit 1; \ + fi + @echo "=== $(BENCHMARK_NAME) Benchmark Status ===" + @echo "=== Jobs ===" + @kubectl get jobs -n $(KUBECTL_NAMESPACE) -l benchmark=$(BENCHMARK_NAME) + @echo "" + @echo "=== Pods ===" + @kubectl get pods -n $(KUBECTL_NAMESPACE) -l benchmark=$(BENCHMARK_NAME) + +# Show logs of the most recently created benchmark pod that is neither Failed nor Pending; the log is followed (-f) only while that pod is Running +logs: kubectl + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be set"; exit 1; \ + fi + @pod=$$(kubectl get pods -n $(KUBECTL_NAMESPACE) -l app=benchmark-job,benchmark=$(BENCHMARK_NAME) --field-selector=status.phase!=Failed,status.phase!=Pending --sort-by=.metadata.creationTimestamp -o jsonpath="{.items[-1].metadata.name}"); \ + if [ -z "$$pod" ]; then \ + echo "No Running or Completed pods found for $(BENCHMARK_NAME) benchmark."; \ + exit 1; \ + fi; \ + phase=$$(kubectl get pod -n $(KUBECTL_NAMESPACE) $$pod -o jsonpath="{.status.phase}"); \ + if [ "$$phase" = "Running" ]; then \ + kubectl logs -n $(KUBECTL_NAMESPACE) -f $$pod; \ + else \ + kubectl logs -n $(KUBECTL_NAMESPACE) $$pod; \ + fi + +# ============================================================================ +# Local Development Targets +# ============================================================================ + +# Start port-forwarding for Weaviate and Triton services (both kubectl port-forward processes run in the background; their logs and PIDs are written under /tmp/kubectl-port-forward-*) +port-forward-start: kubectl + @echo "Starting port-forwarding for services..." 
+ @echo "Weaviate HTTP: localhost:$(LOCAL_WEAVIATE_HTTP_PORT) -> $(WEAVIATE_SERVICE):$(WEAVIATE_HTTP_PORT)" + @echo "Weaviate gRPC: localhost:$(LOCAL_WEAVIATE_GRPC_PORT) -> $(WEAVIATE_SERVICE):$(WEAVIATE_GRPC_PORT)" + @echo "Triton gRPC: localhost:$(LOCAL_TRITON_GRPC_PORT) -> $(TRITON_SERVICE):$(TRITON_GRPC_PORT)" + @kubectl port-forward -n $(KUBECTL_NAMESPACE) svc/$(WEAVIATE_SERVICE) $(LOCAL_WEAVIATE_HTTP_PORT):$(WEAVIATE_HTTP_PORT) $(LOCAL_WEAVIATE_GRPC_PORT):$(WEAVIATE_GRPC_PORT) > /tmp/kubectl-port-forward-weaviate.log 2>&1 & \ + echo $$! > /tmp/kubectl-port-forward-weaviate.pid + @kubectl port-forward -n $(KUBECTL_NAMESPACE) svc/$(TRITON_SERVICE) $(LOCAL_TRITON_GRPC_PORT):$(TRITON_GRPC_PORT) > /tmp/kubectl-port-forward-triton.log 2>&1 & \ + echo $$! > /tmp/kubectl-port-forward-triton.pid + @sleep 2 + @echo "Port-forwarding started. PIDs saved to /tmp/kubectl-port-forward-*.pid" + @echo "To stop port-forwarding, run: make port-forward-stop" + +# Stop port-forwarding (kills the PIDs recorded by port-forward-start and removes the PID files; a service without a PID file is skipped) +port-forward-stop: + @echo "Stopping port-forwarding..." + @if [ -f /tmp/kubectl-port-forward-weaviate.pid ]; then \ + kill $$(cat /tmp/kubectl-port-forward-weaviate.pid) 2>/dev/null || true; \ + rm -f /tmp/kubectl-port-forward-weaviate.pid; \ + echo "Stopped Weaviate port-forwarding"; \ + fi + @if [ -f /tmp/kubectl-port-forward-triton.pid ]; then \ + kill $$(cat /tmp/kubectl-port-forward-triton.pid) 2>/dev/null || true; \ + rm -f /tmp/kubectl-port-forward-triton.pid; \ + echo "Stopped Triton port-forwarding"; \ + fi + @echo "Port-forwarding stopped." + +# Run benchmark locally with port-forwarding (the prerequisite has already started the forwards, so every failure path below stops them before exiting) +run-local: port-forward-start + @if [ -z "$(BENCHMARK_NAME)" ]; then \ + echo "Error: BENCHMARK_NAME must be set"; make port-forward-stop; exit 1; \ + fi + @if [ ! -f "$(RUN_SCRIPT)" ]; then \ + echo "Error: Run script $(RUN_SCRIPT) not found"; make port-forward-stop; exit 1; \ + fi + @echo "Running $(BENCHMARK_NAME) benchmark locally..." 
+ @echo "Using port-forwarded services:" + @echo " WEAVIATE_HOST=127.0.0.1" + @echo " WEAVIATE_PORT=$(LOCAL_WEAVIATE_HTTP_PORT)" + @echo " WEAVIATE_GRPC_PORT=$(LOCAL_WEAVIATE_GRPC_PORT)" + @echo " TRITON_HOST=127.0.0.1" + @echo " TRITON_PORT=$(LOCAL_TRITON_GRPC_PORT)" + @WEAVIATE_HOST=127.0.0.1 \ + WEAVIATE_PORT=$(LOCAL_WEAVIATE_HTTP_PORT) \ + WEAVIATE_GRPC_PORT=$(LOCAL_WEAVIATE_GRPC_PORT) \ + TRITON_HOST=127.0.0.1 \ + TRITON_PORT=$(LOCAL_TRITON_GRPC_PORT) \ + python $(RUN_SCRIPT) || (make port-forward-stop && exit 1) + @make port-forward-stop + @echo "Benchmark run completed. Results saved locally." diff --git a/benchmarking/benchmarks/template/Dockerfile.job b/benchmarking/benchmarks/template/Dockerfile.job new file mode 100644 index 00000000..f4f72a1e --- /dev/null +++ b/benchmarking/benchmarks/template/Dockerfile.job @@ -0,0 +1,30 @@ +# MYBENCHMARK Benchmark Job Dockerfile +# Combined Dockerfile for running both data loading and evaluation +# +# Instructions: +# 1. Replace MYBENCHMARK with your benchmark name in the comment +# 2. Verify the CMD line runs your run_benchmark.py script +# 3. Ensure requirements.txt includes imsearch-eval and minio packages + +ARG PYTHON_VERSION=3.11-slim +FROM python:${PYTHON_VERSION} + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + procps \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Run combined benchmark script +CMD ["python", "run_benchmark.py"] + diff --git a/benchmarking/benchmarks/template/Makefile b/benchmarking/benchmarks/template/Makefile new file mode 100644 index 00000000..adc31aae --- /dev/null +++ b/benchmarking/benchmarks/template/Makefile @@ -0,0 +1,35 @@ +# MYBENCHMARK Benchmark Makefile +# This file sets MYBENCHMARK-specific variables and includes the base benchmarking Makefile +# +# Instructions: +# 1. 
Replace MYBENCHMARK with your benchmark name throughout this file +# 2. Update the required variables below +# 3. Add any benchmark-specific environment variables at the bottom + +# ============================================================================ +# Required Variables (must be set for base Makefile) +# ============================================================================ +# TODO: Replace MYBENCHMARK with your benchmark name +BENCHMARK_NAME := mybenchmark +DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif + +# ============================================================================ +# Optional Variables (can be overridden) +# ============================================================================ +KUBECTL_NAMESPACE := sage +KUBECTL_CONTEXT ?= nautilus +REGISTRY := gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search +JOB_TAG ?= latest + +# Local run script +RUN_SCRIPT := run_benchmark.py + +# Include the base Makefile (after setting variables) +include ../Makefile diff --git a/benchmarking/benchmarks/template/QUICKSTART.md b/benchmarking/benchmarks/template/QUICKSTART.md new file mode 100644 index 00000000..aa124129 --- /dev/null +++ b/benchmarking/benchmarks/template/QUICKSTART.md @@ -0,0 +1,116 @@ +# Quick Start Guide + +Create a new benchmark in 5 minutes! + +## Step 1: Copy Template + +```bash +cd benchmarking/benchmarks +cp -r template MYBENCHMARK +cd MYBENCHMARK +``` + +## Step 2: Update Makefile + +Edit `Makefile`: change the `BENCHMARK_NAME` value `mybenchmark` to your benchmark name and replace `MYBENCHMARK` in the `KUSTOMIZE_DIR` paths. Do not put a trailing `#` comment on an assignment line, because Make keeps the whitespace before it as part of the value: + +```makefile +BENCHMARK_NAME := mybenchmark 
+DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif +``` + +## Step 3: Rename Template Files + +```bash +mv benchmark_dataset.template.py benchmark_dataset.py +mv run_benchmark.template.py run_benchmark.py +# config.py is already named correctly, just customize it +``` + +## Step 4: Update config.py + +Edit `config.py` and: +- Replace `MYBENCHMARK` with your benchmark name +- Update default values for your dataset, collection name, etc. +- Add any benchmark-specific hyperparameters + +## Step 5: Implement BenchmarkDataset + +Edit `benchmark_dataset.py` and implement: +- `load()` - Load your dataset +- `get_query_column()` - Return query column name +- `get_query_id_column()` - Return query ID column name +- `get_relevance_column()` - Return relevance column name + +## Step 6: Update run_benchmark.py + +Edit `run_benchmark.py` and: +- Import your `BenchmarkDataset` class (replace `MyBenchmarkDataset`) +- Import your `Config` class (replace `MyConfig`) +- Update the config instance creation +- Implement the `load_data(data_loader, vector_db, hf_dataset)` function: + - Use the dataset passed in from `main()` + - Create collection schema + - Process and insert data into vector database +- Implement the `run_evaluation(evaluator, hf_dataset)` function: + - Use the evaluator passed in from `main()` + - Run evaluation queries + - Return results + +See `../INQUIRE/run_benchmark.py` for a complete example. + +## Step 7: Create Kubernetes Config + +```bash +cd ../../kubernetes +cp -r ../benchmarks/template/kubernetes MYBENCHMARK +cd MYBENCHMARK +# Replace MYBENCHMARK with your benchmark name +find . 
-type f -name "*.yaml" -exec sed -i '' 's/MYBENCHMARK/mybenchmark/g' {} + +# Update image name in nrp-dev/kustomization.yaml and nrp-prod/kustomization.yaml +# Update environment variables in nrp-dev/env.yaml and nrp-prod/env.yaml +``` + +## Step 8: Update config.py (if needed) + +If your config needs Weaviate connection parameters, ensure they're in your config: +- `WEAVIATE_HOST` +- `WEAVIATE_PORT` +- `WEAVIATE_GRPC_PORT` + +These are typically set via environment variables in Kubernetes. + +## Step 9: Deploy + +```bash +cd ../../benchmarks/MYBENCHMARK +make build # Build image or use GitHub Actions to build and push to registry +make run # Deploy and run benchmark job (loads data and evaluates) +make logs # Monitor logs +``` + +## Files to Customize + +| File | What to Change | +|------|----------------| +| `Makefile` | Benchmark name, kustomize dir, result files | +| `config.py` | Update with your benchmark name, dataset, and configuration values | +| `benchmark_dataset.py` | Implement benchmark dataset logic | +| `run_benchmark.py` | Import your classes, implement `load_data()` and `run_evaluation()` functions | +| `Dockerfile.job` | Usually no changes needed | +| `requirements.txt` | Add your dependencies | +| `kubernetes/nrp-dev/env.yaml` | Update environment variables for dev | +| `kubernetes/nrp-prod/env.yaml` | Update environment variables for prod | + +## Need Help? + +- See `README.md` for detailed instructions +- Check `../INQUIRE/` for a complete example (same directory level) +- Review `../README.md` for framework overview diff --git a/benchmarking/benchmarks/template/README.md b/benchmarking/benchmarks/template/README.md new file mode 100644 index 00000000..38b73665 --- /dev/null +++ b/benchmarking/benchmarks/template/README.md @@ -0,0 +1,315 @@ +# Benchmark Template + +This directory contains templates and documentation for creating new benchmark instances. 
+ +## Quick Start + +To create a new benchmark: + +```bash +cd benchmarking/benchmarks +cp -r template MYBENCHMARK +cd MYBENCHMARK +# Customize the files as described below +``` + +## Directory Structure + +A new benchmark should have the following structure: + +``` +MYBENCHMARK/ +├── Makefile # Benchmark-specific Makefile (from template) +├── Dockerfile.job # Combined job container (from template) +├── requirements.txt # Python dependencies +├── run_benchmark.py # Combined benchmark script (loads data and evaluates) +├── benchmark_dataset.py # BenchmarkDataset implementation +├── data_loader.py # DataLoader implementation (optional) +├── config.py # Config implementation (recommended) +└── README.md # Benchmark-specific documentation +``` + +## Step-by-Step Setup + +### 1. Create Benchmark Directory + +```bash +cd benchmarking/benchmarks +cp -r template MYBENCHMARK +cd MYBENCHMARK +``` + +### 2. Update Makefile + +Edit `Makefile` and set the required variables: + +```makefile +BENCHMARK_NAME := mybenchmark +DOCKERFILE_JOB := Dockerfile.job +RESULTS_FILES := image_search_results.csv query_eval_metrics.csv +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif +``` + +### 3. Update Dockerfile + +The `Dockerfile.job` is already set up to run `run_benchmark.py`. Verify the CMD line is correct. + +### 4. 
Create Python Files + +#### `config.py` - Configuration Class (Recommended) + +Create a Config class that extends the `Config` interface and loads all environment variables: + +```python +import os +from imsearch_eval.framework.interfaces import Config + +class MyConfig(Config): + def __init__(self): + # Environment variables + self.MYBENCHMARK_DATASET = os.environ.get("MYBENCHMARK_DATASET", "your-dataset/name") + self.WEAVIATE_HOST = os.environ.get("WEAVIATE_HOST", "127.0.0.1") + self.WEAVIATE_PORT = os.environ.get("WEAVIATE_PORT", "8080") + self.WEAVIATE_GRPC_PORT = os.environ.get("WEAVIATE_GRPC_PORT", "50051") + self.TRITON_HOST = os.environ.get("TRITON_HOST", "triton") + self.TRITON_PORT = os.environ.get("TRITON_PORT", "8001") + self.COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "MYBENCHMARK") + # ... add more as needed +``` + +See `config.py` template and `../INQUIRE/config.py` for complete examples. + +#### `benchmark_dataset.py` - Implement BenchmarkDataset + +Extend the `HuggingFaceDataset` adapter for HuggingFace Hub datasets: + +```python +from imsearch_eval.adapters.huggingface import HuggingFaceDataset + +class MyBenchmarkDataset(HuggingFaceDataset): + """Benchmark dataset class for MYBENCHMARK.""" + + def get_query_column(self) -> str: + """Return the column name containing query text.""" + return "query" # TODO: Update with your column name + + def get_query_id_column(self) -> str: + """Return the column name containing query IDs.""" + return "query_id" # TODO: Update with your column name + + def get_relevance_column(self) -> str: + """Return the column name containing relevance labels (1 for relevant, 0 for not).""" + return "relevant" # TODO: Update with your column name + + def get_metadata_columns(self) -> list: + """Return optional metadata columns to include in evaluation stats.""" + return [] # TODO: Add metadata columns if available (e.g., ["category", "type"]) +``` + +The `HuggingFaceDataset` adapter handles loading datasets from 
HuggingFace Hub. You only need to implement the column mapping methods. The dataset is loaded using `benchmark_dataset.load_as_dataset(split="test", sample_size=0, seed=42, token=config._hf_token)`. + +#### `run_benchmark.py` - Benchmark Script + +This script should: +1. Create a config instance at the top +2. Define a `load_data(data_loader, vector_db, hf_dataset)` function that loads data into the vector database +3. Define a `run_evaluation(evaluator, hf_dataset)` function that runs the evaluation +4. Define an `upload_to_s3(local_file_path, s3_key)` function for S3 uploads (optional) +5. In `main()`, set up clients/adapters, then call both functions sequentially +6. Save results locally (three CSV files: `image_search_results.csv`, `query_eval_metrics.csv`, `config_values.csv`) +7. Optionally upload results to S3 + +The structure should be: +```python +from config import MyConfig +from imsearch_eval import BenchmarkEvaluator, VectorDBAdapter +from imsearch_eval.adapters import WeaviateAdapter, TritonModelProvider, WeaviateQuery +from benchmark_dataset import MyBenchmarkDataset +from data_loader import MyDataLoader # Optional + +config = MyConfig() + +def load_data(data_loader, vector_db: VectorDBAdapter, hf_dataset): + """Load dataset into vector database.""" + # Create collection schema + schema_config = data_loader.get_schema_config() + vector_db.create_collection(schema_config) + + # Process and insert data + results = data_loader.process_batch(batch_size=config._image_batch_size, + dataset=hf_dataset, + workers=config._workers) + inserted = vector_db.insert_data(config._collection_name, results, + batch_size=config._image_batch_size) + +def run_evaluation(evaluator: BenchmarkEvaluator, hf_dataset): + """Run the benchmark evaluation.""" + image_results, query_evaluation = evaluator.evaluate_queries( + query_batch_size=config._query_batch_size, + dataset=hf_dataset, + workers=config._workers + ) + return image_results, query_evaluation + +def main(): + # Step 
0: Set up clients and adapters + # Step 1: Call load_data(data_loader, vector_db, hf_dataset) + # Step 2: Call run_evaluation(evaluator, hf_dataset) + # Step 3: Save results (image_search_results.csv, query_eval_metrics.csv, config_values.csv) + # Step 4: Upload to S3 (optional) + pass +``` + +See `../INQUIRE/run_benchmark.py` for a complete example. + +### 5. Create Kubernetes Configuration + +Use the Kubernetes template from this directory: + +```bash +cd ../../kubernetes +cp -r ../benchmarks/template/kubernetes MYBENCHMARK +cd MYBENCHMARK +# Replace MYBENCHMARK with your benchmark name in all files +find . -type f -name "*.yaml" -exec sed -i '' 's/MYBENCHMARK/mybenchmark/g' {} + +``` + +Then customize: +- `kustomization.yaml`: Update image name +- `env.yaml`: Set benchmark-specific environment variables + +See `../../kubernetes/README.md` for detailed instructions. + +### 6. Create requirements.txt + +Create a `requirements.txt` with your dependencies: + +```txt +# Core benchmarking framework (install with all extras needed) +imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 +imsearch_eval[triton] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 +imsearch_eval[huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0 + +# S3 upload support (MinIO) +minio>=7.2.0 + +# Add other dependencies as needed +# Pillow>=10.0.0 +# python-dateutil>=2.8.0 +``` + +## Required Components + +### Must Implement + +1. **BenchmarkDataset** (`benchmark_dataset.py`): Extends `HuggingFaceDataset` and defines column mappings +2. **Config** (`config.py`): Configuration class that loads all environment variables and implements `to_csv()` method +3. 
**run_benchmark.py**: Script that includes: + - Config instance creation + - `load_data(data_loader, vector_db, hf_dataset)` function: Loads data into vector database + - `run_evaluation(evaluator, hf_dataset)` function: Runs the evaluation + - `upload_to_s3(local_file_path, s3_key)` function: Uploads results to S3 (optional) + - `main()` function: Sets up environment, then orchestrates the complete benchmark run + +### Optional Components + +1. **DataLoader** (`data_loader.py`): Custom data processing/insertion logic +2. Additional hyperparameters in `config.py` (e.g., Weaviate HNSW settings, model parameters) + +## Using Shared Adapters + +The `imsearch-eval` package provides shared adapters you can use: + +**Triton adapters**: +- **TritonModelProvider**: For Triton inference server (implements `ModelProvider`) +- **TritonModelUtils**: Triton implementation of `ModelUtils` interface + +**Weaviate adapters**: +- **WeaviateAdapter**: For Weaviate vector database (implements `VectorDBAdapter`) +- **WeaviateQuery**: Weaviate query implementation (implements `Query` interface) + +Import them: + +```python +from imsearch_eval.adapters import WeaviateAdapter, TritonModelProvider, WeaviateQuery, TritonModelUtils +``` + +**Note**: Install the package with all extras needed: +```bash +pip install "imsearch_eval[weaviate,triton,huggingface] @ git+https://github.com/waggle-sensor/imsearch_eval.git@0.1.0" +``` + +## Deployment + +Once everything is set up: + +1. **Build and run benchmark**: + ```bash + make build # Build Docker image + make run # Deploy and run benchmark job + ``` + +2. **Monitor logs**: + ```bash + make logs + ``` + +3. **Run locally (with port-forwarding)**: + ```bash + make run-local + ``` + +## Results + +The benchmark generates three CSV files: + +1. **`image_search_results.csv`**: Metadata of all images returned by the vector database for each query +2. **`query_eval_metrics.csv`**: Calculated evaluation metrics (NDCG, precision, recall, etc.) 
for each query +3. **`config_values.csv`**: Configuration values used for the benchmark run (generated via `config.to_csv()`) + +Results are saved to `/app/results` if the directory exists (when running in Kubernetes with volume mount), otherwise to the current directory. + +## S3 Upload Configuration + +Results can be automatically uploaded to S3-compatible storage (MinIO). Configuration is done via: + +- **Base Kubernetes config**: S3 endpoint, bucket, and secure flag are set in `benchmarking/kubernetes/base/benchmark-job.yaml` +- **S3 Secret**: Access key and secret key are stored in `benchmarking/kubernetes/base/._s3-secret.yaml` +- **Benchmark-specific**: Override `S3_PREFIX` in your benchmark's `nrp-dev/env.yaml` or `nrp-prod/env.yaml` if needed + +To enable S3 upload, set `UPLOAD_TO_S3=true` in the base config (already enabled by default). Results are uploaded with timestamps: `{S3_PREFIX}/{timestamp}/{filename}`. + +## Framework Structure + +The benchmarking framework is now provided as a Python package (`imsearch-eval`) installed from GitHub: + +``` +benchmarking/ +└── benchmarks/ # Benchmark instances + ├── template/ # Template for new benchmarks + └── INQUIRE/ # Example benchmark implementation +``` + +The framework code (`framework/` and `adapters/`) is now in a separate repository: +- **Repository**: https://github.com/waggle-sensor/imsearch_eval +- **Package name**: `imsearch-eval` +- **Installation**: `pip install "imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@main"` + +## Next Steps + +- Review `../README.md` for framework overview +- Review `../MAKEFILE.md` for Makefile details (same directory level) +- Review `../DOCKER.md` for Dockerfile details (same directory level) +- Review `../../kubernetes/README.md` for Kubernetes setup +- Look at `../INQUIRE/` as a complete example (same directory level) + +## Getting Help + +- Check existing benchmarks (e.g., `../INQUIRE/`) for examples +- Review framework documentation:
https://github.com/waggle-sensor/imsearch_eval +- Review adapter documentation in the `imsearch-eval` package diff --git a/benchmarking/benchmarks/template/benchmark_dataset.template.py b/benchmarking/benchmarks/template/benchmark_dataset.template.py new file mode 100644 index 00000000..c472e810 --- /dev/null +++ b/benchmarking/benchmarks/template/benchmark_dataset.template.py @@ -0,0 +1,41 @@ +# Template for benchmark_dataset.py +# Copy this file to benchmark_dataset.py and implement the BenchmarkDataset interface +# +# This template uses the abstract benchmarking framework: +# - Framework: Abstract interfaces (framework/interfaces.py) +# - BenchmarkDataset: Interface for your benchmark dataset +# - Other interfaces: VectorDBAdapter, ModelProvider, Query, DataLoader, Config + +import os +import pandas as pd + +from imsearch_eval.adapters.huggingface import HuggingFaceDataset + +class MyBenchmarkDataset(HuggingFaceDataset): + """ + Benchmark dataset class for MYBENCHMARK. + + TODO: Replace MYBENCHMARK with your benchmark name + TODO: Implement all required methods + """ + + def get_query_column(self) -> str: + """Return the column name containing query text.""" + return "query" # TODO: Update with your column name + + def get_query_id_column(self) -> str: + """Return the column name containing query IDs.""" + return "query_id" # TODO: Update with your column name + + def get_relevance_column(self) -> str: + """Return the column name containing relevance labels (1 for relevant, 0 for not).""" + return "relevant" # TODO: Update with your column name + + def get_metadata_columns(self) -> list: + """ + Return list of optional metadata column names. + + These columns will be included in results but not used for evaluation. 
+ """ + return [] # TODO: Add metadata columns if available (e.g., ["category", "type"]) + diff --git a/benchmarking/benchmarks/template/config.py b/benchmarking/benchmarks/template/config.py new file mode 100644 index 00000000..b6116e9a --- /dev/null +++ b/benchmarking/benchmarks/template/config.py @@ -0,0 +1,60 @@ +"""Template for config.py""" + +import os +from imsearch_eval.framework.interfaces import Config + + +class MyConfig(Config): + """Configuration for MYBENCHMARK benchmark.""" + + def __init__(self): + """Initialize MYBENCHMARK configuration.""" + # TODO: Update with your parameters for the benchmark + # Dataset parameters + self.mybenchmark_dataset = os.environ.get("MYBENCHMARK_DATASET", "your-dataset/name") + self.sample_size = int(os.environ.get("SAMPLE_SIZE", 0)) + self.seed = int(os.environ.get("SEED", 42)) + self._hf_token = os.environ.get("HF_TOKEN", "") + + # Upload parameters + self._upload_to_s3 = os.environ.get("UPLOAD_TO_S3", "false").lower() == "true" + self._s3_bucket = os.environ.get("S3_BUCKET", "sage_imsearch") + self._s3_prefix = os.environ.get("S3_PREFIX", "dev-metrics") + self._s3_endpoint = os.environ.get("S3_ENDPOINT", "http://rook-ceph-rgw-nautiluss3.rook") + self._s3_access_key = os.environ.get("S3_ACCESS_KEY", "") + self._s3_secret_key = os.environ.get("S3_SECRET_KEY", "") + self._s3_secure = os.environ.get("S3_SECURE", "false").lower() == "true" + self._image_results_file = os.environ.get("IMAGE_RESULTS_FILE", "image_search_results.csv") + self._query_eval_metrics_file = os.environ.get("QUERY_EVAL_METRICS_FILE", "query_eval_metrics.csv") + self._config_values_file = os.environ.get("CONFIG_VALUES_FILE", "config_values.csv") + + # Weaviate parameters + self._weaviate_host = os.environ.get("WEAVIATE_HOST", "127.0.0.1") + self._weaviate_port = os.environ.get("WEAVIATE_PORT", "8080") + self._weaviate_grpc_port = os.environ.get("WEAVIATE_GRPC_PORT", "50051") + self._collection_name = os.environ.get("COLLECTION_NAME", "MYBENCHMARK") + 
+ # Triton parameters + self._triton_host = os.environ.get("TRITON_HOST", "triton") + self._triton_port = os.environ.get("TRITON_PORT", "8001") + + # Workers parameters + self._workers = int(os.environ.get("WORKERS", 5)) + self._image_batch_size = int(os.environ.get("IMAGE_BATCH_SIZE", 25)) + self._query_batch_size = int(os.environ.get("QUERY_BATCH_SIZE", 5)) + + # Logging parameters + self._log_level = os.environ.get("LOG_LEVEL", "INFO").upper() + + # Query parameters + self.query_method = os.environ.get("QUERY_METHOD", "clip_hybrid_query") + self.target_vector = os.environ.get("TARGET_VECTOR", "clip") + self.response_limit = int(os.environ.get("RESPONSE_LIMIT", 50)) + self.advanced_query_parameters = { + "alpha": float(os.environ.get("QUERY_ALPHA", 0.4)), + "query_properties": ["caption"], # TODO: Update with your query properties + "autocut_jumps": int(os.environ.get("AUTOCUT_JUMPS", 0)), + "rerank_prop": os.environ.get("RERANK_PROP", "caption"), # TODO: Update with your rerank property + "clip_alpha": float(os.environ.get("CLIP_ALPHA", 0.7)), + } + \ No newline at end of file diff --git a/benchmarking/benchmarks/template/kubernetes/README.md b/benchmarking/benchmarks/template/kubernetes/README.md new file mode 100644 index 00000000..6f73aedd --- /dev/null +++ b/benchmarking/benchmarks/template/kubernetes/README.md @@ -0,0 +1,261 @@ +# Kubernetes Template for Benchmarks + +This directory contains Kubernetes/kustomize templates for benchmark deployments. + +## Quick Start + +1. Copy this directory to your benchmark's kubernetes folder: + ```bash + cd benchmarking/kubernetes + cp -r ../benchmarks/template/kubernetes MYBENCHMARK + cd MYBENCHMARK + ``` + +2. Replace `MYBENCHMARK` with your benchmark name in all files: + ```bash + # Run ONE of the following: macOS (BSD sed) needs the empty '' after -i; Linux (GNU sed) must omit it + find . -type f -exec sed -i '' 's/MYBENCHMARK/mybenchmark/g' {} + # macOS + find . -type f -exec sed -i 's/MYBENCHMARK/mybenchmark/g' {} + # Linux + ``` + +3. Update the image name in `kustomization.yaml` + +4.
Customize environment variables in `env.yaml` + +## Files Overview + +### `nrp-dev/` (Default) +Development environment overlay that: +- Sets the name prefix (`dev-MYBENCHMARK-`) +- References the base deployment +- Applies patches for environment variables +- Defines image replacement + +**Files:** +- `kustomization.yaml` - Main kustomize configuration +- `env.yaml` - Environment variables for dev environment + +**Required changes:** +- Replace `MYBENCHMARK` with your benchmark name in both files +- Update image name in `kustomization.yaml` +- Update S3_PREFIX in `env.yaml` if needed + +### `nrp-prod/` (Optional) +Production environment overlay that: +- Sets the name prefix (`prod-MYBENCHMARK-`) +- Extends the base overlay +- Patches service names for prod environment +- Can override S3 prefix for prod + +**Files:** +- `kustomization.yaml` - Main kustomize configuration for prod +- `env.yaml` - Environment variables for prod environment + +**Required changes:** +- Replace `MYBENCHMARK` with your benchmark name in both files +- Update image name and tag in `kustomization.yaml` +- Update S3_PREFIX in `env.yaml` for prod path + +## Step-by-Step Setup + +### 1. Copy Template + +```bash +cd benchmarking/kubernetes +cp -r ../benchmarks/template/kubernetes MYBENCHMARK +cd MYBENCHMARK +``` + +### 2. Replace Placeholders + +Replace `MYBENCHMARK` with your benchmark name (lowercase) in all files: + +```bash +# Example: Replace MYBENCHMARK with "mybenchmark" +find . -type f -name "*.yaml" -exec sed -i '' 's/MYBENCHMARK/mybenchmark/g' {} + +``` + +Also replace in both `nrp-dev/kustomization.yaml` and `nrp-prod/kustomization.yaml`: +- `namePrefix: dev-MYBENCHMARK-` → `namePrefix: dev-mybenchmark-` +- `namePrefix: prod-MYBENCHMARK-` → `namePrefix: prod-mybenchmark-` +- `benchmark: MYBENCHMARK` → `benchmark: mybenchmark` + +### 3. 
Update Image Name + +Edit both `nrp-dev/kustomization.yaml` and `nrp-prod/kustomization.yaml` and update the image name: + +```yaml +images: + - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-MYBENCHMARK-job + newTag: latest # Use specific tag for prod (e.g., pr-1) +``` + +### 4. Customize Environment Variables + +#### Dev Environment (`nrp-dev/env.yaml`) + +Update environment variables for dev environment: + +```yaml +env: + # Vector DB configuration (Weaviate) + - name: WEAVIATE_HOST + value: "dev-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) + - name: TRITON_HOST + value: "dev-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + - name: S3_PREFIX + value: "dev-metrics/MYBENCHMARK" + - name: LOG_LEVEL + value: "DEBUG" +``` + +#### Prod Environment (`nrp-prod/env.yaml`) + +Update environment variables for prod environment: + +```yaml +env: + # Vector DB configuration (Weaviate) - prod environment + - name: WEAVIATE_HOST + value: "prod-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) - prod environment + - name: TRITON_HOST + value: "prod-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + - name: S3_PREFIX + value: "prod-metrics/MYBENCHMARK" + - name: LOG_LEVEL + value: "INFO" +``` + +### 5. Update Makefile + +Ensure your benchmark's Makefile uses conditional logic to select the correct kustomize directory: + +```makefile +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif +``` + +## Testing + +After setting up, test the configuration: + +```bash +# Preview the generated manifests +kubectl kustomize . 
| less + +# Run benchmark (from benchmark directory) +make run +``` + +## Common Customizations + +### Different Namespace + +Update `kustomization.yaml`: + +```yaml +namespace: your-namespace +``` + +### Additional Environment Variables + +Add to `env.yaml`: + +```yaml +env: + - name: NEW_VARIABLE + value: "value" +``` + +### S3 Configuration + +S3 endpoint, bucket, and credentials are configured in the base. To override: + +```yaml +env: + - name: S3_PREFIX + value: "custom-prefix/benchmark-name" + - name: UPLOAD_TO_S3 + value: "true" # Enable S3 upload +``` + +## Integration with Makefile + +The Makefile should use conditional logic to select the correct kustomize directory: + +```makefile +ENV ?= dev +ifeq ($(ENV),prod) + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-prod +else + KUSTOMIZE_DIR := ../../kubernetes/MYBENCHMARK/nrp-dev +endif +``` + +Then use: + +```bash +make build # Build Docker image +make run # Deploy and run benchmark job (dev environment by default) +make run ENV=prod # Deploy and run using prod environment resources +make logs # View logs +make down # Removes deployment +``` + +## Environment Switching (Dev/Prod) + +Benchmarks can be deployed to use either **dev** or **prod** environment resources. The template includes both `nrp-dev/` and `nrp-prod/` overlays. + +>NOTE: By default, the benchmark will use the dev environment resources (`nrp-dev/`). 
+ +### Using Environment Overlays + +From the benchmark directory (e.g., `benchmarking/benchmarks/MYBENCHMARK/`): + +```bash +# Run using default (dev environment) resources +make run + +# Run using prod environment resources +make run ENV=prod +``` + +The `ENV` variable controls which kustomize overlay is used: +- `ENV=prod` → Uses `kubernetes/MYBENCHMARK/nrp-prod/` +- No `ENV` or `ENV=dev` → Uses `kubernetes/MYBENCHMARK/nrp-dev/` + +## Troubleshooting + +### Error: "no matches for kind" + +Make sure you're referencing the base correctly: +```yaml +resources: + - ../../base +``` + +### Error: "image not found" + +Check that the image name in `kustomization.yaml` matches your registry and image name. + +### Job not starting + +Check logs: +```bash +make logs +``` + +## See Also + +- `../../kubernetes/README.md` - Kubernetes overview +- `../../kubernetes/base/` - Base deployment definitions +- `../../../benchmarks/INQUIRE/` - Complete example diff --git a/benchmarking/benchmarks/template/kubernetes/nrp-dev/env.yaml b/benchmarking/benchmarks/template/kubernetes/nrp-dev/env.yaml new file mode 100644 index 00000000..cdad636b --- /dev/null +++ b/benchmarking/benchmarks/template/kubernetes/nrp-dev/env.yaml @@ -0,0 +1,24 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job +spec: + template: + spec: + containers: + - name: benchmark-job + env: + #TODO: Update with your benchmark-specific dev environment variables + # Vector DB configuration (Weaviate) + - name: WEAVIATE_HOST + value: "dev-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) + - name: TRITON_HOST + value: "dev-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + # TODO: Update S3_PREFIX with your benchmark-specific path + - name: S3_PREFIX + value: "dev-metrics/MYBENCHMARK" + - name: LOG_LEVEL + value: "DEBUG" + diff --git a/benchmarking/benchmarks/template/kubernetes/nrp-dev/kustomization.yaml
b/benchmarking/benchmarks/template/kubernetes/nrp-dev/kustomization.yaml new file mode 100644 index 00000000..3d26d0f7 --- /dev/null +++ b/benchmarking/benchmarks/template/kubernetes/nrp-dev/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +# TODO: Replace MYBENCHMARK with your benchmark name (lowercase) +namePrefix: dev-MYBENCHMARK- +commonLabels: + benchmark: MYBENCHMARK + +# This overlay references the base and applies MYBENCHMARK-specific patches +resources: + - ../../base + +patches: + # Patch job environment variables (MYBENCHMARK-specific + dev services) + - path: env.yaml + target: + kind: Job + labelSelector: "app=benchmark-job" + +images: + # TODO: Replace MYBENCHMARK with your benchmark name and update registry if needed + - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-MYBENCHMARK-job + newTag: latest + diff --git a/benchmarking/benchmarks/template/kubernetes/nrp-prod/env.yaml b/benchmarking/benchmarks/template/kubernetes/nrp-prod/env.yaml new file mode 100644 index 00000000..ff9e9bb6 --- /dev/null +++ b/benchmarking/benchmarks/template/kubernetes/nrp-prod/env.yaml @@ -0,0 +1,24 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job +spec: + template: + spec: + containers: + - name: benchmark-job + env: + #TODO: Update with your benchmark-specific prod environment variables + # Vector DB configuration (Weaviate) - prod environment + - name: WEAVIATE_HOST + value: "prod-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) - prod environment + - name: TRITON_HOST + value: "prod-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + # TODO: Update S3_PREFIX with your benchmark-specific prod path + - name: S3_PREFIX + value: "prod-metrics/MYBENCHMARK" + - name: LOG_LEVEL + value: "INFO" + diff --git 
a/benchmarking/benchmarks/template/kubernetes/nrp-prod/kustomization.yaml b/benchmarking/benchmarks/template/kubernetes/nrp-prod/kustomization.yaml new file mode 100644 index 00000000..8c9a9f45 --- /dev/null +++ b/benchmarking/benchmarks/template/kubernetes/nrp-prod/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +# TODO: Replace MYBENCHMARK with your benchmark name (lowercase) +namePrefix: prod-MYBENCHMARK- +commonLabels: + benchmark: MYBENCHMARK + +# This overlay references the base and applies MYBENCHMARK-specific patches +# configured for the prod environment (prod-weaviate, prod-triton, etc.) +resources: + - ../../base + +patches: + # Patch job environment variables (MYBENCHMARK-specific + prod services) + - path: env.yaml + target: + kind: Job + labelSelector: "app=benchmark-job" + +images: + # TODO: Replace MYBENCHMARK with your benchmark name and update registry if needed + - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-MYBENCHMARK-job + newTag: latest diff --git a/benchmarking/benchmarks/template/requirements.txt b/benchmarking/benchmarks/template/requirements.txt new file mode 100644 index 00000000..81896784 --- /dev/null +++ b/benchmarking/benchmarks/template/requirements.txt @@ -0,0 +1,17 @@ +# Python dependencies for benchmark +# Add your benchmark-specific dependencies here + +# Core benchmarking framework +# TODO: Install the module you need +imsearch_eval[weaviate] @ git+https://github.com/waggle-sensor/imsearch_eval.git@main + +# S3 upload support (MinIO) +minio>=7.2.0 + +# Add other dependencies as needed +# Example: +# datasets>=2.14.0 +# huggingface-hub>=0.16.0 +# Pillow>=10.0.0 +# python-dateutil>=2.8.0 + diff --git a/benchmarking/benchmarks/template/run_benchmark.template.py b/benchmarking/benchmarks/template/run_benchmark.template.py new file mode 100644 index 00000000..cd7ca88c --- /dev/null +++ 
b/benchmarking/benchmarks/template/run_benchmark.template.py @@ -0,0 +1,263 @@ +"""script to run MYBENCHMARK: load data and evaluate queries.""" + +import os +import logging +import time +import sys +from pathlib import Path +import tritonclient.grpc as TritonClient +from datasets import Dataset + +from imsearch_eval import BenchmarkEvaluator, VectorDBAdapter, BatchedIterator +from imsearch_eval.adapters import WeaviateAdapter, TritonModelProvider, WeaviateQuery +from benchmark_dataset import MyBenchmarkDataset # TODO: Import your BenchmarkDataset +# from data_loader import MyDataLoader # TODO: Import if you have a custom DataLoader +from config import MyConfig # TODO: Set a Config class for your benchmark +from concurrent.futures import ThreadPoolExecutor, as_completed + +config = MyConfig() + +def load_data(data_loader, vector_db: VectorDBAdapter, hf_dataset: Dataset): + """Load MYBENCHMARK dataset into vector database. + + TODO: Implement your data loading logic here. + See benchmarks/INQUIRE/run_benchmark.py for a complete example. 
+ + Args: + data_loader: Your DataLoader instance + vector_db: VectorDBAdapter instance + hf_dataset: HuggingFace Dataset containing the dataset to load + """ + try: + # TODO: Create collection schema + # logging.info("Creating collection schema...") + # schema_config = data_loader.get_schema_config() + # vector_db.create_collection(schema_config) + + # TODO: Process and insert data + # logging.info("Processing and inserting data...") + # results = data_loader.process_batch(batch_size=config._image_batch_size, dataset=hf_dataset, workers=config._workers) + # inserted = vector_db.insert_data(config._collection_name, results, batch_size=config._image_batch_size) + # logging.info(f"Inserted {inserted} items.") + + logging.info(f"Successfully loaded {config.mybenchmark_dataset} into Weaviate collection '{config._collection_name}'") + + except Exception as e: + logging.error(f"Error loading data: {e}") + vector_db.close() + raise + +def run_evaluation(evaluator: BenchmarkEvaluator, hf_dataset: Dataset): + """Run the MYBENCHMARK benchmark evaluation. 
+ + Args: + evaluator: BenchmarkEvaluator instance + hf_dataset: HuggingFace Dataset containing the dataset to evaluate + + Returns: + Tuple of (image_results, query_evaluation) DataFrames + """ + # Run evaluation + logging.info("Starting evaluation...") + try: + image_results, query_evaluation = evaluator.evaluate_queries( + query_batch_size=config._query_batch_size, + dataset=hf_dataset, + workers=config._workers + ) + except Exception as e: + logging.error(f"Error running evaluation: {e}") + evaluator.vector_db.close() + raise + + return image_results, query_evaluation + +def upload_to_s3(local_file_path: str, s3_key: str): + """Upload a file to S3-compatible storage using MinIO.""" + try: + from minio import Minio + from minio.error import S3Error + + if not config._s3_endpoint: + raise ValueError("S3_ENDPOINT environment variable must be set") + + # Parse endpoint (remove http:// or https:// if present) + endpoint = config._s3_endpoint.replace("http://", "").replace("https://", "") + + # Create MinIO client + client = Minio( + endpoint, + access_key=config._s3_access_key, + secret_key=config._s3_secret_key, + secure=config._s3_secure + ) + + # Upload file + logging.info(f"Uploading {local_file_path} to s3://{config._s3_bucket}/{s3_key}") + client.fput_object(config._s3_bucket, s3_key, local_file_path) + logging.info(f"Successfully uploaded to s3://{config._s3_bucket}/{s3_key}") + + except ImportError: + logging.error("minio is not installed. 
Install it with: pip install minio") + raise + except S3Error as e: + logging.error(f"Error uploading to S3: {e}") + raise + except Exception as e: + logging.error(f"Unexpected error uploading to S3: {e}") + raise + +def main(): + """Main entry point for running the complete benchmark.""" + + # Configure logging + logging.basicConfig( + level=getattr(logging, config._log_level, logging.INFO), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) + + # Step 0: load framework components + logging.info("=" * 80) + logging.info("Step 0: Setting up benchmark environment") + logging.info("=" * 80) + logging.info("Initializing Weaviate client...") + weaviate_client = WeaviateAdapter.init_client( # TODO: Update with your vector database client + host=config._weaviate_host, + port=config._weaviate_port, + grpc_port=config._weaviate_grpc_port + ) + + logging.info("Initializing Triton client...") + triton_client = TritonClient.InferenceServerClient(url=f"{config._triton_host}:{config._triton_port}") # TODO: Update with your model provider client + + # Create query method + query_method = WeaviateQuery( + weaviate_client=weaviate_client, + triton_client=triton_client + ) + + # Create adapters + logging.info("Creating adapters...") + vector_db = WeaviateAdapter( # TODO: Update with your vector database adapter + weaviate_client=weaviate_client, + triton_client=triton_client, + query_method=query_method + ) + + model_provider = TritonModelProvider(triton_client=triton_client) # TODO: Update with your model provider + + # Create benchmark dataset + logging.info("Creating benchmark dataset class...") + benchmark_dataset = MyBenchmarkDataset() # TODO: Use your BenchmarkDataset + hf_dataset = benchmark_dataset.load_as_dataset(split="test", sample_size=config.sample_size, seed=config.seed, token=config._hf_token) # TODO: Update parameters as needed + + # Create data loader + logging.info("Creating data loader...") + # TODO: Create your 
data loader if you have one + # data_loader = MyDataLoader( + # config=config, + # model_provider=model_provider, + # dataset=benchmark_dataset, + # ) + data_loader = None # TODO: Replace with your data loader or None if not using one + + # Create evaluator + logging.info("Creating benchmark evaluator...") + evaluator = BenchmarkEvaluator( + vector_db=vector_db, + model_provider=model_provider, + dataset=benchmark_dataset, + collection_name=config._collection_name, + limit=config.response_limit, + query_method=getattr(query_method, config.query_method), + query_parameters=config.advanced_query_parameters, + score_columns=["rerank_score", "clip_score"], # TODO: Adjust as needed + target_vector=config.target_vector + ) + + # Step 1: Load data + logging.info("=" * 80) + logging.info("Step 1: Loading data into vector database") + logging.info("=" * 80) + try: + load_data(data_loader, vector_db, hf_dataset) + logging.info("Data loading completed successfully.") + except Exception as e: + logging.error(f"Error loading data: {e}") + sys.exit(1) + + # Step 2: Run evaluation + logging.info("=" * 80) + logging.info("Step 2: Running benchmark evaluation") + logging.info("=" * 80) + try: + image_results, query_evaluation = run_evaluation(evaluator, hf_dataset) + logging.info("Evaluation completed successfully.") + except Exception as e: + logging.error(f"Error running evaluation: {e}") + sys.exit(1) + + # Step 3: Save results locally + logging.info("=" * 80) + logging.info("Step 3: Saving results") + logging.info("=" * 80) + + # Determine results directory (use /app/results if PVC is mounted, otherwise current directory) + results_dir = Path("/app/results" if os.path.exists("/app/results") else ".") + results_dir.mkdir(parents=True, exist_ok=True) + + image_results_path = results_dir / config._image_results_file + query_evaluation_path = results_dir / config._query_eval_metrics_file + config_csv_path = results_dir / config._config_values_file + + 
image_results.to_csv(image_results_path, index=False) + query_evaluation.to_csv(query_evaluation_path, index=False) + + config_csv_str = config.to_csv() + with open(config_csv_path, "w") as f: + f.write(config_csv_str) + + logging.info(f"Results saved locally to:") + logging.info(f" - {image_results_path}") + logging.info(f" - {query_evaluation_path}") + logging.info(f" - {config_csv_path}") + + # Step 4: Upload to S3 if enabled + if config._upload_to_s3: + if not config._s3_bucket: + logging.warning("UPLOAD_TO_S3 is true but S3_BUCKET is not set. Skipping S3 upload.") + elif not config._s3_endpoint: + logging.warning("UPLOAD_TO_S3 is true but S3_ENDPOINT is not set. Skipping S3 upload.") + elif not config._s3_access_key or not config._s3_secret_key: + logging.warning("UPLOAD_TO_S3 is true but S3 credentials are not set. Skipping S3 upload.") + else: + logging.info("=" * 80) + logging.info("Step 4: Uploading results to S3") + logging.info("=" * 80) + try: + # Generate S3 keys with timestamp + timestamp = time.strftime("%Y%m%dT%H%M%S") + s3_key_image = f"{config._s3_prefix}/{timestamp}/{config._image_results_file}" + s3_key_query = f"{config._s3_prefix}/{timestamp}/{config._query_eval_metrics_file}" + s3_key_config = f"{config._s3_prefix}/{timestamp}/{config._config_values_file}" + + upload_to_s3(str(image_results_path), s3_key_image) + upload_to_s3(str(query_evaluation_path), s3_key_query) + upload_to_s3(str(config_csv_path), s3_key_config) + + logging.info("S3 upload completed successfully.") + except Exception as e: + logging.error(f"Error uploading to S3: {e}") + logging.warning("Continuing despite S3 upload error...") + else: + logging.info("S3 upload is disabled (UPLOAD_TO_S3=false or not set).") + + vector_db.close() + logging.info("=" * 80) + logging.info("Benchmark run completed successfully!") + logging.info("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/benchmarking/kubernetes/INQUIRE/README.md b/benchmarking/kubernetes/INQUIRE/README.md 
new file mode 100644 index 00000000..363c5c62 --- /dev/null +++ b/benchmarking/kubernetes/INQUIRE/README.md @@ -0,0 +1,77 @@ +# INQUIRE Benchmark Kubernetes Deployment + +Kubernetes deployment for the INQUIRE benchmark using kustomize. + +## Structure + +This overlay extends `../base/` with INQUIRE-specific configuration: + +- **env.yaml**: Environment variables for benchmark job + +## Usage + +### Prerequisites + +- Kubernetes cluster with access to Weaviate and Triton services +- Images built and pushed to registry +- `kubectl` configured with appropriate context + +### Run Benchmark + +```bash +cd benchmarking/benchmarks/INQUIRE +make run # Deploys and runs the benchmark job (dev environment by default) +make run ENV=prod # Deploys and runs using prod environment resources +``` + +Monitor with: +```bash +make logs +``` + +### Status + +```bash +make status +``` + +### Cleanup + +```bash +make down # Remove deployments (dev environment) +make down ENV=prod # Remove prod deployments +``` + +## Environment Variables + +### Job Configuration + +The following environment variables are set in `nrp-dev/env.yaml` and `nrp-prod/env.yaml`: + +**Vector DB Configuration:** +- `WEAVIATE_HOST`: Weaviate service host (dev: `dev-weaviate.sage.svc.cluster.local`, prod: `prod-weaviate.sage.svc.cluster.local`) + +**Inference Server Configuration:** +- `TRITON_HOST`: Triton service host (dev: `dev-triton.sage.svc.cluster.local`, prod: `prod-triton.sage.svc.cluster.local`) + +**Benchmark-Specific Configuration:** +- `INQUIRE_DATASET`: HuggingFace dataset name (default: `sagecontinuum/INQUIRE-Benchmark-small`) +- `COLLECTION_NAME`: Weaviate collection name (default: `INQUIRE`) +- `QUERY_METHOD`: Query method to use (default: `clip_hybrid_query`) +- `QUERY_BATCH_SIZE`: Batch size for parallel queries +- `IMAGE_BATCH_SIZE`: Batch size for processing images +- `SAMPLE_SIZE`: Number of samples (0 = all) +- `WORKERS`: Number of parallel workers +- `LOG_LEVEL`: Logging level (dev: `DEBUG`, 
prod: `INFO`) + +**S3 Configuration:** +- `S3_PREFIX`: S3 prefix for uploaded results (dev: `dev-metrics/inquire`, prod: `prod-metrics/inquire`) + +Additional environment variables (S3 endpoint, bucket, credentials, HuggingFace token) are configured in the base Kubernetes resources and loaded from secrets. + +## Image Registry + +Images should be built and pushed to: +- `gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-inquire-job:latest` + +Update the registry in `kustomization.yaml` if using a different registry. diff --git a/benchmarking/kubernetes/INQUIRE/nrp-dev/env.yaml b/benchmarking/kubernetes/INQUIRE/nrp-dev/env.yaml new file mode 100644 index 00000000..3d122882 --- /dev/null +++ b/benchmarking/kubernetes/INQUIRE/nrp-dev/env.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job +spec: + template: + spec: + containers: + - name: benchmark-job + env: + # Vector DB configuration (Weaviate) + - name: WEAVIATE_HOST + value: "dev-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) + - name: TRITON_HOST + value: "dev-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + - name: S3_PREFIX + value: "dev-metrics/inquire" + - name: LOG_LEVEL + value: "DEBUG" diff --git a/benchmarking/kubernetes/INQUIRE/nrp-dev/kustomization.yaml b/benchmarking/kubernetes/INQUIRE/nrp-dev/kustomization.yaml new file mode 100644 index 00000000..aeffaab6 --- /dev/null +++ b/benchmarking/kubernetes/INQUIRE/nrp-dev/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +namePrefix: dev-inquire- +commonLabels: + benchmark: inquire + +# This overlay references the base and applies INQUIRE-specific patches +resources: + - ../../base + +patches: + # Patch job environment variables (INQUIRE-specific + dev services) + - path: env.yaml + target: + kind: Job + labelSelector: "app=benchmark-job" + +images: 
+ - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-inquire-job + newTag: latest diff --git a/benchmarking/kubernetes/INQUIRE/nrp-prod/env.yaml b/benchmarking/kubernetes/INQUIRE/nrp-prod/env.yaml new file mode 100644 index 00000000..e5ed691d --- /dev/null +++ b/benchmarking/kubernetes/INQUIRE/nrp-prod/env.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job +spec: + template: + spec: + containers: + - name: benchmark-job + env: + # Vector DB configuration (Weaviate) - prod environment + - name: WEAVIATE_HOST + value: "prod-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) - prod environment + - name: TRITON_HOST + value: "prod-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + - name: S3_PREFIX + value: "prod-metrics/inquire" + - name: LOG_LEVEL + value: "INFO" diff --git a/benchmarking/kubernetes/INQUIRE/nrp-prod/kustomization.yaml b/benchmarking/kubernetes/INQUIRE/nrp-prod/kustomization.yaml new file mode 100644 index 00000000..42420f4f --- /dev/null +++ b/benchmarking/kubernetes/INQUIRE/nrp-prod/kustomization.yaml @@ -0,0 +1,24 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +namePrefix: prod-inquire- +commonLabels: + benchmark: inquire + +# This overlay references the base and applies INQUIRE-specific patches +# configured for the prod environment (prod-weaviate, prod-triton, etc.) 
+resources: + - ../../base + +patches: + # Patch job environment variables (INQUIRE-specific + prod services) + - path: env.yaml + target: + kind: Job + labelSelector: "app=benchmark-job" + +images: + - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-inquire-job + newTag: latest diff --git a/benchmarking/kubernetes/README.md b/benchmarking/kubernetes/README.md new file mode 100644 index 00000000..566ba494 --- /dev/null +++ b/benchmarking/kubernetes/README.md @@ -0,0 +1,246 @@ +# Benchmarking Kubernetes Deployments + +Kubernetes deployments for benchmarking using kustomize for configuration management. + +## Structure + +``` +benchmarking/kubernetes/ +├── base/ # Base kustomization (shared across all benchmarks) +│ ├── kustomization.yaml +│ ├── benchmark-job.yaml # Combined job (loads data and evaluates) +│ └── ._*-secret.yaml # Git-ignored S3 and HuggingFace secrets (created from the templates) +│ +└── INQUIRE/ # INQUIRE benchmark overlay + ├── nrp-dev/ # Dev environment overlay + │ ├── kustomization.yaml + │ └── env.yaml + └── nrp-prod/ # Prod environment overlay + ├── kustomization.yaml + └── env.yaml +``` + +## Base Components + +The `base/` directory contains generic resources that can be reused by any benchmark: + +- **benchmark-job.yaml**: Job that runs the combined benchmark script (loads data and evaluates) +- **._s3-secret.yaml**: Secret for S3 credentials (access key and secret key) +- **._huggingface-secret.yaml**: Secret for HuggingFace token (for accessing private datasets) + +The job is **vector database and inference server agnostic**: +- Includes resource requests and limits +- Includes base environment variables (PYTHONUNBUFFERED, PYTHONPATH) +- Includes S3 configuration (endpoint, bucket, secure flag) with defaults +- S3 credentials are loaded from the `s3-secret` secret +- HuggingFace token is loaded from the `huggingface-secret` secret (for accessing private datasets) +- Vector DB and inference server environment 
variables should be added via patches in benchmark-specific overlays (env.yaml) + +## Creating a New Benchmark Overlay + +To create a new benchmark (e.g., `MYBENCHMARK`): + +1. **Create overlay directories**: +```bash +mkdir -p benchmarking/kubernetes/MYBENCHMARK/nrp-dev benchmarking/kubernetes/MYBENCHMARK/nrp-prod +``` + +2. **Create nrp-dev/kustomization.yaml**: +```yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +namePrefix: dev-mybenchmark- +commonLabels: + benchmark: mybenchmark + +resources: + - ../../base + +patches: + - path: env.yaml + target: + kind: Job + labelSelector: "app=benchmark-job" + +images: + - name: PLACEHOLDER_BENCHMARK_JOB_IMAGE + newName: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-mybenchmark-job + newTag: latest +``` + +3. **Create nrp-dev/env.yaml**: +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job +spec: + template: + spec: + containers: + - name: benchmark-job + env: + # Vector DB configuration (Weaviate) + - name: WEAVIATE_HOST + value: "dev-weaviate.sage.svc.cluster.local" + # Inference server configuration (Triton) + - name: TRITON_HOST + value: "dev-triton.sage.svc.cluster.local" + # S3 upload configuration (override base defaults for this benchmark) + - name: S3_PREFIX + value: "dev-metrics/mybenchmark" + - name: LOG_LEVEL + value: "DEBUG" +``` + +4. **Create nrp-prod/** similarly (`kustomization.yaml` and `env.yaml`), using the `prod-mybenchmark-` name prefix, prod service names, and the prod S3 prefix. + +## Environment Switching (Dev/Prod) + +Benchmarks use separate overlays for dev and prod environments: +- **nrp-dev/**: Default development environment overlay +- **nrp-prod/**: Production environment overlay + +> NOTE: By default, the benchmark will use the dev environment resources (`nrp-dev/`). 
+ +### Using Environment Overlays + +From the benchmark directory (e.g., `benchmarking/benchmarks/INQUIRE/`): + +```bash +# Run using prod environment resources +make run ENV=prod + +# Run using default (dev environment) resources +make run +``` + +The `ENV` variable controls which kustomize overlay is used: +- `ENV=prod` → Uses `kubernetes/INQUIRE/nrp-prod/` +- No `ENV` or `ENV=dev` → Uses `kubernetes/INQUIRE/nrp-dev/` + +## Usage + +### Prerequisites + +- `kubectl` configured with access to cluster +- `kustomize` (or `kubectl` with kustomize support) +- Images built and pushed to registry +- S3 and HuggingFace secret files created in `base/` (both are listed as resources in the base kustomization) + +### Run Benchmark + +```bash +cd benchmarking/benchmarks/INQUIRE +make run # Deploy and run using dev environment (default) +make run ENV=prod # Deploy and run using prod environment resources +``` + +### Monitor + +```bash +make status +make logs +``` + +### Cleanup + +```bash +make down # Remove deployments (dev environment) +make down ENV=prod # Remove prod deployments +``` + +## Environment Variables + +Benchmark-specific environment variables are set via patches in each overlay: + +- **Job** (`env.yaml`): + - Vector DB connection (e.g., WEAVIATE_HOST, WEAVIATE_PORT) + - Inference server connection (e.g., TRITON_HOST, TRITON_PORT) + - Dataset name, collection name, query method, batch sizes + - S3 prefix override (if different from base default) + +### Base Environment Variables + +The base `benchmark-job.yaml` includes: +- `S3_ENDPOINT`: S3 endpoint URL (override in env.yaml if needed) +- `S3_BUCKET`: S3 bucket name (override in env.yaml if needed) +- `S3_SECURE`: Use TLS for S3 (default: "false", because the in-cluster endpoint does not use SSL) +- `S3_PREFIX`: S3 prefix for uploaded files (default: "dev-metrics") +- `UPLOAD_TO_S3`: Enable S3 upload (default: "true") + +Secrets are loaded from Kubernetes secrets: +- **S3 credentials** from `s3-secret`: + - `S3_ACCESS_KEY`: From secret + - `S3_SECRET_KEY`: From secret +- **HuggingFace token** from 
`huggingface-secret`: + - `HF_TOKEN`: From secret (for accessing private datasets) + +## Secrets Configuration + +### Setting Up S3 Secret + +Create `kubernetes/base/._s3-secret.yaml` using the template file: +```bash +cp benchmarking/kubernetes/base/s3-secret.template.yaml benchmarking/kubernetes/base/._s3-secret.yaml +``` + +To generate base64 values: +```bash +echo -n "your-access-key" | base64 +echo -n "your-secret-key" | base64 +``` + +### Setting Up HuggingFace Secret + +Create `kubernetes/base/._huggingface-secret.yaml` using the template file: +```bash +cp benchmarking/kubernetes/base/huggingface-secret.template.yaml benchmarking/kubernetes/base/._huggingface-secret.yaml +``` + +To generate base64 value for HuggingFace token: +```bash +echo -n "your-huggingface-token" | base64 +``` + +> **Important:** +> All secret files you actually use must be named with leading `._` per `.gitignore` and not checked into version control! Only commit the `*.template.yaml` files. + +### Overriding S3 Configuration + +To override base S3 settings for a specific benchmark, add to `env.yaml`: + +```yaml +- name: S3_ENDPOINT + value: "your-custom-endpoint:9000" +- name: S3_BUCKET + value: "your-bucket" +- name: S3_PREFIX + value: "custom-prefix/benchmark-name" +- name: UPLOAD_TO_S3 + value: "true" +``` + +## Image Registry + +Images should be built and pushed to: +- `gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/benchmark-{name}-job:latest` + +Update the registry in `kustomization.yaml` if using a different registry. + +## Local Development + +For local development, use port-forwarding: + +```bash +make run-local +``` + +This will: +1. Start port-forwarding for Weaviate and Triton services +2. Run the benchmark locally +3. Stop port-forwarding when done + +Results are saved locally in the current directory. 
diff --git a/benchmarking/kubernetes/base/benchmark-job.yaml b/benchmarking/kubernetes/base/benchmark-job.yaml new file mode 100644 index 00000000..5dff7a66 --- /dev/null +++ b/benchmarking/kubernetes/base/benchmark-job.yaml @@ -0,0 +1,62 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: benchmark-job + labels: + app: benchmark-job +spec: + # Keep finished jobs for debugging; they are deleted automatically once the TTL below expires + ttlSecondsAfterFinished: 86400 # 24 hours + # Retry failed pods up to 3 times before the Job is marked as failed + backoffLimit: 3 + template: + metadata: + labels: + app: benchmark-job + spec: + restartPolicy: Never + containers: + - name: benchmark-job + image: PLACEHOLDER_BENCHMARK_JOB_IMAGE:latest + # Note: Vector DB and inference server environment variables should be + # added via patches in benchmark-specific overlays (e.g., env.yaml) + command: ["python", "run_benchmark.py"] + env: + - name: PYTHONUNBUFFERED + value: "1" + - name: PYTHONPATH + value: "/app" + # S3 upload configuration (base defaults) + - name: S3_ENDPOINT + value: "http://rook-ceph-rgw-nautiluss3.rook" # in-cluster endpoint of the S3 bucket + - name: S3_BUCKET + value: "sage_imsearch" + - name: S3_SECURE + value: "false" # inside cluster endpoint doesn't use SSL + - name: S3_PREFIX + value: "dev-metrics" # Override in benchmark-specific overlays + - name: UPLOAD_TO_S3 + value: "true" + envFrom: + - secretRef: + name: s3-secret + - secretRef: + name: huggingface-secret + resources: + limits: + cpu: 6 # maximum allowed ratio of 1.2x of requests (5 * 1.2 = 6) + memory: 14Gi # maximum allowed ratio of 1.2x of requests (12Gi * 1.2 = 14.4Gi, rounded down) + requests: + cpu: 5 + memory: 12Gi + volumeMounts: + - name: huggingface-cache + mountPath: /root/.cache/huggingface + - name: results + mountPath: /app/results + volumes: + - name: huggingface-cache + emptyDir: {} + - name: results + emptyDir: {} + diff --git a/benchmarking/kubernetes/base/huggingface-secret.template.yaml 
b/benchmarking/kubernetes/base/huggingface-secret.template.yaml new file mode 100644 index 00000000..c4f006b3 --- /dev/null +++ b/benchmarking/kubernetes/base/huggingface-secret.template.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: huggingface-secret + labels: + app: benchmark +type: Opaque +data: + # Base64 encoded Hugging Face token + # To generate: echo -n "your_hf_token_here" | base64 + HF_TOKEN: "" # Replace with base64 encoded token diff --git a/benchmarking/kubernetes/base/kustomization.yaml b/benchmarking/kubernetes/base/kustomization.yaml new file mode 100644 index 00000000..27a478dc --- /dev/null +++ b/benchmarking/kubernetes/base/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sage + +resources: + - benchmark-job.yaml + - ._s3-secret.yaml + - ._huggingface-secret.yaml + +# Note: Vector DB and inference server configuration should be added +# in benchmark-specific overlays via environment variable patches + diff --git a/benchmarking/kubernetes/base/s3-secret.template.yaml b/benchmarking/kubernetes/base/s3-secret.template.yaml new file mode 100644 index 00000000..bb0fd2a8 --- /dev/null +++ b/benchmarking/kubernetes/base/s3-secret.template.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Secret +metadata: + name: s3-secret + labels: + app: benchmark +type: Opaque +data: + # Base64 encoded S3_ACCESS_KEY and S3_SECRET_KEY + # To generate: echo -n "your_access_key_here" | base64 + # To generate: echo -n "your_secret_key_here" | base64 + S3_ACCESS_KEY: "" # Replace with base64 encoded access key + S3_SECRET_KEY: "" # Replace with base64 encoded secret key + diff --git a/kubernetes/README.md b/kubernetes/README.md index 41f33899..810e09ab 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -1 +1,76 @@ -This folder contains the nautilous Kubernetes deployment manifests for the Sage Hybrid Search system. 
\ No newline at end of file +# Sage NRP Image Search - Kubernetes Deployment + +This folder contains the Kubernetes manifests for deploying the `sage-nrp-image-search` stack on Nautilus or other Kubernetes clusters. It provides all the core resources and configuration required for running the hybrid image search service, but **does not** include benchmark configs or benchmark jobs. + +## Contents + +- `base/`: Base kustomize configuration and manifests for core deployment +- `base/kustomization.yaml`: Main kustomization file listing services, secrets, and configMaps +- `base/*.yaml`: Service, Deployment, Job, and Secret manifests for all core components (Weaviate, Triton, Reranker, Gradio UI, etc.) + +## Deployment Overview + +The resources here stand up the core application stack: + +- **Weaviate** (vector database) +- **Triton** (inference server) +- **Reranker Transformers** (optional re-ranking model) +- **Gradio UI** +- **Support jobs** for dataset management, storage, and configuration +- **Secrets** for Hugging Face, S3, and Sage user credentials + +All roles and deployments are configured using kustomize to simplify environment management and overlays. + +## Setting Up Secrets + +Before deploying, you must create the necessary secret manifest files in `base/`. Templates are provided for all required secrets: + +### 1. HuggingFace Secret + +Copy the template and fill in your HuggingFace token (base64-encoded): + +```bash +cp base/huggingface-secret.template.yaml base/._huggingface-secret.yaml +``` + +### 2. Sage User Secret + +Copy the Sage user secret template and add your Sage account name and password: + +```bash +cp base/sage-user-secret.template.yaml base/._sage-user-secret.yaml +``` + +- Base64-encode the username and password values (e.g., `echo -n "your-value" | base64`). +- Update the `SAGE_USER` and `SAGE_PASS` fields. + +> **Important:** +> All secret files you actually use must be named with leading `._` per `.gitignore` and not checked into version control! 
Only commit the `*.template.yaml` files. + +## Deploying + +> Prerequisites: +> - `kubectl` configured with cluster access +> - `kustomize` + +To deploy the base stack: + +```bash +cd kubernetes/base +kustomize build . | kubectl apply -f - +``` + +Or, using kubectl (if it supports native kustomize): + +```bash +kubectl apply -k base/ +``` + +## Managing and Customizing + +You can extend or patch this `base/` deployment using kustomize overlays for different environments, resource limits, or development setups. See included overlays (such as those in benchmark subfolders) for example usage. + +## Note + +- These resources do **not** include benchmark job definitions. For benchmarking, see `benchmarking/kubernetes/`. +- Update secret files as needed to match your deployment’s authentication requirements. \ No newline at end of file diff --git a/kubernetes/base/gradio-ui.yaml b/kubernetes/base/gradio-ui.yaml index 23a0a4d2..578d9f0d 100644 --- a/kubernetes/base/gradio-ui.yaml +++ b/kubernetes/base/gradio-ui.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: gradio-ui - image: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/gradio-ui:latest + image: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/gradio-ui:latest env: - name: PYTHONUNBUFFERED value: "1" diff --git a/kubernetes/base/huggingface-secret.yaml b/kubernetes/base/huggingface-secret.template.yaml similarity index 100% rename from kubernetes/base/huggingface-secret.yaml rename to kubernetes/base/huggingface-secret.template.yaml diff --git a/kubernetes/base/kustomization.yaml b/kubernetes/base/kustomization.yaml index d82dabb4..7d432a40 100644 --- a/kubernetes/base/kustomization.yaml +++ b/kubernetes/base/kustomization.yaml @@ -5,8 +5,8 @@ namespace: sage resources: - reranker-transformers.yaml - weaviate.yaml - - huggingface-secret.yaml - - sage-user-secret.yaml + - ._huggingface-secret.yaml + - ._sage-user-secret.yaml - triton.yaml - gradio-ui.yaml - weavmanage.yaml diff --git 
a/kubernetes/base/sage-user-secret.yaml b/kubernetes/base/sage-user-secret.template.yaml similarity index 100% rename from kubernetes/base/sage-user-secret.yaml rename to kubernetes/base/sage-user-secret.template.yaml diff --git a/kubernetes/base/triton.yaml b/kubernetes/base/triton.yaml index 13a32bb7..4e99f5f8 100644 --- a/kubernetes/base/triton.yaml +++ b/kubernetes/base/triton.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: triton - image: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/triton:latest + image: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/triton:latest env: - name: TRANSFORMERS_VERBOSITY value: "info" diff --git a/kubernetes/base/weavloader.yaml b/kubernetes/base/weavloader.yaml index 58deb69b..2f82028a 100644 --- a/kubernetes/base/weavloader.yaml +++ b/kubernetes/base/weavloader.yaml @@ -20,7 +20,7 @@ spec: spec: containers: - name: weavloader - image: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavloader:latest + image: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavloader:latest ports: - containerPort: 8080 name: metrics diff --git a/kubernetes/base/weavmanage.yaml b/kubernetes/base/weavmanage.yaml index 62cfeca7..a0f83fd8 100644 --- a/kubernetes/base/weavmanage.yaml +++ b/kubernetes/base/weavmanage.yaml @@ -14,7 +14,7 @@ spec: restartPolicy: Never containers: - name: weavmanage - image: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavmanage:latest + image: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavmanage:latest env: - name: WEAVIATE_HOST value: "$(WEAVIATE_SERVICE)" diff --git a/kubernetes/nrp-dev/kustomization.yaml b/kubernetes/nrp-dev/kustomization.yaml index c4bb777e..d311a5c2 100644 --- a/kubernetes/nrp-dev/kustomization.yaml +++ b/kubernetes/nrp-dev/kustomization.yaml @@ -34,11 +34,11 @@ images: newTag: cross-encoder-ms-marco-MiniLM-L-6-v2-latest - name: semitechnologies/weaviate newTag: 1.32.0 - - name: 
gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/triton + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/triton newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/gradio-ui + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/gradio-ui newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavmanage + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavmanage newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavloader + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavloader newTag: latest \ No newline at end of file diff --git a/kubernetes/nrp-prod/kustomization.yaml b/kubernetes/nrp-prod/kustomization.yaml index c984ddf5..5d4f74b3 100644 --- a/kubernetes/nrp-prod/kustomization.yaml +++ b/kubernetes/nrp-prod/kustomization.yaml @@ -34,11 +34,11 @@ images: newTag: cross-encoder-ms-marco-MiniLM-L-6-v2-latest - name: semitechnologies/weaviate newTag: 1.32.0 - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/triton + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/triton newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/gradio-ui + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/gradio-ui newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavmanage + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavmanage newTag: latest - - name: gitlab-registry.nrp-nautilus.io/ndp/sage/hybrid-search/weavloader + - name: gitlab-registry.nrp-nautilus.io/ndp/sage/nrp-image-search/weavloader newTag: latest \ No newline at end of file diff --git a/weavloader/README.md b/weavloader/README.md index 473fb06e..3f29b519 100644 --- a/weavloader/README.md +++ b/weavloader/README.md @@ -518,3 +518,11 @@ Flower metrics are automatically integrated into the unified Prometheus endpoint - **Metric Prefix**: All 
Flower metrics are prefixed with `weavloader_` - **Unified Endpoint**: Available at `/metrics` alongside custom metrics - **Real-time Updates**: Metrics update in real-time with task execution + +## **References** +- [Celery Documentation](https://docs.celeryq.dev/en/stable/) +- [Flower Documentation](https://flower.readthedocs.io/en/latest/) + - [Prometheus Integration](https://flower.readthedocs.io/en/latest/prometheus-integration.html) +- [Prometheus Documentation](https://prometheus.io/docs/introduction/overview/) + - [Multiprocess Mode](https://prometheus.github.io/client_python/multiprocess/) +- [Sage Documentation](https://sagecontinuum.org/docs/about/overview) \ No newline at end of file