diff --git a/.gitignore b/.gitignore index 19fdc46..071a69f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,18 @@ *~ *.pyc build/ + +# Nix +result +result-* +.direnv/ +.envrc + +# Docker +.docker/ +docker-cache/ +tmp-*/ + +# Temporary files +*.tmp +*.bak diff --git a/BUILD.md b/BUILD.md new file mode 100644 index 0000000..e65757f --- /dev/null +++ b/BUILD.md @@ -0,0 +1,82 @@ +# Pastec Build System + +This document explains the build system for Pastec, which uses Nix and Docker to create optimized builds with efficient caching. + +## Overview + +The build system is designed to optimize Docker layer caching by separating the build process into two main parts: + +1. Building dependencies (which change infrequently) +2. Building the Pastec application (which changes more frequently) + +This separation allows Docker to reuse the cached dependencies layer when only the Pastec code changes, significantly reducing build times for iterative development. + +## Key Components + +### 1. Dependencies Flake (`deps-flake.nix`) + +This flake is used in the Docker build process to build all the optimized dependencies that Pastec requires: + +- OpenCV +- libmicrohttpd +- curl +- jsoncpp +- mimalloc +- Build tools (cmake, clang, lld) + +These dependencies are built with optimization flags and stored in a separate Docker layer. + +### 2. Main Flake (`flake.nix`) + +The main flake contains the full build configuration for Pastec: + +- For local builds, it defines and builds all dependencies directly +- In the Docker build, it uses the pre-built dependencies from the previous stage +- Builds the Pastec application code + +### 3. Multi-stage Dockerfile + +The Dockerfile uses a three-stage build process: + +1. **Dependencies Stage**: + - Copies `deps-flake.nix` to a temporary directory and renames it to `flake.nix` + - Builds only the dependencies + - Stores the dependencies in the Nix store + +2. 
**Application Stage**: + - Copies the pre-built dependencies from the previous stage + - Builds Pastec using the main flake + - Reuses the dependencies from the previous stage + +3. **Final Stage**: + - Creates a minimal runtime image with just the necessary files + - Includes only the built application and its runtime dependencies + +## Benefits + +This approach provides several benefits: + +1. **Faster Iterative Builds**: When only the Pastec code changes, Docker reuses the cached dependencies layer, significantly reducing build time. + +2. **Optimized Dependencies**: All dependencies are built with optimization flags for maximum performance. + +3. **Minimal Final Image**: The final image contains only the necessary files, keeping the image size small. + +## How Docker Caching Works + +Docker's layer caching works based on the Dockerfile instructions: + +1. If the instructions for a layer haven't changed, Docker reuses the cached layer. +2. If a layer changes, that layer and all subsequent layers are rebuilt. + +By separating dependencies and application code into different stages, we ensure that changes to the application code don't invalidate the dependencies cache. + +## Example Build Process + +When you run `docker build`: + +1. If `deps-flake.nix` hasn't changed, Docker reuses the cached dependencies layer. +2. If only the Pastec code has changed, Docker rebuilds only the application and final stages. +3. If `deps-flake.nix` has changed, Docker rebuilds all stages. + +This results in much faster builds during normal development, where dependencies rarely change but application code changes frequently. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 70bf499..ab3ab06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,57 +1,205 @@ -cmake_minimum_required(VERSION 2.6) -project(Pastec) - -include_directories(include include/orb) - -set(SOURCES src/main.cpp - src/imagereranker.cpp - src/imagererankerransac.cpp - src/imageloader.cpp - src/httpserver.cpp - src/jsoncpp.cpp - src/requesthandler.cpp - src/imagedownloader.cpp - src/orb/orbfeatureextractor.cpp - src/orb/orbindex.cpp - src/orb/orbsearcher.cpp - src/orb/orbwordindex.cpp) - -set(HEADERS include/thread.h - include/messages.h - include/hit.h - include/searchResult.h - include/imagereranker.h - include/backwardindexreaderaccess.h - include/imageloader.h - include/index.h - include/featureextractor.h - include/imagedownloader.h - include/json/json-forwards.h - include/json/json.h - include/orb/orbfeatureextractor.h - include/orb/orbindex.h - include/orb/orbsearcher.h - include/orb/orbwordindex.h - include/searcher.h - include/httpserver.h - include/requesthandler.h) +cmake_minimum_required(VERSION 3.5) +project(Pastec VERSION 1.0.0) + +# Include FetchContent for SimSIMD +include(FetchContent) + +# Fetch SimSIMD +FetchContent_Declare( + simsimd + GIT_REPOSITORY https://github.com/ashvardanian/simsimd.git + GIT_TAG v6.4.0 + GIT_SHALLOW TRUE +) +FetchContent_MakeAvailable(simsimd) + +# Fix simsimd include directories to use generator expressions for proper installation +if(TARGET simsimd) + get_target_property(SIMSIMD_INCLUDE_DIRS simsimd INTERFACE_INCLUDE_DIRECTORIES) + if(SIMSIMD_INCLUDE_DIRS) + set_target_properties(simsimd PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "$;$") + endif() +endif() + +# Include directories +include_directories(include include/orb include/pastec ${Boost_INCLUDE_DIRS}) + +# Define library sources (core functionality) +set(PASTEC_LIB_SOURCES + src/imagereranker.cpp + src/imagererankerransac.cpp + src/imageloader.cpp + src/jsoncpp.cpp + src/imagedownloader.cpp + src/batchprocessor.cpp 
+ src/orb/orbfeatureextractor.cpp + src/orb/orbindex.cpp + src/orb/orbsearcher.cpp + src/orb/orbwordindex.cpp) + +# Define executable sources (HTTP server and main.cpp) +set(PASTEC_EXE_SOURCES + src/httpserver.cpp + src/requesthandler.cpp + src/main.cpp) + +# Define headers +set(PASTEC_HEADERS + include/thread.h + include/messages.h + include/hit.h + include/searchResult.h + include/imagereranker.h + include/backwardindexreaderaccess.h + include/imageloader.h + include/index.h + include/featureextractor.h + include/imagedownloader.h + include/batchprocessor.h + include/json/json-forwards.h + include/json/json.h + include/orb/orbfeatureextractor.h + include/orb/orbindex.h + include/orb/orbsearcher.h + include/orb/orbwordindex.h + include/searcher.h + include/httpserver.h + include/requesthandler.h + include/pastec/pastec.h + include/pastec/core/index.h + include/pastec/core/searcher.h + include/pastec/core/featureextractor.h + include/pastec/core/imageloader.h + include/pastec/core/hit.h + include/pastec/core/searchResult.h + include/pastec/core/imagedownloader.h + include/pastec/core/orb/orbindex.h + include/pastec/core/orb/orbsearcher.h + include/pastec/core/orb/orbfeatureextractor.h + include/pastec/core/orb/orbwordindex.h) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") +# Set default build type to Release if not specified if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." 
FORCE) endif(NOT CMAKE_BUILD_TYPE) +# Enhanced optimization flags for Release build +if(CMAKE_BUILD_TYPE STREQUAL "Release") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -flto -DNDEBUG -march=native -ffunction-sections -fdata-sections") + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -flto -Wl,--gc-sections") +endif() + +# Options to control what gets built +# Library is always built, not optional +set(BUILD_PASTEC_LIB ON CACHE BOOL "Build Pastec as a library" FORCE) +option(BUILD_PASTEC_EXE "Build Pastec executable with HTTP server" ON) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(INSTALL_TARGETS "Install targets and generate package files" ON) + +# Find required packages find_package(Threads REQUIRED) find_package(OpenCV REQUIRED) -find_package(microhttpd REQUIRED) find_package(CURL REQUIRED) +find_package(Boost REQUIRED COMPONENTS system) +find_package(mimalloc) + +# Only need microhttpd for the executable +if(BUILD_PASTEC_EXE) + find_package(microhttpd REQUIRED) +endif() + +# Build the library +if(BUILD_PASTEC_LIB) + add_library(pastec ${PASTEC_LIB_SOURCES} ${PASTEC_HEADERS}) + target_include_directories(pastec PUBLIC + $ + $) + target_link_libraries(pastec ${CMAKE_THREAD_LIBS_INIT} ${OpenCV_LIBS} ${CURL_LIBRARIES} ${Boost_LIBRARIES} simsimd) + + # Link mimalloc if found + if(MIMALLOC_FOUND) + target_link_libraries(pastec ${MIMALLOC_LIBRARIES}) + target_compile_definitions(pastec PRIVATE -DUSE_MIMALLOC) + message(STATUS "Building library with mimalloc allocator") + endif() + + # Install targets if enabled + if(INSTALL_TARGETS) + install(TARGETS pastec simsimd + EXPORT PastecTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + + # Install headers + install(DIRECTORY include/pastec + DESTINATION include + FILES_MATCHING PATTERN "*.h") + endif() +endif() + +# Build the executable +if(BUILD_PASTEC_EXE) + add_executable(pastec_server 
${PASTEC_EXE_SOURCES}) + + if(BUILD_PASTEC_LIB) + # Link against the library if we're building it + target_link_libraries(pastec_server pastec ${LIBMICROHTTPD_LIBRARY} ${Boost_LIBRARIES}) + else() + # Otherwise, include all sources directly + target_sources(pastec_server PRIVATE ${PASTEC_LIB_SOURCES}) + target_link_libraries(pastec_server ${CMAKE_THREAD_LIBS_INIT} ${OpenCV_LIBS} ${CURL_LIBRARIES} ${LIBMICROHTTPD_LIBRARY} ${Boost_LIBRARIES} simsimd) + endif() + + # Link mimalloc if found + if(MIMALLOC_FOUND) + target_link_libraries(pastec_server ${MIMALLOC_LIBRARIES}) + target_compile_definitions(pastec_server PRIVATE -DUSE_MIMALLOC) + message(STATUS "Building executable with mimalloc allocator") + else() + message(STATUS "mimalloc not found. For better performance, install libmimalloc-dev") + message(STATUS "On Ubuntu: sudo apt-get install libmimalloc-dev") + message(STATUS "Alternatively, you can use LD_PRELOAD to use mimalloc at runtime") + endif() + + # Install executable if enabled + if(INSTALL_TARGETS) + install(TARGETS pastec_server + RUNTIME DESTINATION bin) + endif() +endif() + +# Generate and install package configuration files if enabled +if(BUILD_PASTEC_LIB AND INSTALL_TARGETS) + include(CMakePackageConfigHelpers) + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/PastecConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion) -add_executable(pastec ${SOURCES} ${HEADERS}) + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/PastecConfig.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/PastecConfig.cmake" + @ONLY) -target_link_libraries(pastec ${CMAKE_THREAD_LIBS_INIT}) -target_link_libraries(pastec ${OpenCV_LIBS}) -target_link_libraries(pastec ${LIBMICROHTTPD_LIBRARY}) -target_link_libraries(pastec ${CURL_LIBRARIES}) + install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/PastecConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/PastecConfigVersion.cmake" + DESTINATION lib/cmake/Pastec) + install(EXPORT PastecTargets + FILE 
PastecTargets.cmake + NAMESPACE Pastec:: + DESTINATION lib/cmake/Pastec) + + # Build examples if library is being built + option(BUILD_EXAMPLES "Build examples" ON) + if(BUILD_EXAMPLES) + add_subdirectory(examples) + endif() +endif() diff --git a/Dockerfile b/Dockerfile index a3186a9..06a3bb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,61 @@ -FROM ubuntu:18.04 +FROM ubuntu:24.04 -MAINTAINER lklic +LABEL maintainer="lklic" ENV TZ=Europe/Rome +# Set environment variables for memory allocator optimizations +ENV MIMALLOC_LARGE_OS_PAGES=1 +ENV MALLOC_CONF="thp:always,metadata_thp:always" +ENV GLIBC_TUNABLES=glibc.malloc.hugetlb=1 +ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt +ENV CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt-get update \ - && apt-get install -y curl wget vim libcurl4-openssl-dev libopencv-dev libmicrohttpd-dev libjsoncpp-dev cmake git -COPY . /pastec -RUN mkdir -p /pastec/build && mkdir /pastec/data -WORKDIR /pastec/build + && apt-get install -y curl wget libcurl4-openssl-dev libopencv-dev libmicrohttpd-dev \ + libjsoncpp-dev cmake git build-essential clang lld libmimalloc-dev ca-certificates \ + libboost-all-dev libtbb-dev + +# Create app directory +RUN mkdir -p /app +WORKDIR /app + +# Selective copy of source files (matching the Nix Dockerfile approach) +COPY CMakeLists.txt ./ +COPY cmake ./cmake/ +COPY include ./include/ +COPY python ./python/ +COPY src ./src/ +COPY examples ./examples/ +COPY visualWordsORB.dat ./ + +# Create build and data directories +RUN mkdir -p build data +WORKDIR /app/build +# Use clang as the compiler with optimized flags matching Nix configuration +RUN cmake ../ \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_C_FLAGS="-mavx512f -mavx512dq -mavx512fp16 -mavx512bf16" \ + -DCMAKE_CXX_FLAGS="-std=c++17 -mavx512f -mavx512dq -mavx512fp16 -mavx512bf16" \ + -DSIMSIMD_TARGET_SAPPHIRE=1 \ + 
-DCMAKE_BUILD_TYPE=Release \ + -DBUILD_PASTEC_LIB=ON \ + -DBUILD_PASTEC_EXE=ON \ + -DBUILD_EXAMPLES=OFF \ + -DSIMSIMD_BUILD_SHARED=ON \ + -DINSTALL_TARGETS=OFF \ + -DUSE_MIMALLOC=ON \ + && make -j$(nproc) VERBOSE=1 -RUN cmake ../ && make +# Copy the visual words data file to the data directory +RUN cp /app/visualWordsORB.dat /app/data/ -RUN cp /pastec/visualWordsORB.dat /pastec/data +# Set up the final structure +WORKDIR /app EXPOSE 4212 -VOLUME /pastec/ +VOLUME /app/data -CMD ./pastec -p 4212 /pastec/data/visualWordsORB.dat +CMD ["/app/build/pastec_server", "--forward-index", "-p", "4212", "/app/data/visualWordsORB.dat"] diff --git a/Dockerfile.nix b/Dockerfile.nix new file mode 100644 index 0000000..ba6a632 --- /dev/null +++ b/Dockerfile.nix @@ -0,0 +1,56 @@ +# Nix builder stage +FROM nixos/nix:latest AS builder + +# Create working directory +WORKDIR /app + +# Find the path to the CA certificates +RUN echo "Locating CA certificates..." +RUN nix-build '' -A cacert --no-out-link > /tmp/cacert-path + +# Copy our source WITHOUT the .git directory +# We'll use a more selective copy approach +COPY CMakeLists.txt flake.nix shell.nix ./ +COPY cmake ./cmake/ +COPY include ./include/ +COPY python ./python/ +COPY src ./src/ +COPY examples ./examples/ +COPY visualWordsORB.dat ./ + +# Build our Nix environment +RUN nix \ + --extra-experimental-features "nix-command flakes" \ + --option filter-syscalls false \ + build . 
+ +# Copy the Nix store closure into a directory +RUN mkdir /tmp/nix-store-closure +RUN cp -R $(nix-store -qR result/) /tmp/nix-store-closure + +# Copy CA certificates to a known location +RUN mkdir -p /tmp/etc/ssl/certs +RUN cp -L $(cat /tmp/cacert-path)/etc/ssl/certs/ca-bundle.crt /tmp/etc/ssl/certs/ + +# Final minimal image +FROM scratch + +# Set environment variables for memory allocator optimizations +ENV MIMALLOC_LARGE_OS_PAGES=1 +ENV MALLOC_CONF="thp:always,metadata_thp:always" +ENV GLIBC_TUNABLES=glibc.malloc.hugetlb=1 +ENV SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.crt +ENV CURL_CA_BUNDLE=/etc/ssl/certs/ca-bundle.crt + +# Copy /nix/store and our built application +COPY --from=builder /tmp/nix-store-closure /nix/store +COPY --from=builder /app/result /app +COPY --from=builder /tmp/etc /etc + +# Create data directory and expose port +WORKDIR /app +VOLUME /app/data +EXPOSE 4212 + +# Set the command to run Pastec +CMD ["/app/bin/pastec", "-p", "4212", "/app/data/visualWordsORB.dat"] diff --git a/README.md b/README.md index 999beef..19771c8 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,94 @@ docker compose up -d This will start Pastec on port 4212. +#### Optimized Docker Build + +Pastec now uses an optimized Docker build process that leverages layer caching for faster rebuilds: + +1. Dependencies are built in a separate layer using `deps-flake.nix` +2. The Pastec application is built in another layer using the main `flake.nix` +3. When only Pastec code changes, Docker reuses the cached dependencies layer + +For more details on the build system, see [BUILD.md](BUILD.md). + +To build and run with the optimized setup: + +```bash +# Enable BuildKit for better caching +export DOCKER_BUILDKIT=1 + +# Build and run +docker-compose build +docker-compose up -d +``` + +Or use the provided script: + +```bash +./build-and-run.sh +``` + +### Using Nix + +Pastec now supports Nix for reproducible builds and development environments. 
This approach provides consistent environments across development, CI, and production. + +#### Prerequisites + +- Install Nix: `curl -L https://nixos.org/nix/install | sh` +- Enable flakes: Add `experimental-features = nix-command flakes` to your Nix configuration (in `~/.config/nix/nix.conf` or `/etc/nix/nix.conf`) +- This project uses the stable NixOS 24.11 channel + +#### Development Environment + +To enter a development shell with all dependencies: + +```bash +nix develop +``` + +#### Building with Nix + +To build Pastec using Nix: + +```bash +nix build +``` + +This will create a `result` symlink pointing to the built package. + +#### Running with Nix + +After building, you can run Pastec: + +```bash +./result/bin/pastec -p 4212 ./visualWordsORB.dat +``` + +#### Docker with Nix + +The default Dockerfile now uses Nix to build Pastec, resulting in a minimal and reproducible Docker image with optimized layer caching: + +```bash +# Enable BuildKit for better caching +export DOCKER_BUILDKIT=1 + +# Build the image +docker build -t pastec . + +# Run the container +docker run -p 4212:4212 pastec +``` + +For backward compatibility, the Ubuntu-based Dockerfile is still available as `Dockerfile.ubuntu`. + +To test the caching benefits, you can use the provided script: + +```bash +./test-caching.sh +``` + +This script demonstrates how changes to Pastec code only rebuild the necessary layers, significantly reducing build time. + ### Manual Compilation #### Dependencies @@ -65,6 +153,92 @@ cmake ../ make ``` +### Using Pastec as a Library + +Pastec can now be used as a library in your C++ projects. This allows you to integrate Pastec's image recognition capabilities directly into your applications without using the HTTP API. + +#### Building the Library + +To build Pastec as a library: + +```bash +git clone https://github.com/Visu4link/pastec.git +cd pastec +mkdir build +cd build +cmake ../ +make +sudo make install +``` + +The library is always built by default. 
You can control other build options with these CMake flags: +- `-DBUILD_PASTEC_EXE=ON|OFF`: Build Pastec executable with HTTP server (default: ON) +- `-DBUILD_SHARED_LIBS=ON|OFF`: Build shared libraries instead of static (default: OFF) +- `-DBUILD_EXAMPLES=ON|OFF`: Build example applications (default: ON) + +#### Using in CMake Projects + +After installing the library, you can use it in your CMake projects: + +```cmake +find_package(Pastec REQUIRED) +add_executable(myapp main.cpp) +target_link_libraries(myapp Pastec::pastec) +``` + +#### Basic Usage Example + +```cpp +#include +#include + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + // Initialize index and searcher + pastec::ORBIndex index; + pastec::ORBWordIndex wordIndex(argv[1]); + pastec::ORBSearcher searcher(&index, &wordIndex); + pastec::ImageLoader imageLoader; + + // Load image + unsigned char* imageData; + unsigned long imageLength; + + if (!imageLoader.loadFile(argv[2], &imageData, &imageLength)) { + std::cerr << "Failed to load image" << std::endl; + return 1; + } + + // Add image to index + unsigned imageId = 1; + unsigned nbFeatures = 0; + pastec::ORBFeatureExtractor featureExtractor(&index, &wordIndex); + featureExtractor.processNewImage(imageId, imageLength, (char*)imageData, nbFeatures); + + // Search for the same image + pastec::SearchRequest request; + request.imageId = 0; // 0 means don't store the image + request.imageData.assign(imageData, imageData + imageLength); + + searcher.searchImage(request); + + // Print results + for (size_t i = 0; i < request.results.size(); i++) { + std::cout << "Found image ID: " << request.results[i] + << " with score: " << request.scores[i] << std::endl; + } + + delete[] imageData; + return 0; +} +``` + +For more examples, see the `examples` directory. + ### Running To start Pastec, run the **pastec** executable. 
It takes as mandatory argument the path to a file containing a list of ORB visual words: @@ -111,6 +285,64 @@ Example using URL: curl -X POST -d '{"url":"http://example.com/image.jpg"}' http://localhost:4212/index/images/23 ``` +Example using local file URL: +```bash +curl -X POST -d '{"url":"file:///path/to/local/image.jpg"}' http://localhost:4212/index/images/23 +``` + +> **Note:** When using the `file://` URL scheme with Docker, ensure that the directory containing your images is mounted as a volume in the container. For example: `docker run -v /path/on/host:/path/in/container pastec ...` + +### Batch processing images + +Process multiple images in a single request for improved performance. Optionally add tags during indexing. + +* **Path:** /index/images/batch +* **HTTP method:** POST +* **Data:** JSON array of objects with image_id, url, and optional tag +* **Response:** +```json +{ + "type": "BATCH_PROCESSED", + "results": [ + { + "image_id": 23, + "url": "http://example.com/image1.jpg", + "type": "IMAGE_ADDED", + "nb_features_extracted": 542, + "tag": "example_tag", + "tag_status": "IMAGE_TAG_ADDED" + }, + { + "image_id": 24, + "url": "http://example.com/image2.jpg", + "type": "IMAGE_ADDED", + "nb_features_extracted": 328 + }, + { + "image_id": 25, + "url": "http://invalid-url.com/image.jpg", + "type": "IMAGE_DOWNLOADER_HTTP_ERROR", + "image_downloader_http_response_code": 404 + } + ] +} +``` + +Example: +```bash +curl -X POST -d '[ + {"image_id": 23, "url": "http://example.com/image1.jpg", "tag": "example_tag"}, + {"image_id": 24, "url": "http://example.com/image2.jpg"}, + {"image_id": 25, "url": "file:///path/to/local/image.jpg", "tag": "local_image"} +]' http://localhost:4212/index/images/batch +``` + +The batch processing endpoint: +- Processes images in parallel using multiple threads for better performance +- Allows adding tags during initial indexing +- Automatically writes indices to disk after batch operations +- Returns detailed status for each 
image in the batch + ### Removing an image from the index * **Path:** /index/images/ @@ -215,6 +447,13 @@ Example using URL: curl -X POST -d '{"url":"http://example.com/query.jpg"}' http://localhost:4212/index/searcher ``` +Example using local file URL: +```bash +curl -X POST -d '{"url":"file:///path/to/local/query.jpg"}' http://localhost:4212/index/searcher +``` + +> **Note:** When using the `file://` URL scheme with Docker, ensure that the directory containing your images is mounted as a volume in the container. + ### List all indexed image IDs @@ -327,4 +566,4 @@ pastec.loadIndexTags("tags.dat") # Clear index pastec.clearIndex() -``` \ No newline at end of file +``` diff --git a/README.md.performance b/README.md.performance new file mode 100644 index 0000000..78127db --- /dev/null +++ b/README.md.performance @@ -0,0 +1,110 @@ +# Pastec Performance Optimizations + +This document outlines the performance optimizations applied to Pastec based on the article about improving C++ build performance. + +## Optimizations Implemented + +1. **Enhanced Compiler Flags** + - Added `-O3` for maximum optimization (instead of default `-O2`) + - Added `-flto` for Link Time Optimization + - Added `-DNDEBUG` to disable assertions in Release mode + - Added `-march=native` to optimize for the specific CPU architecture + - Added `-ffunction-sections` and `-fdata-sections` with `-Wl,--gc-sections` to remove unused code + +2. **Switched to Clang Compiler** + - Updated Dockerfile to use clang instead of gcc + - Added lld as the linker + +3. **Added mimalloc Memory Allocator** + - Added support for mimalloc, which has been shown to be significantly faster than the default glibc allocator + - Created a Findmimalloc.cmake module to detect and use mimalloc + - Updated CMakeLists.txt to link against mimalloc when available + +4. 
**Enabled Transparent Huge Pages** + - Added environment variables in Dockerfile and docker-compose.yml: + - `MIMALLOC_LARGE_OS_PAGES=1` + - `MALLOC_CONF="thp:always,metadata_thp:always"` + - `GLIBC_TUNABLES=glibc.malloc.hugetlb=1` + +## How to Use These Optimizations + +### Building with Optimizations + +Simply build the project as usual: + +```bash +mkdir -p build && cd build +cmake ../ +make -j$(nproc) +``` + +If you have clang installed, you can explicitly use it: + +```bash +mkdir -p build && cd build +CC=clang CXX=clang++ cmake ../ +make -j$(nproc) +``` + +### Using Docker + +The Dockerfile has been updated to include all optimizations. Build and run with: + +```bash +docker build -t pastec . +docker run -p 4212:4212 pastec +``` + +Or use docker-compose: + +```bash +docker-compose up -d +``` + +### Using mimalloc with Existing Binaries + +If you don't want to rebuild Pastec, you can use the provided script to run an existing binary with mimalloc: + +```bash +# First, install mimalloc +sudo apt-get install libmimalloc-dev + +# Then run Pastec with mimalloc preloaded +./run-with-mimalloc.sh -p 4212 /path/to/visualWordsORB.dat +``` + +Alternatively, you can manually preload mimalloc: + +```bash +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libmimalloc.so ./pastec -p 4212 /path/to/visualWordsORB.dat +``` + +## Expected Performance Improvements + +Based on the article, you can expect: + +1. **Compiler Optimizations**: 20-25% improvement +2. **mimalloc Memory Allocator**: 30-40% improvement +3. **Combined Optimizations**: Potentially up to 90% improvement in some workloads + +The actual performance improvement will depend on your specific workload and hardware. The most significant gains will likely be seen in memory-intensive operations like batch processing of images. 
+ +## Benchmarking + +To measure the performance improvement, you can use tools like `hyperfine` or `time`: + +```bash +# Install hyperfine +sudo apt-get install hyperfine + +# Benchmark the original binary +hyperfine --warmup 1 --runs 3 "./original-pastec -p 4212 /path/to/visualWordsORB.dat" + +# Benchmark the optimized binary +hyperfine --warmup 1 --runs 3 "./optimized-pastec -p 4212 /path/to/visualWordsORB.dat" + +# Or benchmark with mimalloc preloaded +hyperfine --warmup 1 --runs 3 "LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libmimalloc.so ./pastec -p 4212 /path/to/visualWordsORB.dat" +``` + +For real-world performance testing, you might want to measure the time it takes to process a batch of images or perform searches on a large dataset. diff --git a/build-and-run.sh b/build-and-run.sh new file mode 100755 index 0000000..f7137d2 --- /dev/null +++ b/build-and-run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Script to build and run Pastec with the optimized Docker setup + +# Enable BuildKit for better caching +export DOCKER_BUILDKIT=1 + +# Build the Docker image with progress output +echo "Building Pastec Docker image..." +DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose build --progress=plain + +# Show the layers that were created +echo "Docker image layers:" +docker history pastec_pastec + +# Run the container +echo "Starting Pastec container..." 
+docker-compose up -d + +# Show logs +echo "Container logs:" +docker-compose logs -f diff --git a/cmake/Modules/Findmimalloc.cmake b/cmake/Modules/Findmimalloc.cmake new file mode 100644 index 0000000..1afafcb --- /dev/null +++ b/cmake/Modules/Findmimalloc.cmake @@ -0,0 +1,21 @@ +# - Find mimalloc library +# Once done this will define +# MIMALLOC_FOUND - System has mimalloc +# MIMALLOC_INCLUDE_DIRS - The mimalloc include directories +# MIMALLOC_LIBRARIES - The libraries needed to use mimalloc + +find_path(MIMALLOC_INCLUDE_DIR mimalloc.h + PATH_SUFFIXES mimalloc) + +find_library(MIMALLOC_LIBRARY NAMES mimalloc libmimalloc) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set MIMALLOC_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(mimalloc DEFAULT_MSG + MIMALLOC_LIBRARY MIMALLOC_INCLUDE_DIR) + +mark_as_advanced(MIMALLOC_INCLUDE_DIR MIMALLOC_LIBRARY) + +set(MIMALLOC_LIBRARIES ${MIMALLOC_LIBRARY}) +set(MIMALLOC_INCLUDE_DIRS ${MIMALLOC_INCLUDE_DIR}) diff --git a/cmake/PastecConfig.cmake.in b/cmake/PastecConfig.cmake.in new file mode 100644 index 0000000..702019c --- /dev/null +++ b/cmake/PastecConfig.cmake.in @@ -0,0 +1,8 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(Threads) +find_dependency(OpenCV) +find_dependency(CURL) + +include("${CMAKE_CURRENT_LIST_DIR}/PastecTargets.cmake") \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 4cca1c4..eccd03d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,13 +5,26 @@ services: build: context: . 
dockerfile: Dockerfile + args: + BUILDKIT_INLINE_CACHE: 1 # Enable BuildKit inline caching ports: - '4212:4212' restart: always volumes: - - ./../pastec-index:/pastec/build/pastec-index:rw + - ./../pastec-index:/app/data/pastec-index:rw + - /opt/data/images:/opt/data/images:ro + environment: + - MIMALLOC_LARGE_OS_PAGES=1 + - MALLOC_CONF=thp:always,metadata_thp:always + - GLIBC_TUNABLES=glibc.malloc.hugetlb=1 + - SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.crt + - CURL_CA_BUNDLE=/etc/ssl/certs/ca-bundle.crt command: - - /bin/sh - - -c - - | - ./pastec -p 4212 -i /pastec/build/pastec-index/pastec-index.dat /pastec/data/visualWordsORB.dat + - /app/bin/pastec + - -p + - "4212" + - -i + - /app/data/pastec-index/pastec_index.dat + - -t + - /app/data/pastec-index/pastec_tags_index.dat + - /app/data/visualWordsORB.dat diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..2e864d7 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.5) +project(PastecExamples) + +# Find Pastec package +find_package(Pastec REQUIRED) + +# Find required packages +find_package(OpenCV REQUIRED) + +# Add the example executable +add_executable(simple_index simple_index.cpp) + +# Link against Pastec library +target_link_libraries(simple_index Pastec::pastec ${OpenCV_LIBS}) \ No newline at end of file diff --git a/examples/simple_index.cpp b/examples/simple_index.cpp new file mode 100644 index 0000000..438e418 --- /dev/null +++ b/examples/simple_index.cpp @@ -0,0 +1,64 @@ +/** + * Simple example of using Pastec as a library + * + * This example shows how to: + * 1. Initialize the index and searcher + * 2. Add an image to the index + * 3. 
Search for an image + */ + +#include +#include +#include +#include + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + // Initialize index and searcher + pastec::ORBIndex index; + pastec::ORBWordIndex wordIndex(argv[1]); + pastec::ORBSearcher searcher(&index, &wordIndex); + pastec::ImageLoader imageLoader; + + // Load image + unsigned char* imageData; + unsigned long imageLength; + + if (!imageLoader.loadFile(argv[2], &imageData, &imageLength)) { + std::cerr << "Failed to load image" << std::endl; + return 1; + } + + std::cout << "Adding image to index..." << std::endl; + + // Add image to index + unsigned imageId = 1; + unsigned nbFeatures = 0; + pastec::ORBFeatureExtractor featureExtractor(&index, &wordIndex); + featureExtractor.processNewImage(imageId, imageLength, (char*)imageData, nbFeatures); + + std::cout << "Extracted " << nbFeatures << " features" << std::endl; + + // Search for the same image + std::cout << "Searching for the same image..." 
<< std::endl; + + pastec::SearchRequest request; + request.imageId = 0; // 0 means don't store the image + request.imageData.assign(imageData, imageData + imageLength); + + searcher.searchImage(request); + + // Print results + std::cout << "Search results:" << std::endl; + for (size_t i = 0; i < request.results.size(); i++) { + std::cout << "Found image ID: " << request.results[i] + << " with score: " << request.scores[i] << std::endl; + } + + delete[] imageData; + return 0; +} \ No newline at end of file diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..a1e02be --- /dev/null +++ b/flake.nix @@ -0,0 +1,83 @@ +{ + description = "Pastec - Image Recognition Engine"; + + inputs = { + nixpkgs.url = "github:nixos/nixpkgs/nixos-24.11"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + in { + # Development environment + devShell = pkgs.mkShell { + buildInputs = with pkgs; [ + # Build tools + cmake + clang + lld + + # Libraries + opencv + libmicrohttpd + curl + jsoncpp + mimalloc + ]; + + # Environment variables for memory optimization + shellHook = '' + export MIMALLOC_LARGE_OS_PAGES=1 + export MALLOC_CONF="thp:always,metadata_thp:always" + export GLIBC_TUNABLES=glibc.malloc.hugetlb=1 + ''; + }; + + # Package definition + packages.pastec = pkgs.stdenv.mkDerivation { + pname = "pastec"; + version = "1.0.0"; + + src = ./.; + nativeBuildInputs = with pkgs; [ + cmake + clang + lld + ]; + + buildInputs = with pkgs; [ + opencv + libmicrohttpd + curl + jsoncpp + mimalloc + ]; + + cmakeFlags = [ + "-DCMAKE_C_COMPILER=clang" + "-DCMAKE_CXX_COMPILER=clang++" + "-DCMAKE_BUILD_TYPE=Release" + "-DBUILD_PASTEC_LIB=ON" + "-DBUILD_PASTEC_EXE=ON" + "-DBUILD_EXAMPLES=OFF" + ]; + + installPhase = '' + mkdir -p $out/bin + cp pastec_server $out/bin/pastec + mkdir -p $out/lib + cp lib/libpastec.a $out/lib/ + mkdir -p 
$out/include + cp -r ../include/pastec $out/include/ + mkdir -p $out/data + cp ../visualWordsORB.dat $out/data/ + ''; + }; + + # Default package + defaultPackage = self.packages.${system}.pastec; + } + ); +} diff --git a/include/backwardindexreaderaccess.h b/include/backwardindexreaderaccess.h index 3366988..d247029 100644 --- a/include/backwardindexreaderaccess.h +++ b/include/backwardindexreaderaccess.h @@ -23,6 +23,10 @@ #define PASTEC_BACKWARDINDEXREADERACCESS_H #include +#include +#include +#include +#include #include @@ -161,4 +165,119 @@ class BackwardIndexReaderMemAccess : public BackwardIndexReaderAccess u_int64_t i_curPos; }; +/** + * Memory-mapped file access for the backward index. + * This provides zero-copy access to the file data through the OS's virtual memory system. + */ +class BackwardIndexReaderMMapAccess : public BackwardIndexReaderAccess +{ +public: + BackwardIndexReaderMMapAccess() : fd(-1), mappedData(nullptr), i_fileSize(0), i_curPos(0) {} + + virtual bool open(string indexPath) + { + // Open the file + fd = ::open(indexPath.c_str(), O_RDONLY); + if (fd == -1) + { + cout << "Could not open the backward index file." << endl; + return false; + } + + // Get file size + struct stat sb; + if (fstat(fd, &sb) == -1) + { + cout << "Could not get file size." << endl; + ::close(fd); + fd = -1; + return false; + } + i_fileSize = sb.st_size; + + // Map the file into memory + mappedData = mmap(NULL, i_fileSize, PROT_READ, MAP_PRIVATE, fd, 0); + if (mappedData == MAP_FAILED) + { + cout << "Could not memory map the index file." 
<< endl; + ::close(fd); + fd = -1; + mappedData = nullptr; + return false; + } + + // Advise the kernel that we'll access the data sequentially + madvise(mappedData, i_fileSize, MADV_SEQUENTIAL); + + i_curPos = 0; + return true; + } + + virtual void moveAt(u_int64_t pos) + { + i_curPos = pos; + } + + virtual void read(char *p_data, unsigned i_nbBytes) + { + if (i_curPos + i_nbBytes <= i_fileSize) + { + memcpy(p_data, static_cast(mappedData) + i_curPos, i_nbBytes); + i_curPos += i_nbBytes; + } + } + + virtual bool endOfIndex() + { + return i_curPos >= i_fileSize; + } + + virtual void reset() + { + i_curPos = 0; + } + + virtual void close() + { + if (mappedData != nullptr && mappedData != MAP_FAILED) + { + munmap(mappedData, i_fileSize); + mappedData = nullptr; + } + + if (fd != -1) + { + ::close(fd); + fd = -1; + } + + i_fileSize = 0; + i_curPos = 0; + } + + // Get direct pointer to the mapped data at current position + char* getCurrentDataPtr() const + { + return static_cast(mappedData) + i_curPos; + } + + // Get direct pointer to the mapped data at specified offset + char* getDataPtr(u_int64_t offset) const + { + return static_cast(mappedData) + offset; + } + + // Get file size + u_int64_t getFileSize() const + { + return i_fileSize; + } + +private: + int fd; // File descriptor + void* mappedData; // Pointer to memory-mapped data + u_int64_t i_fileSize; // Size of the file + u_int64_t i_curPos; // Current position in the file +}; + #endif // PASTEC_BACKWARDINDEXREADERACCESS_H diff --git a/include/batchprocessor.h b/include/batchprocessor.h new file mode 100644 index 0000000..5448a2f --- /dev/null +++ b/include/batchprocessor.h @@ -0,0 +1,93 @@ +/***************************************************************************** + * Copyright (C) 2025 Pharos + * + * This file is part of Pastec. 
+ * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . + *****************************************************************************/ + +#ifndef PASTEC_BATCHPROCESSOR_H +#define PASTEC_BATCHPROCESSOR_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +// Structure to hold the result of processing a single image in a batch +struct BatchImageResult { + u_int32_t imageId; + string url; + string tag; // Added tag field + u_int32_t status; // Using the same status codes as single image processing + unsigned nbFeaturesExtracted; + long httpResponseCode; // For URL-based images +}; + +// Structure to hold the task information for batch processing +struct BatchProcessingTask { + u_int32_t imageId; + string url; + string tag; // Added tag field +}; + +// Worker thread for batch processing +class BatchWorkerThread : public Thread { +public: + BatchWorkerThread(ImageDownloader* imgDownloader, + FeatureExtractor* featureExtractor, + const vector& tasks, + unordered_map>& imageHits, + vector& results, + pthread_mutex_t& resultsMutex); + +private: + virtual void* run(); + + ImageDownloader* imgDownloader; + FeatureExtractor* featureExtractor; + const vector tasks; // This thread's dedicated tasks + unordered_map>& imageHits; // Shared hits collection + vector& results; // Shared results collection + pthread_mutex_t& 
resultsMutex; // Mutex for thread-safe updates to shared collections +}; + +// Main batch processor class +class BatchProcessor { +public: + BatchProcessor(ImageDownloader* imgDownloader, + FeatureExtractor* featureExtractor, + ORBIndex* index); + + vector processBatch(const vector& batchData); + +private: + static const int NUM_THREADS = 20; // Hardcoded number of threads + + ImageDownloader* imgDownloader; + FeatureExtractor* featureExtractor; + ORBIndex* index; +}; + +#endif // PASTEC_BATCHPROCESSOR_H diff --git a/include/httpserver.h b/include/httpserver.h index 0b95348..a0b36f6 100644 --- a/include/httpserver.h +++ b/include/httpserver.h @@ -47,14 +47,14 @@ class HTTPServer private: char *loadFile(const char *filename); - static int answerToConnection(void *cls, MHD_Connection *connection, + static MHD_Result answerToConnection(void *cls, MHD_Connection *connection, const char *url, const char *method, const char *version, const char *upload_data, size_t *upload_data_size, void **con_cls); static void requestCompleted(void *cls, MHD_Connection *connection, void **con_cls, MHD_RequestTerminationCode toe); - static int sendAnswer(struct MHD_Connection *connection, ConnectionInfo &conInfo); - static int readAuthHeader(void *cls, enum MHD_ValueKind kind, + static MHD_Result sendAnswer(struct MHD_Connection *connection, ConnectionInfo &conInfo); + static MHD_Result readAuthHeader(void *cls, enum MHD_ValueKind kind, const char *key, const char *value); MHD_Daemon *daemon; @@ -77,6 +77,7 @@ struct ConnectionInfo string answerString; int answerCode; string authKey; + string contentType; // Added to store Content-Type header vector uploadedData; }; diff --git a/include/imagereranker.h b/include/imagereranker.h index 8f2317a..b4d0a42 100644 --- a/include/imagereranker.h +++ b/include/imagereranker.h @@ -23,6 +23,7 @@ #define PASTEC_IMAGERERANKER_H #include +#include #include #include @@ -31,7 +32,6 @@ #include -#include #include #include @@ -43,31 +43,40 @@ class 
ImageReranker { public: ImageReranker() {} - void rerank(std::unordered_map > &imagesReqHits, - std::unordered_map > &indexHits, - priority_queue &rankedResultsIn, - priority_queue &rankedResultsOut, - unsigned i_nbResults); + + // Main reranking method that works with vectors + vector rerank(std::unordered_map > &imagesReqHits, + std::unordered_map* > &indexHits, + const vector> &sortedResults, + unsigned i_nbResults); + + // Reranking method that uses the forward index for better performance + vector rerankUsingForwardIndex(std::unordered_map > &imagesReqHits, + class ORBIndex* index, + std::unordered_set &firstImageIds); private: float angleDiff(unsigned i_angle1, unsigned i_angle2); - void getFirstImageIds(priority_queue &rankedResultsIn, - unsigned i_nbResults, unordered_set &firstImageIds); + void getFirstImageIds(const vector> &sortedResults, + unsigned i_nbResults, unordered_set &firstImageIds); + + // Common reranking implementation + vector rerankCommon(std::unordered_map > &imagesReqHits, + std::unordered_map* > &indexHits, + unordered_set &firstImageIds); }; -// A task that must be performed when the rerankRANSAC function is called. 
-struct RANSACTask +// Point pairs for RANSAC +struct PointPairs { vector points1; vector points2; }; - #define HISTOGRAM_NB_BINS 32 #define DIFF_MIN -360.0f / (2.0f * HISTOGRAM_NB_BINS) - struct Histogram { Histogram() : i_total(0) @@ -79,33 +88,23 @@ struct Histogram unsigned i_total; }; - #define RANSAC_MIN_INLINERS 12 - -class RANSACThread : public Thread +// Helper functions for RANSAC +class RANSACHelper { public: - RANSACThread(pthread_mutex_t &mutex, - std::unordered_map &imgTasks, - priority_queue &rankedResultsOut) - : mutex(mutex), imgTasks(imgTasks), rankedResultsOut(rankedResultsOut) - { } - -public: - void *run(); - - pthread_mutex_t &mutex; - std::unordered_map &imgTasks; - priority_queue &rankedResultsOut; - deque imageIds; - deque histograms; - -private: - void getRTMatrix(const Point2f* a, const Point2f* b, + static void getRTMatrix(const Point2f* a, const Point2f* b, int count, Mat& M, bool fullAffine); - cv::Mat pastecEstimateRigidTransform(InputArray src1, InputArray src2, + static cv::Mat pastecEstimateRigidTransform(InputArray src1, InputArray src2, bool fullAffine); + + // Helper method to calculate time difference in milliseconds + static unsigned long getTimeDiff(const timeval t1, const timeval t2) + { + return ((t2.tv_sec - t1.tv_sec) * 1000000 + + (t2.tv_usec - t1.tv_usec)) / 1000; + } }; diff --git a/include/messages.h b/include/messages.h index 39db3a5..b46dc52 100644 --- a/include/messages.h +++ b/include/messages.h @@ -62,6 +62,7 @@ enum MessagesOut SEARCH_RESULTS = 0x10070100, IMAGE_DOWNLOADER_HTTP_ERROR = 0x10080100, + BATCH_PROCESSED = 0x10080200, }; @@ -110,6 +111,7 @@ class Converter case SEARCH_RESULTS: s = "SEARCH_RESULTS"; break; case IMAGE_DOWNLOADER_HTTP_ERROR: s = "IMAGE_DOWNLOADER_HTTP_ERROR"; break; + case BATCH_PROCESSED: s = "BATCH_PROCESSED"; break; default: break; } diff --git a/include/orb/orbfeatureextractor.h b/include/orb/orbfeatureextractor.h index 457556d..b8d332f 100644 --- a/include/orb/orbfeatureextractor.h 
+++ b/include/orb/orbfeatureextractor.h @@ -48,6 +48,11 @@ class ORBFeatureExtractor : public FeatureExtractor u_int32_t processNewImage(unsigned i_imageId, unsigned i_imgSize, char *p_imgData, unsigned &i_nbFeaturesExtracted); + + // Extract features without adding to index (for batch processing) + u_int32_t extractFeatures(unsigned i_imageId, unsigned i_imgSize, + char *p_imgData, list &hits, + unsigned &i_nbFeaturesExtracted); private: ORBIndex *index; diff --git a/include/orb/orbindex.h b/include/orb/orbindex.h index 691a832..b334364 100644 --- a/include/orb/orbindex.h +++ b/include/orb/orbindex.h @@ -45,14 +45,16 @@ using namespace std; class ORBIndex : public Index { public: - ORBIndex(string indexPath, bool buildForwardIndex); + ORBIndex(string indexPath, string tagsPath, bool buildForwardIndex); virtual ~ORBIndex(); void getImagesWithVisualWords(std::unordered_map > &imagesReqHits, - std::unordered_map > &indexHitsForReq); + std::unordered_map* > &indexHitsForReq); unsigned getWordNbOccurences(unsigned i_wordId); unsigned countTotalNbWord(unsigned i_imageId); unsigned getTotalNbIndexedImages(); u_int32_t addImage(unsigned i_imageId, list hitList); + u_int32_t addBatchImages(const unordered_map>& batchHits); + u_int32_t addBatchTags(const unordered_map& batchTags); u_int32_t addTag(const unsigned i_imageId, const string tag); u_int32_t removeImage(const unsigned i_imageId); u_int32_t getImageWords(const unsigned i_imageId, unordered_map > &hitList); @@ -68,15 +70,42 @@ class ORBIndex : public Index void readLock(); void unlock(); + + // Get direct access to the word count vector + const vector& getWordCountVector() const; + + // Check if forward index is available + bool hasForwardIndex() const; + + // Get all words for an image from the forward index + const vector& getForwardIndexWords(u_int32_t i_imageId) const; + + // Get a hit for a specific word and image + const Hit* getHitForWordAndImage(u_int32_t i_wordId, u_int32_t i_imageId) const; private: + // 
Recalculate the total number of indexed images + void recalculateTotalIndexedImages(); + + // Sort all word vectors by image ID to enable binary search in getHitForWordAndImage + // This improves lookup performance from O(n) to O(log n) + void sortAllWordVectors(); + + // Update the index state by recalculating total indexed images and sorting word vectors + void updateIndexState(); + u_int64_t nbOccurences[NB_VISUAL_WORDS]; u_int64_t totalNbRecords; bool buildForwardIndex; + unsigned m_totalIndexedImages; // Cached count of indexed images + + // Store the paths provided in the constructor + string storedIndexPath; + string storedTagsPath; - unordered_map nbWords; - unordered_map > forwardIndex; - unordered_map tags; + vector nbWords; + vector > forwardIndex; + vector tags; vector indexHits[NB_VISUAL_WORDS]; pthread_rwlock_t rwLock; diff --git a/include/orb/orbsearcher.h b/include/orb/orbsearcher.h index f388326..9bcc45d 100644 --- a/include/orb/orbsearcher.h +++ b/include/orb/orbsearcher.h @@ -23,6 +23,12 @@ #define PASTEC_IMAGESEARCHER_H #include +#include +#include +#include + +#include +#include #include #include @@ -38,6 +44,12 @@ using namespace std; class ClientConnection; +// Structure to hold a hit with its distance during search +struct SearchHit { + Hit hit; // The original hit data + float distance; // KNN distance (smaller is better) +}; + class ORBSearcher : public Searcher { @@ -48,16 +60,41 @@ class ORBSearcher : public Searcher u_int32_t searchSimilar(SearchRequest &request); private: - void returnResults(priority_queue &rankedResults, + // Process a batch of keypoints + std::vector> processKeyPointBatch( + const Mat& descriptors, + const vector& keypoints, + size_t startIdx, + size_t endIdx, + ORBWordIndex* localWordIndex); + + // Helper method to process a batch of words for TF-IDF computation + vector processTFIDFBatch( + const vector*>>& batch, + const vector& wordCounts, + unsigned i_nbTotalIndexedImages, + unsigned maxImageId); + + void 
returnResults(vector &rankedResults, SearchRequest &req, unsigned i_maxNbResults); unsigned long getTimeDiff(const timeval t1, const timeval t2) const; u_int32_t processSimilar(SearchRequest &request, std::unordered_map > imageReqHits); + // Constants + static const int NUM_THREADS = 20; // Increased from 3 + static const int FEATURE_BATCH_COUNT = 8; // Number of batches for feature extraction + static const int WEIGHT_BATCH_COUNT = 4; // Number of batches for TF-IDF computation + + // Original members ORBIndex *index; ORBWordIndex *wordIndex; ImageReranker reranker; Ptr orb; + + // Thread pool members + std::vector> threadWordIndices; + boost::asio::thread_pool threadPool; }; #endif // PASTEC_IMAGESEARCHER_H diff --git a/include/orb/orbwordindex.h b/include/orb/orbwordindex.h index 628d363..e3b1444 100644 --- a/include/orb/orbwordindex.h +++ b/include/orb/orbwordindex.h @@ -27,23 +27,53 @@ #include #include +// Include SimSIMD +#include + using namespace cv; using namespace std; +// Custom Hamming distance functor using SimSIMD +class SimSIMDHamming { +public: + typedef unsigned char ElementType; + typedef int ResultType; + + template + ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const { + simsimd_distance_t distance; + simsimd_hamming_b8((simsimd_b8_t*)a, (simsimd_b8_t*)b, size, &distance); + return (ResultType)distance; + } +}; + class ORBWordIndex { public: + // Original constructor ORBWordIndex(string visualWordsPath); + + // Constructor that accepts an existing words matrix (shares the matrix) + ORBWordIndex(const Mat* sharedWords); + + // Constructor that creates a deep copy of an existing words matrix + ORBWordIndex(const Mat& wordsToCopy); + ~ORBWordIndex(); + void knnSearch(const Mat &query, vector& indices, vector &dists, int knn); + + // Getter for the words matrix + Mat* getWords() const { return words; } private: bool readVisualWords(string fileName); Mat *words; // The matrix that stores the visual words. 
- cvflann::HierarchicalClusteringIndex > *kdIndex; // The kd-tree index. + bool ownsWords; // Flag to indicate if this instance owns the words matrix + cvflann::HierarchicalClusteringIndex *kdIndex; // The kd-tree index with SimSIMD Hamming. }; #endif // PASTEC_ORBWORDINDEX_H diff --git a/include/pastec/core/featureextractor.h b/include/pastec/core/featureextractor.h new file mode 100644 index 0000000..6064294 --- /dev/null +++ b/include/pastec/core/featureextractor.h @@ -0,0 +1,39 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_FEATUREEXTRACTOR_H +#define PASTEC_FEATUREEXTRACTOR_H + +#include + +namespace pastec { + +class FeatureExtractor +{ +public: + virtual ~FeatureExtractor() {} + virtual u_int32_t processNewImage(unsigned i_imageId, unsigned i_imgSize, + char *p_imgData, unsigned &i_nbFeaturesExtracted) = 0; +}; + +} // namespace pastec + +#endif // PASTEC_FEATUREEXTRACTOR_H \ No newline at end of file diff --git a/include/pastec/core/hit.h b/include/pastec/core/hit.h new file mode 100644 index 0000000..9ae120f --- /dev/null +++ b/include/pastec/core/hit.h @@ -0,0 +1,49 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_HIT_H +#define PASTEC_HIT_H + +#include + +namespace pastec { + +struct Hit +{ + u_int32_t i_imageId; + u_int16_t i_angle; + u_int16_t i_x; + u_int16_t i_y; + u_int16_t i_scale; +}; + +struct HitForward +{ + u_int32_t i_wordId; + u_int16_t i_angle; + u_int16_t i_x; + u_int16_t i_y; + u_int16_t i_scale; +}; + +} // namespace pastec + +#endif // PASTEC_HIT_H \ No newline at end of file diff --git a/include/pastec/core/imagedownloader.h b/include/pastec/core/imagedownloader.h new file mode 100644 index 0000000..cfdb91c --- /dev/null +++ b/include/pastec/core/imagedownloader.h @@ -0,0 +1,43 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_IMAGEDOWNLOADER_H +#define PASTEC_IMAGEDOWNLOADER_H + +#include +#include + +namespace pastec { + +class ImageDownloader +{ +public: + ImageDownloader() {} + ~ImageDownloader() {} + bool downloadImage(const std::string &url, std::vector &data); + +private: + static size_t write_data(void *ptr, size_t size, size_t nmemb, void *stream); +}; + +} // namespace pastec + +#endif // PASTEC_IMAGEDOWNLOADER_H \ No newline at end of file diff --git a/include/pastec/core/imageloader.h b/include/pastec/core/imageloader.h new file mode 100644 index 0000000..206a41f --- /dev/null +++ b/include/pastec/core/imageloader.h @@ -0,0 +1,39 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_IMAGELOADER_H +#define PASTEC_IMAGELOADER_H + +#include + +namespace pastec { + +class ImageLoader +{ +public: + ImageLoader() {} + ~ImageLoader() {} + bool loadFile(const std::string &filePath, unsigned char **data, unsigned long *dataLen); +}; + +} // namespace pastec + +#endif // PASTEC_IMAGELOADER_H \ No newline at end of file diff --git a/include/pastec/core/index.h b/include/pastec/core/index.h new file mode 100644 index 0000000..7d97c85 --- /dev/null +++ b/include/pastec/core/index.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_INDEX_H +#define PASTEC_INDEX_H + +#include +#include +#include +#include + +namespace pastec { + +class HitForward; + +#define DEFAULT_INDEX_PATH "backwardIndex.dat" +#define DEFAULT_INDEX_TAGS_PATH "indexTags.dat" + +class Index +{ +public: + virtual ~Index() {} + virtual u_int32_t addTag(const unsigned i_imageId, const std::string tag) = 0; + virtual u_int32_t removeImage(const unsigned i_imageId) = 0; + virtual u_int32_t removeTag(const unsigned i_imageId) = 0; + virtual u_int32_t getTag(unsigned i_imageId, std::string &tag) = 0; + virtual u_int32_t write(std::string backwardIndexPath) = 0; + virtual u_int32_t clear() = 0; + virtual u_int32_t load(std::string backwardIndexPath) = 0; + virtual u_int32_t getImageIds(std::vector &imageIds) = 0; + + virtual u_int32_t loadTags(std::string indexTagsPath) = 0; + virtual u_int32_t writeTags(std::string indexTagsPath) = 0; +}; + +} // namespace pastec + +#endif // PASTEC_INDEX_H \ No newline at end of file diff --git a/include/pastec/core/orb/orbfeatureextractor.h b/include/pastec/core/orb/orbfeatureextractor.h new file mode 100644 index 0000000..8ff9c14 --- /dev/null +++ b/include/pastec/core/orb/orbfeatureextractor.h @@ -0,0 +1,60 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . + *****************************************************************************/ + +#ifndef PASTEC_ORBFEATUREEXTRACTOR_H +#define PASTEC_ORBFEATUREEXTRACTOR_H + +#include +#include +#include + +#include +#include + +#include "pastec/core/featureextractor.h" +#include "pastec/core/orb/orbindex.h" +#include "pastec/core/orb/orbwordindex.h" + +namespace pastec { + +class ORBFeatureExtractor : public FeatureExtractor +{ +public: + ORBFeatureExtractor(ORBIndex *index, ORBWordIndex *wordIndex); + virtual ~ORBFeatureExtractor() {} + + u_int32_t processNewImage(unsigned i_imageId, unsigned i_imgSize, + char *p_imgData, unsigned &i_nbFeaturesExtracted); + + // Extract features without adding to index (for batch processing) + u_int32_t extractFeatures(unsigned i_imageId, unsigned i_imgSize, + char *p_imgData, std::list &hits, + unsigned &i_nbFeaturesExtracted); + +private: + ORBIndex *index; + ORBWordIndex *wordIndex; + cv::Ptr orb; +}; + +} // namespace pastec + +#endif // PASTEC_ORBFEATUREEXTRACTOR_H \ No newline at end of file diff --git a/include/pastec/core/orb/orbindex.h b/include/pastec/core/orb/orbindex.h new file mode 100644 index 0000000..1874980 --- /dev/null +++ b/include/pastec/core/orb/orbindex.h @@ -0,0 +1,93 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . + *****************************************************************************/ + +#ifndef PASTEC_ORBINDEX_H +#define PASTEC_ORBINDEX_H + +#include +#include +#include +#include +#include +#include +#include + +#include "pastec/core/hit.h" +#include "pastec/core/index.h" + +namespace pastec { + +#define NB_VISUAL_WORDS 1000000 +#define BACKWARD_INDEX_ENTRY_SIZE 10 + +class BackwardIndexReaderAccess; + +class ORBIndex : public Index +{ +public: + ORBIndex(std::string indexPath = DEFAULT_INDEX_PATH, + std::string tagsPath = DEFAULT_INDEX_TAGS_PATH, + bool buildForwardIndex = false); + virtual ~ORBIndex(); + void getImagesWithVisualWords(std::unordered_map > &imagesReqHits, + std::unordered_map > &indexHitsForReq); + unsigned getWordNbOccurences(unsigned i_wordId); + unsigned countTotalNbWord(unsigned i_imageId); + unsigned getTotalNbIndexedImages(); + u_int32_t addImage(unsigned i_imageId, std::list hitList); + u_int32_t addBatchImages(const std::unordered_map>& batchHits); + u_int32_t addBatchTags(const std::unordered_map& batchTags); + u_int32_t addTag(const unsigned i_imageId, const std::string tag); + u_int32_t removeImage(const unsigned i_imageId); + u_int32_t getImageWords(const unsigned i_imageId, std::unordered_map > &hitList); + u_int32_t removeTag(const unsigned i_imageId); + u_int32_t getTag(unsigned i_imageId, std::string &tag); + u_int32_t write(std::string backwardIndexPath); + u_int32_t clear(); + u_int32_t load(std::string backwardIndexPath); + u_int32_t getImageIds(std::vector &imageIds); + + u_int32_t loadTags(std::string indexTagsPath); + u_int32_t writeTags(std::string indexTagsPath); + + void readLock(); + void unlock(); + +private: + u_int64_t nbOccurences[NB_VISUAL_WORDS]; + u_int64_t totalNbRecords; + bool buildForwardIndex; + + // Store the paths provided in the 
constructor + std::string storedIndexPath; + std::string storedTagsPath; + + std::unordered_map nbWords; + std::unordered_map > forwardIndex; + std::unordered_map tags; + std::vector indexHits[NB_VISUAL_WORDS]; + + pthread_rwlock_t rwLock; +}; + +} // namespace pastec + +#endif // PASTEC_ORBINDEX_H \ No newline at end of file diff --git a/include/pastec/core/orb/orbsearcher.h b/include/pastec/core/orb/orbsearcher.h new file mode 100644 index 0000000..746bda8 --- /dev/null +++ b/include/pastec/core/orb/orbsearcher.h @@ -0,0 +1,63 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_IMAGESEARCHER_H +#define PASTEC_IMAGESEARCHER_H + +#include +#include + +#include +#include + +#include "pastec/core/searcher.h" +#include "pastec/core/orb/orbindex.h" +#include "pastec/core/orb/orbwordindex.h" +#include "pastec/core/searchResult.h" + +namespace pastec { + +class ImageReranker; + +class ORBSearcher : public Searcher +{ +public: + ORBSearcher(ORBIndex *index, ORBWordIndex *wordIndex); + virtual ~ORBSearcher(); + u_int32_t searchImage(SearchRequest &request); + u_int32_t searchSimilar(SearchRequest &request); + +private: + void returnResults(std::priority_queue &rankedResults, + SearchRequest &req, unsigned i_maxNbResults); + unsigned long getTimeDiff(const timeval t1, const timeval t2) const; + u_int32_t processSimilar(SearchRequest &request, + std::unordered_map > imageReqHits); + + ORBIndex *index; + ORBWordIndex *wordIndex; + ImageReranker *reranker; + cv::Ptr orb; +}; + +} // namespace pastec + +#endif // PASTEC_IMAGESEARCHER_H \ No newline at end of file diff --git a/include/pastec/core/orb/orbwordindex.h b/include/pastec/core/orb/orbwordindex.h new file mode 100644 index 0000000..9dcff72 --- /dev/null +++ b/include/pastec/core/orb/orbwordindex.h @@ -0,0 +1,48 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . + *****************************************************************************/ + +#ifndef PASTEC_ORBWORDINDEX_H +#define PASTEC_ORBWORDINDEX_H + +#include +#include + +#include +#include + +namespace pastec { + +class ORBWordIndex +{ +public: + ORBWordIndex(std::string visualWordPath); + ~ORBWordIndex(); + u_int32_t getWordIndex(cv::Mat features, unsigned i_nbFeatures, + unsigned *indexes); + +private: + cv::flann::Index *index; + cv::Mat words; +}; + +} // namespace pastec + +#endif // PASTEC_ORBWORDINDEX_H \ No newline at end of file diff --git a/include/pastec/core/searchResult.h b/include/pastec/core/searchResult.h new file mode 100644 index 0000000..a417495 --- /dev/null +++ b/include/pastec/core/searchResult.h @@ -0,0 +1,47 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_SEARCHRESULT_H +#define PASTEC_SEARCHRESULT_H + +#include +#include + +#include + +namespace pastec { + +struct SearchResult +{ + u_int32_t imageId; + float score; + cv::Rect boundingRect; + std::string tag; + + bool operator<(const SearchResult &a) const + { + return score < a.score; + } +}; + +} // namespace pastec + +#endif // PASTEC_SEARCHRESULT_H \ No newline at end of file diff --git a/include/pastec/core/searcher.h b/include/pastec/core/searcher.h new file mode 100644 index 0000000..867a971 --- /dev/null +++ b/include/pastec/core/searcher.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_SEARCHER_H +#define PASTEC_SEARCHER_H + +#include +#include +#include + +#include + +namespace pastec { + +class ClientConnection; + +struct SearchRequest +{ + u_int32_t imageId; + std::vector imageData; + ClientConnection *client; + std::vector results; + std::vector boundingRects; + std::vector scores; + std::vector tags; +}; + +class Searcher +{ +public: + virtual ~Searcher() {} + virtual u_int32_t searchImage(SearchRequest &request) = 0; + virtual u_int32_t searchSimilar(SearchRequest &request) = 0; +}; + +} // namespace pastec + +#endif // PASTEC_SEARCHER_H \ No newline at end of file diff --git a/include/pastec/http/httpserver.h b/include/pastec/http/httpserver.h new file mode 100644 index 0000000..e6d9c14 --- /dev/null +++ b/include/pastec/http/httpserver.h @@ -0,0 +1,86 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. + * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . 
+ *****************************************************************************/ + +#ifndef PASTEC_HTTPSERVER_H +#define PASTEC_HTTPSERVER_H + +#include +#include +#include +#include + +namespace pastec { + +class RequestHandler; +struct ConnectionInfo; + +#define GET 0 +#define POST 1 +#define DELETE 2 +#define PUT 3 + +class HTTPServer +{ +public: + HTTPServer(RequestHandler *requestHandler, unsigned i_port, bool https); + ~HTTPServer(); + int run(); + int stop(); + +private: + char *loadFile(const char *filename); + static MHD_Result answerToConnection(void *cls, MHD_Connection *connection, + const char *url, const char *method, + const char *version, const char *upload_data, + size_t *upload_data_size, void **con_cls); + static void requestCompleted(void *cls, MHD_Connection *connection, + void **con_cls, MHD_RequestTerminationCode toe); + static MHD_Result sendAnswer(struct MHD_Connection *connection, ConnectionInfo &conInfo); + static MHD_Result readAuthHeader(void *cls, enum MHD_ValueKind kind, + const char *key, const char *value); + + MHD_Daemon *daemon; + RequestHandler *requestHandler; + + unsigned i_port; + bool https; + + pthread_cond_t stopCond; + pthread_mutex_t stopMutex; + bool b_stop; +}; + + +struct ConnectionInfo +{ + int connectionType; + std::string url; + struct MHD_PostProcessor *postprocessor; + std::string answerString; + int answerCode; + std::string authKey; + + std::vector uploadedData; +}; + +} // namespace pastec + +#endif // PASTEC_HTTPSERVER_H \ No newline at end of file diff --git a/include/pastec/pastec.h b/include/pastec/pastec.h new file mode 100644 index 0000000..0aa86ec --- /dev/null +++ b/include/pastec/pastec.h @@ -0,0 +1,40 @@ +/***************************************************************************** + * Copyright (C) 2014 Visualink + * + * Authors: Adrien Maglo + * + * This file is part of Pastec. 
+ * + * Pastec is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pastec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Pastec. If not, see . + *****************************************************************************/ + +#ifndef PASTEC_H +#define PASTEC_H + +// Core functionality +#include "pastec/core/index.h" +#include "pastec/core/searcher.h" +#include "pastec/core/featureextractor.h" +#include "pastec/core/imageloader.h" +#include "pastec/core/hit.h" +#include "pastec/core/searchResult.h" +#include "pastec/core/imagedownloader.h" + +// ORB-specific implementations +#include "pastec/core/orb/orbindex.h" +#include "pastec/core/orb/orbsearcher.h" +#include "pastec/core/orb/orbfeatureextractor.h" +#include "pastec/core/orb/orbwordindex.h" + +#endif // PASTEC_H \ No newline at end of file diff --git a/include/requesthandler.h b/include/requesthandler.h index 6c8a2ef..9ae27b6 100644 --- a/include/requesthandler.h +++ b/include/requesthandler.h @@ -24,6 +24,7 @@ #include #include +#include class FeatureExtractor; class Searcher; @@ -42,6 +43,7 @@ class RequestHandler RequestHandler(FeatureExtractor *featureExtractor, Searcher *imageSearcher, Index *index, ImageDownloader *imgDownloader, string authKey); + ~RequestHandler(); void handleRequest(ConnectionInfo &conInfo); private: @@ -49,11 +51,13 @@ class RequestHandler bool testURIWithPattern(vector parsedURI, string p_pattern[]); string JsonToString(Json::Value data); Json::Value StringToJson(string str); + unsigned long 
# This file provides backward compatibility for older Nix versions
# that don't support flakes. It imports the development shell from flake.nix.

(import (
  # NOTE(review): the URL follows the moving `master` branch of flake-compat
  # while the sha256 below is fixed. Once upstream pushes a new commit the
  # downloaded tarball will presumably no longer match the hash and the fetch
  # will fail on machines that do not already have it cached. Consider pinning
  # the URL to a specific commit archive - confirm the intended revision.
  fetchTarball {
    url = "https://github.com/edolstra/flake-compat/archive/master.tar.gz";
    sha256 = "0m6nmi4jb34rykzs3lg1ip7ar95zv9s2mr6sqs6k45idpwsmcbfy";
  }
) {
  # Evaluate the flake that lives next to this file.
  src = ./.;
}).shellNix
+ *****************************************************************************/ + +#include +#include + +#include +#include +#include + +BatchWorkerThread::BatchWorkerThread(ImageDownloader* imgDownloader, + FeatureExtractor* featureExtractor, + const vector& tasks, + unordered_map>& imageHits, + vector& results, + pthread_mutex_t& resultsMutex) + : imgDownloader(imgDownloader), featureExtractor(featureExtractor), + tasks(tasks), imageHits(imageHits), results(results), resultsMutex(resultsMutex) +{ } + +void* BatchWorkerThread::run() { + // Local collections to minimize synchronization + unordered_map> localHits; + vector localResults; + + // Process all assigned tasks without synchronization + for (const auto& task : tasks) { + BatchImageResult result; + result.imageId = task.imageId; + result.url = task.url; + result.tag = task.tag; // Store the tag in the result + + // Download image if URL is provided + vector imageData; + if (!task.url.empty()) { + if (imgDownloader->canDownloadImage(task.url)) { + long httpResponseCode; + u_int32_t downloadStatus = imgDownloader->getImageData( + task.url, imageData, httpResponseCode); + + result.httpResponseCode = httpResponseCode; + + if (downloadStatus != OK) { + result.status = downloadStatus; + localResults.push_back(result); + continue; // Skip to next image + } + } else { + result.status = IMAGE_NOT_DECODED; + localResults.push_back(result); + continue; // Skip to next image + } + } else { + result.status = IMAGE_NOT_DECODED; + localResults.push_back(result); + continue; // Skip to next image + } + + // Extract features without adding to index + list hits; + unsigned nbFeaturesExtracted = 0; + + // We need to cast the feature extractor to ORBFeatureExtractor to use extractFeatures + ORBFeatureExtractor* orbExtractor = dynamic_cast(featureExtractor); + if (!orbExtractor) { + result.status = ERROR_GENERIC; + localResults.push_back(result); + continue; + } + + u_int32_t status = orbExtractor->extractFeatures( + 
task.imageId, imageData.size(), imageData.data(), + hits, nbFeaturesExtracted); + + result.status = status; + result.nbFeaturesExtracted = nbFeaturesExtracted; + + // Store results locally + if (status == IMAGE_ADDED) { + localHits[task.imageId] = std::move(hits); + } + + localResults.push_back(result); + } + + // Now synchronize once to update shared collections + pthread_mutex_lock(&resultsMutex); + + // Add local hits to shared hits collection + for (auto& pair : localHits) { + imageHits[pair.first] = std::move(pair.second); + } + + // Add local results to shared results collection + results.insert(results.end(), localResults.begin(), localResults.end()); + + pthread_mutex_unlock(&resultsMutex); + + return NULL; +} + +BatchProcessor::BatchProcessor(ImageDownloader* imgDownloader, + FeatureExtractor* featureExtractor, + ORBIndex* index) + : imgDownloader(imgDownloader), featureExtractor(featureExtractor), index(index) +{ } + +vector BatchProcessor::processBatch(const vector& batchData) { + // Prepare tasks from batch data + vector allTasks; + allTasks.reserve(batchData.size()); + + for (const auto& item : batchData) { + BatchProcessingTask task; + task.imageId = item["image_id"].asUInt(); + task.url = item["url"].asString(); + // Extract tag if present + task.tag = item.isMember("tag") ? 
item["tag"].asString() : ""; + allTasks.push_back(task); + } + + // Determine number of threads to use (hardcoded to NUM_THREADS) + int actualThreads = std::min(NUM_THREADS, static_cast(allTasks.size())); + + // Split tasks among threads + vector> threadTasks(actualThreads); + + // Distribute tasks evenly + for (size_t i = 0; i < allTasks.size(); i++) { + threadTasks[i % actualThreads].push_back(allTasks[i]); + } + + // Shared collections for results and hits + vector results; + unordered_map> imageHits; + pthread_mutex_t mutex; + pthread_mutex_init(&mutex, NULL); + + // Create and start worker threads + vector threads; + for (int i = 0; i < actualThreads; i++) { + BatchWorkerThread* thread = new BatchWorkerThread( + imgDownloader, featureExtractor, threadTasks[i], + imageHits, results, mutex); + threads.push_back(thread); + thread->start(); + } + + // Wait for all threads to complete + for (auto thread : threads) { + thread->join(); + delete thread; + } + + pthread_mutex_destroy(&mutex); + + // Add all hits to the index in one transaction + if (!imageHits.empty()) { + index->addBatchImages(imageHits); + } + + // Collect tags for successfully processed images + unordered_map imageTags; + for (const auto& result : results) { + // Only add tags for successfully processed images + if (result.status == IMAGE_ADDED && !result.tag.empty()) { + imageTags[result.imageId] = result.tag; + } + } + + // Log how many tags were collected for addition + cout << "DEBUG: Collected " << imageTags.size() << " tags for batch addition" << endl; + + // Add all tags in one transaction + if (!imageTags.empty()) { + index->addBatchTags(imageTags); + } + + // Write both indices to disk after batch processing + cout << "DEBUG: Writing indices to disk after batch processing" << endl; + index->write(""); // Pass empty string to use stored paths + index->writeTags(""); // Pass empty string to use stored paths + cout << "DEBUG: Indices written successfully" << endl; + + return results; +} diff 
--git a/src/httpserver.cpp b/src/httpserver.cpp index ad5ab9f..8254c80 100644 --- a/src/httpserver.cpp +++ b/src/httpserver.cpp @@ -135,7 +135,7 @@ int HTTPServer::stop() } -int HTTPServer::sendAnswer(struct MHD_Connection *connection, ConnectionInfo &conInfo) +MHD_Result HTTPServer::sendAnswer(struct MHD_Connection *connection, ConnectionInfo &conInfo) { int ret; struct MHD_Response *response; @@ -151,7 +151,7 @@ int HTTPServer::sendAnswer(struct MHD_Connection *connection, ConnectionInfo &co ret = MHD_queue_response(connection, conInfo.answerCode, response); MHD_destroy_response(response); - return ret; + return (MHD_Result)ret; } @@ -173,7 +173,7 @@ void HTTPServer::requestCompleted(void *cls, MHD_Connection *connection, } -int HTTPServer::answerToConnection(void *cls, MHD_Connection *connection, +MHD_Result HTTPServer::answerToConnection(void *cls, MHD_Connection *connection, const char *url, const char *method, const char *version, const char *upload_data, size_t *upload_data_size, void **conCls) @@ -231,7 +231,7 @@ int HTTPServer::answerToConnection(void *cls, MHD_Connection *connection, } -int HTTPServer::readAuthHeader(void *cls, enum MHD_ValueKind kind, +MHD_Result HTTPServer::readAuthHeader(void *cls, enum MHD_ValueKind kind, const char *key, const char *value) { (void) kind; @@ -245,6 +245,10 @@ int HTTPServer::readAuthHeader(void *cls, enum MHD_ValueKind kind, conInfo->authKey = string(value); return MHD_NO; } + else if (keyString == "content-type") + { + conInfo->contentType = string(value); + } return MHD_YES; } diff --git a/src/imagedownloader.cpp b/src/imagedownloader.cpp index d0ab8fd..4cb0a77 100644 --- a/src/imagedownloader.cpp +++ b/src/imagedownloader.cpp @@ -22,6 +22,7 @@ #include #include +#include ImageDownloader::ImageDownloader() @@ -34,7 +35,8 @@ bool ImageDownloader::canDownloadImage(std::string imgURL) { bool ret = false; if (imgURL.substr(0, 7) == std::string("http://") - || imgURL.substr(0, 8) == std::string("https://")) + || 
imgURL.substr(0, 8) == std::string("https://") + || imgURL.substr(0, 7) == std::string("file://")) ret = true; return ret; } @@ -46,6 +48,31 @@ u_int32_t ImageDownloader::getImageData(std::string imgURL, std::vector &i if (!canDownloadImage(imgURL)) return ERROR_GENERIC; + // Handle file:// URLs + if (imgURL.substr(0, 7) == std::string("file://")) { + std::string filePath = imgURL.substr(7); // Remove "file://" + + // Open file + std::ifstream file(filePath, std::ios::binary | std::ios::ate); + if (!file.is_open()) { + responseCode = 404; // Not found + return IMAGE_DOWNLOADER_HTTP_ERROR; + } + + // Get file size + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + // Read file into imgData vector + imgData.resize(size); + if (!file.read((char*)imgData.data(), size)) { + responseCode = 500; // Error reading + return IMAGE_DOWNLOADER_HTTP_ERROR; + } + + responseCode = 200; // Success + return OK; + } u_int32_t i_ret = OK; @@ -89,5 +116,3 @@ size_t ImageDownloader::writeCallback(char *ptr, size_t size, size_t nmemb, void return sizeToWrite; } - - diff --git a/src/imageloader.cpp b/src/imageloader.cpp index 30bc442..a958e28 100644 --- a/src/imageloader.cpp +++ b/src/imageloader.cpp @@ -36,7 +36,7 @@ u_int32_t ImageLoader::loadImage(unsigned i_imgSize, char *p_imgData, Mat &img) try { - img = imdecode(imgData, CV_LOAD_IMAGE_GRAYSCALE); + img = imdecode(imgData, cv::IMREAD_GRAYSCALE); } catch (cv::Exception& e) // The decoding of an image can raise an exception. { diff --git a/src/imagereranker.cpp b/src/imagereranker.cpp index 6f33941..0019063 100644 --- a/src/imagereranker.cpp +++ b/src/imagereranker.cpp @@ -31,54 +31,69 @@ #include #include +#include -void *RANSACThread::run() +/** + * @brief Rerank images using a vector of sorted results. + * @param imagesReqHits the hits of the request image. + * @param indexHits the hits of the index. + * @param sortedResults the sorted vector of results (weight, imageId). 
+ * @param i_nbResults the number of results to rerank. + * @return A vector of reranked search results. + */ +vector ImageReranker::rerank(unordered_map > &imagesReqHits, + unordered_map* > &indexHits, + const vector> &sortedResults, + unsigned i_nbResults) { - for (unsigned i = 0; i < imageIds.size(); ++i) - { - const unsigned i_imageId = imageIds[i]; - const Histogram histogram = histograms[i]; - unsigned i_binMax = max_element(histogram.bins, histogram.bins + HISTOGRAM_NB_BINS) - histogram.bins; - float i_maxVal = histogram.bins[i_binMax]; - if (i_maxVal > 10) - { - RANSACTask &task = imgTasks[i_imageId]; - assert(task.points1.size() == task.points2.size()); - - if (task.points1.size() >= RANSAC_MIN_INLINERS) - { - Mat H = pastecEstimateRigidTransform(task.points2, task.points1, true); - - if (countNonZero(H) == 0) - continue; - - Rect bRect1 = boundingRect(task.points1); + unordered_set firstImageIds; + // Extract the first i_nbResults ranked images from the vector. + getFirstImageIds(sortedResults, i_nbResults, firstImageIds); + // Continue with the common reranking logic + return rerankCommon(imagesReqHits, indexHits, firstImageIds); +} - pthread_mutex_lock(&mutex); - rankedResultsOut.push(SearchResult(i_maxVal, i_imageId, bRect1)); - pthread_mutex_unlock(&mutex); - } - } +/** + * @brief Return the first ids of ranked images from a sorted vector. + * @param sortedResults the sorted vector of results (weight, imageId). + * @param i_nbResults the number of images to return. + * @param firstImageIds a set to return the image ids. 
+ */ +void ImageReranker::getFirstImageIds(const vector> &sortedResults, + unsigned i_nbResults, unordered_set &firstImageIds) +{ + unsigned i_res = 0; + for (const auto& result : sortedResults) + { + if (i_res >= i_nbResults) + break; + + firstImageIds.insert(result.second); // Insert the image ID + i_res++; } } - -void ImageReranker::rerank(unordered_map > &imagesReqHits, - unordered_map > &indexHits, - priority_queue &rankedResultsIn, - priority_queue &rankedResultsOut, - unsigned i_nbResults) +/** + * @brief Common reranking implementation. + * @param imagesReqHits the hits of the request image. + * @param indexHits the hits of the index. + * @param firstImageIds the set of image IDs to rerank. + * @return A vector of reranked search results. + */ +vector ImageReranker::rerankCommon(unordered_map > &imagesReqHits, + unordered_map* > &indexHits, + unordered_set &firstImageIds) { - unordered_set firstImageIds; - - // Extract the first i_nbResults ranked images. - getFirstImageIds(rankedResultsIn, i_nbResults, firstImageIds); - - unordered_map imgTasks; + // Use PointPairs instead of RANSACTask + unordered_map imgPointPairs; // Compute the histograms. unordered_map histograms; // key: the image id, value: the corresponding histogram. + + unsigned totalMatches = 0; + unsigned totalHistogramEntries = 0; + unsigned totalPointPairs = 0; for (unordered_map >::const_iterator it = imagesReqHits.begin(); it != imagesReqHits.end(); ++it) @@ -92,15 +107,24 @@ void ImageReranker::rerank(unordered_map > &imagesReqHits, // If there is several hits for the same word in the image... 
const u_int16_t i_angle1 = hits.front().i_angle; const Point2f point1(hits.front().x, hits.front().y); - const vector &hitIndex = indexHits[i_wordId]; + const vector *hitIndex = indexHits[i_wordId]; + + if (!hitIndex) { + continue; + } + + unsigned matchesForThisWord = 0; - for (unsigned i = 0; i < hitIndex.size(); ++i) + for (unsigned i = 0; i < hitIndex->size(); ++i) { - const u_int32_t i_imageId = hitIndex[i].i_imageId; + const u_int32_t i_imageId = (*hitIndex)[i].i_imageId; // Test if the image belongs to the image to rerank. if (firstImageIds.find(i_imageId) != firstImageIds.end()) { - const u_int16_t i_angle2 = hitIndex[i].i_angle; + matchesForThisWord++; + totalMatches++; + + const u_int16_t i_angle2 = (*hitIndex)[i].i_angle; float f_diff = angleDiff(i_angle1, i_angle2); unsigned bin = (f_diff - DIFF_MIN) / 360 * HISTOGRAM_NB_BINS; assert(bin < HISTOGRAM_NB_BINS); @@ -108,45 +132,77 @@ void ImageReranker::rerank(unordered_map > &imagesReqHits, Histogram &histogram = histograms[i_imageId]; histogram.bins[bin]++; histogram.i_total++; + totalHistogramEntries++; - const Point2f point2(hitIndex[i].x, hitIndex[i].y); - RANSACTask &imgTask = imgTasks[i_imageId]; + const Point2f point2((*hitIndex)[i].x, (*hitIndex)[i].y); + PointPairs &pointPairs = imgPointPairs[i_imageId]; - imgTask.points1.push_back(point1); - imgTask.points2.push_back(point2); + pointPairs.points1.push_back(point1); + pointPairs.points2.push_back(point2); + totalPointPairs++; } } + + // Debug output removed to improve performance } - pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - - #define NB_RANSAC_THREAD 4 - RANSACThread *threads[NB_RANSAC_THREAD]; - - for (unsigned i = 0; i < NB_RANSAC_THREAD; ++i) - threads[i] = new RANSACThread(mutex, imgTasks, rankedResultsOut); - + // Create a vector to store the results + vector rankedResults; + rankedResults.reserve(histograms.size()); // Reserve space for efficiency + + // Process all images in a single thread + unsigned ransacAttempts = 0; + 
unsigned successfulRansacs = 0; + unsigned skippedDueToLowValue = 0; + unsigned skippedDueToFewPoints = 0; + unsigned skippedDueToZeroH = 0; + // Rank the images according to their histogram. - unsigned i = 0; - for (unordered_map::iterator it = histograms.begin(); - it != histograms.end(); ++it, ++i) + for (const auto& histogramPair : histograms) { - unsigned i_imageId = it->first; - Histogram histogram = it->second; - threads[i % NB_RANSAC_THREAD]->imageIds.push_back(i_imageId); - threads[i % NB_RANSAC_THREAD]->histograms.push_back(histogram); - } + const unsigned i_imageId = histogramPair.first; + const Histogram& histogram = histogramPair.second; + + // Find the maximum bin value + unsigned i_binMax = max_element(histogram.bins, histogram.bins + HISTOGRAM_NB_BINS) - histogram.bins; + float i_maxVal = histogram.bins[i_binMax]; + + if (i_maxVal > 10) + { + const PointPairs& pointPairs = imgPointPairs[i_imageId]; + assert(pointPairs.points1.size() == pointPairs.points2.size()); - // Compute - for (unsigned i = 0; i < NB_RANSAC_THREAD; ++i) - threads[i]->start(); - for (unsigned i = 0; i < NB_RANSAC_THREAD; ++i) - { - threads[i]->join(); - delete threads[i]; + if (pointPairs.points1.size() >= RANSAC_MIN_INLINERS) + { + ransacAttempts++; + Mat H = RANSACHelper::pastecEstimateRigidTransform(pointPairs.points2, pointPairs.points1, true); + + if (countNonZero(H) == 0) { + skippedDueToZeroH++; + continue; + } + + Rect bRect1 = boundingRect(pointPairs.points1); + rankedResults.push_back(SearchResult(i_maxVal, i_imageId, bRect1)); + + successfulRansacs++; + } + else { + skippedDueToFewPoints++; + } + } + else { + skippedDueToLowValue++; + } } - pthread_mutex_destroy(&mutex); + // Sort the results by weight in descending order + sort(rankedResults.begin(), rankedResults.end(), + [](const SearchResult& a, const SearchResult& b) { + return a.f_weight > b.f_weight; + }); + + return rankedResults; } @@ -167,26 +223,135 @@ class Pos { /** - * @brief Return the first ids of 
ranked images. - * @param rankedResultsIn the ranked images. - * @param i_nbResults the number of images to return. - * @param firstImageIds a set to return the image ids. + * @brief Rerank images using the forward index for better performance. + * @param imagesReqHits the hits of the request image. + * @param index the ORB index with forward index. + * @param firstImageIds the set of image IDs to rerank. + * @return A vector of reranked search results. */ -void ImageReranker::getFirstImageIds(priority_queue &rankedResultsIn, - unsigned i_nbResults, unordered_set &firstImageIds) -{ - unsigned i_res = 0; - while(!rankedResultsIn.empty() - && i_res < i_nbResults) +vector ImageReranker::rerankUsingForwardIndex(unordered_map > &imagesReqHits, + ORBIndex* index, + unordered_set &firstImageIds) +{ + // Create a map of query words for fast lookup + unordered_map queryWords; + for (const auto& pair : imagesReqHits) { + queryWords[pair.first] = pair.second.front(); + } + + // Use PointPairs instead of RANSACTask + unordered_map imgPointPairs; + + // Compute the histograms + unordered_map histograms; + + unsigned totalMatches = 0; + unsigned totalHistogramEntries = 0; + unsigned totalPointPairs = 0; + + // Process each image in the reranking set + for (const u_int32_t i_imageId : firstImageIds) { + // Get all words for this image from the forward index + const vector& imageWords = index->getForwardIndexWords(i_imageId); + + // For each word in this image + for (const unsigned i_wordId : imageWords) { + // Check if this word exists in the query image + auto queryIt = queryWords.find(i_wordId); + if (queryIt == queryWords.end()) { + continue; // Word not in query, skip + } + + // Get the hit from the index for this word and image + const Hit* indexHit = index->getHitForWordAndImage(i_wordId, i_imageId); + if (!indexHit) { + continue; // No hit found, skip + } + + totalMatches++; + + // Calculate angle difference + const u_int16_t i_angle1 = queryIt->second.i_angle; + const 
u_int16_t i_angle2 = indexHit->i_angle; + float f_diff = angleDiff(i_angle1, i_angle2); + unsigned bin = (f_diff - DIFF_MIN) / 360 * HISTOGRAM_NB_BINS; + assert(bin < HISTOGRAM_NB_BINS); + + // Update histogram + Histogram &histogram = histograms[i_imageId]; + histogram.bins[bin]++; + histogram.i_total++; + totalHistogramEntries++; + + // Store point pairs for RANSAC + const Point2f point1(queryIt->second.x, queryIt->second.y); + const Point2f point2(indexHit->x, indexHit->y); + PointPairs &pointPairs = imgPointPairs[i_imageId]; + + pointPairs.points1.push_back(point1); + pointPairs.points2.push_back(point2); + totalPointPairs++; + } + } + + // Create a vector to store the results + vector rankedResults; + rankedResults.reserve(histograms.size()); // Reserve space for efficiency + + // Process all images in a single thread + unsigned ransacAttempts = 0; + unsigned successfulRansacs = 0; + unsigned skippedDueToLowValue = 0; + unsigned skippedDueToFewPoints = 0; + unsigned skippedDueToZeroH = 0; + + // Rank the images according to their histogram. 
+ for (const auto& histogramPair : histograms) { - const SearchResult &res = rankedResultsIn.top(); - firstImageIds.insert(res.i_imageId); - rankedResultsIn.pop(); - i_res++; + const unsigned i_imageId = histogramPair.first; + const Histogram& histogram = histogramPair.second; + + // Find the maximum bin value + unsigned i_binMax = max_element(histogram.bins, histogram.bins + HISTOGRAM_NB_BINS) - histogram.bins; + float i_maxVal = histogram.bins[i_binMax]; + + if (i_maxVal > 10) + { + const PointPairs& pointPairs = imgPointPairs[i_imageId]; + assert(pointPairs.points1.size() == pointPairs.points2.size()); + + if (pointPairs.points1.size() >= RANSAC_MIN_INLINERS) + { + ransacAttempts++; + Mat H = RANSACHelper::pastecEstimateRigidTransform(pointPairs.points2, pointPairs.points1, true); + if (countNonZero(H) == 0) { + skippedDueToZeroH++; + continue; + } + + Rect bRect1 = boundingRect(pointPairs.points1); + rankedResults.push_back(SearchResult(i_maxVal, i_imageId, bRect1)); + + successfulRansacs++; + } + else { + skippedDueToFewPoints++; + } + } + else { + skippedDueToLowValue++; + } } + + // Sort the results by weight in descending order + sort(rankedResults.begin(), rankedResults.end(), + [](const SearchResult& a, const SearchResult& b) { + return a.f_weight > b.f_weight; + }); + + return rankedResults; } - float ImageReranker::angleDiff(unsigned i_angle1, unsigned i_angle2) { // Convert the angle in the [-180, 180] range. 
diff --git a/src/imagererankerransac.cpp b/src/imagererankerransac.cpp index 319f960..0ff34a5 100644 --- a/src/imagererankerransac.cpp +++ b/src/imagererankerransac.cpp @@ -49,7 +49,7 @@ #include -void RANSACThread::getRTMatrix(const Point2f* a, const Point2f* b, +void RANSACHelper::getRTMatrix(const Point2f* a, const Point2f* b, int count, Mat& M, bool fullAffine) { CV_Assert( M.isContinuous() ); @@ -133,12 +133,13 @@ void RANSACThread::getRTMatrix(const Point2f* a, const Point2f* b, } -cv::Mat RANSACThread::pastecEstimateRigidTransform(InputArray src1, InputArray src2, +cv::Mat RANSACHelper::pastecEstimateRigidTransform(InputArray src1, InputArray src2, bool fullAffine) { Mat M(2, 3, CV_64F), A = src1.getMat(), B = src2.getMat(); - const int RANSAC_MAX_ITERS = 5000; + // Reduced from 5000 to 1000 to improve performance + const int RANSAC_MAX_ITERS = 1000; const int RANSAC_SIZE0 = 3; std::vector pA, pB; diff --git a/src/main.cpp b/src/main.cpp index 1de7484..07cbd9b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,10 @@ #include #include +#ifdef USE_MIMALLOC +#include +#endif + #include #include #include @@ -42,12 +46,18 @@ void intHandler(int signum) { void printUsage() { cout << "Usage :" << endl - << "./PastecIndex [-p portNumber] [-i indexPath] [--forward-index] [--https] [--auth-key AuthKey] visualWordList" << endl; + << "./PastecIndex [-p portNumber] [-i indexPath] [-t tagsPath] [--forward-index] [--https] [--auth-key AuthKey] visualWordList" << endl; } int main(int argc, char** argv) { +#ifdef USE_MIMALLOC + // Initialize mimalloc + mi_version(); + cout << "Using mimalloc memory allocator" << endl; +#endif + cout << "Pastec Index v0.0.1" << endl; if (argc < 2) @@ -66,6 +76,7 @@ int main(int argc, char** argv) unsigned i_port = 4212; string visualWordPath; string indexPath(DEFAULT_INDEX_PATH); + string tagsPath; bool buildForwardIndex = false; string authKey(""); bool https = false; @@ -83,6 +94,11 @@ int main(int argc, char** argv) 
EXIT_IF_LAST_ARGUMENT() indexPath = argv[++i]; } + else if (string(argv[i]) == "-t") + { + EXIT_IF_LAST_ARGUMENT() + tagsPath = argv[++i]; + } else if (string(argv[i]) == "--auth-key") { EXIT_IF_LAST_ARGUMENT() @@ -108,7 +124,7 @@ int main(int argc, char** argv) ++i; } - Index *index = new ORBIndex(indexPath, buildForwardIndex); + Index *index = new ORBIndex(indexPath, tagsPath, buildForwardIndex); ORBWordIndex *wordIndex = new ORBWordIndex(visualWordPath); FeatureExtractor *ife = new ORBFeatureExtractor((ORBIndex *)index, wordIndex); Searcher *is = new ORBSearcher((ORBIndex *)index, wordIndex); diff --git a/src/orb/orbfeatureextractor.cpp b/src/orb/orbfeatureextractor.cpp index 661745c..7f48f84 100644 --- a/src/orb/orbfeatureextractor.cpp +++ b/src/orb/orbfeatureextractor.cpp @@ -35,6 +35,56 @@ ORBFeatureExtractor::ORBFeatureExtractor(ORBIndex *index, ORBWordIndex *wordInde : index(index), wordIndex(wordIndex), orb(ORB::create(2000, 1.02, 100)) { } +u_int32_t ORBFeatureExtractor::extractFeatures(unsigned i_imageId, unsigned i_imgSize, + char *p_imgData, list &hits, + unsigned &i_nbFeaturesExtracted) +{ + Mat img; + u_int32_t i_ret = ImageLoader::loadImage(i_imgSize, p_imgData, img); + if (i_ret != OK) + return i_ret; + + vector keypoints; + Mat descriptors; + + orb->detectAndCompute(img, noArray(), keypoints, descriptors); + i_nbFeaturesExtracted = keypoints.size(); + + unsigned i_nbKeyPoints = 0; + unordered_set matchedWords; + for (unsigned i = 0; i < keypoints.size(); ++i) + { + i_nbKeyPoints++; + + // Recording the angle on 16 bits. 
+ u_int16_t angle = keypoints[i].angle / 360 * (1 << 16); + u_int16_t x = keypoints[i].pt.x; + u_int16_t y = keypoints[i].pt.y; + + vector indices(1); + vector dists(1); + wordIndex->knnSearch(descriptors.row(i), indices, dists, 1); + + for (unsigned j = 0; j < indices.size(); ++j) + { + const unsigned i_wordId = indices[j]; + if (matchedWords.find(i_wordId) == matchedWords.end()) + { + HitForward newHit; + newHit.i_wordId = i_wordId; + newHit.i_imageId = i_imageId; + newHit.i_angle = angle; + newHit.x = x; + newHit.y = y; + hits.push_back(newHit); + matchedWords.insert(i_wordId); + } + } + } + + return IMAGE_ADDED; +} + u_int32_t ORBFeatureExtractor::processNewImage(unsigned i_imageId, unsigned i_imgSize, char *p_imgData, unsigned &i_nbFeaturesExtracted) diff --git a/src/orb/orbindex.cpp b/src/orb/orbindex.cpp index f076113..2c9cd76 100644 --- a/src/orb/orbindex.cpp +++ b/src/orb/orbindex.cpp @@ -31,8 +31,9 @@ #include -ORBIndex::ORBIndex(string indexPath, bool buildForwardIndex) - : buildForwardIndex(buildForwardIndex) +ORBIndex::ORBIndex(string indexPath, string tagsPath, bool buildForwardIndex) + : buildForwardIndex(buildForwardIndex), totalNbRecords(0), m_totalIndexedImages(0), + storedIndexPath(indexPath), storedTagsPath(tagsPath) { // Init the mutex. 
pthread_rwlock_init(&rwLock, NULL); @@ -42,6 +43,8 @@ ORBIndex::ORBIndex(string indexPath, bool buildForwardIndex) nbOccurences[i] = 0; load(indexPath); + loadTags(tagsPath); + cout << "DEBUG: Index initialized with " << nbWords.size() << endl; } @@ -52,10 +55,9 @@ ORBIndex::ORBIndex(string indexPath, bool buildForwardIndex) */ unsigned ORBIndex::getWordNbOccurences(unsigned i_wordId) { - pthread_rwlock_rdlock(&rwLock); + // No locks needed since the index is read-only during queries assert(i_wordId < NB_VISUAL_WORDS); unsigned i_ret = nbOccurences[i_wordId]; - pthread_rwlock_unlock(&rwLock); return i_ret; } @@ -67,18 +69,20 @@ ORBIndex::~ORBIndex() void ORBIndex::getImagesWithVisualWords(unordered_map > &imagesReqHits, - unordered_map > &indexHitsForReq) + unordered_map* > &indexHitsForReq) { - pthread_rwlock_rdlock(&rwLock); - + // Pre-allocate memory for the result to avoid reallocations + indexHitsForReq.reserve(imagesReqHits.size()); + + // No locks needed since the index is read-only during queries for (unordered_map >::const_iterator it = imagesReqHits.begin(); it != imagesReqHits.end(); ++it) { const unsigned i_wordId = it->first; - indexHitsForReq[i_wordId] = indexHits[i_wordId]; + + // Direct access without locks - store pointer to original data instead of copying + indexHitsForReq[i_wordId] = &indexHits[i_wordId]; } - - pthread_rwlock_unlock(&rwLock); } @@ -86,21 +90,41 @@ void ORBIndex::getImagesWithVisualWords(unordered_map > &im * @brief Return the number of words for an image * @param i_imageId the image id. * @return the number of words. - * readLock() and unlock MUST be called before and after calling this function. + * No locks needed since the index is read-only during queries. */ unsigned ORBIndex::countTotalNbWord(unsigned i_imageId) { + // Make sure the image ID is within bounds + if (i_imageId >= nbWords.size()) + return 0; + unsigned i_ret = nbWords[i_imageId]; return i_ret; } +/** + * @brief Recalculate the total number of indexed images. 
+ * This method counts the number of images that have at least one word in the index. + */ +void ORBIndex::recalculateTotalIndexedImages() +{ + m_totalIndexedImages = 0; + for (const auto& wordCount : nbWords) { + if (wordCount > 0) { + m_totalIndexedImages++; + } + } +} + +/** + * @brief Return the total number of indexed images. + * @return the number of images that have at least one word in the index. + */ unsigned ORBIndex::getTotalNbIndexedImages() { - pthread_rwlock_rdlock(&rwLock); - unsigned i_ret = nbWords.size(); - pthread_rwlock_unlock(&rwLock); - return i_ret; + // No locks needed since the index is read-only during queries + return m_totalIndexedImages; } @@ -111,12 +135,25 @@ unsigned ORBIndex::getTotalNbIndexedImages() u_int32_t ORBIndex::addImage(unsigned i_imageId, list hitList) { pthread_rwlock_wrlock(&rwLock); - if (nbWords.find(i_imageId) != nbWords.end()) + + // Check if image already exists + if (i_imageId < nbWords.size() && nbWords[i_imageId] > 0) { pthread_rwlock_unlock(&rwLock); removeImage(i_imageId); pthread_rwlock_wrlock(&rwLock); } + + // Ensure vectors have sufficient capacity + if (i_imageId >= nbWords.size()) + { + nbWords.resize(i_imageId + 1, 0); + if (buildForwardIndex) + { + forwardIndex.resize(i_imageId + 1); + } + tags.resize(i_imageId + 1); + } for (list::iterator it = hitList.begin(); it != hitList.end(); ++it) { @@ -137,6 +174,8 @@ u_int32_t ORBIndex::addImage(unsigned i_imageId, list hitList) nbOccurences[hitFor.i_wordId]++; totalNbRecords++; } + updateIndexState(); + pthread_rwlock_unlock(&rwLock); if (!hitList.empty()) @@ -146,6 +185,95 @@ u_int32_t ORBIndex::addImage(unsigned i_imageId, list hitList) return IMAGE_ADDED; } +/** + * @brief Add multiple images to the index in a single transaction. + * @param batchHits map of image IDs to their respective hit lists. + * @return IMAGE_ADDED on success. 
+ */ +u_int32_t ORBIndex::addBatchImages(const unordered_map>& batchHits) +{ + pthread_rwlock_wrlock(&rwLock); + + // Find maximum image ID to ensure array capacity + u_int32_t maxImageId = 0; + for (const auto& pair : batchHits) { + maxImageId = std::max(maxImageId, pair.first); + } + + // Ensure vectors have sufficient capacity + if (maxImageId >= nbWords.size()) { + nbWords.resize(maxImageId + 1, 0); + if (buildForwardIndex) { + forwardIndex.resize(maxImageId + 1); + } + tags.resize(maxImageId + 1); + } + + // Add all the new hits + for (const auto& pair : batchHits) { + u_int32_t imageId = pair.first; + const list& hitList = pair.second; + + for (const HitForward& hitFor : hitList) { + assert(imageId == hitFor.i_imageId); + Hit hitBack; + hitBack.i_imageId = hitFor.i_imageId; + hitBack.i_angle = hitFor.i_angle; + hitBack.x = hitFor.x; + hitBack.y = hitFor.y; + + if (buildForwardIndex) { + forwardIndex[hitFor.i_imageId].push_back(hitFor.i_wordId); + } + indexHits[hitFor.i_wordId].push_back(hitBack); + nbWords[hitFor.i_imageId]++; + nbOccurences[hitFor.i_wordId]++; + totalNbRecords++; + } + } + + updateIndexState(); + + pthread_rwlock_unlock(&rwLock); + + return IMAGE_ADDED; +} + +/** + * @brief Add multiple tags to images in a single transaction. + * @param batchTags map of image IDs to their respective tags. + * @return IMAGE_TAG_ADDED on success. 
+ */ +u_int32_t ORBIndex::addBatchTags(const unordered_map& batchTags) +{ + if (batchTags.empty()) { + cout << "DEBUG: No tags to add in batch" << endl; + return OK; + } + + pthread_rwlock_wrlock(&rwLock); + + // Find maximum image ID to ensure array capacity + u_int32_t maxImageId = 0; + for (const auto& pair : batchTags) { + maxImageId = std::max(maxImageId, pair.first); + } + + // Ensure tags vector has sufficient capacity + if (maxImageId >= tags.size()) { + tags.resize(maxImageId + 1); + } + + // Add all tags + for (const auto& pair : batchTags) { + tags[pair.first] = pair.second; + } + + pthread_rwlock_unlock(&rwLock); + + return IMAGE_TAG_ADDED; +} + /** * @brief Add a string tag to an image. @@ -155,11 +283,17 @@ u_int32_t ORBIndex::addTag(const unsigned i_imageId, const string tag) { pthread_rwlock_wrlock(&rwLock); - if (nbWords.find(i_imageId) == nbWords.end()) { + // Check if image exists + if (i_imageId >= nbWords.size() || nbWords[i_imageId] == 0) { pthread_rwlock_unlock(&rwLock); return IMAGE_NOT_FOUND; } + // Ensure tags vector has sufficient capacity + if (i_imageId >= tags.size()) { + tags.resize(i_imageId + 1); + } + tags[i_imageId] = tag; pthread_rwlock_unlock(&rwLock); @@ -181,33 +315,25 @@ u_int32_t ORBIndex::removeImage(const unsigned i_imageId) removeTag((u_int64_t)i_imageId); pthread_rwlock_wrlock(&rwLock); - unordered_map::iterator imgIt = - nbWords.find(i_imageId); - - if (imgIt == nbWords.end()) + + // Check if image exists + if (i_imageId >= nbWords.size() || nbWords[i_imageId] == 0) { cout << "Image " << i_imageId << " not found." 
<< endl; pthread_rwlock_unlock(&rwLock); return IMAGE_NOT_FOUND; } - nbWords.erase(imgIt); + // Set word count to 0 (effectively removing the image) + nbWords[i_imageId] = 0; - if (buildForwardIndex) + if (buildForwardIndex && i_imageId < forwardIndex.size()) { - unordered_map >::iterator forwardIndexIt = - forwardIndex.find(i_imageId); - - if (forwardIndexIt == forwardIndex.end()) - { - cout << "Image " << i_imageId << " not found." << endl; - pthread_rwlock_unlock(&rwLock); - return IMAGE_NOT_FOUND; - } - - forwardIndex.erase(forwardIndexIt); + // Clear the forward index for this image + forwardIndex[i_imageId].clear(); } + // Remove hits from indexHits for (unsigned i_wordId = 0; i_wordId < NB_VISUAL_WORDS; ++i_wordId) { vector &hits = indexHits[i_wordId]; @@ -225,6 +351,7 @@ u_int32_t ORBIndex::removeImage(const unsigned i_imageId) ++it; } } + updateIndexState(); pthread_rwlock_unlock(&rwLock); cout << "Image " << i_imageId << " removed." << endl; @@ -239,48 +366,38 @@ u_int32_t ORBIndex::removeImage(const unsigned i_imageId) */ u_int32_t ORBIndex::getImageWords(unsigned i_imageId, unordered_map > &hitList) { - pthread_rwlock_wrlock(&rwLock); - + // No locks needed since the index is read-only during queries const unsigned i_nbTotalIndexedImages = getTotalNbIndexedImages(); const unsigned i_maxNbOccurences = i_nbTotalIndexedImages > 10000 ? 0.15 * i_nbTotalIndexedImages : i_nbTotalIndexedImages; - unordered_map::iterator imgIt = - nbWords.find(i_imageId); - - if (imgIt == nbWords.end()) + // Check if image exists + if (i_imageId >= nbWords.size() || nbWords[i_imageId] == 0) { cout << "Image " << i_imageId << " not found." 
<< endl; - pthread_rwlock_unlock(&rwLock); return IMAGE_NOT_FOUND; } - if (buildForwardIndex) + if (buildForwardIndex && i_imageId < forwardIndex.size()) { - vector &words = forwardIndex[i_imageId]; - vector::iterator word_it = words.begin(); - - while (word_it != words.end()) + const vector &words = forwardIndex[i_imageId]; + + for (const unsigned i_wordId : words) { - unsigned i_wordId = *word_it; - if (getWordNbOccurences(i_wordId) <= i_maxNbOccurences) { vector &hits = indexHits[i_wordId]; - vector::iterator hit_it = hits.begin(); - - while (hit_it != hits.end()) + + for (const Hit &hit : hits) { - if (hit_it->i_imageId == i_imageId) + if (hit.i_imageId == i_imageId) { - hitList[i_wordId].push_back(*hit_it); + hitList[i_wordId].push_back(hit); break; } - ++hit_it; } } - ++word_it; } } else @@ -288,25 +405,21 @@ u_int32_t ORBIndex::getImageWords(unsigned i_imageId, unordered_map &hits = indexHits[i_wordId]; - vector::iterator it = hits.begin(); - - while (it != hits.end()) + + for (const Hit &hit : hits) { - if (it->i_imageId == i_imageId) + if (hit.i_imageId == i_imageId) { if (getWordNbOccurences(i_wordId) <= i_maxNbOccurences) { - hitList[i_wordId].push_back(*it); + hitList[i_wordId].push_back(hit); } break; } - ++it; } } } - pthread_rwlock_unlock(&rwLock); - cout << "Image " << i_imageId << " found with " << hitList.size() << " words." 
<< endl; return OK; @@ -320,15 +433,14 @@ u_int32_t ORBIndex::removeTag(const unsigned i_imageId) { pthread_rwlock_wrlock(&rwLock); - unordered_map::iterator tagIt = - tags.find(i_imageId); - - if (tagIt == tags.end()) { + // Check if tag exists + if (i_imageId >= tags.size() || tags[i_imageId].empty()) { pthread_rwlock_unlock(&rwLock); return IMAGE_TAG_NOT_FOUND; } - tags.erase(tagIt); + // Clear the tag (set to empty string) + tags[i_imageId] = ""; pthread_rwlock_unlock(&rwLock); @@ -345,19 +457,14 @@ u_int32_t ORBIndex::removeTag(const unsigned i_imageId) */ u_int32_t ORBIndex::getTag(const unsigned i_imageId, string &tag) { - pthread_rwlock_rdlock(&rwLock); - - unordered_map::iterator tagIt = - tags.find(i_imageId); - - if (tagIt == tags.end()) { - pthread_rwlock_unlock(&rwLock); + // No locks needed since the index is read-only during queries + + // Check if tag exists + if (i_imageId >= tags.size() || tags[i_imageId].empty()) { return IMAGE_TAG_NOT_FOUND; } - tag = tagIt->second; - - pthread_rwlock_unlock(&rwLock); + tag = tags[i_imageId]; return OK; } @@ -371,7 +478,13 @@ u_int32_t ORBIndex::getTag(const unsigned i_imageId, string &tag) u_int32_t ORBIndex::write(string backwardIndexPath) { if (backwardIndexPath == "") - backwardIndexPath = DEFAULT_INDEX_PATH; + { + // If no path is provided, use the stored path from constructor + if (storedIndexPath != "") + backwardIndexPath = storedIndexPath; + else + backwardIndexPath = DEFAULT_INDEX_PATH; + } ofstream ofs; @@ -419,6 +532,7 @@ u_int32_t ORBIndex::write(string backwardIndexPath) u_int32_t ORBIndex::clear() { pthread_rwlock_wrlock(&rwLock); + // Reset the nbOccurences table. 
for (unsigned i = 0; i < NB_VISUAL_WORDS; ++i) { @@ -426,11 +540,16 @@ u_int32_t ORBIndex::clear() indexHits[i].clear(); } - nbWords.clear(); - forwardIndex.clear(); - tags.clear(); + // Clear vectors (but keep their capacity) + std::fill(nbWords.begin(), nbWords.end(), 0); + for (auto& words : forwardIndex) { + words.clear(); + } + std::fill(tags.begin(), tags.end(), ""); totalNbRecords = 0; + m_totalIndexedImages = 0; // Reset the cached count since all images are removed + pthread_rwlock_unlock(&rwLock); cout << "Index cleared." << endl; @@ -446,91 +565,132 @@ u_int32_t ORBIndex::clear() */ u_int32_t ORBIndex::load(string backwardIndexPath) { - u_int32_t i_ret; - - // Open the file. - BackwardIndexReaderFileAccess indexAccess; + struct timeval start, end; + gettimeofday(&start, NULL); + + cout << "Loading index from " << backwardIndexPath << endl; + + // Open the file using memory mapping for optimal performance + BackwardIndexReaderMMapAccess indexAccess; if (!indexAccess.open(backwardIndexPath)) { cout << "Could not open the backward index file." << endl; - i_ret = INDEX_NOT_FOUND; + return INDEX_NOT_FOUND; } - else - { - clear(); - - pthread_rwlock_wrlock(&rwLock); - - /* Read the table to know where are located the lines corresponding to each - * visual word. */ - cout << "Reading the numbers of occurences." 
<< endl; - u_int64_t *wordOffSet = new u_int64_t[NB_VISUAL_WORDS]; - u_int64_t i_offset = NB_VISUAL_WORDS * sizeof(u_int64_t); - for (unsigned i = 0; i < NB_VISUAL_WORDS; ++i) - { - indexAccess.read((char *)(nbOccurences + i), sizeof(u_int64_t)); - wordOffSet[i] = i_offset; - i_offset += nbOccurences[i] * BACKWARD_INDEX_ENTRY_SIZE; + + clear(); + pthread_rwlock_wrlock(&rwLock); + + // Get direct pointer to the mapped data + char* mappedData = indexAccess.getDataPtr(0); + u_int64_t fileSize = indexAccess.getFileSize(); + + cout << "File size: " << fileSize / (1024 * 1024) << " MB" << endl; + + // Read the occurrence counts (first part of the file) + cout << "Reading occurrence counts..." << endl; + memcpy(nbOccurences, mappedData, NB_VISUAL_WORDS * sizeof(u_int64_t)); + + // Calculate total number of hits and prepare for single-pass processing + u_int64_t totalHits = 0; + for (unsigned i = 0; i < NB_VISUAL_WORDS; ++i) { + totalHits += nbOccurences[i]; + } + + cout << "Total hits: " << totalHits << endl; + totalNbRecords = totalHits; + + // Pre-allocate memory for all hits + cout << "Pre-allocating memory for hits..." << endl; + for (unsigned i = 0; i < NB_VISUAL_WORDS; ++i) { + if (nbOccurences[i] > 0) { + indexHits[i].reserve(nbOccurences[i]); } - - /* Count the number of words per image. */ - cout << "Counting the number of words per image." << endl; - totalNbRecords = 0; - while (true) - { - u_int32_t i_imageId; - u_int16_t i_angle, x, y; - indexAccess.read((char *)&i_imageId, sizeof(u_int32_t)); - if (indexAccess.endOfIndex()) + } + + // Process all hits in a single pass + cout << "Processing hits in a single pass..." 
<< endl; + + // Start after the occurrence counts + char* dataPtr = mappedData + NB_VISUAL_WORDS * sizeof(u_int64_t); + u_int32_t maxImageId = 0; + + // Create a vector to count hits per word ID (to ensure we don't exceed nbOccurences) + vector hitCounts(NB_VISUAL_WORDS, 0); + + // Process all hits + for (unsigned i_wordId = 0; i_wordId < NB_VISUAL_WORDS; ++i_wordId) { + for (u_int64_t i = 0; i < nbOccurences[i_wordId]; ++i) { + // Direct struct access instead of individual reads + Hit hit; + memcpy(&hit.i_imageId, dataPtr, sizeof(u_int32_t)); + dataPtr += sizeof(u_int32_t); + memcpy(&hit.i_angle, dataPtr, sizeof(u_int16_t)); + dataPtr += sizeof(u_int16_t); + memcpy(&hit.x, dataPtr, sizeof(u_int16_t)); + dataPtr += sizeof(u_int16_t); + memcpy(&hit.y, dataPtr, sizeof(u_int16_t)); + dataPtr += sizeof(u_int16_t); + + // Update max image ID + maxImageId = std::max(maxImageId, hit.i_imageId); + + // Add hit to index + indexHits[i_wordId].push_back(hit); + hitCounts[i_wordId]++; + + // Ensure we don't exceed the array bounds + if (dataPtr - mappedData >= fileSize) { + cout << "Warning: Reached end of file before processing all expected hits" << endl; break; - indexAccess.read((char *)&i_angle, sizeof(u_int16_t)); - indexAccess.read((char *)&x, sizeof(u_int16_t)); - indexAccess.read((char *)&y, sizeof(u_int16_t)); - nbWords[i_imageId]++; - totalNbRecords++; + } } - - indexAccess.reset(); - - cout << "Loading the index in memory." 
<< endl; - - for (unsigned i_wordId = 0; i_wordId < NB_VISUAL_WORDS; ++i_wordId) - { - indexAccess.moveAt(wordOffSet[i_wordId]); - vector &hits = indexHits[i_wordId]; - - const unsigned i_nbOccurences = nbOccurences[i_wordId]; - hits.resize(i_nbOccurences); - - for (u_int64_t i = 0; i < i_nbOccurences; ++i) - { - u_int32_t i_imageId; - u_int16_t i_angle, x, y; - indexAccess.read((char *)&i_imageId, sizeof(u_int32_t)); - indexAccess.read((char *)&i_angle, sizeof(u_int16_t)); - indexAccess.read((char *)&x, sizeof(u_int16_t)); - indexAccess.read((char *)&y, sizeof(u_int16_t)); - hits[i].i_imageId = i_imageId; - hits[i].i_angle = i_angle; - hits[i].x = x; - hits[i].y = y; - - if (buildForwardIndex) - { - forwardIndex[i_imageId].push_back(i_wordId); - } + + // Verify we read the expected number of hits + if (hitCounts[i_wordId] != nbOccurences[i_wordId]) { + cout << "Warning: Expected " << nbOccurences[i_wordId] << " hits for word " + << i_wordId << " but read " << hitCounts[i_wordId] << endl; + } + } + + cout << "Maximum image ID: " << maxImageId << endl; + + // Resize vectors based on max image ID + nbWords.resize(maxImageId + 1, 0); + if (buildForwardIndex) { + forwardIndex.resize(maxImageId + 1); + } + + // Count words per image and build forward index in a single pass + cout << "Counting words per image and building forward index..." 
<< endl; + for (unsigned i_wordId = 0; i_wordId < NB_VISUAL_WORDS; ++i_wordId) { + const vector& hits = indexHits[i_wordId]; + for (const Hit& hit : hits) { + // Count words per image + nbWords[hit.i_imageId]++; + + // Build forward index if needed + if (buildForwardIndex) { + forwardIndex[hit.i_imageId].push_back(i_wordId); } } - - indexAccess.close(); - delete[] wordOffSet; - - pthread_rwlock_unlock(&rwLock); - - i_ret = INDEX_LOADED; } - - return i_ret; + + // Close the memory-mapped file + indexAccess.close(); + + // Update index state (recalculate total indexed images and sort word vectors) + updateIndexState(); + + pthread_rwlock_unlock(&rwLock); + + gettimeofday(&end, NULL); + double elapsed = (end.tv_sec - start.tv_sec) + + (end.tv_usec - start.tv_usec) / 1000000.0; + + cout << "Index loaded in " << elapsed << " seconds." << endl; + + return INDEX_LOADED; } @@ -541,41 +701,119 @@ u_int32_t ORBIndex::load(string backwardIndexPath) */ u_int32_t ORBIndex::loadTags(string indexTagsPath) { + struct timeval start, end; + gettimeofday(&start, NULL); + if (indexTagsPath == "") indexTagsPath = DEFAULT_INDEX_TAGS_PATH; - ifstream ifs; - - ifs.open(indexTagsPath.c_str(), ios_base::binary); - if (!ifs.good()) + cout << "Loading tags from " << indexTagsPath << endl; + + // Try to open the file using memory mapping + int fd = ::open(indexTagsPath.c_str(), O_RDONLY); + if (fd == -1) { cout << "Could not open the index tags file." << endl; return INDEX_TAGS_NOT_FOUND; } - + + // Get file size + struct stat sb; + if (fstat(fd, &sb) == -1) + { + cout << "Could not get file size." << endl; + ::close(fd); + return INDEX_TAGS_NOT_FOUND; + } + + u_int64_t fileSize = sb.st_size; + if (fileSize == 0) + { + cout << "Tags file is empty." 
<< endl; + ::close(fd); + return INDEX_TAGS_LOADED; // Empty file is not an error + } + + // Map the file into memory + void* mappedData = mmap(NULL, fileSize, PROT_READ, MAP_PRIVATE, fd, 0); + if (mappedData == MAP_FAILED) + { + cout << "Could not memory map the tags file." << endl; + ::close(fd); + return INDEX_TAGS_NOT_FOUND; + } + + // Advise the kernel that we'll access the data sequentially + madvise(mappedData, fileSize, MADV_SEQUENTIAL); + pthread_rwlock_wrlock(&rwLock); - - tags.clear(); - while (true) + + // First pass: find maximum image ID + char* dataPtr = static_cast(mappedData); + char* endPtr = dataPtr + fileSize; + u_int32_t maxImageId = 0; + + while (dataPtr < endPtr - sizeof(u_int32_t) * 2) // Need at least space for imageId and tagSize { - // Read the image tag. u_int32_t i_imageId; + memcpy(&i_imageId, dataPtr, sizeof(u_int32_t)); + dataPtr += sizeof(u_int32_t); + u_int32_t i_tagSize; - ifs.read((char *)&i_imageId, sizeof(u_int32_t)); - if (ifs.eof()) + memcpy(&i_tagSize, dataPtr, sizeof(u_int32_t)); + dataPtr += sizeof(u_int32_t); + + // Skip tag content + dataPtr += i_tagSize; + + maxImageId = std::max(maxImageId, i_imageId); + + // Check if we've reached the end of the file + if (dataPtr >= endPtr) break; - ifs.read((char *)&i_tagSize, sizeof(u_int32_t)); - char psz_tag[i_tagSize]; - ifs.read((char *)psz_tag, i_tagSize); - - cout << i_imageId << " " << i_tagSize << " " << psz_tag << endl; - - // Save it into the memory. 
- tags[i_imageId] = string(psz_tag); } - + + cout << "Maximum tag image ID: " << maxImageId << endl; + + // Ensure tags vector has sufficient capacity + tags.resize(maxImageId + 1); + + // Second pass: load the actual tags + dataPtr = static_cast(mappedData); + + while (dataPtr < endPtr - sizeof(u_int32_t) * 2) + { + u_int32_t i_imageId; + memcpy(&i_imageId, dataPtr, sizeof(u_int32_t)); + dataPtr += sizeof(u_int32_t); + + u_int32_t i_tagSize; + memcpy(&i_tagSize, dataPtr, sizeof(u_int32_t)); + dataPtr += sizeof(u_int32_t); + + // Read tag content + if (dataPtr + i_tagSize <= endPtr) { + tags[i_imageId] = string(dataPtr, i_tagSize - 1); // Subtract 1 to exclude null terminator + } else { + cout << "Warning: Tag data for image " << i_imageId << " extends beyond file end" << endl; + break; + } + + dataPtr += i_tagSize; + } + + // Unmap and close the file + munmap(mappedData, fileSize); + ::close(fd); + pthread_rwlock_unlock(&rwLock); - + + gettimeofday(&end, NULL); + double elapsed = (end.tv_sec - start.tv_sec) + + (end.tv_usec - start.tv_usec) / 1000000.0; + + cout << "Tags loaded in " << elapsed << " seconds." << endl; + return INDEX_TAGS_LOADED; } @@ -588,7 +826,13 @@ u_int32_t ORBIndex::loadTags(string indexTagsPath) u_int32_t ORBIndex::writeTags(string indexTagsPath) { if (indexTagsPath == "") - indexTagsPath = DEFAULT_INDEX_TAGS_PATH; + { + // If no path is provided, use the stored path from constructor + if (storedTagsPath != "") + indexTagsPath = storedTagsPath; + else + indexTagsPath = DEFAULT_INDEX_TAGS_PATH; + } ofstream ofs; @@ -603,17 +847,18 @@ u_int32_t ORBIndex::writeTags(string indexTagsPath) cout << "Writing the index image tags." 
<< endl; - for (unordered_map::const_iterator it = tags.begin(); - it != tags.end(); ++it) + // Write only non-empty tags + for (size_t i_imageId = 0; i_imageId < tags.size(); ++i_imageId) { - u_int32_t i_imageId = it->first; - const char *psz_tag = it->second.c_str(); - u_int32_t i_tagSize = strlen(psz_tag) + 1; + if (!tags[i_imageId].empty()) + { + const char *psz_tag = tags[i_imageId].c_str(); + u_int32_t i_tagSize = strlen(psz_tag) + 1; - ofs.write((char *)(&i_imageId), sizeof(u_int32_t)); - ofs.write((char *)(&i_tagSize), sizeof(u_int32_t)); - ofs.write((char *)(psz_tag), i_tagSize); - cout << "plop!" << endl; + ofs.write((char *)(&i_imageId), sizeof(u_int32_t)); + ofs.write((char *)(&i_tagSize), sizeof(u_int32_t)); + ofs.write((char *)(psz_tag), i_tagSize); + } } ofs.close(); @@ -633,10 +878,24 @@ u_int32_t ORBIndex::writeTags(string indexTagsPath) */ u_int32_t ORBIndex::getImageIds(vector &imageIds) { - imageIds.reserve(nbWords.size()); - for (unordered_map::const_iterator it = nbWords.begin(); - it != nbWords.end(); ++it) - imageIds.push_back(it->first); + // No locks needed since the index is read-only during queries + + // Count non-zero entries first to reserve the right amount of space + unsigned count = 0; + for (size_t i = 0; i < nbWords.size(); ++i) { + if (nbWords[i] > 0) { + count++; + } + } + + imageIds.reserve(count); + + // Add all image IDs with non-zero word counts + for (size_t i = 0; i < nbWords.size(); ++i) { + if (nbWords[i] > 0) { + imageIds.push_back(i); + } + } return INDEX_IMAGE_IDS; } @@ -658,3 +917,99 @@ void ORBIndex::unlock() { pthread_rwlock_unlock(&rwLock); } + +/** + * @brief Sort all word vectors by image ID. + * + * This method sorts all word vectors by image ID to enable binary search + * in getHitForWordAndImage, improving lookup performance from O(n) to O(log n). + * Called after loading the index or modifying it (adding/removing images). 
+ */ +void ORBIndex::sortAllWordVectors() +{ + for (unsigned i_wordId = 0; i_wordId < NB_VISUAL_WORDS; ++i_wordId) { + if (nbOccurences[i_wordId] > 0) { + std::sort(indexHits[i_wordId].begin(), indexHits[i_wordId].end(), + [](const Hit& a, const Hit& b) { + return a.i_imageId < b.i_imageId; + }); + } + } +} + +/** + * @brief Update the index state by recalculating total indexed images and sorting word vectors. + * + * This method combines recalculateTotalIndexedImages() and sortAllWordVectors() since + * they are always called together after modifying the index. + */ +void ORBIndex::updateIndexState() +{ + recalculateTotalIndexedImages(); + sortAllWordVectors(); +} + +/** + * @brief Get direct access to the word count vector. + * @return A const reference to the nbWords vector. + */ +const vector& ORBIndex::getWordCountVector() const +{ + return nbWords; +} + +/** + * @brief Check if forward index is available. + * @return true if forward index is built, false otherwise. + */ +bool ORBIndex::hasForwardIndex() const +{ + return buildForwardIndex && !forwardIndex.empty(); +} + +/** + * @brief Get all words for an image from the forward index. + * @param i_imageId the image id. + * @return A const reference to the vector of word IDs for this image. + */ +const vector& ORBIndex::getForwardIndexWords(u_int32_t i_imageId) const +{ + static const vector emptyVector; + + if (!buildForwardIndex || i_imageId >= forwardIndex.size()) { + return emptyVector; + } + + return forwardIndex[i_imageId]; +} + +/** + * @brief Get a hit for a specific word and image. + * @param i_wordId the word id. + * @param i_imageId the image id. + * @return A pointer to the hit, or nullptr if not found. + * + * This method uses binary search on the sorted vector of hits to find a hit + * for a specific word and image. This improves lookup performance from O(n) to O(log n). + * The vectors must be sorted by image ID using sortAllWordVectors() for this to work. 
+ */ +const Hit* ORBIndex::getHitForWordAndImage(u_int32_t i_wordId, u_int32_t i_imageId) const +{ + if (i_wordId >= NB_VISUAL_WORDS) { + return nullptr; + } + + const vector& hits = indexHits[i_wordId]; + + // Use binary search on the sorted vector + auto it = std::lower_bound(hits.begin(), hits.end(), i_imageId, + [](const Hit& hit, u_int32_t id) { + return hit.i_imageId < id; + }); + + if (it != hits.end() && it->i_imageId == i_imageId) { + return &(*it); + } + + return nullptr; +} diff --git a/src/orb/orbsearcher.cpp b/src/orb/orbsearcher.cpp index 5972f9e..4398b81 100644 --- a/src/orb/orbsearcher.cpp +++ b/src/orb/orbsearcher.cpp @@ -24,14 +24,10 @@ #include #include -#ifndef __APPLE__ -#include -#include -#else #include #include -#endif #include +#include // For std::partial_sort #include #include @@ -41,67 +37,141 @@ #include #include -#ifndef __APPLE__ -using namespace std::tr1; -#endif +// TODO +// Maybe when we have more images we should consider to evaluate algorithm proposed in +// https://dash.harvard.edu/server/api/core/bitstreams/030cf124-530c-4df5-a228-0fd180899d00/content +// Real-Time Tf-Idf Clustering Using Simhash, Approximate Nearest Neighbors, and DBSCAN +// It proposes near real time SIMD accelerated clustering, +// very very similar to what we are doing here ORBSearcher::ORBSearcher(ORBIndex *index, ORBWordIndex *wordIndex) - : index(index), wordIndex(wordIndex), orb(ORB::create(2000, 1.02, 100)) -{ } + : index(index), wordIndex(wordIndex), orb(ORB::create(2000, 1.02, 100)), + threadPool(NUM_THREADS) // Initialize thread pool once +{ + // Pre-compute word counts are already stored in the index + + // Initialize thread-specific word indices with deep copies of words + for (int i = 0; i < NUM_THREADS; i++) { + // Use the constructor that creates a deep copy of the words matrix + threadWordIndices.push_back(std::make_unique(*wordIndex->getWords())); + } +} + + +/** + * @brief Process a batch of words for TF-IDF computation + * @param batch The 
batch of words to process + * @param wordCounts Vector of word counts per image + * @param i_nbTotalIndexedImages Total number of indexed images + * @param maxImageId Maximum image ID + * @return Vector of weights for each image + */ +vector ORBSearcher::processTFIDFBatch( + const vector*>>& batch, + const vector& wordCounts, + unsigned i_nbTotalIndexedImages, + unsigned maxImageId) +{ + // Create a local weights vector for this batch + vector batchWeights(maxImageId + 1, 0.0f); + + // Process each word in the batch + for (const auto& wordPair : batch) { + const u_int32_t wordId = wordPair.first; + const vector* hits = wordPair.second; + + // Calculate IDF weight for this word + const float f_weight = log((float)i_nbTotalIndexedImages / hits->size()); + + // Update weights for all images containing this word + for (const Hit& hit : *hits) { + // TF-IDF calculation + unsigned i_totalNbWords = wordCounts[hit.i_imageId]; + batchWeights[hit.i_imageId] += f_weight / i_totalNbWords; + } + } + + return batchWeights; +} ORBSearcher::~ORBSearcher() -{ } +{ + threadPool.join(); // Ensure all tasks complete before destruction +} -/** - * @brief The RankingThread class - * This threads computes the tf-idf weights of the images that contains the words - * given in argument. - */ -class RankingThread : public Thread +// Process a batch of keypoints +std::vector> ORBSearcher::processKeyPointBatch( + const Mat& descriptors, + const vector& keypoints, + size_t startIdx, + size_t endIdx, + ORBWordIndex* localWordIndex) { -public: - RankingThread(ORBIndex *index, const unsigned i_nbTotalIndexedImages, - std::unordered_map > &indexHits) - : index(index), i_nbTotalIndexedImages(i_nbTotalIndexedImages), - indexHits(indexHits) { } + const unsigned i_nbTotalIndexedImages = index->getTotalNbIndexedImages(); + const unsigned i_maxNbOccurences = i_nbTotalIndexedImages > 10000 ? 
+ 0.15 * i_nbTotalIndexedImages + : i_nbTotalIndexedImages; - void addWord(u_int32_t i_wordId) - { - wordIds.push_back(i_wordId); - } + // Create a vector to store all matches + std::vector> allMatches; + allMatches.reserve(endIdx - startIdx); // Reserve space for efficiency - void *run() + for (unsigned i = startIdx; i < endIdx; ++i) { - weights.rehash(wordIds.size()); + #define NB_NEIGHBORS 1 - for (deque::const_iterator it = wordIds.begin(); - it != wordIds.end(); ++it) - { - const vector &hits = indexHits[*it]; + vector indices(NB_NEIGHBORS); + vector dists(NB_NEIGHBORS); + + localWordIndex->knnSearch(descriptors.row(i), indices, dists, NB_NEIGHBORS); - const float f_weight = log((float)i_nbTotalIndexedImages / hits.size()); + for (unsigned j = 0; j < indices.size(); ++j) + { + const unsigned i_wordId = indices[j]; + float distance = dists[j]; // Get the KNN distance - for (vector::const_iterator it2 = hits.begin(); - it2 != hits.end(); ++it2) - { - /* TF-IDF according to the paper "Video Google: - * A Text Retrieval Approach to Object Matching in Videos" */ - unsigned i_totalNbWords = index->countTotalNbWord(it2->i_imageId); - weights[it2->i_imageId] += f_weight / i_totalNbWords; - } + if (index->getWordNbOccurences(i_wordId) > i_maxNbOccurences) + continue; + + // Convert the angle to a 16 bit integer. + Hit hit; + hit.i_imageId = 0; + hit.i_angle = keypoints[i].angle / 360 * (1 << 16); + hit.x = keypoints[i].pt.x; + hit.y = keypoints[i].pt.y; + + // Add this match to our results without filtering + SearchHit searchHit; + searchHit.hit = hit; + searchHit.distance = distance; + allMatches.push_back({i_wordId, searchHit}); } - - return NULL; } + + // Return all matches without filtering + return allMatches; +} - ORBIndex *index; - const unsigned i_nbTotalIndexedImages; - std::unordered_map > &indexHits; - deque wordIds; - std::unordered_map weights; // key: image id, value: image score. 
/**
 * @brief Sift-down step of a binary min-heap ordered on pair.first.
 *
 * Moves the element at idx down until neither child is smaller than it. When
 * both children are smaller, the smaller child is taken; on equal children
 * the left one is taken.
 *
 * @param heap array holding the heap; the children of slot i are 2i+1 and 2i+2.
 * @param size number of elements in the heap.
 * @param idx index of the element to sift down.
 */
static void siftDown(std::pair<float, u_int32_t>* heap, size_t size, size_t idx)
{
    for (;;) {
        const size_t left = 2 * idx + 1;
        const size_t right = left + 1;
        size_t smallest = idx;

        if (left < size && heap[left].first < heap[smallest].first)
            smallest = left;

        if (right < size && heap[right].first < heap[smallest].first)
            smallest = right;

        // Heap property holds at this node: done.
        if (smallest == idx)
            return;

        std::swap(heap[idx], heap[smallest]);
        idx = smallest;
    }
}
- Hit hit; - hit.i_imageId = 0; - hit.i_angle = keypoints[i].angle / 360 * (1 << 16); - hit.x = keypoints[i].pt.x; - hit.y = keypoints[i].pt.y; - - imageReqHits[i_wordId].push_back(hit); - } + + // Use the persistent thread pool + // boost::asio::thread_pool pool(NUM_THREADS); + + // Calculate batch size based on FEATURE_BATCH_COUNT + size_t totalKeypoints = keypoints.size(); + size_t batchSize = (totalKeypoints + FEATURE_BATCH_COUNT - 1) / FEATURE_BATCH_COUNT; // Ceiling division + + // Create a vector to hold futures for each task + std::vector>>> futures; + + // Submit tasks to the thread pool + for (int b = 0; b < FEATURE_BATCH_COUNT; b++) { + size_t startIdx = b * batchSize; + size_t endIdx = std::min(startIdx + batchSize, totalKeypoints); + + // Skip empty batches + if (startIdx >= totalKeypoints) { + continue; } + + // Create a packaged task that returns a vector of all matches + auto task = std::make_shared>()>>( + [this, &descriptors, &keypoints, startIdx, endIdx, b]() { + return this->processKeyPointBatch(descriptors, keypoints, startIdx, endIdx, threadWordIndices[b % NUM_THREADS].get()); + } + ); + + // Get the future from the task + futures.push_back(task->get_future()); + + // Submit the task to the thread pool + boost::asio::post(threadPool, [task]() { (*task)(); }); } - - gettimeofday(&t[2], NULL); - cout << "time: " << getTimeDiff(t[1], t[2]) << " ms." 
<< endl; - + + // Collect all matches from all batches + std::vector> allMatches; + + // Wait for all tasks to complete and collect their results + size_t batchIndex = 0; + + for (auto& future : futures) { + auto batchMatches = future.get(); + // Add to all matches + allMatches.insert(allMatches.end(), batchMatches.begin(), batchMatches.end()); + batchIndex++; + } + + // Now apply consistent filtering in a single pass + std::unordered_map bestDistances; + + // Group by word ID and keep ALL matches (not just the best one) + for (const auto& [wordId, searchHit] : allMatches) { + // Add this hit to the list for this word ID + imageReqHits[wordId].push_back(searchHit.hit); + + // Still track best distances for debugging + auto distIt = bestDistances.find(wordId); + if (distIt == bestDistances.end() || searchHit.distance < distIt->second) { + bestDistances[wordId] = searchHit.distance; + } + } return processSimilar(request, imageReqHits); } @@ -180,21 +269,12 @@ u_int32_t ORBSearcher::searchImage(SearchRequest &request) */ u_int32_t ORBSearcher::searchSimilar(SearchRequest &request) { - timeval t[2]; - gettimeofday(&t[0], NULL); - - cout << "Loading the image words from the index." << endl; - // key: visual word, value: the found angles std::unordered_map > imageReqHits; u_int32_t i_ret = index->getImageWords(request.imageId, imageReqHits); if (i_ret != OK) return i_ret; - - gettimeofday(&t[1], NULL); - cout << "time: " << getTimeDiff(t[0], t[1]) << " ms." << endl; - return processSimilar(request, imageReqHits); } @@ -202,90 +282,154 @@ u_int32_t ORBSearcher::searchSimilar(SearchRequest &request) u_int32_t ORBSearcher::processSimilar(SearchRequest &request, std::unordered_map > imageReqHits) { - timeval t[7]; - gettimeofday(&t[0], NULL); - const unsigned i_nbTotalIndexedImages = index->getTotalNbIndexedImages(); - cout << imageReqHits.size() << " visual words kept for the request." << endl; - cout << i_nbTotalIndexedImages << " images indexed in the index." 
<< endl; - - std::unordered_map > indexHits; // key: visual word id, values: index hits. + std::unordered_map* > indexHits; // key: visual word id, values: index hits. indexHits.rehash(imageReqHits.size()); index->getImagesWithVisualWords(imageReqHits, indexHits); - gettimeofday(&t[1], NULL); - cout << "time: " << getTimeDiff(t[0], t[1]) << " ms." << endl; - cout << "Ranking the images." << endl; - - index->readLock(); - #define NB_RANKING_THREAD 4 - - // Map the ranking to threads. - unsigned i_wordsPerThread = indexHits.size() / NB_RANKING_THREAD + 1; - RankingThread *threads[NB_RANKING_THREAD]; - - std::unordered_map >::const_iterator it = indexHits.begin(); - for (unsigned i = 0; i < NB_RANKING_THREAD; ++i) - { - threads[i] = new RankingThread(index, i_nbTotalIndexedImages, indexHits); - - unsigned i_nbWords = 0; - for (; it != indexHits.end() && i_nbWords < i_wordsPerThread; ++it, ++i_nbWords) - threads[i]->addWord(it->first); + // Count total hits across all visual words + unsigned totalHits = 0; + unsigned maxHitsPerWord = 0; + unsigned wordsWithNoHits = 0; + + for (auto it = indexHits.begin(); it != indexHits.end(); ++it) { + unsigned wordHits = it->second->size(); + totalHits += wordHits; + + if (wordHits > maxHitsPerWord) + maxHitsPerWord = wordHits; + + if (wordHits == 0) + wordsWithNoHits++; } - - gettimeofday(&t[2], NULL); - cout << "init threads time: " << getTimeDiff(t[1], t[2]) << " ms." << endl; - - // Compute - for (unsigned i = 0; i < NB_RANKING_THREAD; ++i) - threads[i]->start(); - for (unsigned i = 0; i < NB_RANKING_THREAD; ++i) - threads[i]->join(); - - gettimeofday(&t[3], NULL); - cout << "compute time: " << getTimeDiff(t[2], t[3]) << " ms." << endl; - - // Reduce... - std::unordered_map weights; // key: image id, value: image score. 
- weights.rehash(i_nbTotalIndexedImages); - for (unsigned i = 0; i < NB_RANKING_THREAD; ++i) - for (std::unordered_map::const_iterator it = threads[i]->weights.begin(); - it != threads[i]->weights.end(); ++it) - weights[it->first] += it->second; - - gettimeofday(&t[4], NULL); - cout << "reduce time: " << getTimeDiff(t[3], t[4]) << " ms." << endl; - - // Free the memory - for (unsigned i = 0; i < NB_RANKING_THREAD; ++i) - delete threads[i]; - - index->unlock(); - - priority_queue rankedResults; - for (std::unordered_map::const_iterator it = weights.begin(); - it != weights.end(); ++it) - { - //cout << "Second: " << it->second << " First: " << it->first << endl; - rankedResults.push(SearchResult(it->second, it->first, Rect())); + + // Get the maximum image ID and word counts + const unsigned maxImageId = index->getWordCountVector().size() - 1; + const vector& wordCounts = index->getWordCountVector(); + + // Create a pre-allocated array for direct indexing of weights + vector weights(maxImageId + 1, 0.0f); + + // Process all visual words in parallel + unsigned totalHitsProcessed = 0; + + // Convert the map to a vector for easier batch division + vector*>> wordPairs; + wordPairs.reserve(indexHits.size()); + + for (const auto& pair : indexHits) { + wordPairs.push_back({pair.first, pair.second}); + totalHitsProcessed += pair.second->size(); } - - gettimeofday(&t[5], NULL); - cout << "rankedResult time: " << getTimeDiff(t[4], t[5]) << " ms." << endl; - cout << "Reranking 300 among " << rankedResults.size() << " images." << endl; - - priority_queue rerankedResults; - reranker.rerank(imageReqHits, indexHits, - rankedResults, rerankedResults, 300); - - gettimeofday(&t[6], NULL); - cout << "time: " << getTimeDiff(t[5], t[6]) << " ms." << endl; - cout << "Returning the results. 
" << endl; - + + // Calculate batch size based on WEIGHT_BATCH_COUNT + size_t totalWords = wordPairs.size(); + size_t batchSize = (totalWords + WEIGHT_BATCH_COUNT - 1) / WEIGHT_BATCH_COUNT; // Ceiling division + + // Create a vector to hold futures for each task + std::vector>> futures; + + // Submit tasks to the thread pool + for (int b = 0; b < WEIGHT_BATCH_COUNT; b++) { + size_t startIdx = b * batchSize; + size_t endIdx = std::min(startIdx + batchSize, totalWords); + + // Skip empty batches + if (startIdx >= totalWords) { + continue; + } + + // Create the batch + vector*>> batch( + wordPairs.begin() + startIdx, + wordPairs.begin() + endIdx + ); + + // Create a packaged task that returns a weights vector + auto task = std::make_shared()>>( + [this, batch, &wordCounts, i_nbTotalIndexedImages, maxImageId]() { + return this->processTFIDFBatch(batch, wordCounts, i_nbTotalIndexedImages, maxImageId); + } + ); + + // Get the future from the task + futures.push_back(task->get_future()); + + // Submit the task to the thread pool + boost::asio::post(threadPool, [task]() { (*task)(); }); + } + + // Wait for all tasks to complete and merge their results + for (auto& future : futures) { + auto batchWeights = future.get(); + + // Merge batch weights into the final weights vector + for (u_int32_t id = 0; id <= maxImageId; ++id) { + weights[id] += batchWeights[id]; + } + } + + // Find top 2000 results using a bounded min-heap (keeps largest elements by replacing smallest) + const unsigned TOP_N = 2000; + std::pair topResults[TOP_N]; + size_t heapSize = 0; + + // Process all images in a single pass + for (u_int32_t id = 0; id <= maxImageId; ++id) { + if (weights[id] > 0) { + if (heapSize < TOP_N) { + // Heap not full yet, just add the element + topResults[heapSize++] = {weights[id], id}; + + // If we just filled the heap, heapify it once + if (heapSize == TOP_N) { + // Build min-heap (smallest element at root) + for (int i = heapSize / 2 - 1; i >= 0; i--) { + siftDown(topResults, 
heapSize, i); + } + } + } + else if (weights[id] > topResults[0].first) { + // Heap is full and we found a larger weight + // Replace the smallest element (root) and sift down + topResults[0] = {weights[id], id}; + siftDown(topResults, heapSize, 0); + } + } + } + + // Convert heap to sorted vector (descending order by weight) + vector> sortedResults(topResults, topResults + heapSize); + std::sort(sortedResults.begin(), sortedResults.end(), + [](const std::pair& a, const std::pair& b) { + return a.first > b.first; + }); + + // Check if forward index is available and use the optimized reranking method + vector rerankedResults; + ORBIndex* orbIndex = static_cast(index); + + if (orbIndex->hasForwardIndex()) { + // Get the set of image IDs to rerank + unordered_set firstImageIds; + for (unsigned i = 0; i < min(TOP_N, (unsigned)sortedResults.size()); i++) { + firstImageIds.insert(sortedResults[i].second); + } + + // Use the forward index reranking + rerankedResults = reranker.rerankUsingForwardIndex(imageReqHits, orbIndex, firstImageIds); + } else { + // Fall back to the original reranking + unordered_set firstImageIds; + for (unsigned i = 0; i < min(TOP_N, (unsigned)sortedResults.size()); i++) { + firstImageIds.insert(sortedResults[i].second); + } + + rerankedResults = reranker.rerank(imageReqHits, indexHits, sortedResults, TOP_N); + } + returnResults(rerankedResults, request, 100); - return SEARCH_RESULTS; } @@ -296,19 +440,18 @@ u_int32_t ORBSearcher::processSimilar(SearchRequest &request, * @param req the received search request. * @param i_maxNbResults the maximum number of results returned. 
*/ -void ORBSearcher::returnResults(priority_queue &rankedResults, - SearchRequest &req, unsigned i_maxNbResults) +void ORBSearcher::returnResults(vector &rankedResults, + SearchRequest &req, unsigned i_maxNbResults) { - list imageIds; - + list imageIds; unsigned i_res = 0; - while(!rankedResults.empty() - && i_res < i_maxNbResults) + for (const auto& res : rankedResults) { - const SearchResult &res = rankedResults.top(); + if (i_res >= i_maxNbResults) + break; + imageIds.push_back(res.i_imageId); i_res++; - cout << "Id: " << res.i_imageId << ", score: " << res.f_weight << endl; req.results.push_back(res.i_imageId); req.boundingRects.push_back(res.boundingRect); req.scores.push_back(res.f_weight); @@ -318,19 +461,5 @@ void ORBSearcher::returnResults(priority_queue &rankedResults, req.tags.push_back(tag); else req.tags.push_back(""); - - rankedResults.pop(); - } -} - - -/** - * @brief Get the time difference in ms between two instants. - * @param t1 - * @param t2 - */ -unsigned long ORBSearcher::getTimeDiff(const timeval t1, const timeval t2) const -{ - return ((t2.tv_sec - t1.tv_sec) * 1000000 - + (t2.tv_usec - t1.tv_usec)) / 1000; + } } diff --git a/src/orb/orbwordindex.cpp b/src/orb/orbwordindex.cpp index 7b11351..2d03010 100644 --- a/src/orb/orbwordindex.cpp +++ b/src/orb/orbwordindex.cpp @@ -26,6 +26,7 @@ ORBWordIndex::ORBWordIndex(string visualWordsPath) + : ownsWords(true) { words = new Mat(0, 32, CV_8U); // The matrix that stores the visual words. @@ -35,18 +36,70 @@ ORBWordIndex::ORBWordIndex(string visualWordsPath) cout << "Building the word index." 
<< endl; + // Initialize SimSIMD for optimal performance + simsimd_flush_denormals(); + + cvflann::Matrix m_features + ((unsigned char*)words->ptr(0), words->rows, words->cols); + + // Use our custom SimSIMD Hamming distance functor + kdIndex = new cvflann::HierarchicalClusteringIndex + (m_features, cvflann::HierarchicalClusteringIndexParams(10, cvflann::FLANN_CENTERS_RANDOM, 8, 100)); + kdIndex->buildIndex(); +} + +// Constructor that accepts an existing words matrix (shares the matrix) +ORBWordIndex::ORBWordIndex(const Mat* sharedWords) + : ownsWords(false) +{ + // Use the shared words matrix + words = const_cast(sharedWords); + + cout << "Building the word index with shared words." << endl; + + // Initialize SimSIMD for optimal performance + simsimd_flush_denormals(); + cvflann::Matrix m_features ((unsigned char*)words->ptr(0), words->rows, words->cols); - kdIndex = new cvflann::HierarchicalClusteringIndex > - (m_features,cvflann::HierarchicalClusteringIndexParams(10, cvflann::FLANN_CENTERS_RANDOM, 8, 100)); + + // Use our custom SimSIMD Hamming distance functor + kdIndex = new cvflann::HierarchicalClusteringIndex + (m_features, cvflann::HierarchicalClusteringIndexParams(10, cvflann::FLANN_CENTERS_KMEANSPP, 8, 100)); + kdIndex->buildIndex(); +} + +// Constructor that creates a deep copy of an existing words matrix +ORBWordIndex::ORBWordIndex(const Mat& wordsToCopy) + : ownsWords(true) +{ + // Create a deep copy of the words matrix + words = new Mat(); + wordsToCopy.copyTo(*words); + + cout << "Building the word index with deep copy of words." 
<< endl; + + // Initialize SimSIMD for optimal performance + simsimd_flush_denormals(); + + cvflann::Matrix m_features + ((unsigned char*)words->ptr(0), words->rows, words->cols); + + // Use our custom SimSIMD Hamming distance functor + kdIndex = new cvflann::HierarchicalClusteringIndex + (m_features, cvflann::HierarchicalClusteringIndexParams(10, cvflann::FLANN_CENTERS_GONZALES, 8, 100)); kdIndex->buildIndex(); } ORBWordIndex::~ORBWordIndex() { - delete words; delete kdIndex; + + // Only delete words if this instance owns it + if (ownsWords) { + delete words; + } } @@ -58,7 +111,7 @@ void ORBWordIndex::knnSearch(const Mat& query, vector& indices, m_indices.init(indices.data(), dists.data()); kdIndex->findNeighbors(m_indices, (unsigned char*)query.ptr(0), - cvflann::SearchParams(2000)); + cvflann::SearchParams(20000)); } diff --git a/src/requesthandler.cpp b/src/requesthandler.cpp index 82384d0..2b28cff 100644 --- a/src/requesthandler.cpp +++ b/src/requesthandler.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -40,8 +41,17 @@ RequestHandler::RequestHandler(FeatureExtractor *featureExtractor, Searcher *imageSearcher, Index *index, ImageDownloader *imgDownloader, string authKey) : featureExtractor(featureExtractor), imageSearcher(imageSearcher), - index(index), authKey(authKey) -{ } + index(index), imgDownloader(imgDownloader), authKey(authKey) +{ + // Initialize the batch processor + batchProcessor = new BatchProcessor(imgDownloader, featureExtractor, + dynamic_cast(index)); +} + +RequestHandler::~RequestHandler() +{ + delete batchProcessor; +} /** @@ -120,6 +130,7 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) vector parsedURI = parseURI(conInfo.url); string p_image[] = {"index", "images", "IDENTIFIER", ""}; + string p_imageBatch[] = {"index", "images", "batch", ""}; string p_tag[] = {"index", "images", "IDENTIFIER", "tag", ""}; string p_searchImage[] = {"index", "searcher", ""}; string p_ioIndex[] = {"index", "io", ""}; @@ 
-137,33 +148,45 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) && conInfo.connectionType == POST) { u_int32_t i_imageId = atoi(parsedURI[2].c_str()); - unsigned i_nbFeaturesExtracted; - u_int32_t i_ret = featureExtractor->processNewImage( - i_imageId, conInfo.uploadedData.size(), conInfo.uploadedData.data(), - i_nbFeaturesExtracted); + u_int32_t i_ret; - if (i_ret == IMAGE_NOT_DECODED) + // Check if the content type is JSON + if (conInfo.contentType.find("application/json") != string::npos) { - // Check if the data is an image URL to load - string dataStr(conInfo.uploadedData.begin(), - conInfo.uploadedData.end()); - + // Process as JSON with URL + string dataStr(conInfo.uploadedData.begin(), conInfo.uploadedData.end()); Json::Value data = StringToJson(dataStr); string imgURL = data["url"].asString(); + if (imgDownloader->canDownloadImage(imgURL)) { std::vector imgData; long HTTPResponseCode; i_ret = imgDownloader->getImageData(imgURL, imgData, HTTPResponseCode); if (i_ret == OK) + { i_ret = featureExtractor->processNewImage( i_imageId, imgData.size(), imgData.data(), i_nbFeaturesExtracted); + } else + { ret["image_downloader_http_response_code"] = (Json::Int64)HTTPResponseCode; + } + } + else + { + i_ret = MISFORMATTED_REQUEST; } } + else + { + // Process as direct image upload + i_ret = featureExtractor->processNewImage( + i_imageId, conInfo.uploadedData.size(), conInfo.uploadedData.data(), + i_nbFeaturesExtracted); + } ret["type"] = Converter::codeToString(i_ret); ret["image_id"] = Json::Value(i_imageId); @@ -179,6 +202,59 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) ret["type"] = Converter::codeToString(i_ret); ret["image_id"] = Json::Value(i_imageId); } + else if (testURIWithPattern(parsedURI, p_imageBatch) + && conInfo.connectionType == POST) + { + string dataStr(conInfo.uploadedData.begin(), + conInfo.uploadedData.end()); + + Json::Value data = StringToJson(dataStr); + + // Validate the request format + if (!data.isArray()) 
{ + ret["type"] = Converter::codeToString(MISFORMATTED_REQUEST); + conInfo.answerString = JsonToString(ret); + return; + } + + // Convert JSON array to vector + vector batchData; + for (unsigned i = 0; i < data.size(); i++) { + batchData.push_back(data[i]); + } + + // Process the batch + vector results = batchProcessor->processBatch(batchData); + + // Create response + ret["type"] = Converter::codeToString(BATCH_PROCESSED); + + Json::Value resultsArray(Json::arrayValue); + for (const auto& result : results) { + Json::Value resultObj; + resultObj["image_id"] = result.imageId; + resultObj["url"] = result.url; + resultObj["type"] = Converter::codeToString(result.status); + + if (result.status == IMAGE_ADDED) { + resultObj["nb_features_extracted"] = result.nbFeaturesExtracted; + + // Include tag status if a tag was provided + if (!result.tag.empty()) { + resultObj["tag"] = result.tag; + resultObj["tag_status"] = Converter::codeToString(IMAGE_TAG_ADDED); + } + } + + if (!result.url.empty() && result.status != IMAGE_ADDED) { + resultObj["image_downloader_http_response_code"] = (Json::Int64)result.httpResponseCode; + } + + resultsArray.append(resultObj); + } + + ret["results"] = resultsArray; + } else if (testURIWithPattern(parsedURI, p_tag) && conInfo.connectionType == POST) { @@ -203,28 +279,46 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) else if (testURIWithPattern(parsedURI, p_searchImage) && conInfo.connectionType == POST) { + timeval t_start, t_end; + gettimeofday(&t_start, NULL); + SearchRequest req; - req.imageData = conInfo.uploadedData; req.client = NULL; - u_int32_t i_ret = imageSearcher->searchImage(req); - - if (i_ret == IMAGE_NOT_DECODED) - { - // Check if the data is an image URL to load - string dataStr(conInfo.uploadedData.begin(), - conInfo.uploadedData.end()); + u_int32_t i_ret; + // Check if the content type is JSON + if (conInfo.contentType.find("application/json") != string::npos) + { + string dataStr(conInfo.uploadedData.begin(), 
conInfo.uploadedData.end()); Json::Value data = StringToJson(dataStr); - string imgURL = data["url"].asString(); + string imgURL = data["url"].asString(); + if (imgDownloader->canDownloadImage(imgURL)) { std::vector imgData; long HTTPResponseCode; + + // Add timing for the image download + timeval t_download_start, t_download_end; + gettimeofday(&t_download_start, NULL); + i_ret = imgDownloader->getImageData(imgURL, imgData, HTTPResponseCode); + + gettimeofday(&t_download_end, NULL); + cout << "Image download time: " << getTimeDiff(t_download_start, t_download_end) << " ms." << endl; + if (i_ret == OK) { req.imageData = imgData; + + // Add timing for the search call + timeval t_search_start, t_search_end; + gettimeofday(&t_search_start, NULL); + i_ret = imageSearcher->searchImage(req); + + gettimeofday(&t_search_end, NULL); + cout << "Search function call time: " << getTimeDiff(t_search_start, t_search_end) << " ms." << endl; } else { ret["type"] = Converter::codeToString(i_ret); @@ -233,6 +327,24 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) return; } } + else + { + i_ret = MISFORMATTED_REQUEST; + } + } + else + { + // Process as direct image upload + req.imageData = conInfo.uploadedData; + + // Add timing for the search call + timeval t_search_start, t_search_end; + gettimeofday(&t_search_start, NULL); + + i_ret = imageSearcher->searchImage(req); + + gettimeofday(&t_search_end, NULL); + cout << "Search function call time: " << getTimeDiff(t_search_start, t_search_end) << " ms." << endl; } ret["type"] = Converter::codeToString(i_ret); @@ -273,12 +385,18 @@ void RequestHandler::handleRequest(ConnectionInfo &conInfo) } ret["results"] = results; } + + gettimeofday(&t_end, NULL); + cout << "Total search request processing time: " << getTimeDiff(t_start, t_end) << " ms." 
#!/bin/bash
# Script to demonstrate Docker layer caching with the new build system.
#
# It builds the image twice with a one-line change to src/main.cpp in between,
# and prints the build timings and the image layers after each build.
#
# WARNING: the first step runs `docker system prune -f`, which removes unused
# Docker data (stopped containers, unused networks, dangling images, build
# cache) for the whole host, not only for this project.

# Stop at the first failing command instead of timing a broken build.
set -euo pipefail

SOURCE_FILE="src/main.cpp"
BACKUP_FILE=""

# Put the source file back exactly as it was, local uncommitted edits included.
restore_source() {
    if [ -n "$BACKUP_FILE" ] && [ -f "$BACKUP_FILE" ]; then
        echo "=== Restoring source file ==="
        # Write through the existing file so that its permissions are kept.
        cat "$BACKUP_FILE" > "$SOURCE_FILE"
        rm -f "$BACKUP_FILE"
        BACKUP_FILE=""
    fi
}

# Restore the file even when a build fails or the script is interrupted.
trap restore_source EXIT

# Enable BuildKit for better caching
export DOCKER_BUILDKIT=1

# Clean any previous builds to ensure a fresh test
echo "=== Cleaning previous builds ==="
docker system prune -f

# First build - should build everything
echo "=== First Build (Full Build) ==="
time docker-compose build

# Show the layers that were created
echo "=== Docker image layers after first build ==="
docker history pastec_pastec

# Make a small change to a Pastec source file
echo "=== Making a small change to Pastec source code ==="
BACKUP_FILE="$(mktemp)"
cp "$SOURCE_FILE" "$BACKUP_FILE"
echo "// Test comment to trigger rebuild - $(date)" >> "$SOURCE_FILE"

# Second build - should only rebuild the Pastec application, not dependencies
echo "=== Second Build (Should reuse dependency cache) ==="
time docker-compose build

# Show the layers that were created/reused
echo "=== Docker image layers after second build ==="
docker history pastec_pastec

# Restore the source file
restore_source

echo "=== Test Complete ==="
echo "The second build should be significantly faster than the first build"
echo "because it reused the cached dependencies layer."
echo ""
echo "You should see in the docker history output that the deps-builder stage"
echo "was reused (marked as 'cached') in the second build."