diff --git a/.github/actions/setup-faiss/action.yml b/.github/actions/setup-faiss/action.yml
new file mode 100644
index 000000000000..0184d66b0a04
--- /dev/null
+++ b/.github/actions/setup-faiss/action.yml
@@ -0,0 +1,86 @@
+# Composite action: installs the native toolchain, builds FAISS from source and then
+# builds the paimon-faiss-jni and paimon-faiss modules.
+name: Setup FAISS
+description: Install native dependencies and build FAISS library
+
+runs:
+  using: composite
+  steps:
+    # apt packages used by the build steps that follow.
+    - name: Install native dependencies
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          build-essential wget patchelf \
+          libopenblas-dev liblapack-dev \
+          libgomp1
+
+    - name: Install GCC 9
+      shell: bash
+      run: |
+        sudo apt-get install -y gcc-9 g++-9
+        sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90
+        sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 90
+        gcc --version
+        g++ --version
+        # The requirement is GCC >= 9.3.0, so compare the full version instead of
+        # only the major number (a major-only check would accept 9.0 - 9.2).
+        # -dumpfullversion prints major.minor.patch; fall back to -dumpversion.
+        GCC_MIN_VERSION="9.3.0"
+        GCC_VERSION=$(gcc -dumpfullversion 2>/dev/null || gcc -dumpversion)
+        # sort -V lists the lower version first; it must be the required minimum.
+        if [[ "$(printf '%s\n' "$GCC_MIN_VERSION" "$GCC_VERSION" | sort -V | head -n1)" != "$GCC_MIN_VERSION" ]]; then
+          echo "ERROR: GCC version must be >= $GCC_MIN_VERSION, got $GCC_VERSION"
+          exit 1
+        fi
+        echo "GCC version check passed: $GCC_VERSION"
+
+    - name: Install CMake 3.30.1
+      shell: bash
+      run: |
+        # Single source of truth: drives both the download URL and the version check.
+        CMAKE_VERSION="3.30.1"
+        CMAKE_PKG="cmake-${CMAKE_VERSION}-linux-x86_64"
+        wget -q "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_PKG}.tar.gz"
+        tar -xzf "${CMAKE_PKG}.tar.gz"
+        sudo mv "${CMAKE_PKG}" /opt/cmake
+        for tool in cmake ctest cpack; do sudo ln -sf "/opt/cmake/bin/${tool}" "/usr/local/bin/${tool}"; done
+        cmake --version
+        # Fail if the cmake resolved on PATH is older than the release just installed.
+        CMAKE_INSTALLED=$(cmake --version | head -n1 | awk '{print $3}')
+        echo "CMake version: $CMAKE_INSTALLED"
+        if [[ "$(printf '%s\n' "$CMAKE_VERSION" "$CMAKE_INSTALLED" | sort -V | head -n1)" != "$CMAKE_VERSION" ]]; then
+          echo "ERROR: CMake version must be >= $CMAKE_VERSION, got $CMAKE_INSTALLED"
+          exit 1
+        fi
+
+    # Clones the v1.7.4 tag, configures a Release build without GPU/Python/tests, installs via sudo.
+    - name: Install FAISS
+      shell: bash
+      run: |
+        FAISS_SRC=/tmp/faiss
+        git clone --depth 1 --branch v1.7.4 https://github.com/facebookresearch/faiss.git "$FAISS_SRC"
+        cmake -S "$FAISS_SRC" -B "$FAISS_SRC/build" \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DFAISS_ENABLE_GPU=OFF \
+          -DFAISS_ENABLE_PYTHON=OFF \
+          -DBUILD_TESTING=OFF
+        cmake --build "$FAISS_SRC/build" -j "$(nproc)"
+        sudo cmake --install "$FAISS_SRC/build"
+
+    # Runs the module's own build script from inside paimon-faiss-jni.
+    - name: Build native library
+      shell: bash
+      working-directory: paimon-faiss-jni
+      run: ./scripts/build-native.sh --clean --fat-lib
+
+    # Installs the JNI module plus the modules it depends on (-am); tests are skipped.
+    - name: Build paimon-faiss-jni
+      shell: bash
+      run: mvn -B clean install -pl paimon-faiss-jni -am -DskipTests -Ppaimon-faiss-vector
+
+    # Same invocation for the paimon-faiss module.
+    - name: Build paimon-faiss
+      shell: bash
+      run: mvn -B clean install -pl paimon-faiss -am -DskipTests -Ppaimon-faiss-vector
diff --git a/.github/workflows/faiss-vector-index-tests.yml b/.github/workflows/faiss-vector-index-tests.yml
new file mode 100644
index 000000000000..a44c9af6db94
--- /dev/null
+++ b/.github/workflows/faiss-vector-index-tests.yml
@@ -0,0 +1,95 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+name: Faiss Vector Index Tests
+
+# Path filters include the composite action this workflow runs, so edits to it are tested too.
+on:
+  push:
+    paths:
+      - 'paimon-faiss/**'
+      - 'paimon-faiss-jni/**'
+      - '.github/actions/setup-faiss/**'
+      - '.github/workflows/faiss-vector-index-tests.yml'
+  pull_request:
+    paths:
+      - 'paimon-faiss/**'
+      - 'paimon-faiss-jni/**'
+      - '.github/actions/setup-faiss/**'
+      - '.github/workflows/faiss-vector-index-tests.yml'
+
+env:
+  JDK_VERSION: 8
+  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build_test:
+    timeout-minutes: 90
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up JDK ${{ env.JDK_VERSION }}
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: ${{ env.JDK_VERSION }}
+
+      - name: Setup FAISS and build paimon-faiss
+        uses: ./.github/actions/setup-faiss
+
+      # Diagnostic output only; an ldd failure is ignored via `|| true`.
+      - name: List bundled libraries
+        run: |
+          echo "=== Bundled libraries ==="
+          ls -la paimon-faiss-jni/src/main/resources/linux/amd64/
+          printf '\n=== Library dependencies ===\n'
+          ldd paimon-faiss-jni/src/main/resources/linux/amd64/libpaimon_faiss_jni.so || true
+
+      # A step-level env entry replaces the workflow-level variable of the same name,
+      # so the workflow's MAVEN_OPTS is appended explicitly to each override below.
+      - name: Test paimon-faiss-jni
+        timeout-minutes: 10
+        run: mvn -T 1C -B test -pl paimon-faiss-jni -DskipFaissTests=false -Ppaimon-faiss-vector
+        env:
+          MAVEN_OPTS: -Xmx2048m ${{ env.MAVEN_OPTS }}
+
+      - name: Test paimon-faiss
+        timeout-minutes: 30
+        run: mvn -T 1C -B test -pl paimon-faiss -Ppaimon-faiss-vector
+        env:
+          MAVEN_OPTS: -Xmx4096m ${{ env.MAVEN_OPTS }}
+
+      - name: Build Vector E2E Test Module
+        run: mvn -T 2C -B clean install -DskipTests -Pspark3,flink1,paimon-faiss-vector -pl paimon-vector-e2e-test -am
+
+      - name: Run Vector E2E Tests
+        timeout-minutes: 30
+        run: |
+          # run tests with random timezone to find out timezone related bugs
+          . .github/workflows/utils.sh
+          jvm_timezone=$(random_timezone)
+          echo "JVM timezone is set to $jvm_timezone"
+          mvn -T 2C -B verify -Pspark3,flink1,paimon-faiss-vector -pl paimon-vector-e2e-test "-Duser.timezone=$jvm_timezone"
+        env:
+          MAVEN_OPTS: -Xmx4096m ${{ env.MAVEN_OPTS }}
diff --git a/.github/workflows/publish-faiss-snapshot.yml b/.github/workflows/publish-faiss-snapshot.yml
new file mode 100644
index 000000000000..f149f5feaec6
--- /dev/null
+++ b/.github/workflows/publish-faiss-snapshot.yml
@@ -0,0 +1,304 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+name: Publish Faiss Snapshot
+
+# Triggers: daily cron, manual dispatch, and pushes to master that touch the FAISS modules.
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - 'paimon-faiss/**'
+      - 'paimon-faiss-jni/**'
+  workflow_dispatch:
+  schedule:
+    # Every day at 01:00 (GitHub evaluates cron expressions in UTC)
+    - cron: '0 1 * * *'
+
+# Shared by every job in this workflow.
+env:
+  JDK_VERSION: 8
+  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true
+
+# A new run cancels an in-progress run that resolves to the same group key.
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
+
+jobs:
+  # Build native library for Linux AMD64
+  build-native-linux-amd64:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up JDK ${{ env.JDK_VERSION }}
+        uses: actions/setup-java@v4
+        with:
+          java-version: ${{ env.JDK_VERSION }}
+          distribution: 'temurin'
+
+      - name: Install native dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            build-essential \
+            libopenblas-dev \
+            liblapack-dev \
+            patchelf \
+            libgomp1 \
+            wget
+
+      # Registers gcc-9/g++-9 as the gcc/g++ alternatives used by the native build.
+      - name: Install GCC 9
+        run: |
+          sudo apt-get install -y gcc-9 g++-9
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 90
+          gcc --version
+
+      # Unpacks the upstream x86_64 release into /opt/cmake and links it onto PATH.
+      - name: Install CMake 3.30.1
+        run: |
+          CMAKE_VERSION="3.30.1"
+          wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
+          tar -xzf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
+          sudo mv cmake-${CMAKE_VERSION}-linux-x86_64 /opt/cmake
+          sudo ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake
+          cmake --version
+
+      # Builds the v1.7.4 tag in Release mode with GPU, Python and tests disabled.
+      - name: Install FAISS
+        run: |
+          git clone --depth 1 --branch v1.7.4 https://github.com/facebookresearch/faiss.git /tmp/faiss
+          cd /tmp/faiss
+          cmake -B build \
+            -DFAISS_ENABLE_GPU=OFF \
+            -DFAISS_ENABLE_PYTHON=OFF \
+            -DBUILD_TESTING=OFF \
+            -DCMAKE_BUILD_TYPE=Release
+          cmake --build build -j $(nproc)
+          sudo cmake --install build
+
+      - name: Build native library
+        run: |
+          cd paimon-faiss-jni
+          ./scripts/build-native.sh --clean --fat-lib
+
+      # Diagnostic output only; an ldd failure is ignored via `|| true`.
+      - name: List built libraries
+        run: |
+          echo "=== Built libraries ==="
+          ls -la paimon-faiss-jni/src/main/resources/linux/amd64/
+          echo ""
+          echo "=== Library dependencies ==="
+          ldd paimon-faiss-jni/src/main/resources/linux/amd64/libpaimon_faiss_jni.so || true
+
+      # package-and-publish downloads this artifact, so an upload that matches no
+      # files must fail here instead of only emitting a warning.
+      - name: Upload native library
+        uses: actions/upload-artifact@v4
+        with:
+          name: native-linux-amd64
+          path: paimon-faiss-jni/src/main/resources/linux/amd64/
+          if-no-files-found: error
+          retention-days: 1
+
+  # Build native library for Linux AARCH64
+  build-native-linux-aarch64:
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up JDK ${{ env.JDK_VERSION }}
+        uses: actions/setup-java@v4
+        with:
+          java-version: ${{ env.JDK_VERSION }}
+          distribution: 'temurin'
+
+      - name: Install native dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            build-essential \
+            libopenblas-dev \
+            liblapack-dev \
+            patchelf \
+            libgomp1 \
+            wget
+
+      # Prefers gcc-9; if that package cannot be installed, falls back to the
+      # distribution's default gcc/g++ and leaves the alternatives untouched.
+      - name: Install GCC 9
+        run: |
+          sudo apt-get install -y gcc-9 g++-9 || sudo apt-get install -y gcc g++
+          if command -v gcc-9 &>/dev/null; then
+            sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90
+            sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 90
+          fi
+          gcc --version
+
+      # Unpacks the upstream aarch64 release into /opt/cmake and links it onto PATH.
+      - name: Install CMake 3.30.1
+        run: |
+          CMAKE_VERSION="3.30.1"
+          wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz
+          tar -xzf cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz
+          sudo mv cmake-${CMAKE_VERSION}-linux-aarch64 /opt/cmake
+          sudo ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake
+          cmake --version
+
+      # Builds the v1.7.4 tag in Release mode with GPU, Python and tests disabled.
+      - name: Install FAISS
+        run: |
+          git clone --depth 1 --branch v1.7.4 https://github.com/facebookresearch/faiss.git /tmp/faiss
+          cd /tmp/faiss
+          cmake -B build \
+            -DFAISS_ENABLE_GPU=OFF \
+            -DFAISS_ENABLE_PYTHON=OFF \
+            -DBUILD_TESTING=OFF \
+            -DCMAKE_BUILD_TYPE=Release
+          cmake --build build -j $(nproc)
+          sudo cmake --install build
+
+      - name: Build native library
+        run: |
+          cd paimon-faiss-jni
+          ./scripts/build-native.sh --clean --fat-lib
+
+      # Diagnostic output only; an ldd failure is ignored via `|| true`.
+      - name: List built libraries
+        run: |
+          echo "=== Built libraries ==="
+          ls -la paimon-faiss-jni/src/main/resources/linux/aarch64/
+          echo ""
+          echo "=== Library dependencies ==="
+          ldd paimon-faiss-jni/src/main/resources/linux/aarch64/libpaimon_faiss_jni.so || true
+
+      # package-and-publish downloads this artifact, so an upload that matches no
+      # files must fail here instead of only emitting a warning.
+      - name: Upload native library
+        uses: actions/upload-artifact@v4
+        with:
+          name: native-linux-aarch64
+          path: paimon-faiss-jni/src/main/resources/linux/aarch64/
+          if-no-files-found: error
+          retention-days: 1
+
+  # Build native library for macOS ARM (Apple Silicon)
+  build-native-macos-arm:
+    runs-on: macos-14
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up JDK ${{ env.JDK_VERSION }}
+        uses: actions/setup-java@v4
+        with:
+          java-version: ${{ env.JDK_VERSION }}
+          distribution: 'temurin'
+
+      # NOTE(review): the Homebrew faiss formula is not version-pinned here, while the
+      # Linux jobs build the v1.7.4 tag - confirm the two are compatible.
+      - name: Install dependencies
+        run: |
+          brew install cmake libomp openblas faiss
+
+      - name: Build native library
+        run: |
+          cd paimon-faiss-jni
+          ./scripts/build-native.sh --clean --fat-lib
+
+      # Diagnostic output only; an otool failure is ignored via `|| true`.
+      - name: List built libraries
+        run: |
+          echo "=== Built libraries ==="
+          ls -la paimon-faiss-jni/src/main/resources/darwin/aarch64/
+          echo ""
+          echo "=== Library dependencies ==="
+          otool -L paimon-faiss-jni/src/main/resources/darwin/aarch64/libpaimon_faiss_jni.dylib || true
+
+      # package-and-publish downloads this artifact, so an upload that matches no
+      # files must fail here instead of only emitting a warning.
+      - name: Upload native library
+        uses: actions/upload-artifact@v4
+        with:
+          name: native-darwin-aarch64
+          path: paimon-faiss-jni/src/main/resources/darwin/aarch64/
+          if-no-files-found: error
+          retention-days: 1
+
+ # Package and publish
+ package-and-publish:
+ if: github.repository == 'apache/paimon'
+ needs: [build-native-linux-amd64, build-native-linux-aarch64, build-native-macos-arm]
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up JDK ${{ env.JDK_VERSION }}
+ uses: actions/setup-java@v4
+ with:
+ java-version: ${{ env.JDK_VERSION }}
+ distribution: 'temurin'
+
+ - name: Download Linux AMD64 native library
+ uses: actions/download-artifact@v4
+ with:
+ name: native-linux-amd64
+ path: paimon-faiss-jni/src/main/resources/linux/amd64/
+
+ - name: Download Linux AARCH64 native library
+ uses: actions/download-artifact@v4
+ with:
+ name: native-linux-aarch64
+ path: paimon-faiss-jni/src/main/resources/linux/aarch64/
+
+ - name: Download macOS ARM native library
+ uses: actions/download-artifact@v4
+ with:
+ name: native-darwin-aarch64
+ path: paimon-faiss-jni/src/main/resources/darwin/aarch64/
+
+ - name: List all native libraries
+ run: |
+ echo "=== All native libraries ==="
+ find paimon-faiss-jni/src/main/resources -type f \( -name "*.so" -o -name "*.so.*" -o -name "*.dylib" \) -exec ls -la {} \;
+
+ - name: Cache local Maven repository
+ uses: actions/cache@v4
+ with:
+ path: ~/.m2/repository
+ key: faiss-snapshot-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ faiss-snapshot-maven-
+
+ - name: Build and package paimon-faiss-jni
+ run: |
+ mvn -B clean install -pl paimon-faiss-jni -am -DskipTests -Ppaimon-faiss-vector -Drat.skip
+
+ - name: Build and package paimon-faiss
+ run: |
+ mvn -B clean install -pl paimon-faiss -am -DskipTests -Ppaimon-faiss-vector -Drat.skip
+
+ - name: Publish snapshot
+ env:
+ ASF_USERNAME: ${{ secrets.NEXUS_USER }}
+ ASF_PASSWORD: ${{ secrets.NEXUS_PW }}
+ MAVEN_OPTS: -Xmx4096m
+ run: |
+ tmp_settings="tmp-settings.xml"
+ echo "
This class provides methods for configuring Faiss globally, such as setting the number of + * threads for parallel operations. + * + *
Example usage: + * + *
{@code
+ * // Set the number of threads for Faiss operations
+ * Faiss.setNumThreads(4);
+ *
+ * // Get the Faiss version
+ * String version = Faiss.getVersion();
+ * }
+ */
+public final class Faiss {
+
+ static {
+ try {
+ NativeLibraryLoader.load();
+ } catch (FaissException e) {
+ throw new ExceptionInInitializerError(e);
+ }
+ }
+
+ private Faiss() {
+ // Static utility class
+ }
+
+ /**
+ * Get the version of the Faiss library.
+ *
+ * @return the version string
+ */
+ public static String getVersion() {
+ return FaissNative.getVersion();
+ }
+
+ /**
+ * Set the number of threads for parallel operations.
+ *
+ * This affects operations like index training, adding vectors, and searching. Set to 1 to + * disable parallelism. + * + * @param numThreads the number of threads (must be positive) + */ + public static void setNumThreads(int numThreads) { + if (numThreads <= 0) { + throw new IllegalArgumentException("Number of threads must be positive: " + numThreads); + } + FaissNative.setNumThreads(numThreads); + } + + /** + * Get the number of threads for parallel operations. + * + * @return the current number of threads + */ + public static int getNumThreads() { + return FaissNative.getNumThreads(); + } + + /** + * Ensure the native library is loaded. + * + *
This method is called automatically when any Faiss class is used. It can be called + * explicitly to load the library early and catch any loading errors. + * + * @throws FaissException if the native library cannot be loaded + */ + public static void loadLibrary() throws FaissException { + NativeLibraryLoader.load(); + } + + /** + * Check if the native library has been loaded. + * + * @return true if the library is loaded + */ + public static boolean isLibraryLoaded() { + return NativeLibraryLoader.isLoaded(); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java new file mode 100644 index 000000000000..c670b619e899 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Exception thrown when a Faiss operation fails. + * + *
This exception wraps errors from the native Faiss library as well as errors that occur during + * JNI operations. + */ +public class FaissException extends Exception { + private static final long serialVersionUID = 1L; + + /** + * Creates a new FaissException with the specified message. + * + * @param message the error message + */ + public FaissException(String message) { + super(message); + } + + /** + * Creates a new FaissException with the specified message and cause. + * + * @param message the error message + * @param cause the underlying cause + */ + public FaissException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Creates a new FaissException with the specified cause. + * + * @param cause the underlying cause + */ + public FaissException(Throwable cause) { + super(cause); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java new file mode 100644 index 000000000000..77a0d5286221 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.faiss; + +/** + * Native method declarations for Faiss JNI. + * + *
This class contains all the native method declarations that are implemented in the JNI C++ + * layer. These methods directly map to Faiss C++ API calls. + * + *
Users should not call these methods directly. Instead, use the high-level Java API classes + * like {@link Index} and {@link IndexFactory}. + */ +final class FaissNative { + + static { + try { + NativeLibraryLoader.load(); + } catch (FaissException e) { + throw new ExceptionInInitializerError(e); + } + } + + private FaissNative() { + // Static utility class + } + + // ==================== Index Factory ==================== + + /** + * Create an index using an index factory string. + * + * @param dimension the dimension of the vectors + * @param description the index description string (e.g., "Flat", "IVF100,Flat", "HNSW32") + * @param metricType the metric type (0 = L2, 1 = Inner Product) + * @return the native handle of the created index + */ + static native long indexFactoryCreate(int dimension, String description, int metricType); + + // ==================== Index Operations ==================== + + /** + * Destroy an index and free its resources. + * + * @param handle the native handle of the index + */ + static native void indexDestroy(long handle); + + /** + * Get the dimension of an index. + * + * @param handle the native handle of the index + * @return the dimension + */ + static native int indexGetDimension(long handle); + + /** + * Get the number of vectors in an index. + * + * @param handle the native handle of the index + * @return the number of vectors + */ + static native long indexGetCount(long handle); + + /** + * Check if an index is trained. + * + * @param handle the native handle of the index + * @return true if trained + */ + static native boolean indexIsTrained(long handle); + + /** + * Get the metric type of an index. + * + * @param handle the native handle of the index + * @return the metric type (0 = L2, 1 = Inner Product) + */ + static native int indexGetMetricType(long handle); + + /** + * Train an index on a set of training vectors. 
+ * + * @param handle the native handle of the index + * @param n the number of training vectors + * @param vectors the training vectors (n * dimension floats) + */ + static native void indexTrain(long handle, long n, float[] vectors); + + /** + * Add vectors to an index. + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectors the vectors to add (n * dimension floats) + */ + static native void indexAdd(long handle, long n, float[] vectors); + + /** + * Add vectors with IDs to an index. + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectors the vectors to add (n * dimension floats) + * @param ids the IDs for the vectors (n longs) + */ + static native void indexAddWithIds(long handle, long n, float[] vectors, long[] ids); + + /** + * Search for the k nearest neighbors of query vectors. + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queries the query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @param distances output array for distances (n * k floats) + * @param labels output array for labels/IDs (n * k longs) + */ + static native void indexSearch( + long handle, long n, float[] queries, int k, float[] distances, long[] labels); + + /** + * Search for neighbors within a given radius. + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queries the query vectors (n * dimension floats) + * @param radius the search radius + * @return a range search result handle + */ + static native long indexRangeSearch(long handle, long n, float[] queries, float radius); + + /** + * Remove vectors by IDs from an index. 
+ * + * @param handle the native handle of the index + * @param ids the IDs to remove + * @return the number of vectors removed + */ + static native long indexRemoveIds(long handle, long[] ids); + + /** + * Reset an index (remove all vectors). + * + * @param handle the native handle of the index + */ + static native void indexReset(long handle); + + // ==================== Index I/O ==================== + + /** + * Write an index to a file. + * + * @param handle the native handle of the index + * @param path the file path to write to + */ + static native void indexWriteToFile(long handle, String path); + + /** + * Read an index from a file. + * + * @param path the file path to read from + * @return the native handle of the loaded index + */ + static native long indexReadFromFile(String path); + + /** + * Serialize an index to a byte array. + * + * @param handle the native handle of the index + * @return the serialized bytes + */ + static native byte[] indexSerialize(long handle); + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized bytes + * @return the native handle of the loaded index + */ + static native long indexDeserialize(byte[] data); + + // ==================== Range Search Result ==================== + + /** + * Destroy a range search result. + * + * @param handle the native handle of the range search result + */ + static native void rangeSearchResultDestroy(long handle); + + /** + * Get the number of results for each query in a range search. + * + * @param handle the native handle of the range search result + * @return array of result counts per query + */ + static native long[] rangeSearchResultGetLimits(long handle); + + /** + * Get all labels from a range search result. + * + * @param handle the native handle of the range search result + * @return array of all labels + */ + static native long[] rangeSearchResultGetLabels(long handle); + + /** + * Get all distances from a range search result. 
+ * + * @param handle the native handle of the range search result + * @return array of all distances + */ + static native float[] rangeSearchResultGetDistances(long handle); + + // ==================== IVF Index Specific ==================== + + /** + * Get the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @return the number of probe lists (nprobe) + */ + static native int ivfGetNprobe(long handle); + + /** + * Set the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @param nprobe the number of probe lists + */ + static native void ivfSetNprobe(long handle, int nprobe); + + /** + * Get the number of lists (clusters) in an IVF index. + * + * @param handle the native handle of the index + * @return the number of lists + */ + static native int ivfGetNlist(long handle); + + // ==================== HNSW Index Specific ==================== + + /** + * Get the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efSearch value + */ + static native int hnswGetEfSearch(long handle); + + /** + * Set the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @param efSearch the efSearch value + */ + static native void hnswSetEfSearch(long handle, int efSearch); + + /** + * Get the efConstruction parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efConstruction value + */ + static native int hnswGetEfConstruction(long handle); + + // ==================== Utility ==================== + + /** + * Get the Faiss library version. + * + * @return the version string + */ + static native String getVersion(); + + /** + * Set the number of threads for parallel operations. + * + * @param numThreads the number of threads + */ + static native void setNumThreads(int numThreads); + + /** + * Get the number of threads for parallel operations. 
+ * + * @return the number of threads + */ + static native int getNumThreads(); +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java new file mode 100644 index 000000000000..52392eddb44d --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.io.File; + +/** + * A Faiss index for similarity search. + * + *
This class wraps a native Faiss index and provides methods for adding vectors, searching for + * nearest neighbors, and managing the index. + * + *
Index instances must be closed when no longer needed to free native resources. It is + * recommended to use try-with-resources: + * + *
{@code
+ * try (Index index = IndexFactory.create(128, "Flat", MetricType.L2)) {
+ * index.add(vectors);
+ * SearchResult result = index.search(queries, 10);
+ * }
+ * }
+ *
+ * Thread Safety: Index instances are NOT thread-safe. External synchronization is required if an + * index is accessed from multiple threads. + * + * @see IndexFactory + */ +public class Index implements AutoCloseable { + + /** Native handle to the Faiss index. */ + private long nativeHandle; + + /** The dimension of vectors in this index. */ + private final int dimension; + + /** Whether this index has been closed. */ + private volatile boolean closed = false; + + /** + * Create an Index wrapper around a native handle. + * + * @param nativeHandle the native handle + * @param dimension the vector dimension + */ + Index(long nativeHandle, int dimension) { + this.nativeHandle = nativeHandle; + this.dimension = dimension; + } + + /** + * Get the dimension of vectors in this index. + * + * @return the vector dimension + */ + public int getDimension() { + return dimension; + } + + /** + * Get the number of vectors in this index. + * + * @return the number of vectors + */ + public long getCount() { + checkNotClosed(); + return FaissNative.indexGetCount(nativeHandle); + } + + /** + * Check if this index is trained. + * + *
Some index types (like IVF) require training before vectors can be added. Flat indexes are + * always considered trained. + * + * @return true if the index is trained + */ + public boolean isTrained() { + checkNotClosed(); + return FaissNative.indexIsTrained(nativeHandle); + } + + /** + * Get the metric type used by this index. + * + * @return the metric type + */ + public MetricType getMetricType() { + checkNotClosed(); + return MetricType.fromValue(FaissNative.indexGetMetricType(nativeHandle)); + } + + /** + * Train the index on a set of training vectors. + * + *
This is required for some index types (like IVF) before adding vectors. For flat indexes, + * this is a no-op. + * + * @param vectors the training vectors (n * dimension floats) + */ + public void train(float[] vectors) { + checkNotClosed(); + if (vectors.length % dimension != 0) { + throw new IllegalArgumentException( + "Vector array length must be a multiple of dimension " + dimension); + } + long n = vectors.length / dimension; + FaissNative.indexTrain(nativeHandle, n, vectors); + } + + /** + * Add vectors to the index. + * + *
The vectors are assigned sequential IDs starting from the current count. + * + * @param vectors the vectors to add (n * dimension floats) + */ + public void add(float[] vectors) { + checkNotClosed(); + if (vectors.length % dimension != 0) { + throw new IllegalArgumentException( + "Vector array length must be a multiple of dimension " + dimension); + } + long n = vectors.length / dimension; + FaissNative.indexAdd(nativeHandle, n, vectors); + } + + /** + * Add a single vector to the index. + * + * @param vector the vector to add (dimension floats) + */ + public void addSingle(float[] vector) { + checkNotClosed(); + if (vector.length != dimension) { + throw new IllegalArgumentException( + "Vector length must equal dimension " + dimension + ", got " + vector.length); + } + FaissNative.indexAdd(nativeHandle, 1, vector); + } + + /** + * Add vectors with explicit IDs to the index. + * + *
Note: Not all index types support this operation. Flat indexes and IndexIDMap wrapped + * indexes support it. + * + * @param vectors the vectors to add (n * dimension floats) + * @param ids the IDs for the vectors (n longs) + */ + public void addWithIds(float[] vectors, long[] ids) { + checkNotClosed(); + if (vectors.length % dimension != 0) { + throw new IllegalArgumentException( + "Vector array length must be a multiple of dimension " + dimension); + } + long n = vectors.length / dimension; + if (ids.length != n) { + throw new IllegalArgumentException( + "Number of IDs (" + ids.length + ") must match number of vectors (" + n + ")"); + } + FaissNative.indexAddWithIds(nativeHandle, n, vectors, ids); + } + + /** + * Search for the k nearest neighbors of query vectors. + * + * @param queries the query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @return the search result containing labels and distances + */ + public SearchResult search(float[] queries, int k) { + checkNotClosed(); + if (queries.length % dimension != 0) { + throw new IllegalArgumentException( + "Query array length must be a multiple of dimension " + dimension); + } + int n = queries.length / dimension; + long[] labels = new long[n * k]; + float[] distances = new float[n * k]; + FaissNative.indexSearch(nativeHandle, n, queries, k, distances, labels); + return new SearchResult(n, k, labels, distances); + } + + /** + * Search for a single query vector. 
+ * + * @param query the query vector (dimension floats) + * @param k the number of nearest neighbors to find + * @return the search result + */ + public SearchResult searchSingle(float[] query, int k) { + checkNotClosed(); + if (query.length != dimension) { + throw new IllegalArgumentException( + "Query length must equal dimension " + dimension + ", got " + query.length); + } + long[] labels = new long[k]; + float[] distances = new float[k]; + FaissNative.indexSearch(nativeHandle, 1, query, k, distances, labels); + return new SearchResult(1, k, labels, distances); + } + + /** + * Search for all neighbors within a given radius. + * + * @param queries the query vectors (n * dimension floats) + * @param radius the search radius + * @return the range search result + */ + public RangeSearchResult rangeSearch(float[] queries, float radius) { + checkNotClosed(); + if (queries.length % dimension != 0) { + throw new IllegalArgumentException( + "Query array length must be a multiple of dimension " + dimension); + } + int n = queries.length / dimension; + long resultHandle = FaissNative.indexRangeSearch(nativeHandle, n, queries, radius); + return new RangeSearchResult(resultHandle, n); + } + + /** + * Remove vectors by their IDs. + * + *
Note: Not all index types support removal. Check Faiss documentation for details on which + * index types support this operation. + * + * @param ids the IDs of vectors to remove + * @return the number of vectors actually removed + */ + public long removeIds(long[] ids) { + checkNotClosed(); + return FaissNative.indexRemoveIds(nativeHandle, ids); + } + + /** Reset the index (remove all vectors). */ + public void reset() { + checkNotClosed(); + FaissNative.indexReset(nativeHandle); + } + + /** + * Write the index to a file. + * + * @param path the file path + */ + public void writeToFile(String path) { + checkNotClosed(); + FaissNative.indexWriteToFile(nativeHandle, path); + } + + /** + * Write the index to a file. + * + * @param file the file + */ + public void writeToFile(File file) { + writeToFile(file.getAbsolutePath()); + } + + /** + * Read an index from a file. + * + * @param path the file path + * @return the loaded index + */ + public static Index readFromFile(String path) { + long handle = FaissNative.indexReadFromFile(path); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Read an index from a file. + * + * @param file the file + * @return the loaded index + */ + public static Index readFromFile(File file) { + return readFromFile(file.getAbsolutePath()); + } + + /** + * Serialize the index to a byte array. + * + * @return the serialized bytes + */ + public byte[] serialize() { + checkNotClosed(); + return FaissNative.indexSerialize(nativeHandle); + } + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized bytes + * @return the deserialized index + */ + public static Index deserialize(byte[] data) { + long handle = FaissNative.indexDeserialize(data); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Get the native handle. + * + *
This is for internal use only.
+     *
+     * <p>NOTE(review): unlike the public accessors, this does not call checkNotClosed(); after
+     * close() it returns 0. Confirm callers tolerate that.
+     *
+     * @return the native handle
+     */
+    long getNativeHandle() {
+        return nativeHandle;
+    }
+
+    /**
+     * Guard used by the public operations of this class.
+     *
+     * @throws IllegalStateException if {@link #close()} has already been called
+     */
+    private void checkNotClosed() {
+        if (closed) {
+            throw new IllegalStateException("Index has been closed");
+        }
+    }
+
+    /**
+     * Destroy the native index and mark this object closed. Calling it again is a no-op.
+     */
+    @Override
+    public void close() {
+        if (!closed) {
+            // Flip the flag first so a failure in the native destroy is not retried.
+            closed = true;
+            if (nativeHandle != 0) {
+                FaissNative.indexDestroy(nativeHandle);
+                nativeHandle = 0;
+            }
+        }
+    }
+
+    /** Safety net: closes the index if the caller never did. */
+    @Override
+    protected void finalize() throws Throwable {
+        try {
+            close();
+        } finally {
+            super.finalize();
+        }
+    }
+
+    /**
+     * Describe the index. A closed index prints as {@code Index[closed]}; otherwise the
+     * dimension, count, trained flag and metric type are included.
+     */
+    @Override
+    public String toString() {
+        if (closed) {
+            return "Index[closed]";
+        }
+        return "Index{"
+                + "dimension="
+                + dimension
+                + ", count="
+                + getCount()
+                + ", trained="
+                + isTrained()
+                + ", metricType="
+                + getMetricType()
+                + '}';
+    }
+}
diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java
new file mode 100644
index 000000000000..3432bd4e1071
--- /dev/null
+++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.paimon.faiss; + +/** + * Factory for creating Faiss indexes. + * + *
This class provides static methods for creating various types of Faiss indexes using Faiss's + * index factory syntax. + * + *
Index Description Syntax + * + *
The index description string follows Faiss's index factory format: + * + *
Preprocessing Options + * + *
Preprocessing can be added before the main index: + * + *
Example Usage + * + *
{@code
+ * // Create a flat index for exact search
+ * Index flatIndex = IndexFactory.create(128, "Flat", MetricType.L2);
+ *
+ * // Create an IVF index for approximate search
+ * Index ivfIndex = IndexFactory.create(128, "IVF1000,Flat", MetricType.L2);
+ * ivfIndex.train(trainingVectors); // Training required for IVF
+ *
+ * // Create an HNSW index
+ * Index hnswIndex = IndexFactory.create(128, "HNSW32", MetricType.INNER_PRODUCT);
+ *
+ * // Create a flat index with ID mapping
+ * Index idMapIndex = IndexFactory.create(128, "IDMap,Flat", MetricType.L2);
+ * }
+ *
+ * @see Index
+ * @see MetricType
+ */
+public final class IndexFactory {
+
+ private IndexFactory() {
+ // Static utility class
+ }
+
+    /**
+     * Build a Faiss index from an index factory description.
+     *
+     * @param dimension the dimension of the vectors; must be positive
+     * @param description the index description string; must not be null or empty
+     * @param metricType the metric type for similarity computation; must not be null
+     * @return the created index
+     * @throws IllegalArgumentException if any argument is rejected by the checks above
+     */
+    public static Index create(int dimension, String description, MetricType metricType) {
+        // Pick the first failing check, in the same order the arguments are declared.
+        final String problem;
+        if (dimension <= 0) {
+            problem = "Dimension must be positive: " + dimension;
+        } else if (description == null || description.isEmpty()) {
+            problem = "Index description cannot be null or empty";
+        } else if (metricType == null) {
+            problem = "Metric type cannot be null";
+        } else {
+            problem = null;
+        }
+        if (problem != null) {
+            throw new IllegalArgumentException(problem);
+        }
+
+        final long nativeIndex =
+                FaissNative.indexFactoryCreate(dimension, description, metricType.getValue());
+        return new Index(nativeIndex, dimension);
+    }
+
+    /**
+     * Build a Faiss index that uses the L2 (Euclidean) metric.
+     *
+     * @param dimension the dimension of the vectors
+     * @param description the index description string
+     * @return the created index
+     */
+    public static Index create(int dimension, String description) {
+        final MetricType defaultMetric = MetricType.L2;
+        return create(dimension, description, defaultMetric);
+    }
+
+ /**
+ * Create a flat (brute-force) index.
+ *
+ * Flat indexes provide exact search results but have O(n) search complexity. Suitable for + * small datasets (up to ~100K vectors). + * + * @param dimension the dimension of the vectors + * @param metricType the metric type + * @return the created index + */ + public static Index createFlat(int dimension, MetricType metricType) { + return create(dimension, "Flat", metricType); + } + + /** + * Create a flat index with L2 metric. + * + * @param dimension the dimension of the vectors + * @return the created index + */ + public static Index createFlat(int dimension) { + return createFlat(dimension, MetricType.L2); + } + + /** + * Create a flat index with ID mapping support. + * + *
This allows adding vectors with explicit IDs using {@link Index#addWithIds}. + * + * @param dimension the dimension of the vectors + * @param metricType the metric type + * @return the created index + */ + public static Index createFlatWithIds(int dimension, MetricType metricType) { + return create(dimension, "IDMap,Flat", metricType); + } + + /** + * Create an IVF (Inverted File) index. + * + *
IVF indexes partition the vector space into clusters for faster search. They require + * training before use. + * + * @param dimension the dimension of the vectors + * @param nlist the number of clusters (typically sqrt(n) to 4*sqrt(n)) + * @param metricType the metric type + * @return the created index + */ + public static Index createIVFFlat(int dimension, int nlist, MetricType metricType) { + return create(dimension, "IVF" + nlist + ",Flat", metricType); + } + + /** + * Create an IVF index with product quantization. + * + *
IVF-PQ provides a good balance between search speed, memory usage, and accuracy. + * + * @param dimension the dimension of the vectors + * @param nlist the number of clusters + * @param m the number of sub-vectors for PQ (dimension must be divisible by m) + * @param metricType the metric type + * @return the created index + */ + public static Index createIVFPQ(int dimension, int nlist, int m, MetricType metricType) { + if (dimension % m != 0) { + throw new IllegalArgumentException( + "Dimension " + dimension + " must be divisible by m " + m); + } + return create(dimension, "IVF" + nlist + ",PQ" + m, metricType); + } + + /** + * Create an HNSW (Hierarchical Navigable Small World) index. + * + *
HNSW provides excellent search performance with good recall. It does not require training. + * + * @param dimension the dimension of the vectors + * @param m the number of neighbors in the graph (typically 16-64) + * @param metricType the metric type + * @return the created index + */ + public static Index createHNSW(int dimension, int m, MetricType metricType) { + return create(dimension, "HNSW" + m, metricType); + } + + /** + * Create an HNSW index with flat storage. + * + * @param dimension the dimension of the vectors + * @param m the number of neighbors in the graph + * @param metricType the metric type + * @return the created index + */ + public static Index createHNSWFlat(int dimension, int m, MetricType metricType) { + return create(dimension, "HNSW" + m + ",Flat", metricType); + } + + /** + * Create a product quantization index. + * + *
PQ indexes provide significant memory savings at the cost of some accuracy. They require + * training. + * + * @param dimension the dimension of the vectors + * @param m the number of sub-vectors (dimension must be divisible by m) + * @param metricType the metric type + * @return the created index + */ + public static Index createPQ(int dimension, int m, MetricType metricType) { + if (dimension % m != 0) { + throw new IllegalArgumentException( + "Dimension " + dimension + " must be divisible by m " + m); + } + return create(dimension, "PQ" + m, metricType); + } + + /** + * Create a scalar quantizer index. + * + *
Scalar quantization compresses vectors by quantizing each dimension. + * + * @param dimension the dimension of the vectors + * @param bits the number of bits per dimension (4 or 8) + * @param metricType the metric type + * @return the created index + */ + public static Index createScalarQuantizer(int dimension, int bits, MetricType metricType) { + if (bits != 4 && bits != 8) { + throw new IllegalArgumentException("Bits must be 4 or 8, got: " + bits); + } + return create(dimension, "SQ" + bits, metricType); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java new file mode 100644 index 000000000000..19773c78adad --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Utility class for HNSW (Hierarchical Navigable Small World) index operations. + * + *
HNSW indexes build a graph structure for fast approximate nearest neighbor search. The key + * parameters are: + * + *
Example usage: + * + *
{@code
+ * Index index = IndexFactory.createHNSW(128, 32, MetricType.L2);
+ * index.add(vectors);
+ *
+ * // Increase efSearch for more accurate results
+ * IndexHNSW.setEfSearch(index, 64);
+ *
+ * SearchResult result = index.search(queries, 10);
+ * }
+ */
+public final class IndexHNSW {
+
+ private IndexHNSW() {
+ // Static utility class
+ }
+
+ /**
+ * Get the efSearch parameter.
+ *
+ * This controls the size of the dynamic candidate list during search. Higher values give + * more accurate results but slower search. + * + * @param index the HNSW index + * @return the current efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfSearch(Index index) { + return FaissNative.hnswGetEfSearch(index.getNativeHandle()); + } + + /** + * Set the efSearch parameter. + * + *
This should be at least k (the number of neighbors requested in search). Typical values + * range from 16 to 256. Higher values give more accurate results but slower search. + * + * @param index the HNSW index + * @param efSearch the efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static void setEfSearch(Index index, int efSearch) { + if (efSearch <= 0) { + throw new IllegalArgumentException("efSearch must be positive: " + efSearch); + } + FaissNative.hnswSetEfSearch(index.getNativeHandle(), efSearch); + } + + /** + * Get the efConstruction parameter. + * + *
This was the size of the dynamic candidate list during index construction. It cannot be + * changed after the index is built. + * + * @param index the HNSW index + * @return the efConstruction value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfConstruction(Index index) { + return FaissNative.hnswGetEfConstruction(index.getNativeHandle()); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java new file mode 100644 index 000000000000..e0493a4a9b34 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Utility class for IVF (Inverted File) index operations. + * + *
IVF indexes partition the vector space into clusters (cells) and only search a subset of + * clusters during search. The {@code nprobe} parameter controls how many clusters to search, + * trading off between speed and accuracy. + * + *
Example usage: + * + *
{@code
+ * Index index = IndexFactory.createIVFFlat(128, 1000, MetricType.L2);
+ * index.train(trainingVectors);
+ * index.add(vectors);
+ *
+ * // Set number of clusters to probe during search
+ * IndexIVF.setNprobe(index, 10); // Search 10 out of 1000 clusters
+ *
+ * SearchResult result = index.search(queries, 10);
+ * }
+ */
+public final class IndexIVF {
+
+ private IndexIVF() {
+ // Static utility class
+ }
+
+    /**
+     * Read nprobe, the number of clusters probed during search.
+     *
+     * @param index the IVF index
+     * @return the current nprobe value
+     * @throws IllegalArgumentException if the index is not an IVF index
+     */
+    public static int getNprobe(Index index) {
+        final long handle = index.getNativeHandle();
+        return FaissNative.ivfGetNprobe(handle);
+    }
+
+ /**
+ * Set the number of clusters to probe during search (nprobe).
+ *
+ * Higher values increase accuracy but decrease search speed. A good starting point is 1-10% + * of the total number of clusters. + * + * @param index the IVF index + * @param nprobe the number of clusters to probe + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static void setNprobe(Index index, int nprobe) { + if (nprobe <= 0) { + throw new IllegalArgumentException("nprobe must be positive: " + nprobe); + } + FaissNative.ivfSetNprobe(index.getNativeHandle(), nprobe); + } + + /** + * Get the total number of clusters (nlist) in the index. + * + * @param index the IVF index + * @return the number of clusters + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static int getNlist(Index index) { + return FaissNative.ivfGetNlist(index.getNativeHandle()); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java new file mode 100644 index 000000000000..28e5f5e3bf0a --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.faiss; + +/** + * Metric type for similarity search. + * + *
Faiss supports two main metric types for measuring similarity between vectors: + * + *
The squared L2 distance between two vectors is computed as: {@code sum((a[i] - b[i])^2)} + * + *
Smaller distances indicate more similar vectors. + */ + L2(0), + + /** + * Inner product (dot product). + * + *
The inner product between two vectors is computed as: {@code sum(a[i] * b[i])} + * + *
Larger values indicate more similar vectors. For normalized vectors, this is equivalent to + * cosine similarity. + */ + INNER_PRODUCT(1); + + private final int value; + + MetricType(int value) { + this.value = value; + } + + /** + * Get the numeric value of this metric type. + * + * @return the numeric value + */ + public int getValue() { + return value; + } + + /** + * Get a MetricType from its numeric value. + * + * @param value the numeric value + * @return the corresponding MetricType + * @throws IllegalArgumentException if the value is not valid + */ + public static MetricType fromValue(int value) { + for (MetricType type : values()) { + if (type.value == value) { + return type; + } + } + throw new IllegalArgumentException("Unknown metric type value: " + value); + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java new file mode 100644 index 000000000000..58c8806378c2 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.faiss; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Native library loader for Faiss JNI. + * + *
This class is responsible for loading the native Faiss library from the JAR file or system + * path. It follows a similar pattern to RocksDB's native library loading mechanism. + * + *
The loader attempts to load the library in the following order: + * + *
Order matters! Libraries must be loaded before the libraries that depend on them. + */ + private static final String[] DEPENDENCY_LIBRARIES = { + // GCC runtime libraries (must be loaded first as others depend on them) + "libgcc_s.so.1", + // Quadmath library (needed by gfortran) + "libquadmath.so.0", + // Fortran runtime (needed by OpenBLAS) - try multiple versions + "libgfortran.so.5", + "libgfortran.so.4", + "libgfortran.so.3", + // OpenMP runtime + "libgomp.so.1", + // BLAS/LAPACK + "libblas.so.3", + "liblapack.so.3", + // OpenBLAS for FAISS (load last as it depends on above) + "libopenblas.so.0", + }; + + /** Whether the native library has been loaded. */ + private static volatile boolean libraryLoaded = false; + + /** Lock for thread-safe library loading. */ + private static final Object LOAD_LOCK = new Object(); + + /** Temporary directory for extracting native libraries. */ + private static Path tempDir; + + private NativeLibraryLoader() { + // Utility class, no instantiation + } + + /** + * Load the native library. + * + * @throws FaissException if the library cannot be loaded + */ + public static void load() throws FaissException { + if (libraryLoaded) { + return; + } + + synchronized (LOAD_LOCK) { + if (libraryLoaded) { + return; + } + + try { + loadNativeLibrary(); + libraryLoaded = true; + LOG.info("Faiss native library loaded successfully"); + } catch (Exception e) { + throw new FaissException("Failed to load Faiss native library", e); + } + } + } + + /** + * Check if the native library has been loaded. 
+     *
+     * @return true if the library is loaded
+     */
+    public static boolean isLoaded() {
+        return libraryLoaded;
+    }
+
+    /**
+     * Try each source for the JNI library in turn and stop at the first success: the file named
+     * by the {@code LIBRARY_PATH_PROPERTY} system property, then the JVM library search path,
+     * then the copy bundled in the JAR.
+     *
+     * @throws IOException if the bundled library is missing from the JAR or cannot be extracted
+     */
+    private static void loadNativeLibrary() throws IOException {
+        // First, try loading from custom path
+        String customPath = System.getProperty(LIBRARY_PATH_PROPERTY);
+        if (customPath != null && !customPath.isEmpty()) {
+            File customLibrary = new File(customPath);
+            if (customLibrary.exists()) {
+                // A link failure here is not caught, so it propagates instead of falling back.
+                System.load(customLibrary.getAbsolutePath());
+                LOG.info("Loaded Faiss native library from custom path: {}", customPath);
+                return;
+            } else {
+                LOG.warn("Custom library path specified but file not found: {}", customPath);
+            }
+        }
+
+        // Second, try loading from system library path
+        try {
+            System.loadLibrary(JNI_LIBRARY_NAME);
+            LOG.info("Loaded Faiss native library from system path");
+            return;
+        } catch (UnsatisfiedLinkError e) {
+            LOG.debug(
+                    "Could not load from system path, trying bundled library: {}", e.getMessage());
+        }
+
+        // Third, try loading from JAR
+        loadFromJar();
+    }
+
+    /**
+     * Copy the bundled JNI library out of the JAR into a temporary directory and load it from
+     * there. Bundled dependency libraries are extracted and loaded first.
+     *
+     * @throws IOException if the library resource is absent or cannot be written to disk
+     */
+    private static void loadFromJar() throws IOException {
+        String libraryPath = getLibraryResourcePath();
+        LOG.debug("Attempting to load native library from JAR: {}", libraryPath);
+
+        try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(libraryPath)) {
+            if (is == null) {
+                throw new IOException(
+                        "Native library not found in JAR: "
+                                + libraryPath
+                                + ". "
+                                + "Make sure you are using the correct JAR for your platform ("
+                                + getPlatformIdentifier()
+                                + ")");
+            }
+
+            // Create temp directory if needed
+            if (tempDir == null) {
+                tempDir = Files.createTempDirectory("paimon-faiss-native");
+                tempDir.toFile().deleteOnExit();
+            }
+
+            // First, extract and load dependency libraries (if bundled)
+            loadDependencyLibraries();
+
+            // Extract native library to temp file
+            String fileName = System.mapLibraryName(JNI_LIBRARY_NAME);
+            File tempFile = new File(tempDir.toFile(), fileName);
+            tempFile.deleteOnExit();
+
+            // Plain buffered copy of the resource stream to the temp file.
+            try (OutputStream os = new FileOutputStream(tempFile)) {
+                byte[] buffer = new byte[8192];
+                int bytesRead;
+                while ((bytesRead = is.read(buffer)) != -1) {
+                    os.write(buffer, 0, bytesRead);
+                }
+            }
+
+            // Make the file executable (for Unix-like systems)
+            if (!tempFile.setExecutable(true)) {
+                LOG.warn("Could not set executable permission on native library");
+            }
+
+            // Load the library
+            System.load(tempFile.getAbsolutePath());
+            LOG.info("Loaded Faiss native library from JAR: {}", libraryPath);
+        }
+    }
+
+    /**
+     * Extract and load dependency libraries that are bundled in the JAR. These must be loaded
+     * before the main JNI library to satisfy its dynamic linking requirements.
+     *
+     * <p>Failures are logged at debug level and skipped; a missing or unloadable dependency
+     * does not abort the loop.
+     */
+    private static void loadDependencyLibraries() {
+        String os = getOsName();
+        String arch = getArchName();
+
+        // Iterate in declaration order, which is the required load order.
+        for (String depLib : DEPENDENCY_LIBRARIES) {
+            String resourcePath = "/" + os + "/" + arch + "/" + depLib;
+            try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(resourcePath)) {
+                if (is == null) {
+                    LOG.debug("Dependency library not bundled: {}", depLib);
+                    continue;
+                }
+
+                // Reads tempDir without a null check; loadFromJar assigns it before calling here.
+                File tempFile = new File(tempDir.toFile(), depLib);
+                tempFile.deleteOnExit();
+
+                try (OutputStream fos = new FileOutputStream(tempFile)) {
+                    byte[] buffer = new byte[8192];
+                    int bytesRead;
+                    while ((bytesRead = is.read(buffer)) != -1) {
+                        fos.write(buffer, 0, bytesRead);
+                    }
+                }
+
+                if (!tempFile.setExecutable(true)) {
+                    LOG.warn("Could not set executable permission on: {}", depLib);
+                }
+
+                // Load the dependency library
+                System.load(tempFile.getAbsolutePath());
+                LOG.info("Loaded bundled dependency library: {}", depLib);
+            } catch (UnsatisfiedLinkError e) {
+                // Library might already be loaded or not needed
+                LOG.debug("Could not load dependency {}: {}", depLib, e.getMessage());
+            } catch (IOException e) {
+                LOG.debug("Could not extract dependency {}: {}", depLib, e.getMessage());
+            }
+        }
+    }
+
+    /**
+     * Build the classpath resource path of the bundled JNI library, in the form
+     * {@code /<os>/<arch>/<mapped library file name>}.
+     */
+    private static String getLibraryResourcePath() {
+        String os = getOsName();
+        String arch = getArchName();
+        String libraryFileName = System.mapLibraryName(JNI_LIBRARY_NAME);
+        return "/" + os + "/" + arch + "/" + libraryFileName;
+    }
+
+    /**
+     * Get the platform identifier for the current system.
+     *
+     * @return platform identifier string (e.g., "linux/amd64", "darwin/aarch64")
+     */
+    static String getPlatformIdentifier() {
+        return getOsName() + "/" + getArchName();
+    }
+
+    /**
+     * Get the normalized OS name for the current system.
+ * + * @return OS name string (e.g., "linux", "darwin") + */ + private static String getOsName() { + String osName = System.getProperty("os.name").toLowerCase(); + + if (osName.contains("linux")) { + return "linux"; + } else if (osName.contains("mac") || osName.contains("darwin")) { + return "darwin"; + } else { + throw new UnsupportedOperationException( + "Unsupported operating system: " + + osName + + ". Only Linux and macOS are supported."); + } + } + + /** + * Get the normalized architecture name for the current system. + * + * @return architecture name string (e.g., "amd64", "aarch64") + */ + private static String getArchName() { + String osArch = System.getProperty("os.arch").toLowerCase(); + + if (osArch.equals("amd64") || osArch.equals("x86_64")) { + return "amd64"; + } else if (osArch.equals("aarch64") || osArch.equals("arm64")) { + return "aarch64"; + } else { + throw new UnsupportedOperationException("Unsupported architecture: " + osArch); + } + } + + /** + * Get the name of the JNI library. + * + * @return the library name + */ + public static String getLibraryName() { + return JNI_LIBRARY_NAME; + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java new file mode 100644 index 000000000000..923b4cf39104 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.util.Arrays; + +/** + * Result of a range search operation. + * + *
Unlike k-NN search which returns a fixed number of neighbors per query, range search returns + * all neighbors within a given radius, which can vary per query. + */ +public class RangeSearchResult implements AutoCloseable { + + private long nativeHandle; + private final int numQueries; + private long[] limits; + private long[] labels; + private float[] distances; + + /** + * Create a new RangeSearchResult from a native handle. + * + * @param nativeHandle the native handle + * @param numQueries the number of query vectors + */ + RangeSearchResult(long nativeHandle, int numQueries) { + this.nativeHandle = nativeHandle; + this.numQueries = numQueries; + } + + /** + * Get the number of query vectors. + * + * @return the number of queries + */ + public int getNumQueries() { + return numQueries; + } + + /** + * Get the number of results for a specific query. + * + * @param queryIndex the query index + * @return the number of results + */ + public long getResultCount(int queryIndex) { + ensureLimitsLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + return limits[queryIndex + 1] - limits[queryIndex]; + } + + /** + * Get the total number of results across all queries. + * + * @return the total number of results + */ + public long getTotalResultCount() { + ensureLimitsLoaded(); + return limits[numQueries]; + } + + /** + * Get the labels for a specific query. + * + * @param queryIndex the query index + * @return the labels for this query + */ + public long[] getLabelsForQuery(int queryIndex) { + ensureFullyLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + int start = (int) limits[queryIndex]; + int end = (int) limits[queryIndex + 1]; + return Arrays.copyOfRange(labels, start, end); + } + + /** + * Get the distances for a specific query. 
+ * + * @param queryIndex the query index + * @return the distances for this query + */ + public float[] getDistancesForQuery(int queryIndex) { + ensureFullyLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + int start = (int) limits[queryIndex]; + int end = (int) limits[queryIndex + 1]; + return Arrays.copyOfRange(distances, start, end); + } + + /** + * Get all labels as a flat array. + * + * @return all labels + */ + public long[] getAllLabels() { + ensureFullyLoaded(); + return labels; + } + + /** + * Get all distances as a flat array. + * + * @return all distances + */ + public float[] getAllDistances() { + ensureFullyLoaded(); + return distances; + } + + private void ensureLimitsLoaded() { + if (limits == null && nativeHandle != 0) { + limits = FaissNative.rangeSearchResultGetLimits(nativeHandle); + } + } + + private void ensureFullyLoaded() { + ensureLimitsLoaded(); + if (labels == null && nativeHandle != 0) { + labels = FaissNative.rangeSearchResultGetLabels(nativeHandle); + distances = FaissNative.rangeSearchResultGetDistances(nativeHandle); + } + } + + @Override + public void close() { + if (nativeHandle != 0) { + FaissNative.rangeSearchResultDestroy(nativeHandle); + nativeHandle = 0; + } + } + + @Override + protected void finalize() throws Throwable { + try { + close(); + } finally { + super.finalize(); + } + } +} diff --git a/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/SearchResult.java b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/SearchResult.java new file mode 100644 index 000000000000..899f30113da7 --- /dev/null +++ b/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/SearchResult.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.util.Arrays; + +/** + * Result of a k-nearest neighbor search operation. + * + *
Contains the labels (IDs) and distances of the k nearest neighbors for each query vector. + */ +public class SearchResult { + + private final int numQueries; + private final int k; + private final long[] labels; + private final float[] distances; + + /** + * Create a new SearchResult. + * + * @param numQueries the number of query vectors + * @param k the number of neighbors per query + * @param labels the neighbor labels (numQueries * k) + * @param distances the distances to neighbors (numQueries * k) + */ + public SearchResult(int numQueries, int k, long[] labels, float[] distances) { + this.numQueries = numQueries; + this.k = k; + this.labels = labels; + this.distances = distances; + } + + /** + * Get the number of query vectors. + * + * @return the number of queries + */ + public int getNumQueries() { + return numQueries; + } + + /** + * Get the number of neighbors per query. + * + * @return k value + */ + public int getK() { + return k; + } + + /** + * Get all labels as a flat array. + * + *
The array is organized as: [query0_neighbor0, query0_neighbor1, ..., query1_neighbor0, + * ...] + * + * @return the labels array + */ + public long[] getLabels() { + return labels; + } + + /** + * Get all distances as a flat array. + * + *
The array is organized as: [query0_dist0, query0_dist1, ..., query1_dist0, ...]
+ *
+ * @return the distances array
+ */
+ public float[] getDistances() {
+ return distances;
+ }
+
+ /**
+ * Get the labels for a specific query.
+ *
+ * @param queryIndex the query index
+ * @return the labels for this query
+ */
+ public long[] getLabelsForQuery(int queryIndex) {
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ int start = queryIndex * k;
+ return Arrays.copyOfRange(labels, start, start + k);
+ }
+
+ /**
+ * Get the distances for a specific query.
+ *
+ * @param queryIndex the query index
+ * @return the distances for this query
+ */
+ public float[] getDistancesForQuery(int queryIndex) {
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ int start = queryIndex * k;
+ return Arrays.copyOfRange(distances, start, start + k);
+ }
+
+ /**
+ * Get the label of a specific neighbor for a specific query.
+ *
+ * @param queryIndex the query index
+ * @param neighborIndex the neighbor index (0 = closest)
+ * @return the label
+ */
+ public long getLabel(int queryIndex, int neighborIndex) {
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ if (neighborIndex < 0 || neighborIndex >= k) {
+ throw new IndexOutOfBoundsException("Neighbor index out of bounds: " + neighborIndex);
+ }
+ return labels[queryIndex * k + neighborIndex];
+ }
+
+ /**
+ * Get the distance of a specific neighbor for a specific query.
+ *
+ * @param queryIndex the query index
+ * @param neighborIndex the neighbor index (0 = closest)
+ * @return the distance
+ */
+ public float getDistance(int queryIndex, int neighborIndex) {
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ if (neighborIndex < 0 || neighborIndex >= k) {
+ throw new IndexOutOfBoundsException("Neighbor index out of bounds: " + neighborIndex);
+ }
+ return distances[queryIndex * k + neighborIndex];
+ }
+
+ @Override
+ public String toString() {
+ return "SearchResult{"
+ + "numQueries="
+ + numQueries
+ + ", k="
+ + k
+ + ", labels="
+ + Arrays.toString(labels)
+ + ", distances="
+ + Arrays.toString(distances)
+ + '}';
+ }
+}
diff --git a/paimon-faiss-jni/src/main/native/CMakeLists.txt b/paimon-faiss-jni/src/main/native/CMakeLists.txt
new file mode 100644
index 000000000000..e20fbbe8489d
--- /dev/null
+++ b/paimon-faiss-jni/src/main/native/CMakeLists.txt
@@ -0,0 +1,453 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Project preamble: declares the minimum CMake version, the C++-only project, the compiler
+# floor for GCC, and the language/PIC settings applied to targets defined below.
+cmake_minimum_required(VERSION 3.30.1)
+project(paimon_faiss_jni VERSION 0.1.0 LANGUAGES CXX)
+
+# Check GCC version (must be >= 9.3.0)
+# The check only runs when the compiler ID is GNU; other compilers skip it.
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.3.0")
+ message(FATAL_ERROR "GCC version must be >= 9.3.0. Found: ${CMAKE_CXX_COMPILER_VERSION}")
+ endif()
+ message(STATUS "Using GCC ${CMAKE_CXX_COMPILER_VERSION}")
+endif()
+
+# C++17 is mandatory (no fallback to an older standard); position-independent code is
+# enabled because the only target built here is a shared library.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Options
+option(FAISS_ENABLE_GPU "Build with GPU support" OFF)
+# FAISS_OPT_LEVEL is a string choice, not a switch. option() only declares booleans, so the
+# default has to be stored through a STRING cache entry. The STRINGS property lists the values
+# compared against further down (generic, avx2, avx512) for cmake-gui/ccmake.
+set(FAISS_OPT_LEVEL "generic" CACHE STRING "Optimization level (generic, avx2, avx512)")
+set_property(CACHE FAISS_OPT_LEVEL PROPERTY STRINGS generic avx2 avx512)
+option(BUILD_FAT_LIB "Build fat library with all dependencies statically linked" ON)
+
+# Find JNI
+# JNI is mandatory: configuration stops if it is not found. The include directories are added
+# at directory scope, so they apply to every target defined in this file.
+find_package(JNI REQUIRED)
+include_directories(${JNI_INCLUDE_DIRS})
+
+# Find OpenMP (with special handling for macOS)
+# On macOS the Homebrew libomp is wired up by hand as the OpenMP::OpenMP_CXX imported target;
+# everywhere else the stock FindOpenMP module is used and OpenMP is mandatory.
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    # macOS requires special handling for OpenMP
+    # First try to find libomp from Homebrew
+    execute_process(
+        COMMAND brew --prefix libomp
+        OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+
+    # `brew --prefix <formula>` reports where the formula is *or would be* installed, so a
+    # non-empty answer does not prove libomp exists. Require the library file itself.
+    if(HOMEBREW_LIBOMP_PREFIX AND EXISTS "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib")
+        message(STATUS "Found Homebrew libomp: ${HOMEBREW_LIBOMP_PREFIX}")
+        set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
+        set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
+        set(OpenMP_C_LIB_NAMES "omp")
+        set(OpenMP_CXX_LIB_NAMES "omp")
+        set(OpenMP_omp_LIBRARY "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib")
+
+        # Create imported target manually
+        if(NOT TARGET OpenMP::OpenMP_CXX)
+            add_library(OpenMP::OpenMP_CXX SHARED IMPORTED)
+            set_target_properties(OpenMP::OpenMP_CXX PROPERTIES
+                IMPORTED_LOCATION "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib"
+                INTERFACE_INCLUDE_DIRECTORIES "${HOMEBREW_LIBOMP_PREFIX}/include"
+                INTERFACE_COMPILE_OPTIONS "-Xpreprocessor;-fopenmp"
+            )
+        endif()
+        set(OpenMP_FOUND TRUE)
+    else()
+        # Drop the unusable prefix so later checks on HOMEBREW_LIBOMP_PREFIX (the macOS rpath
+        # setup) do not embed a directory that holds no libomp.
+        unset(HOMEBREW_LIBOMP_PREFIX)
+        message(WARNING "libomp not found via Homebrew. Trying standard OpenMP detection...")
+        find_package(OpenMP)
+    endif()
+else()
+    find_package(OpenMP REQUIRED)
+endif()
+
+# Only reachable on macOS: the non-macOS branch above already fails hard without OpenMP.
+if(NOT OpenMP_FOUND AND NOT TARGET OpenMP::OpenMP_CXX)
+    message(WARNING "OpenMP not found. Building without OpenMP support.")
+    message(WARNING "On macOS, install libomp: brew install libomp")
+endif()
+
+# Find Faiss
+# Lookup order: CMake package config, then pkg-config, then a manual header/library search.
+# The manual search is fatal when it fails; the other two fall through silently.
+# For fat lib, prefer static libraries
+if(BUILD_FAT_LIB)
+    message(STATUS "Building fat library - preferring static libraries")
+    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a" ".so" ".dylib")
+    set(FAISS_STATIC_PREFERRED TRUE)
+else()
+    set(FAISS_STATIC_PREFERRED FALSE)
+endif()
+
+# First try to find Faiss via CMake config
+find_package(faiss CONFIG QUIET)
+
+if(NOT faiss_FOUND)
+    # Try pkg-config
+    find_package(PkgConfig QUIET)
+    if(PKG_CONFIG_FOUND)
+        pkg_check_modules(FAISS QUIET faiss)
+        # FAISS_LIBRARIES from pkg-config holds bare names ("faiss") and the matching library
+        # directories are never handed to the linker in this file. Use the resolved absolute
+        # paths instead, so an install outside the default linker path still links.
+        if(FAISS_FOUND AND FAISS_LINK_LIBRARIES)
+            set(FAISS_LIBRARIES ${FAISS_LINK_LIBRARIES})
+        endif()
+    endif()
+
+    if(NOT FAISS_FOUND)
+        # Manual search - look in common locations
+        find_path(FAISS_INCLUDE_DIR
+            NAMES faiss/Index.h
+            PATHS
+                /usr/local/include
+                /usr/include
+                ${FAISS_ROOT}/include
+                $ENV{FAISS_ROOT}/include
+        )
+
+        # For fat lib, try to find static library first
+        if(BUILD_FAT_LIB)
+            find_library(FAISS_LIBRARY_STATIC
+                NAMES libfaiss.a faiss_static
+                PATHS
+                    /usr/local/lib
+                    /usr/lib
+                    /usr/local/lib64
+                    /usr/lib64
+                    ${FAISS_ROOT}/lib
+                    ${FAISS_ROOT}/lib64
+                    $ENV{FAISS_ROOT}/lib
+                    $ENV{FAISS_ROOT}/lib64
+            )
+            if(FAISS_LIBRARY_STATIC)
+                set(FAISS_LIBRARY ${FAISS_LIBRARY_STATIC})
+                message(STATUS "Found Faiss static library: ${FAISS_LIBRARY}")
+            endif()
+        endif()
+
+        # If static not found or not building fat lib, find any library
+        if(NOT FAISS_LIBRARY)
+            find_library(FAISS_LIBRARY
+                NAMES faiss
+                PATHS
+                    /usr/local/lib
+                    /usr/lib
+                    /usr/local/lib64
+                    /usr/lib64
+                    ${FAISS_ROOT}/lib
+                    ${FAISS_ROOT}/lib64
+                    $ENV{FAISS_ROOT}/lib
+                    $ENV{FAISS_ROOT}/lib64
+            )
+        endif()
+
+        if(FAISS_INCLUDE_DIR AND FAISS_LIBRARY)
+            set(FAISS_FOUND TRUE)
+            set(FAISS_INCLUDE_DIRS ${FAISS_INCLUDE_DIR})
+            set(FAISS_LIBRARIES ${FAISS_LIBRARY})
+            message(STATUS "Found Faiss: ${FAISS_LIBRARY}")
+        else()
+            message(FATAL_ERROR "Faiss not found. Please install Faiss or set FAISS_ROOT environment variable.")
+        endif()
+    endif()
+endif()
+
+# Find BLAS/LAPACK for static linking (Faiss depends on them)
+# Results are collected in two lists consumed by the link step further down:
+# FAISS_STATIC_LIBS - static archives (OpenBLAS, LAPACK, gfortran) that were found
+# FAISS_EXTRA_LIBS - shared/system libraries linked the ordinary way
+# This whole section only runs for fat-lib builds.
+if(BUILD_FAT_LIB)
+ # Save original suffixes
+ set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+ # Force static library search only
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+
+ # Try to find OpenBLAS static library
+ # First pass looks only in the listed directories (NO_DEFAULT_PATH).
+ find_library(OPENBLAS_STATIC_LIBRARY
+ NAMES openblas openblas_static
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ ${OPENBLAS_ROOT}/lib
+ $ENV{OPENBLAS_ROOT}/lib
+ NO_DEFAULT_PATH
+ )
+
+ # Also try default paths
+ if(NOT OPENBLAS_STATIC_LIBRARY)
+ find_library(OPENBLAS_STATIC_LIBRARY
+ NAMES openblas openblas_static
+ )
+ endif()
+
+ # The extra ".a" match guards against a result that is not a static archive.
+ if(OPENBLAS_STATIC_LIBRARY AND OPENBLAS_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found OpenBLAS static library: ${OPENBLAS_STATIC_LIBRARY}")
+ set(OPENBLAS_USE_STATIC TRUE)
+ list(APPEND FAISS_STATIC_LIBS ${OPENBLAS_STATIC_LIBRARY})
+ else()
+ message(STATUS "OpenBLAS static library not found, trying shared library")
+ set(OPENBLAS_USE_STATIC FALSE)
+
+ # Restore suffixes and find shared library
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
+ find_library(OPENBLAS_SHARED_LIBRARY
+ NAMES openblas
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ ${OPENBLAS_ROOT}/lib
+ $ENV{OPENBLAS_ROOT}/lib
+ )
+ if(OPENBLAS_SHARED_LIBRARY)
+ message(STATUS "Found OpenBLAS shared library: ${OPENBLAS_SHARED_LIBRARY}")
+ list(APPEND FAISS_EXTRA_LIBS ${OPENBLAS_SHARED_LIBRARY})
+ # Mark that we need to bundle this library
+ set(BUNDLE_OPENBLAS TRUE)
+ set(BUNDLE_OPENBLAS_PATH ${OPENBLAS_SHARED_LIBRARY})
+ else()
+ # Try to find any BLAS
+ # NOTE(review): nothing is reported here when find_package(BLAS) also fails, so a
+ # missing BLAS would presumably only surface at link time - confirm this is intended.
+ find_package(BLAS QUIET)
+ if(BLAS_FOUND)
+ list(APPEND FAISS_EXTRA_LIBS ${BLAS_LIBRARIES})
+ message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+ endif()
+ endif()
+ endif()
+
+ # Restore suffixes for static search
+ # (the shared-library fallback above may have switched them back to the saved list)
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+
+ # Find LAPACK static library
+ find_library(LAPACK_STATIC_LIBRARY
+ NAMES lapack
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ )
+ if(LAPACK_STATIC_LIBRARY AND LAPACK_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found LAPACK static library: ${LAPACK_STATIC_LIBRARY}")
+ list(APPEND FAISS_STATIC_LIBS ${LAPACK_STATIC_LIBRARY})
+ endif()
+
+ # Find gfortran static library (needed by OpenBLAS)
+ # Only the default search locations are consulted here (no PATHS hints).
+ find_library(GFORTRAN_STATIC_LIBRARY
+ NAMES gfortran
+ )
+ if(GFORTRAN_STATIC_LIBRARY AND GFORTRAN_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found gfortran static library: ${GFORTRAN_STATIC_LIBRARY}")
+ list(APPEND FAISS_STATIC_LIBS ${GFORTRAN_STATIC_LIBRARY})
+ endif()
+
+ # Restore original suffixes
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
+
+ # On Linux, we may need pthread, dl, and m (these are typically dynamically linked)
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ list(APPEND FAISS_EXTRA_LIBS pthread dl m)
+
+ # Try to find gfortran shared if static not found
+ if(NOT GFORTRAN_STATIC_LIBRARY)
+ find_library(GFORTRAN_LIBRARY gfortran)
+ if(GFORTRAN_LIBRARY)
+ list(APPEND FAISS_EXTRA_LIBS ${GFORTRAN_LIBRARY})
+ endif()
+ endif()
+ endif()
+endif()
+
+# Platform detection - using {os}/{arch} directory structure
+# Step 1: map the CMake system name to the directory name and to the label used in the
+# architecture error message. Anything other than Linux/macOS is rejected here.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    set(PLATFORM_OS "linux")
+    set(_platform_os_label "Linux")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    set(PLATFORM_OS "darwin")
+    set(_platform_os_label "macOS")
+else()
+    message(FATAL_ERROR "Unsupported operating system: ${CMAKE_SYSTEM_NAME}. Only Linux and macOS are supported.")
+endif()
+
+# Step 2: the processor mapping is the same on both systems, so it is done once.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+    set(PLATFORM_ARCH "amd64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+    set(PLATFORM_ARCH "aarch64")
+else()
+    message(FATAL_ERROR "Unsupported ${_platform_os_label} architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+set(PLATFORM_DIR "${PLATFORM_OS}/${PLATFORM_ARCH}")
+message(STATUS "Building for platform: ${PLATFORM_DIR}")
+
+# Build the JNI library from its single translation unit.
+add_library(paimon_faiss_jni SHARED paimon_faiss_jni.cpp)
+
+# Faiss linkage: without a `faiss` target, fall back to the include directories and libraries
+# gathered during discovery; with one, link the target.
+if(NOT TARGET faiss)
+    target_include_directories(paimon_faiss_jni PRIVATE ${FAISS_INCLUDE_DIRS})
+    target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_LIBRARIES})
+else()
+    target_link_libraries(paimon_faiss_jni PRIVATE faiss)
+endif()
+
+# Link extra libraries for fat lib (BLAS, LAPACK, etc.)
+if(BUILD_FAT_LIB)
+    if(FAISS_STATIC_LIBS AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+        # Link the static archives as ordinary libraries. The previous
+        # "-Wl,--whole-archive" / "-Wl,--no-whole-archive" pair was passed through
+        # target_link_options(), which places options ahead of the libraries on the link
+        # line, so the two flags sat next to each other and bracketed nothing. Dropping them
+        # keeps the effective link unchanged and stops the log from claiming otherwise.
+        message(STATUS "Linking static libraries: ${FAISS_STATIC_LIBS}")
+        target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_STATIC_LIBS})
+    elseif(FAISS_STATIC_LIBS)
+        # macOS doesn't use --whole-archive, use -force_load instead. Each option carries
+        # the archive path itself, so it works as a link option.
+        foreach(static_lib ${FAISS_STATIC_LIBS})
+            target_link_options(paimon_faiss_jni PRIVATE "-Wl,-force_load,${static_lib}")
+        endforeach()
+        message(STATUS "Linking static libraries with -force_load: ${FAISS_STATIC_LIBS}")
+    endif()
+
+    # Link remaining shared libraries
+    if(FAISS_EXTRA_LIBS)
+        target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_EXTRA_LIBS})
+        message(STATUS "Linking extra libraries: ${FAISS_EXTRA_LIBS}")
+    endif()
+endif()
+
+# Link OpenMP - always use dynamic linking for OpenMP (static libgomp.a often lacks -fPIC)
+# Preferred path: the imported target. The elseif branch is a fallback for the case where
+# OpenMP was reported as found but no imported target exists.
+if(TARGET OpenMP::OpenMP_CXX)
+    target_link_libraries(paimon_faiss_jni PRIVATE OpenMP::OpenMP_CXX)
+    message(STATUS "Linking OpenMP via imported target")
+elseif(OpenMP_FOUND)
+    target_compile_options(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS})
+    # Link against the shared gomp library
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+        # Hint both supported multiarch directories, matching the other library searches in
+        # this file (the aarch64 directory was missing here).
+        find_library(GOMP_SHARED_LIBRARY
+            NAMES gomp
+            PATHS
+                /usr/lib
+                /usr/lib64
+                /usr/lib/x86_64-linux-gnu
+                /usr/lib/aarch64-linux-gnu
+        )
+        if(GOMP_SHARED_LIBRARY)
+            target_link_libraries(paimon_faiss_jni PRIVATE ${GOMP_SHARED_LIBRARY})
+            message(STATUS "Linking OpenMP shared library: ${GOMP_SHARED_LIBRARY}")
+        else()
+            # Let the linker resolve -lgomp from its own search path.
+            target_link_libraries(paimon_faiss_jni PRIVATE gomp)
+            message(STATUS "Linking OpenMP: gomp")
+        endif()
+    else()
+        target_link_libraries(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS})
+    endif()
+endif()
+
+# Platform-specific settings
+# Sets the library suffix and the C++ runtime linkage per operating system.
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ # macOS specific settings
+ set_target_properties(paimon_faiss_jni PROPERTIES
+ SUFFIX ".dylib"
+ INSTALL_NAME_DIR "@rpath"
+ BUILD_WITH_INSTALL_RPATH TRUE
+ )
+
+ # Link against libc++
+ target_link_libraries(paimon_faiss_jni PRIVATE c++)
+
+ # For fat lib on macOS, embed OpenMP library path
+ # Two rpath entries are added: the directory of the loaded library (@loader_path) and the
+ # absolute Homebrew libomp directory found at configure time.
+ # NOTE(review): the second entry is specific to the build machine - confirm that is intended.
+ if(BUILD_FAT_LIB AND HOMEBREW_LIBOMP_PREFIX)
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,-rpath,@loader_path"
+ "-Wl,-rpath,${HOMEBREW_LIBOMP_PREFIX}/lib"
+ )
+ endif()
+
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ # Linux specific settings
+ set_target_properties(paimon_faiss_jni PROPERTIES
+ SUFFIX ".so"
+ )
+
+ if(BUILD_FAT_LIB)
+ # For fat lib, use static libstdc++ and libgcc
+ # --exclude-libs,ALL presumably keeps symbols from the linked static archives out of
+ # the exported symbol table - confirm against the linker documentation.
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-static-libstdc++"
+ "-static-libgcc"
+ "-Wl,--exclude-libs,ALL"
+ )
+ message(STATUS "Using static libstdc++ and libgcc for fat lib")
+ else()
+ target_link_libraries(paimon_faiss_jni PRIVATE stdc++)
+ endif()
+
+endif()
+
+# Set output directory - output to src/main/resources/{os}/{arch}/
+# The path is resolved relative to the directory that holds this CMakeLists.txt
+# (CMAKE_CURRENT_SOURCE_DIR), not the top-level source directory, so it stays correct when
+# this directory is pulled into a larger build with add_subdirectory(). For a standalone
+# configure both variables name the same directory.
+set(OUTPUT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../resources/${PLATFORM_DIR}")
+set_target_properties(paimon_faiss_jni PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_DIR}
+    RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR}
+)
+
+# Optimization level
+# Pick the instruction-set flags and the label for the status line first, then apply them in
+# one place. Any value other than "avx2"/"avx512" adds no flags and reports "generic".
+set(_faiss_opt_flags)
+set(_faiss_opt_label "generic")
+if(FAISS_OPT_LEVEL STREQUAL "avx2")
+    set(_faiss_opt_flags -mavx2 -mfma)
+    set(_faiss_opt_label "AVX2")
+elseif(FAISS_OPT_LEVEL STREQUAL "avx512")
+    set(_faiss_opt_flags -mavx512f -mavx512dq -mavx512bw -mavx512vl)
+    set(_faiss_opt_label "AVX-512")
+endif()
+
+if(_faiss_opt_flags)
+    target_compile_options(paimon_faiss_jni PRIVATE ${_faiss_opt_flags})
+endif()
+message(STATUS "Building with ${_faiss_opt_label} optimizations")
+
+# Copy bundled shared libraries to output directory and set rpath
+# Runs only when the BLAS discovery step fell back to a shared OpenBLAS (BUNDLE_OPENBLAS):
+# the library is copied next to the JNI library after each build, and on Linux the JNI
+# library's rpath is pointed at its own directory.
+if(BUILD_FAT_LIB AND BUNDLE_OPENBLAS AND BUNDLE_OPENBLAS_PATH)
+ message(STATUS "Will bundle OpenBLAS shared library: ${BUNDLE_OPENBLAS_PATH}")
+
+ # Get the actual library file (resolve symlinks)
+ get_filename_component(OPENBLAS_REALPATH ${BUNDLE_OPENBLAS_PATH} REALPATH)
+ get_filename_component(OPENBLAS_FILENAME ${OPENBLAS_REALPATH} NAME)
+
+ # Copy OpenBLAS to output directory after build
+ # NOTE(review): the destination name libopenblas.so.0 is fixed and this copy is not inside
+ # the Linux-only guard below, so it would also be used on macOS - confirm.
+ # NOTE(review): OPENBLAS_FILENAME is computed above but not used in this block.
+ add_custom_command(TARGET paimon_faiss_jni POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different
+ ${OPENBLAS_REALPATH}
+ ${OUTPUT_DIR}/libopenblas.so.0
+ COMMENT "Bundling OpenBLAS shared library"
+ )
+
+ # Set rpath to look in the same directory as the library
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,-rpath,$ORIGIN"
+ )
+ # Also patch the library after build to use the bundled libopenblas
+ # `|| true` means a missing or failing patchelf does not fail the build.
+ # NOTE(review): the command has no VERBATIM, so how "$$ORIGIN" reaches patchelf
+ # presumably depends on the generator and shell - verify the resulting rpath.
+ add_custom_command(TARGET paimon_faiss_jni POST_BUILD
+ COMMAND patchelf --set-rpath "$$ORIGIN" ${OUTPUT_DIR}/libpaimon_faiss_jni.so || true
+ COMMENT "Setting rpath to $ORIGIN"
+ )
+ endif()
+endif()
+
+# Install target
+# The library is installed under the same {os}/{arch} sub-directory that is used for the
+# build output, relative to the install prefix.
+install(TARGETS paimon_faiss_jni
+ LIBRARY DESTINATION ${PLATFORM_DIR}
+ RUNTIME DESTINATION ${PLATFORM_DIR}
+)
+
diff --git a/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp b/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp
new file mode 100644
index 000000000000..38dcff06fc15
--- /dev/null
+++ b/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp
@@ -0,0 +1,464 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon_faiss_jni.h"
+
+#include Note: These tests require the native library to be built and available. They will be skipped
+ * if the native library is not found.
+ */
+class IndexTest {
+
+    private static final int DIMENSION = 128;
+    private static final int NUM_VECTORS = 1000;
+    private static final int K = 10;
+
+    /**
+     * Single seeded generator shared by all calls within one test instance. Successive calls to
+     * {@link #generateRandomVectors(int, int)} therefore return different data, while the fixed
+     * seed keeps a test reproducible (assuming the default per-method test instance lifecycle).
+     */
+    private final Random random = new Random(42);
+
+    /** Empty flat index reports its configuration; add and single-query search work. */
+    @Test
+    void testFlatIndexBasicOperations() {
+        try (Index index = IndexFactory.createFlat(DIMENSION, MetricType.L2)) {
+            assertEquals(DIMENSION, index.getDimension());
+            assertEquals(0, index.getCount());
+            assertTrue(index.isTrained());
+            assertEquals(MetricType.L2, index.getMetricType());
+
+            // Add vectors
+            float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+            index.add(vectors);
+            assertEquals(NUM_VECTORS, index.getCount());
+
+            // Search
+            float[] query = generateRandomVectors(1, DIMENSION);
+            SearchResult result = index.searchSingle(query, K);
+
+            assertEquals(1, result.getNumQueries());
+            assertEquals(K, result.getK());
+            assertEquals(K, result.getLabelsForQuery(0).length);
+            assertEquals(K, result.getDistancesForQuery(0).length);
+
+            // Verify labels are in valid range
+            for (long label : result.getLabels()) {
+                assertTrue(label >= 0 && label < NUM_VECTORS, "Label " + label + " out of range");
+            }
+
+            // Verify distances are non-negative for L2
+            for (float distance : result.getDistances()) {
+                assertTrue(distance >= 0, "Distance should be non-negative for L2");
+            }
+        }
+    }
+
+    /** Vectors added with explicit IDs are returned under those IDs. */
+    @Test
+    void testFlatIndexWithIds() {
+        try (Index index = IndexFactory.createFlatWithIds(DIMENSION, MetricType.L2)) {
+            float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+            long[] ids = new long[NUM_VECTORS];
+            for (int i = 0; i < NUM_VECTORS; i++) {
+                ids[i] = i * 100; // Use custom IDs
+            }
+
+            index.addWithIds(vectors, ids);
+            assertEquals(NUM_VECTORS, index.getCount());
+
+            // Search should return our custom IDs
+            float[] query = generateRandomVectors(1, DIMENSION);
+            SearchResult result = index.searchSingle(query, K);
+
+            for (long label : result.getLabels()) {
+                assertTrue(label % 100 == 0, "Label should be a multiple of 100");
+            }
+        }
+    }
+
+    /** A multi-query search returns numQueries * K entries and K entries per query. */
+    @Test
+    void testBatchSearch() {
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+            index.add(vectors);
+
+            int numQueries = 5;
+            float[] queries = generateRandomVectors(numQueries, DIMENSION);
+            SearchResult result = index.search(queries, K);
+
+            assertEquals(numQueries, result.getNumQueries());
+            assertEquals(K, result.getK());
+            assertEquals(numQueries * K, result.getLabels().length);
+            assertEquals(numQueries * K, result.getDistances().length);
+
+            // Test per-query accessors
+            for (int q = 0; q < numQueries; q++) {
+                long[] labels = result.getLabelsForQuery(q);
+                float[] distances = result.getDistancesForQuery(q);
+                assertEquals(K, labels.length);
+                assertEquals(K, distances.length);
+            }
+        }
+    }
+
+    /** Inner-product results come back ordered from highest to lowest score. */
+    @Test
+    void testInnerProductMetric() {
+        try (Index index = IndexFactory.createFlat(DIMENSION, MetricType.INNER_PRODUCT)) {
+            assertEquals(MetricType.INNER_PRODUCT, index.getMetricType());
+
+            float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+            index.add(vectors);
+
+            float[] query = generateRandomVectors(1, DIMENSION);
+            SearchResult result = index.searchSingle(query, K);
+
+            // For inner product, higher is better, so first result should have highest score
+            float[] distances = result.getDistancesForQuery(0);
+            for (int i = 1; i < K; i++) {
+                assertTrue(
+                        distances[i - 1] >= distances[i],
+                        "Distances should be sorted in descending order for inner product");
+            }
+        }
+    }
+
+    /** reset() empties the index and the index stays usable afterwards. */
+    @Test
+    void testIndexReset() {
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            float[] vectors = generateRandomVectors(100, DIMENSION);
+            index.add(vectors);
+            assertEquals(100, index.getCount());
+
+            index.reset();
+            assertEquals(0, index.getCount());
+
+            // Can add again after reset
+            index.add(vectors);
+            assertEquals(100, index.getCount());
+        }
+    }
+
+    /** File and byte-array round trips preserve dimension, count and search labels. */
+    @Test
+    void testIndexSerialization(@TempDir Path tempDir) {
+        // The same vectors and query are reused for every index below so results are comparable.
+        float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+        float[] query = generateRandomVectors(1, DIMENSION);
+        SearchResult originalResult;
+
+        // Create and populate index
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            index.add(vectors);
+            originalResult = index.searchSingle(query, K);
+
+            // Test file I/O
+            File indexFile = tempDir.resolve("test.index").toFile();
+            index.writeToFile(indexFile);
+
+            try (Index loadedIndex = Index.readFromFile(indexFile)) {
+                assertEquals(DIMENSION, loadedIndex.getDimension());
+                assertEquals(NUM_VECTORS, loadedIndex.getCount());
+
+                SearchResult loadedResult = loadedIndex.searchSingle(query, K);
+                assertArrayEquals(originalResult.getLabels(), loadedResult.getLabels());
+            }
+        }
+
+        // Test byte array serialization
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            index.add(vectors);
+            byte[] serialized = index.serialize();
+            assertNotNull(serialized);
+            assertTrue(serialized.length > 0);
+
+            try (Index deserializedIndex = Index.deserialize(serialized)) {
+                assertEquals(DIMENSION, deserializedIndex.getDimension());
+                assertEquals(NUM_VECTORS, deserializedIndex.getCount());
+
+                SearchResult deserializedResult = deserializedIndex.searchSingle(query, K);
+                assertArrayEquals(originalResult.getLabels(), deserializedResult.getLabels());
+            }
+        }
+    }
+
+    /** Several factory description strings produce an index of the requested dimension. */
+    @Test
+    void testIndexFactoryDescriptions() {
+        // Test various index factory strings
+        String[] descriptions = {"Flat", "IDMap,Flat", "HNSW32", "HNSW32,Flat"};
+
+        for (String desc : descriptions) {
+            try (Index index = IndexFactory.create(DIMENSION, desc, MetricType.L2)) {
+                assertEquals(DIMENSION, index.getDimension());
+                assertNotNull(index.toString());
+            }
+        }
+    }
+
+    /** HNSW index: efSearch can be read and changed, and search returns K labels. */
+    @Test
+    void testHNSWIndex() {
+        try (Index index = IndexFactory.createHNSW(DIMENSION, 32, MetricType.L2)) {
+            assertTrue(index.isTrained()); // HNSW doesn't need training
+
+            float[] vectors = generateRandomVectors(NUM_VECTORS, DIMENSION);
+            index.add(vectors);
+
+            // Get and set efSearch
+            int efSearch = IndexHNSW.getEfSearch(index);
+            assertTrue(efSearch > 0);
+
+            IndexHNSW.setEfSearch(index, 64);
+            assertEquals(64, IndexHNSW.getEfSearch(index));
+
+            // Search
+            float[] query = generateRandomVectors(1, DIMENSION);
+            SearchResult result = index.searchSingle(query, K);
+            assertEquals(K, result.getLabels().length);
+        }
+    }
+
+    /** Invalid arguments and use after close are reported with the expected exception types. */
+    @Test
+    void testErrorHandling() {
+        // Test invalid dimension
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> {
+                    IndexFactory.create(0, "Flat", MetricType.L2);
+                });
+
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> {
+                    IndexFactory.create(-1, "Flat", MetricType.L2);
+                });
+
+        // Test null description
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> {
+                    IndexFactory.create(DIMENSION, null, MetricType.L2);
+                });
+
+        // Test vector dimension mismatch
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            float[] wrongDimVectors = new float[10]; // Wrong size
+            assertThrows(
+                    IllegalArgumentException.class,
+                    () -> {
+                        index.addSingle(wrongDimVectors);
+                    });
+        }
+
+        // Test closed index
+        Index closedIndex = IndexFactory.createFlat(DIMENSION);
+        closedIndex.close();
+        assertThrows(
+                IllegalStateException.class,
+                () -> {
+                    closedIndex.getCount();
+                });
+    }
+
+    /** Per-element accessors of SearchResult return in-range values and reject bad indices. */
+    @Test
+    void testSearchResultAccessors() {
+        try (Index index = IndexFactory.createFlat(DIMENSION)) {
+            float[] vectors = generateRandomVectors(100, DIMENSION);
+            index.add(vectors);
+
+            float[] queries = generateRandomVectors(3, DIMENSION);
+            SearchResult result = index.search(queries, 5);
+
+            // Test individual accessors
+            for (int q = 0; q < 3; q++) {
+                for (int n = 0; n < 5; n++) {
+                    long label = result.getLabel(q, n);
+                    float distance = result.getDistance(q, n);
+                    assertTrue(label >= 0 && label < 100);
+                    assertTrue(distance >= 0);
+                }
+            }
+
+            // Test out of bounds
+            assertThrows(
+                    IndexOutOfBoundsException.class,
+                    () -> {
+                        result.getLabel(10, 0);
+                    });
+            assertThrows(
+                    IndexOutOfBoundsException.class,
+                    () -> {
+                        result.getLabel(0, 10);
+                    });
+        }
+    }
+
+    /**
+     * Generate n vectors of dimension d as one flat array of floats in [0, 1).
+     *
+     * <p>Values come from the shared seeded generator, so a query generated after the indexed
+     * data is not a copy of the first indexed vector (which is what a freshly seeded generator
+     * per call produced).
+     *
+     * @param n number of vectors
+     * @param d dimension of each vector
+     * @return flat array of n * d values
+     */
+    private float[] generateRandomVectors(int n, int d) {
+        float[] vectors = new float[n * d];
+        for (int i = 0; i < vectors.length; i++) {
+            vectors[i] = random.nextFloat();
+        }
+        return vectors;
+    }
+}
diff --git a/paimon-faiss/pom.xml b/paimon-faiss/pom.xml
new file mode 100644
index 000000000000..12d8a5e5c8d5
--- /dev/null
+++ b/paimon-faiss/pom.xml
@@ -0,0 +1,111 @@
+
+
+/**
+ * This class provides a safe Java API for interacting with native FAISS indices, including
+ * automatic resource management through the {@link Closeable} interface.
+ *
+ * <p>This implementation uses the paimon-faiss-jni library for native FAISS bindings.
+ */
+public class FaissIndex implements Closeable {
+
+    // Index from the JNI binding that every operation delegates to.
+    private final Index index;
+    // Vector width this wrapper validates single vectors against.
+    private final int dimension;
+    // Metric and index type as reported by metric() and indexType().
+    private final FaissVectorMetric metric;
+    private final FaissIndexType indexType;
+    // Set once close() has released the underlying index; starts out false.
+    private volatile boolean closed;
+
+    /** Wraps an already-created index; instances are obtained through the static factories. */
+    private FaissIndex(
+            Index index, int dimension, FaissVectorMetric metric, FaissIndexType indexType) {
+        this.indexType = indexType;
+        this.metric = metric;
+        this.dimension = dimension;
+        this.index = index;
+    }
+
+    /**
+     * Build a flat (exact search) index using the {@code "IDMap,Flat"} factory description.
+     *
+     * @param dimension the dimension of vectors
+     * @param metric the distance metric
+     * @return the created index
+     * @throws IllegalArgumentException if the metric is not supported
+     */
+    public static FaissIndex createFlatIndex(int dimension, FaissVectorMetric metric) {
+        Index flat = IndexFactory.create(dimension, "IDMap,Flat", toMetricType(metric));
+        return new FaissIndex(flat, dimension, metric, FaissIndexType.FLAT);
+    }
+
+    /**
+     * Create an HNSW index.
+     *
+     * <p>NOTE(review): {@code efConstruction} is accepted but is not applied anywhere in this
+     * method; only {@code m} ends up in the factory description. Confirm whether the native
+     * binding exposes a way to set it.
+     *
+     * @param dimension the dimension of vectors
+     * @param m the number of connections per layer
+     * @param efConstruction the size of the dynamic candidate list for construction (currently
+     *     unused, see note above)
+     * @param metric the distance metric
+     * @return the created index
+     * @throws IllegalArgumentException if the metric is not supported
+     */
+    public static FaissIndex createHnswIndex(
+            int dimension, int m, int efConstruction, FaissVectorMetric metric) {
+        MetricType metricType = toMetricType(metric);
+        // The description is a machine-readable string, so build it with plain concatenation:
+        // String.format("%d") localizes digits under some default locales.
+        // The IDMap2 wrapper is used so that vectors can be added with explicit IDs.
+        String description = "IDMap2,HNSW" + m;
+        Index index = IndexFactory.create(dimension, description, metricType);
+        return new FaissIndex(index, dimension, metric, FaissIndexType.HNSW);
+    }
+
+    /**
+     * Create an IVF index.
+     *
+     * @param dimension the dimension of vectors
+     * @param nlist the number of inverted lists (clusters)
+     * @param metric the distance metric
+     * @return the created index
+     * @throws IllegalArgumentException if the metric is not supported
+     */
+    public static FaissIndex createIvfIndex(int dimension, int nlist, FaissVectorMetric metric) {
+        MetricType metricType = toMetricType(metric);
+        // The description is a machine-readable string, so build it with plain concatenation:
+        // String.format("%d") localizes digits under some default locales.
+        String description = "IDMap,IVF" + nlist + ",Flat";
+        Index index = IndexFactory.create(dimension, description, metricType);
+        return new FaissIndex(index, dimension, metric, FaissIndexType.IVF);
+    }
+
+    /**
+     * Create an IVF-PQ index.
+     *
+     * @param dimension the dimension of vectors
+     * @param nlist the number of inverted lists (clusters)
+     * @param m the number of sub-quantizers
+     * @param nbits the number of bits per sub-quantizer
+     * @param metric the distance metric
+     * @return the created index
+     * @throws IllegalArgumentException if the metric is not supported
+     */
+    public static FaissIndex createIvfPqIndex(
+            int dimension, int nlist, int m, int nbits, FaissVectorMetric metric) {
+        MetricType metricType = toMetricType(metric);
+        // The description is a machine-readable string, so build it with plain concatenation:
+        // String.format("%d") localizes digits under some default locales.
+        String description = "IDMap,IVF" + nlist + ",PQ" + m + "x" + nbits;
+        Index index = IndexFactory.create(dimension, description, metricType);
+        return new FaissIndex(index, dimension, metric, FaissIndexType.IVF_PQ);
+    }
+
+    /**
+     * Rebuild an index from its serialized form.
+     *
+     * <p>The dimension is read back from the deserialized index. The metric and index type are
+     * not recovered: the returned wrapper always reports {@link FaissVectorMetric#L2} and
+     * {@link FaissIndexType#UNKNOWN}, whatever the index was originally created with.
+     *
+     * @param data the serialized index data
+     * @return the loaded index
+     */
+    public static FaissIndex fromBytes(byte[] data) {
+        Index restored = Index.deserialize(data);
+        return new FaissIndex(
+                restored, restored.getDimension(), FaissVectorMetric.L2, FaissIndexType.UNKNOWN);
+    }
+
+    /**
+     * Append a batch of vectors to the index. An empty batch is a no-op.
+     *
+     * @param vectors the vectors to add (each row is a vector)
+     * @throws IllegalStateException if the index has been closed
+     * @throws IllegalArgumentException if the rows do not all have the same length
+     */
+    public void add(float[][] vectors) {
+        ensureOpen();
+        if (vectors.length > 0) {
+            index.add(flatten(vectors));
+        }
+    }
+
+ /**
+ * Add vectors with IDs to the index.
+ *
+ * @param vectors the vectors to add (each row is a vector)
+ * @param ids the IDs for the vectors
+ */
+ public void addWithIds(float[][] vectors, long[] ids) {
+ ensureOpen();
+ if (vectors.length == 0) {
+ return;
+ }
+ if (vectors.length != ids.length) {
+ throw new IllegalArgumentException(
+ "Number of vectors and IDs must match: "
+ + vectors.length
+ + " vs "
+ + ids.length);
+ }
+ float[] flattened = flatten(vectors);
+ index.addWithIds(flattened, ids);
+ }
+
+    /**
+     * Append one vector to the index.
+     *
+     * @param vector the vector to add
+     * @throws IllegalStateException if the index has been closed
+     * @throws IllegalArgumentException if the vector length differs from the index dimension
+     */
+    public void add(float[] vector) {
+        // Both guards run before the index is touched: state first, then shape.
+        this.ensureOpen();
+        this.checkDimension(vector);
+        this.index.addSingle(vector);
+    }
+
+ /**
+ * Add a single vector with ID to the index.
+ *
+ * @param vector the vector to add
+ * @param id the ID for the vector
+ */
+ public void addWithId(float[] vector, long id) {
+ ensureOpen();
+ checkDimension(vector);
+ index.addWithIds(vector, new long[] {id});
+ }
+
+    /**
+     * Train the index (required for IVF-based indices). An empty training set is a no-op.
+     *
+     * @param trainingVectors the training vectors (each row is a vector)
+     * @throws IllegalStateException if the index has been closed
+     * @throws IllegalArgumentException if the rows do not all have the same length
+     */
+    public void train(float[][] trainingVectors) {
+        ensureOpen();
+        if (trainingVectors.length > 0) {
+            index.train(flatten(trainingVectors));
+        }
+    }
+
+    /**
+     * Report whether the underlying index considers itself trained.
+     *
+     * @return true if the index is trained
+     * @throws IllegalStateException if the index has been closed
+     */
+    public boolean isTrained() {
+        this.ensureOpen();
+        final boolean trained = this.index.isTrained();
+        return trained;
+    }
+
+    /**
+     * Run a k-nearest-neighbor search for a batch of queries.
+     *
+     * <p>An empty batch never reaches the underlying index; it yields a result with empty
+     * arrays, zero queries and the requested {@code k}.
+     *
+     * @param queries the query vectors (each row is a query)
+     * @param k the number of nearest neighbors to return
+     * @return search results containing distances and IDs
+     * @throws IllegalStateException if the index has been closed
+     * @throws IllegalArgumentException if the rows do not all have the same length
+     */
+    public SearchResult search(float[][] queries, int k) {
+        ensureOpen();
+        final int queryCount = queries.length;
+        if (queryCount == 0) {
+            return new SearchResult(new float[0], new long[0], 0, k);
+        }
+        // Fully qualified: the nested SearchResult class shadows the binding's result type.
+        org.apache.paimon.faiss.SearchResult raw = index.search(flatten(queries), k);
+        return new SearchResult(raw.getDistances(), raw.getLabels(), queryCount, k);
+    }
+
+ /**
+ * Search for k nearest neighbors for a single query.
+ *
+ * @param query the query vector
+ * @param k the number of nearest neighbors to return
+ * @return search results containing distances and IDs
+ */
+ public SearchResult search(float[] query, int k) {
+ ensureOpen();
+ checkDimension(query);
+ org.apache.paimon.faiss.SearchResult result = index.searchSingle(query, k);
+ return new SearchResult(result.getDistances(), result.getLabels(), 1, k);
+ }
+
+    /**
+     * Forward the HNSW {@code efSearch} search parameter to the underlying index.
+     *
+     * @param efSearch the size of the dynamic candidate list for search
+     * @throws IllegalStateException if the index has been closed
+     */
+    public void setHnswEfSearch(int efSearch) {
+        this.ensureOpen();
+        IndexHNSW.setEfSearch(this.index, efSearch);
+    }
+
+    /**
+     * Forward the IVF {@code nprobe} search parameter to the underlying index.
+     *
+     * @param nprobe the number of clusters to visit during search
+     * @throws IllegalStateException if the index has been closed
+     */
+    public void setIvfNprobe(int nprobe) {
+        this.ensureOpen();
+        IndexIVF.setNprobe(this.index, nprobe);
+    }
+
+    /**
+     * Ask the underlying index how many vectors it currently holds.
+     *
+     * @return the number of vectors
+     * @throws IllegalStateException if the index has been closed
+     */
+    public long size() {
+        this.ensureOpen();
+        final long count = this.index.getCount();
+        return count;
+    }
+
+    /**
+     * Vector dimension recorded when this wrapper was created. Readable after close.
+     *
+     * @return the dimension
+     */
+    public int dimension() {
+        return this.dimension;
+    }
+
+    /**
+     * Distance metric recorded when this wrapper was created. Readable after close.
+     *
+     * @return the metric
+     */
+    public FaissVectorMetric metric() {
+        return this.metric;
+    }
+
+    /**
+     * Index type recorded when this wrapper was created. Readable after close.
+     *
+     * @return the index type
+     */
+    public FaissIndexType indexType() {
+        return this.indexType;
+    }
+
+    /**
+     * Serialize the underlying index into a byte array.
+     *
+     * @return the serialized index
+     * @throws IllegalStateException if the index has been closed
+     */
+    public byte[] toBytes() {
+        this.ensureOpen();
+        final byte[] serialized = this.index.serialize();
+        return serialized;
+    }
+
+    /**
+     * Reset the index (remove all vectors).
+     *
+     * @throws IllegalStateException if the index has been closed
+     */
+    public void reset() {
+        this.ensureOpen();
+        this.index.reset();
+    }
+
+    /**
+     * Close the underlying index. Calls made after a successful close return without doing
+     * anything.
+     */
+    @Override
+    public void close() {
+        // Cheap unsynchronized check first; most repeat calls stop here.
+        if (closed) {
+            return;
+        }
+        synchronized (this) {
+            // Re-check under the lock in case another thread closed the index meanwhile.
+            if (closed) {
+                return;
+            }
+            index.close();
+            // Only flagged after the underlying close returned normally.
+            closed = true;
+        }
+    }
+
+    /**
+     * Guard used by operations that need a live index.
+     *
+     * @throws IllegalStateException if {@link #close()} has already completed
+     */
+    private void ensureOpen() {
+        if (!closed) {
+            return;
+        }
+        throw new IllegalStateException("Index has been closed");
+    }
+
+ private void checkDimension(float[] vector) {
+ if (vector.length != dimension) {
+ throw new IllegalArgumentException(
+ "Vector dimension mismatch: expected " + dimension + ", got " + vector.length);
+ }
+ }
+
+    /**
+     * Copy a batch of row vectors into one contiguous array, row after row.
+     *
+     * <p>Each row is validated against the index dimension, not merely against the first row.
+     * Otherwise a batch whose rows are all the same wrong width would be flattened and handed to
+     * the underlying index with a row layout that does not match its dimension.
+     *
+     * @param vectors the rows to flatten
+     * @return an array of length {@code vectors.length * dimension}
+     * @throws IllegalArgumentException if any row's length differs from the index dimension
+     */
+    private float[] flatten(float[][] vectors) {
+        int n = vectors.length;
+        int d = dimension;
+        float[] result = new float[n * d];
+        for (int i = 0; i < n; i++) {
+            if (vectors[i].length != d) {
+                throw new IllegalArgumentException(
+                        "Vector dimension mismatch: expected "
+                                + d
+                                + ", got "
+                                + vectors[i].length
+                                + " at index "
+                                + i);
+            }
+            System.arraycopy(vectors[i], 0, result, i * d, d);
+        }
+        return result;
+    }
+
+ private static MetricType toMetricType(FaissVectorMetric metric) {
+ switch (metric) {
+ case L2:
+ return MetricType.L2;
+ case INNER_PRODUCT:
+ return MetricType.INNER_PRODUCT;
+ default:
+ throw new IllegalArgumentException("Unknown metric: " + metric);
+ }
+ }
+
+    /**
+     * Result of a search operation.
+     *
+     * <p>Distances and labels are stored as flat arrays in which the entries for query {@code q}
+     * occupy positions {@code q * k} to {@code q * k + k - 1}.
+     */
+    public static class SearchResult {
+        private final float[] distances;
+        private final long[] labels;
+        private final int numQueries;
+        private final int k;
+
+        /**
+         * @param distances flat distance array, {@code k} entries per query
+         * @param labels flat label array, {@code k} entries per query
+         * @param numQueries the number of queries the arrays cover
+         * @param k the number of neighbors requested per query
+         */
+        public SearchResult(float[] distances, long[] labels, int numQueries, int k) {
+            this.distances = distances;
+            this.labels = labels;
+            this.numQueries = numQueries;
+            this.k = k;
+        }
+
+        /** Returns the flat distance array itself (not a copy). */
+        public float[] getDistances() {
+            return distances;
+        }
+
+        /** Returns the flat label array itself (not a copy). */
+        public long[] getLabels() {
+            return labels;
+        }
+
+        public int getNumQueries() {
+            return numQueries;
+        }
+
+        public int getK() {
+            return k;
+        }
+
+        /**
+         * Get distances for a specific query.
+         *
+         * @param queryIndex the query index, from 0 (inclusive) to the query count (exclusive)
+         * @return a new array holding the {@code k} distances for that query
+         * @throws IndexOutOfBoundsException if {@code queryIndex} is out of range
+         */
+        public float[] getDistancesForQuery(int queryIndex) {
+            checkQueryIndex(queryIndex);
+            float[] result = new float[k];
+            System.arraycopy(distances, queryIndex * k, result, 0, k);
+            return result;
+        }
+
+        /**
+         * Get labels for a specific query.
+         *
+         * @param queryIndex the query index, from 0 (inclusive) to the query count (exclusive)
+         * @return a new array holding the {@code k} labels for that query
+         * @throws IndexOutOfBoundsException if {@code queryIndex} is out of range
+         */
+        public long[] getLabelsForQuery(int queryIndex) {
+            checkQueryIndex(queryIndex);
+            long[] result = new long[k];
+            System.arraycopy(labels, queryIndex * k, result, 0, k);
+            return result;
+        }
+
+        /**
+         * Reject query indices outside the result. Without this check a large index could make
+         * {@code queryIndex * k} overflow and wrap to a valid offset, silently returning the
+         * entries of a different query.
+         */
+        private void checkQueryIndex(int queryIndex) {
+            if (queryIndex < 0 || queryIndex >= numQueries) {
+                throw new IndexOutOfBoundsException(
+                        "Query index out of range: " + queryIndex + ", queries: " + numQueries);
+            }
+        }
+    }
+}
diff --git a/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissIndexType.java b/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissIndexType.java
new file mode 100644
index 000000000000..3756d622948d
--- /dev/null
+++ b/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissIndexType.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.faiss.index;
+
+/** Enumeration of supported FAISS index types. */
+public enum FaissIndexType {
+
+ /** Flat index - exact brute-force search. */
+ FLAT("Flat"),
+
+ /** HNSW (Hierarchical Navigable Small World) graph-based index. */
+ HNSW("HNSW"),
+
+ /** IVF (Inverted File) index with flat vectors. */
+ IVF("IVF"),
+
+ /** IVF-PQ (Inverted File with Product Quantization) index. */
+ IVF_PQ("IVF_PQ"),
+
+ /** Unknown index type (e.g., loaded from serialized data). */
+ UNKNOWN("Unknown");
+
+ private final String name;
+
+ FaissIndexType(String name) {
+ this.name = name;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public static FaissIndexType fromString(String name) {
+ for (FaissIndexType type : values()) {
+ if (type.name.equalsIgnoreCase(name)) {
+ return type;
+ }
+ }
+ throw new IllegalArgumentException("Unknown index type: " + name);
+ }
+}
diff --git a/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissVectorGlobalIndexReader.java b/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissVectorGlobalIndexReader.java
new file mode 100644
index 000000000000..c9eae4d93c90
--- /dev/null
+++ b/paimon-faiss/src/main/java/org/apache/paimon/faiss/index/FaissVectorGlobalIndexReader.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.faiss.index;
+
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.globalindex.GlobalIndexIOMeta;
+import org.apache.paimon.globalindex.GlobalIndexReader;
+import org.apache.paimon.globalindex.GlobalIndexResult;
+import org.apache.paimon.globalindex.io.GlobalIndexFileReader;
+import org.apache.paimon.predicate.FieldRef;
+import org.apache.paimon.predicate.VectorSearch;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.FloatType;
+import org.apache.paimon.utils.IOUtils;
+import org.apache.paimon.utils.RoaringNavigableMap64;
+
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Optional;
+import java.util.PriorityQueue;
+
+/**
+ * Vector global index reader using FAISS.
+ *
+ * This implementation uses FAISS for efficient approximate nearest neighbor search.
+ */
+public class FaissVectorGlobalIndexReader implements GlobalIndexReader {
+
+ private static final int VERSION = 1;
+
+ private final List