Skip to content

Commit 9f34064

Browse files
committed
add parameter config
1 parent b0ca82e commit 9f34064

File tree

14 files changed

+763
-243
lines changed

14 files changed

+763
-243
lines changed

.github/actions/build-diskann-native/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ runs:
2121
rustc --version
2222
cargo --version
2323
24+
- name: Install dependency bundling tools (Linux)
25+
if: startsWith(inputs.platform, 'linux')
26+
shell: bash
27+
run: |
28+
sudo apt-get update -qq && sudo apt-get install -y -qq patchelf
29+
2430
- name: Build native library
2531
shell: bash
2632
run: |

.github/workflows/build-diskann-native.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,5 @@ jobs:
6464
uses: actions/upload-artifact@v6
6565
with:
6666
name: ${{ inputs.artifact-name }}
67-
path: paimon-diskann/paimon-diskann-jni/src/main/resources/linux/amd64/
67+
path: paimon-diskann/paimon-diskann-jni/src/main/resources/
6868
retention-days: ${{ inputs.retention-days }}

.github/workflows/utitcase.yml

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ on:
2929
- 'paimon-lucene/**'
3030
- 'paimon-faiss/**'
3131
- '.github/workflows/faiss-vector-index-tests.yml'
32+
- 'paimon-diskann/**'
33+
- '.github/workflows/publish-diskann_snapshot.yml'
3234
- 'paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java'
3335

3436
env:
@@ -40,15 +42,21 @@ concurrency:
4042
cancel-in-progress: true
4143

4244
jobs:
43-
build_native:
45+
build_faiss_native:
4446
uses: ./.github/workflows/build-faiss-native.yml
4547
with:
4648
platform: linux-amd64
4749
jdk-version: '8'
4850

51+
build_diskann_native:
52+
uses: ./.github/workflows/build-diskann-native.yml
53+
with:
54+
platform: linux-amd64
55+
jdk-version: '8'
56+
4957
build_test:
5058
runs-on: ubuntu-latest
51-
needs: build_native
59+
needs: [build_faiss_native, build_diskann_native]
5260

5361
steps:
5462
- name: Checkout code
@@ -60,21 +68,30 @@ jobs:
6068
java-version: ${{ env.JDK_VERSION }}
6169
distribution: 'temurin'
6270

63-
- name: Download native library artifact
71+
- name: Download FAISS native library artifact
6472
uses: actions/download-artifact@v7
6573
with:
6674
name: faiss-native-linux-amd64
6775
path: paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/
6876

69-
- name: List downloaded native library
77+
- name: Download DiskANN native library artifact
78+
uses: actions/download-artifact@v7
79+
with:
80+
name: diskann-native-linux-amd64
81+
path: paimon-diskann/paimon-diskann-jni/src/main/resources/
82+
83+
- name: List downloaded native libraries
7084
run: |
71-
echo "=== Downloaded native libraries ==="
85+
echo "=== FAISS native libraries ==="
7286
ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/
87+
echo ""
88+
echo "=== DiskANN native libraries ==="
89+
find paimon-diskann/paimon-diskann-jni/src/main/resources -type f -exec ls -la {} \;
7390
7491
- name: Build Others
7592
run: |
7693
echo "Start compiling modules"
77-
mvn -T 2C -B -ntp clean install -DskipTests -Pflink1,spark3,paimon-faiss
94+
mvn -T 2C -B -ntp clean install -DskipTests -Pflink1,spark3,paimon-faiss,paimon-diskann
7895
7996
- name: Test Others
8097
timeout-minutes: 60

paimon-diskann/PARAMETER_TUNING.md

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# DiskANN Parameter Tuning Guide
21+
22+
This document provides guidance on tuning DiskANN vector index parameters for optimal performance in Apache Paimon.
23+
24+
## Overview
25+
26+
DiskANN is a graph-based approximate nearest neighbor (ANN) search algorithm designed for efficient billion-point vector search. The implementation in Paimon provides several parameters to control the trade-offs between accuracy, speed, and resource usage.
27+
28+
## Key Parameters
29+
30+
### 1. Graph Construction Parameters
31+
32+
#### `vector.diskann.max-degree` (R)
33+
- **Default**: 64
34+
- **Range**: 32-128
35+
- **Description**: Maximum degree (number of connections) for each node in the graph
36+
- **Impact**:
37+
- Higher values → Better recall, higher memory usage, longer build time
38+
- Lower values → Faster build, lower memory, potentially lower recall
39+
- **Recommendations**:
40+
- **32**: For memory-constrained environments or when build time is critical
41+
- **64**: Balanced default (Microsoft recommended)
42+
- **128**: For maximum recall when resources permit
43+
44+
#### `vector.diskann.build-list-size` (L_build)
45+
- **Default**: 100
46+
- **Range**: 50-200
47+
- **Description**: Size of the candidate list during graph construction
48+
- **Impact**:
49+
- Higher values → Better graph quality, longer build time
50+
- Lower values → Faster build, potentially lower recall
51+
- **Recommendations**:
52+
- Use default 100 for most cases
53+
- Increase to 150-200 for very high-dimensional data (>512 dimensions)
54+
55+
### 2. Search Parameters
56+
57+
#### `vector.diskann.search-list-size` (L_search)
58+
- **Default**: 100
59+
- **Range**: 16-500
60+
- **Description**: Size of the candidate list during search
61+
- **Impact**:
62+
- Higher values → Better recall, higher latency
63+
- Lower values → Lower latency, potentially lower recall
64+
- **Dynamic Behavior**: The implementation automatically adjusts this to be at least equal to the requested `k` (number of results)
65+
- **Recommendations**:
66+
- **16-32**: For latency-critical applications (QPS > 5000)
67+
- **100**: Balanced default
68+
- **200-500**: For maximum recall (recall > 95%)
69+
70+
#### `vector.search-factor`
71+
- **Default**: 10
72+
- **Range**: 5-20
73+
- **Description**: Multiplier for search limit when row filtering is applied
74+
- **Impact**: When filtering by row IDs, fetches `limit * search-factor` results to ensure sufficient matches after filtering
75+
- **Recommendations**:
76+
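- **5**: When the filter removes only a small share of rows (<10% of data filtered out)
77+
- **10**: Default for typical filtering scenarios
78+
- **20**: When the filter removes most rows (>50% of data filtered out), so more candidates must be fetched to leave `limit` matches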
79+
80+
### 3. Data Configuration
81+
82+
#### `vector.dim`
83+
- **Default**: 128
84+
- **Description**: Dimension of the vectors
85+
- **Recommendations**:
86+
- Must match your embedding model
87+
- Common values: 128, 256, 384, 512, 768, 1024
88+
89+
#### `vector.metric`
90+
- **Default**: L2
91+
- **Options**: L2, INNER_PRODUCT, COSINE
92+
- **Description**: Distance metric for similarity computation
93+
- **Recommendations**:
94+
- **L2**: For Euclidean distance (most common)
95+
- **INNER_PRODUCT**: For dot product similarity (use with normalized vectors)
96+
- **COSINE**: For cosine similarity
97+
98+
#### `vector.normalize`
99+
- **Default**: false
100+
- **Description**: Whether to L2-normalize vectors before indexing/searching
101+
- **Recommendations**:
102+
- **true**: When using COSINE metric or when vectors have varying magnitudes
103+
- **false**: When vectors are already normalized or using L2 metric
104+
105+
### 4. Index Organization
106+
107+
#### `vector.size-per-index`
108+
- **Default**: 2,000,000
109+
- **Description**: Number of vectors per index file
110+
- **Impact**:
111+
- Larger values → Fewer files, higher memory per index, better search efficiency
112+
- Smaller values → More files, lower memory per index, more overhead
113+
- **Recommendations**:
114+
- **500,000**: For small datasets or memory-constrained environments
115+
- **2,000,000**: Default for balanced performance
116+
- **5,000,000+**: For large-scale production systems with ample resources
117+
118+
#### `vector.diskann.index-type`
119+
- **Default**: MEMORY
120+
- **Options**: MEMORY, DISK
121+
- **Description**: Type of index structure
122+
- **Recommendations**:
123+
- **MEMORY**: For datasets that fit in RAM (best performance)
124+
- **DISK**: For datasets exceeding RAM (requires SSD)
125+
126+
## Performance Tuning Guide
127+
128+
### High Recall (>95%)
129+
```properties
130+
vector.diskann.max-degree = 128
131+
vector.diskann.build-list-size = 150
132+
vector.diskann.search-list-size = 200
133+
```
134+
135+
### Balanced (90-95% recall)
136+
```properties
137+
vector.diskann.max-degree = 64
138+
vector.diskann.build-list-size = 100
139+
vector.diskann.search-list-size = 100
140+
```
141+
142+
### High QPS (Low Latency)
143+
```properties
144+
vector.diskann.max-degree = 32
145+
vector.diskann.build-list-size = 75
146+
vector.diskann.search-list-size = 32
147+
```
148+
149+
### Memory-Constrained
150+
```properties
151+
vector.diskann.max-degree = 32
152+
vector.diskann.build-list-size = 75
153+
vector.size-per-index = 500000
154+
vector.diskann.index-type = DISK
155+
```
156+
157+
## Best Practices
158+
159+
1. **Start with defaults**: The default parameters are tuned for balanced performance
160+
2. **Measure first**: Profile your workload before tuning
161+
3. **Tune incrementally**: Change one parameter at a time and measure impact
162+
4. **Consider trade-offs**: Higher recall typically means higher latency and resource usage
163+
5. **Test with production data**: Parameter effectiveness depends on data characteristics
164+
165+
## Advanced Parameters (Future Enhancement)
166+
167+
The following parameters are documented in the official Microsoft DiskANN implementation but are not yet exposed in the current Rust-based native library:
168+
169+
- **alpha** (default: 1.2): Controls the graph construction pruning strategy
170+
- **saturate_graph** (default: true): Whether to saturate the graph during construction
171+
172+
These parameters may be added in future versions when the underlying Rust DiskANN crate exposes them through its configuration API.
173+
174+
## Performance Metrics
175+
176+
When tuning parameters, monitor these metrics:
177+
- **Recall**: Percentage of true nearest neighbors found
178+
- **QPS (Queries Per Second)**: Throughput of search operations
179+
- **Latency**: Time to complete a single query (p50, p95, p99)
180+
- **Memory Usage**: RAM consumed by indices
181+
- **Build Time**: Time to construct the index
182+
183+
## Recent Improvements
184+
185+
### Dynamic Search List Sizing (v1.0+)
186+
The search list size is now automatically adjusted to be at least equal to the requested `k`. This follows Milvus best practices and ensures the candidate list is never smaller than the number of results requested, without manual tuning.
187+
188+
### Memory-Efficient Loading (v1.0+)
189+
Indices are now loaded through temporary files, allowing the OS to manage memory more efficiently for large indices. This is a step toward full mmap support.
190+
191+
## References
192+
193+
- [Microsoft DiskANN Paper](https://proceedings.neurips.cc/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf)
194+
- [Microsoft DiskANN Library](https://github.com/microsoft/DiskANN)
195+
- [Milvus DiskANN Documentation](https://milvus.io/docs/diskann.md)

paimon-diskann/paimon-diskann-index/src/main/java/org/apache/paimon/diskann/index/DiskAnnIndex.java

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,23 @@ public class DiskAnnIndex implements Closeable {
3737
private final int dimension;
3838
private final DiskAnnVectorMetric metric;
3939
private final DiskAnnIndexType indexType;
40+
private final int maxDegree;
41+
private final int buildListSize;
4042
private volatile boolean closed = false;
4143

4244
private DiskAnnIndex(
43-
Index index, int dimension, DiskAnnVectorMetric metric, DiskAnnIndexType indexType) {
45+
Index index,
46+
int dimension,
47+
DiskAnnVectorMetric metric,
48+
DiskAnnIndexType indexType,
49+
int maxDegree,
50+
int buildListSize) {
4451
this.index = index;
4552
this.dimension = dimension;
4653
this.metric = metric;
4754
this.indexType = indexType;
55+
this.maxDegree = maxDegree;
56+
this.buildListSize = buildListSize;
4857
}
4958

5059
public static DiskAnnIndex create(
@@ -56,7 +65,7 @@ public static DiskAnnIndex create(
5665
MetricType metricType = metric.toMetricType();
5766
Index index =
5867
Index.create(dimension, metricType, indexType.value(), maxDegree, buildListSize);
59-
return new DiskAnnIndex(index, dimension, metric, indexType);
68+
return new DiskAnnIndex(index, dimension, metric, indexType, maxDegree, buildListSize);
6069
}
6170

6271
public void addWithIds(ByteBuffer vectorBuffer, ByteBuffer idBuffer, int n) {
@@ -66,7 +75,12 @@ public void addWithIds(ByteBuffer vectorBuffer, ByteBuffer idBuffer, int n) {
6675
index.addWithIds(n, vectorBuffer, idBuffer);
6776
}
6877

69-
public void build(int buildListSize) {
78+
/**
79+
* Build the index graph after adding vectors.
80+
*
81+
* <p>Uses the buildListSize parameter that was specified during index creation.
82+
*/
83+
public void build() {
7084
ensureOpen();
7185
index.build(buildListSize);
7286
}
@@ -114,6 +128,14 @@ public DiskAnnIndexType indexType() {
114128
return indexType;
115129
}
116130

131+
public int maxDegree() {
132+
return maxDegree;
133+
}
134+
135+
public int buildListSize() {
136+
return buildListSize;
137+
}
138+
117139
public long serializeSize() {
118140
ensureOpen();
119141
return index.serializeSize();
@@ -129,7 +151,23 @@ public long serialize(ByteBuffer buffer) {
129151

130152
public static DiskAnnIndex deserialize(byte[] data, DiskAnnVectorMetric metric) {
131153
Index index = Index.deserialize(data);
132-
return new DiskAnnIndex(index, index.getDimension(), metric, DiskAnnIndexType.UNKNOWN);
154+
return new DiskAnnIndex(
155+
index, index.getDimension(), metric, DiskAnnIndexType.UNKNOWN, 64, 100);
156+
}
157+
158+
/**
159+
* Reset the index (remove all vectors).
160+
*
161+
* <p>Note: This is not supported in the current implementation. DiskANN indices are immutable
162+
* once built. To "reset", you must create a new index.
163+
*
164+
* @throws UnsupportedOperationException always, as reset is not currently supported
165+
*/
166+
public void reset() {
167+
throw new UnsupportedOperationException(
168+
"Reset is not supported for DiskANN indices. "
169+
+ "DiskANN indices are immutable once built. "
170+
+ "Please create a new index instead.");
133171
}
134172

135173
public static ByteBuffer allocateVectorBuffer(int numVectors, int dimension) {

0 commit comments

Comments
 (0)