Skip to content

Commit f42b377

Browse files
committed
fix
1 parent 9f34064 commit f42b377

File tree

22 files changed

+2946
-408
lines changed

22 files changed

+2946
-408
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,3 +37,5 @@ paimon-faiss/paimon-faiss-jni/build/
3737
paimon-faiss/paimon-faiss-jni/src/main/resources/darwin*
3838
paimon-faiss/paimon-faiss-jni/src/main/resources/linux*
3939
paimon-faiss/paimon-faiss-jni/src/main/native/cmake-build-debug/
40+
paimon-diskann/paimon-diskann-jni/src/main/resources/darwin*
41+
paimon-diskann/paimon-diskann-jni/src/main/resources/linux*

paimon-diskann/PARAMETER_TUNING.md

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -92,16 +92,9 @@ DiskANN is a graph-based approximate nearest neighbor (ANN) search algorithm des
9292
- **Description**: Distance metric for similarity computation
9393
- **Recommendations**:
9494
- **L2**: For Euclidean distance (most common)
95-
- **INNER_PRODUCT**: For dot product similarity (use with normalized vectors)
95+
- **INNER_PRODUCT**: For dot product similarity
9696
- **COSINE**: For cosine similarity
9797

98-
#### `vector.normalize`
99-
- **Default**: false
100-
- **Description**: Whether to L2-normalize vectors before indexing/searching
101-
- **Recommendations**:
102-
- **true**: When using COSINE metric or when vectors have varying magnitudes
103-
- **false**: When vectors are already normalized or using L2 metric
104-
10598
### 4. Index Organization
10699

107100
#### `vector.size-per-index`

paimon-diskann/paimon-diskann-index/src/main/java/org/apache/paimon/diskann/index/DiskAnnIndex.java

Lines changed: 13 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,12 @@ public class DiskAnnIndex implements Closeable {
3535

3636
private final Index index;
3737
private final int dimension;
38-
private final DiskAnnVectorMetric metric;
39-
private final DiskAnnIndexType indexType;
40-
private final int maxDegree;
4138
private final int buildListSize;
4239
private volatile boolean closed = false;
4340

44-
private DiskAnnIndex(
45-
Index index,
46-
int dimension,
47-
DiskAnnVectorMetric metric,
48-
DiskAnnIndexType indexType,
49-
int maxDegree,
50-
int buildListSize) {
41+
private DiskAnnIndex(Index index, int dimension, int buildListSize) {
5142
this.index = index;
5243
this.dimension = dimension;
53-
this.metric = metric;
54-
this.indexType = indexType;
55-
this.maxDegree = maxDegree;
5644
this.buildListSize = buildListSize;
5745
}
5846

@@ -65,7 +53,7 @@ public static DiskAnnIndex create(
6553
MetricType metricType = metric.toMetricType();
6654
Index index =
6755
Index.create(dimension, metricType, indexType.value(), maxDegree, buildListSize);
68-
return new DiskAnnIndex(index, dimension, metric, indexType, maxDegree, buildListSize);
56+
return new DiskAnnIndex(index, dimension, buildListSize);
6957
}
7058

7159
public void addWithIds(ByteBuffer vectorBuffer, ByteBuffer idBuffer, int n) {
@@ -85,62 +73,22 @@ public void build() {
8573
index.build(buildListSize);
8674
}
8775

88-
public void search(
89-
float[] queryVectors,
90-
int n,
91-
int k,
92-
int searchListSize,
93-
float[] distances,
94-
long[] labels) {
95-
ensureOpen();
96-
if (queryVectors.length < n * dimension) {
97-
throw new IllegalArgumentException(
98-
"Query vectors array too small: required "
99-
+ (n * dimension)
100-
+ ", got "
101-
+ queryVectors.length);
102-
}
103-
if (distances.length < n * k) {
104-
throw new IllegalArgumentException(
105-
"Distances array too small: required " + (n * k) + ", got " + distances.length);
106-
}
107-
if (labels.length < n * k) {
108-
throw new IllegalArgumentException(
109-
"Labels array too small: required " + (n * k) + ", got " + labels.length);
110-
}
111-
index.search(n, queryVectors, k, searchListSize, distances, labels);
112-
}
113-
114-
public long size() {
115-
ensureOpen();
116-
return index.getCount();
117-
}
118-
119-
public int dimension() {
120-
return dimension;
121-
}
122-
123-
public DiskAnnVectorMetric metric() {
124-
return metric;
125-
}
126-
127-
public DiskAnnIndexType indexType() {
128-
return indexType;
129-
}
130-
131-
public int maxDegree() {
132-
return maxDegree;
133-
}
134-
135-
public int buildListSize() {
136-
return buildListSize;
137-
}
138-
76+
/** Return the number of bytes needed for serialization. */
13977
public long serializeSize() {
14078
ensureOpen();
14179
return index.serializeSize();
14280
}
14381

82+
/**
83+
* Serialize this index with its Vamana graph adjacency lists into the given direct ByteBuffer.
84+
*
85+
* <p>The serialized data is later split into an index file (header + graph) and a data file
86+
* (raw vectors) by the writer, then loaded by {@link DiskAnnVectorGlobalIndexReader} for
87+
* search.
88+
*
89+
* @param buffer a direct ByteBuffer of at least {@link #serializeSize()} bytes
90+
* @return the number of bytes written
91+
*/
14492
public long serialize(ByteBuffer buffer) {
14593
ensureOpen();
14694
if (!buffer.isDirect()) {
@@ -149,27 +97,6 @@ public long serialize(ByteBuffer buffer) {
14997
return index.serialize(buffer);
15098
}
15199

152-
public static DiskAnnIndex deserialize(byte[] data, DiskAnnVectorMetric metric) {
153-
Index index = Index.deserialize(data);
154-
return new DiskAnnIndex(
155-
index, index.getDimension(), metric, DiskAnnIndexType.UNKNOWN, 64, 100);
156-
}
157-
158-
/**
159-
* Reset the index (remove all vectors).
160-
*
161-
* <p>Note: This is not supported in the current implementation. DiskANN indices are immutable
162-
* once built. To "reset", you must create a new index.
163-
*
164-
* @throws UnsupportedOperationException always, as reset is not currently supported
165-
*/
166-
public void reset() {
167-
throw new UnsupportedOperationException(
168-
"Reset is not supported for DiskANN indices. "
169-
+ "DiskANN indices are immutable once built. "
170-
+ "Please create a new index instead.");
171-
}
172-
173100
public static ByteBuffer allocateVectorBuffer(int numVectors, int dimension) {
174101
return ByteBuffer.allocateDirect(numVectors * dimension * Float.BYTES)
175102
.order(ByteOrder.nativeOrder());

paimon-diskann/paimon-diskann-index/src/main/java/org/apache/paimon/diskann/index/DiskAnnIndexMeta.java

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,17 @@
2525
import java.io.IOException;
2626
import java.io.Serializable;
2727

28-
/** Metadata for DiskANN vector index. */
28+
/**
29+
* Metadata for DiskANN vector index.
30+
*
31+
* <p>Stores the file names of the companion files that live alongside the index file:
32+
*
33+
* <ul>
34+
* <li>{@link #dataFileName()} — raw vector data file
35+
* <li>{@link #pqPivotsFileName()} — PQ codebook (pivots)
36+
* <li>{@link #pqCompressedFileName()} — PQ compressed codes
37+
* </ul>
38+
*/
2939
public class DiskAnnIndexMeta implements Serializable {
3040

3141
private static final long serialVersionUID = 1L;
@@ -38,31 +48,29 @@ public class DiskAnnIndexMeta implements Serializable {
3848
private final long numVectors;
3949
private final long minId;
4050
private final long maxId;
51+
private final String dataFileName;
52+
private final String pqPivotsFileName;
53+
private final String pqCompressedFileName;
4154

4255
public DiskAnnIndexMeta(
43-
int dim, int metricValue, int indexTypeValue, long numVectors, long minId, long maxId) {
56+
int dim,
57+
int metricValue,
58+
int indexTypeValue,
59+
long numVectors,
60+
long minId,
61+
long maxId,
62+
String dataFileName,
63+
String pqPivotsFileName,
64+
String pqCompressedFileName) {
4465
this.dim = dim;
4566
this.metricValue = metricValue;
4667
this.indexTypeValue = indexTypeValue;
4768
this.numVectors = numVectors;
4869
this.minId = minId;
4970
this.maxId = maxId;
50-
}
51-
52-
public int dim() {
53-
return dim;
54-
}
55-
56-
public int metricValue() {
57-
return metricValue;
58-
}
59-
60-
public int indexTypeValue() {
61-
return indexTypeValue;
62-
}
63-
64-
public long numVectors() {
65-
return numVectors;
71+
this.dataFileName = dataFileName;
72+
this.pqPivotsFileName = pqPivotsFileName;
73+
this.pqCompressedFileName = pqCompressedFileName;
6674
}
6775

6876
public long minId() {
@@ -73,6 +81,11 @@ public long maxId() {
7381
return maxId;
7482
}
7583

84+
/** The file name of the separate vector data file. */
85+
public String dataFileName() {
86+
return dataFileName;
87+
}
88+
7689
/** Serialize metadata to byte array. */
7790
public byte[] serialize() throws IOException {
7891
ByteArrayOutputStream baos = new ByteArrayOutputStream();
@@ -84,6 +97,9 @@ public byte[] serialize() throws IOException {
8497
out.writeLong(numVectors);
8598
out.writeLong(minId);
8699
out.writeLong(maxId);
100+
out.writeUTF(dataFileName);
101+
out.writeUTF(pqPivotsFileName);
102+
out.writeUTF(pqCompressedFileName);
87103
out.flush();
88104
return baos.toByteArray();
89105
}
@@ -101,6 +117,18 @@ public static DiskAnnIndexMeta deserialize(byte[] data) throws IOException {
101117
long numVectors = in.readLong();
102118
long minId = in.readLong();
103119
long maxId = in.readLong();
104-
return new DiskAnnIndexMeta(dim, metricValue, indexTypeValue, numVectors, minId, maxId);
120+
String dataFileName = in.readUTF();
121+
String pqPivotsFileName = in.readUTF();
122+
String pqCompressedFileName = in.readUTF();
123+
return new DiskAnnIndexMeta(
124+
dim,
125+
metricValue,
126+
indexTypeValue,
127+
numVectors,
128+
minId,
129+
maxId,
130+
dataFileName,
131+
pqPivotsFileName,
132+
pqCompressedFileName);
105133
}
106134
}

paimon-diskann/paimon-diskann-index/src/main/java/org/apache/paimon/diskann/index/DiskAnnIndexType.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@
2020

2121
/** DiskANN index type. */
2222
public enum DiskAnnIndexType {
23-
MEMORY(0),
24-
DISK(1),
25-
UNKNOWN(-1);
23+
MEMORY(0);
2624

2725
private final int value;
2826

0 commit comments

Comments
 (0)