-
Notifications
You must be signed in to change notification settings - Fork 25.7k
Implement v_hamming #132959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement v_hamming #132959
Changes from 8 commits
3cd12e9
d82eaeb
615dcb8
ed5a7a8
1c38f15
95d2ea7
d303e29
e427c8b
d3ac81b
20c7495
eb2c822
d7cc153
4780604
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 132959 | ||
| summary: Implement `v_hamming` | ||
| area: ES|QL | ||
| type: feature | ||
| issues: [132056] | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| # Tests for hamming similarity function | ||
|
|
||
| similarityWithVectorField | ||
| required_capability: hamming_vector_similarity_function | ||
|
|
||
| // tag::vector-hamming[] | ||
| from colors | ||
| | eval similarity = v_hamming(rgb_vector, [0, 255, 255]) | ||
| | sort similarity desc, color asc | ||
| // end::vector-hamming[] | ||
| | limit 10 | ||
| | keep color, similarity | ||
| ; | ||
|
|
||
| // tag::vector-hamming-result[] | ||
| color:text | similarity:double | ||
| cyan | 1.0 | ||
| azure | 0.8333333134651184 | ||
| blue | 0.6666666865348816 | ||
| honeydew | 0.6666666865348816 | ||
| lime | 0.6666666865348816 | ||
| mint cream | 0.6666666865348816 | ||
| white | 0.6666666865348816 | ||
| thistle | 0.625 | ||
| lavender | 0.5833333134651184 | ||
| aqua marine | 0.5416666865348816 | ||
| // end::vector-hamming-result[] | ||
| ; | ||
|
|
||
| similarityAsPartOfExpression | ||
| required_capability: hamming_vector_similarity_function | ||
|
|
||
| from colors | ||
| | eval score = round((1 + v_hamming(rgb_vector, [0, 255, 255]) / 2), 3) | ||
| | sort score desc, color asc | ||
| | limit 10 | ||
| | keep color, score | ||
| ; | ||
|
|
||
| color:text | score:double | ||
| cyan | 1.5 | ||
| azure | 1.417 | ||
| blue | 1.333 | ||
| honeydew | 1.333 | ||
| lime | 1.333 | ||
| mint cream | 1.333 | ||
| white | 1.333 | ||
| thistle | 1.313 | ||
| lavender | 1.292 | ||
| aqua marine | 1.271 | ||
| ; | ||
|
|
||
| similarityWithLiteralVectors | ||
| required_capability: hamming_vector_similarity_function | ||
|
|
||
| row a = 1 | ||
| | eval similarity = round(v_hamming([1, 2, 3], [0, 1, 2]), 3) | ||
| | keep similarity | ||
| ; | ||
|
|
||
| similarity:double | ||
| 0.833 | ||
| ; | ||
|
|
||
| similarityWithStats | ||
| required_capability: hamming_vector_similarity_function | ||
|
|
||
| from colors | ||
| | eval similarity = round(v_hamming(rgb_vector, [0, 255, 255]), 3) | ||
| | stats avg = round(avg(similarity), 3), min = min(similarity), max = max(similarity) | ||
| ; | ||
|
|
||
| avg:double | min:double | max:double | ||
| 0.445 | 0.0 | 1.0 | ||
| ; | ||
|
|
||
| similarityWithNull | ||
| required_capability: hamming_vector_similarity_function | ||
| required_capability: vector_similarity_functions_support_null | ||
|
|
||
| from colors | ||
| | eval similarity = v_hamming(rgb_vector, null) | ||
| | stats total_null = count(*) where similarity is null | ||
| ; | ||
|
|
||
| total_null:long | ||
| 59 | ||
| ; | ||
|
|
||
| # TODO Need to implement a conversion function to convert a non-foldable row to a dense_vector | ||
| # The row source only exposes `vector`, so the query must not reference `color`. | ||
| # Expected value matches similarityWithLiteralVectors, which scores the same two vectors. | ||
| similarityWithRow-Ignore | ||
| required_capability: hamming_vector_similarity_function | ||
|
|
||
| row vector = [1, 2, 3] | ||
| | eval similarity = round(v_hamming(vector, [0, 1, 2]), 3) | ||
| | keep similarity | ||
| ; | ||
|
|
||
| similarity:double | ||
| 0.833 | ||
| ; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the Elastic License | ||
| * 2.0; you may not use this file except in compliance with the Elastic License | ||
| * 2.0. | ||
| */ | ||
|
|
||
| package org.elasticsearch.xpack.esql.expression.function.vector; | ||
|
|
||
| import org.apache.lucene.util.VectorUtil; | ||
| import org.elasticsearch.common.io.stream.NamedWriteableRegistry; | ||
| import org.elasticsearch.common.io.stream.StreamInput; | ||
| import org.elasticsearch.xpack.esql.core.expression.Expression; | ||
| import org.elasticsearch.xpack.esql.core.expression.function.scalar.BinaryScalarFunction; | ||
| import org.elasticsearch.xpack.esql.core.tree.NodeInfo; | ||
| import org.elasticsearch.xpack.esql.core.tree.Source; | ||
| import org.elasticsearch.xpack.esql.expression.function.Example; | ||
| import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesTo; | ||
| import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecycle; | ||
| import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; | ||
| import org.elasticsearch.xpack.esql.expression.function.Param; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| public class Hamming extends VectorSimilarityFunction { | ||
|
|
||
| public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Hamming", Hamming::new); | ||
| static final SimilarityEvaluatorFunction SIMILARITY_FUNCTION = Hamming::calculateSimilarity; | ||
|
|
||
| @FunctionInfo( | ||
| returnType = "double", | ||
| preview = true, | ||
| description = "Calculates the hamming distance between two dense_vectors.", | ||
|
||
| examples = { @Example(file = "vector-hamming", tag = "vector-hamming") }, | ||
| appliesTo = { @FunctionAppliesTo(lifeCycle = FunctionAppliesToLifecycle.DEVELOPMENT) } | ||
svilen-mihaylov-elastic marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ) | ||
| public Hamming( | ||
| Source source, | ||
| @Param( | ||
| name = "left", | ||
| type = { "dense_vector" }, | ||
| description = "first dense_vector to calculate hamming distance between" | ||
|
||
| ) Expression left, | ||
| @Param( | ||
| name = "right", | ||
| type = { "dense_vector" }, | ||
| description = "second dense_vector to calculate hamming distance between" | ||
| ) Expression right | ||
| ) { | ||
| super(source, left, right); | ||
| } | ||
|
|
||
| private Hamming(StreamInput in) throws IOException { | ||
| super(in); | ||
| } | ||
|
|
||
| @Override | ||
| protected SimilarityEvaluatorFunction getSimilarityFunction() { | ||
| return SIMILARITY_FUNCTION; | ||
| } | ||
|
|
||
| @Override | ||
| protected BinaryScalarFunction replaceChildren(Expression newLeft, Expression newRight) { | ||
| return new Hamming(source(), newLeft, newRight); | ||
| } | ||
|
|
||
| @Override | ||
| protected NodeInfo<? extends Expression> info() { | ||
| return NodeInfo.create(this, Hamming::new, left(), right()); | ||
| } | ||
|
|
||
| @Override | ||
| public String getWriteableName() { | ||
| return ENTRY.name; | ||
| } | ||
|
|
||
| public static float calculateSimilarity(float[] leftScratch, float[] rightScratch) { | ||
| byte[] a = new byte[leftScratch.length]; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Core of change. We assume here the floats as in range (0, 256), convert to byte vectors, and do the same as in ES815BitFlatVectorsFormat
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Per feedback, returning raw distance, not normalized between 0.0 and 1.0 as above. |
||
| byte[] b = new byte[rightScratch.length]; | ||
| for (int i = 0; i < leftScratch.length; i++) { | ||
| a[i] = (byte) leftScratch[i]; | ||
| } | ||
| for (int i = 0; i < leftScratch.length; i++) { | ||
| b[i] = (byte) rightScratch[i]; | ||
| } | ||
| return ((a.length * Byte.SIZE) - VectorUtil.xorBitCount(a, b)) / (float) (a.length * Byte.SIZE); | ||
|
||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.