Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions SConstruct
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,21 @@ add_option('disable-warnings-as-errors',
nargs=0,
)

# Build switches for the distance aggregation expressions.
# - 'distance-expression-non-bson' defines DISTANCE_EXPRESSION_NOT_BSON.
# - 'distance-expression-use-avx2' / '-use-avx512' select the SIMD flavour of
#   the BSON (binary data) implementation.
add_option('distance-expression-non-bson',
    help="Do not use BSON binary data to pass the values to the distance expressions; "
         "pass regular arrays of numbers instead",
    nargs=0,
)

add_option('distance-expression-use-avx2',
    help="Build the BSON distance expressions with AVX2 instructions",
    nargs=0,
)

add_option('distance-expression-use-avx512',
    help="Build the BSON distance expressions with AVX-512 instructions",
    nargs=0,
)

add_option('detect-odr-violations',
help="Have the linker try to detect ODR violations, if supported",
nargs=0,
Expand Down Expand Up @@ -1813,6 +1828,23 @@ if env.TargetOSIs('posix'):
if env.TargetOSIs('linux', 'darwin', 'solaris'):
if not has_option("disable-warnings-as-errors"):
env.Append( CCFLAGS=["-Werror"] )
# Compiler flags for the distance expressions. Exactly one flavour is
# selected: the non-BSON build wins over everything else, and AVX-512 takes
# precedence over AVX2 when both options are given. With none of the options
# set no extra flag is added.
if has_option("distance-expression-non-bson"):
    env.Append( CCFLAGS=["-DDISTANCE_EXPRESSION_NOT_BSON"] )
elif has_option("distance-expression-use-avx512"):
    env.Append( CCFLAGS=[
        "-march=skylake-avx512",
        "-mtune=skylake-avx512",
        "-fopenmp",
        "-O3",
        "-DUSE_AVX512",
    ] )
elif has_option("distance-expression-use-avx2"):
    env.Append( CCFLAGS=[
        "-mavx2",
        "-march=haswell",
        "-mtune=intel",
        "-fopenmp",
        "-O3",
        "-DUSE_AVX2",
    ] )

env.Append( CXXFLAGS=["-Woverloaded-virtual"] )
if env.ToolchainIs('clang'):
Expand Down
3 changes: 3 additions & 0 deletions build_with_avx2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
#
# Builds mongod, mongo and mongos in release mode with the AVX2 flavour of the
# distance expressions (--distance-expression-use-avx2).
#
# Usage:       ./build_with_avx2.sh [extra scons arguments...]
# Environment: JOBS - number of parallel build jobs (default: 8)

python2 buildscripts/scons.py mongod mongo mongos --disable-warnings-as-errors -j"${JOBS:-8}" --release --distance-expression-use-avx2 "$@"
65 changes: 65 additions & 0 deletions pytests/distancetest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Distance expression test.

from __future__ import print_function
from builtins import range
from pymongo import MongoClient
import numpy as np
import random
import time
import string
from bson import binary


# Client created with PyMongo's default connection settings, and the database
# that holds the benchmark collection.
client = MongoClient()
db = client["test_speed"]

# Benchmark parameters.
vec_size = 4096          # float32 components per vector
num_documents = 150000   # documents inserted when fill_data_base is set
fill_data_base = True    # drop and repopulate the collection before timing
iterations = 10          # timed queries per distance operator
functions = ['no_op', 'cossim', 'chi2', 'euclidean', 'squared_euclidean', 'manhattan', 'no_op']

# One random float32 query vector per iteration, wrapped as BSON binary data.
vec = [
    binary.Binary(np.random.rand(vec_size).astype(np.float32).tobytes())
    for _ in range(iterations)
]


# Optionally rebuild the benchmark collection: drop any previous data and
# insert num_documents documents, each carrying a random float32 vector stored
# as BSON binary data.
if fill_data_base:
    print("load database")
    db.test_speed.drop()
    for i in range(num_documents):
        # Progress indicator every 1000 documents.
        if i % 1000 == 0:
            print(i)
        # insert_one replaces the deprecated Collection.insert.
        db.test_speed.insert_one({
            "id": random.randint(0, 1000000),
            "other_id": ''.join(np.random.choice(list(string.ascii_uppercase)) for _ in range(6)),
            "vector": binary.Binary(np.random.rand(vec_size).astype(np.float32).tobytes())
        })

# count_documents({}) replaces the deprecated Collection.count.
print("database loaded", db.test_speed.count_documents({}))

# Duration in seconds of each timed query; the buffer is overwritten for every
# operator, so the statistics below always describe the current operator only.
times_aggregate_base = np.zeros([iterations, 1], dtype=np.float32)

# Report lines paired with the statistic they display.
summary_lines = (
    (" - average: {:.5f}ms", np.mean),
    (" - std: {:.5f}ms", np.std),
    (" - max: {:.5f}ms", np.max),
    (" - min: {:.5f}ms", np.min),
    (" - median: {:.5f}ms", np.median),
)

for function in functions:
    for index in range(iterations):
        start = time.time()
        # Compute the distance between the query vector and every stored
        # vector, then keep the 20 documents with the largest distance.
        projection = {
            'id': '$id',
            "other_id": '$other_id',
            'distance': {'${}'.format(function): [vec[index], '$vector']},
        }
        result = db.test_speed.aggregate([
            {'$project': projection},
            {"$sort": {"distance": -1}},
            {"$limit": 20}
        ])
        # Materialise the cursor so the full query is included in the timing.
        selection = list(result)
        times_aggregate_base[index] = time.time() - start

    print("Aggregate distance {}:".format(function))
    for template, statistic in summary_lines:
        print(template.format(statistic(times_aggregate_base) * 1000))
21 changes: 21 additions & 0 deletions src/mongo/db/db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,27 @@ ExitCode _initAndListen(int listenPort) {
l << (is32bit ? " 32" : " 64") << "-bit host=" << getHostNameCached() << endl;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Những

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain what your intention is with this line, @anhems?
It seems completely unrelated to this PR.

Best,
Miguel

}

{
    // Log which flavour of the distance expressions this binary was compiled
    // with, based on the build-time macros DISTANCE_EXPRESSION_NOT_BSON,
    // USE_AVX512 and USE_AVX2.
    LogstreamBuilder l = log(LogComponent::kControl);

    l << "Distance Expression Info: ";

#if defined(DISTANCE_EXPRESSION_NOT_BSON)
    l << "Not using BSON";
#else
    l << "Using BSON";
    // The macros are tested with defined(), matching the guard used by the
    // distance expression sources; AVX512 is reported in preference to AVX2.
#if defined(USE_AVX512)
    l << " - With AVX512";
#elif defined(USE_AVX2)
    l << " - With AVX2";
#endif
#endif

    l << std::endl;
}

DEV log(LogComponent::kControl) << "DEBUG build (which is slower)" << endl;

#if defined(_WIN32)
Expand Down
4 changes: 4 additions & 0 deletions src/mongo/db/pipeline/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ env.Library(
source=[
'expression.cpp',
'expression_trigonometric.cpp',
'expression_distance.cpp',
'expression_distance_avx2.cpp',
'expression_distance_avx512.cpp',
'expression_distance_non_bson.cpp',
],
LIBDEPS=[
'$BUILD_DIR/mongo/db/query/datetime/date_time_support',
Expand Down
95 changes: 95 additions & 0 deletions src/mongo/db/pipeline/expression_distance.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#if !defined(DISTANCE_EXPRESSION_NOT_BSON) && !defined(USE_AVX2) && !defined(USE_AVX512)

#include "mongo/db/pipeline/expression_distance.h"

namespace mongo {

// Associate each distance operator key with the parse function of its
// expression class. This translation unit is only compiled when none of
// DISTANCE_EXPRESSION_NOT_BSON, USE_AVX2 and USE_AVX512 is defined (see the
// guard at the top of the file).
// NOTE(review): REGISTER_EXPRESSION is defined elsewhere; presumably the
// operator is exposed as "$<key>" — confirm against expression.h.
REGISTER_EXPRESSION(euclidean, ExpressionEuclidean::parse)
REGISTER_EXPRESSION(cossim, ExpressionCosineSimilarity::parse)
REGISTER_EXPRESSION(chi2, ExpressionChi2::parse)
REGISTER_EXPRESSION(squared_euclidean, ExpressionSquaredEuclidean::parse)
REGISTER_EXPRESSION(manhattan, ExpressionManhattan::parse)
REGISTER_EXPRESSION(no_op, ExpressionNoOp::parse)

/* ------------------------- ExpressionEuclidean ----------------------------- */

/**
 * Euclidean distance between two float arrays of p_uiSize elements: the square
 * root of the sum of squared element-wise differences. The sum is accumulated
 * in single precision and the result is widened to double.
 */
Value ExpressionEuclidean::evaluateImpl(const float* p_pData1,
                                        const float* p_pData2,
                                        const size_t p_uiSize) const {
    float sumOfSquares = 0.0f;
    for (size_t idx = 0; idx != p_uiSize; ++idx) {
        const float delta = p_pData1[idx] - p_pData2[idx];
        sumOfSquares += delta * delta;
    }

    return Value(static_cast<double>(std::sqrt(sumOfSquares)));
}

/* ------------------------- ExpressionCosineSimilarity ----------------------------- */

/**
 * Returns 1 - dot(a, b) / (sqrt(|a|^2 * |b|^2) + FLT_MIN) for two float arrays
 * of p_uiSize elements, i.e. one minus their cosine similarity. All sums are
 * accumulated in single precision; the result is widened to double.
 */
Value ExpressionCosineSimilarity::evaluateImpl(const float* p_pData1,
                                               const float* p_pData2,
                                               const size_t p_uiSize) const {
    float dotProduct = 0.0f;
    float squaredNormLeft = 0.0f;
    float squaredNormRight = 0.0f;

    for (size_t idx = 0; idx != p_uiSize; ++idx) {
        const float left = p_pData1[idx];
        const float right = p_pData2[idx];

        dotProduct += left * right;
        squaredNormLeft += left * left;
        squaredNormRight += right * right;
    }

    // FLT_MIN keeps the denominator non-zero when either vector is all zeros.
    const float denominator = std::sqrt(squaredNormLeft * squaredNormRight) + FLT_MIN;
    const float distance = 1 - (dotProduct / denominator);
    return Value(static_cast<double>(distance));
}

/* ------------------------- ExpressionChi2 ----------------------------- */

/**
 * Sums (a - b)^2 / (a + b + FLT_MIN) over the p_uiSize element pairs of the
 * two float arrays. The sum is accumulated in single precision and widened to
 * double on return.
 */
Value ExpressionChi2::evaluateImpl(const float* p_pData1,
                                   const float* p_pData2,
                                   const size_t p_uiSize) const {
    float total = 0.0f;
    for (size_t idx = 0; idx != p_uiSize; ++idx) {
        const float left = p_pData1[idx];
        const float right = p_pData2[idx];
        const float sum = left + right;
        const float delta = left - right;

        // FLT_MIN keeps the denominator non-zero when the pair sums to zero.
        total += (delta * delta) / (sum + FLT_MIN);
    }

    return Value(static_cast<double>(total));
}

/* ------------------------- ExpressionSquaredEuclidean ----------------------------- */

/**
 * Sum of squared element-wise differences between two float arrays of
 * p_uiSize elements (the Euclidean distance without the final square root).
 * Accumulated in single precision and widened to double on return.
 */
Value ExpressionSquaredEuclidean::evaluateImpl(const float* p_pData1,
                                               const float* p_pData2,
                                               const size_t p_uiSize) const {
    float sumOfSquares = 0.0f;
    for (size_t idx = 0; idx != p_uiSize; ++idx) {
        const float delta = p_pData1[idx] - p_pData2[idx];
        sumOfSquares += delta * delta;
    }

    return Value(static_cast<double>(sumOfSquares));
}

/* ------------------------- ExpressionManhattan ----------------------------- */

/**
 * Sum of absolute element-wise differences between two float arrays of
 * p_uiSize elements. Accumulated in single precision and widened to double on
 * return.
 */
Value ExpressionManhattan::evaluateImpl(const float* p_pData1,
                                        const float* p_pData2,
                                        const size_t p_uiSize) const {
    float total = 0.0f;

    for (size_t idx = 0; idx != p_uiSize; ++idx) {
        total += std::fabs(p_pData1[idx] - p_pData2[idx]);
    }

    return Value(static_cast<double>(total));
}

/* ------------------------- ExpressionNoOp ----------------------------- */

/**
 * Benchmark-only implementation: ignores both operands and always returns 0.0.
 *
 * Defined as an ordinary (non-inline) function like the other evaluateImpl
 * overrides in this file: it is a virtual function declared without `inline`
 * in the header, so the only definition must be a normal out-of-line one.
 */
Value ExpressionNoOp::evaluateImpl(const float* /*p_pData1*/,
                                   const float* /*p_pData2*/,
                                   const size_t /*p_uiSize*/) const {
    return Value( 0.0 );
}

}

#endif
124 changes: 124 additions & 0 deletions src/mongo/db/pipeline/expression_distance.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@

#pragma once

#include "mongo/db/pipeline/expression.h"

namespace mongo {

// DISTANCE_EVALUATE_IMPL_PROTO(type) expands to the declaration of the
// evaluateImpl() override placed inside each class generated by
// DECLARE_DISTANCE_EXPRESSION. Its signature depends on the build mode.
//
// When not using BSON we will use vectors: both operands are passed as
// std::vector<Value>, and the `type` argument is not used by the expansion.
#ifdef DISTANCE_EXPRESSION_NOT_BSON
#define DISTANCE_EVALUATE_IMPL_PROTO(type) \
Value evaluateImpl( \
const std::vector<Value>& vector1, \
const std::vector<Value>& vector2) const override final;
#else
// When using BSON we will get a pointer: both operands are passed as
// `const type*` together with the number of elements.
#define DISTANCE_EVALUATE_IMPL_PROTO(type) \
Value evaluateImpl( \
const type* vector1, \
const type* vector2, const size_t size) const override final;
#endif

// DECLARE_DISTANCE_EXPRESSION(key, class_name, type, error) defines the final
// class `class_name`, derived from ExpressionDistance<class_name, type, error>:
//   - getOpName() returns the string "$" followed by `key`;
//   - getOperandList() exposes the inherited vpOperand member;
//   - evaluateImpl() is only declared here (via DISTANCE_EVALUATE_IMPL_PROTO);
//     its definition has to be provided in a source file.
// `type` is the element type of the operands and `error` is forwarded to
// ExpressionDistance as its third template argument.
#define DECLARE_DISTANCE_EXPRESSION(key, class_name, type, error) \
class class_name final \
: public ExpressionDistance<class_name, type, error> { \
public: \
explicit class_name(const boost::intrusive_ptr<ExpressionContext>& expCtx) \
: ExpressionDistance<class_name, type, error>(expCtx) {} \
const char* getOpName() const final { \
return "$" #key; \
} \
protected: \
const std::vector<boost::intrusive_ptr<Expression>>& getOperandList() const final { \
return vpOperand; \
} \
DISTANCE_EVALUATE_IMPL_PROTO(type) \
};

/**
 * Template class for defining a distance expression.
 *
 * evaluate() checks that exactly two operands were given, evaluates both
 * against the document, validates them and forwards them to evaluateImpl(),
 * which each concrete expression implements.
 *
 * Template parameters:
 *   SubClass  - the concrete expression class (passed on to ExpressionNaryBase).
 *   T         - element type of the operands in the BSON (binary data) build.
 *   ErrorCode - assertion code for a wrong operand count; ErrorCode + 1000 is
 *               used when the two binary operands differ in length. The
 *               parameter is deliberately not called ERROR, which is a macro
 *               in the Windows headers.
 *
 * Build modes:
 *   - default: both operands must be BinData of equal byte length; they are
 *     reinterpreted in place as arrays of T.
 *   - DISTANCE_EXPRESSION_NOT_BSON: both operands must be arrays of equal size.
 */
template <class SubClass, typename T, long ErrorCode>
class ExpressionDistance : public ExpressionNaryBase<SubClass> {
public:
    explicit ExpressionDistance(const boost::intrusive_ptr<ExpressionContext>& expCtx)
        : ExpressionNaryBase<SubClass>(expCtx) {}

    Value evaluate(const Document& root) const final {
        const auto& vpOperand = getOperandList();
        const size_t n = vpOperand.size();

        // The operator name is only streamed on the error paths, so no string
        // is built per evaluated document.
        if (n != 2) {
            uasserted(ErrorCode,
                      str::stream() << getOpName() << " only suppports 2 expressions, not " << n);
        }

        const Value value1 = vpOperand[0]->evaluate(root);
        const Value value2 = vpOperand[1]->evaluate(root);

#ifndef DISTANCE_EXPRESSION_NOT_BSON
        // Reject non-binary operands with a descriptive error (mirroring the
        // array checks of the non-BSON build) before calling getBinData().
        if (value1.getType() != BSONType::BinData) {
            uasserted(ErrorCodes::FailedToParse,
                      str::stream() << getOpName()
                                    << " only supports binary data on 1st expression, not "
                                    << typeName(value1.getType()));
        }

        if (value2.getType() != BSONType::BinData) {
            uasserted(ErrorCodes::FailedToParse,
                      str::stream() << getOpName()
                                    << " only supports binary data on 2nd expression, not "
                                    << typeName(value2.getType()));
        }

        const BSONBinData vector1 = value1.getBinData();
        const BSONBinData vector2 = value2.getBinData();

        if (vector1.length != vector2.length) {
            uasserted(ErrorCode + 1000L,
                      str::stream() << getOpName() << " both operands must have the same length.");
        }

        // NOTE(review): the payload is reinterpreted in place. This assumes the
        // buffer is suitably aligned for T; bytes beyond the last whole T are
        // ignored by the integer division below — confirm both are acceptable.
        const T* pData1 = reinterpret_cast<const T*>(vector1.data);
        const T* pData2 = reinterpret_cast<const T*>(vector2.data);

        return evaluateImpl(pData1, pData2, vector1.length / sizeof(T));
#else
        if (!value1.isArray()) {
            uasserted(ErrorCodes::FailedToParse,
                      str::stream() << getOpName()
                                    << " only supports array on 1st expression , not "
                                    << typeName(value1.getType()));
        }

        if (!value2.isArray()) {
            uasserted(ErrorCodes::FailedToParse,
                      str::stream() << getOpName()
                                    << " only supports array on 2nd expression, not "
                                    << typeName(value2.getType()));
        }

        const std::vector<Value>& vector1 = value1.getArray();
        const std::vector<Value>& vector2 = value2.getArray();

        if (vector1.size() != vector2.size()) {
            uasserted(ErrorCodes::FailedToParse,
                      str::stream() << getOpName() << " vectors of different sizes found "
                                    << vector1.size() << " " << vector2.size());
        }

        return evaluateImpl(vector1, vector2);
#endif
    }

    // A distance takes exactly two vectors and yields a scalar, so nesting is
    // not regrouping-safe: d(d(a, b), c) is not d(a, d(b, c)). The expression
    // must therefore not advertise itself as associative.
    // NOTE(review): the n-ary optimizer is expected to merge nested operands
    // and constants of associative expressions — confirm in ExpressionNary.
    bool isAssociative() const final {
        return false;
    }

    bool isCommutative() const final {
        return false;
    }

    virtual const char* getOpName() const = 0;

protected:
    // Gives evaluate() access to the operand list of the concrete expression.
    virtual const std::vector<boost::intrusive_ptr<Expression>>& getOperandList() const = 0;
#ifndef DISTANCE_EXPRESSION_NOT_BSON
    // Computes the distance between two arrays of p_uiSize elements of T.
    virtual Value evaluateImpl(const T* p_pData1, const T* p_pData2, const size_t p_uiSize) const = 0;
#else
    // Computes the distance between two equally sized arrays of Value.
    virtual Value evaluateImpl(const std::vector<Value>& vector1, const std::vector<Value>& vector2) const = 0;
#endif
};

// Concrete distance expressions, declared as
// (operator key, class name, element type, assertion code).
// NOTE(review): euclidean uses 9020 while every other expression uses a
// 9002x code — this looks like a typo for 90020; confirm the intended value
// (and that it is unused elsewhere) before changing it.
DECLARE_DISTANCE_EXPRESSION(euclidean, ExpressionEuclidean, float, 9020)
DECLARE_DISTANCE_EXPRESSION(cossim, ExpressionCosineSimilarity, float, 90021)
DECLARE_DISTANCE_EXPRESSION(chi2, ExpressionChi2, float, 90022)
DECLARE_DISTANCE_EXPRESSION(squared_euclidean, ExpressionSquaredEuclidean, float, 90023)
DECLARE_DISTANCE_EXPRESSION(manhattan, ExpressionManhattan, float, 90024)
// Only for benchmarking
DECLARE_DISTANCE_EXPRESSION(no_op, ExpressionNoOp, float, 90025)

}
Loading