diff --git a/.github/workflows/generate-llms-txt.yml b/.github/workflows/generate-llms-txt.yml
new file mode 100644
index 0000000000..5dc95262f6
--- /dev/null
+++ b/.github/workflows/generate-llms-txt.yml
@@ -0,0 +1,40 @@
+name: Generate llms.txt and llms-full.txt
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ '**' ]
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+permissions:
+  models: read
+
+jobs:
+  generate-llms:
+    permissions:
+      contents: write
+      # Job-level permissions replace the workflow-level ones, so models: read
+      # must be repeated here for the GitHub Models API call to be authorized.
+      models: read
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Setup Hugo
+        uses: peaceiris/actions-hugo@v2
+        with:
+          hugo-version: "0.148.2"
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          pip install requests openai
+      - name: Run the generation script
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python automation/generate-llms-txt.py
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          commit_message: "Update llms.txt and llms-full.txt"
+          commit_user_name: "GitHub Actions"
+          commit_user_email: "team@qdrant.com"
diff --git a/automation/generate-llms-txt.py b/automation/generate-llms-txt.py
new file mode 100644
index 0000000000..362323fb60
--- /dev/null
+++ b/automation/generate-llms-txt.py
@@ -0,0 +1,208 @@
+import csv
+import glob
+import subprocess
+import os
+import re
+import openai
+from typing import Iterable
+from dataclasses import dataclass
+
+
+BASE_DIR = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/../qdrant-landing")
+OUTPUT_DIR = BASE_DIR + "/static"
+GENERAL_DESCRIPTION = (
+    "Qdrant is a cutting-edge platform focused on delivering exceptional performance and efficiency in vector "
+    "similarity search. As a robust vector database, it specializes in managing, searching, and retrieving "
+    "high-dimensional vector data, essential for enhancing AI applications, machine learning, and modern search "
+    "engines. With a suite of powerful features such as state-of-the-art hybrid search capabilities, "
+    "retrieval-augmented generation (RAG) applications, and dense and sparse vector support, Qdrant stands out as an "
+    "industry leader. Its offerings include managed cloud services, enabling users to harness the robust functionality "
+    "of Qdrant without the burden of maintaining infrastructure. The platform supports advanced data security measures "
+    "and seamless integrations with popular platforms and frameworks, catering to diverse data handling and analytic "
+    "needs. Additionally, Qdrant offers comprehensive solutions for complex searching requirements through its "
+    "innovative Query API and multivector representations, allowing for precise matching and enhanced retrieval "
+    "quality. With its commitment to open-source principles and continuous innovation, Qdrant tailors solutions to "
+    "meet both small-scale projects and enterprise-level demands efficiently, helping organizations unlock profound "
+    "insights from their unstructured data and optimize their AI capabilities."
+)
+
+@dataclass
+class HugoContent:
+    path: str
+    absolute_url: str
+    title: str | None
+    content: str | None
+
+
+def sort_key(line: dict) -> int:
+    """
+    Calculate a score for a Hugo content entry based on the importance of its path.
+    The more important the path, the higher the score.
+    :param line: A dictionary representing a line from the CSV output of `hugo list published`.
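+        For example, a path under `documentation/concepts` matches both the `documentation/concepts` (10)
+        and `documentation` (5) boosts, giving a score of 15.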
+    :return: The cumulative path-boost score; entries with higher scores are processed first.
+    """
+    path_boosts = {
+        "documentation/concepts": 10,
+        "documentation/quickstart": 9,
+        "articles": 7,
+        "documentation": 5,
+        "blog": 3,
+    }
+    path = line.get("path", "")
+    score = sum(boost for key, boost in path_boosts.items() if key in path)
+    return score
+
+
+def load_frontmatter_and_content(raw_content: str) -> tuple[dict, str]:
+    """
+    Load the front matter and content from the raw content string.
+    The front matter is expected to be in YAML format and enclosed in `---` at the beginning of the content.
+    The content is everything after the front matter.
+    :param raw_content: The raw file content, optionally starting with a front matter block.
+    :return: A tuple of the parsed front matter dictionary and the remaining content.
+    """
+    frontmatter = dict()
+    if raw_content.startswith("---"):
+        end_index = raw_content.find("---", 3)
+        if end_index == -1:
+            # No closing delimiter, so treat the whole file as content
+            return frontmatter, raw_content.strip()
+        end_index += 3
+        raw_frontmatter = raw_content[:end_index].strip()
+        # Parse the front matter as a dictionary
+        for line in raw_frontmatter.splitlines()[1:-1]:
+            try:
+                key, value = line.split(":", 1)
+                frontmatter[key.strip()] = value.strip("\"' ")  # Remove quotes and whitespace
+            except ValueError:
+                # If the line doesn't contain a key-value pair, skip it
+                continue
+        # Remove the front matter from the content
+        content = raw_content[end_index:].strip()
+    else:
+        content = raw_content.strip()
+    return frontmatter, content
+
+
+def iter_hugo_content() -> Iterable[HugoContent]:
+    """
+    List the published content in Hugo.
+    :return: An iterable of HugoContent entries, ordered from the most to the least important path.
+    """
+    # Run the `hugo list published` command and capture its output.
+    cmd = ["hugo", "list", "published"]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Command failed with error: {result.stderr}")
+
+    # Parse the output to extract the paths. Output is expected to be in CSV format
+    # with the first line being a header. The first column contains the path.
+    csv_reader = csv.DictReader(result.stdout.splitlines())
+    lines = list(csv_reader)
+    for line in sorted(lines, key=sort_key, reverse=True):
+        path = line.get("path")
+        if not path:
+            continue
+
+        # Load the content of the file at the given path.
+        with open(os.path.join(BASE_DIR, path), "r", encoding="utf-8") as file:
+            frontmatter, content = load_frontmatter_and_content(file.read())
+        if not content:
+            continue
+
+        # Render the code snippets in the content.
+        # Example: {{< code-snippet path="/documentation/headless/snippets/create-collection/simple/" >}}
+        snippets_iter = re.finditer(r"{{<\s*code-snippet\s+path=\"([^\"]+)\"\s*>}}", content)
+        for snippet in snippets_iter:
+            snippet_dir = snippet.group(1)
+            snippet_files = glob.glob("content/" + snippet_dir.strip("/") + "/[a-z]*.md")
+            snippet_content = ""
+            for snippet_file in snippet_files:
+                with open(snippet_file, "r", encoding="utf-8") as f:
+                    snippet_content += f.read()
+                snippet_content += "\n"  # Add a newline between snippets
+            # Replace the code snippet placeholder with the actual content
+            content = content.replace(snippet.group(0), snippet_content.strip())
+
+        yield HugoContent(
+            path=path,
+            absolute_url=line.get("permalink"),
+            title=frontmatter.get("title"),
+            content=content,
+        )
+
+
+def summarize_content(content: str) -> str:
+    """
+    Generate a summary for the given content using an LLM.
+    Use GitHub Models as a provider for the LLM.
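+    The request is authenticated with the `GITHUB_TOKEN` environment variable, which the workflow
+    passes to the script.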
+    :param content: The page content to summarize.
+    :return: A short summary of the content, at most two sentences long.
+    """
+    # Truncate the content to a maximum of 8192 characters
+    content = content[:8192]
+
+    # Call the GitHub Models API to generate a summary
+    client = openai.OpenAI(
+        api_key=os.environ.get("GITHUB_TOKEN"),
+        base_url="https://models.github.ai/inference",
+    )
+    completions = client.chat.completions.create(
+        model="openai/gpt-4o",
+        messages=[
+            {
+                "role": "user",
+                "content": (
+                    "Please summarize the following content in a concise manner, "
+                    "focusing on the main points and key information. The summary should "
+                    "not be longer than 2 sentences:\n\n"
+                    f"{content}"
+                )
+            }
+        ]
+    )
+    summary = completions.choices[0].message.content.strip()
+    return summary
+
+
+def main():
+    """
+    List all the published content in Hugo and generate the llms.txt and llms-full.txt files.
+    llms-full.txt is rewritten from scratch, while llms.txt is only appended with newly discovered pages.
+    """
+    # Change the current working directory to the Hugo content directory
+    os.chdir(BASE_DIR)
+
+    # Load the current state of the llms.txt file to avoid duplicate entries
+    llms_txt_path = os.path.join(OUTPUT_DIR, "llms.txt")
+    existing_urls = set()
+    if os.path.exists(llms_txt_path):
+        with open(llms_txt_path, "r", encoding="utf-8") as llms_file:
+            existing_urls = {line.split("](")[1].split(")")[0] for line in llms_file if line.startswith("- [")}
+
+    # Load the paths to all the published content in Hugo and process them sequentially
+    # to generate the llms.txt and llms-full.txt files.
+    with (open(llms_txt_path, "a+", encoding="utf-8") as llms_file,
+          open(os.path.join(OUTPUT_DIR, "llms-full.txt"), "w", encoding="utf-8") as llms_full_file):
+
+        # Write the header for the full file
+        llms_full_file.write("# https://qdrant.tech/ llms-full.txt\n")
+        llms_full_file.write("## Overall Summary\n")
+        llms_full_file.write(f"> {GENERAL_DESCRIPTION}\n\n")
+
+        for page_counter, content in enumerate(iter_hugo_content(), start=1):
+            # Write the content to the full file.
+            # The <|page-{page_counter}-lllmstxt|> marker follows the format of the previously
+            # published llms-full.txt and is kept for backwards compatibility.
+            llms_full_file.write(f"<|page-{page_counter}-lllmstxt|>\n")
+            llms_full_file.write(content.content + "\n\n")
+
+            # Skip if there is no title, as we cannot generate a link without it
+            if not content.title:
+                continue
+
+            # Only append to the llms.txt file if the URL does not already exist
+            if content.absolute_url in existing_urls:
+                print(f"Skipping {content.title} ({content.absolute_url}) - already exists in llms.txt")
+                continue
+
+            content_summary = summarize_content(content.content)
+            llms_file.write(f"- [{content.title}]({content.absolute_url}): {content_summary}\n")
+            print(f"Processed {content.title} ({content.absolute_url})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/qdrant-landing/static/llms-full.txt b/qdrant-landing/static/llms-full.txt
index dfadf95875..c0e8a092dc 100644
--- a/qdrant-landing/static/llms-full.txt
+++ b/qdrant-landing/static/llms-full.txt
@@ -1,2918 +1,18028 @@
 # https://qdrant.tech/ llms-full.txt
-
 ## Overall Summary
-
 > Qdrant is a cutting-edge platform focused on delivering exceptional performance and efficiency in vector similarity search. As a robust vector database, it specializes in managing, searching, and retrieving high-dimensional vector data, essential for enhancing AI applications, machine learning, and modern search engines. 
With a suite of powerful features such as state-of-the-art hybrid search capabilities, retrieval-augmented generation (RAG) applications, and dense and sparse vector support, Qdrant stands out as an industry leader. Its offerings include managed cloud services, enabling users to harness the robust functionality of Qdrant without the burden of maintaining infrastructure. The platform supports advanced data security measures and seamless integrations with popular platforms and frameworks, catering to diverse data handling and analytic needs. Additionally, Qdrant offers comprehensive solutions for complex searching requirements through its innovative Query API and multivector representations, allowing for precise matching and enhanced retrieval quality. With its commitment to open-source principles and continuous innovation, Qdrant tailors solutions to meet both small-scale projects and enterprise-level demands efficiently, helping organizations unlock profound insights from their unstructured data and optimize their AI capabilities. <|page-1-lllmstxt|> -## backups -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- Backups - -# [Anchor](https://qdrant.tech/documentation/private-cloud/backups/\#backups) Backups - -To create a one-time backup, create a `QdrantClusterSnapshot` resource: - -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantClusterSnapshot -metadata: - name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp" - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - retention: 1h - -``` - -You can also create a recurring backup with the `QdrantClusterScheduledSnapshot` resource: - -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantClusterScheduledSnapshot -metadata: - name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp" - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - scheduleShortId: a7d8d973 - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - # every hour - schedule: "0 * * * *" - retention: 1h +# Concepts -``` +Think of these concepts as a glossary. Each of these concepts include a link to +detailed information, usually with examples. If you're new to AI, these concepts +can help you learn more about AI and the Qdrant approach. -To resture from a backup, create a `QdrantClusterRestore` resource: +## Collections -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantClusterRestore -metadata: - name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-restore-01" - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - source: - snapshotName: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp - namespace: qdrant-private-cloud - destination: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - namespace: qdrant-private-cloud +[Collections](/documentation/concepts/collections/) define a named set of points that you can use for your search. -``` +## Payload -Note that with all resources `cluster-id` and `customer-id` label must be set to the values of the corresponding `QdrantCluster` resource. +A [Payload](/documentation/concepts/payload/) describes information that you can store with vectors. -##### Was this page useful? 
+## Points -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +[Points](/documentation/concepts/points/) are a record which consists of a vector and an optional payload. -Thank you for your feedback! 🙏 +## Search -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/backups.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +[Search](/documentation/concepts/search/) describes _similarity search_, which set up related objects close to each other in vector space. -On this page: +## Explore -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/backups.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +[Explore](/documentation/concepts/explore/) includes several APIs for exploring data in your collections. -× +## Hybrid Queries -[Powered by](https://qdrant.tech/) +[Hybrid Queries](/documentation/concepts/hybrid-queries/) combines multiple queries or performs them in more than one stage. -<|page-2-lllmstxt|> -## benchmark-faq -# Benchmarks F.A.Q. +## Filtering -January 01, 0001 +[Filtering](/documentation/concepts/filtering/) defines various database-style clauses, conditions, and more. -# [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#benchmarks-faq) Benchmarks F.A.Q. +## Optimizer -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#are-we-biased) Are we biased? +[Optimizer](/documentation/concepts/optimizer/) describes options to rebuild +database structures for faster search. They include a vacuum, a merge, and an +indexing optimizer. -Probably, yes. Even if we try to be objective, we are not experts in using all the existing vector databases. -We build Qdrant and know the most about it. -Due to that, we could have missed some important tweaks in different vector search engines. +## Storage -However, we tried our best, kept scrolling the docs up and down, experimented with combinations of different configurations, and gave all of them an equal chance to stand out. If you believe you can do it better than us, our **benchmarks are fully [open-sourced](https://github.com/qdrant/vector-db-benchmark), and contributions are welcome**! +[Storage](/documentation/concepts/storage/) describes the configuration of storage in segments, which include indexes and an ID mapper. -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#what-do-we-measure) What do we measure? +## Indexing -There are several factors considered while deciding on which database to use. -Of course, some of them support a different subset of functionalities, and those might be a key factor to make the decision. -But in general, we all care about the search precision, speed, and resources required to achieve it. +[Indexing](/documentation/concepts/indexing/) lists and describes available indexes. They include payload, vector, sparse vector, and a filterable index. -There is one important thing - **the speed of the vector databases should to be compared only if they achieve the same precision**. Otherwise, they could maximize the speed factors by providing inaccurate results, which everybody would rather avoid. Thus, our benchmark results are compared only at a specific search precision threshold. 
+## Snapshots -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#how-we-select-hardware) How we select hardware? +[Snapshots](/documentation/concepts/snapshots/) describe the backup/restore process (and more) for each node at specific times. -In our experiments, we are not focusing on the absolute values of the metrics but rather on a relative comparison of different engines. -What is important is the fact we used the same machine for all the tests. -It was just wiped off between launching different engines. +<|page-2-lllmstxt|> +# Collections -We selected an average machine, which you can easily rent from almost any cloud provider. No extra quota or custom configuration is required. +A collection is a named set of points (vectors with a payload) among which you can search. The vector of each point within the same collection must have the same dimensionality and be compared by a single metric. [Named vectors](#collection-with-multiple-vectors) can be used to have multiple vectors in a single point, each of which can have their own dimensionality and metric requirements. -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#why-you-are-not-comparing-with-faiss-or-annoy) Why you are not comparing with FAISS or Annoy? +Distance metrics are used to measure similarities among vectors. +The choice of metric depends on the way vectors obtaining and, in particular, on the method of neural network encoder training. -Libraries like FAISS provide a great tool to do experiments with vector search. But they are far away from real usage in production environments. -If you are using FAISS in production, in the best case, you never need to update it in real-time. In the worst case, you have to create your custom wrapper around it to support CRUD, high availability, horizontal scalability, concurrent access, and so on. +Qdrant supports these most popular types of metrics: -Some vector search engines even use FAISS under the hood, but a search engine is much more than just an indexing algorithm. +* Dot product: `Dot` - [[wiki]](https://en.wikipedia.org/wiki/Dot_product) +* Cosine similarity: `Cosine` - [[wiki]](https://en.wikipedia.org/wiki/Cosine_similarity) +* Euclidean distance: `Euclid` - [[wiki]](https://en.wikipedia.org/wiki/Euclidean_distance) +* Manhattan distance: `Manhattan` - [[wiki]](https://en.wikipedia.org/wiki/Taxicab_geometry) -We do, however, use the same benchmark datasets as the famous [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks), so you can align your expectations for any practical reasons. + -### [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#why-we-decided-to-test-with-the-python-client) Why we decided to test with the Python client +In addition to metrics and vector size, each collection uses its own set of parameters that controls collection optimization, index construction, and vacuum. +These settings can be changed at any time by a corresponding request. -There is no consensus when it comes to the best technology to run benchmarks. You’re free to choose Go, Java or Rust-based systems. But there are two main reasons for us to use Python for this: +## Setting up multitenancy -1. While generating embeddings you’re most likely going to use Python and python based ML frameworks. -2. Based on GitHub stars, python clients are one of the most popular clients across all the engines. +**How many collections should you create?** In most cases, you should only use a single collection with payload-based partitioning. 
This approach is called [multitenancy](https://en.wikipedia.org/wiki/Multitenancy). It is efficient for most of users, but it requires additional configuration. [Learn how to set it up](/documentation/tutorials/multiple-partitions/) -From the user’s perspective, the crucial thing is the latency perceived while using a specific library - in most cases a Python client. -Nobody can and even should redefine the whole technology stack, just because of using a specific search tool. -That’s why we decided to focus primarily on official Python libraries, provided by the database authors. -Those may use some different protocols under the hood, but at the end of the day, we do not care how the data is transferred, as long as it ends up in the target location. +**When should you create multiple collections?** When you have a limited number of users and you need isolation. This approach is flexible, but it may be more costly, since creating numerous collections may result in resource overhead. Also, you need to ensure that they do not affect each other in any way, including performance-wise. -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#what-about-closed-source-saas-platforms) What about closed-source SaaS platforms? +## Create a collection -There are some vector databases available as SaaS only so that we couldn’t test them on the same machine as the rest of the systems. -That makes the comparison unfair. That’s why we purely focused on testing the Open Source vector databases, so everybody may reproduce the benchmarks easily. -This is not the final list, and we’ll continue benchmarking as many different engines as possible. +```python +from qdrant_client import QdrantClient, models -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#how-to-reproduce-the-benchmark) How to reproduce the benchmark? +client = QdrantClient(url="http://localhost:6333") -The source code is available on [Github](https://github.com/qdrant/vector-db-benchmark) and has a `README.md` file describing the process of running the benchmark for a specific engine. +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=100, distance=models.Distance.COSINE), +) +``` +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -## [Anchor](https://qdrant.tech/benchmarks/benchmark-faq/\#how-to-contribute) How to contribute? +var client = new QdrantClient("localhost", 6334); -We made the benchmark Open Source because we believe that it has to be transparent. We could have misconfigured one of the engines or just done it inefficiently. If you feel like you could help us out, check out our [benchmark repository](https://github.com/qdrant/vector-db-benchmark). +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 100, Distance = Distance.Cosine } +); +``` -Share this article +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "size": 300, + "distance": "Cosine" + } + }' +``` -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fbenchmark-faq%2F&text=Benchmarks%20F.A.Q. "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fbenchmark-faq%2F "LinkedIn") +```go +import ( + "context" -Up! 
+ "github.com/qdrant/go-client/qdrant" +) -<|page-3-lllmstxt|> -## qdrant.tech -# High-Performance Vector Search at Scale +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Powering the next generation of AI applications with advanced, open-source vector similarity search technology. +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 100, + Distance: qdrant.Distance_Cosine, + }), +}) +``` -[Get Started](https://cloud.qdrant.io/signup) [Learn More](https://qdrant.tech/qdrant-vector-database/) +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 300, + "distance": "Cosine" + } +} +``` -[Star us\\ -24.2k](https://github.com/qdrant/qdrant) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -![Hero image: an astronaut looking at dark hole from the planet surface.](https://qdrant.tech/img/hero-home-illustration-x1.png) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Qdrant Powers Thousands of Top AI Solutions. [Customer Stories](https://qdrant.tech/customers/) +client.createCollection("{collection_name}", { + vectors: { size: 100, distance: "Cosine" }, +}); +``` -## AI Meets Advanced Vector Search +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, VectorParamsBuilder}; -The leading open source vector database and similarity search engine designed to handle high-dimensional vectors for performance and massive-scale AI applications. +let client = Qdrant::from_url("http://localhost:6334").build()?; -[All features](https://qdrant.tech/qdrant-vector-database/) +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(100, Distance::Cosine)), + ) + .await?; +``` -[**Cloud-Native Scalability & High-Availability** \\ -\\ -Enterprise-grade Managed Cloud. Vertical and horizontal scaling and zero-downtime upgrades.\\ -\\ -Qdrant Cloud](https://qdrant.tech/cloud/) +```java +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -[**Ease of Use & Simple Deployment** \\ -\\ -Quick deployment in any environment with Docker and a lean API for easy integration, ideal for local testing.\\ -\\ -Quick Start Guide](https://qdrant.tech/documentation/quick-start/) +QdrantClient client = new QdrantClient( + QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -[**Cost Efficiency with Storage Options** \\ -\\ -Dramatically reduce memory usage with built-in compression options and offload data to disk.\\ -\\ -Quantization](https://qdrant.tech/documentation/guides/quantization/) +client.createCollectionAsync("{collection_name}", + VectorParams.newBuilder().setDistance(Distance.Cosine).setSize(100).build()).get(); +``` -[**Rust-Powered Reliability & Performance** \\ -\\ -Purpose built in Rust for unmatched speed and reliability even when processing billions of vectors.\\ -\\ -Benchmarks](https://qdrant.tech/benchmarks/) +In addition to the required options, you can also specify custom values for the following collection options: -### Our Customers Words +* `hnsw_config` - see [indexing](/documentation/concepts/indexing/#vector-index) for details. +* `wal_config` - Write-Ahead-Log related configuration. 
See more details about [WAL](/documentation/concepts/storage/#versioning) +* `optimizers_config` - see [optimizer](/documentation/concepts/optimizer/) for details. +* `shard_number` - which defines how many shards the collection should have. See [distributed deployment](/documentation/guides/distributed_deployment/#sharding) section for details. +* `on_disk_payload` - defines where to store payload data. If `true` - payload will be stored on disk only. Might be useful for limiting the RAM usage in case of large payload. +* `quantization_config` - see [quantization](/documentation/guides/quantization/#setting-up-quantization-in-qdrant) for details. +* `strict_mode_config` - see [strict mode](/documentation/guides/administration/#strict-mode) for details. -[Customer Stories](https://qdrant.tech/customers/) +Default parameters for the optional collection parameters are defined in [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml). -![Cognizant](https://qdrant.tech/img/brands/cognizant.svg) +See [schema definitions](https://api.qdrant.tech/api-reference/collections/create-collection) and a [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml) for more information about collection and vector parameters. -“We LOVE Qdrant! The exceptional engineering, strong business value, and outstanding team behind the product drove our choice. Thank you for your great contribution to the technology community!” +*Available as of v1.2.0* -![Kyle Tobin](https://qdrant.tech/img/customers/kyle-tobin.png) +Vectors all live in RAM for very quick access. The `on_disk` parameter can be +set in the vector configuration. If true, all vectors will live on disk. This +will enable the use of +[memmaps](/documentation/concepts/storage/#configuring-memmap-storage), +which is suitable for ingesting a large amount of data. -Kyle Tobin +### Create collection from another collection -Principal, Cognizant +*Available as of v1.0.0* -![Hubspot](https://qdrant.tech/img/brands/hubspot.svg) +It is possible to initialize a collection from another existing collection. -“Qdrant powers our demanding recommendation and RAG applications. We chose it for its ease of deployment and high performance at scale, and have been consistently impressed with its results.” +This might be useful for experimenting quickly with different configurations for the same data set. -![Srubin Sethu Madhavan](https://qdrant.tech/img/customers/srubin-sethu-madhavan.svg) + -Srubin Sethu Madhavan +Make sure the vectors have the same `size` and `distance` function when setting up the vectors configuration in the new collection. If you used the previous sample +code, `"size": 300` and `"distance": "Cosine"`. -Technical Lead II at Hubspot -![Bayer](https://qdrant.tech/img/brands/bayer.svg) +```python +from qdrant_client import QdrantClient, models -“VectorStores are definitely here to stay, the objects in the world around us from image, sound, video and text become easily universal and searchable thanks to the embedding models. I personally recommend Qdrant. 
We have been using it for a while and couldn't be happier.“ +client = QdrantClient(url="http://localhost:6333") -![Hooman Sedghamiz](https://qdrant.tech/img/customers/hooman-sedghamiz.svg) +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=100, distance=models.Distance.COSINE), + init_from=models.InitFrom(collection="{from_collection_name}"), +) +``` -Hooman Sedghamiz +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Director Al /ML, Bayer +var client = new QdrantClient("localhost", 6334); -![CB Insights](https://qdrant.tech/img/brands/cb-insights.svg) +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 100, Distance = Distance.Cosine }, + initFromCollection: "{from_collection_name}" +); +``` -“We looked at all the big options out there right now for vector databases, with our focus on ease of use, performance, pricing, and communication. **Qdrant came out on top in each category...** ultimately, it wasn't much of a contest.” +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "size": 300, + "distance": "Cosine" + }, + "init_from": { + "collection": {from_collection_name} + } + }' +``` -![Alex Webb](https://qdrant.tech/img/customers/alex-webb.svg) +```go +import ( + "context" -Alex Webb + "github.com/qdrant/go-client/qdrant" +) -Director of Engineering, CB Insights +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -![Bosch](https://qdrant.tech/img/brands/bosch.svg) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 100, + Distance: qdrant.Distance_Cosine, + }), + InitFromCollection: qdrant.PtrOf("{from_collection_name}"), +}) +``` -“With Qdrant, we found the missing piece to develop our own provider independent multimodal generative AI platform on enterprise scale.” +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 100, + "distance": "Cosine" + }, + "init_from": { + "collection": "{from_collection_name}" + } +} +``` -![Jeremy T. & Daly Singh](https://qdrant.tech/img/customers/jeremy-t.png)![Jeremy T. & Daly Singh](https://qdrant.tech/img/customers/daly-singh.png) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Jeremy T. & Daly Singh +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Generative AI Expert & Product Owner, Bosch +client.createCollection("{collection_name}", { + vectors: { size: 100, distance: "Cosine" }, + init_from: { collection: "{from_collection_name}" }, +}); +``` -![Cognizant](https://qdrant.tech/img/brands/cognizant.svg) +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; -“We LOVE Qdrant! The exceptional engineering, strong business value, and outstanding team behind the product drove our choice. 
Thank you for your great contribution to the technology community!” +let client = Qdrant::from_url("http://localhost:6334").build()?; -![Kyle Tobin](https://qdrant.tech/img/customers/kyle-tobin.png) +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(100, Distance::Cosine)) + .init_from_collection("{from_collection_name}"), + ) + .await?; +``` -Kyle Tobin +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -Principal, Cognizant +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -![Hubspot](https://qdrant.tech/img/brands/hubspot.svg) +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(100) + .setDistance(Distance.Cosine) + .build())) + .setInitFromCollection("{from_collection_name}") + .build()) + .get(); +``` -“Qdrant powers our demanding recommendation and RAG applications. We chose it for its ease of deployment and high performance at scale, and have been consistently impressed with its results.” +### Collection with multiple vectors -![Srubin Sethu Madhavan](https://qdrant.tech/img/customers/srubin-sethu-madhavan.svg) +*Available as of v0.10.0* -Srubin Sethu Madhavan +It is possible to have multiple vectors per record. +This feature allows for multiple vector storages per collection. +To distinguish vectors in one record, they should have a unique name defined when creating the collection. +Each named vector in this mode has its distance and size: -Technical Lead II at Hubspot -![Bayer](https://qdrant.tech/img/brands/bayer.svg) +```python +from qdrant_client import QdrantClient, models -“VectorStores are definitely here to stay, the objects in the world around us from image, sound, video and text become easily universal and searchable thanks to the embedding models. I personally recommend Qdrant. We have been using it for a while and couldn't be happier.“ -![Hooman Sedghamiz](https://qdrant.tech/img/customers/hooman-sedghamiz.svg) +client = QdrantClient(url="http://localhost:6333") -Hooman Sedghamiz +client.create_collection( + collection_name="{collection_name}", + vectors_config={ + "image": models.VectorParams(size=4, distance=models.Distance.DOT), + "text": models.VectorParams(size=8, distance=models.Distance.COSINE), + }, +) +``` -Director Al /ML, Bayer +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -![CB Insights](https://qdrant.tech/img/brands/cb-insights.svg) +var client = new QdrantClient("localhost", 6334); -“We looked at all the big options out there right now for vector databases, with our focus on ease of use, performance, pricing, and communication. 
**Qdrant came out on top in each category...** ultimately, it wasn't much of a contest.” +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParamsMap + { + Map = + { + ["image"] = new VectorParams { Size = 4, Distance = Distance.Dot }, + ["text"] = new VectorParams { Size = 8, Distance = Distance.Cosine }, + } + } +); +``` -![Alex Webb](https://qdrant.tech/img/customers/alex-webb.svg) +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "image": { + "size": 4, + "distance": "Dot" + }, + "text": { + "size": 8, + "distance": "Cosine" + } + } + }' +``` -Alex Webb +```go +import ( + "context" -Director of Engineering, CB Insights + "github.com/qdrant/go-client/qdrant" +) -![Bosch](https://qdrant.tech/img/brands/bosch.svg) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -“With Qdrant, we found the missing piece to develop our own provider independent multimodal generative AI platform on enterprise scale.” +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfigMap( + map[string]*qdrant.VectorParams{ + "image": { + Size: 4, + Distance: qdrant.Distance_Dot, + }, + "text": { + Size: 8, + Distance: qdrant.Distance_Cosine, + }, + }), +}) +``` -![Jeremy T. & Daly Singh](https://qdrant.tech/img/customers/jeremy-t.png)![Jeremy T. & Daly Singh](https://qdrant.tech/img/customers/daly-singh.png) +```http +PUT /collections/{collection_name} +{ + "vectors": { + "image": { + "size": 4, + "distance": "Dot" + }, + "text": { + "size": 8, + "distance": "Cosine" + } + } +} +``` -Jeremy T. & Daly Singh +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Generative AI Expert & Product Owner, Bosch +const client = new QdrantClient({ host: "localhost", port: 6333 }); -![Cognizant](https://qdrant.tech/img/brands/cognizant.svg) +client.createCollection("{collection_name}", { + vectors: { + image: { size: 4, distance: "Dot" }, + text: { size: 8, distance: "Cosine" }, + }, +}); +``` -“We LOVE Qdrant! The exceptional engineering, strong business value, and outstanding team behind the product drove our choice. Thank you for your great contribution to the technology community!” +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, VectorParamsBuilder, VectorsConfigBuilder, +}; -![Kyle Tobin](https://qdrant.tech/img/customers/kyle-tobin.png) +let client = Qdrant::from_url("http://localhost:6334").build()?; -Kyle Tobin +let mut vectors_config = VectorsConfigBuilder::default(); +vectors_config + .add_named_vector_params("image", VectorParamsBuilder::new(4, Distance::Dot).build()); +vectors_config.add_named_vector_params( + "text", + VectorParamsBuilder::new(8, Distance::Cosine).build(), +); -Principal, Cognizant +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}").vectors_config(vectors_config), + ) + .await?; +``` -![Hubspot](https://qdrant.tech/img/brands/hubspot.svg) +```java +import java.util.Map; -“Qdrant powers our demanding recommendation and RAG applications. 
We chose it for its ease of deployment and high performance at scale, and have been consistently impressed with its results.” +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; -![Srubin Sethu Madhavan](https://qdrant.tech/img/customers/srubin-sethu-madhavan.svg) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -Srubin Sethu Madhavan +client + .createCollectionAsync( + "{collection_name}", + Map.of( + "image", VectorParams.newBuilder().setSize(4).setDistance(Distance.Dot).build(), + "text", + VectorParams.newBuilder().setSize(8).setDistance(Distance.Cosine).build())) + .get(); +``` -Technical Lead II at Hubspot +For rare use cases, it is possible to create a collection without any vector storage. -![Bayer](https://qdrant.tech/img/brands/bayer.svg) +*Available as of v1.1.1* -“VectorStores are definitely here to stay, the objects in the world around us from image, sound, video and text become easily universal and searchable thanks to the embedding models. I personally recommend Qdrant. We have been using it for a while and couldn't be happier.“ +For each named vector you can optionally specify +[`hnsw_config`](/documentation/concepts/indexing/#vector-index) or +[`quantization_config`](/documentation/guides/quantization/#setting-up-quantization-in-qdrant) to +deviate from the collection configuration. This can be useful to fine-tune +search performance on a vector level. -![Hooman Sedghamiz](https://qdrant.tech/img/customers/hooman-sedghamiz.svg) +*Available as of v1.2.0* -Hooman Sedghamiz +Vectors all live in RAM for very quick access. On a per-vector basis you can set +`on_disk` to true to store all vectors on disk at all times. This will enable +the use of +[memmaps](/documentation/concepts/storage/#configuring-memmap-storage), +which is suitable for ingesting a large amount of data. -Director Al /ML, Bayer -![CB Insights](https://qdrant.tech/img/brands/cb-insights.svg) +### Vector datatypes -“We looked at all the big options out there right now for vector databases, with our focus on ease of use, performance, pricing, and communication. **Qdrant came out on top in each category...** ultimately, it wasn't much of a contest.” +*Available as of v1.9.0* -![Alex Webb](https://qdrant.tech/img/customers/alex-webb.svg) +Some embedding providers may provide embeddings in a pre-quantized format. +One of the most notable examples is the [Cohere int8 & binary embeddings](https://cohere.com/blog/int8-binary-embeddings). +Qdrant has direct support for uint8 embeddings, which you can also use in combination with binary quantization. -Alex Webb +To create a collection with uint8 embeddings, you can use the following configuration: -Director of Engineering, CB Insights +```python +from qdrant_client import QdrantClient, models -![Bosch](https://qdrant.tech/img/brands/bosch.svg) +client = QdrantClient(url="http://localhost:6333") -“With Qdrant, we found the missing piece to develop our own provider independent multimodal generative AI platform on enterprise scale.” +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=1024, + distance=models.Distance.COSINE, + datatype=models.Datatype.UINT8, + ), +) +``` -![Jeremy T. & Daly Singh](https://qdrant.tech/img/customers/jeremy-t.png)![Jeremy T. 
& Daly Singh](https://qdrant.tech/img/customers/daly-singh.png) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Jeremy T. & Daly Singh +var client = new QdrantClient("localhost", 6334); -Generative AI Expert & Product Owner, Bosch +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { + Size = 1024, Distance = Distance.Cosine, Datatype = Datatype.Uint8 + } +); +``` -![Cognizant](https://qdrant.tech/img/brands/cognizant.svg) +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "size": 1024, + "distance": "Cosine", + "datatype": "uint8" + } + }' +``` -“We LOVE Qdrant! The exceptional engineering, strong business value, and outstanding team behind the product drove our choice. Thank you for your great contribution to the technology community!” +```go +import ( + "context" -![Kyle Tobin](https://qdrant.tech/img/customers/kyle-tobin.png) + "github.com/qdrant/go-client/qdrant" +) -Kyle Tobin +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Principal, Cognizant +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 1024, + Distance: qdrant.Distance_Cosine, + Datatype: qdrant.Datatype_Uint8.Enum(), + }), +}) +``` -![Hubspot](https://qdrant.tech/img/brands/hubspot.svg) +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1024, + "distance": "Cosine", + "datatype": "uint8" + } +} +``` -“Qdrant powers our demanding recommendation and RAG applications. We chose it for its ease of deployment and high performance at scale, and have been consistently impressed with its results.” +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -![Srubin Sethu Madhavan](https://qdrant.tech/img/customers/srubin-sethu-madhavan.svg) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Srubin Sethu Madhavan +client.createCollection("{collection_name}", { + vectors: { + image: { size: 1024, distance: "Cosine", datatype: "uint8" }, + }, +}); +``` -Technical Lead II at Hubspot +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Datatype, Distance, VectorParamsBuilder, +}; -See what our community is saying on our -[Vector Space Wall](https://testimonial.to/qdrant/all) +let client = Qdrant::from_url("http://localhost:6334").build()?; -## Integrations - -Qdrant integrates with all leading -[embeddings](https://qdrant.tech/documentation/embeddings/) and -[frameworks](https://qdrant.tech/documentation/frameworks/). - -[See Integrations](https://qdrant.tech/documentation/frameworks/) - -### Deploy Qdrant locally with Docker - -Get started with our -[Quick Start Guide](https://qdrant.tech/documentation/quick-start/), or our main -[GitHub repository](https://github.com/qdrant/qdrant). - -`1 docker pull qdrant/qdrant -2 docker run -p 6333:6333 qdrant/qdrant -` - -## Vectors in Action - -Turn embeddings or neural network encoders into full-fledged applications for matching, searching, recommending, and more. - -#### Advanced Search - -Elevate your apps with advanced search capabilities. Qdrant excels in processing high-dimensional data, enabling nuanced similarity searches, and understanding semantics in depth. Qdrant also handles multimodal data with fast and accurate search algorithms. 
- -[Learn More](https://qdrant.tech/advanced-search/) - -#### Recommendation Systems - -Create highly responsive and personalized recommendation systems with tailored suggestions. Qdrant’s Recommendation API offers great flexibility, featuring options such as best score recommendation strategy. This enables new scenarios of using multiple vectors in a single query to impact result relevancy. - -[Learn More](https://qdrant.tech/recommendations/) - -#### Retrieval Augmented Generation (RAG) +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}").vectors_config( + VectorParamsBuilder::new(1024, Distance::Cosine).datatype(Datatype::Uint8), + ), + ) + .await?; +``` -Enhance the quality of AI-generated content. Leverage Qdrant's efficient nearest neighbor search and payload filtering features for retrieval-augmented generation. You can then quickly access relevant vectors and integrate a vast array of data points. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.grpc.Collections.Datatype; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; -[Learn More](https://qdrant.tech/rag/) +QdrantClient client = new QdrantClient( + QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -#### Data Analysis and Anomaly Detection +client + .createCollectionAsync("{collection_name}", + VectorParams.newBuilder() + .setSize(1024) + .setDistance(Distance.Cosine) + .setDatatype(Datatype.Uint8) + .build()) + .get(); +``` -Transform your approach to Data Analysis and Anomaly Detection. Leverage vectors to quickly identify patterns and outliers in complex datasets. This ensures robust and real-time anomaly detection for critical applications. +Vectors with `uint8` datatype are stored in a more compact format, which can save memory and improve search speed at the cost of some precision. +If you choose to use the `uint8` datatype, elements of the vector will be stored as unsigned 8-bit integers, which can take values **from 0 to 255**. -[Learn More](https://qdrant.tech/data-analysis-anomaly-detection/) -#### AI Agents +### Collection with sparse vectors -Unlock the full potential of your AI agents with Qdrant’s powerful vector search and scalable infrastructure, allowing them to handle complex tasks, adapt in real time, and drive smarter, data-driven outcomes across any environment. +*Available as of v1.7.0* -[Learn More](https://qdrant.tech/ai-agents/) +Qdrant supports sparse vectors as a first-class citizen. -### Get started for free +Sparse vectors are useful for text search, where each word is represented as a separate dimension. -Turn embeddings or neural network encoders into full-fledged applications for matching, searching, recommending, and more. +Collections can contain sparse vectors as additional [named vectors](#collection-with-multiple-vectors) along side regular dense vectors in a single point. -[Get Started](https://cloud.qdrant.io/signup) +Unlike dense vectors, sparse vectors must be named. +And additionally, sparse vectors and dense vectors must have different names within a collection. -<|page-4-lllmstxt|> -## fastembed -- [Documentation](https://qdrant.tech/documentation/) -- FastEmbed +```python +from qdrant_client import QdrantClient, models -# [Anchor](https://qdrant.tech/documentation/fastembed/\#what-is-fastembed) What is FastEmbed? +client = QdrantClient(url="http://localhost:6333") -FastEmbed is a lightweight Python library built for embedding generation. 
It supports popular embedding models and offers a user-friendly experience for embedding data into vector space. +client.create_collection( + collection_name="{collection_name}", + vectors_config={}, + sparse_vectors_config={ + "text": models.SparseVectorParams(), + }, +) +``` -By using FastEmbed, you can ensure that your embedding generation process is not only fast and efficient but also highly accurate, meeting the needs of various machine learning and natural language processing applications. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -FastEmbed easily integrates with Qdrant for a variety of multimodal search purposes. +var client = new QdrantClient("localhost", 6334); -## [Anchor](https://qdrant.tech/documentation/fastembed/\#how-to-get-started-with-fastembed) How to get started with FastEmbed +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + sparseVectorsConfig: ("text", new SparseVectorParams()) +); +``` -| Beginner | Advanced | -| --- | --- | -| [Generate Text Embedings with FastEmbed](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/) | [Combine FastEmbed with Qdrant for Vector Search](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/) | +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "sparse_vectors": { + "text": { } + } + }' +``` -## [Anchor](https://qdrant.tech/documentation/fastembed/\#why-is-fastembed-useful) Why is FastEmbed useful? +```go +import ( + "context" -- Light: Unlike other inference frameworks, such as PyTorch, FastEmbed requires very little external dependencies. Because it uses the ONNX runtime, it is perfect for serverless environments like AWS Lambda. -- Fast: By using ONNX, FastEmbed ensures high-performance inference across various hardware platforms. -- Accurate: FastEmbed aims for better accuracy and recall than models like OpenAI’s `Ada-002`. It always uses model which demonstrate strong results on the MTEB leaderboard. -- Support: FastEmbed supports a wide range of models, including multilingual ones, to meet diverse use case needs. + "github.com/qdrant/go-client/qdrant" +) -##### Was this page useful? +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text": {}, + }), +}) +``` -Thank you for your feedback! 🙏 +```http +PUT /collections/{collection_name} +{ + "sparse_vectors": { + "text": { } + } +} +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -On this page: +const client = new QdrantClient({ host: "localhost", port: 6333 }); -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client.createCollection("{collection_name}", { + sparse_vectors: { + text: { }, + }, +}); +``` -× +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, +}; -[Powered by](https://qdrant.tech/) +let client = Qdrant::from_url("http://localhost:6334").build()?; -<|page-5-lllmstxt|> -## hybrid-cloud -- [Documentation](https://qdrant.tech/documentation/) -- Hybrid Cloud +let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/\#qdrant-hybrid-cloud) Qdrant Hybrid Cloud +sparse_vector_config.add_named_vector_params("text", SparseVectorParamsBuilder::default()); -Seamlessly deploy and manage your vector database across diverse environments, ensuring performance, security, and cost efficiency for AI-driven applications. +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(sparse_vector_config), + ) + .await?; +``` -[Qdrant Hybrid Cloud](https://qdrant.tech/hybrid-cloud/) integrates Kubernetes clusters from any setting - cloud, on-premises, or edge - into a unified, enterprise-grade managed service. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; -You can use [Qdrant Cloud’s UI](https://qdrant.tech/documentation/cloud/create-cluster/) to create and manage your database clusters, while they still remain within your infrastructure. **All Qdrant databases will operate solely within your network, using your storage and compute resources. All user data will stay securely within your environment and won’t be accessible by the Qdrant Cloud platform, or anyone else outside your organization.** +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -Qdrant Hybrid Cloud ensures data privacy, deployment flexibility, low latency, and delivers cost savings, elevating standards for vector search and AI applications. +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setSparseVectorsConfig( + SparseVectorConfig.newBuilder() + .putMap("text", SparseVectorParams.getDefaultInstance())) + .build()) + .get(); +``` -**How it works:** Qdrant Hybrid Cloud relies on Kubernetes and works with any standard compliant Kubernetes distribution. When you onboard a Kubernetes cluster as a Hybrid Cloud Environment, you can deploy the Qdrant Kubernetes Operator and Cloud Agent into this cluster. These will manage Qdrant databases within your Kubernetes cluster and establish an outgoing connection to Qdrant Cloud to transport telemetry and receive management instructions. You can then benefit from the same cloud management features and transport telemetry that is available with any managed Qdrant Cloud cluster. +Outside of a unique name, there are no required configuration parameters for sparse vectors. 
-**Setup instructions:** To begin using Qdrant Hybrid Cloud, [read our installation guide](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +The distance function for sparse vectors is always `Dot` and does not need to be specified. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/\#hybrid-cloud-architecture) Hybrid Cloud architecture +However, there are optional parameters to tune the underlying [sparse vector index](/documentation/concepts/indexing/#sparse-vector-index). -The Hybrid Cloud onboarding will install a Kubernetes Operator and Cloud Agent into your Kubernetes cluster. +### Check collection existence -The Cloud Agent will establish an outgoing connection to `cloud.qdrant.io` on port `443` to transport telemetry and receive management instructions. It will also interact with the Kubernetes API through a ServiceAccount to create, read, update and delete the necessary Qdrant CRs (Custom Resources) based on the configuration setup in the Qdrant Cloud Console. +*Available as of v1.8.0* -The Qdrant Kubernetes Operator will manage the Qdrant databases within your Kubernetes cluster. Based on the Qdrant CRs, it will interact with the Kubernetes API through a ServiceAccount to create and manage the necessary resources to deploy and run Qdrant databases, such as Pods, Services, ConfigMaps, and Secrets. +```python +client.collection_exists(collection_name="{collection_name}") +``` -Both component’s access is limited to the Kubernetes namespace that you chose during the onboarding process. +```csharp +await client.CollectionExistsAsync("{collection_name}"); +``` -The Cloud Agent only sends telemetry data and status information to the Qdrant Cloud platform. It does not send any user data or sensitive information. The telemetry data includes: +```bash +curl -X GET http://localhost:6333/collections/{collection_name}/exists +``` -- The health status and resource (CPU, memory, disk and network) usage of the Qdrant databases and Qdrant control plane components. -- Information about the Qdrant databases, such as the number, name and configuration of collections, the number of vectors, the number of queries, and the number of indexing operations. -- Telemetry and notification data from the Qdrant databases. -- Kubernetes operations and scheduling events reported for the Qdrant databases and Qdrant control plane components. +```go +import "context" -After the initial onboarding, the lifecycle of these components will be controlled by the Qdrant Cloud platform via the built-in Helm controller. +client.CollectionExists(context.Background(), "my_collection") +``` -You don’t need to expose your Kubernetes Cluster to the Qdrant Cloud platform, you don’t need to open any ports for incoming traffic, and you don’t need to provide any Kubernetes or cloud provider credentials to the Qdrant Cloud platform. +```http +GET http://localhost:6333/collections/{collection_name}/exists +``` -![hybrid-cloud-architecture](https://qdrant.tech/blog/hybrid-cloud/hybrid-cloud-architecture.png) +```typescript +client.collectionExists("{collection_name}"); +``` -##### Was this page useful? +```rust +client.collection_exists("{collection_name}").await?; +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```java +client.collectionExistsAsync("{collection_name}").get(); +``` -Thank you for your feedback! 🙏 +### Delete collection -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +client.delete_collection(collection_name="{collection_name}") +``` -On this page: +```csharp +await client.DeleteCollectionAsync("{collection_name}"); +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```bash +curl -X DELETE http://localhost:6333/collections/{collection_name} +``` -× +```go +import "context" -[Powered by](https://qdrant.tech/) +client.DeleteCollection(context.Background(), "{collection_name}") +``` -<|page-6-lllmstxt|> -## cloud -- [Documentation](https://qdrant.tech/documentation/) -- Managed Cloud +```http +DELETE http://localhost:6333/collections/{collection_name} +``` -# [Anchor](https://qdrant.tech/documentation/cloud/\#about-qdrant-managed-cloud) About Qdrant Managed Cloud +```typescript +client.deleteCollection("{collection_name}"); +``` -Qdrant Managed Cloud is our SaaS (software-as-a-service) solution, providing managed Qdrant database clusters on the cloud. We provide you the same fast and reliable similarity search engine, but without the need to maintain your own infrastructure. +```rust +client.delete_collection("{collection_name}").await?; +``` -Transitioning to the Managed Cloud version of Qdrant does not change how you interact with the service. All you need is a [Qdrant Cloud account](https://qdrant.to/cloud/) and an [API key](https://qdrant.tech/documentation/cloud/authentication/) for each request. +```java +client.deleteCollectionAsync("{collection_name}").get(); +``` -You can also attach your own infrastructure as a Hybrid Cloud Environment. For details, see our [Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) documentation. +### Update collection parameters -## [Anchor](https://qdrant.tech/documentation/cloud/\#cluster-configuration) Cluster Configuration +Dynamic parameter updates may be helpful, for example, for more efficient initial loading of vectors. +For example, you can disable indexing during the upload process, and enable it immediately after the upload is finished. +As a result, you will not waste extra computation resources on rebuilding the index. -Each database cluster comes pre-configured with the following tools, features, and support services: +The following command enables indexing for segments that have more than 10000 kB of vectors stored: -- Allows the creation of highly available clusters with automatic failover. -- Supports upgrades to later versions of Qdrant as they are released. -- Upgrades are zero-downtime on highly available clusters. -- Includes monitoring and logging to observe the health of each cluster. -- Horizontally and vertically scalable. -- Available natively on AWS and GCP, and Azure. -- Available on your own infrastructure and other providers if you use the Hybrid Cloud. +```python +client.update_collection( + collection_name="{collection_name}", + optimizers_config=models.OptimizersConfigDiff(indexing_threshold=10000), +) +``` -##### Was this page useful? 
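
As a hedged illustration of the bulk-load pattern described above, indexing can be switched off before an upload and re-enabled once it is finished. The threshold values below are placeholders; `20000` is only used here because it appears as the default in the collection configuration shown elsewhere in this document.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Disable indexing while bulk-loading points
client.update_collection(
    collection_name="{collection_name}",
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=0),
)

# ... upload points here ...

# Re-enable indexing after the upload is complete
client.update_collection(
    collection_name="{collection_name}",
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000),
)
```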
+```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +var client = new QdrantClient("localhost", 6334); -Thank you for your feedback! 🙏 +await client.UpdateCollectionAsync( + collectionName: "{collection_name}", + optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 10000 } +); +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```bash +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "optimizers_config": { + "indexing_threshold": 10000 + } + }' +``` -On this page: +```go +import ( + "context" -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + "github.com/qdrant/go-client/qdrant" +) -× +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -[Powered by](https://qdrant.tech/) +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + IndexingThreshold: qdrant.PtrOf(uint64(10000)), + }, +}) +``` -<|page-7-lllmstxt|> -## migration -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Migration to Qdrant +```http +PATCH /collections/{collection_name} +{ + "optimizers_config": { + "indexing_threshold": 10000 + } +} +``` -# [Anchor](https://qdrant.tech/documentation/database-tutorials/migration/\#migration) Migration +```typescript +client.updateCollection("{collection_name}", { + optimizers_config: { + indexing_threshold: 10000, + }, +}); +``` -Migrating data between vector databases, especially across regions, platforms, or deployment types, can be a hassle. That’s where the [Qdrant Migration Tool](https://github.com/qdrant/migration) comes in. It supports a wide range of migration needs, including transferring data between Qdrant instances and migrating from other vector database providers to Qdrant. +```rust +use qdrant_client::qdrant::{OptimizersConfigDiffBuilder, UpdateCollectionBuilder}; -You can run the migration tool on any machine where you have connectivity to both the source and the target Qdrant databases. Direct connectivity between both databases is not required. For optimal performance, you should run the tool on a machine with a fast network connection and minimum latency to both databases. +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}").optimizers_config( + OptimizersConfigDiffBuilder::default().indexing_threshold(10000), + ), + ) + .await?; +``` -In this tutorial, we will learn how to use the migration tool and walk through a practical example of migrating from other vector databases to Qdrant. +```java +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.UpdateCollection; -## [Anchor](https://qdrant.tech/documentation/database-tutorials/migration/\#why-use-this-instead-of-qdrants-native-snapshotting) Why use this instead of Qdrant’s Native Snapshotting? 
+client.updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder().setIndexingThreshold(10000).build()) + .build()); +``` -Qdrant supports [snapshot-based backups](https://qdrant.tech/documentation/concepts/snapshots/), low-level disk operations built for same cluster recovery or local backups. These snapshots: +The following parameters can be updated: -- Require snapshot consistency across nodes. -- Can be hard to port across machines or cloud zones. +* `optimizers_config` - see [optimizer](/documentation/concepts/optimizer/) for details. +* `hnsw_config` - see [indexing](/documentation/concepts/indexing/#vector-index) for details. +* `quantization_config` - see [quantization](/documentation/guides/quantization/#setting-up-quantization-in-qdrant) for details. +* `vectors_config` - vector-specific configuration, including individual `hnsw_config`, `quantization_config` and `on_disk` settings. +* `params` - other collection parameters, including `write_consistency_factor` and `on_disk_payload`. +* `strict_mode_config` - see [strict mode](/documentation/guides/administration/#strict-mode) for details. -On the other hand, the Qdrant Migration Tool: +Full API specification is available in [schema definitions](https://api.qdrant.tech/api-reference/collections/update-collection). -- Streams data in live batches. -- Can resume interrupted migrations. -- Works even when data is being inserted. -- Supports collection reconfiguration (e.g., change replication, and quantization) -- Supports migrating from other vector DBs (Pinecone, Chroma, Weaviate, etc.) +Calls to this endpoint may be blocking as it waits for existing optimizers to +finish. We recommended against using this in a production database as it may +introduce huge overhead due to the rebuilding of the index. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/migration/\#how-to-use-the-qdrant-migration-tool) How to Use the Qdrant Migration Tool +#### Update vector parameters -You can run the tool via Docker. +*Available as of v1.4.0* -Installation: + -```shell -docker pull registry.cloud.qdrant.io/library/qdrant-migration +Qdrant 1.4 adds support for updating more collection parameters at runtime. HNSW +index, quantization and disk configurations can now be changed without +recreating a collection. Segments (with index and quantized data) will +automatically be rebuilt in the background to match updated parameters. -``` +To put vector data on disk for a collection that **does not have** named vectors, +use `""` as name: -Here is an example of how to perform a Qdrant to Qdrant migration: ```bash -docker run --rm -it \ - -e SOURCE_API_KEY='your-source-key' \ - -e TARGET_API_KEY='your-target-key' \ - registry.cloud.qdrant.io/library/qdrant-migration qdrant \ - --source-url 'https://source-instance.cloud.qdrant.io' \ - --source-collection 'benchmark' \ - --target-url 'https://target-instance.cloud.qdrant.io' \ - --target-collection 'benchmark' - +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "": { + "on_disk": true + } + } + }' ``` -## [Anchor](https://qdrant.tech/documentation/database-tutorials/migration/\#example-migrate-from-pinecone-to-qdrant) Example: Migrate from Pinecone to Qdrant - -Let’s now walk through an example of migrating from Pinecone to Qdrant. 
Assuming your Pinecone index looks like this: +```http +PATCH /collections/{collection_name} +{ + "vectors": { + "": { + "on_disk": true + } + } +} +``` -![Pinecone Dashboard showing index details](https://qdrant.tech/documentation/guides/pinecone-index.png) -The information you need from Pinecone is: +To put vector data on disk for a collection that **does have** named vectors: -- Your Pinecone API key -- The index name -- The index host URL +Note: To create a vector name, follow the procedure from our [Points](/documentation/concepts/points/#create-vector-name). -With that information, you can migrate your vector database from Pinecone to Qdrant with the following command: ```bash -docker run --net=host --rm -it registry.cloud.qdrant.io/library/qdrant-migration pinecone \ - --pinecone.index-host 'https://sample-movies-efgjrye.svc.aped-4627-b74a.pinecone.io' \ - --pinecone.index-name 'sample-movies' \ - --pinecone.api-key 'pcsk_7Dh5MW_
' \ - --qdrant.url 'https://5f1a5c6c-7d47-45c3-8d47-d7389b1fad66.eu-west-1-0.aws.cloud.qdrant.io:6334' \ - --qdrant.api-key 'eyJhbGciOiJIUzI1NiIsInR5c
' \ - --qdrant.collection 'sample-movies' \ - --migration.batch-size 64 - +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "my_vector": { + "on_disk": true + } + } + }' ``` -When the migration is complete, you will see the new collection on Qdrant with all the vectors. - -## [Anchor](https://qdrant.tech/documentation/database-tutorials/migration/\#conclusion) Conclusion - -The **Qdrant Migration Tool** makes data transfer across vector database instances effortless. Whether you’re moving between cloud regions, upgrading from self-hosted to Qdrant Cloud, or switching from other databases such as Pinecone, this tool saves you hours of manual effort. [Try it today](https://github.com/qdrant/migration). - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/migration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/migration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-8-lllmstxt|> -## interfaces -- [Documentation](https://qdrant.tech/documentation/) -- API & SDKs - -# [Anchor](https://qdrant.tech/documentation/interfaces/\#interfaces) Interfaces - -Qdrant supports these “official” clients. - -> **Note:** If you are using a language that is not listed here, you can use the REST API directly or generate a client for your language -> using [OpenAPI](https://github.com/qdrant/qdrant/blob/master/docs/redoc/master/openapi.json) -> or [protobuf](https://github.com/qdrant/qdrant/tree/master/lib/api/src/grpc/proto) definitions. 
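
If no official client exists for your language, any HTTP library can talk to the REST API directly. The following is a minimal sketch using Python's `requests` purely as a stand-in for whatever HTTP client you prefer; the `api-key` header is only required if authentication is enabled on your instance.

```python
import requests

QDRANT_URL = "http://localhost:6333"
HEADERS = {"api-key": "<your-api-key>"}  # omit if API-key auth is not enabled

# List all collections via the REST API
response = requests.get(f"{QDRANT_URL}/collections", headers=HEADERS)
response.raise_for_status()
print(response.json()["result"]["collections"])
```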
- -## [Anchor](https://qdrant.tech/documentation/interfaces/\#client-libraries) Client Libraries - -| | Client Repository | Installation | Version | -| --- | --- | --- | --- | -| [![python](https://qdrant.tech/docs/misc/python.webp)](https://python-client.qdrant.tech/) | **[Python](https://github.com/qdrant/qdrant-client)** \+ **[(Client Docs)](https://python-client.qdrant.tech/)** | `pip install qdrant-client[fastembed]` | [Latest Release](https://github.com/qdrant/qdrant-client/releases) | -| ![typescript](https://qdrant.tech/docs/misc/ts.webp) | **[JavaScript / Typescript](https://github.com/qdrant/qdrant-js)** | `npm install @qdrant/js-client-rest` | [Latest Release](https://github.com/qdrant/qdrant-js/releases) | -| ![rust](https://qdrant.tech/docs/misc/rust.png) | **[Rust](https://github.com/qdrant/rust-client)** | `cargo add qdrant-client` | [Latest Release](https://github.com/qdrant/rust-client/releases) | -| ![golang](https://qdrant.tech/docs/misc/go.webp) | **[Go](https://github.com/qdrant/go-client)** | `go get github.com/qdrant/go-client` | [Latest Release](https://github.com/qdrant/go-client/releases) | -| ![.net](https://qdrant.tech/docs/misc/dotnet.webp) | **[.NET](https://github.com/qdrant/qdrant-dotnet)** | `dotnet add package Qdrant.Client` | [Latest Release](https://github.com/qdrant/qdrant-dotnet/releases) | -| ![java](https://qdrant.tech/docs/misc/java.webp) | **[Java](https://github.com/qdrant/java-client)** | [Available on Maven Central](https://central.sonatype.com/artifact/io.qdrant/client) | [Latest Release](https://github.com/qdrant/java-client/releases) | - -## [Anchor](https://qdrant.tech/documentation/interfaces/\#api-reference) API Reference - -All interaction with Qdrant takes place via the REST API. We recommend using REST API if you are using Qdrant for the first time or if you are working on a prototype. +```http +PATCH /collections/{collection_name} +{ + "vectors": { + "my_vector": { + "on_disk": true + } + } +} +``` -| API | Documentation | -| --- | --- | -| REST API | [OpenAPI Specification](https://api.qdrant.tech/api-reference) | -| gRPC API | [gRPC Documentation](https://github.com/qdrant/qdrant/blob/master/docs/grpc/docs.md) | +In the following example the HNSW index and quantization parameters are updated, +both for the whole collection, and for `my_vector` specifically: -### [Anchor](https://qdrant.tech/documentation/interfaces/\#grpc-interface) gRPC Interface -The gRPC methods follow the same principles as REST. For each REST endpoint, there is a corresponding gRPC method. +```python +client.update_collection( + collection_name="{collection_name}", + vectors_config={ + "my_vector": models.VectorParamsDiff( + hnsw_config=models.HnswConfigDiff( + m=32, + ef_construct=123, + ), + quantization_config=models.ProductQuantization( + product=models.ProductQuantizationConfig( + compression=models.CompressionRatio.X32, + always_ram=True, + ), + ), + on_disk=True, + ), + }, + hnsw_config=models.HnswConfigDiff( + ef_construct=123, + ), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.8, + always_ram=False, + ), + ), +) +``` -As per the [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml), the gRPC interface is available on the specified port. 
+```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```yaml -service: - grpc_port: 6334 +var client = new QdrantClient("localhost", 6334); +await client.UpdateCollectionAsync( + collectionName: "{collection_name}", + hnswConfig: new HnswConfigDiff { EfConstruct = 123 }, + vectorsConfig: new VectorParamsDiffMap + { + Map = + { + { + "my_vector", + new VectorParamsDiff + { + HnswConfig = new HnswConfigDiff { M = 3, EfConstruct = 123 } + } + } + } + }, + quantizationConfig: new QuantizationConfigDiff + { + Scalar = new ScalarQuantization + { + Type = QuantizationType.Int8, + Quantile = 0.8f, + AlwaysRam = true + } + } +); ``` -Running the service inside of Docker will look like this: - ```bash -docker run -p 6333:6333 -p 6334:6334 \ - -v $(pwd)/qdrant_storage:/qdrant/storage:z \ - qdrant/qdrant - +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "my_vector": { + "hnsw_config": { + "m": 32, + "ef_construct": 123 + }, + "quantization_config": { + "product": { + "compression": "x32", + "always_ram": true + } + }, + "on_disk": true + } + }, + "hnsw_config": { + "ef_construct": 123 + }, + "quantization_config": { + "scalar": { + "type": "int8", + "quantile": 0.8, + "always_ram": false + } + } +}' ``` -**When to use gRPC:** The choice between gRPC and the REST API is a trade-off between convenience and speed. gRPC is a binary protocol and can be more challenging to debug. We recommend using gRPC if you are already familiar with Qdrant and are trying to optimize the performance of your application. - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/interfaces.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
- -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/interfaces.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-9-lllmstxt|> -## single-node-speed-benchmark-2022 -# Single node benchmarks (2022) - -August 23, 2022 - -Dataset:deep-image-96-angulargist-960-euclideanglove-100-angular +```go +import ( + "context" -Search threads:1008421 + "github.com/qdrant/go-client/qdrant" +) -Plot values: +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -RPS - -Latency - -p95 latency - -Index time +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfigDiffMap( + map[string]*qdrant.VectorParamsDiff{ + "my_vector": { + HnswConfig: &qdrant.HnswConfigDiff{ + M: qdrant.PtrOf(uint64(3)), + EfConstruct: qdrant.PtrOf(uint64(123)), + }, + }, + }), + QuantizationConfig: qdrant.NewQuantizationDiffScalar( + &qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + Quantile: qdrant.PtrOf(float32(0.8)), + AlwaysRam: qdrant.PtrOf(true), + }), +}) +``` -| Engine | Setup | Dataset | Upload Time(m) | Upload + Index Time(m) | Latency(ms) | P95(ms) | P99(ms) | RPS | Precision | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| qdrant | qdrant-rps-m-64-ef-512 | deep-image-96-angular | 14.096 | 149.32 | 24.73 | 55.75 | 63.73 | 1541.86 | 0.96 | -| weaviate | weaviate-m-16-ef-128 | deep-image-96-angular | 148.70 | 148.70 | 190.94 | 351.75 | 414.16 | 507.33 | 0.94 | -| milvus | milvus-m-16-ef-128 | deep-image-96-angular | 6.074 | 35.28 | 171.50 | 220.26 | 236.97 | 339.44 | 0.97 | -| elastic | elastic-m-16-ef-128 | deep-image-96-angular | 87.54 | 101.16 | 923.031 | 1116.83 | 1671.31 | 95.90 | 0.97 | +```http +PATCH /collections/{collection_name} +{ + "vectors": { + "my_vector": { + "hnsw_config": { + "m": 32, + "ef_construct": 123 + }, + "quantization_config": { + "product": { + "compression": "x32", + "always_ram": true + } + }, + "on_disk": true + } + }, + "hnsw_config": { + "ef_construct": 123 + }, + "quantization_config": { + "scalar": { + "type": "int8", + "quantile": 0.8, + "always_ram": false + } + } +} +``` -_Download raw data: [here](https://qdrant.tech/benchmarks/result-2022-08-10.json)_ +```typescript +client.updateCollection("{collection_name}", { + vectors: { + my_vector: { + hnsw_config: { + m: 32, + ef_construct: 123, + }, + quantization_config: { + product: { + compression: "x32", + always_ram: true, + }, + }, + on_disk: true, + }, + }, + hnsw_config: { + ef_construct: 123, + }, + quantization_config: { + scalar: { + type: "int8", + quantile: 0.8, + always_ram: true, + }, + }, +}); +``` -This is an archived version of Single node benchmarks. Please refer to the new version [here](https://qdrant.tech/benchmarks/single-node-speed-benchmark/). 
+```rust +use std::collections::HashMap; -Share this article +use qdrant_client::qdrant::{ + quantization_config_diff::Quantization, vectors_config_diff::Config, HnswConfigDiffBuilder, + QuantizationType, ScalarQuantizationBuilder, UpdateCollectionBuilder, VectorParamsDiffBuilder, + VectorParamsDiffMap, +}; -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fsingle-node-speed-benchmark-2022%2F&text=Single%20node%20benchmarks%20%282022%29 "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fsingle-node-speed-benchmark-2022%2F "LinkedIn") +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}") + .hnsw_config(HnswConfigDiffBuilder::default().ef_construct(123)) + .vectors_config(Config::ParamsMap(VectorParamsDiffMap { + map: HashMap::from([( + ("my_vector".into()), + VectorParamsDiffBuilder::default() + .hnsw_config(HnswConfigDiffBuilder::default().m(32).ef_construct(123)) + .build(), + )]), + })) + .quantization_config(Quantization::Scalar( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .quantile(0.8) + .always_ram(true) + .build(), + )), + ) + .await?; +``` -Up! +```java +import io.qdrant.client.grpc.Collections.HnswConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; +import io.qdrant.client.grpc.Collections.UpdateCollection; +import io.qdrant.client.grpc.Collections.VectorParamsDiff; +import io.qdrant.client.grpc.Collections.VectorParamsDiffMap; +import io.qdrant.client.grpc.Collections.VectorsConfigDiff; -<|page-10-lllmstxt|> -## using-multivector-representations -- [Documentation](https://qdrant.tech/documentation/) -- [Advanced tutorials](https://qdrant.tech/documentation/advanced-tutorials/) -- How to Use Multivector Representations with Qdrant Effectively +client + .updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setHnswConfig(HnswConfigDiff.newBuilder().setEfConstruct(123).build()) + .setVectorsConfig( + VectorsConfigDiff.newBuilder() + .setParamsMap( + VectorParamsDiffMap.newBuilder() + .putMap( + "my_vector", + VectorParamsDiff.newBuilder() + .setHnswConfig( + HnswConfigDiff.newBuilder() + .setM(3) + .setEfConstruct(123) + .build()) + .build()))) + .setQuantizationConfig( + QuantizationConfigDiff.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setQuantile(0.8f) + .setAlwaysRam(true) + .build())) + .build()) + .get(); +``` -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#how-to-effectively-use-multivector-representations-in-qdrant-for-reranking) How to Effectively Use Multivector Representations in Qdrant for Reranking +## Collection info -Multivector Representations are one of the most powerful features of Qdrant. However, most people don’t use them effectively, resulting in massive RAM overhead, slow inserts, and wasted compute. +Qdrant allows determining the configuration parameters of an existing collection to better understand how the points are +distributed and indexed. -In this tutorial, you’ll discover how to effectively use multivector representations in Qdrant. 
+```python +client.get_collection(collection_name="{collection_name}") +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#what-are-multivector-representations) What are Multivector Representations? +```csharp +await client.GetCollectionInfoAsync("{collection_name}"); +``` -In most vector engines, each document is represented by a single vector - an approach that works well for short texts but often struggles with longer documents. Single vector representations perform pooling of the token-level embeddings, which obviously leads to losing some information. +```bash +curl -X GET http://localhost:6333/collections/{collection_name} +``` -Multivector representations offer a more fine-grained alternative where a single document is represented using multiple vectors, often at the token or phrase level. This enables more precise matching between specific query terms and relevant parts of the document. Matching is especially effective in Late Interaction models like [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/), which retain token-level embeddings and perform interaction during query time leading to relevance scoring. +```go +import "context" -![Multivector Representations](https://qdrant.tech/documentation/advanced-tutorials/multivectors.png) +client.GetCollectionInfo(context.Background(), "{collection_name}") +``` -As you will see later in the tutorial, Qdrant supports multivectors and thus late interaction models natively. +```http +GET /collections/{collection_name} +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#why-token-level-vectors-are-useful) Why Token-level Vectors are Useful +```typescript +client.getCollection("{collection_name}"); +``` -With token-level vectors, models like ColBERT can match specific query tokens to the most relevant parts of a document, enabling high-accuracy retrieval through Late Interaction. +```rust +client.collection_info("{collection_name}").await?; +``` -In late interaction, each document is converted into multiple token-level vectors instead of a single vector. The query is also tokenized and embedded into various vectors. Then, the query and document vectors are matched using a similarity function: MaxSim. You can see how it is calculated [here](https://qdrant.tech/documentation/concepts/vectors/#multivectors). +```java +client.getCollectionInfoAsync("{collection_name}").get(); +``` -In traditional retrieval, the query and document are converted into single embeddings, after which similarity is computed. This is an early interaction because the information is compressed before retrieval. +
+Expected result -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#what-is-rescoring-and-why-is-it-used) What is Rescoring, and Why is it Used? +```json +{ + "result": { + "status": "green", + "optimizer_status": "ok", + "vectors_count": 1068786, + "indexed_vectors_count": 1024232, + "points_count": 1068786, + "segments_count": 31, + "config": { + "params": { + "vectors": { + "size": 384, + "distance": "Cosine" + }, + "shard_number": 1, + "replication_factor": 1, + "write_consistency_factor": 1, + "on_disk_payload": false + }, + "hnsw_config": { + "m": 16, + "ef_construct": 100, + "full_scan_threshold": 10000, + "max_indexing_threads": 0 + }, + "optimizer_config": { + "deleted_threshold": 0.2, + "vacuum_min_vector_number": 1000, + "default_segment_number": 0, + "max_segment_size": null, + "memmap_threshold": null, + "indexing_threshold": 20000, + "flush_interval_sec": 5, + "max_optimization_threads": 1 + }, + "wal_config": { + "wal_capacity_mb": 32, + "wal_segments_ahead": 0 + } + }, + "payload_schema": {} + }, + "status": "ok", + "time": 0.00010143 +} +``` -Rescoring is two-fold: +
-- Retrieve relevant documents using a fast model. -- Rerank them using a more accurate but slower model such as ColBERT. +If you insert the vectors into the collection, the `status` field may become +`yellow` whilst it is optimizing. It will become `green` once all the points are +successfully processed. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#why-indexing-every-vector-by-default-is-a-problem) Why Indexing Every Vector by Default is a Problem +The following color statuses are possible: -In multivector representations (such as those used by Late Interaction models like ColBERT), a single logical document results in hundreds of token-level vectors. Indexing each of these vectors individually with HNSW in Qdrant can lead to: +- 🟱 `green`: collection is ready +- 🟡 `yellow`: collection is optimizing +- ⚫ `grey`: collection is pending optimization ([help](#grey-collection-status)) +- 🔮 `red`: an error occurred which the engine could not recover from -- High RAM usage -- Slow insert times due to the complexity of maintaining the HNSW graph +### Grey collection status -However, because multivectors are typically used in the reranking stage (after a first-pass retrieval using dense vectors), there’s often no need to index these token-level vectors with HNSW. +_Available as of v1.9.0_ -Instead, they can be stored as multi-vector fields (without HNSW indexing) and used at query-time for reranking, which reduces resource overhead and improves performance. +A collection may have the grey ⚫ status or show "optimizations pending, +awaiting update operation" as optimization status. This state is normally caused +by restarting a Qdrant instance while optimizations were ongoing. -For more on this, check out Qdrant’s detailed breakdown in our [Scaling PDF Retrieval with Qdrant tutorial](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/#math-behind-the-scaling). +It means the collection has optimizations pending, but they are paused. You must +send any update operation to trigger and start the optimizations again. -With Qdrant, you have full control of how indexing works. You can disable indexing by setting the HNSW `m` parameter to `0`: +For example: ```python -from qdrant_client import QdrantClient, models - -client = QdrantClient("http://localhost:6333") -collection_name = "dense_multivector_demo" -client.create_collection( - collection_name=collection_name, - vectors_config={ - "dense": models.VectorParams( - size=384, - distance=models.Distance.COSINE - # Leave HNSW indexing ON for dense - ), - "colbert": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking - ) - } +client.update_collection( + collection_name="{collection_name}", + optimizer_config=models.OptimizersConfigDiff(), ) - ``` -By disabling HNSW on multivectors, you: - -- Save compute. -- Reduce memory usage. -- Speed up vector uploads. - -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#how-to-generate-multivectors-using-fastembed) How to Generate Multivectors Using FastEmbed +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Let’s demonstrate how to effectively use multivectors using [FastEmbed](https://github.com/qdrant/fastembed), which wraps ColBERT into a simple API. 
+var client = new QdrantClient("localhost", 6334); -Install FastEmbed and Qdrant: +await client.UpdateCollectionAsync( + collectionName: "{collection_name}", + optimizersConfig: new OptimizersConfigDiff { } +); +``` ```bash -pip install qdrant-client[fastembed]>=1.14.2 - +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "optimizers_config": {} + }' ``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#step-by-step-colbert--qdrant-setup) Step-by-Step: ColBERT + Qdrant Setup - -Ensure that Qdrant is running and create a client: +```go +import ( + "context" -```python -from qdrant_client import QdrantClient, models + "github.com/qdrant/go-client/qdrant" +) -# 1. Connect to Qdrant server -client = QdrantClient("http://localhost:6333") +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + OptimizersConfig: &qdrant.OptimizersConfigDiff{}, +}) ``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#1-encode-documents) 1\. Encode Documents - -Next, encode your documents: - -```python -from fastembed import TextEmbedding, LateInteractionTextEmbedding -# Example documents and query -documents = [\ - "Artificial intelligence is used in hospitals for cancer diagnosis and treatment.",\ - "Self-driving cars use AI to detect obstacles and make driving decisions.",\ - "AI is transforming customer service through chatbots and automation.",\ - # ...\ -] -query_text = "How does AI help in medicine?" - -dense_documents = [\ - models.Document(text=doc, model="BAAI/bge-small-en")\ - for doc in documents\ -] -dense_query = models.Document(text=query_text, model="BAAI/bge-small-en") - -colbert_documents = [\ - models.Document(text=doc, model="colbert-ir/colbertv2.0")\ - for doc in documents\ -] -colbert_query = models.Document(text=query_text, model="colbert-ir/colbertv2.0") - +```http +PATCH /collections/{collection_name} +{ + "optimizers_config": {} +} ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#2-create-a-qdrant-collection) 2\. Create a Qdrant collection - -Then create a Qdrant collection with both vector types. Note that we leave indexing on for the `dense` vector but turn it off for the `colbert` vector that will be used for reranking. - -```python -collection_name = "dense_multivector_demo" -client.create_collection( - collection_name=collection_name, - vectors_config={ - "dense": models.VectorParams( - size=384, - distance=models.Distance.COSINE - # Leave HNSW indexing ON for dense - ), - "colbert": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking - ) - } -) - +```typescript +client.updateCollection("{collection_name}", { + optimizers_config: {}, +}); ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#3-upload-documents-dense--multivector) 3\. 
Upload Documents (Dense + Multivector) - -Now upload the vectors: - -```python -points = [\ - models.PointStruct(\ - id=i,\ - vector={\ - "dense": dense_documents[i],\ - "colbert": colbert_documents[i]\ - },\ - payload={"text": documents[i]}\ - ) for i in range(len(documents))\ -] -client.upsert(collection_name="dense_multivector_demo", points=points) +```rust +use qdrant_client::qdrant::{OptimizersConfigDiffBuilder, UpdateCollectionBuilder}; +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}") + .optimizers_config(OptimizersConfigDiffBuilder::default()), + ) + .await?; ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#query-with-retrieval--reranking-in-one-call) Query with Retrieval + Reranking in One Call - -Now let’s run a search: - -```python -results = client.query_points( - collection_name="dense_multivector_demo", - prefetch=models.Prefetch( - query=dense_query, - using="dense", - ), - query=colbert_query, - using="colbert", - limit=3, - with_payload=True -) +```java +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.UpdateCollection; +client.updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setOptimizersConfig( + OptimizersConfigDiff.getDefaultInstance()) + .build()); ``` -- The dense vector retrieves the top candidates quickly. -- The Colbert multivector reranks them using token-level `MaxSim` with fine-grained precision. -- Returns the top 3 results. +Alternatively you may use the `Trigger Optimizers` button in the [Qdrant Web UI](/documentation/web-ui/). +It is shown next to the grey collection status on the collection info page. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/\#conclusion) Conclusion +### Approximate point and vector counts -Multivector search is one of the most powerful features of a vector database when used correctly. With this functionality in Qdrant, you can: +You may be interested in the count attributes: -- Store token-level embeddings natively. -- Disable indexing to reduce overhead. -- Run fast retrieval and accurate reranking in one API call. -- Efficiently scale late interaction. +- `points_count` - total number of objects (vectors and their payloads) stored in the collection +- `vectors_count` - total number of vectors in a collection, useful if you have multiple vectors per point +- `indexed_vectors_count` - total number of vectors stored in the HNSW or sparse index. Qdrant does not store all the vectors in the index, but only if an index segment might be created for a given configuration. -Combining FastEmbed and Qdrant leads to a production-ready pipeline for ColBERT-style reranking without wasting resources. You can do this locally or use Qdrant Cloud. Qdrant offers an easy-to-use API to get started with your search engine, so if you’re ready to dive in, sign up for free at [Qdrant Cloud](https://qdrant.tech/cloud/) and start building. +The above counts are not exact, but should be considered approximate. Depending +on how you use Qdrant these may give very different numbers than what you may +expect. It's therefore important **not** to rely on them. -##### Was this page useful? +More specifically, these numbers represent the count of points and vectors in +Qdrant's internal storage. Internally, Qdrant may temporarily duplicate points +as part of automatic optimizations. 
It may keep changed or deleted points for a +bit. And it may delay indexing of new points. All of that is for optimization +reasons. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Updates you do are therefore not directly reflected in these numbers. If you see +a wildly different count of points, it will likely resolve itself once a new +round of automatic optimizations has completed. -Thank you for your feedback! 🙏 +To clarify: these numbers don't represent the exact amount of points or vectors +you have inserted, nor does it represent the exact number of distinguishable +points or vectors you can query. If you want to know exact counts, refer to the +[count API](/documentation/concepts/points/#counting-points). -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/using-multivector-representations.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +_Note: these numbers may be removed in a future version of Qdrant._ -On this page: +### Indexing vectors in HNSW -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/using-multivector-representations.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +In some cases, you might be surprised the value of `indexed_vectors_count` is lower than `vectors_count`. This is an intended behaviour and +depends on the [optimizer configuration](/documentation/concepts/optimizer/). A new index segment is built if the size of non-indexed vectors is higher than the +value of `indexing_threshold`(in kB). If your collection is very small or the dimensionality of the vectors is low, there might be no HNSW segment +created and `indexed_vectors_count` might be equal to `0`. -× +It is possible to reduce the `indexing_threshold` for an existing collection by [updating collection parameters](#update-collection-parameters). -[Powered by](https://qdrant.tech/) +## Collection aliases -<|page-11-lllmstxt|> -## cloud-api -- [Documentation](https://qdrant.tech/documentation/) -- Qdrant Cloud API +In a production environment, it is sometimes necessary to switch different versions of vectors seamlessly. +For example, when upgrading to a new version of the neural network. -# [Anchor](https://qdrant.tech/documentation/cloud-api/\#qdrant-cloud-api-powerful-grpc-and-flexible-restjson-interfaces) Qdrant Cloud API: Powerful gRPC and Flexible REST/JSON Interfaces +There is no way to stop the service and rebuild the collection with new vectors in these situations. +Aliases are additional names for existing collections. +All queries to the collection can also be done identically, using an alias instead of the collection name. -**Note:** This is not the Qdrant REST or gPRC API of the database itself. For database APIs & SDKs, see our list of [interfaces](https://qdrant.tech/documentation/interfaces/) +Thus, it is possible to build a second collection in the background and then switch alias from the old to the new collection. +Since all changes of aliases happen atomically, no concurrent requests will be affected during the switch. -## [Anchor](https://qdrant.tech/documentation/cloud-api/\#introduction) Introduction +### Create alias -The Qdrant Cloud API lets you automate the Qdrant Cloud platform. 
You can use this API to manage your accounts, clusters, backup schedules, authentication methods, hybrid cloud environments, and more. +```python +client.update_collection_aliases( + change_aliases_operations=[ + models.CreateAliasOperation( + create_alias=models.CreateAlias( + collection_name="example_collection", alias_name="production_collection" + ) + ) + ] +) +``` -To cater to diverse integration needs, the Qdrant Cloud API offers two primary interaction models: +```csharp +await client.CreateAliasAsync(aliasName: "production_collection", collectionName: "example_collection"); +``` -- **gRPC API**: For high-performance, low-latency, and type-safe communication. This is the recommended way for backend services and applications requiring maximum efficiency. The API is defined using Protocol Buffers. -- **REST/JSON API**: A conventional HTTP/1.1 (and HTTP/2) interface with JSON payloads. This API is provided via a gRPC Gateway, translating RESTful calls into gRPC messages, offering ease of use for web clients, scripts, and broader tool compatibility. +```bash +curl -X POST http://localhost:6333/collections/aliases \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "actions": [ + { + "create_alias": { + "collection_name": "example_collection", + "alias_name": "production_collection" + } + } + ] +}' +``` -You can find the API definitions and generated client libraries in our Qdrant Cloud Public API [GitHub repository](https://github.com/qdrant/qdrant-cloud-public-api). -**Note:** The API is splitted into multiple services to make it easier to use. +```go +import "context" -### [Anchor](https://qdrant.tech/documentation/cloud-api/\#qdrant-cloud-api-endpoints) Qdrant Cloud API Endpoints +client.CreateAlias(context.Background(), "production_collection", "example_collection") +``` -- **gRPC Endpoint**: grpc.cloud.qdrant.io:443 -- **REST/JSON Endpoint**: [https://api.cloud.qdrant.io](https://api.cloud.qdrant.io/) +```http +POST /collections/aliases +{ + "actions": [ + { + "create_alias": { + "collection_name": "example_collection", + "alias_name": "production_collection" + } + } + ] +} +``` -### [Anchor](https://qdrant.tech/documentation/cloud-api/\#authentication) Authentication +```typescript +client.updateCollectionAliases({ + actions: [ + { + create_alias: { + collection_name: "example_collection", + alias_name: "production_collection", + }, + }, + ], +}); +``` -Most of the Qdrant Cloud API requests must be authenticated. Authentication is handled via API keys (so called management keys), which should be passed in the Authorization header. -**Management Keys**: `Authorization: apikey ` +```rust +use qdrant_client::qdrant::CreateAliasBuilder; -Replace with the actual API key obtained from your Qdrant Cloud dashboard or generated programmatically. +client + .create_alias(CreateAliasBuilder::new( + "example_collection", + "production_collection", + )) + .await?; +``` -You can create a management key in the Cloud Console UI. Go to **Access Management** \> **Cloud Management Keys**. -![Authentication](https://qdrant.tech/documentation/cloud/authentication.png) +```java +client.createAliasAsync("production_collection", "example_collection").get(); +``` -**Note:** Ensure that the API key is kept secure and not exposed in public repositories or logs. Once authenticated, the API allows you to manage clusters, backup schedules, and perform other operations available to your account. 
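
Because an alias can be used wherever a collection name is expected, readers can keep querying the `production_collection` alias created above while the underlying collection is swapped behind the scenes. A minimal sketch with the Python client (the query vector is a placeholder and must match the collection's configured dimensionality):

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

# Query through the alias exactly as if it were a collection name
hits = client.query_points(
    collection_name="production_collection",
    query=[0.2, 0.1, 0.9, 0.7],  # placeholder vector
    limit=3,
)
```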
+### Remove alias -### [Anchor](https://qdrant.tech/documentation/cloud-api/\#samples) Samples +```python +client.update_collection_aliases( + change_aliases_operations=[ + models.DeleteAliasOperation( + delete_alias=models.DeleteAlias(alias_name="production_collection") + ), + ] +) +``` -For samples on how to use the API, with a tool like grpcurl, curl or any of the provided SDKs, please see the [Qdrant Cloud Public API](https://github.com/qdrant/qdrant-cloud-public-api) repository. +```csharp +await client.DeleteAliasAsync("production_collection"); +``` -## [Anchor](https://qdrant.tech/documentation/cloud-api/\#terraform-provider) Terraform Provider +```bash +curl -X POST http://localhost:6333/collections/aliases \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "actions": [ + { + "delete_alias": { + "alias_name": "production_collection" + } + } + ] +}' +``` -Qdrant Cloud also provides a Terraform provider to manage your Qdrant Cloud resources. [Learn more](https://qdrant.tech/documentation/infrastructure/terraform/). +```go +import "context" -## [Anchor](https://qdrant.tech/documentation/cloud-api/\#deprecated-openapi-specification) Deprecated OpenAPI specification +client.DeleteAlias(context.Background(), "production_collection") +``` -We still support our deprecated OpenAPI endpoint, but this is scheduled to be removed later this year (November 1st, 2025). -We do _NOT_ recommend to use this endpoint anymore and use the replacement as described above. +```http +POST /collections/aliases +{ + "actions": [ + { + "delete_alias": { + "alias_name": "production_collection" + } + } + ] +} +``` -| REST API | Documentation | -| --- | --- | -| v.0.1.0 | [OpenAPI Specification](https://cloud.qdrant.io/pa/v1/docs) | +```typescript +client.updateCollectionAliases({ + actions: [ + { + delete_alias: { + alias_name: "production_collection", + }, + }, + ], +}); +``` + +```rust +client.delete_alias("production_collection").await?; +``` + +```java +client.deleteAliasAsync("production_collection").get(); +``` + +### Switch collection + +Multiple alias actions are performed atomically. 
+For example, you can switch underlying collection with the following command: + +```python +client.update_collection_aliases( + change_aliases_operations=[ + models.DeleteAliasOperation( + delete_alias=models.DeleteAlias(alias_name="production_collection") + ), + models.CreateAliasOperation( + create_alias=models.CreateAlias( + collection_name="example_collection", alias_name="production_collection" + ) + ), + ] +) +``` + +```csharp +await client.DeleteAliasAsync("production_collection"); +await client.CreateAliasAsync(aliasName: "production_collection", collectionName: "example_collection"); +``` + +```bash +curl -X POST http://localhost:6333/collections/aliases \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "actions": [ + { + "delete_alias": { + "alias_name": "production_collection" + } + }, + { + "create_alias": { + "collection_name": "example_collection", + "alias_name": "production_collection" + } + } + ] +}' +``` + +```go +import "context" + +client.DeleteAlias(context.Background(), "production_collection") +client.CreateAlias(context.Background(), "production_collection", "example_collection") +``` + +```http +POST /collections/aliases +{ + "actions": [ + { + "delete_alias": { + "alias_name": "production_collection" + } + }, + { + "create_alias": { + "collection_name": "example_collection", + "alias_name": "production_collection" + } + } + ] +} +``` + +```typescript +client.updateCollectionAliases({ + actions: [ + { + delete_alias: { + alias_name: "production_collection", + }, + }, + { + create_alias: { + collection_name: "example_collection", + alias_name: "production_collection", + }, + }, + ], +}); +``` + +```rust +use qdrant_client::qdrant::CreateAliasBuilder; + +client.delete_alias("production_collection").await?; +client + .create_alias(CreateAliasBuilder::new( + "example_collection", + "production_collection", + )) + .await?; +``` + +```java +client.deleteAliasAsync("production_collection").get(); +client.createAliasAsync("production_collection", "example_collection").get(); +``` + +### List collection aliases + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.get_collection_aliases(collection_name="{collection_name}") +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.ListCollectionAliasesAsync("{collection_name}"); +``` + +```bash +curl -X GET http://localhost:6333/collections/{collection_name}/aliases +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.ListCollectionAliases(context.Background(), "{collection_name}") +``` + +```http +GET /collections/{collection_name}/aliases +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.getCollectionAliases("{collection_name}"); +``` + +```rust +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.list_collection_aliases("{collection_name}").await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.listCollectionAliasesAsync("{collection_name}").get(); +``` + +### List all aliases + +```python +from qdrant_client import QdrantClient + +client = 
QdrantClient(url="http://localhost:6333") + +client.get_aliases() +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.ListAliasesAsync(); +``` + +```bash +curl -X GET http://localhost:6333/aliases +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.ListAliases(context.Background()) +``` + +```http +GET /aliases +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.getAliases(); +``` + +```rust +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.list_aliases().await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.listAliasesAsync().get(); +``` + +### List all collections + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.get_collections() +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.ListCollectionsAsync(); +``` + +```bash +curl -X GET http://localhost:6333/collections +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.ListCollections(context.Background()) +``` + +```http +GET /collections +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.getCollections(); +``` + +```rust +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.list_collections().await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.listCollectionsAsync().get(); +``` + +<|page-3-lllmstxt|> +# Points + +The points are the central entity that Qdrant operates with. +A point is a record consisting of a [vector](/documentation/concepts/vectors/) and an optional [payload](/documentation/concepts/payload/). + +It looks like this: + +```json +// This is a simple point +{ + "id": 129, + "vector": [0.1, 0.2, 0.3, 0.4], + "payload": {"color": "red"}, +} +``` + +You can search among the points grouped in one [collection](/documentation/concepts/collections/) based on vector similarity. +This procedure is described in more detail in the [search](/documentation/concepts/search/) and [filtering](/documentation/concepts/filtering/) sections. + +This section explains how to create and manage vectors. + +Any point modification operation is asynchronous and takes place in 2 steps. +At the first stage, the operation is written to the Write-ahead-log. + +After this moment, the service will not lose the data, even if the machine loses power supply. + + +## Point IDs + +Qdrant supports using both `64-bit unsigned integers` and `UUID` as identifiers for points. 
+ +Examples of UUID string representations: + +- simple: `936DA01F9ABD4d9d80C702AF85C822A8` +- hyphenated: `550e8400-e29b-41d4-a716-446655440000` +- urn: `urn:uuid:F9168C5E-CEB2-4faa-B6BF-329BF39FA1E4` + +That means that in every request UUID string could be used instead of numerical id. +Example: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id="5c56c793-69f3-4fbf-87e6-c4bf54c28c26", + payload={ + "color": "red", + }, + vector=[0.9, 0.1, 0.1], + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = Guid.Parse("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"), + Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, + Payload = { ["color"] = "Red" } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewID("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"), + Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), + Payload: qdrant.NewValueMap(map[string]any{"color": "Red"}), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": "5c56c793-69f3-4fbf-87e6-c4bf54c28c26", + "payload": {"color": "red"}, + "vector": [0.9, 0.1, 0.1] + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.upsert("{collection_name}", { + points: [ + { + id: "5c56c793-69f3-4fbf-87e6-c4bf54c28c26", + payload: { + color: "red", + }, + vector: [0.9, 0.1, 0.1], + }, + ], +}); +``` + +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![PointStruct::new( + "5c56c793-69f3-4fbf-87e6-c4bf54c28c26", + vec![0.9, 0.1, 0.1], + [("color", "Red".into())], + )], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(UUID.fromString("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"))) + .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) + .putAllPayload(Map.of("color", value("Red"))) + .build())) + .get(); +``` + +and + +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={ + "color": "red", + }, + vector=[0.9, 0.1, 0.1], + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + 
collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, + Payload = { ["color"] = "Red" } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), + Payload: qdrant.NewValueMap(map[string]any{"color": "Red"}), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "payload": {"color": "red"}, + "vector": [0.9, 0.1, 0.1] + } + ] +} +``` + +```typescript +client.upsert("{collection_name}", { + points: [ + { + id: 1, + payload: { + color: "red", + }, + vector: [0.9, 0.1, 0.1], + }, + ], +}); +``` + +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![PointStruct::new( + 1, + vec![0.9, 0.1, 0.1], + [("color", "Red".into())], + )], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) + .putAllPayload(Map.of("color", value("Red"))) + .build())) + .get(); +``` + +are both possible. + +## Vectors + +Each point in qdrant may have one or more vectors. +Vectors are the central component of the Qdrant architecture, +qdrant relies on different types of vectors to provide different types of data exploration and search. + +Here is a list of supported vector types: + +||| +|-|-| +| Dense Vectors | A regular vectors, generated by majority of the embedding models. | +| Sparse Vectors | Vectors with no fixed length, but only a few non-zero elements.
Useful for exact token match and collaborative filtering recommendations. | +| MultiVectors | Matrices of numbers with fixed length but variable height.
Usually obtained from late interaction models like ColBERT. | + +It is possible to attach more than one type of vector to a single point. +In Qdrant we call these Named Vectors. + +Read more about vector types, how they are stored and optimized in the [vectors](/documentation/concepts/vectors/) section. + + +## Upload points + +To optimize performance, Qdrant supports batch loading of points. I.e., you can load several points into the service in one API call. +Batching allows you to minimize the overhead of creating a network connection. + +The Qdrant API supports two ways of creating batches - record-oriented and column-oriented. +Internally, these options do not differ and are made only for the convenience of interaction. + +Create points with batch: + +```python +client.upsert( + collection_name="{collection_name}", + points=models.Batch( + ids=[1, 2, 3], + payloads=[ + {"color": "red"}, + {"color": "green"}, + {"color": "blue"}, + ], + vectors=[ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9], + ], + ), +) +``` + +```http +PUT /collections/{collection_name}/points +{ + "batch": { + "ids": [1, 2, 3], + "payloads": [ + {"color": "red"}, + {"color": "green"}, + {"color": "blue"} + ], + "vectors": [ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9] + ] + } +} +``` + +```typescript +client.upsert("{collection_name}", { + batch: { + ids: [1, 2, 3], + payloads: [{ color: "red" }, { color: "green" }, { color: "blue" }], + vectors: [ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9], + ], + }, +}); +``` + +or record-oriented equivalent: + +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={ + "color": "red", + }, + vector=[0.9, 0.1, 0.1], + ), + models.PointStruct( + id=2, + payload={ + "color": "green", + }, + vector=[0.1, 0.9, 0.1], + ), + models.PointStruct( + id=3, + payload={ + "color": "blue", + }, + vector=[0.1, 0.1, 0.9], + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new[] { 0.9f, 0.1f, 0.1f }, + Payload = { ["color"] = "red" } + }, + new() + { + Id = 2, + Vectors = new[] { 0.1f, 0.9f, 0.1f }, + Payload = { ["color"] = "green" } + }, + new() + { + Id = 3, + Vectors = new[] { 0.1f, 0.1f, 0.9f }, + Payload = { ["color"] = "blue" } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"color": "red"}), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"color": "green"}), + }, + { + Id: qdrant.NewIDNum(3), + Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), + Payload: qdrant.NewValueMap(map[string]any{"color": "blue"}), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "payload": {"color": "red"}, + "vector": [0.9, 0.1, 0.1] + }, + { + "id": 2, + "payload": {"color": "green"}, + "vector": [0.1, 0.9, 0.1] + }, + { + "id": 3, + "payload": {"color": "blue"}, + "vector": [0.1, 0.1, 0.9] + } + ] +} +``` + 
+```typescript +client.upsert("{collection_name}", { + points: [ + { + id: 1, + payload: { color: "red" }, + vector: [0.9, 0.1, 0.1], + }, + { + id: 2, + payload: { color: "green" }, + vector: [0.1, 0.9, 0.1], + }, + { + id: 3, + payload: { color: "blue" }, + vector: [0.1, 0.1, 0.9], + }, + ], +}); +``` + +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![ + PointStruct::new(1, vec![0.9, 0.1, 0.1], [("city", "red".into())]), + PointStruct::new(2, vec![0.1, 0.9, 0.1], [("city", "green".into())]), + PointStruct::new(3, vec![0.1, 0.1, 0.9], [("city", "blue".into())]), + ], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.9f, 0.1f, 0.1f)) + .putAllPayload(Map.of("color", value("red"))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors(vectors(0.1f, 0.9f, 0.1f)) + .putAllPayload(Map.of("color", value("green"))) + .build(), + PointStruct.newBuilder() + .setId(id(3)) + .setVectors(vectors(0.1f, 0.1f, 0.9f)) + .putAllPayload(Map.of("color", value("blue"))) + .build())) + .get(); +``` + +The Python client has additional features for loading points, which include: + +- Parallelization +- A retry mechanism +- Lazy batching support + +For example, you can read your data directly from hard drives, to avoid storing all data in RAM. You can use these +features with the `upload_collection` and `upload_points` methods. +Similar to the basic upsert API, these methods support both record-oriented and column-oriented formats. + + + +Column-oriented format: + +```python +client.upload_collection( + collection_name="{collection_name}", + ids=[1, 2], + payload=[ + {"color": "red"}, + {"color": "green"}, + ], + vectors=[ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + ], + parallel=4, + max_retries=3, +) +``` + + + +Record-oriented format: + +```python +client.upload_points( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={ + "color": "red", + }, + vector=[0.9, 0.1, 0.1], + ), + models.PointStruct( + id=2, + payload={ + "color": "green", + }, + vector=[0.1, 0.9, 0.1], + ), + ], + parallel=4, + max_retries=3, +) +``` + +All APIs in Qdrant, including point loading, are idempotent. +It means that executing the same method several times in a row is equivalent to a single execution. + +In this case, it means that points with the same id will be overwritten when re-uploaded. + +Idempotence property is useful if you use, for example, a message queue that doesn't provide an exactly-ones guarantee. +Even with such a system, Qdrant ensures data consistency. 
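+
+As an illustration of this property, the sketch below (assuming the collection from the examples above already exists) runs the same upsert twice; the second call simply overwrites the point, and the collection still contains a single point with `id=1`:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+point = models.PointStruct(
+    id=1,
+    vector=[0.9, 0.1, 0.1],
+    payload={"color": "red"},
+)
+
+# Executing the same upsert twice is equivalent to executing it once:
+# the point with id=1 is simply overwritten by the second call.
+client.upsert(collection_name="{collection_name}", points=[point])
+client.upsert(collection_name="{collection_name}", points=[point])
+```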
+ +[_Available as of v0.10.0_](#create-vector-name) + +If the collection was created with multiple vectors, each vector data can be provided using the vector's name: + +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + vector={ + "image": [0.9, 0.1, 0.1, 0.2], + "text": [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2], + }, + ), + models.PointStruct( + id=2, + vector={ + "image": [0.2, 0.1, 0.3, 0.9], + "text": [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9], + }, + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new Dictionary + { + ["image"] = [0.9f, 0.1f, 0.1f, 0.2f], + ["text"] = [0.4f, 0.7f, 0.1f, 0.8f, 0.1f, 0.1f, 0.9f, 0.2f] + } + }, + new() + { + Id = 2, + Vectors = new Dictionary + { + ["image"] = [0.2f, 0.1f, 0.3f, 0.9f], + ["text"] = [0.5f, 0.2f, 0.7f, 0.4f, 0.7f, 0.2f, 0.3f, 0.9f] + } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "image": qdrant.NewVector(0.9, 0.1, 0.1, 0.2), + "text": qdrant.NewVector(0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2), + }), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "image": qdrant.NewVector(0.2, 0.1, 0.3, 0.9), + "text": qdrant.NewVector(0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9), + }), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": { + "image": [0.9, 0.1, 0.1, 0.2], + "text": [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2] + } + }, + { + "id": 2, + "vector": { + "image": [0.2, 0.1, 0.3, 0.9], + "text": [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9] + } + } + ] +} +``` + +```typescript +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: { + image: [0.9, 0.1, 0.1, 0.2], + text: [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2], + }, + }, + { + id: 2, + vector: { + image: [0.2, 0.1, 0.3, 0.9], + text: [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9], + }, + }, + ], +}); +``` + +```rust +use std::collections::HashMap; + +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::Payload; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![ + PointStruct::new( + 1, + HashMap::from([ + ("image".to_string(), vec![0.9, 0.1, 0.1, 0.2]), + ( + "text".to_string(), + vec![0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2], + ), + ]), + Payload::default(), + ), + PointStruct::new( + 2, + HashMap::from([ + ("image".to_string(), vec![0.2, 0.1, 0.3, 0.9]), + ( + "text".to_string(), + vec![0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9], + ), + ]), + Payload::default(), + ), + ], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorFactory.vector; +import static io.qdrant.client.VectorsFactory.namedVectors; + +import io.qdrant.client.grpc.Points.PointStruct; + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors( + namedVectors( 
+ Map.of( + "image", + vector(List.of(0.9f, 0.1f, 0.1f, 0.2f)), + "text", + vector(List.of(0.4f, 0.7f, 0.1f, 0.8f, 0.1f, 0.1f, 0.9f, 0.2f))))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors( + namedVectors( + Map.of( + "image", + List.of(0.2f, 0.1f, 0.3f, 0.9f), + "text", + List.of(0.5f, 0.2f, 0.7f, 0.4f, 0.7f, 0.2f, 0.3f, 0.9f)))) + .build())) + .get(); +``` + +_Available as of v1.2.0_ + +Named vectors are optional. When uploading points, some vectors may be omitted. +For example, you can upload one point with only the `image` vector and a second +one with only the `text` vector. + +When uploading a point with an existing ID, the existing point is deleted first, +then it is inserted with just the specified vectors. In other words, the entire +point is replaced, and any unspecified vectors are set to null. To keep existing +vectors unchanged and only update specified vectors, see [update vectors](#update-vectors). + +_Available as of v1.7.0_ + +Points can contain dense and sparse vectors. + +A sparse vector is an array in which most of the elements have a value of zero. + +It is possible to take advantage of this property to have an optimized representation, for this reason they have a different shape than dense vectors. + +They are represented as a list of `(index, value)` pairs, where `index` is an integer and `value` is a floating point number. The `index` is the position of the non-zero value in the vector. The `values` is the value of the non-zero element. + +For example, the following vector: + +``` +[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0] +``` + +can be represented as a sparse vector: + +``` +[(6, 1.0), (7, 2.0)] +``` + +Qdrant uses the following JSON representation throughout its APIs. + +```json +{ + "indices": [6, 7], + "values": [1.0, 2.0] +} +``` + +The `indices` and `values` arrays must have the same length. +And the `indices` must be unique. + +If the `indices` are not sorted, Qdrant will sort them internally so you may not rely on the order of the elements. + +Sparse vectors must be named and can be uploaded in the same way as dense vectors. 
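+
+If your pipeline produces plain dense arrays, converting them into the `indices`/`values` form described above is straightforward; a minimal sketch with a hypothetical `to_sparse` helper (not part of any Qdrant client):
+
+```python
+def to_sparse(dense: list[float]) -> dict:
+    # Keep only the non-zero elements and remember their positions.
+    indices = [i for i, value in enumerate(dense) if value != 0.0]
+    return {"indices": indices, "values": [dense[i] for i in indices]}
+
+
+# [0.0, ..., 1.0, 2.0, 0.0, 0.0]  ->  {"indices": [6, 7], "values": [1.0, 2.0]}
+print(to_sparse([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0]))
+```
+
+The upload itself then looks like the examples below: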
+ +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + vector={ + "text": models.SparseVector( + indices=[6, 7], + values=[1.0, 2.0], + ) + }, + ), + models.PointStruct( + id=2, + vector={ + "text": models.SparseVector( + indices=[1, 2, 3, 4, 5], + values=[0.1, 0.2, 0.3, 0.4, 0.5], + ) + }, + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new Dictionary { ["text"] = ([1.0f, 2.0f], [6, 7]) } + }, + new() + { + Id = 2, + Vectors = new Dictionary + { + ["text"] = ([0.1f, 0.2f, 0.3f, 0.4f, 0.5f], [1, 2, 3, 4, 5]) + } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "text": qdrant.NewVectorSparse( + []uint32{6, 7}, + []float32{1.0, 2.0}), + }), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "text": qdrant.NewVectorSparse( + []uint32{1, 2, 3, 4, 5}, + []float32{0.1, 0.2, 0.3, 0.4, 0.5}), + }), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": { + "text": { + "indices": [6, 7], + "values": [1.0, 2.0] + } + } + }, + { + "id": 2, + "vector": { + "text": { + "indices": [1, 2, 4, 15, 33, 34], + "values": [0.1, 0.2, 0.3, 0.4, 0.5] + } + } + } + ] +} +``` + +```typescript +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: { + text: { + indices: [6, 7], + values: [1.0, 2.0], + }, + }, + }, + { + id: 2, + vector: { + text: { + indices: [1, 2, 3, 4, 5], + values: [0.1, 0.2, 0.3, 0.4, 0.5], + }, + }, + }, + ], +}); +``` + +```rust +use std::collections::HashMap; + +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder, Vector}; +use qdrant_client::Payload; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![ + PointStruct::new( + 1, + HashMap::from([("text".to_string(), vec![(6, 1.0), (7, 2.0)])]), + Payload::default(), + ), + PointStruct::new( + 2, + HashMap::from([( + "text".to_string(), + vec![(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4), (5, 0.5)], + )]), + Payload::default(), + ), + ], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorFactory.vector; + +import io.qdrant.client.grpc.Points.NamedVectors; +import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Points.Vectors; + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors( + Vectors.newBuilder() + .setVectors( + NamedVectors.newBuilder() + .putAllVectors( + Map.of( + "text", vector(List.of(1.0f, 2.0f), List.of(6, 7)))) + .build()) + .build()) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors( + Vectors.newBuilder() + .setVectors( + NamedVectors.newBuilder() + .putAllVectors( + Map.of( + "text", + vector( + List.of(0.1f, 0.2f, 0.3f, 0.4f, 0.5f), + List.of(1, 2, 3, 4, 5)))) + .build()) + .build()) + .build())) + .get(); +``` + +## 
Modify points + +To change a point, you can modify its vectors or its payload. There are several +ways to do this. + +### Update vectors + +_Available as of v1.2.0_ + +This method updates the specified vectors on the given points. Unspecified +vectors are kept unchanged. All given points must exist. + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/update-vectors)): + +```python +client.update_vectors( + collection_name="{collection_name}", + points=[ + models.PointVectors( + id=1, + vector={ + "image": [0.1, 0.2, 0.3, 0.4], + }, + ), + models.PointVectors( + id=2, + vector={ + "text": [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2], + }, + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpdateVectorsAsync( + collectionName: "{collection_name}", + points: new List + { + new() { Id = 1, Vectors = ("image", new float[] { 0.1f, 0.2f, 0.3f, 0.4f }) }, + new() + { + Id = 2, + Vectors = ("text", new float[] { 0.9f, 0.8f, 0.7f, 0.6f, 0.5f, 0.4f, 0.3f, 0.2f }) + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.UpdateVectors(context.Background(), &qdrant.UpdatePointVectors{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointVectors{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "image": qdrant.NewVector(0.1, 0.2, 0.3, 0.4), + }), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "text": qdrant.NewVector(0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2), + }), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points/vectors +{ + "points": [ + { + "id": 1, + "vector": { + "image": [0.1, 0.2, 0.3, 0.4] + } + }, + { + "id": 2, + "vector": { + "text": [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + } + } + ] +} +``` + +```typescript +client.updateVectors("{collection_name}", { + points: [ + { + id: 1, + vector: { + image: [0.1, 0.2, 0.3, 0.4], + }, + }, + { + id: 2, + vector: { + text: [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2], + }, + }, + ], +}); +``` + +```rust +use std::collections::HashMap; + +use qdrant_client::qdrant::{ + PointVectors, UpdatePointVectorsBuilder, +}; + +client + .update_vectors( + UpdatePointVectorsBuilder::new( + "{collection_name}", + vec![ + PointVectors { + id: Some(1.into()), + vectors: Some( + HashMap::from([("image".to_string(), vec![0.1, 0.2, 0.3, 0.4])]).into(), + ), + }, + PointVectors { + id: Some(2.into()), + vectors: Some( + HashMap::from([( + "text".to_string(), + vec![0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2], + )]) + .into(), + ), + }, + ], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorFactory.vector; +import static io.qdrant.client.VectorsFactory.namedVectors; + +client + .updateVectorsAsync( + "{collection_name}", + List.of( + PointVectors.newBuilder() + .setId(id(1)) + .setVectors(namedVectors(Map.of("image", vector(List.of(0.1f, 0.2f, 0.3f, 0.4f))))) + .build(), + PointVectors.newBuilder() + .setId(id(2)) + .setVectors( + namedVectors( + Map.of( + "text", vector(List.of(0.9f, 0.8f, 0.7f, 0.6f, 0.5f, 0.4f, 0.3f, 0.2f))))) + .build())) + .get(); +``` + +To update points and replace all of its vectors, see [uploading +points](#upload-points). 
+ +### Delete vectors + +_Available as of v1.2.0_ + +This method deletes just the specified vectors from the given points. Other +vectors are kept unchanged. Points are never deleted. + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/delete-vectors)): + +```python +client.delete_vectors( + collection_name="{collection_name}", + points=[0, 3, 100], + vectors=["text", "image"], +) +``` + +```csharp +await client.DeleteVectorsAsync("{collection_name}", ["text", "image"], [0, 3, 10]); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client.DeleteVectors(context.Background(), &qdrant.DeletePointVectors{ + CollectionName: "{collection_name}", + PointsSelector: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(10)), + Vectors: &qdrant.VectorsSelector{ + Names: []string{"text", "image"}, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/vectors/delete +{ + "points": [0, 3, 100], + "vectors": ["text", "image"] +} +``` + +```typescript +client.deleteVectors("{collection_name}", { + points: [0, 3, 10], + vector: ["text", "image"], +}); +``` + +```rust +use qdrant_client::qdrant::{ + DeletePointVectorsBuilder, PointsIdsList, +}; + +client + .delete_vectors( + DeletePointVectorsBuilder::new("{collection_name}") + .points_selector(PointsIdsList { + ids: vec![0.into(), 3.into(), 10.into()], + }) + .vectors(vec!["text".into(), "image".into()]) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; + +client + .deleteVectorsAsync( + "{collection_name}", List.of("text", "image"), List.of(id(0), id(3), id(10))) + .get(); +``` + +To delete entire points, see [deleting points](#delete-points). + +### Update payload + +Learn how to modify the payload of a point in the [Payload](/documentation/concepts/payload/#update-payload) section. + +## Delete points + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/delete-points)): + +```python +client.delete( + collection_name="{collection_name}", + points_selector=models.PointIdsList( + points=[0, 3, 100], + ), +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.DeleteAsync(collectionName: "{collection_name}", ids: [0, 3, 100]); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Delete(context.Background(), &qdrant.DeletePoints{ + CollectionName: "{collection_name}", + Points: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(100), + ), +}) +``` + +```http +POST /collections/{collection_name}/points/delete +{ + "points": [0, 3, 100] +} +``` + +```typescript +client.delete("{collection_name}", { + points: [0, 3, 100], +}); +``` + +```rust +use qdrant_client::qdrant::{DeletePointsBuilder, PointsIdsList}; + +client + .delete_points( + DeletePointsBuilder::new("{collection_name}") + .points(PointsIdsList { + ids: vec![0.into(), 3.into(), 100.into()], + }) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; + +client.deleteAsync("{collection_name}", List.of(id(0), id(3), id(100))); +``` + +Alternative way to specify which points to remove is to use filter. 
+ +```python +client.delete( + collection_name="{collection_name}", + points_selector=models.FilterSelector( + filter=models.Filter( + must=[ + models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), + ), + ], + ) + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.DeleteAsync(collectionName: "{collection_name}", filter: MatchKeyword("color", "red")); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Delete(context.Background(), &qdrant.DeletePoints{ + CollectionName: "{collection_name}", + Points: qdrant.NewPointsSelectorFilter( + &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, + ), +}) +``` + +```http +POST /collections/{collection_name}/points/delete +{ + "filter": { + "must": [ + { + "key": "color", + "match": { + "value": "red" + } + } + ] + } +} +``` + +```typescript +client.delete("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, DeletePointsBuilder, Filter}; + +client + .delete_points( + DeletePointsBuilder::new("{collection_name}") + .points(Filter::must([Condition::matches( + "color", + "red".to_string(), + )])) + .wait(true), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; + +client + .deleteAsync( + "{collection_name}", + Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) + .get(); +``` + +This example removes all points with `{ "color": "red" }` from the collection. + +## Retrieve points + +There is a method for retrieving points by their ids. + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/get-points)): + +```python +client.retrieve( + collection_name="{collection_name}", + ids=[0, 3, 100], +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.RetrieveAsync( + collectionName: "{collection_name}", + ids: [0, 30, 100], + withPayload: false, + withVectors: false +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Get(context.Background(), &qdrant.GetPoints{ + CollectionName: "{collection_name}", + Ids: []*qdrant.PointId{ + qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(100), + }, +}) +``` + +```http +POST /collections/{collection_name}/points +{ + "ids": [0, 3, 100] +} +``` + +```typescript +client.retrieve("{collection_name}", { + ids: [0, 3, 100], +}); +``` + +```rust +use qdrant_client::qdrant::GetPointsBuilder; + +client + .get_points(GetPointsBuilder::new( + "{collection_name}", + vec![0.into(), 30.into(), 100.into()], + )) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; + +client + .retrieveAsync("{collection_name}", List.of(id(0), id(30), id(100)), false, false, null) + .get(); +``` + +This method has additional parameters `with_vectors` and `with_payload`. +Using these parameters, you can select parts of the point you want as a result. +Excluding helps you not to waste traffic transmitting useless data. 
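+
+In the Python client these options map directly onto keyword arguments; a minimal sketch, assuming the `client` from the examples above:
+
+```python
+# Fetch only the payloads; skip the (potentially large) vectors
+# to avoid transmitting data you don't need.
+client.retrieve(
+    collection_name="{collection_name}",
+    ids=[0, 3, 100],
+    with_payload=True,
+    with_vectors=False,
+)
+```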
+ +The single point can also be retrieved via the API: + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/get-point)): + +```http +GET /collections/{collection_name}/points/{point_id} +``` + +## Scroll points + +Sometimes it might be necessary to get all stored points without knowing ids, or iterate over points that correspond to a filter. + +REST API ([Schema](https://api.qdrant.tech/master/api-reference/points/scroll-points)): + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.FieldCondition(key="color", match=models.MatchValue(value="red")), + ] + ), + limit=1, + with_payload=True, + with_vectors=False, +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("color", "red"), + limit: 1, + payloadSelector: true +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, + Limit: qdrant.PtrOf(uint32(1)), + WithPayload: qdrant.NewWithPayload(true), +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { + "key": "color", + "match": { + "value": "red" + } + } + ] + }, + "limit": 1, + "with_payload": true, + "with_vector": false +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, + limit: 1, + with_payload: true, + with_vector: false, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}") + .filter(Filter::must([Condition::matches( + "color", + "red".to_string(), + )])) + .limit(1) + .with_payload(true) + .with_vectors(false), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.WithPayloadSelectorFactory.enable; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) + .setLimit(1) + .setWithPayload(enable(true)) + .build()) + .get(); +``` + +Returns all point with `color` = `red`. + +```json +{ + "result": { + "next_page_offset": 1, + "points": [ + { + "id": 0, + "payload": { + "color": "red" + } + } + ] + }, + "status": "ok", + "time": 0.0001 +} +``` + +The Scroll API will return all points that match the filter in a page-by-page manner. + +All resulting points are sorted by ID. To query the next page it is necessary to specify the largest seen ID in the `offset` field. +For convenience, this ID is also returned in the field `next_page_offset`. +If the value of the `next_page_offset` field is `null` - the last page is reached. + +### Order points by payload key + +_Available as of v1.8.0_ + +When using the [`scroll`](#scroll-points) API, you can sort the results by payload key. 
For example, you can retrieve points in chronological order if your payloads have a `"timestamp"` field, as is shown from the example below: + + + +```python +client.scroll( + collection_name="{collection_name}", + limit=15, + order_by="timestamp", # <-- this! +) +``` + +```csharp +await client.ScrollAsync("{collection_name}", limit: 15, orderBy: "timestamp"); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Limit: qdrant.PtrOf(uint32(15)), + OrderBy: &qdrant.OrderBy{ + Key: "timestamp", + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "limit": 15, + "order_by": "timestamp", // <-- this! +} +``` + +```typescript +client.scroll("{collection_name}", { + limit: 15, + order_by: "timestamp", // <-- this! +}); +``` + +```rust +use qdrant_client::qdrant::{OrderByBuilder, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}") + .limit(15) + .order_by(OrderByBuilder::new("timestamp")), + ) + .await?; +``` + +```java +import io.qdrant.client.grpc.Points.OrderBy; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client.scrollAsync(ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setLimit(15) + .setOrderBy(OrderBy.newBuilder().setKey("timestamp").build()) + .build()).get(); +``` + +You need to use the `order_by` `key` parameter to specify the payload key. Then you can add other fields to control the ordering, such as `direction` and `start_from`: + +```python +order_by=models.OrderBy( + key="timestamp", + direction="desc", # default is "asc" + start_from=123, # start from this value +) +``` + +```csharp +using Qdrant.Client.Grpc; + +new OrderBy +{ + Key = "timestamp", + Direction = Direction.Desc, + StartFrom = 123 +}; +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.OrderBy{ + Key: "timestamp", + Direction: qdrant.Direction_Desc.Enum(), + StartFrom: qdrant.NewStartFromInt(123), +} +``` + +```http +"order_by": { + "key": "timestamp", + "direction": "desc" // default is "asc" + "start_from": 123, // start from this value +} +``` + +```typescript +order_by: { + key: "timestamp", + direction: "desc", // default is "asc" + start_from: 123, // start from this value +} +``` + +```rust +use qdrant_client::qdrant::{start_from::Value, Direction, OrderByBuilder}; + +OrderByBuilder::new("timestamp") + .direction(Direction::Desc.into()) + .start_from(Value::Integer(123)) + .build(); +``` + +```java +import io.qdrant.client.grpc.Points.Direction; +import io.qdrant.client.grpc.Points.OrderBy; +import io.qdrant.client.grpc.Points.StartFrom; + +OrderBy.newBuilder() + .setKey("timestamp") + .setDirection(Direction.Desc) + .setStartFrom(StartFrom.newBuilder() + .setInteger(123) + .build()) + .build(); +``` + + + +When sorting is based on a non-unique value, it is not possible to rely on an ID offset. Thus, next_page_offset is not returned within the response. However, you can still do pagination by combining `"order_by": { "start_from": ... }` with a `{ "must_not": [{ "has_id": [...] }] }` filter. + +## Counting points + +_Available as of v0.8.4_ + +Sometimes it can be useful to know how many points fit the filter conditions without doing a real search. 
+ +Among others, for example, we can highlight the following scenarios: + +- Evaluation of results size for faceted search +- Determining the number of pages for pagination +- Debugging the query execution speed + +REST API ([Schema](https://api.qdrant.tech/master/api-reference/points/count-points)): + +```python +client.count( + collection_name="{collection_name}", + count_filter=models.Filter( + must=[ + models.FieldCondition(key="color", match=models.MatchValue(value="red")), + ] + ), + exact=True, +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.CountAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("color", "red"), + exact: true +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Count(context.Background(), &qdrant.CountPoints{ + CollectionName: "midlib", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/count +{ + "filter": { + "must": [ + { + "key": "color", + "match": { + "value": "red" + } + } + ] + }, + "exact": true +} +``` + +```typescript +client.count("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, + exact: true, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, CountPointsBuilder, Filter}; + +client + .count( + CountPointsBuilder::new("{collection_name}") + .filter(Filter::must([Condition::matches( + "color", + "red".to_string(), + )])) + .exact(true), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; + +client + .countAsync( + "{collection_name}", + Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), + true) + .get(); +``` + +Returns number of counts matching given filtering conditions: + +```json +{ + "count": 3811 +} +``` + +## Batch update + +_Available as of v1.5.0_ + +You can batch multiple point update operations. This includes inserting, +updating and deleting points, vectors and payload. + +A batch update request consists of a list of operations. These are executed in +order. These operations can be batched: + +- [Upsert points](#upload-points): `upsert` or `UpsertOperation` +- [Delete points](#delete-points): `delete_points` or `DeleteOperation` +- [Update vectors](#update-vectors): `update_vectors` or `UpdateVectorsOperation` +- [Delete vectors](#delete-vectors): `delete_vectors` or `DeleteVectorsOperation` +- [Set payload](/documentation/concepts/payload/#set-payload): `set_payload` or `SetPayloadOperation` +- [Overwrite payload](/documentation/concepts/payload/#overwrite-payload): `overwrite_payload` or `OverwritePayload` +- [Delete payload](/documentation/concepts/payload/#delete-payload-keys): `delete_payload` or `DeletePayloadOperation` +- [Clear payload](/documentation/concepts/payload/#clear-payload): `clear_payload` or `ClearPayloadOperation` + +The following example snippet makes use of all operations. 
+ +REST API ([Schema](https://api.qdrant.tech/master/api-reference/points/batch-update)): + +```python +client.batch_update_points( + collection_name="{collection_name}", + update_operations=[ + models.UpsertOperation( + upsert=models.PointsList( + points=[ + models.PointStruct( + id=1, + vector=[1.0, 2.0, 3.0, 4.0], + payload={}, + ), + ] + ) + ), + models.UpdateVectorsOperation( + update_vectors=models.UpdateVectors( + points=[ + models.PointVectors( + id=1, + vector=[1.0, 2.0, 3.0, 4.0], + ) + ] + ) + ), + models.DeleteVectorsOperation( + delete_vectors=models.DeleteVectors(points=[1], vector=[""]) + ), + models.OverwritePayloadOperation( + overwrite_payload=models.SetPayload( + payload={"test_payload": 1}, + points=[1], + ) + ), + models.SetPayloadOperation( + set_payload=models.SetPayload( + payload={ + "test_payload_2": 2, + "test_payload_3": 3, + }, + points=[1], + ) + ), + models.DeletePayloadOperation( + delete_payload=models.DeletePayload(keys=["test_payload_2"], points=[1]) + ), + models.ClearPayloadOperation(clear_payload=models.PointIdsList(points=[1])), + models.DeleteOperation(delete=models.PointIdsList(points=[1])), + ], +) +``` + +```http +POST /collections/{collection_name}/points/batch +{ + "operations": [ + { + "upsert": { + "points": [ + { + "id": 1, + "vector": [1.0, 2.0, 3.0, 4.0], + "payload": {} + } + ] + } + }, + { + "update_vectors": { + "points": [ + { + "id": 1, + "vector": [1.0, 2.0, 3.0, 4.0] + } + ] + } + }, + { + "delete_vectors": { + "points": [1], + "vector": [""] + } + }, + { + "overwrite_payload": { + "payload": { + "test_payload": "1" + }, + "points": [1] + } + }, + { + "set_payload": { + "payload": { + "test_payload_2": "2", + "test_payload_3": "3" + }, + "points": [1] + } + }, + { + "delete_payload": { + "keys": ["test_payload_2"], + "points": [1] + } + }, + { + "clear_payload": { + "points": [1] + } + }, + {"delete": {"points": [1]}} + ] +} +``` + +```typescript +client.batchUpdate("{collection_name}", { + operations: [ + { + upsert: { + points: [ + { + id: 1, + vector: [1.0, 2.0, 3.0, 4.0], + payload: {}, + }, + ], + }, + }, + { + update_vectors: { + points: [ + { + id: 1, + vector: [1.0, 2.0, 3.0, 4.0], + }, + ], + }, + }, + { + delete_vectors: { + points: [1], + vector: [""], + }, + }, + { + overwrite_payload: { + payload: { + test_payload: 1, + }, + points: [1], + }, + }, + { + set_payload: { + payload: { + test_payload_2: 2, + test_payload_3: 3, + }, + points: [1], + }, + }, + { + delete_payload: { + keys: ["test_payload_2"], + points: [1], + }, + }, + { + clear_payload: { + points: [1], + }, + }, + { + delete: { + points: [1], + }, + }, + ], +}); +``` + +```rust +use std::collections::HashMap; + +use qdrant_client::qdrant::{ + points_update_operation::{ + ClearPayload, DeletePayload, DeletePoints, DeleteVectors, Operation, OverwritePayload, + PointStructList, SetPayload, UpdateVectors, + }, + PointStruct, PointVectors, PointsUpdateOperation, UpdateBatchPointsBuilder, VectorsSelector, +}; +use qdrant_client::Payload; + +client + .update_points_batch( + UpdateBatchPointsBuilder::new( + "{collection_name}", + vec![ + PointsUpdateOperation { + operation: Some(Operation::Upsert(PointStructList { + points: vec![PointStruct::new( + 1, + vec![1.0, 2.0, 3.0, 4.0], + Payload::default(), + )], + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::UpdateVectors(UpdateVectors { + points: vec![PointVectors { + id: Some(1.into()), + vectors: Some(vec![1.0, 2.0, 3.0, 4.0].into()), + }], + ..Default::default() + })), + }, + 
PointsUpdateOperation { + operation: Some(Operation::DeleteVectors(DeleteVectors { + points_selector: Some(vec![1.into()].into()), + vectors: Some(VectorsSelector { + names: vec!["".into()], + }), + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::OverwritePayload(OverwritePayload { + points_selector: Some(vec![1.into()].into()), + payload: HashMap::from([("test_payload".to_string(), 1.into())]), + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::SetPayload(SetPayload { + points_selector: Some(vec![1.into()].into()), + payload: HashMap::from([ + ("test_payload_2".to_string(), 2.into()), + ("test_payload_3".to_string(), 3.into()), + ]), + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::DeletePayload(DeletePayload { + points_selector: Some(vec![1.into()].into()), + keys: vec!["test_payload_2".to_string()], + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::ClearPayload(ClearPayload { + points: Some(vec![1.into()].into()), + ..Default::default() + })), + }, + PointsUpdateOperation { + operation: Some(Operation::DeletePoints(DeletePoints { + points: Some(vec![1.into()].into()), + ..Default::default() + })), + }, + ], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; + +import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Points.PointVectors; +import io.qdrant.client.grpc.Points.PointsIdsList; +import io.qdrant.client.grpc.Points.PointsSelector; +import io.qdrant.client.grpc.Points.PointsUpdateOperation; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.ClearPayload; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeletePayload; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeletePoints; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeleteVectors; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.PointStructList; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.SetPayload; +import io.qdrant.client.grpc.Points.PointsUpdateOperation.UpdateVectors; +import io.qdrant.client.grpc.Points.VectorsSelector; + +client + .batchUpdateAsync( + "{collection_name}", + List.of( + PointsUpdateOperation.newBuilder() + .setUpsert( + PointStructList.newBuilder() + .addPoints( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(1.0f, 2.0f, 3.0f, 4.0f)) + .build()) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setUpdateVectors( + UpdateVectors.newBuilder() + .addPoints( + PointVectors.newBuilder() + .setId(id(1)) + .setVectors(vectors(1.0f, 2.0f, 3.0f, 4.0f)) + .build()) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setDeleteVectors( + DeleteVectors.newBuilder() + .setPointsSelector( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .setVectors(VectorsSelector.newBuilder().addNames("").build()) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setOverwritePayload( + SetPayload.newBuilder() + .setPointsSelector( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .putAllPayload(Map.of("test_payload", value(1))) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setSetPayload( + 
SetPayload.newBuilder() + .setPointsSelector( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .putAllPayload( + Map.of("test_payload_2", value(2), "test_payload_3", value(3))) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setDeletePayload( + DeletePayload.newBuilder() + .setPointsSelector( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .addKeys("test_payload_2") + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setClearPayload( + ClearPayload.newBuilder() + .setPoints( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .build()) + .build(), + PointsUpdateOperation.newBuilder() + .setDeletePoints( + DeletePoints.newBuilder() + .setPoints( + PointsSelector.newBuilder() + .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) + .build()) + .build()) + .build())) + .get(); +``` + +To batch many points with a single operation type, please use batching +functionality in that operation directly. + + +## Awaiting result + +If the API is called with the `&wait=false` parameter, or if it is not explicitly specified, the client will receive an acknowledgment of receiving data: + +```json +{ + "result": { + "operation_id": 123, + "status": "acknowledged" + }, + "status": "ok", + "time": 0.000206061 +} +``` + +This response does not mean that the data is available for retrieval yet. This +uses a form of eventual consistency. It may take a short amount of time before it +is actually processed as updating the collection happens in the background. In +fact, it is possible that such request eventually fails. +If inserting a lot of vectors, we also recommend using asynchronous requests to take advantage of pipelining. + +If the logic of your application requires a guarantee that the vector will be available for searching immediately after the API responds, then use the flag `?wait=true`. +In this case, the API will return the result only after the operation is finished: + +```json +{ + "result": { + "operation_id": 0, + "status": "completed" + }, + "status": "ok", + "time": 0.000206061 +} +``` + +<|page-4-lllmstxt|> +# Vectors + +Vectors (or embeddings) are the core concept of the Qdrant Vector Search engine. +Vectors define the similarity between objects in the vector space. + +If a pair of vectors are similar in vector space, it means that the objects they represent are similar in some way. + +For example, if you have a collection of images, you can represent each image as a vector. +If two images are similar, their vectors will be close to each other in the vector space. + +In order to obtain a vector representation of an object, you need to apply a vectorization algorithm to the object. +Usually, this algorithm is a neural network that converts the object into a fixed-size vector. + +The neural network is usually [trained](/articles/metric-learning-tips/) on a pairs or [triplets](/articles/triplet-loss/) of similar and dissimilar objects, so it learns to recognize a specific type of similarity. + +By using this property of vectors, you can explore your data in a number of ways; e.g. by searching for similar objects, clustering objects, and more. + + +## Vector Types + +Modern neural networks can output vectors in different shapes and sizes, and Qdrant supports most of them. +Let's take a look at the most common types of vectors supported by Qdrant. 
+ + +### Dense Vectors + +This is the most common type of vector. It is a simple list of numbers, it has a fixed length and each element of the list is a floating-point number. + +It looks like this: + +```json + +// A piece of a real-world dense vector +[ + -0.013052909, + 0.020387933, + -0.007869, + -0.11111383, + -0.030188112, + -0.0053388323, + 0.0010654867, + 0.072027855, + -0.04167721, + 0.014839341, + -0.032948174, + -0.062975034, + -0.024837125, + .... +] +``` + +The majority of neural networks create dense vectors, so you can use them with Qdrant without any additional processing. +Although compatible with most embedding models out there, Qdrant has been tested with the following [verified embedding providers](/documentation/embeddings/). + +### Sparse Vectors + +Sparse vectors are a special type of vectors. +Mathematically, they are the same as dense vectors, but they contain many zeros so they are stored in a special format. + +Sparse vectors in Qdrant don't have a fixed length, as it is dynamically allocated during vector insertion. +The amount of non-zero values in sparse vectors is currently limited to u32 datatype range (4294967295). + +In order to define a sparse vector, you need to provide a list of non-zero elements and their indexes. + +```json +// A sparse vector with 4 non-zero elements +{ + "indexes": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4] +} +``` + +Sparse vectors in Qdrant are kept in special storage and indexed in a separate index, so their configuration is different from dense vectors. + +To create a collection with sparse vectors: + + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config={}, + sparse_vectors_config={ + "text": models.SparseVectorParams(), + }, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + sparseVectorsConfig: ("text", new SparseVectorParams()) +); +``` + +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "sparse_vectors": { + "text": { } + } + }' +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text": {}, + }), +}) +``` + +```http +PUT /collections/{collection_name} +{ + "sparse_vectors": { + "text": { } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + sparse_vectors: { + text: { }, + }, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, +}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); + +sparse_vector_config.add_named_vector_params("text", SparseVectorParamsBuilder::default()); + +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + 
.sparse_vectors_config(sparse_vector_config), + ) + .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setSparseVectorsConfig( + SparseVectorConfig.newBuilder() + .putMap("text", SparseVectorParams.getDefaultInstance())) + .build()) + .get(); +``` + +Insert a point with a sparse vector into the created collection: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={}, # Add any additional payload if necessary + vector={ + "text": models.SparseVector( + indices=[1, 3, 5, 7], + values=[0.1, 0.2, 0.3, 0.4] + ) + }, + ) + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List < PointStruct > { + new() { + Id = 1, + Vectors = new Dictionary { + ["text"] = ([0.1f, 0.2f, 0.3f, 0.4f], [1, 3, 5, 7]) + } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMap( + map[string]*qdrant.Vector{ + "text": qdrant.NewVectorSparse( + []uint32{1, 3, 5, 7}, + []float32{0.1, 0.2, 0.3, 0.4}), + }), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": { + "text": { + "indices": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4] + } + } + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: { + text: { + indices: [1, 3, 5, 7], + values: [0.1, 0.2, 0.3, 0.4] + }, + }, + } + ] +}); +``` + +```rust +use qdrant_client::qdrant::{NamedVectors, PointStruct, UpsertPointsBuilder, Vector}; + +use qdrant_client::{Payload, Qdrant}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let points = vec![PointStruct::new( + 1, + NamedVectors::default().add_vector( + "text", + Vector::new_sparse(vec![1, 3, 5, 7], vec![0.1, 0.2, 0.3, 0.4]), + ), + Payload::new(), +)]; + +client + .upsert_points(UpsertPointsBuilder::new("{collection_name}", points)) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorFactory.vector; +import static io.qdrant.client.VectorsFactory.namedVectors; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .upsertAsync( + "{collection_name}", + List.of( + 
PointStruct.newBuilder() + .setId(id(1)) + .setVectors( + namedVectors(Map.of( + "text", vector(List.of(1.0f, 2.0f), List.of(6, 7)))) + ) + .build())) + .get(); +``` + +Now you can run a search with sparse vectors: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + + +result = client.query_points( + collection_name="{collection_name}", + query=models.SparseVector(indices=[1, 3, 5, 7], values=[0.1, 0.2, 0.3, 0.4]), + using="text", +).points +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new (float, uint)[] {(0.1f, 1), (0.2f, 3), (0.3f, 5), (0.4f, 7)}, + usingVector: "text", + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuerySparse( + []uint32{1, 3, 5, 7}, + []float32{0.1, 0.2, 0.3, 0.4}), + Using: qdrant.PtrOf("text"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "indices": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4] + }, + "using": "text" +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: { + indices: [1, 3, 5, 7], + values: [0.1, 0.2, 0.3, 0.4] + }, + using: "text", + limit: 3, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![(1, 0.2), (3, 0.1), (5, 0.9), (7, 0.7)]) + .limit(10) + .using("text"), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setUsing("text") + .setQuery(nearest(List.of(0.1f, 0.2f, 0.3f, 0.4f), List.of(1, 3, 5, 7))) + .setLimit(3) + .build()) + .get(); +``` + +### Multivectors + +**Available as of v1.10.0** + +Qdrant supports the storing of a variable amount of same-shaped dense vectors in a single point. +This means that instead of a single dense vector, you can upload a matrix of dense vectors. + +The length of the matrix is fixed, but the number of vectors in the matrix can be different for each point. + +Multivectors look like this: + +```json +// A multivector of size 4 +"vector": [ + [-0.013, 0.020, -0.007, -0.111], + [-0.030, -0.055, 0.001, 0.072], + [-0.041, 0.014, -0.032, -0.062], + .... +] + +``` + +There are two scenarios where multivectors are useful: + +* **Multiple representation of the same object** - For example, you can store multiple embeddings for pictures of the same object, taken from different angles. This approach assumes that the payload is same for all vectors. +* **Late interaction embeddings** - Some text embedding models can output multiple vectors for a single text. 
+For example, a family of models such as ColBERT output a relatively small vector for each token in the text. + +In order to use multivectors, we need to specify a function that will be used to compare between matrices of vectors + +Currently, Qdrant supports `max_sim` function, which is defined as a sum of maximum similarities between each pair of vectors in the matrices. + +$$ +score = \sum_{i=1}^{N} \max_{j=1}^{M} \text{Sim}(\text{vectorA}_i, \text{vectorB}_j) +$$ + +Where $N$ is the number of vectors in the first matrix, $M$ is the number of vectors in the second matrix, and $\text{Sim}$ is a similarity function, for example, cosine similarity. + +To use multivectors, create a collection with the following configuration: + +```python + +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { + Size = 128, + Distance = Distance.Cosine, + MultivectorConfig = new() { + Comparator = MultiVectorComparator.MaxSim + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 128, + Distance: qdrant.Distance_Cosine, + MultivectorConfig: &qdrant.MultiVectorConfig{ + Comparator: qdrant.MultiVectorComparator_MaxSim, + }, + }), +}) +``` + +```http +PUT collections/{collection_name} +{ + "vectors": { + "size": 128, + "distance": "Cosine", + "multivector_config": { + "comparator": "max_sim" + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + vectors: { + size: 128, + distance: "Cosine", + multivector_config: { + comparator: "max_sim" + } + }, +}); +``` + +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, VectorParamsBuilder, + MultiVectorComparator, MultiVectorConfigBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config( + VectorParamsBuilder::new(100, Distance::Cosine) + .multivector_config( + MultiVectorConfigBuilder::new(MultiVectorComparator::MaxSim) + ), + ), + ) + .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.MultiVectorComparator; +import io.qdrant.client.grpc.Collections.MultiVectorConfig; +import io.qdrant.client.grpc.Collections.VectorParams; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.createCollectionAsync("{collection_name}", + VectorParams.newBuilder().setSize(128) + .setDistance(Distance.Cosine) 
+ .setMultivectorConfig(MultiVectorConfig.newBuilder() + .setComparator(MultiVectorComparator.MaxSim) + .build()) + .build()).get(); +``` + +To insert a point with multivector: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + vector=[ + [-0.013, 0.020, -0.007, -0.111], + [-0.030, -0.055, 0.001, 0.072], + [-0.041, 0.014, -0.032, -0.062] + ], + ) + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List { + new() { + Id = 1, + Vectors = new float[][] { + [-0.013f, 0.020f, -0.007f, -0.111f], + [-0.030f, -0.05f, 0.001f, 0.072f], + [-0.041f, 0.014f, -0.032f, -0.062f ], + }, + }, + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMulti( + [][]float32{ + {-0.013, 0.020, -0.007, -0.111}, + {-0.030, -0.055, 0.001, 0.072}, + {-0.041, 0.014, -0.032, -0.062}}), + }, + }, +}) +``` + +```http +PUT collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": [ + [-0.013, 0.020, -0.007, -0.111, ...], + [-0.030, -0.055, 0.001, 0.072, ...], + [-0.041, 0.014, -0.032, -0.062, ...] + ] + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: [ + [-0.013, 0.020, -0.007, -0.111, ...], + [-0.030, -0.055, 0.001, 0.072, ...], + [-0.041, 0.014, -0.032, -0.062, ...] 
+ ], + } + ] +}); +``` + +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder, Vector}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let points = vec![ + PointStruct::new( + 1, + Vector::new_multi(vec![ + vec![-0.013, 0.020, -0.007, -0.111], + vec![-0.030, -0.055, 0.001, 0.072], + vec![-0.041, 0.014, -0.032, -0.062], + ]), + Payload::new() + ) +]; + +client + .upsert_points( + UpsertPointsBuilder::new("{collection_name}", points) + ).await?; + +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorsFactory.vectors; +import static io.qdrant.client.VectorFactory.multiVector; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client +.upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(multiVector(new float[][] { + {-0.013f, 0.020f, -0.007f, -0.111f}, + {-0.030f, -0.055f, 0.001f, 0.072f}, + {-0.041f, 0.014f, -0.032f, -0.062f} + }))) + .build() + )) +.get(); +``` + +To search with multivector (available in `query` API): + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[ + [-0.013, 0.020, -0.007, -0.111], + [-0.030, -0.055, 0.001, 0.072], + [-0.041, 0.014, -0.032, -0.062] + ], +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[][] { + [-0.013f, 0.020f, -0.007f, -0.111f], + [-0.030f, -0.055f, 0.001 , 0.072f], + [-0.041f, 0.014f, -0.032f, -0.062f], + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryMulti( + [][]float32{ + {-0.013, 0.020, -0.007, -0.111}, + {-0.030, -0.055, 0.001, 0.072}, + {-0.041, 0.014, -0.032, -0.062}, + }), +}) +``` + +```http +POST collections/{collection_name}/points/query +{ + "query": [ + [-0.013, 0.020, -0.007, -0.111, ...], + [-0.030, -0.055, 0.001, 0.072, ...], + [-0.041, 0.014, -0.032, -0.062, ...] 
+ ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + "query": [ + [-0.013, 0.020, -0.007, -0.111], + [-0.030, -0.055, 0.001, 0.072], + [-0.041, 0.014, -0.032, -0.062] + ] +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{ QueryPointsBuilder, VectorInput }; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let res = client.query( + QueryPointsBuilder::new("{collection_name}") + .query(VectorInput::new_multi( + vec![ + vec![-0.013, 0.020, -0.007, -0.111], + vec![-0.030, -0.055, 0.001, 0.072], + vec![-0.041, 0.014, -0.032, -0.062], + ] + )) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(new float[][] { + {-0.013f, 0.020f, -0.007f, -0.111f}, + {-0.030f, -0.055f, 0.001f, 0.072f}, + {-0.041f, 0.014f, -0.032f, -0.062f} + })) + .build()).get(); +``` + + +## Named Vectors + +In Qdrant, you can store multiple vectors of different sizes and [types](#vector-types) in the same data [point](/documentation/concepts/points/). This is useful when you need to define your data with multiple embeddings to represent different features or modalities (e.g., image, text or video). + +To store different vectors for each point, you need to create separate named vector spaces in the [collection](/documentation/concepts/collections/). You can define these vector spaces during collection creation and manage them independently. 
+ + + +To create a collection with named vectors, you need to specify a configuration for each vector: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config={ + "image": models.VectorParams(size=4, distance=models.Distance.DOT), + "text": models.VectorParams(size=5, distance=models.Distance.COSINE), + }, + sparse_vectors_config={"text-sparse": models.SparseVectorParams()}, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParamsMap + { + Map = { + ["image"] = new VectorParams { + Size = 4, Distance = Distance.Dot + }, + ["text"] = new VectorParams { + Size = 5, Distance = Distance.Cosine + }, + } + }, + sparseVectorsConfig: new SparseVectorConfig + { + Map = { + ["text-sparse"] = new() + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfigMap( + map[string]*qdrant.VectorParams{ + "image": { + Size: 4, + Distance: qdrant.Distance_Dot, + }, + "text": { + Size: 5, + Distance: qdrant.Distance_Cosine, + }, + }), + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text-sparse": {}, + }, + ), +}) +``` + +```http +PUT /collections/{collection_name} +{ + "vectors": { + "image": { + "size": 4, + "distance": "Dot" + }, + "text": { + "size": 5, + "distance": "Cosine" + } + }, + "sparse_vectors": { + "text-sparse": {} + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + vectors: { + image: { size: 4, distance: "Dot" }, + text: { size: 5, distance: "Cosine" }, + }, + sparse_vectors: { + text_sparse: {} + } +}); +``` + +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, + VectorParamsBuilder, VectorsConfigBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let mut vector_config = VectorsConfigBuilder::default(); +vector_config.add_named_vector_params("text", VectorParamsBuilder::new(5, Distance::Dot)); +vector_config.add_named_vector_params("image", VectorParamsBuilder::new(4, Distance::Cosine)); + +let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); +sparse_vectors_config + .add_named_vector_params("text-sparse", SparseVectorParamsBuilder::default()); + +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(vector_config) + .sparse_vectors_config(sparse_vectors_config), + ) + .await?; +``` + +```java +import java.util.Map; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.VectorParams; +import 
io.qdrant.client.grpc.Collections.VectorParamsMap; +import io.qdrant.client.grpc.Collections.VectorsConfig; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig(VectorsConfig.newBuilder().setParamsMap( + VectorParamsMap.newBuilder().putAllMap(Map.of("image", + VectorParams.newBuilder() + .setSize(4) + .setDistance(Distance.Dot) + .build(), + "text", + VectorParams.newBuilder() + .setSize(5) + .setDistance(Distance.Cosine) + .build())))) + .setSparseVectorsConfig(SparseVectorConfig.newBuilder().putMap( + "text-sparse", SparseVectorParams.getDefaultInstance())) + .build()) + .get(); +``` + +To insert a point with named vectors: + +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + vector={ + "image": [0.9, 0.1, 0.1, 0.2], + "text": [0.4, 0.7, 0.1, 0.8, 0.1], + "text-sparse": { + "indices": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4], + }, + }, + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new Dictionary + { + ["image"] = new() { + Data = {0.9f, 0.1f, 0.1f, 0.2f} + }, + ["text"] = new() { + Data = {0.4f, 0.7f, 0.1f, 0.8f, 0.1f} + }, + ["text-sparse"] = ([0.1f, 0.2f, 0.3f, 0.4f], [1, 3, 5, 7]), + } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ + "image": qdrant.NewVector(0.9, 0.1, 0.1, 0.2), + "text": qdrant.NewVector(0.4, 0.7, 0.1, 0.8, 0.1), + "text-sparse": qdrant.NewVectorSparse( + []uint32{1, 3, 5, 7}, + []float32{0.1, 0.2, 0.3, 0.4}), + }), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points?wait=true +{ + "points": [ + { + "id": 1, + "vector": { + "image": [0.9, 0.1, 0.1, 0.2], + "text": [0.4, 0.7, 0.1, 0.8, 0.1], + "text-sparse": { + "indices": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4] + } + } + } + ] +} +``` + +```typescript +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: { + image: [0.9, 0.1, 0.1, 0.2], + text: [0.4, 0.7, 0.1, 0.8, 0.1], + text_sparse: { + indices: [1, 3, 5, 7], + values: [0.1, 0.2, 0.3, 0.4] + } + }, + }, + ], +}); +``` + +```rust + +use qdrant_client::qdrant::{ + NamedVectors, PointStruct, UpsertPointsBuilder, Vector, +}; +use qdrant_client::Payload; + +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![PointStruct::new( + 1, + NamedVectors::default() + .add_vector("text", Vector::new_dense(vec![0.4, 0.7, 0.1, 0.8, 0.1])) + .add_vector("image", Vector::new_dense(vec![0.9, 0.1, 0.1, 0.2])) + .add_vector( + "text-sparse", + Vector::new_sparse(vec![1, 3, 5, 7], vec![0.1, 0.2, 0.3, 0.4]), + ), + Payload::default(), + )], + ) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.VectorFactory.vector; +import static io.qdrant.client.VectorsFactory.namedVectors; + +import io.qdrant.client.grpc.Points.PointStruct; + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + 
.setId(id(1)) + .setVectors( + namedVectors( + Map.of( + "image", + vector(List.of(0.9f, 0.1f, 0.1f, 0.2f)), + "text", + vector(List.of(0.4f, 0.7f, 0.1f, 0.8f, 0.1f)), + "text-sparse", + vector(List.of(0.1f, 0.2f, 0.3f, 0.4f), List.of(1, 3, 5, 7))))) + .build())) + .get(); +``` + +To search with named vectors (available in `query` API): + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + using="image", + limit=3, +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + usingVector: "image", + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Using: qdrant.PtrOf("image"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "using": "image", + "limit": 3 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + using: "image", + limit: 3, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .using("image"), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setUsing("image") + .setLimit(3) + .build()).get(); +``` + +## Datatypes + +Newest versions of embeddings models generate vectors with very large dimentionalities. +With OpenAI's `text-embedding-3-large` embedding model, the dimensionality can go up to 3072. + +The amount of memory required to store such vectors grows linearly with the dimensionality, +so it is important to choose the right datatype for the vectors. + +The choice between datatypes is a trade-off between memory consumption and precision of vectors. + +Qdrant supports a number of datatypes for both dense and sparse vectors: + +**Float32** + +This is the default datatype for vectors in Qdrant. It is a 32-bit (4 bytes) floating-point number. +The standard OpenAI embedding of 1536 dimensionality will require 6KB of memory to store in Float32. + +You don't need to specify the datatype for vectors in Qdrant, as it is set to Float32 by default. + +**Float16** + +This is a 16-bit (2 bytes) floating-point number. It is also known as half-precision float. 
+Intuitively, it looks like this: + +```text +float32 -> float16 delta (float32 - float16).abs + +0.79701585 -> 0.796875 delta 0.00014084578 +0.7850789 -> 0.78515625 delta 0.00007736683 +0.7775044 -> 0.77734375 delta 0.00016063452 +0.85776305 -> 0.85791016 delta 0.00014710426 +0.6616839 -> 0.6616211 delta 0.000062823296 +``` + +The main advantage of Float16 is that it requires half the memory of Float32, while having virtually no impact on the quality of vector search. + +To use Float16, you need to specify the datatype for vectors in the collection configuration: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=128, + distance=models.Distance.COSINE, + datatype=models.Datatype.FLOAT16 + ), + sparse_vectors_config={ + "text": models.SparseVectorParams( + index=models.SparseIndexParams(datatype=models.Datatype.FLOAT16) + ), + }, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { + Size = 128, + Distance = Distance.Cosine, + Datatype = Datatype.Float16 + }, + sparseVectorsConfig: ( + "text", + new SparseVectorParams { + Index = new SparseIndexConfig { + Datatype = Datatype.Float16 + } + } + ) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 128, + Distance: qdrant.Distance_Cosine, + Datatype: qdrant.Datatype_Float16.Enum(), + }), + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text": { + Index: &qdrant.SparseIndexConfig{ + Datatype: qdrant.Datatype_Float16.Enum(), + }, + }, + }), +}) +``` + +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 128, + "distance": "Cosine", + "datatype": "float16" // <-- For dense vectors + }, + "sparse_vectors": { + "text": { + "index": { + "datatype": "float16" // <-- And for sparse vectors + } + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + vectors: { + size: 128, + distance: "Cosine", + datatype: "float16" + }, + sparse_vectors: { + text: { + index: { + datatype: "float16" + } + } + } +}); +``` + +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Datatype, Distance, SparseIndexConfigBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, VectorParamsBuilder +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); +sparse_vector_config.add_named_vector_params( + "text", + SparseVectorParamsBuilder::default() + .index(SparseIndexConfigBuilder::default().datatype(Datatype::Float32)), +); + +let create_collection = CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(sparse_vector_config) + .vectors_config( + VectorParamsBuilder::new(128, Distance::Cosine).datatype(Datatype::Float16), + ); + 
+client.create_collection(create_collection).await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Datatype; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.SparseIndexConfig; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig(VectorsConfig.newBuilder() + .setParams(VectorParams.newBuilder() + .setSize(128) + .setDistance(Distance.Cosine) + .setDatatype(Datatype.Float16) + .build()) + .build()) + .setSparseVectorsConfig( + SparseVectorConfig.newBuilder() + .putMap("text", SparseVectorParams.newBuilder() + .setIndex(SparseIndexConfig.newBuilder() + .setDatatype(Datatype.Float16) + .build()) + .build())) + .build()) + .get(); +``` + +**Uint8** + +Another step towards memory optimization is to use the Uint8 datatype for vectors. +Unlike Float16, Uint8 is not a floating-point number, but an integer number in the range from 0 to 255. + +Not all embeddings models generate vectors in the range from 0 to 255, so you need to be careful when using Uint8 datatype. + +In order to convert a number from float range to Uint8 range, you need to apply a process called quantization. + +Some embedding providers may provide embeddings in a pre-quantized format. +One of the most notable examples is the [Cohere int8 & binary embeddings](https://cohere.com/blog/int8-binary-embeddings). + +For other embeddings, you will need to apply quantization yourself. 
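+
+For illustration, here is a minimal sketch of a client-side min-max quantization step. It assumes the embedding values fall into a known range (here `[-1.0, 1.0]`); the exact mapping depends on your embedding model and is not part of the Qdrant API.
+
+```python
+import numpy as np
+
+def quantize_to_uint8(vector: np.ndarray, lower: float = -1.0, upper: float = 1.0) -> np.ndarray:
+    """Map float values from the [lower, upper] range to integers in [0, 255]."""
+    clipped = np.clip(vector, lower, upper)
+    scaled = (clipped - lower) / (upper - lower) * 255.0
+    return np.round(scaled).astype(np.uint8)
+
+embedding = np.array([-0.013, 0.020, -0.007, -0.111], dtype=np.float32)
+print(quantize_to_uint8(embedding))  # [126 130 127 113]
+```
+
+Once the vectors are mapped to the 0-255 range, create the collection with the Uint8 datatype: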
+ + + + + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=128, distance=models.Distance.COSINE, datatype=models.Datatype.UINT8 + ), + sparse_vectors_config={ + "text": models.SparseVectorParams( + index=models.SparseIndexParams(datatype=models.Datatype.UINT8) + ), + }, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { + Size = 128, + Distance = Distance.Cosine, + Datatype = Datatype.Uint8 + }, + sparseVectorsConfig: ( + "text", + new SparseVectorParams { + Index = new SparseIndexConfig { + Datatype = Datatype.Uint8 + } + } + ) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 128, + Distance: qdrant.Distance_Cosine, + Datatype: qdrant.Datatype_Uint8.Enum(), + }), + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text": { + Index: &qdrant.SparseIndexConfig{ + Datatype: qdrant.Datatype_Uint8.Enum(), + }, + }, + }), +}) +``` + +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 128, + "distance": "Cosine", + "datatype": "uint8" // <-- For dense vectors + }, + "sparse_vectors": { + "text": { + "index": { + "datatype": "uint8" // <-- For sparse vectors + } + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + vectors: { + size: 128, + distance: "Cosine", + datatype: "uint8" + }, + sparse_vectors: { + text: { + index: { + datatype: "uint8" + } + } + } +}); +``` + +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Datatype, Distance, SparseIndexConfigBuilder, + SparseVectorParamsBuilder, SparseVectorsConfigBuilder, VectorParamsBuilder, +}; + +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); + +sparse_vector_config.add_named_vector_params( + "text", + SparseVectorParamsBuilder::default() + .index(SparseIndexConfigBuilder::default().datatype(Datatype::Uint8)), +); +let create_collection = CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(sparse_vector_config) + .vectors_config( + VectorParamsBuilder::new(128, Distance::Cosine) + .datatype(Datatype::Uint8) + ); + +client.create_collection(create_collection).await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Datatype; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.SparseIndexConfig; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.VectorParams; +import 
io.qdrant.client.grpc.Collections.VectorsConfig;
+
+QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build());
+
+client
+    .createCollectionAsync(
+        CreateCollection.newBuilder()
+            .setCollectionName("{collection_name}")
+            .setVectorsConfig(VectorsConfig.newBuilder()
+                .setParams(VectorParams.newBuilder()
+                    .setSize(128)
+                    .setDistance(Distance.Cosine)
+                    .setDatatype(Datatype.Uint8)
+                    .build())
+                .build())
+            .setSparseVectorsConfig(
+                SparseVectorConfig.newBuilder()
+                    .putMap("text", SparseVectorParams.newBuilder()
+                        .setIndex(SparseIndexConfig.newBuilder()
+                            .setDatatype(Datatype.Uint8)
+                            .build())
+                        .build()))
+            .build())
+    .get();
+```
+
+## Quantization
+
+Apart from changing the datatype of the original vectors, Qdrant can create quantized representations of vectors alongside the original ones.
+This quantized representation can be used to quickly select candidates for rescoring with the original vectors or even used directly for search.
+
+Quantization is applied in the background, during the optimization process.
+
+More information about the quantization process can be found in the [Quantization](/documentation/guides/quantization/) section.
+
+
+## Vector Storage
+
+Depending on the requirements of the application, Qdrant can use one of several data storage options.
+Keep in mind that you will have to trade off between search speed and the amount of RAM used.
+
+More information about the storage options can be found in the [Storage](/documentation/concepts/storage/#vector-storage) section.
+
+<|page-5-lllmstxt|>
+# Payload
+
+One of the significant features of Qdrant is the ability to store additional information along with vectors.
+This information is called `payload` in Qdrant terminology.
+
+Qdrant allows you to store any information that can be represented using JSON.
+
+Here is an example of a typical payload:
+
+```json
+{
+  "name": "jacket",
+  "colors": ["red", "blue"],
+  "count": 10,
+  "price": 11.99,
+  "locations": [
+    {
+      "lon": 52.5200,
+      "lat": 13.4050
+    }
+  ],
+  "reviews": [
+    {
+      "user": "alice",
+      "score": 4
+    },
+    {
+      "user": "bob",
+      "score": 5
+    }
+  ]
+}
+```
+
+## Payload types
+
+In addition to storing payloads, Qdrant also allows you to search based on certain kinds of values.
+This feature is implemented as additional filters during the search and will enable you to incorporate custom logic on top of semantic similarity.
+
+During filtering, Qdrant checks the conditions over those values that match the type of the filtering condition. If the stored value type does not fit the filtering condition, it is considered not satisfied.
+
+For example, you will get an empty output if you apply the [range condition](/documentation/concepts/filtering/#range) to string data.
+
+However, arrays (multiple values of the same type) are treated a little differently: when a filter is applied to an array, it succeeds if at least one of the values inside the array meets the condition.
+
+The filtering process is discussed in detail in the section [Filtering](/documentation/concepts/filtering/).
+
+Let's look at the data types that Qdrant supports for searching:
+
+### Integer
+
+`integer` - 64-bit integer in the range from `-9223372036854775808` to `9223372036854775807`.
+
+Example of single and multiple `integer` values:
+
+```json
+{
+  "count": 10,
+  "sizes": [35, 36, 38]
+}
+```
+
+### Float
+
+`float` - 64-bit floating point number.
+ +Example of single and multiple `float` values: + +```json +{ + "price": 11.99, + "ratings": [9.1, 9.2, 9.4] +} +``` + +### Bool + +Bool - binary value. Equals to `true` or `false`. + +Example of single and multiple `bool` values: + +```json +{ + "is_delivered": true, + "responses": [false, false, true, false] +} +``` + +### Keyword + +`keyword` - string value. + +Example of single and multiple `keyword` values: + +```json +{ + "name": "Alice", + "friends": [ + "bob", + "eva", + "jack" + ] +} +``` + +### Geo + +`geo` is used to represent geographical coordinates. + +Example of single and multiple `geo` values: + +```json +{ + "location": { + "lon": 52.5200, + "lat": 13.4050 + }, + "cities": [ + { + "lon": 51.5072, + "lat": 0.1276 + }, + { + "lon": 40.7128, + "lat": 74.0060 + } + ] +} +``` + +Coordinate should be described as an object containing two fields: `lon` - for longitude, and `lat` - for latitude. + +### Datetime + +*Available as of v1.8.0* + +`datetime` - date and time in [RFC 3339] format. + +See the following examples of single and multiple `datetime` values: + +```json +{ + "created_at": "2023-02-08T10:49:00Z", + "updated_at": [ + "2023-02-08T13:52:00Z", + "2023-02-21T21:23:00Z" + ] +} +``` + +The following formats are supported: + +- `"2023-02-08T10:49:00Z"` ([RFC 3339], UTC) +- `"2023-02-08T11:49:00+01:00"` ([RFC 3339], with timezone) +- `"2023-02-08T10:49:00"` (without timezone, UTC is assumed) +- `"2023-02-08T10:49"` (without timezone and seconds) +- `"2023-02-08"` (only date, midnight is assumed) + +Notes about the format: + +- `T` can be replaced with a space. +- The `T` and `Z` symbols are case-insensitive. +- UTC is always assumed when the timezone is not specified. +- Timezone can have the following formats: `±HH:MM`, `±HHMM`, `±HH`, or `Z`. +- Seconds can have up to 6 decimals, so the finest granularity for `datetime` is microseconds. + +[RFC 3339]: https://datatracker.ietf.org/doc/html/rfc3339#section-5.6 + +### UUID + +*Available as of v1.11.0* + +In addition to the basic `keyword` type, Qdrant supports `uuid` type for storing UUID values. +Functionally, it works the same as `keyword`, internally stores parsed UUID values. + +```json +{ + "uuid": "550e8400-e29b-41d4-a716-446655440000", + "uuids": [ + "550e8400-e29b-41d4-a716-446655440000", + "550e8400-e29b-41d4-a716-446655440001" + ] +} +``` + +String representation of UUID (e.g. `550e8400-e29b-41d4-a716-446655440000`) occupies 36 bytes. +But when numeric representation is used, it is only 128 bits (16 bytes). + +Usage of `uuid` index type is recommended in payload-heavy collections to save RAM and improve search performance. 
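+
+For example, a field that stores UUID strings can be declared with the `uuid` schema type when creating a payload index. This is a minimal sketch; the field name `user_id` is only an illustration, and the `uuid` schema requires server and client versions v1.11.0 or later. Payload indexes are covered in more detail in the [Payload indexing](#payload-indexing) section below.
+
+```python
+from qdrant_client import QdrantClient
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.create_payload_index(
+    collection_name="{collection_name}",
+    field_name="user_id",   # payload field that stores UUID strings
+    field_schema="uuid",    # uuid schema type, available as of v1.11.0
+)
+```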
+ + +## Create point with payload +REST API ([Schema](https://api.qdrant.tech/api-reference/points/upsert-points)) + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + vector=[0.05, 0.61, 0.76, 0.74], + payload={ + "city": "Berlin", + "price": 1.99, + }, + ), + models.PointStruct( + id=2, + vector=[0.19, 0.81, 0.75, 0.11], + payload={ + "city": ["Berlin", "London"], + "price": 1.99, + }, + ), + models.PointStruct( + id=3, + vector=[0.36, 0.55, 0.47, 0.94], + payload={ + "city": ["Berlin", "Moscow"], + "price": [1.99, 2.99], + }, + ), + ], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new PointStruct + { + Id = 1, + Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, + Payload = { ["city"] = "Berlin", ["price"] = 1.99 } + }, + new PointStruct + { + Id = 2, + Vectors = new[] { 0.19f, 0.81f, 0.75f, 0.11f }, + Payload = { ["city"] = new[] { "Berlin", "London" } } + }, + new PointStruct + { + Id = 3, + Vectors = new[] { 0.36f, 0.55f, 0.47f, 0.94f }, + Payload = + { + ["city"] = new[] { "Berlin", "Moscow" }, + ["price"] = new Value + { + ListValue = new ListValue { Values = { new Value[] { 1.99, 2.99 } } } + } + } + } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), + Payload: qdrant.NewValueMap(map[string]any{ + "city": "Berlin", "price": 1.99}), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectors(0.19, 0.81, 0.75, 0.11), + Payload: qdrant.NewValueMap(map[string]any{ + "city": []any{"Berlin", "London"}}), + }, + { + Id: qdrant.NewIDNum(3), + Vectors: qdrant.NewVectors(0.36, 0.55, 0.47, 0.94), + Payload: qdrant.NewValueMap(map[string]any{ + "city": []any{"Berlin", "London"}, + "price": []any{1.99, 2.99}}), + }, + }, +}) +``` + +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": [0.05, 0.61, 0.76, 0.74], + "payload": {"city": "Berlin", "price": 1.99} + }, + { + "id": 2, + "vector": [0.19, 0.81, 0.75, 0.11], + "payload": {"city": ["Berlin", "London"], "price": 1.99} + }, + { + "id": 3, + "vector": [0.36, 0.55, 0.47, 0.94], + "payload": {"city": ["Berlin", "Moscow"], "price": [1.99, 2.99]} + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: [0.05, 0.61, 0.76, 0.74], + payload: { + city: "Berlin", + price: 1.99, + }, + }, + { + id: 2, + vector: [0.19, 0.81, 0.75, 0.11], + payload: { + city: ["Berlin", "London"], + price: 1.99, + }, + }, + { + id: 3, + vector: [0.36, 0.55, 0.47, 0.94], + payload: { + city: ["Berlin", "Moscow"], + price: [1.99, 2.99], + }, + }, + ], +}); +``` + +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::{Payload, Qdrant, QdrantError}; +use serde_json::json; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let points = 
vec![ + PointStruct::new( + 1, + vec![0.05, 0.61, 0.76, 0.74], + Payload::try_from(json!({"city": "Berlin", "price": 1.99})).unwrap(), + ), + PointStruct::new( + 2, + vec![0.19, 0.81, 0.75, 0.11], + Payload::try_from(json!({"city": ["Berlin", "London"]})).unwrap(), + ), + PointStruct::new( + 3, + vec![0.36, 0.55, 0.47, 0.94], + Payload::try_from(json!({"city": ["Berlin", "Moscow"], "price": [1.99, 2.99]})) + .unwrap(), + ), +]; + +client + .upsert_points(UpsertPointsBuilder::new("{collection_name}", points).wait(true)) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) + .putAllPayload(Map.of("city", value("Berlin"), "price", value(1.99))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors(vectors(0.19f, 0.81f, 0.75f, 0.11f)) + .putAllPayload( + Map.of("city", list(List.of(value("Berlin"), value("London"))))) + .build(), + PointStruct.newBuilder() + .setId(id(3)) + .setVectors(vectors(0.36f, 0.55f, 0.47f, 0.94f)) + .putAllPayload( + Map.of( + "city", + list(List.of(value("Berlin"), value("London"))), + "price", + list(List.of(value(1.99), value(2.99))))) + .build())) + .get(); +``` + +## Update payload + +Updating payloads in Qdrant offers flexible methods to manage vector metadata. The **set payload** method updates specific fields while keeping others unchanged, while the **overwrite** method replaces the entire payload. Developers can also use **clear payload** to remove all metadata or delete fields to remove specific keys without affecting the rest. These options provide precise control for adapting to dynamic datasets. + +### Set payload + +Set only the given payload values on a point. 
+ +REST API ([Schema](https://api.qdrant.tech/api-reference/points/set-payload)): + +```python +client.set_payload( + collection_name="{collection_name}", + payload={ + "property1": "string", + "property2": "string", + }, + points=[0, 3, 10], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.SetPayloadAsync( + collectionName: "{collection_name}", + payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, + ids: new ulong[] { 0, 3, 10 } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.SetPayload(context.Background(), &qdrant.SetPayloadPoints{ + CollectionName: "{collection_name}", + Payload: qdrant.NewValueMap( + map[string]any{"property1": "string", "property2": "string"}), + PointsSelector: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), + qdrant.NewIDNum(3)), +}) +``` + +```http +POST /collections/{collection_name}/points/payload +{ + "payload": { + "property1": "string", + "property2": "string" + }, + "points": [ + 0, 3, 100 + ] +} +``` + +```typescript +client.setPayload("{collection_name}", { + payload: { + property1: "string", + property2: "string", + }, + points: [0, 3, 10], +}); +``` + +```rust +use qdrant_client::qdrant::{ + PointsIdsList, SetPayloadPointsBuilder, +}; +use qdrant_client::Payload,; +use serde_json::json; + +client + .set_payload( + SetPayloadPointsBuilder::new( + "{collection_name}", + Payload::try_from(json!({ + "property1": "string", + "property2": "string", + })) + .unwrap(), + ) + .points_selector(PointsIdsList { + ids: vec![0.into(), 3.into(), 10.into()], + }) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; +import java.util.Map; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; + +client + .setPayloadAsync( + "{collection_name}", + Map.of("property1", value("string"), "property2", value("string")), + List.of(id(0), id(3), id(10)), + true, + null, + null) + .get(); +``` + +You don't need to know the ids of the points you want to modify. The alternative +is to use filters. 
+ +```python +client.set_payload( + collection_name="{collection_name}", + payload={ + "property1": "string", + "property2": "string", + }, + points=models.Filter( + must=[ + models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.SetPayloadAsync( + collectionName: "{collection_name}", + payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, + filter: MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.SetPayload(context.Background(), &qdrant.SetPayloadPoints{ + CollectionName: "{collection_name}", + Payload: qdrant.NewValueMap( + map[string]any{"property1": "string", "property2": "string"}), + PointsSelector: qdrant.NewPointsSelectorFilter(&qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }), +}) +``` + +```http +POST /collections/{collection_name}/points/payload +{ + "payload": { + "property1": "string", + "property2": "string" + }, + "filter": { + "must": [ + { + "key": "color", + "match": { + "value": "red" + } + } + ] + } +} +``` + +```typescript +client.setPayload("{collection_name}", { + payload: { + property1: "string", + property2: "string", + }, + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, SetPayloadPointsBuilder}; +use qdrant_client::Payload; +use serde_json::json; + +client + .set_payload( + SetPayloadPointsBuilder::new( + "{collection_name}", + Payload::try_from(json!({ + "property1": "string", + "property2": "string", + })) + .unwrap(), + ) + .points_selector(Filter::must([Condition::matches( + "color", + "red".to_string(), + )])) + .wait(true), + ) + .await?; +``` + +```java +import java.util.Map; + +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.ValueFactory.value; + +client + .setPayloadAsync( + "{collection_name}", + Map.of("property1", value("string"), "property2", value("string")), + Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), + true, + null, + null) + .get(); +``` + +_Available as of v1.8.0_ + +It is possible to modify only a specific key of the payload by using the `key` parameter. + +For instance, given the following payload JSON object on a point: + +```json +{ + "property1": { + "nested_property": "foo", + }, + "property2": { + "nested_property": "bar", + } +} +``` + +You can modify the `nested_property` of `property1` with the following request: + +```http +POST /collections/{collection_name}/points/payload +{ + "payload": { + "nested_property": "qux", + }, + "key": "property1", + "points": [1] +} +``` + +Resulting in the following payload: + +```json +{ + "property1": { + "nested_property": "qux", + }, + "property2": { + "nested_property": "bar", + } +} +``` + +### Overwrite payload + +Fully replace any existing payload with the given one. 
+ +REST API ([Schema](https://api.qdrant.tech/api-reference/points/overwrite-payload)): + +```python +client.overwrite_payload( + collection_name="{collection_name}", + payload={ + "property1": "string", + "property2": "string", + }, + points=[0, 3, 10], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.OverwritePayloadAsync( + collectionName: "{collection_name}", + payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, + ids: new ulong[] { 0, 3, 10 } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.OverwritePayload(context.Background(), &qdrant.SetPayloadPoints{ + CollectionName: "{collection_name}", + Payload: qdrant.NewValueMap( + map[string]any{"property1": "string", "property2": "string"}), + PointsSelector: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), + qdrant.NewIDNum(3)), +}) +``` + +```http +PUT /collections/{collection_name}/points/payload +{ + "payload": { + "property1": "string", + "property2": "string" + }, + "points": [ + 0, 3, 100 + ] +} +``` + +```typescript +client.overwritePayload("{collection_name}", { + payload: { + property1: "string", + property2: "string", + }, + points: [0, 3, 10], +}); +``` + +```rust +use qdrant_client::qdrant::{PointsIdsList, SetPayloadPointsBuilder}; +use qdrant_client::Payload; +use serde_json::json; + +client + .overwrite_payload( + SetPayloadPointsBuilder::new( + "{collection_name}", + Payload::try_from(json!({ + "property1": "string", + "property2": "string", + })) + .unwrap(), + ) + .points_selector(PointsIdsList { + ids: vec![0.into(), 3.into(), 10.into()], + }) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; + +client + .overwritePayloadAsync( + "{collection_name}", + Map.of("property1", value("string"), "property2", value("string")), + List.of(id(0), id(3), id(10)), + true, + null, + null) + .get(); +``` + +Like [set payload](#set-payload), you don't need to know the ids of the points +you want to modify. The alternative is to use filters. 
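+
+For completeness, here is a minimal Python sketch that mirrors the filter-based set payload example above. It assumes `overwrite_payload` accepts the same filter-based `points` selector as `set_payload`:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.overwrite_payload(
+    collection_name="{collection_name}",
+    payload={
+        "property1": "string",
+        "property2": "string",
+    },
+    points=models.Filter(
+        must=[
+            models.FieldCondition(
+                key="color",
+                match=models.MatchValue(value="red"),
+            ),
+        ],
+    ),
+)
+```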
+ +### Clear payload + +This method removes all payload keys from specified points + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/clear-payload)): + +```python +client.clear_payload( + collection_name="{collection_name}", + points_selector=[0, 3, 100], +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.ClearPayloadAsync(collectionName: "{collection_name}", ids: new ulong[] { 0, 3, 100 }); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client.ClearPayload(context.Background(), &qdrant.ClearPayloadPoints{ + CollectionName: "{collection_name}", + Points: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), + qdrant.NewIDNum(3)), +}) +``` + +```http +POST /collections/{collection_name}/points/payload/clear +{ + "points": [0, 3, 100] +} +``` + +```typescript +client.clearPayload("{collection_name}", { + points: [0, 3, 100], +}); +``` + +```rust +use qdrant_client::qdrant::{ClearPayloadPointsBuilder, PointsIdsList}; + +client + .clear_payload( + ClearPayloadPointsBuilder::new("{collection_name}") + .points(PointsIdsList { + ids: vec![0.into(), 3.into(), 10.into()], + }) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; + +client + .clearPayloadAsync("{collection_name}", List.of(id(0), id(3), id(100)), true, null, null) + .get(); +``` + + + +### Delete payload keys + +Delete specific payload keys from points. + +REST API ([Schema](https://api.qdrant.tech/api-reference/points/delete-payload)): + +```python +client.delete_payload( + collection_name="{collection_name}", + keys=["color", "price"], + points=[0, 3, 100], +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.DeletePayloadAsync( + collectionName: "{collection_name}", + keys: ["color", "price"], + ids: new ulong[] { 0, 3, 100 } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.DeletePayload(context.Background(), &qdrant.DeletePayloadPoints{ + CollectionName: "{collection_name}", + Keys: []string{"color", "price"}, + PointsSelector: qdrant.NewPointsSelector( + qdrant.NewIDNum(0), + qdrant.NewIDNum(3)), +}) +``` + +```http +POST /collections/{collection_name}/points/payload/delete +{ + "keys": ["color", "price"], + "points": [0, 3, 100] +} +``` + +```typescript +client.deletePayload("{collection_name}", { + keys: ["color", "price"], + points: [0, 3, 100], +}); +``` + +```rust +use qdrant_client::qdrant::{DeletePayloadPointsBuilder, PointsIdsList}; + +client + .delete_payload( + DeletePayloadPointsBuilder::new( + "{collection_name}", + vec!["color".to_string(), "price".to_string()], + ) + .points_selector(PointsIdsList { + ids: vec![0.into(), 3.into(), 10.into()], + }) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.PointIdFactory.id; + +client + .deletePayloadAsync( + "{collection_name}", + List.of("color", "price"), + List.of(id(0), id(3), id(100)), + true, + null, + null) + .get(); +``` + +Alternatively, you can use filters to delete payload keys from the points. 
+ +```python +client.delete_payload( + collection_name="{collection_name}", + keys=["color", "price"], + points=models.Filter( + must=[ + models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.DeletePayloadAsync( + collectionName: "{collection_name}", + keys: ["color", "price"], + filter: MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.DeletePayload(context.Background(), &qdrant.DeletePayloadPoints{ + CollectionName: "{collection_name}", + Keys: []string{"color", "price"}, + PointsSelector: qdrant.NewPointsSelectorFilter( + &qdrant.Filter{ + Must: []*qdrant.Condition{qdrant.NewMatch("color", "red")}, + }, + ), +}) +``` + +```http +POST /collections/{collection_name}/points/payload/delete +{ + "keys": ["color", "price"], + "filter": { + "must": [ + { + "key": "color", + "match": { + "value": "red" + } + } + ] + } +} +``` + +```typescript +client.deletePayload("{collection_name}", { + keys: ["color", "price"], + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, DeletePayloadPointsBuilder, Filter}; + +client + .delete_payload( + DeletePayloadPointsBuilder::new( + "{collection_name}", + vec!["color".to_string(), "price".to_string()], + ) + .points_selector(Filter::must([Condition::matches( + "color", + "red".to_string(), + )])) + .wait(true), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.matchKeyword; + +client + .deletePayloadAsync( + "{collection_name}", + List.of("color", "price"), + Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), + true, + null, + null) + .get(); +``` + +## Payload indexing + +To search more efficiently with filters, Qdrant allows you to create indexes for payload fields by specifying the name and type of field it is intended to be. + +The indexed fields also affect the vector index. See [Indexing](/documentation/concepts/indexing/) for details. + +In practice, we recommend creating an index on those fields that could potentially constrain the results the most. +For example, using an index for the object ID will be much more efficient, being unique for each record, than an index by its color, which has only a few possible values. + +In compound queries involving multiple fields, Qdrant will attempt to use the most restrictive index first. 
+ +To create index for the field, you can use the following: + +REST API ([Schema](https://api.qdrant.tech/api-reference/indexes/create-field-index)) + +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema="keyword", +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index" +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), +}) +``` + +```http +PUT /collections/{collection_name}/index +{ + "field_name": "name_of_the_field_to_index", + "field_schema": "keyword" +} +``` + +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: "keyword", +}); +``` + +```rust +use qdrant_client::qdrant::{CreateFieldIndexCollectionBuilder, FieldType}; + +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Keyword, + ) + .wait(true), + ) + .await?; +``` + +```java +import io.qdrant.client.grpc.Collections.PayloadSchemaType; + +client.createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Keyword, + null, + true, + null, + null); +``` + +The index usage flag is displayed in the payload schema with the [collection info API](https://api.qdrant.tech/api-reference/collections/get-collection). + +Payload schema example: + +```json +{ + "payload_schema": { + "property1": { + "data_type": "keyword" + }, + "property2": { + "data_type": "integer" + } + } +} +``` + +## Facet counts + +*Available as of v1.12.0* + +Faceting is a special counting technique that can be used for various purposes: +- Know which unique values exist for a payload key. +- Know the number of points that contain each unique value. +- Know how restrictive a filter would become by matching a specific value. + +Specifically, it is a counting aggregation for the values in a field, akin to a `GROUP BY` with `COUNT(*)` commands in SQL. + +These results for a specific field is called a "facet". For example, when you look at an e-commerce search results page, you might see a list of brands on the sidebar, showing the number of products for each brand. This would be a facet for a `"brand"` field. 
+ + + +To get the facet counts for a field, you can use the following: + + + +REST API ([Facet](https://api.qdrant.tech/v-1-13-x/api-reference/points/facet)) + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.facet( + collection_name="{collection_name}", + key="size", + facet_filter=models.Filter(must=[models.Match("color", "red")]), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.FacetAsync( + "{collection_name}", + key: "size", + filter: MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +res, err := client.Facet(ctx, &qdrant.FacetCounts{ + CollectionName: "{collection_name}", + Key: "size", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/facet +{ + "key": "size", + "filter": { + "must": { + "key": "color", + "match": { "value": "red" } + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.facet("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, + key: "size", +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, FacetCountsBuilder, Filter}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .facet( + FacetCountsBuilder::new("{collection_name}", "size") + .limit(10) + .filter(Filter::must(vec![Condition::matches( + "color", + "red".to_string(), + )])), + ) + .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; + +import static io.qdrant.client.ConditionFactory.matchKeyword; +import io.qdrant.client.grpc.Points; +import io.qdrant.client.grpc.Filter; + +QdrantClient client = new QdrantClient( + QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .facetAsync( + Points.FacetCounts.newBuilder() + .setCollectionName(collection_name) + .setKey("size") + .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) + .build()) + .get(); +``` + +The response will contain the counts for each unique value in the field: + +```json +{ + "response": { + "hits": [ + {"value": "L", "count": 19}, + {"value": "S", "count": 10}, + {"value": "M", "count": 5}, + {"value": "XL", "count": 1}, + {"value": "XXL", "count": 1} + ] + }, + "time": 0.0001 +} +``` + +The results are sorted by the count in descending order, then by the value in ascending order. +Only values with non-zero counts will be returned. + +By default, the way Qdrant the counts for each value is approximate to achieve fast results. This should accurate enough for most cases, but if you need to debug your storage, you can use the `exact` parameter to get exact counts. 
+ +```python +client.facet( + collection_name="{collection_name}", + key="size", + exact=True, +) +``` + +```csharp +using Qdrant.Client; + +await client.FacetAsync( + "{collection_name}", + key: "size", + exact: true, +); +``` + +```go +res, err := client.Facet(ctx, &qdrant.FacetCounts{ + CollectionName: "{collection_name}", + Key: "key", + Exact: true, +}) +``` + +```http +POST /collections/{collection_name}/facet +{ + "key": "size", + "exact": true +} +``` + +```typescript +client.facet("{collection_name}", { + key: "size", + exact: true, +}); +``` + +```rust +use qdrant_client::qdrant::FacetCountsBuilder; + +client + .facet( + FacetCountsBuilder::new("{collection_name}", "size") + .limit(10) + .exact(true), + ) + .await?; +``` + +```java + client + .facetAsync( + Points.FacetCounts.newBuilder() + .setCollectionName(collection_name) + .setKey("foo") + .setExact(true) + .build()) + .get(); +``` + +<|page-6-lllmstxt|> +# Similarity search + +Searching for the nearest vectors is at the core of many representational learning applications. +Modern neural networks are trained to transform objects into vectors so that objects close in the real world appear close in vector space. +It could be, for example, texts with similar meanings, visually similar pictures, or songs of the same genre. + +{{< figure src="/docs/encoders.png" caption="This is how vector similarity works" width="70%" >}} + +## Query API + +*Available as of v1.10.0* + +Qdrant provides a single interface for all kinds of search and exploration requests - the `Query API`. +Here is a reference list of what kind of queries you can perform with the `Query API` in Qdrant: + +Depending on the `query` parameter, Qdrant might prefer different strategies for the search. + +| | | +| --- | --- | +| Nearest Neighbors Search | Vector Similarity Search, also known as k-NN | +| Search By Id | Search by an already stored vector - skip embedding model inference | +| [Recommendations](/documentation/concepts/explore/#recommendation-api) | Provide positive and negative examples | +| [Discovery Search](/documentation/concepts/explore/#discovery-api) | Guide the search using context as a one-shot training set | +| [Scroll](/documentation/concepts/points/#scroll-points) | Get all points with optional filtering | +| [Grouping](/documentation/concepts/search/#grouping-api) | Group results by a certain field | +| [Order By](/documentation/concepts/hybrid-queries/#re-ranking-with-stored-values) | Order points by payload key | +| [Hybrid Search](/documentation/concepts/hybrid-queries/#hybrid-search) | Combine multiple queries to get better results | +| [Multi-Stage Search](/documentation/concepts/hybrid-queries/#multi-stage-queries) | Optimize performance for large embeddings | +| [Random Sampling](#random-sampling) | Get random points from the collection | + +**Nearest Neighbors Search** + +```python +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], # <--- Dense vector +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), +}) +``` + +```http +POST 
/collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7] // <--- Dense vector +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], // <--- Dense vector +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Condition, Filter, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(Query::new_nearest(vec![0.2, 0.1, 0.9, 0.7])) + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collectionName}") + .setQuery(nearest(List.of(0.2f, 0.1f, 0.9f, 0.7f))) + .build()).get(); +``` + +**Search By Id** + +```python +client.query_points( + collection_name="{collection_name}", + query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04", # <--- point id +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: Guid.Parse("43cf51e2-8777-4f52-bc74-c2cbde0c8b04") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryID(qdrant.NewID("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": "43cf51e2-8777-4f52-bc74-c2cbde0c8b04" // <--- point id +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: '43cf51e2-8777-4f52-bc74-c2cbde0c8b04', // <--- point id +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Condition, Filter, PointId, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(Query::new_nearest(PointId::new("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) + ) + .await?; +``` + +```java +import java.util.UUID; + +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collectionName}") + .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) + .build()).get(); +``` + +## Metrics + +There are many ways to estimate the similarity of vectors with each other. +In Qdrant terms, these ways are called metrics. +The choice of metric depends on the vectors obtained and, in particular, on the neural network encoder training method. 
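
The metric is chosen per collection (or per named vector) when the collection is created. A minimal sketch with the Python client, assuming a new collection of 4-dimensional vectors is being created:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# The distance metric is part of the vector configuration of the collection.
client.create_collection(
    collection_name="{collection_name}",
    vectors_config=models.VectorParams(
        size=4,
        distance=models.Distance.COSINE,
    ),
)
```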
+ +Qdrant supports these most popular types of metrics: + +* Dot product: `Dot` - +* Cosine similarity: `Cosine` - +* Euclidean distance: `Euclid` - +* Manhattan distance: `Manhattan`*- *Available as of v1.7 + +The most typical metric used in similarity learning models is the cosine metric. + +![Embeddings](/docs/cos.png) + +Qdrant counts this metric in 2 steps, due to which a higher search speed is achieved. +The first step is to normalize the vector when adding it to the collection. +It happens only once for each vector. + +The second step is the comparison of vectors. +In this case, it becomes equivalent to dot production - a very fast operation due to SIMD. + +Depending on the query configuration, Qdrant might prefer different strategies for the search. +Read more about it in the [query planning](#query-planning) section. + +## Search API + +Let's look at an example of a search query. + +REST API - API Schema definition is available [here](https://api.qdrant.tech/api-reference/search/query-points) + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue( + value="London", + ), + ) + ] + ), + search_params=models.SearchParams(hnsw_ef=128, exact=False), + limit=3, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + filter: MatchKeyword("city", "London"), + searchParams: new SearchParams { Exact = false, HnswEf = 128 }, + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, + }, + Params: &qdrant.SearchParams{ + Exact: qdrant.PtrOf(false), + HnswEf: qdrant.PtrOf(uint64(128)), + }, +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.79], + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "params": { + "hnsw_ef": 128, + "exact": false + }, + "limit": 3 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + filter: { + must: [ + { + key: "city", + match: { + value: "London", + }, + }, + ], + }, + params: { + hnsw_ef: 128, + exact: false, + }, + limit: 3, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder, SearchParamsBuilder}; +use qdrant_client::Qdrant; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .filter(Filter::must([Condition::matches( + "city", + "London".to_string(), + )])) + .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static 
io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London")).build()) + .setParams(SearchParams.newBuilder().setExact(false).setHnswEf(128).build()) + .setLimit(3) + .build()).get(); +``` + +In this example, we are looking for vectors similar to vector `[0.2, 0.1, 0.9, 0.7]`. +Parameter `limit` (or its alias - `top`) specifies the amount of most similar results we would like to retrieve. + +Values under the key `params` specify custom parameters for the search. +Currently, it could be: + +* `hnsw_ef` - value that specifies `ef` parameter of the HNSW algorithm. +* `exact` - option to not use the approximate search (ANN). If set to true, the search may run for a long as it performs a full scan to retrieve exact results. +* `indexed_only` - With this option you can disable the search in those segments where vector index is not built yet. This may be useful if you want to minimize the impact to the search performance whilst the collection is also being updated. Using this option may lead to a partial result if the collection is not fully indexed yet, consider using it only if eventual consistency is acceptable for your use case. + +Since the `filter` parameter is specified, the search is performed only among those points that satisfy the filter condition. +See details of possible filters and their work in the [filtering](/documentation/concepts/filtering/) section. + +Example result of this API would be + +```json +{ + "result": [ + { "id": 10, "score": 0.81 }, + { "id": 14, "score": 0.75 }, + { "id": 11, "score": 0.73 } + ], + "status": "ok", + "time": 0.001 +} +``` + +The `result` contains ordered by `score` list of found point ids. + +Note that payload and vector data is missing in these results by default. +See [payload and vector in the result](#payload-and-vector-in-the-result) on how +to include it. 
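
On the client side, the same hits are available on the returned response object. A minimal sketch with the Python client, reusing the search from the example above:

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

response = client.query_points(
    collection_name="{collection_name}",
    query=[0.2, 0.1, 0.9, 0.7],
    limit=3,
)

# Each hit is a scored point; payload and vector stay empty unless
# with_payload / with_vectors are requested explicitly.
for point in response.points:
    print(point.id, point.score)
```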
+ +If the collection was created with multiple vectors, the name of the vector to use for searching should be provided: + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + using="image", + limit=3, +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + usingVector: "image", + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Using: qdrant.PtrOf("image"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "using": "image", + "limit": 3 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + using: "image", + limit: 3, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .using("image"), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setUsing("image") + .setLimit(3) + .build()).get(); +``` + +Search is processing only among vectors with the same name. + +If the collection was created with sparse vectors, the name of the sparse vector to use for searching should be provided: + +You can still use payload filtering and other features of the search API with sparse vectors. + +There are however important differences between dense and sparse vector search: + +| Index| Sparse Query | Dense Query | +| --- | --- | --- | +| Scoring Metric | Default is `Dot product`, no need to specify it | `Distance` has supported metrics e.g. Dot, Cosine | +| Search Type | Always exact in Qdrant | HNSW is an approximate NN | +| Return Behaviour | Returns only vectors with non-zero values in the same indices as the query vector | Returns `limit` vectors | + +In general, the speed of the search is proportional to the number of non-zero values in the query vector. 
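
For reference, a sparse vector is defined under its own name when the collection is created; the query examples below assume a sparse vector called `text`. A minimal sketch with the Python client (the collection layout is an assumption for illustration):

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# A collection with a single named sparse vector and no dense vectors.
client.create_collection(
    collection_name="{collection_name}",
    vectors_config={},
    sparse_vectors_config={
        "text": models.SparseVectorParams(),
    },
)
```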
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + + +result = client.query_points( + collection_name="{collection_name}", + query=models.SparseVector(indices=[1, 3, 5, 7], values=[0.1, 0.2, 0.3, 0.4]), + using="text", +).points +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new (float, uint)[] {(0.1f, 1), (0.2f, 3), (0.3f, 5), (0.4f, 7)}, + usingVector: "text", + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuerySparse( + []uint32{1, 3, 5, 7}, + []float32{0.1, 0.2, 0.3, 0.4}), + Using: qdrant.PtrOf("text"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "indices": [1, 3, 5, 7], + "values": [0.1, 0.2, 0.3, 0.4] + }, + "using": "text" +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: { + indices: [1, 3, 5, 7], + values: [0.1, 0.2, 0.3, 0.4] + }, + using: "text", + limit: 3, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![(1, 0.2), (3, 0.1), (5, 0.9), (7, 0.7)]) + .limit(10) + .using("text"), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setUsing("text") + .setQuery(nearest(List.of(0.1f, 0.2f, 0.3f, 0.4f), List.of(1, 3, 5, 7))) + .setLimit(3) + .build()) + .get(); +``` + +### Filtering results by score + +In addition to payload filtering, it might be useful to filter out results with a low similarity score. +For example, if you know the minimal acceptance score for your model and do not want any results which are less similar than the threshold. +In this case, you can use `score_threshold` parameter of the search query. +It will exclude all results with a score worse than the given. + + + +### Payload and vector in the result + +By default, retrieval methods do not return any stored information such as +payload and vectors. Additional parameters `with_vectors` and `with_payload` +alter this behavior. 
+ +Example: + +```python +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + with_vectors=True, + with_payload=True, +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + payloadSelector: true, + vectorsSelector: true, + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + WithPayload: qdrant.NewWithPayload(true), + WithVectors: qdrant.NewWithVectors(true), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "with_vectors": true, + "with_payload": true +} +``` + +```typescript +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + with_vector: true, + with_payload: true, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .with_payload(true) + .with_vectors(true), + ) + .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.WithVectorsSelectorFactory; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.WithPayloadSelectorFactory.enable; + + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setWithPayload(enable(true)) + .setWithVectors(WithVectorsSelectorFactory.enable(true)) + .setLimit(3) + .build()) + .get(); +``` + +You can use `with_payload` to scope to or filter a specific payload subset. 
+You can even specify an array of items to include, such as `city`, +`village`, and `town`: + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + with_payload=["city", "village", "town"], +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + payloadSelector: new WithPayloadSelector + { + Include = new PayloadIncludeSelector + { + Fields = { new string[] { "city", "village", "town" } } + } + }, + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + WithPayload: qdrant.NewWithPayloadInclude("city", "village", "town"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "with_payload": ["city", "village", "town"] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + with_payload: ["city", "village", "town"], +}); +``` + +```rust +use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointsBuilder}; +use qdrant_client::Qdrant; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .with_payload(SelectorOptions::Include( + vec![ + "city".to_string(), + "village".to_string(), + "town".to_string(), + ] + .into(), + )) + .with_vectors(true), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.WithPayloadSelectorFactory.include; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setWithPayload(include(List.of("city", "village", "town"))) + .setLimit(3) + .build()) + .get(); +``` + +Or use `include` or `exclude` explicitly. 
For example, to exclude `city`: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + with_payload=models.PayloadSelectorExclude( + exclude=["city"], + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + payloadSelector: new WithPayloadSelector + { + Exclude = new PayloadExcludeSelector { Fields = { new string[] { "city" } } } + }, + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + WithPayload: qdrant.NewWithPayloadExclude("city"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "with_payload": { + "exclude": ["city"] + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + with_payload: { + exclude: ["city"], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .with_payload(SelectorOptions::Exclude(vec!["city".to_string()].into())) + .with_vectors(true), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.WithPayloadSelectorFactory.exclude; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setWithPayload(exclude(List.of("city"))) + .setLimit(3) + .build()) + .get(); +``` + +It is possible to target nested fields using a dot notation: +* `payload.nested_field` - for a nested field +* `payload.nested_array[].sub_field` - for projecting nested fields within an array + +Accessing array elements by index is currently not supported. + +## Batch search API + +The batch search API enables to perform multiple search requests via a single request. + +Its semantic is straightforward, `n` batched search requests are equivalent to `n` singular search requests. + +This approach has several advantages. Logically, fewer network connections are required which can be very beneficial on its own. + +More importantly, batched requests will be efficiently processed via the query planner which can detect and optimize requests if they have the same `filter`. + +This can have a great effect on latency for non trivial filters as the intermediary results can be shared among the request. + +In order to use it, simply pack together your search requests. 
All the regular attributes of a search request are of course available. + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +filter_ = models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue( + value="London", + ), + ) + ] +) + +search_queries = [ + models.QueryRequest(query=[0.2, 0.1, 0.9, 0.7], filter=filter_, limit=3), + models.QueryRequest(query=[0.5, 0.3, 0.2, 0.3], filter=filter_, limit=3), +] + +client.query_batch_points(collection_name="{collection_name}", requests=search_queries) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +var filter = MatchKeyword("city", "London"); + +var queries = new List +{ + new() + { + CollectionName = "{collection_name}", + Query = new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + Filter = filter, + Limit = 3 + }, + new() + { + CollectionName = "{collection_name}", + Query = new float[] { 0.5f, 0.3f, 0.2f, 0.3f }, + Filter = filter, + Limit = 3 + } +}; + +await client.QueryBatchAsync(collectionName: "{collection_name}", queries: queries); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +filter := qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, +} + +client.QueryBatch(context.Background(), &qdrant.QueryBatchPoints{ + CollectionName: "{collection_name}", + QueryPoints: []*qdrant.QueryPoints{ + { + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Filter: &filter, + }, + { + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.5, 0.3, 0.2, 0.3), + Filter: &filter, + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/query/batch +{ + "searches": [ + { + "query": [0.2, 0.1, 0.9, 0.7], + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "limit": 3 + }, + { + "query": [0.5, 0.3, 0.2, 0.3], + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "limit": 3 + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +const filter = { + must: [ + { + key: "city", + match: { + value: "London", + }, + }, + ], +}; + +const searches = [ + { + query: [0.2, 0.1, 0.9, 0.7], + filter, + limit: 3, + }, + { + query: [0.5, 0.3, 0.2, 0.3], + filter, + limit: 3, + }, +]; + +client.queryBatch("{collection_name}", { + searches, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, QueryBatchPointsBuilder, QueryPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let filter = Filter::must([Condition::matches("city", "London".to_string())]); + +let searches = vec![ + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.1, 0.2, 0.3, 0.4]) + .limit(3) + .filter(filter.clone()) + .build(), + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.5, 0.3, 0.2, 0.3]) + .limit(3) + .filter(filter) + .build(), +]; + +client + .query_batch(QueryBatchPointsBuilder::new("{collection_name}", searches)) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; 
+import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ConditionFactory.matchKeyword; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +Filter filter = Filter.newBuilder().addMust(matchKeyword("city", "London")).build(); + +List searches = List.of( + QueryPoints.newBuilder() + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setFilter(filter) + .setLimit(3) + .build(), + QueryPoints.newBuilder() + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setFilter(filter) + .setLimit(3) + .build()); + +client.queryBatchAsync("{collection_name}", searches).get(); +``` + +The result of this API contains one array per search requests. + +```json +{ + "result": [ + [ + { "id": 10, "score": 0.81 }, + { "id": 14, "score": 0.75 }, + { "id": 11, "score": 0.73 } + ], + [ + { "id": 1, "score": 0.92 }, + { "id": 3, "score": 0.89 }, + { "id": 9, "score": 0.75 } + ] + ], + "status": "ok", + "time": 0.001 +} +``` + +## Query by ID + +Whenever you need to use a vector as an input, you can always use a [point ID](/documentation/concepts/points/#point-ids) instead. + +```python +client.query_points( + collection_name="{collection_name}", + query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04", # <--- point id +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: Guid.Parse("43cf51e2-8777-4f52-bc74-c2cbde0c8b04") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryID(qdrant.NewID("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": "43cf51e2-8777-4f52-bc74-c2cbde0c8b04" // <--- point id +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: '43cf51e2-8777-4f52-bc74-c2cbde0c8b04', // <--- point id +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Condition, Filter, PointId, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(Query::new_nearest(PointId::new("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) + ) + .await?; +``` + +```java +import java.util.UUID; + +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collectionName}") + .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) + .build()).get(); +``` + +The above example will fetch the default vector from the point with this id, and use it as the query vector. + +If the `using` parameter is also specified, Qdrant will use the vector with that name. + +It is also possible to reference an ID from a different collection, by setting the `lookup_from` parameter. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04", # <--- point id + using="512d-vector", + lookup_from=models.LookupLocation( + collection="another_collection", # <--- other collection name + vector="image-512", # <--- vector name in the other collection + ) +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: Guid.Parse("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"), // <--- point id + usingVector: "512d-vector", + lookupFrom: new() { + CollectionName = "another_collection", // <--- other collection name + VectorName = "image-512" // <--- vector name in the other collection + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryID(qdrant.NewID("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")), + Using: qdrant.PtrOf("512d-vector"), + LookupFrom: &qdrant.LookupLocation{ + CollectionName: "another_collection", + VectorName: qdrant.PtrOf("image-512"), + }, +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": "43cf51e2-8777-4f52-bc74-c2cbde0c8b04", // <--- point id + "using": "512d-vector" + "lookup_from": { + "collection": "another_collection", // <--- other collection name + "vector": "image-512" // <--- vector name in the other collection + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: '43cf51e2-8777-4f52-bc74-c2cbde0c8b04', // <--- point id + using: '512d-vector', + lookup_from: { + collection: 'another_collection', // <--- other collection name + vector: 'image-512', // <--- vector name in the other collection + } +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{LookupLocationBuilder, PointId, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .query(Query::new_nearest("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")) + .using("512d-vector") + .lookup_from( + LookupLocationBuilder::new("another_collection") + .vector_name("image-512") + ) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.LookupLocation; +import io.qdrant.client.grpc.Points.QueryPoints; +import java.util.UUID; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) + .setUsing("512d-vector") + .setLookupFrom( + LookupLocation.newBuilder() + .setCollectionName("another_collection") + .setVectorName("image-512") + .build()) + .build()) + .get(); +``` + +In the case above, Qdrant will fetch the `"image-512"` vector from the specified point id in the +collection `another_collection`. 
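
As with any other query, ID-based queries accept the usual search parameters, including a payload filter and the `score_threshold` option described above. A minimal sketch with the Python client; the threshold value is an arbitrary example:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.query_points(
    collection_name="{collection_name}",
    query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04",  # <--- point id used as the query
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="city",
                match=models.MatchValue(value="London"),
            )
        ]
    ),
    score_threshold=0.5,  # exclude hits scoring worse than this (example value)
    limit=3,
)
```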
+ + + + +## Pagination + +Search and [recommendation](/documentation/concepts/explore/#recommendation-api) APIs allow to skip first results of the search and return only the result starting from some specified offset: + +Example: + +```python +from qdrant_client import QdrantClient + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + with_vectors=True, + with_payload=True, + limit=10, + offset=100, +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + payloadSelector: true, + vectorsSelector: true, + limit: 10, + offset: 100 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + WithPayload: qdrant.NewWithPayload(true), + WithVectors: qdrant.NewWithVectors(true), + Offset: qdrant.PtrOf(uint64(100)), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "with_vectors": true, + "with_payload": true, + "limit": 10, + "offset": 100 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + with_vector: true, + with_payload: true, + limit: 10, + offset: 100, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .with_payload(true) + .with_vectors(true) + .limit(10) + .offset(100), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.WithVectorsSelectorFactory; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.WithPayloadSelectorFactory.enable; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setWithPayload(enable(true)) + .setWithVectors(WithVectorsSelectorFactory.enable(true)) + .setLimit(10) + .setOffset(100) + .build()) + .get(); +``` + +Is equivalent to retrieving the 11th page with 10 records per page. + + + +Vector-based retrieval in general and HNSW index in particular, are not designed to be paginated. +It is impossible to retrieve Nth closest vector without retrieving the first N vectors first. + +However, using the offset parameter saves the resources by reducing network traffic and the number of times the storage is accessed. + +Using an `offset` parameter, will require to internally retrieve `offset + limit` points, but only access payload and vector from the storage those points which are going to be actually returned. + +## Grouping API + +It is possible to group results by a certain field. 
This is useful when you have multiple points for the same item, and you want to avoid redundancy of the same item in the results. + +For example, if you have a large document split into multiple chunks, and you want to search or [recommend](/documentation/concepts/explore/#recommendation-api) on a per-document basis, you can group the results by the document ID. + +Consider having points with the following payloads: + +```json +[ + { + "id": 0, + "payload": { + "chunk_part": 0, + "document_id": "a" + }, + "vector": [0.91] + }, + { + "id": 1, + "payload": { + "chunk_part": 1, + "document_id": ["a", "b"] + }, + "vector": [0.8] + }, + { + "id": 2, + "payload": { + "chunk_part": 2, + "document_id": "a" + }, + "vector": [0.2] + }, + { + "id": 3, + "payload": { + "chunk_part": 0, + "document_id": 123 + }, + "vector": [0.79] + }, + { + "id": 4, + "payload": { + "chunk_part": 1, + "document_id": 123 + }, + "vector": [0.75] + }, + { + "id": 5, + "payload": { + "chunk_part": 0, + "document_id": -10 + }, + "vector": [0.6] + } +] +``` + +With the ***groups*** API, you will be able to get the best *N* points for each document, assuming that the payload of the points contains the document ID. Of course there will be times where the best *N* points cannot be fulfilled due to lack of points or a big distance with respect to the query. In every case, the `group_size` is a best-effort parameter, akin to the `limit` parameter. + +### Search groups + +REST API ([Schema](https://api.qdrant.tech/api-reference/search/query-points-groups)): + +```python +client.query_points_groups( + collection_name="{collection_name}", + # Same as in the regular query_points() API + query=[1.1], + # Grouping parameters + group_by="document_id", # Path of the field to group by + limit=4, # Max amount of groups + group_size=2, # Max amount of points per group +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryGroupsAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + groupBy: "document_id", + limit: 4, + groupSize: 2 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + GroupBy: "document_id", + GroupSize: qdrant.PtrOf(uint64(2)), +}) +``` + +```http +POST /collections/{collection_name}/points/query/groups +{ + // Same as in the regular query API + "query": [1.1], + // Grouping parameters + "group_by": "document_id", // Path of the field to group by + "limit": 4, // Max amount of groups + "group_size": 2 // Max amount of points per group +} +``` + +```typescript +client.queryGroups("{collection_name}", { + query: [1.1], + group_by: "document_id", + limit: 4, + group_size: 2, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointGroupsBuilder; + +client + .query_groups( + QueryPointGroupsBuilder::new("{collection_name}", "document_id") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .group_size(2u64) + .with_payload(true) + .with_vectors(true) + .limit(4u64), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.grpc.Points.SearchPointGroups; + +client.queryGroupsAsync( + QueryPointGroups.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setGroupBy("document_id") + .setLimit(4) 
+ .setGroupSize(2) + .build()) + .get(); +``` + +The output of a ***groups*** call looks like this: + +```json +{ + "result": { + "groups": [ + { + "id": "a", + "hits": [ + { "id": 0, "score": 0.91 }, + { "id": 1, "score": 0.85 } + ] + }, + { + "id": "b", + "hits": [ + { "id": 1, "score": 0.85 } + ] + }, + { + "id": 123, + "hits": [ + { "id": 3, "score": 0.79 }, + { "id": 4, "score": 0.75 } + ] + }, + { + "id": -10, + "hits": [ + { "id": 5, "score": 0.6 } + ] + } + ] + }, + "status": "ok", + "time": 0.001 +} +``` + +The groups are ordered by the score of the top point in the group. Inside each group the points are sorted too. + +If the `group_by` field of a point is an array (e.g. `"document_id": ["a", "b"]`), the point can be included in multiple groups (e.g. `"document_id": "a"` and `document_id: "b"`). + + + +**Limitations**: + +* Only [keyword](/documentation/concepts/payload/#keyword) and [integer](/documentation/concepts/payload/#integer) payload values are supported for the `group_by` parameter. Payload values with other types will be ignored. +* At the moment, pagination is not enabled when using **groups**, so the `offset` parameter is not allowed. + +### Lookup in groups + +Having multiple points for parts of the same item often introduces redundancy in the stored data. Which may be fine if the information shared by the points is small, but it can become a problem if the payload is large, because it multiplies the storage space needed to store the points by a factor of the amount of points we have per group. + +One way of optimizing storage when using groups is to store the information shared by the points with the same group id in a single point in another collection. Then, when using the [**groups** API](#grouping-api), add the `with_lookup` parameter to bring the information from those points into each group. + +![Group id matches point id](/docs/lookup_id_linking.png) + + + +This has the extra benefit of having a single point to update when the information shared by the points in a group changes. + +For example, if you have a collection of documents, you may want to chunk them and store the points for the chunks in a separate collection, making sure that you store the point id from the document it belongs in the payload of the chunk point. 
+ +In this case, to bring the information from the documents into the chunks grouped by the document id, you can use the `with_lookup` parameter: + +```python +client.query_points_groups( + collection_name="chunks", + # Same as in the regular search() API + query=[1.1], + # Grouping parameters + group_by="document_id", # Path of the field to group by + limit=2, # Max amount of groups + group_size=2, # Max amount of points per group + # Lookup parameters + with_lookup=models.WithLookup( + # Name of the collection to look up points in + collection="documents", + # Options for specifying what to bring from the payload + # of the looked up point, True by default + with_payload=["title", "text"], + # Options for specifying what to bring from the vector(s) + # of the looked up point, True by default + with_vectors=False, + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.SearchGroupsAsync( + collectionName: "{collection_name}", + vector: new float[] { 0.2f, 0.1f, 0.9f, 0.7f}, + groupBy: "document_id", + limit: 2, + groupSize: 2, + withLookup: new WithLookup + { + Collection = "documents", + WithPayload = new WithPayloadSelector + { + Include = new PayloadIncludeSelector { Fields = { new string[] { "title", "text" } } } + }, + WithVectors = false + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + GroupBy: "document_id", + GroupSize: qdrant.PtrOf(uint64(2)), + WithLookup: &qdrant.WithLookup{ + Collection: "documents", + WithPayload: qdrant.NewWithPayloadInclude("title", "text"), + }, +}) +``` + +```http +POST /collections/chunks/points/query/groups +{ + // Same as in the regular query API + "query": [1.1], + + // Grouping parameters + "group_by": "document_id", + "limit": 2, + "group_size": 2, + + // Lookup parameters + "with_lookup": { + // Name of the collection to look up points in + "collection": "documents", + + // Options for specifying what to bring from the payload + // of the looked up point, true by default + "with_payload": ["title", "text"], + + // Options for specifying what to bring from the vector(s) + // of the looked up point, true by default + "with_vectors": false + } +} +``` + +```typescript +client.queryGroups("{collection_name}", { + query: [1.1], + group_by: "document_id", + limit: 2, + group_size: 2, + with_lookup: { + collection: "documents", + with_payload: ["title", "text"], + with_vectors: false, + }, +}); +``` + +```rust +use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointGroupsBuilder, WithLookupBuilder}; + +client + .query_groups( + QueryPointGroupsBuilder::new("{collection_name}", "document_id") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(2u64) + .limit(2u64) + .with_lookup( + WithLookupBuilder::new("documents") + .with_payload(SelectorOptions::Include( + vec!["title".to_string(), "text".to_string()].into(), + )) + .with_vectors(false), + ), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.grpc.Points.QueryPointGroups; +import io.qdrant.client.grpc.Points.WithLookup; + +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.WithVectorsSelectorFactory.enable; +import static 
io.qdrant.client.WithPayloadSelectorFactory.include; + +client.queryGroupsAsync( + QueryPointGroups.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setGroupBy("document_id") + .setLimit(2) + .setGroupSize(2) + .setWithLookup( + WithLookup.newBuilder() + .setCollection("documents") + .setWithPayload(include(List.of("title", "text"))) + .setWithVectors(enable(false)) + .build()) + .build()) + .get(); +``` + +For the `with_lookup` parameter, you can also use the shorthand `with_lookup="documents"` to bring the whole payload and vector(s) without explicitly specifying it. + +The looked up result will show up under `lookup` in each group. + +```json +{ + "result": { + "groups": [ + { + "id": 1, + "hits": [ + { "id": 0, "score": 0.91 }, + { "id": 1, "score": 0.85 } + ], + "lookup": { + "id": 1, + "payload": { + "title": "Document A", + "text": "This is document A" + } + } + }, + { + "id": 2, + "hits": [ + { "id": 1, "score": 0.85 } + ], + "lookup": { + "id": 2, + "payload": { + "title": "Document B", + "text": "This is document B" + } + } + } + ] + }, + "status": "ok", + "time": 0.001 +} +``` + +Since the lookup is done by matching directly with the point id, the lookup collection must be pre-populated with points where the `id` matches the `group_by` value (e.g., document_id) from your primary collection. + +Any group id that is not an existing (and valid) point id in the lookup collection will be ignored, and the `lookup` field will be empty. + +## Random Sampling + +*Available as of v1.11.0* + +In some cases it might be useful to retrieve a random sample of points from the collection. This can be useful for debugging, testing, or for providing entry points for exploration. + +Random sampling API is a part of [Universal Query API](#query-api) and can be used in the same way as regular search API. 
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+sampled = client.query_points(
+    collection_name="{collection_name}",
+    query=models.SampleQuery(sample=models.Sample.RANDOM)
+)
+```
+
+```csharp
+using Qdrant.Client;
+using Qdrant.Client.Grpc;
+
+var client = new QdrantClient("localhost", 6334);
+
+await client.QueryAsync(collectionName: "{collection_name}", query: Sample.Random);
+```
+
+```go
+import (
+	"context"
+
+	"github.com/qdrant/go-client/qdrant"
+)
+
+client, err := qdrant.NewClient(&qdrant.Config{
+	Host: "localhost",
+	Port: 6334,
+})
+
+client.Query(context.Background(), &qdrant.QueryPoints{
+	CollectionName: "{collection_name}",
+	Query:          qdrant.NewQuerySample(qdrant.Sample_Random),
+})
+```
+
+```http
+POST /collections/{collection_name}/points/query
+{
+    "query": {
+        "sample": "random"
+    }
+}
+```
+
+```typescript
+import { QdrantClient } from "@qdrant/js-client-rest";
+
+const client = new QdrantClient({ host: "localhost", port: 6333 });
+
+const sampled = await client.query("{collection_name}", {
+    query: {
+        sample: "random",
+    },
+});
+```
+
+```rust
+use qdrant_client::Qdrant;
+use qdrant_client::qdrant::{Query, QueryPointsBuilder, Sample};
+
+let client = Qdrant::from_url("http://localhost:6334").build()?;
+
+let sampled = client
+    .query(
+        QueryPointsBuilder::new("{collection_name}")
+            .query(Query::new_sample(Sample::Random))
+    )
+    .await?;
+```
+
+```java
+import static io.qdrant.client.QueryFactory.sample;
+
+import io.qdrant.client.QdrantClient;
+import io.qdrant.client.QdrantGrpcClient;
+import io.qdrant.client.grpc.Points.QueryPoints;
+import io.qdrant.client.grpc.Points.Sample;
+
+QdrantClient client =
+    new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build());
+
+client
+    .queryAsync(
+        QueryPoints.newBuilder()
+            .setCollectionName("{collection_name}")
+            .setQuery(sample(Sample.Random))
+            .build())
+    .get();
+```
+
+## Query planning
+
+Depending on the filter used in the search, there are several possible scenarios for query execution.
+Qdrant chooses one of the query execution options depending on the available indexes, the complexity of the conditions, and the cardinality of the filtering result.
+This process is called query planning.
+
+The strategy selection process relies heavily on heuristics and can vary from release to release.
+However, the general principles are:
+
+* planning is performed for each segment independently (see [storage](/documentation/concepts/storage/) for more information about segments)
+* prefer a full scan if the number of points is below a threshold
+* estimate the cardinality of a filtered result before selecting a strategy
+* retrieve points using the payload index (see [indexing](/documentation/concepts/indexing/)) if the cardinality is below the threshold
+* use the filterable vector index if the cardinality is above the threshold
+
+You can adjust the threshold using a [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml), as well as independently for each collection.
+
+<|page-7-lllmstxt|>
+# Explore the data
+
+After mastering the concepts in [search](/documentation/concepts/search/), you can start exploring your data in other ways. Qdrant provides a stack of APIs that allow you to find similar vectors in a different fashion, as well as to find the most dissimilar ones. These are useful tools for recommendation systems, data exploration, and data cleaning.
+ +## Recommendation API + +In addition to the regular search, Qdrant also allows you to search based on multiple positive and negative examples. The API is called ***recommend***, and the examples can be point IDs, so that you can leverage the already encoded objects; and, as of v1.6, you can also use raw vectors as input, so that you can create your vectors on the fly without uploading them as points. + +REST API - API Schema definition is available [here](https://api.qdrant.tech/api-reference/search/recommend-points) + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=models.RecommendQuery( + recommend=models.RecommendInput( + positive=[100, 231], + negative=[718, [0.2, 0.3, 0.4, 0.5]], + strategy=models.RecommendStrategy.AVERAGE_VECTOR, + ) + ), + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue( + value="London", + ), + ) + ] + ), + limit=3, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new RecommendInput { + Positive = { 100, 231 }, + Negative = { 718 } + }, + filter: MatchKeyword("city", "London"), + limit: 3 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ + Positive: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + qdrant.NewVectorInputID(qdrant.NewIDNum(231)), + }, + Negative: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + }), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "recommend": { + "positive": [100, 231], + "negative": [718, [0.2, 0.3, 0.4, 0.5]], + "strategy": "average_vector" + } + }, + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: { + recommend: { + positive: [100, 231], + negative: [718, [0.2, 0.3, 0.4, 0.5]], + strategy: "average_vector" + } + }, + filter: { + must: [ + { + key: "city", + match: { + value: "London", + }, + }, + ], + }, + limit: 3 +}); +``` + +```rust +use qdrant_client::qdrant::{ + Condition, Filter, QueryPointsBuilder, RecommendInputBuilder, RecommendStrategy, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query( + RecommendInputBuilder::default() + .add_positive(100) + .add_positive(231) + .add_positive(vec![0.2, 0.3, 0.4, 0.5]) + .add_negative(718) + .strategy(RecommendStrategy::AverageVector) + .build(), + ) + .limit(3) + .filter(Filter::must([Condition::matches( + "city", + "London".to_string(), + )])), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import 
io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.RecommendInput; +import io.qdrant.client.grpc.Points.RecommendStrategy; +import io.qdrant.client.grpc.Points.Filter; + +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.recommend; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(recommend(RecommendInput.newBuilder() + .addAllPositive(List.of(vectorInput(100), vectorInput(200), vectorInput(100.0f, 231.0f))) + .addAllNegative(List.of(vectorInput(718), vectorInput(0.2f, 0.3f, 0.4f, 0.5f))) + .setStrategy(RecommendStrategy.AverageVector) + .build())) + .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London"))) + .setLimit(3) + .build()).get(); +``` + +Example result of this API would be + +```json +{ + "result": [ + { "id": 10, "score": 0.81 }, + { "id": 14, "score": 0.75 }, + { "id": 11, "score": 0.73 } + ], + "status": "ok", + "time": 0.001 +} +``` + +The algorithm used to get the recommendations is selected from the available `strategy` options. Each of them has its own strengths and weaknesses, so experiment and choose the one that works best for your case. + +### Average vector strategy + +The default and first strategy added to Qdrant is called `average_vector`. It preprocesses the input examples to create a single vector that is used for the search. Since the preprocessing step happens very fast, the performance of this strategy is on-par with regular search. The intuition behind this kind of recommendation is that each vector component represents an independent feature of the data, so, by averaging the examples, we should get a good recommendation. + +The way to produce the searching vector is by first averaging all the positive and negative examples separately, and then combining them into a single vector using the following formula: + +```rust +avg_positive + avg_positive - avg_negative +``` + +In the case of not having any negative examples, the search vector will simply be equal to `avg_positive`. + +This is the default strategy that's going to be set implicitly, but you can explicitly define it by setting `"strategy": "average_vector"` in the recommendation request. + +### Best score strategy + +*Available as of v1.6.0* + +A new strategy introduced in v1.6, is called `best_score`. It is based on the idea that the best way to find similar vectors is to find the ones that are closer to a positive example, while avoiding the ones that are closer to a negative one. +The way it works is that each candidate is measured against every example, then we select the best positive and best negative scores. The final score is chosen with this step formula: + +```rust +// Sigmoid function to normalize the score between 0 and 1 +let sigmoid = |x| 0.5 * (1.0 + (x / (1.0 + x.abs()))); + +let score = if best_positive_score > best_negative_score { + sigmoid(best_positive_score) +} else { + -sigmoid(best_negative_score) +}; +``` + + + +Since we are computing similarities to every example at each step of the search, the performance of this strategy will be linearly impacted by the amount of examples. This means that the more examples you provide, the slower the search will be. 
However, this strategy can be very powerful and should be more embedding-agnostic. + + + +To use this algorithm, you need to set `"strategy": "best_score"` in the recommendation request. + +#### Using only negative examples + +A beneficial side-effect of `best_score` strategy is that you can use it with only negative examples. This will allow you to find the most dissimilar vectors to the ones you provide. This can be useful for finding outliers in your data, or for finding the most dissimilar vectors to a given one. + +Combining negative-only examples with filtering can be a powerful tool for data exploration and cleaning. + +### Sum scores strategy + +Another strategy for using multiple query vectors simultaneously is to just sum their scores against the candidates. In qdrant, this is called `sum_scores` strategy. + +This strategy was used in [this paper](https://arxiv.org/abs/2210.10695) by [UKP Lab](http://www.ukp.tu-darmstadt.de/), [hessian.ai](https://hessian.ai) and [cohere.ai](https://cohere.ai) to incorporate relevance feedback into a subsequent search. In the paper this boosted the nDCG@20 performance by 5.6% points when using 2-8 positive feedback documents. + +The formula that this strategy implements is + +$$ +s_i = \sum_{v_q\in Q^+}s(v_q, v_i) - \sum_{v_q\in Q^-}s(v_q, v_i) +$$ + +where $Q^+$ is the set of positive examples, $Q^-$ is the set of negative examples, and $s(v_q, v_i)$ is the score of the vector $v_q$ against the vector $v_i$ + +As with `best_score`, this strategy also allows using only negative examples. + +### Multiple vectors + +*Available as of v0.10.0* + +If the collection was created with multiple vectors, the name of the vector should be specified in the recommendation request: + +```python +client.query_points( + collection_name="{collection_name}", + query=models.RecommendQuery( + recommend=models.RecommendInput( + positive=[100, 231], + negative=[718], + ) + ), + using="image", + limit=10, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new RecommendInput { + Positive = { 100, 231 }, + Negative = { 718 } + }, + usingVector: "image", + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ + Positive: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + qdrant.NewVectorInputID(qdrant.NewIDNum(231)), + }, + Negative: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + }), + Using: qdrant.PtrOf("image"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "recommend": { + "positive": [100, 231], + "negative": [718] + } + }, + "using": "image", + "limit": 10 +} +``` + +```typescript +client.query("{collection_name}", { + query: { + recommend: { + positive: [100, 231], + negative: [718], + } + }, + using: "image", + limit: 10 +}); +``` + +```rust +use qdrant_client::qdrant::{QueryPointsBuilder, RecommendInputBuilder}; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query( + RecommendInputBuilder::default() + .add_positive(100) + .add_positive(231) + .add_negative(718) + .build(), + ) + .limit(10) + .using("image"), + ) + 
.await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.RecommendInput; + +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.recommend; + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(recommend(RecommendInput.newBuilder() + .addAllPositive(List.of(vectorInput(100), vectorInput(231))) + .addAllNegative(List.of(vectorInput(718))) + .build())) + .setUsing("image") + .setLimit(10) + .build()).get(); +``` + +Parameter `using` specifies which stored vectors to use for the recommendation. + +### Lookup vectors from another collection + +*Available as of v0.11.6* + +If you have collections with vectors of the same dimensionality, +and you want to look for recommendations in one collection based on the vectors of another collection, +you can use the `lookup_from` parameter. + +It might be useful, e.g. in the item-to-user recommendations scenario. +Where user and item embeddings, although having the same vector parameters (distance type and dimensionality), are usually stored in different collections. + +```python +client.query_points( + collection_name="{collection_name}", + query=models.RecommendQuery( + recommend=models.RecommendInput( + positive=[100, 231], + negative=[718], + ) + ), + using="image", + limit=10, + lookup_from=models.LookupLocation( + collection="{external_collection_name}", vector="{external_vector_name}" + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new RecommendInput { + Positive = { 100, 231 }, + Negative = { 718 } + }, + usingVector: "image", + limit: 10, + lookupFrom: new LookupLocation + { + CollectionName = "{external_collection_name}", + VectorName = "{external_vector_name}", + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ + Positive: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + qdrant.NewVectorInputID(qdrant.NewIDNum(231)), + }, + Negative: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + }), + Using: qdrant.PtrOf("image"), + LookupFrom: &qdrant.LookupLocation{ + CollectionName: "{external_collection_name}", + VectorName: qdrant.PtrOf("{external_vector_name}"), + }, +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "recommend": { + "positive": [100, 231], + "negative": [718] + } + }, + "limit": 10, + "lookup_from": { + "collection": "{external_collection_name}", + "vector": "{external_vector_name}" + } +} +``` + +```typescript +client.query("{collection_name}", { + query: { + recommend: { + positive: [100, 231], + negative: [718], + } + }, + using: "image", + limit: 10, + lookup_from: { + collection: "{external_collection_name}", + vector: "{external_vector_name}" + } +}); +``` + +```rust +use qdrant_client::qdrant::{LookupLocationBuilder, QueryPointsBuilder, RecommendInputBuilder}; + +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query( + RecommendInputBuilder::default() + .add_positive(100) + .add_positive(231) + 
.add_negative(718) + .build(), + ) + .limit(10) + .using("image") + .lookup_from( + LookupLocationBuilder::new("{external_collection_name}") + .vector_name("{external_vector_name}"), + ), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.grpc.Points.LookupLocation; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.RecommendInput; + +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.recommend; + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(recommend(RecommendInput.newBuilder() + .addAllPositive(List.of(vectorInput(100), vectorInput(231))) + .addAllNegative(List.of(vectorInput(718))) + .build())) + .setUsing("image") + .setLimit(10) + .setLookupFrom( + LookupLocation.newBuilder() + .setCollectionName("{external_collection_name}") + .setVectorName("{external_vector_name}") + .build()) + .build()).get(); +``` + +Vectors are retrieved from the external collection by ids provided in the `positive` and `negative` lists. +These vectors then used to perform the recommendation in the current collection, comparing against the "using" or default vector. + + +## Batch recommendation API + +*Available as of v0.10.0* + +Similar to the batch search API in terms of usage and advantages, it enables the batching of recommendation requests. + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +filter_ = models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue( + value="London", + ), + ) + ] +) + +recommend_queries = [ + models.QueryRequest( + query=models.RecommendQuery( + recommend=models.RecommendInput(positive=[100, 231], negative=[718]) + ), + filter=filter_, + limit=3, + ), + models.QueryRequest( + query=models.RecommendQuery( + recommend=models.RecommendInput(positive=[200, 67], negative=[300]) + ), + filter=filter_, + limit=3, + ), +] + +client.query_batch_points( + collection_name="{collection_name}", requests=recommend_queries +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +var filter = MatchKeyword("city", "london"); + +await client.QueryBatchAsync( + collectionName: "{collection_name}", + queries: + [ + new QueryPoints() + { + CollectionName = "{collection_name}", + Query = new RecommendInput { + Positive = { 100, 231 }, + Negative = { 718 }, + }, + Limit = 3, + Filter = filter, + }, + new QueryPoints() + { + CollectionName = "{collection_name}", + Query = new RecommendInput { + Positive = { 200, 67 }, + Negative = { 300 }, + }, + Limit = 3, + Filter = filter, + } + ] +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +filter := qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, +} +client.QueryBatch(context.Background(), &qdrant.QueryBatchPoints{ + CollectionName: "{collection_name}", + QueryPoints: []*qdrant.QueryPoints{ + { + CollectionName: "{collection_name}", + Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ + Positive: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + qdrant.NewVectorInputID(qdrant.NewIDNum(231)), + }, + Negative: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + }, 
+ ), + Filter: &filter, + }, + { + CollectionName: "{collection_name}", + Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ + Positive: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(200)), + qdrant.NewVectorInputID(qdrant.NewIDNum(67)), + }, + Negative: []*qdrant.VectorInput{ + qdrant.NewVectorInputID(qdrant.NewIDNum(300)), + }, + }, + ), + Filter: &filter, + }, + }, +}, +) +``` + +```http +POST /collections/{collection_name}/query/batch +{ + "searches": [ + { + "query": { + "recommend": { + "positive": [100, 231], + "negative": [718] + } + }, + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "limit": 10 + }, + { + "query": { + "recommend": { + "positive": [200, 67], + "negative": [300] + } + }, + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "limit": 10 + } + ] +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +const filter = { + must: [ + { + key: "city", + match: { + value: "London", + }, + }, + ], +}; + +const searches = [ + { + query: { + recommend: { + positive: [100, 231], + negative: [718] + } + }, + filter, + limit: 3, + }, + { + query: { + recommend: { + positive: [200, 67], + negative: [300] + } + }, + filter, + limit: 3, + }, +]; + +client.queryBatch("{collection_name}", { + searches, +}); +``` + +```rust +use qdrant_client::qdrant::{ + Condition, Filter, QueryBatchPointsBuilder, QueryPointsBuilder, + RecommendInputBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let filter = Filter::must([Condition::matches("city", "London".to_string())]); + +let recommend_queries = vec![ + QueryPointsBuilder::new("{collection_name}") + .query( + RecommendInputBuilder::default() + .add_positive(100) + .add_positive(231) + .add_negative(718) + .build(), + ) + .filter(filter.clone()) + .build(), + QueryPointsBuilder::new("{collection_name}") + .query( + RecommendInputBuilder::default() + .add_positive(200) + .add_positive(67) + .add_negative(300) + .build(), + ) + .filter(filter) + .build(), +]; + +client + .query_batch(QueryBatchPointsBuilder::new( + "{collection_name}", + recommend_queries, + )) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.RecommendInput; + +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.recommend; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +Filter filter = Filter.newBuilder().addMust(matchKeyword("city", "London")).build(); + +List recommendQueries = List.of( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(recommend( + RecommendInput.newBuilder() + .addAllPositive(List.of(vectorInput(100), vectorInput(231))) + .addAllNegative(List.of(vectorInput(731))) + .build())) + .setFilter(filter) + .setLimit(3) + .build(), + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(recommend( + RecommendInput.newBuilder() + .addAllPositive(List.of(vectorInput(200), vectorInput(67))) + .addAllNegative(List.of(vectorInput(300))) + .build())) + 
.setFilter(filter)
+        .setLimit(3)
+        .build());
+
+client.queryBatchAsync("{collection_name}", recommendQueries).get();
+```
+
+The result of this API contains one array per recommendation request.
+
+```json
+{
+  "result": [
+    [
+      { "id": 10, "score": 0.81 },
+      { "id": 14, "score": 0.75 },
+      { "id": 11, "score": 0.73 }
+    ],
+    [
+      { "id": 1, "score": 0.92 },
+      { "id": 3, "score": 0.89 },
+      { "id": 9, "score": 0.75 }
+    ]
+  ],
+  "status": "ok",
+  "time": 0.001
+}
+```
+
+## Discovery API
+
+*Available as of v1.7*
+
+REST API Schema definition is available [here](https://api.qdrant.tech/api-reference/search/discover-points)
+
+In this API, Qdrant introduces the concept of `context`, which is used for splitting the space. Context is a set of positive-negative pairs, and each pair divides the space into positive and negative zones. In that mode, the search operation prefers points based on how many positive zones they belong to (or how much they avoid negative zones).
+
+The interface for providing context is similar to the recommendation API (ids or raw vectors). Still, in this case, they need to be provided in the form of positive-negative pairs.
+
+Discovery API lets you do two new types of search:
+- **Discovery search**: Uses the context (the pairs of positive-negative vectors) and a target to return the points more similar to the target, but constrained by the context.
+- **Context search**: Uses only the context pairs to get the points that live in the best zone, where the loss is minimized.
+
+The way positive and negative examples should be arranged in the context pairs is completely up to you. So you have the flexibility of trying out different permutation techniques based on your model and data.
+
+### Discovery search
+
+This type of search works especially well for combining multimodal, vector-constrained searches. Qdrant already has extensive support for filters, which constrain the search based on its payload, but using discovery search, you can also constrain the vector space in which the search is performed.
+
+![Discovery search](/docs/discovery-search.png)
+
+The formula for the discovery score can be expressed as:
+
+$$
+\text{rank}(v^+, v^-) = \begin{cases}
+    1, &\quad s(v^+) \geq s(v^-) \\\\
+    -1, &\quad s(v^+) < s(v^-)
+\end{cases}
+$$
+where $v^+$ represents a positive example, $v^-$ represents a negative example, and $s(v)$ is the similarity score of a vector $v$ to the target vector. The discovery score is then computed as:
+$$
+\text{discovery score} = \text{sigmoid}(s(v_t)) + \sum \text{rank}(v_i^+, v_i^-),
+$$
+where $s(v)$ is the similarity function, $v_t$ is the target vector, and again $v_i^+$ and $v_i^-$ are the positive and negative examples, respectively. The sigmoid function is used to normalize the score between 0 and 1, and the sum of ranks is used to penalize vectors that are closer to the negative examples than to the positive ones. In other words, the sum of individual ranks determines how many positive zones a point is in, while the closeness hierarchy comes second.
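+
+To make the formula concrete, here is a minimal, illustrative sketch (plain Python with NumPy, not part of any Qdrant client) that computes the discovery score for a single candidate. It assumes cosine similarity as $s(v)$ and reuses the sigmoid shown in the best score strategy above; all vectors are made-up examples:
+
+```python
+import numpy as np
+
+
+def sigmoid(x: float) -> float:
+    # Same normalization as in the best_score snippet: maps any score into (0, 1)
+    return 0.5 * (1.0 + x / (1.0 + abs(x)))
+
+
+def cosine(a: np.ndarray, b: np.ndarray) -> float:
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+
+def discovery_score(candidate, target, context_pairs) -> float:
+    # rank(v+, v-) is +1 when the candidate is at least as close to the positive
+    # example as to the negative one, and -1 otherwise
+    rank_sum = sum(
+        1 if cosine(candidate, pos) >= cosine(candidate, neg) else -1
+        for pos, neg in context_pairs
+    )
+    # The integer sum of ranks dominates; the sigmoid of the target similarity
+    # only orders candidates that fall into the same number of positive zones
+    return sigmoid(cosine(candidate, target)) + rank_sum
+
+
+candidate = np.array([0.2, 0.1, 0.9, 0.7])
+target = np.array([0.3, 0.2, 0.8, 0.6])
+pairs = [
+    (np.array([0.25, 0.15, 0.85, 0.65]), np.array([0.9, 0.8, 0.1, 0.2])),
+    (np.array([0.1, 0.1, 0.9, 0.9]), np.array([0.7, 0.7, 0.2, 0.1])),
+]
+print(discovery_score(candidate, target, pairs))
+```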
+ +Example: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +discover_queries = [ + models.QueryRequest( + query=models.DiscoverQuery( + discover=models.DiscoverInput( + target=[0.2, 0.1, 0.9, 0.7], + context=[ + models.ContextPair( + positive=100, + negative=718, + ), + models.ContextPair( + positive=200, + negative=300, + ), + ], + ) + ), + limit=10, + ), +] + +client.query_batch_points( + collection_name="{collection_name}", requests=discover_queries +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new DiscoverInput { + Target = new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + Context = new ContextInput { + Pairs = { + new ContextInputPair { + Positive = 100, + Negative = 718 + }, + new ContextInputPair { + Positive = 200, + Negative = 300 + }, + } + }, + }, + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryDiscover(&qdrant.DiscoverInput{ + Target: qdrant.NewVectorInput(0.2, 0.1, 0.9, 0.7), + Context: &qdrant.ContextInput{ + Pairs: []*qdrant.ContextInputPair{ + { + Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + { + Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(200)), + Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(300)), + }, + }, + }, + }), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "discover": { + "target": [0.2, 0.1, 0.9, 0.7], + "context": [ + { + "positive": 100, + "negative": 718 + }, + { + "positive": 200, + "negative": 300 + } + ] + } + }, + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: { + discover: { + target: [0.2, 0.1, 0.9, 0.7], + context: [ + { + positive: 100, + negative: 718, + }, + { + positive: 200, + negative: 300, + }, + ], + } + }, + limit: 10, +}); +``` + +```rust +use qdrant_client::qdrant::{ContextInputBuilder, DiscoverInputBuilder, QueryPointsBuilder}; +use qdrant_client::Qdrant; + +client + .query( + QueryPointsBuilder::new("{collection_name}").query( + DiscoverInputBuilder::new( + vec![0.2, 0.1, 0.9, 0.7], + ContextInputBuilder::default() + .add_pair(100, 718) + .add_pair(200, 300), + ) + .build(), + ), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.ContextInput; +import io.qdrant.client.grpc.Points.ContextInputPair; +import io.qdrant.client.grpc.Points.DiscoverInput; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.discover; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(discover(DiscoverInput.newBuilder() + .setTarget(vectorInput(0.2f, 0.1f, 0.9f, 0.7f)) + .setContext(ContextInput.newBuilder() + 
.addAllPairs(List.of( + ContextInputPair.newBuilder() + .setPositive(vectorInput(100)) + .setNegative(vectorInput(718)) + .build(), + ContextInputPair.newBuilder() + .setPositive(vectorInput(200)) + .setNegative(vectorInput(300)) + .build())) + .build()) + .build())) + .setLimit(10) + .build()).get(); +``` + + + +### Context search + +Conversely, in the absence of a target, a rigid integer-by-integer function doesn't provide much guidance for the search when utilizing a proximity graph like HNSW. Instead, context search employs a function derived from the [triplet-loss](/articles/triplet-loss/) concept, which is usually applied during model training. For context search, this function is adapted to steer the search towards areas with fewer negative examples. + +![Context search](/docs/context-search.png) + +We can directly associate the score function to a loss function, where 0.0 is the maximum score a point can have, which means it is only in positive areas. As soon as a point exists closer to a negative example, its loss will simply be the difference of the positive and negative similarities. + +$$ +\text{context score} = \sum \min(s(v^+_i) - s(v^-_i), 0.0) +$$ + +Where $v^+_i$ and $v^-_i$ are the positive and negative examples of each pair, and $s(v)$ is the similarity function. + +Using this kind of search, you can expect the output to not necessarily be around a single point, but rather, to be any point that isn’t closer to a negative example, which creates a constrained diverse result. So, even when the API is not called [`recommend`](#recommendation-api), recommendation systems can also use this approach and adapt it for their specific use-cases. + +Example: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +discover_queries = [ + models.QueryRequest( + query=models.ContextQuery( + context=[ + models.ContextPair( + positive=100, + negative=718, + ), + models.ContextPair( + positive=200, + negative=300, + ), + ], + ), + limit=10, + ), +] + +client.query_batch_points( + collection_name="{collection_name}", requests=discover_queries +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: new ContextInput { + Pairs = { + new ContextInputPair { + Positive = 100, + Negative = 718 + }, + new ContextInputPair { + Positive = 200, + Negative = 300 + }, + } + }, + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryContext(&qdrant.ContextInput{ + Pairs: []*qdrant.ContextInputPair{ + { + Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(100)), + Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(718)), + }, + { + Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(200)), + Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(300)), + }, + }, + }), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "context": [ + { + "positive": 100, + "negative": 718 + }, + { + "positive": 200, + "negative": 300 + } + ] + }, + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { 
+ query: { + context: [ + { + positive: 100, + negative: 718, + }, + { + positive: 200, + negative: 300, + }, + ] + }, + limit: 10, +}); +``` + +```rust +use qdrant_client::qdrant::{ContextInputBuilder, QueryPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .query( + QueryPointsBuilder::new("{collection_name}").query( + ContextInputBuilder::default() + .add_pair(100, 718) + .add_pair(200, 300) + .build(), + ), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.ContextInput; +import io.qdrant.client.grpc.Points.ContextInputPair; +import io.qdrant.client.grpc.Points.QueryPoints; + +import static io.qdrant.client.VectorInputFactory.vectorInput; +import static io.qdrant.client.QueryFactory.context; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(context(ContextInput.newBuilder() + .addAllPairs(List.of( + ContextInputPair.newBuilder() + .setPositive(vectorInput(100)) + .setNegative(vectorInput(718)) + .build(), + ContextInputPair.newBuilder() + .setPositive(vectorInput(200)) + .setNegative(vectorInput(300)) + .build())) + .build())) + .setLimit(10) + .build()).get(); +``` + + + +## Distance Matrix + +*Available as of v1.12.0* + +The distance matrix API allows to calculate the distance between sampled pairs of vectors and to return the result as a sparse matrix. + +Such API enables new data exploration use cases such as clustering similar vectors, visualization of connections or dimension reduction. + +The API input request consists of the following parameters: +- `sample`: the number of vectors to sample +- `limit`: the number of scores to return per sample +- `filter`: the filter to apply to constraint the samples + +Let's have a look at a basic example with `sample=100`, `limit=10`: + +The engine starts by selecting `100` random points from the collection, then for each of the selected points, it will compute the top `10` closest points **within** the samples. + +This will results in a total of 1000 scores represented as a sparse matrix for efficient processing. + +The distance matrix API offers two output formats to ease the integration with different tools. + +### Pairwise format + +Returns the distance matrix as a list of pairs of point `ids` with their respective score. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.search_matrix_pairs( + collection_name="{collection_name}", + sample=10, + limit=2, + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="color", match=models.MatchValue(value="red") + ), + ] + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.SearchMatrixPairsAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("color", "red"), + sample: 10, + limit: 2 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +sample := uint64(10) +limit := uint64(2) +res, err := client.SearchMatrixPairs(ctx, &qdrant.SearchMatrixPoints{ + CollectionName: "{collection_name}", + Sample: &sample, + Limit: &limit, + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/search/matrix/pairs +{ + "sample": 10, + "limit": 2, + "filter": { + "must": { + "key": "color", + "match": { "value": "red" } + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.searchMatrixPairs("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, + sample: 10, + limit: 2, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, SearchMatrixPointsBuilder}; +use qdrant_client::Qdrant; + +client + .search_matrix_pairs( + SearchMatrixPointsBuilder::new("collection_name") + .filter(Filter::must(vec![Condition::matches( + "color", + "red".to_string(), + )])) + .sample(10) + .limit(2), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.SearchMatrixPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .searchMatrixPairsAsync( + Points.SearchMatrixPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) + .setSample(10) + .setLimit(2) + .build()) + .get(); +``` + +Returns + +```json +{ + "result": { + "pairs": [ + {"a": 1, "b": 3, "score": 1.4063001}, + {"a": 1, "b": 4, "score": 1.2531}, + {"a": 2, "b": 1, "score": 1.1550001}, + {"a": 2, "b": 8, "score": 1.1359}, + {"a": 3, "b": 1, "score": 1.4063001}, + {"a": 3, "b": 4, "score": 1.2218001}, + {"a": 4, "b": 1, "score": 1.2531}, + {"a": 4, "b": 3, "score": 1.2218001}, + {"a": 5, "b": 3, "score": 0.70239997}, + {"a": 5, "b": 1, "score": 0.6146}, + {"a": 6, "b": 3, "score": 0.6353}, + {"a": 6, "b": 4, "score": 0.5093}, + {"a": 7, "b": 3, "score": 1.0990001}, + {"a": 7, "b": 1, "score": 1.0349001}, + {"a": 8, "b": 2, "score": 1.1359}, + {"a": 8, "b": 3, "score": 1.0553} + ] + } +} +``` + +### Offset format + +Returns the distance matrix as a four arrays: +- `offsets_row` and `offsets_col`, represent the positions of non-zero distance values in the matrix. +- `scores` contains the distance values. 
+- `ids` contains the point ids corresponding to the distance values. + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.search_matrix_offsets( + collection_name="{collection_name}", + sample=10, + limit=2, + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="color", match=models.MatchValue(value="red") + ), + ] + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.SearchMatrixOffsetsAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("color", "red"), + sample: 10, + limit: 2 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +sample := uint64(10) +limit := uint64(2) +res, err := client.SearchMatrixOffsets(ctx, &qdrant.SearchMatrixPoints{ + CollectionName: "{collection_name}", + Sample: &sample, + Limit: &limit, + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/search/matrix/offsets +{ + "sample": 10, + "limit": 2, + "filter": { + "must": { + "key": "color", + "match": { "value": "red" } + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.searchMatrixOffsets("{collection_name}", { + filter: { + must: [ + { + key: "color", + match: { + value: "red", + }, + }, + ], + }, + sample: 10, + limit: 2, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, SearchMatrixPointsBuilder}; +use qdrant_client::Qdrant; + +client + .search_matrix_offsets( + SearchMatrixPointsBuilder::new("collection_name") + .filter(Filter::must(vec![Condition::matches( + "color", + "red".to_string(), + )])) + .sample(10) + .limit(2), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.SearchMatrixPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .searchMatrixOffsetsAsync( + SearchMatrixPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) + .setSample(10) + .setLimit(2) + .build()) + .get(); +``` + +Returns + +```json +{ + "result": { + "offsets_row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7], + "offsets_col": [2, 3, 0, 7, 0, 3, 0, 2, 2, 0, 2, 3, 2, 0, 1, 2], + "scores": [ + 1.4063001, 1.2531, 1.1550001, 1.1359, 1.4063001, + 1.2218001, 1.2531, 1.2218001, 0.70239997, 0.6146, 0.6353, + 0.5093, 1.0990001, 1.0349001, 1.1359, 1.0553 + ], + "ids": [1, 2, 3, 4, 5, 6, 7, 8] + } +} +``` + +<|page-8-lllmstxt|> +# Hybrid and Multi-Stage Queries + +_Available as of v1.10.0_ + +With the introduction of [many named vectors per point](/documentation/concepts/vectors/#named-vectors), there are use-cases when the best search is obtained by combining multiple queries, +or by performing the search in more than one stage. 
+ +Qdrant has a flexible and universal interface to make this possible, called `Query API` ([API reference](https://api.qdrant.tech/api-reference/search/query-points)). + +The main component for making the combinations of queries possible is the `prefetch` parameter, which enables making sub-requests. + +Specifically, whenever a query has at least one prefetch, Qdrant will: + +1. Perform the prefetch query (or queries), +2. Apply the main query over the results of its prefetch(es). + +Additionally, prefetches can have prefetches themselves, so you can have nested prefetches. + + + +## Hybrid Search + +One of the most common problems when you have different representations of the same data is to combine the queried points for each representation into a single result. + +{{< figure src="/docs/fusion-idea.png" caption="Fusing results from multiple queries" width="80%" >}} + +For example, in text search, it is often useful to combine dense and sparse vectors get the best of semantics, +plus the best of matching specific words. + +Qdrant currently has two ways of combining the results from different queries: + +- `rrf` - + + Reciprocal Rank Fusion + + + Considers the positions of results within each query, and boosts the ones that appear closer to the top in multiple of them. + +- `dbsf` - + + Distribution-Based Score Fusion + _(available as of v1.11.0)_ + + Normalizes the scores of the points in each query, using the mean +/- the 3rd standard deviation as limits, and then sums the scores of the same point across different queries. + + + +Here is an example of Reciprocal Rank Fusion for a query containing two prefetches against different named vectors configured to respectively hold sparse and dense vectors. + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + prefetch=[ + models.Prefetch( + query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]), + using="sparse", + limit=20, + ), + models.Prefetch( + query=[0.01, 0.45, 0.67], # <-- dense vector + using="dense", + limit=20, + ), + ], + query=models.FusionQuery(fusion=models.Fusion.RRF), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List < PrefetchQuery > { + new() { + Query = new(float, uint)[] { + (0.22f, 1), (0.8f, 42), + }, + Using = "sparse", + Limit = 20 + }, + new() { + Query = new float[] { + 0.01f, 0.45f, 0.67f + }, + Using = "dense", + Limit = 20 + } + }, + query: Fusion.Rrf +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQuerySparse([]uint32{1, 42}, []float32{0.22, 0.8}), + Using: qdrant.PtrOf("sparse"), + }, + { + Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), + Using: qdrant.PtrOf("dense"), + }, + }, + Query: qdrant.NewQueryFusion(qdrant.Fusion_RRF), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": [ + { + "query": { + "indices": [1, 42], // <┐ + "values": [0.22, 0.8] // <┮─sparse vector + }, + "using": "sparse", + "limit": 20 + }, + { + "query": [0.01, 0.45, 0.67, ...], // <-- dense vector + "using": "dense", + 
"limit": 20 + } + ], + "query": { "fusion": "rrf" }, // <--- reciprocal rank fusion + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + prefetch: [ + { + query: { + values: [0.22, 0.8], + indices: [1, 42], + }, + using: 'sparse', + limit: 20, + }, + { + query: [0.01, 0.45, 0.67], + using: 'dense', + limit: 20, + }, + ], + query: { + fusion: 'rrf', + }, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Fusion, PrefetchQueryBuilder, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest([(1, 0.22), (42, 0.8)].as_slice())) + .using("sparse") + .limit(20u64) + ) + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .using("dense") + .limit(20u64) + ) + .query(Query::new_fusion(Fusion::Rrf)) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import java.util.List; + +import static io.qdrant.client.QueryFactory.fusion; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Fusion; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.22f, 0.8f), List.of(1, 42))) + .setUsing("sparse") + .setLimit(20) + .build()) + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.01f, 0.45f, 0.67f))) + .setUsing("dense") + .setLimit(20) + .build()) + .setQuery(fusion(Fusion.RRF)) + .build()) + .get(); +``` + +## Multi-stage queries + +In many cases, the usage of a larger vector representation gives more accurate search results, but it is also more expensive to compute. + +Splitting the search into two stages is a known technique: + +- First, use a smaller and cheaper representation to get a large list of candidates. +- Then, re-score the candidates using the larger and more accurate representation. + +There are a few ways to build search architectures around this idea: + +- The quantized vectors as a first stage, and the full-precision vectors as a second stage. +- Leverage Matryoshka Representation Learning (MRL) to generate candidate vectors with a shorter vector, and then refine them with a longer one. +- Use regular dense vectors to pre-fetch the candidates, and then re-score them with a multi-vector model like ColBERT. + +To get the best of all worlds, Qdrant has a convenient interface to perform the queries in stages, +such that the coarse results are fetched first, and then they are refined later with larger vectors. + +### Re-scoring examples + +Fetch 1000 results using a shorter MRL byte vector, then re-score them using the full vector and get the top 10. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + query=[1, 23, 45, 67], # <------------- small byte vector + using="mrl_byte", + limit=1000, + ), + query=[0.01, 0.299, 0.45, 0.67], # <-- full vector + using="full", + limit=10, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List { + new() { + Query = new float[] { 1,23, 45, 67 }, // <------------- small byte vector + Using = "mrl_byte", + Limit = 1000 + } + }, + query: new float[] { 0.01f, 0.299f, 0.45f, 0.67f }, // <-- full vector + usingVector: "full", + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQueryDense([]float32{1, 23, 45, 67}), + Using: qdrant.PtrOf("mrl_byte"), + Limit: qdrant.PtrOf(uint64(1000)), + }, + }, + Query: qdrant.NewQueryDense([]float32{0.01, 0.299, 0.45, 0.67}), + Using: qdrant.PtrOf("full"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [1, 23, 45, 67], // <------------- small byte vector + "using": "mrl_byte" + "limit": 1000 + }, + "query": [0.01, 0.299, 0.45, 0.67, ...], // <-- full vector + "using": "full", + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + prefetch: { + query: [1, 23, 45, 67], // <------------- small byte vector + using: 'mrl_byte', + limit: 1000, + }, + query: [0.01, 0.299, 0.45, 0.67], // <-- full vector, + using: 'full', + limit: 10, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![1.0, 23.0, 45.0, 67.0])) + .using("mlr_byte") + .limit(1000u64) + ) + .query(Query::new_nearest(vec![0.01, 0.299, 0.45, 0.67])) + .using("full") + .limit(10u64) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(1, 23, 45, 67)) // <------------- small byte vector + .setLimit(1000) + .setUsing("mrl_byte") + .build()) + .setQuery(nearest(0.01f, 0.299f, 0.45f, 0.67f)) // <-- full vector + .setUsing("full") + .setLimit(10) + .build()) + .get(); +``` + +Fetch 100 results using the default vector, then re-score them using a multi-vector to get the top 10. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + query=[0.01, 0.45, 0.67, 0.53], # <-- dense vector + limit=100, + ), + query=[ + [0.1, 0.2, 0.32], # <─┐ + [0.2, 0.1, 0.52], # < ├─ multi-vector + [0.8, 0.9, 0.93], # < ┘ + ], + using="colbert", + limit=10, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List { + new() { + Query = new float[] { 0.01f, 0.45f, 0.67f }, // <-- dense vector**** + Limit = 100 + } + }, + query: new float[][] { + [0.1f, 0.2f], // <─┐ + [0.2f, 0.1f], // < ├─ multi-vector + [0.8f, 0.9f] // < ┘ + }, + usingVector: "colbert", + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), + Limit: qdrant.PtrOf(uint64(100)), + }, + }, + Query: qdrant.NewQueryMulti([][]float32{ + {0.1, 0.2}, + {0.2, 0.1}, + {0.8, 0.9}, + }), + Using: qdrant.PtrOf("colbert"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [0.01, 0.45, 0.67, ...], // <-- dense vector + "limit": 100 + }, + "query": [ // <─┐ + [0.1, 0.2, ...], // < │ + [0.2, 0.1, ...], // < ├─ multi-vector + [0.8, 0.9, ...] // < │ + ], // <─┘ + "using": "colbert", + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + prefetch: { + query: [1, 23, 45, 67], // <------------- small byte vector + limit: 100, + }, + query: [ + [0.1, 0.2], // <─┐ + [0.2, 0.1], // < ├─ multi-vector + [0.8, 0.9], // < ┘ + ], + using: 'colbert', + limit: 10, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .limit(100u64) + ) + .query(Query::new_nearest(vec![ + vec![0.1, 0.2], + vec![0.2, 0.1], + vec![0.8, 0.9], + ])) + .using("colbert") + .limit(10u64) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; + + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(0.01f, 0.45f, 0.67f)) // <-- dense vector + .setLimit(100) + .build()) + .setQuery( + nearest( + new float[][] { + {0.1f, 0.2f}, // <─┐ + {0.2f, 0.1f}, // < ├─ multi-vector + {0.8f, 0.9f} // < ┘ + })) + .setUsing("colbert") + .setLimit(10) + .build()) + .get(); +``` + +It is possible to combine all the above techniques in a single query: + 
+```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + prefetch=models.Prefetch( + query=[1, 23, 45, 67], # <------ small byte vector + using="mrl_byte", + limit=1000, + ), + query=[0.01, 0.45, 0.67], # <-- full dense vector + using="full", + limit=100, + ), + query=[ + [0.17, 0.23, 0.52], # <─┐ + [0.22, 0.11, 0.63], # < ├─ multi-vector + [0.86, 0.93, 0.12], # < ┘ + ], + using="colbert", + limit=10, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List { + new() { + Prefetch = { + new List { + new() { + Query = new float[] { 1, 23, 45, 67 }, // <------------- small byte vector + Using = "mrl_byte", + Limit = 1000 + }, + } + }, + Query = new float[] {0.01f, 0.45f, 0.67f}, // <-- dense vector + Using = "full", + Limit = 100 + } + }, + query: new float[][] { + [0.1f, 0.2f], // <─┐ + [0.2f, 0.1f], // < ├─ multi-vector + [0.8f, 0.9f] // < ┘ + }, + usingVector: "colbert", + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQueryDense([]float32{1, 23, 45, 67}), + Using: qdrant.PtrOf("mrl_byte"), + Limit: qdrant.PtrOf(uint64(1000)), + }, + }, + Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), + Limit: qdrant.PtrOf(uint64(100)), + Using: qdrant.PtrOf("full"), + }, + }, + Query: qdrant.NewQueryMulti([][]float32{ + {0.1, 0.2}, + {0.2, 0.1}, + {0.8, 0.9}, + }), + Using: qdrant.PtrOf("colbert"), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "prefetch": { + "query": [1, 23, 45, 67], // <------ small byte vector + "using": "mrl_byte" + "limit": 1000 + }, + "query": [0.01, 0.45, 0.67, ...], // <-- full dense vector + "using": "full" + "limit": 100 + }, + "query": [ // <─┐ + [0.1, 0.2, ...], // < │ + [0.2, 0.1, ...], // < ├─ multi-vector + [0.8, 0.9, ...] 
// < │ + ], // <─┘ + "using": "colbert", + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + prefetch: { + prefetch: { + query: [1, 23, 45, 67], // <------------- small byte vector + using: 'mrl_byte', + limit: 1000, + }, + query: [0.01, 0.45, 0.67], // <-- full dense vector + using: 'full', + limit: 100, + }, + query: [ + [0.1, 0.2], // <─┐ + [0.2, 0.1], // < ├─ multi-vector + [0.8, 0.9], // < ┘ + ], + using: 'colbert', + limit: 10, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![1.0, 23.0, 45.0, 67.0])) + .using("mlr_byte") + .limit(1000u64) + ) + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .using("full") + .limit(100u64) + ) + .query(Query::new_nearest(vec![ + vec![0.1, 0.2], + vec![0.2, 0.1], + vec![0.8, 0.9], + ])) + .using("colbert") + .limit(10u64) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(1, 23, 45, 67)) // <------------- small byte vector + .setUsing("mrl_byte") + .setLimit(1000) + .build()) + .setQuery(nearest(0.01f, 0.45f, 0.67f)) // <-- dense vector + .setUsing("full") + .setLimit(100) + .build()) + .setQuery( + nearest( + new float[][] { + {0.1f, 0.2f}, // <─┐ + {0.2f, 0.1f}, // < ├─ multi-vector + {0.8f, 0.9f} // < ┘ + })) + .setUsing("colbert") + .setLimit(10) + .build()) + .get(); +``` + +### Maximal Marginal Relevance (MMR) + +_Available as of v1.15.0_ + +A useful algorithm to improve the diversity of the results is [Maximal Marginal Relevance (MMR)](https://www.cs.cmu.edu/~jgc/publication/The_Use_MMR_Diversity_Based_LTMIR_1998.pdf). It excels when the dataset has many redundant or very similar points for a query. + +MMR selects candidates iteratively, starting with the most relevant point (higher similarity to the query). For each next point, it selects the one that hasn't been chosen yet which has the best combination of relevance and higher separation to the already selected points. + +$$ +MMR = \arg \max_{D_i \in R\setminus S}[\lambda sim(D_i, Q) - (1 - \lambda)\max_{D_j \in S}sim(D_i, D_j)] +$$ + +
Where $R$ is the candidate set, $S$ is the set of already selected points, $Q$ is the query vector, $sim$ is the similarity function, and $\lambda = 1 - \text{diversity}$.
+ +
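+For intuition, the selection loop can be sketched in a few lines of plain Python. This is purely illustrative (NumPy only, a made-up helper name), not Qdrant's internal implementation; it assumes `candidates` holds unit-normalized vectors so the dot product plays the role of the similarity function.
+
+```python
+import numpy as np
+
+def mmr_select(query, candidates, diversity=0.5, top_k=10):
+    """Illustrative MMR selection over an (n, dim) array of candidate vectors."""
+    lam = 1.0 - diversity                      # lambda from the formula above
+    sim_to_query = candidates @ query          # sim(D_i, Q) for every candidate
+    selected, remaining = [], list(range(len(candidates)))
+
+    while remaining and len(selected) < top_k:
+        if not selected:
+            # Start with the most relevant point
+            best = max(remaining, key=lambda i: sim_to_query[i])
+        else:
+            chosen = candidates[selected]      # already selected points S
+            def mmr_score(i):
+                max_sim_to_selected = float(np.max(chosen @ candidates[i]))
+                return lam * sim_to_query[i] - (1.0 - lam) * max_sim_to_selected
+            best = max(remaining, key=mmr_score)
+        selected.append(best)
+        remaining.remove(best)
+    return selected
+```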
+ +This is implemented in Qdrant as a parameter of a nearest neighbors query. You define the vector to get the nearest candidates, and a `diversity` parameter which controls the balance between relevance (0.0) and diversity (1.0). + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=models.NearestQuery( + nearest=[0.01, 0.45, 0.67], # search vector + mmr=models.Mmr( + diversity=0.5, # 0.0 - relevance; 1.0 - diversity + candidates_limit=100, # num of candidates to preselect + ) + ), + limit=10, +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + query: ( + new float[] { 0.01f, 0.45f, 0.67f }, + new Mmr + { + Diversity = 0.5f, // 0.0 - relevance; 1.0 - diversity + CandidatesLimit = 100 // Number of candidates to preselect + } + ), + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQueryMMR( + qdrant.NewVectorInput(0.01, 0.45, 0.67), + &qdrant.Mmr{ + Diversity: qdrant.PtrOf(float32(0.5)), // 0.0 - relevance; 1.0 - diversity + CandidatesLimit: qdrant.PtrOf(uint32(100)), // num of candidates to preselect + }), + Limit: qdrant.PtrOf(uint64(10)), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "nearest": [0.01, 0.45, 0.67, ...], // search vector + "mmr": { + "diversity": 0.5, // 0.0 - relevance; 1.0 - diversity + "candidates_limit": 100 // num of candidates to preselect + } + }, + "limit": 10 +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.query("{collection_name}", { + query: { + nearest: [0.01, 0.45, 0.67, ...], // search vector + mmr: { + diversity: 0.5, // 0.0 - relevance; 1.0 - diversity + candidates_limit: 100 // num of candidates to preselect + } + }, + limit: 10, +}); +``` + +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client.query( + QueryPointsBuilder::new("{collection_name}") + .query(Query::new_nearest_with_mmr( + vec![0.01, 0.45, 0.67], // search vector + MmrBuilder::new() + .diversity(0.5) // 0.0 - relevance; 1.0 - diversity + .candidates_limit(100) // num of candidates to preselect + )) + .limit(10) +).await?; +``` + +```java +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.VectorInputFactory.vectorInput; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Mmr; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery( + nearest( + vectorInput(0.01f, 0.45f, 0.67f), // <-- search vector + Mmr.newBuilder() + .setDiversity(0.5f) // 0.0 - relevance; 1.0 - diversity + .setCandidatesLimit(100) // num of candidates to preselect + .build())) + .setLimit(10) + .build()) + .get(); 
+``` + +**Caveat:** Since MMR ranks one point at a time, the scores produced by MMR in Qdrant refer to the similarity to the query vector. This means that the response will not be ordered by score, but rather by the order of selection of MMR. + +## Score boosting + +_Available as of v1.14.0_ + +When introducing vector search to specific applications, sometimes business logic needs to be considered for ranking the final list of results. + +A quick example is [our own documentation search bar](https://github.com/qdrant/page-search). +It has vectors for every part of the documentation site. If one were to perform a search by "just" using the vectors, all kinds of elements would be equally considered good results. +However, when searching for documentation, we can establish a hierarchy of importance: + +`title > content > snippets` + +One way to solve this is to weight the results based on the kind of element. +For example, we can assign a higher weight to titles and content, and keep snippets unboosted. + +Pseudocode would be something like: + +`score = score + (is_title * 0.5) + (is_content * 0.25)` + +Query API can rescore points with custom formulas. They can be based on: + +- Dynamic payload values +- Conditions +- Scores of prefetches + +To express the formula, the syntax uses objects to identify each element. +Taking the documentation example, the request would look like this: + +```python +from qdrant_client import models + + +tag_boosted = client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + query=[0.2, 0.8, ...], # <-- dense vector + limit=50 + ), + query=models.FormulaQuery( + formula=models.SumExpression(sum=[ + "$score", + models.MultExpression(mult=[0.5, models.FieldCondition(key="tag", match=models.MatchAny(any=["h1", "h2", "h3", "h4"]))]), + models.MultExpression(mult=[0.25, models.FieldCondition(key="tag", match=models.MatchAny(any=["p", "li"]))]) + ] + )) +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: + [ + new PrefetchQuery { Query = new float[] { 0.01f, 0.45f, 0.67f }, Limit = 100 }, + ], + query: new Formula + { + Expression = new SumExpression + { + Sum = + { + "$score", + new MultExpression + { + Mult = { 0.5f, Match("tag", ["h1", "h2", "h3", "h4"]) }, + }, + new MultExpression { Mult = { 0.25f, Match("tag", ["p", "li"]) } }, + }, + }, + }, + limit: 10 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQuery(0.01, 0.45, 0.67), + }, + }, + Query: qdrant.NewQueryFormula(&qdrant.Formula{ + Expression: qdrant.NewExpressionSum(&qdrant.SumExpression{ + Sum: []*qdrant.Expression{ + qdrant.NewExpressionVariable("$score"), + qdrant.NewExpressionMult(&qdrant.MultExpression{ + Mult: []*qdrant.Expression{ + qdrant.NewExpressionConstant(0.5), + qdrant.NewExpressionCondition(qdrant.NewMatchKeywords("tag", "h1", "h2", "h3", "h4")), + }, + }), + qdrant.NewExpressionMult(&qdrant.MultExpression{ + Mult: []*qdrant.Expression{ + qdrant.NewExpressionConstant(0.25), + qdrant.NewExpressionCondition(qdrant.NewMatchKeywords("tag", "p", "li")), + }, + }), + }, + }), + }), +}) +``` + 
+```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [0.2, 0.8, ...], // <-- dense vector + "limit": 50 + } + "query": { + "formula": { + "sum": [ + "$score, + { + "mult": [ + 0.5, + { + "key": "tag", + "match": { "any": ["h1", "h2", "h3", "h4"] } } + ] + }, + { + "mult": [ + 0.25, + { + "key": "tag", + "match": { "any": ["p", "li"] } + } + ] + } + ] + } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +const tag_boosted = await client.query(collectionName, { + prefetch: { + query: [0.2, 0.8, 0.1, 0.9], + limit: 50 + }, + query: { + formula: { + sum: [ + "$score", + { + mult: [ 0.5, { key: "tag", match: { any: ["h1", "h2", "h3", "h4"] }} ] + }, + { + mult: [ 0.25, { key: "tag", match: { any: ["p", "li"] }} ] + } + ] + } + } +}); +``` + + +```rust +use qdrant_client::qdrant::{ + Condition, Expression, FormulaBuilder, PrefetchQueryBuilder, QueryPointsBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let _tag_boosted = client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(vec![0.01, 0.45, 0.67]) + .limit(100u64) + ) + .query(FormulaBuilder::new(Expression::sum_with([ + Expression::score(), + Expression::mult_with([ + Expression::constant(0.5), + Expression::condition(Condition::matches("tag", ["h1", "h2", "h3", "h4"])), + ]), + Expression::mult_with([ + Expression::constant(0.25), + Expression::condition(Condition::matches("tag", ["p", "li"])), + ]), + ]))) + .limit(10) + ).await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.matchKeywords; +import static io.qdrant.client.ExpressionFactory.condition; +import static io.qdrant.client.ExpressionFactory.constant; +import static io.qdrant.client.ExpressionFactory.mult; +import static io.qdrant.client.ExpressionFactory.sum; +import static io.qdrant.client.ExpressionFactory.variable; +import static io.qdrant.client.QueryFactory.formula; +import static io.qdrant.client.QueryFactory.nearest; + + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Formula; +import io.qdrant.client.grpc.Points.MultExpression; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SumExpression; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(0.01f, 0.45f, 0.67f)) + .setLimit(100) + .build()) + .setQuery( + formula( + Formula.newBuilder() + .setExpression( + sum( + SumExpression.newBuilder() + .addSum(variable("$score")) + .addSum( + mult( + MultExpression.newBuilder() + .addMult(constant(0.5f)) + .addMult( + condition( + matchKeywords( + "tag", + List.of("h1", "h2", "h3", "h4")))) + .build())) + .addSum(mult(MultExpression.newBuilder() + .addMult(constant(0.25f)) + .addMult( + condition( + matchKeywords( + "tag", + List.of("p", "li")))) + .build())) + .build())) + .build())) + .build()) + .get(); +``` + +There are multiple expressions available, check the [API docs for specific 
details](https://api.qdrant.tech/v-1-14-x/api-reference/search/query-points#request.body.query.Query%20Interface.Query.Formula%20Query.formula).
+
+- **constant** - A floating-point number, e.g. `0.5`.
+- `"$score"` - Reference to the score of the point in the prefetch. This is the same as `"$score[0]"`.
+- `"$score[0]"`, `"$score[1]"`, `"$score[2]"`, ... - When using multiple prefetches, you can reference a specific prefetch by its index within the array of prefetches.
+- **payload key** - Any plain string refers to a payload key. This uses the same JSON path syntax as the rest of the API, e.g. `key` or `key.subkey`. It will try to extract a number from the given key.
+- **condition** - A filtering condition. If the condition is met, it becomes `1.0`, otherwise `0.0`.
+- **mult** - Multiply an array of expressions.
+- **sum** - Sum an array of expressions.
+- **div** - Divide an expression by another expression.
+- **abs** - Absolute value of an expression.
+- **pow** - Raise an expression to the power of another expression.
+- **sqrt** - Square root of an expression.
+- **log10** - Base 10 logarithm of an expression.
+- **ln** - Natural logarithm of an expression.
+- **exp** - Exponential function of an expression (`e^x`).
+- **geo distance** - Haversine distance between two geographic points. Values need to be `{ "lat": 0.0, "lon": 0.0 }` objects.
+- **decay** - Apply a decay function to an expression, which clamps the output between 0 and 1. Available decay functions are **linear**, **exponential**, and **gaussian**. [See more](#boost-points-closer-to-user).
+- **datetime** - Parse a datetime string (see formats [here](/documentation/concepts/payload/#datetime)) and use it as a POSIX timestamp, in seconds.
+- **datetime key** - Specify that a payload key contains a datetime string to be parsed into POSIX seconds.
+
+It is possible to define a default for when a variable (either a payload key or a prefetch score) is not found. This is given as a mapping from variable to value.
+If the variable is missing and no default is defined, a value of `0.0` is used.
+
+### Boost points closer to user
+
+Another example: combine the score with how close the result is to the user.
+
+Assuming each point has an associated geo location, we can calculate the distance between the point and the location provided in the request.
+
+Given cosine scores in the prefetch, we can use a decay function to clamp the geographical distance between 0 and 1. Once clamped, we can sum the score and the distance together. Pseudocode:
+
+`score = score + gauss_decay(distance)`
+
+In this case, we use a **gauss_decay** function.
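+As a rough illustration (not Qdrant's internal code), the Gaussian decay follows the formula listed in the table further below: with the default `target = 0` and `midpoint = 0.5`, the output is `1.0` at zero distance, `0.5` at a distance equal to `scale`, and smoothly approaches `0` beyond that. The numbers in the usage lines are made-up example values.
+
+```python
+import math
+
+def gauss_decay(x, target=0.0, scale=1.0, midpoint=0.5):
+    """Gaussian decay as documented below: 1.0 at `target`,
+    `midpoint` when abs(x - target) equals `scale`."""
+    return math.exp((math.log(midpoint) / scale**2) * (x - target) ** 2)
+
+# Made-up example values: a cosine score from the prefetch and a 2.3 km distance
+score = 0.83
+distance_m = 2_300.0
+boosted = score + gauss_decay(distance_m, scale=5_000.0)  # scale of 5000 means 5 km
+```
+
+The full request, with the decay applied inside a formula query, looks like this: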
+ +```python +from qdrant_client import models + + +geo_boosted = client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + query=[0.2, 0.8, ...], # <-- dense vector + limit=50 + ), + query=models.FormulaQuery( + formula=models.SumExpression(sum=[ + "$score", + models.GaussDecayExpression( + gauss_decay=models.DecayParamsExpression( + x=models.GeoDistance( + geo_distance=models.GeoDistanceParams( + origin=models.GeoPoint( + lat=52.504043, + lon=13.393236 + ), # Berlin + to="geo.location" + ) + ), + scale=5000 # 5km + ) + ) + ]), + defaults={"geo.location": models.GeoPoint(lat=48.137154, lon=11.576124)} # Munich + ) +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Expression; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: + [ + new PrefetchQuery { Query = new float[] { 0.01f, 0.45f, 0.67f }, Limit = 100 }, + ], + query: new Formula + { + Expression = new SumExpression + { + Sum = + { + "$score", + FromExpDecay( + new() + { + X = new GeoDistance + { + Origin = new GeoPoint { Lat = 52.504043, Lon = 13.393236 }, + To = "geo.location", + }, + Scale = 5000, + } + ), + }, + }, + Defaults = + { + ["geo.location"] = new Dictionary + { + ["lat"] = 48.137154, + ["lon"] = 11.576124, + }, + }, + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Prefetch: []*qdrant.PrefetchQuery{ + { + Query: qdrant.NewQuery(0.2, 0.8), + }, + }, + Query: qdrant.NewQueryFormula(&qdrant.Formula{ + Expression: qdrant.NewExpressionSum(&qdrant.SumExpression{ + Sum: []*qdrant.Expression{ + qdrant.NewExpressionVariable("$score"), + qdrant.NewExpressionExpDecay(&qdrant.DecayParamsExpression{ + X: qdrant.NewExpressionGeoDistance(&qdrant.GeoDistance{ + Origin: &qdrant.GeoPoint{ + Lat: 52.504043, + Lon: 13.393236, + }, + To: "geo.location", + }), + }), + }, + }), + Defaults: qdrant.NewValueMap(map[string]any{ + "geo.location": map[string]any{ + "lat": 48.137154, + "lon": 11.576124, + }, + }), + }), +}) +``` + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { "query": [0.2, 0.8, ...], "limit": 50 }, + "query": { + "formula": { + "sum": [ + "$score", + { + "gauss_decay": { + "x": { + "geo_distance": { + "origin": { "lat": 52.504043, "lon": 13.393236 } + "to": "geo.location" + } + }, + "scale": 5000 // 5km + } + } + ] + }, + "defaults": { "geo.location": {"lat": 48.137154, "lon": 11.576124} } + } +} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +const distance_boosted = await client.query(collectionName, { + prefetch: { + query: [0.2, 0.8, ...], + limit: 50 + }, + query: { + formula: { + sum: [ + "$score", + { + gauss_decay: { + x: { + geo_distance: { + origin: { lat: 52.504043, lon: 13.393236 }, // Berlin + to: "geo.location" + } + }, + scale: 5000 // 5km + } + } + ] + }, + defaults: { "geo.location": { lat: 48.137154, lon: 11.576124 } } // Munich + } +}); + +``` +```rust +use qdrant_client::qdrant::{ + GeoPoint, DecayParamsExpressionBuilder, Expression, FormulaBuilder, PrefetchQueryBuilder, QueryPointsBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + 
+let _geo_boosted = client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch( + PrefetchQueryBuilder::default() + .query(vec![0.01, 0.45, 0.67]) + .limit(100u64), + ) + .query( + FormulaBuilder::new(Expression::sum_with([ + Expression::score(), + Expression::exp_decay( + DecayParamsExpressionBuilder::new(Expression::geo_distance_with( + // Berlin + GeoPoint { lat: 52.504043, lon: 13.393236 }, + "geo.location", + )) + .scale(5_000.0), + ), + ])) + // Munich + .add_default("geo.location", GeoPoint { lat: 48.137154, lon: 11.576124 }), + ) + .limit(10), + ) + .await?; +``` + +```java +import static io.qdrant.client.ExpressionFactory.expDecay; +import static io.qdrant.client.ExpressionFactory.geoDistance; +import static io.qdrant.client.ExpressionFactory.sum; +import static io.qdrant.client.ExpressionFactory.variable; +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.QueryFactory.formula; +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ValueFactory.value; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.DecayParamsExpression; +import io.qdrant.client.grpc.Points.Formula; +import io.qdrant.client.grpc.Points.GeoDistance; +import io.qdrant.client.grpc.Points.GeoPoint; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SumExpression; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(0.01f, 0.45f, 0.67f)) + .setLimit(100) + .build()) + .setQuery( + formula( + Formula.newBuilder() + .setExpression( + sum( + SumExpression.newBuilder() + .addSum(variable("$score")) + .addSum( + expDecay( + DecayParamsExpression.newBuilder() + .setX( + geoDistance( + GeoDistance.newBuilder() + .setOrigin( + GeoPoint.newBuilder() + .setLat(52.504043) + .setLon(13.393236) + .build()) + .setTo("geo.location") + .build())) + .setScale(5000) + .build())) + .build())) + .putDefaults( + "geo.location", + value( + Map.of( + "lat", value(48.137154), + "lon", value(11.576124)))) + .build())) + .build()) + .get(); +``` + +For all decay functions, there are these parameters available + +| Parameter | Default | Description | +| ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `x` | N/A | The value to decay | +| `target` | 0.0 | The value at which the decay will be at its peak. For distances it is usually set at 0.0, but can be set to any value. | +| `scale` | 1.0 | The value at which the decay function will be equal to `midpoint`. This is in terms of `x` units, for example, if `x` is in meters, `scale` of 5000 means 5km. Must be a non-zero positive number | +| `midpoint` | 0.5 | Output is `midpoint` when `x` equals `scale`. Must be in the range (0.0, 1.0), exclusive | + +The formulas for each decay function are as follows: + + + +
+ +| Decay Function | Color | Range | Formula | +|----------------|-------|-------|---------| +| **`lin_decay`** | green | `[0, 1]` | $\text{lin_decay}(x) = \max\left(0,\ -\frac{(1-m_{idpoint})}{s_{cale}}\cdot {abs}(x-t_{arget})+1\right)$ | +| **`exp_decay`** | red | `(0, 1]` | $\text{exp_decay}(x) = \exp\left(\frac{\ln(m_{idpoint})}{s_{cale}}\cdot {abs}(x-t_{arget})\right)$ | +| **`gauss_decay`** | purple | `(0, 1]` | $\text{gauss_decay}(x) = \exp\left(\frac{\ln(m_{idpoint})}{s_{cale}^{2}}\cdot (x-t_{arget})^{2}\right)$ | + +## Grouping + +_Available as of v1.11.0_ + +It is possible to group results by a certain field. This is useful when you have multiple points for the same item, and you want to avoid redundancy of the same item in the results. + +REST API ([Schema](https://api.qdrant.tech/master/api-reference/search/query-points-groups)): + +```python +client.query_points_groups( + collection_name="{collection_name}", + # Same as in the regular query_points() API + query=[1.1], + # Grouping parameters + group_by="document_id", # Path of the field to group by + limit=4, # Max amount of groups + group_size=2, # Max amount of points per group +) +``` + +```csharp +using Qdrant.Client; + +var client = new QdrantClient("localhost", 6334); + +await client.QueryGroupsAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + groupBy: "document_id", + limit: 4, + groupSize: 2 +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + GroupBy: "document_id", + GroupSize: qdrant.PtrOf(uint64(2)), +}) +``` + +```http +POST /collections/{collection_name}/points/query/groups +{ + // Same as in the regular query API + "query": [1.1], + // Grouping parameters + "group_by": "document_id", // Path of the field to group by + "limit": 4, // Max amount of groups + "group_size": 2 // Max amount of points per group +} +``` + +```typescript +client.queryGroups("{collection_name}", { + query: [1.1], + group_by: "document_id", + limit: 4, + group_size: 2, +}); +``` + +```rust +use qdrant_client::qdrant::QueryPointGroupsBuilder; + +client + .query_groups( + QueryPointGroupsBuilder::new("{collection_name}", "document_id") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .group_size(2u64) + .with_payload(true) + .with_vectors(true) + .limit(4u64), + ) + .await?; +``` + +```java +import java.util.List; + +import io.qdrant.client.grpc.Points.SearchPointGroups; + +client.queryGroupsAsync( + QueryPointGroups.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setGroupBy("document_id") + .setLimit(4) + .setGroupSize(2) + .build()) + .get(); +``` + +For more information on the `grouping` capabilities refer to the reference documentation for search with [grouping](/documentation/concepts/search/#search-groups) and [lookup](/documentation/concepts/search/#lookup-in-groups). + +<|page-9-lllmstxt|> +# Filtering + +With Qdrant, you can set conditions when searching or retrieving points. +For example, you can impose conditions on both the [payload](/documentation/concepts/payload/) and the `id` of the point. + +Setting additional conditions is important when it is impossible to express all the features of the object in the embedding. 
+Examples include a variety of business requirements: stock availability, user location, or desired price range. + +## Related Content +|[A Complete Guide to Filtering in Vector Search](/articles/vector-search-filtering/)|Developer advice on proper usage and advanced practices.| +|-|-| + +## Filtering clauses + +Qdrant allows you to combine conditions in clauses. +Clauses are different logical operations, such as `OR`, `AND`, and `NOT`. +Clauses can be recursively nested into each other so that you can reproduce an arbitrary boolean expression. + +Let's take a look at the clauses implemented in Qdrant. + +Suppose we have a set of points with the following payload: + +```json +[ + { "id": 1, "city": "London", "color": "green" }, + { "id": 2, "city": "London", "color": "red" }, + { "id": 3, "city": "London", "color": "blue" }, + { "id": 4, "city": "Berlin", "color": "red" }, + { "id": 5, "city": "Moscow", "color": "green" }, + { "id": 6, "city": "Moscow", "color": "blue" } +] +``` + +### Must + +Example: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue(value="London"), + ), + models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), + ), + ] + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +// & operator combines two conditions in an AND conjunction(must) +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("city", "London") & MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { "key": "city", "match": { "value": "London" } }, + { "key": "color", "match": { "value": "red" } } + ] + } + ... 
+} +``` + +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.scroll("{collection_name}", { + filter: { + must: [ + { + key: "city", + match: { value: "London" }, + }, + { + key: "color", + match: { value: "red" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([ + Condition::matches("city", "london".to_string()), + Condition::matches("color", "red".to_string()), + ])), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addAllMust( + List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) + .build()) + .build()) + .get(); +``` + +Filtered points would be: + +```json +[{ "id": 2, "city": "London", "color": "red" }] +``` + +When using `must`, the clause becomes `true` only if every condition listed inside `must` is satisfied. +In this sense, `must` is equivalent to the operator `AND`. + +### Should + +Example: + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + should=[ + models.FieldCondition( + key="city", + match=models.MatchValue(value="London"), + ), + models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), + ), + ] + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +// | operator combines two conditions in an OR disjunction(should) +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("city", "London") | MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Should: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "should": [ + { "key": "city", "match": { "value": "London" } }, + { "key": "color", "match": { "value": "red" } } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + should: [ + { + key: "city", + match: { value: "London" }, + }, + { + key: "color", + match: { value: "red" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([ + Condition::matches("city", "london".to_string()), + Condition::matches("color", "red".to_string()), + ])), + ) + .await?; +``` + 
+```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; +import java.util.List; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addAllShould( + List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) + .build()) + .build()) + .get(); +``` + +Filtered points would be: + +```json +[ + { "id": 1, "city": "London", "color": "green" }, + { "id": 2, "city": "London", "color": "red" }, + { "id": 3, "city": "London", "color": "blue" }, + { "id": 4, "city": "Berlin", "color": "red" } +] +``` + +When using `should`, the clause becomes `true` if at least one condition listed inside `should` is satisfied. +In this sense, `should` is equivalent to the operator `OR`. + +### Must Not + +Example: + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must_not=[ + models.FieldCondition(key="city", match=models.MatchValue(value="London")), + models.FieldCondition(key="color", match=models.MatchValue(value="red")), + ] + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +// The ! operator negates the condition(must not) +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: !(MatchKeyword("city", "London") & MatchKeyword("color", "red")) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + MustNot: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must_not": [ + { "key": "city", "match": { "value": "London" } }, + { "key": "color", "match": { "value": "red" } } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must_not: [ + { + key: "city", + match: { value: "London" }, + }, + { + key: "color", + match: { value: "red" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must_not([ + Condition::matches("city", "london".to_string()), + Condition::matches("color", "red".to_string()), + ])), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addAllMustNot( + List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) + .build()) + .build()) + .get(); +``` + +Filtered points would be: + +```json +[ + { "id": 5, "city": "Moscow", "color": "green" }, + { "id": 6, "city": "Moscow", "color": "blue" } +] +``` + +When using `must_not`, the clause becomes `true` if none of the conditions listed inside `must_not` is satisfied. 
+In this sense, `must_not` is equivalent to the expression `(NOT A) AND (NOT B) AND (NOT C)`. + +### Clauses combination + +It is also possible to use several clauses simultaneously: + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.FieldCondition(key="city", match=models.MatchValue(value="London")), + ], + must_not=[ + models.FieldCondition(key="color", match=models.MatchValue(value="red")), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("city", "London") & !MatchKeyword("color", "red") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, + MustNot: []*qdrant.Condition{ + qdrant.NewMatch("color", "red"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { "key": "city", "match": { "value": "London" } } + ], + "must_not": [ + { "key": "color", "match": { "value": "red" } } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + key: "city", + match: { value: "London" }, + }, + ], + must_not: [ + { + key: "color", + match: { value: "red" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter { + must: vec![Condition::matches("city", "London".to_string())], + must_not: vec![Condition::matches("color", "red".to_string())], + ..Default::default() + }), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust(matchKeyword("city", "London")) + .addMustNot(matchKeyword("color", "red")) + .build()) + .build()) + .get(); +``` + +Filtered points would be: + +```json +[ + { "id": 1, "city": "London", "color": "green" }, + { "id": 3, "city": "London", "color": "blue" } +] +``` + +In this case, the conditions are combined by `AND`. + +Also, the conditions could be recursively nested. 
Example: + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must_not=[ + models.Filter( + must=[ + models.FieldCondition( + key="city", match=models.MatchValue(value="London") + ), + models.FieldCondition( + key="color", match=models.MatchValue(value="red") + ), + ], + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: new Filter { MustNot = { MatchKeyword("city", "London") & MatchKeyword("color", "red") } } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + MustNot: []*qdrant.Condition{ + qdrant.NewFilterAsCondition(&qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + qdrant.NewMatch("color", "red"), + }, + }), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must_not": [ + { + "must": [ + { "key": "city", "match": { "value": "London" } }, + { "key": "color", "match": { "value": "red" } } + ] + } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must_not: [ + { + must: [ + { + key: "city", + match: { value: "London" }, + }, + { + key: "color", + match: { value: "red" }, + }, + ], + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must_not([Filter::must( + [ + Condition::matches("city", "London".to_string()), + Condition::matches("color", "red".to_string()), + ], + ) + .into()])), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.filter; +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMustNot( + filter( + Filter.newBuilder() + .addAllMust( + List.of( + matchKeyword("city", "London"), + matchKeyword("color", "red"))) + .build())) + .build()) + .build()) + .get(); +``` + +Filtered points would be: + +```json +[ + { "id": 1, "city": "London", "color": "green" }, + { "id": 3, "city": "London", "color": "blue" }, + { "id": 4, "city": "Berlin", "color": "red" }, + { "id": 5, "city": "Moscow", "color": "green" }, + { "id": 6, "city": "Moscow", "color": "blue" } +] +``` + +## Filtering conditions + +Different types of values in payload correspond to different kinds of queries that we can apply to them. +Let's look at the existing condition variants and what types of data they apply to. 
+ +### Match + +```python +models.FieldCondition( + key="color", + match=models.MatchValue(value="red"), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +MatchKeyword("color", "red"); +``` + +```json +{ + "key": "color", + "match": { + "value": "red" + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewMatch("color", "red") +``` + +```typescript +{ + key: 'color', + match: {value: 'red'} +} +``` + +```rust +Condition::matches("color", "red".to_string()) +``` + +```java +matchKeyword("color", "red"); +``` + +For the other types, the match condition will look exactly the same, except for the type used: + +```python +models.FieldCondition( + key="count", + match=models.MatchValue(value=0), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +Match("count", 0); +``` + +```json +{ + "key": "count", + "match": { + "value": 0 + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewMatchInt("count", 0) +``` + +```typescript +{ + key: 'count', + match: {value: 0} +} +``` + +```rust +Condition::matches("count", 0) +``` + +```java +import static io.qdrant.client.ConditionFactory.match; + +match("count", 0); +``` + +The simplest kind of condition is one that checks if the stored value equals the given one. +If several values are stored, at least one of them should match the condition. +You can apply it to [keyword](/documentation/concepts/payload/#keyword), [integer](/documentation/concepts/payload/#integer) and [bool](/documentation/concepts/payload/#bool) payloads. + +### Match Any + +*Available as of v1.1.0* + +In case you want to check if the stored value is one of multiple values, you can use the Match Any condition. +Match Any works as a logical OR for the given values. It can also be described as a `IN` operator. + +You can apply it to [keyword](/documentation/concepts/payload/#keyword) and [integer](/documentation/concepts/payload/#integer) payloads. + +Example: + +```python +models.FieldCondition( + key="color", + match=models.MatchAny(any=["black", "yellow"]), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +Match("color", ["black", "yellow"]); +``` + +```json +{ + "key": "color", + "match": { + "any": ["black", "yellow"] + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewMatchKeywords("color", "black", "yellow") +``` + +```typescript +{ + key: 'color', + match: {any: ['black', 'yellow']} +} +``` + +```rust +Condition::matches("color", vec!["black".to_string(), "yellow".to_string()]) +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeywords; + +matchKeywords("color", List.of("black", "yellow")); +``` + +In this example, the condition will be satisfied if the stored value is either `black` or `yellow`. + +If the stored value is an array, it should have at least one value matching any of the given values. E.g. if the stored value is `["black", "green"]`, the condition will be satisfied, because `"black"` is in `["black", "yellow"]`. + + +### Match Except + +*Available as of v1.2.0* + +In case you want to check if the stored value is not one of multiple values, you can use the Match Except condition. +Match Except works as a logical NOR for the given values. +It can also be described as a `NOT IN` operator. + +You can apply it to [keyword](/documentation/concepts/payload/#keyword) and [integer](/documentation/concepts/payload/#integer) payloads. 
+ +Example: + +```python +models.FieldCondition( + key="color", + match=models.MatchExcept(**{"except": ["black", "yellow"]}), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +Match("color", ["black", "yellow"]); +``` + +```json +{ + "key": "color", + "match": { + "except": ["black", "yellow"] + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewMatchExcept("color", "black", "yellow") +``` + +```typescript +{ + key: 'color', + match: {except: ['black', 'yellow']} +} +``` + +```rust +use qdrant_client::qdrant::r#match::MatchValue; + +Condition::matches( + "color", + !MatchValue::from(vec!["black".to_string(), "yellow".to_string()]), +) +``` + +```java +import static io.qdrant.client.ConditionFactory.matchExceptKeywords; + +matchExceptKeywords("color", List.of("black", "yellow")); +``` + +In this example, the condition will be satisfied if the stored value is neither `black` nor `yellow`. + +If the stored value is an array, it should have at least one value not matching any of the given values. E.g. if the stored value is `["black", "green"]`, the condition will be satisfied, because `"green"` does not match `"black"` nor `"yellow"`. + +### Nested key + +*Available as of v1.1.0* + +Payloads being arbitrary JSON object, it is likely that you will need to filter on a nested field. + +For convenience, we use a syntax similar to what can be found in the [Jq](https://stedolan.github.io/jq/manual/#Basicfilters) project. + +Suppose we have a set of points with the following payload: + +```json +[ + { + "id": 1, + "country": { + "name": "Germany", + "cities": [ + { + "name": "Berlin", + "population": 3.7, + "sightseeing": ["Brandenburg Gate", "Reichstag"] + }, + { + "name": "Munich", + "population": 1.5, + "sightseeing": ["Marienplatz", "Olympiapark"] + } + ] + } + }, + { + "id": 2, + "country": { + "name": "Japan", + "cities": [ + { + "name": "Tokyo", + "population": 9.3, + "sightseeing": ["Tokyo Tower", "Tokyo Skytree"] + }, + { + "name": "Osaka", + "population": 2.7, + "sightseeing": ["Osaka Castle", "Universal Studios Japan"] + } + ] + } + } +] +``` + +You can search on a nested field using a dot notation. 
+ +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + should=[ + models.FieldCondition( + key="country.name", match=models.MatchValue(value="Germany") + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync(collectionName: "{collection_name}", filter: MatchKeyword("country.name", "Germany")); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Should: []*qdrant.Condition{ + qdrant.NewMatch("country.name", "Germany"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "should": [ + { + "key": "country.name", + "match": { + "value": "Germany" + } + } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + should: [ + { + key: "country.name", + match: { value: "Germany" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([ + Condition::matches("country.name", "Germany".to_string()), + ])), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addShould(matchKeyword("country.name", "Germany")) + .build()) + .build()) + .get(); +``` + +You can also search through arrays by projecting inner values using the `[]` syntax. 
+ +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + should=[ + models.FieldCondition( + key="country.cities[].population", + range=models.Range( + gt=None, + gte=9.0, + lt=None, + lte=None, + ), + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: Range("country.cities[].population", new Qdrant.Client.Grpc.Range { Gte = 9.0 }) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Should: []*qdrant.Condition{ + qdrant.NewRange("country.cities[].population", &qdrant.Range{ + Gte: qdrant.PtrOf(9.0), + }), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "should": [ + { + "key": "country.cities[].population", + "range": { + "gte": 9.0, + } + } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + should: [ + { + key: "country.cities[].population", + range: { + gt: null, + gte: 9.0, + lt: null, + lte: null, + }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, Range, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([ + Condition::range( + "country.cities[].population", + Range { + gte: Some(9.0), + ..Default::default() + }, + ), + ])), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.range; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.Range; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addShould( + range( + "country.cities[].population", + Range.newBuilder().setGte(9.0).build())) + .build()) + .build()) + .get(); +``` + +This query would only output the point with id 2 as only Japan has a city with population greater than 9.0. + +And the leaf nested field can also be an array. 
+ +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + should=[ + models.FieldCondition( + key="country.cities[].sightseeing", + match=models.MatchValue(value="Osaka Castle"), + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("country.cities[].sightseeing", "Germany") +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Should: []*qdrant.Condition{ + qdrant.NewMatch("country.cities[].sightseeing", "Germany"), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "should": [ + { + "key": "country.cities[].sightseeing", + "match": { + "value": "Osaka Castle" + } + } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + should: [ + { + key: "country.cities[].sightseeing", + match: { value: "Osaka Castle" }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([ + Condition::matches("country.cities[].sightseeing", "Osaka Castle".to_string()), + ])), + ) + .await?; +``` + +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addShould(matchKeyword("country.cities[].sightseeing", "Germany")) + .build()) + .build()) + .get(); +``` + +This query would only output the point with id 2 as only Japan has a city with the "Osaka castke" as part of the sightseeing. + +### Nested object filter + +*Available as of v1.2.0* + +By default, the conditions are taking into account the entire payload of a point. 
+ +For instance, given two points with the following payload: + +```json +[ + { + "id": 1, + "dinosaur": "t-rex", + "diet": [ + { "food": "leaves", "likes": false}, + { "food": "meat", "likes": true} + ] + }, + { + "id": 2, + "dinosaur": "diplodocus", + "diet": [ + { "food": "leaves", "likes": true}, + { "food": "meat", "likes": false} + ] + } +] +``` + +The following query would match both points: + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="diet[].food", match=models.MatchValue(value="meat") + ), + models.FieldCondition( + key="diet[].likes", match=models.MatchValue(value=True) + ), + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: MatchKeyword("diet[].food", "meat") & Match("diet[].likes", true) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("diet[].food", "meat"), + qdrant.NewMatchBool("diet[].likes", true), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { + "key": "diet[].food", + "match": { + "value": "meat" + } + }, + { + "key": "diet[].likes", + "match": { + "value": true + } + } + ] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + key: "diet[].food", + match: { value: "meat" }, + }, + { + key: "diet[].likes", + match: { value: true }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([ + Condition::matches("diet[].food", "meat".to_string()), + Condition::matches("diet[].likes", true), + ])), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.match; +import static io.qdrant.client.ConditionFactory.matchKeyword; + +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addAllMust( + List.of(matchKeyword("diet[].food", "meat"), match("diet[].likes", true))) + .build()) + .build()) + .get(); +``` + +This happens because both points are matching the two conditions: + +- the "t-rex" matches food=meat on `diet[1].food` and likes=true on `diet[1].likes` +- the "diplodocus" matches food=meat on `diet[1].food` and likes=true on `diet[0].likes` + +To retrieve only the points which are matching the conditions on an array element basis, that is the point with id 1 in this example, you would need to use a nested object filter. + +Nested object filters allow arrays of objects to be queried independently of each other. + +It is achieved by using the `nested` condition type formed by a payload key to focus on and a filter to apply. 
+ +The key should point to an array of objects and can be used with or without the bracket notation ("data" or "data[]"). + +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.NestedCondition( + nested=models.Nested( + key="diet", + filter=models.Filter( + must=[ + models.FieldCondition( + key="food", match=models.MatchValue(value="meat") + ), + models.FieldCondition( + key="likes", match=models.MatchValue(value=True) + ), + ] + ), + ) + ) + ], + ), +) +``` + +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; + +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewNestedFilter("diet", &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("food", "meat"), + qdrant.NewMatchBool("likes", true), + }, + }), + }, + }, +}) +``` + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [{ + "nested": { + "key": "diet", + "filter":{ + "must": [ + { + "key": "food", + "match": { + "value": "meat" + } + }, + { + "key": "likes", + "match": { + "value": true + } + } + ] + } + } + }] + } +} +``` + +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + nested: { + key: "diet", + filter: { + must: [ + { + key: "food", + match: { value: "meat" }, + }, + { + key: "likes", + match: { value: true }, + }, + ], + }, + }, + }, + ], + }, +}); +``` + +```rust +use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([NestedCondition { + key: "diet".to_string(), + filter: Some(Filter::must([ + Condition::matches("food", "meat".to_string()), + Condition::matches("likes", true), + ])), + } + .into()])), + ) + .await?; +``` + +```java +import java.util.List; + +import static io.qdrant.client.ConditionFactory.match; +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.ConditionFactory.nested; + +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; + +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust( + nested( + "diet", + Filter.newBuilder() + .addAllMust( + List.of( + matchKeyword("food", "meat"), match("likes", true))) + .build())) + .build()) + .build()) + .get(); +``` + +The matching logic is modified to be applied at the level of an array element within the payload. + +Nested filters work in the same way as if the nested filter was applied to a single element of the array at a time. +Parent document is considered to match the condition if at least one element of the array matches the nested filter. + +**Limitations** + +The `has_id` condition is not supported within the nested object filter. If you need it, place it in an adjacent `must` clause. 
+ +```python +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.NestedCondition( + nested=models.Nested( + key="diet", + filter=models.Filter( + must=[ + models.FieldCondition( + key="food", match=models.MatchValue(value="meat") + ), + models.FieldCondition( + key="likes", match=models.MatchValue(value=True) + ), + ] + ), + ) + ), + models.HasIdCondition(has_id=[1]), + ], + ), +) +``` -##### Was this page useful? +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +var client = new QdrantClient("localhost", 6334); -Thank you for your feedback! 🙏 +await client.ScrollAsync( + collectionName: "{collection_name}", + filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) & HasId(1) +); +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-api.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```go +import ( + "context" -On this page: + "github.com/qdrant/go-client/qdrant" +) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-api.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -× +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewNestedFilter("diet", &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("food", "meat"), + qdrant.NewMatchBool("likes", true), + }, + }), + qdrant.NewHasID(qdrant.NewIDNum(1)), + }, + }, +}) +``` -[Powered by](https://qdrant.tech/) +```http +POST /collections/{collection_name}/points/scroll +{ + "filter":{ + "must":[ + { + "nested":{ + "key":"diet", + "filter":{ + "must":[ + { + "key":"food", + "match":{ + "value":"meat" + } + }, + { + "key":"likes", + "match":{ + "value":true + } + } + ] + } + } + }, + { + "has_id":[ + 1 + ] + } + ] + } +} +``` -<|page-12-lllmstxt|> -## configuration -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- Configuration +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + nested: { + key: "diet", + filter: { + must: [ + { + key: "food", + match: { value: "meat" }, + }, + { + key: "likes", + match: { value: true }, + }, + ], + }, + }, + }, + { + has_id: [1], + }, + ], + }, +}); +``` -# [Anchor](https://qdrant.tech/documentation/private-cloud/configuration/\#private-cloud-configuration) Private Cloud Configuration +```rust +use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; -The Qdrant Private Cloud helm chart has several configuration options. 
The following YAML shows all configuration options with their default values: +client + .scroll( + ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([ + NestedCondition { + key: "diet".to_string(), + filter: Some(Filter::must([ + Condition::matches("food", "meat".to_string()), + Condition::matches("likes", true), + ])), + } + .into(), + Condition::has_id([1]), + ])), + ) + .await?; +``` -```yaml -operator: - # Amount of replicas for the Qdrant operator (v2) - replicaCount: 1 +```java +import java.util.List; - image: - # Image repository for the qdrant operator - repository: registry.cloud.qdrant.io/qdrant/operator - # Image pullPolicy - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "" +import static io.qdrant.client.ConditionFactory.hasId; +import static io.qdrant.client.ConditionFactory.match; +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.ConditionFactory.nested; +import static io.qdrant.client.PointIdFactory.id; - # Optional image pull secrets - imagePullSecrets: - - name: qdrant-registry-creds +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; - nameOverride: "" - fullnameOverride: "operator" +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust( + nested( + "diet", + Filter.newBuilder() + .addAllMust( + List.of( + matchKeyword("food", "meat"), match("likes", true))) + .build())) + .addMust(hasId(id(1))) + .build()) + .build()) + .get(); +``` - # Service account configuration - serviceAccount: - create: true - annotations: {} +### Full Text Match - # Additional pod annotations - podAnnotations: {} +*Available as of v0.10.0* - # pod security context - podSecurityContext: - runAsNonRoot: true - runAsUser: 10001 - runAsGroup: 20001 - fsGroup: 30001 +A special case of the `match` condition is the `text` match condition. +It allows you to search for a specific substring, token or phrase within the text field. - # container security context - securityContext: - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 10001 - runAsGroup: 20001 - allowPrivilegeEscalation: false - seccompProfile: - type: RuntimeDefault +Exact texts that will match the condition depend on full-text index configuration. +Configuration is defined during the index creation and describe at [full-text index](/documentation/concepts/indexing/#full-text-index). - # Configuration for the Qdrant operator service to expose metrics - service: - enabled: true - type: ClusterIP - metricsPort: 9290 +If there is no full-text index for the field, the condition will work as exact substring match. 
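
For example, a full-text index on the `description` field (the field used in the match snippets below) could be created ahead of time. This is a minimal sketch with the Python client; the tokenizer and token-length settings are illustrative defaults, not prescriptions:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Create a full-text index on the "description" payload field so that
# `text` match conditions use token-based matching instead of raw substring search.
client.create_payload_index(
    collection_name="{collection_name}",
    field_name="description",
    field_schema=models.TextIndexParams(
        type="text",
        tokenizer=models.TokenizerType.WORD,
        min_token_len=2,
        max_token_len=15,
        lowercase=True,
    ),
)
```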
- # Configuration for the Qdrant operator service monitor to scrape metrics - serviceMonitor: - enabled: false +```python +models.FieldCondition( + key="description", + match=models.MatchText(text="good cheap"), +) +``` - # Resource requests and limits for the Qdrant operator - resources: {} +```csharp +using static Qdrant.Client.Grpc.Conditions; - # Node selector for the Qdrant operator - nodeSelector: {} +MatchText("description", "good cheap"); +``` - # Tolerations for the Qdrant operator - tolerations: [] +```json +{ + "key": "description", + "match": { + "text": "good cheap" + } +} +``` - # Affinity configuration for the Qdrant operator - affinity: {} +```go +import "github.com/qdrant/go-client/qdrant" - watch: - # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces - onlyReleaseNamespace: true - # an empty list watches all namespaces. - namespaces: [] +qdrant.NewMatchText("description", "good cheap") +``` - limitRBAC: true +```typescript +{ + key: 'description', + match: {text: 'good cheap'} +} +``` - # Configuration for the Qdrant operator (v2) - settings: - # Does the operator run inside of a Kubernetes cluster (kubernetes) or outside (local) - appEnvironment: kubernetes - # The log level for the operator - # Available options: DEBUG | INFO | WARN | ERROR - logLevel: INFO - # Metrics contains the operator config related the metrics - metrics: - # The port used for metrics - port: 9290 - # Health contains the operator config related the health probe - healthz: - # The port used for the health probe - port: 8285 - # Controller related settings - controller: - # The period a forced recync is done by the controller (if watches are missed / nothing happened) - forceResyncPeriod: 10h - # QPS indicates the maximum QPS to the master from this client. - # Default is 200 - qps: 200 - # Maximum burst for throttle. - # Default is 500. - burst: 500 - # Features contains the settings for enabling / disabling the individual features of the operator - features: - # ClusterManagement contains the settings for qdrant (database) cluster management - clusterManagement: - # Whether or not the Qdrant cluster features are enabled. - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is true. - enable: true - # The StorageClass used to make database and snapshot PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - storageClass: - # The StorageClass used to make database PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - #database: - # The StorageClass used to make snapshot PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - #snapshot: - # Qdrant config contains settings specific for the database - qdrant: - # The config where to find the image for qdrant - image: - # The repository where to find the image for qdrant - # Default is "qdrant/qdrant" - repository: registry.cloud.qdrant.io/qdrant/qdrant - # Docker image pull policy - # Default "IfNotPresent", unless the tag is dev, master or latest. Then "Always" - #pullPolicy: - # Docker image pull secret name - # This secret should be available in the namespace where the cluster is running - # Default not set - pullSecretName: qdrant-registry-creds - # storage contains the settings for the storage of the Qdrant cluster - storage: - performance: - # CPU budget, how many CPUs (threads) to allocate for an optimization job. 
- # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size - # If negative - subtract this number of CPUs from the available CPUs. - # If positive - use this exact number of CPUs. - optimizerCpuBudget: 0 - # Enable async scorer which uses io_uring when rescoring. - # Only supported on Linux, must be enabled in your kernel. - # See: - asyncScorer: false - # Qdrant DB log level - # Available options: DEBUG | INFO | WARN | ERROR - # Default is "INFO" - logLevel: INFO - # Default Qdrant security context configuration - securityContext: - # Enable default security context - # Default is false - enabled: false - # Default user for qdrant container - # Default not set - #user: 1000 - # Default fsGroup for qdrant container - # Default not set - #fsUser: 2000 - # Default group for qdrant container - # Default not set - #group: 3000 - # Network policies configuration for the Qdrant databases - networkPolicies: - # Whether or not NetworkPolicy management is enabled. - # If set to false, no NetworkPolicies will be created. - # Default is true. - enable: true - ingress: - - ports: - - protocol: TCP - port: 6333 - - protocol: TCP - port: 6334 - # Allow DNS resolution from qdrant pods at Kubernetes internal DNS server - egress: - - ports: - - protocol: UDP - port: 53 - # Scheduling config contains the settings specific for scheduling - scheduling: - # Default topology spread constraints (list from type corev1.TopologySpreadConstraint) - # Default is an empty list - topologySpreadConstraints: [] - # Default pod disruption budget (object from type policyv1.PodDisruptionBudgetSpec) - # Default is not set - podDisruptionBudget: {} - # ClusterManager config contains the settings specific for cluster manager - clusterManager: - # Whether or not the cluster manager (on operator level). - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is false. - enable: true - # The endpoint address where the cluster manager can be reached - endpointAddress: "http://qdrant-cluster-manager" - # InvocationInterval is the interval between calls (started after the previous call is retured) - # Default is 10 seconds - invocationInterval: 10s - # Timeout is the duration a single call to the cluster manager is allowed to take. - # Default is 30 seconds - timeout: 30s - # Specifies overrides for the manage rules - manageRulesOverrides: - #dry_run: - #max_transfers: - #max_transfers_per_collection: - #rebalance: - #replicate: - # Ingress config contains the settings specific for ingress - ingress: - # Whether or not the Ingress feature is enabled. - # Default is true. 
- enable: false - # Which specific ingress provider should be used - # Default is KubernetesIngress - provider: KubernetesIngress - # The specific settings when the Provider is QdrantCloudTraefik - qdrantCloudTraefik: - # Enable tls - # Default is false - tls: false - # Secret with TLS certificate - # Default is None - secretName: "" - # List of Traefik middlewares to apply - # Default is an empty list - middlewares: [] - # IP Allowlist Strategy for Traefik - # Default is None - ipAllowlistStrategy: - # Enable body validator plugin and matching ingressroute rules - # Default is false - enableBodyValidatorPlugin: false - # The specific settings when the Provider is KubernetesIngress - kubernetesIngress: - # Name of the ingress class - # Default is None - #ingressClassName: - # TelemetryTimeout is the duration a single call to the cluster telemetry endpoint is allowed to take. - # Default is 3 seconds - telemetryTimeout: 3s - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 20. - maxConcurrentReconciles: 20 - # VolumeExpansionMode specifies the expansion mode, which can be online or offline (e.g. in case of Azure). - # Available options: Online, Offline - # Default is Online - volumeExpansionMode: Online - # BackupManagementConfig contains the settings for backup management - backupManagement: - # Whether or not the backup features are enabled. - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is true. - enable: true - # Snapshots contains the settings for snapshots as part of backup management. - snapshots: - # Whether or not the Snapshot feature is enabled. - # Default is true. - enable: true - # The VolumeSnapshotClass used to make VolumeSnapshots. - # Default is "csi-snapclass". - volumeSnapshotClass: "csi-snapclass" - # The duration a snapshot is retained when the phase becomes Failed or Skipped - # Default is 72h (3d). - retainUnsuccessful: 72h - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 - # ScheduledSnapshots contains the settings for scheduled snapshot as part of backup management. - scheduledSnapshots: - # Whether or not the ScheduledSnapshot feature is enabled. - # Default is true. - enable: true - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 - # Restores contains the settings for restoring (a snapshot) as part of backup management. - restores: - # Whether or not the Restore feature is enabled. - # Default is true. - enable: true - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 +```rust +use qdrant_client::qdrant::Condition; -qdrant-cluster-manager: - replicaCount: 1 +Condition::matches_text("description", "good cheap") +``` - image: - repository: registry.cloud.qdrant.io/qdrant/cluster-manager - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "" +```java +import static io.qdrant.client.ConditionFactory.matchText; - imagePullSecrets: - - name: qdrant-registry-creds - nameOverride: "" - fullnameOverride: "qdrant-cluster-manager" +matchText("description", "good cheap"); +``` - serviceAccount: - # Specifies whether a service account should be created - create: true - # Automatically mount a ServiceAccount's API credentials? 
- automount: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "" +If the query has several words, then the condition will be satisfied only if all of them are present in the text. + +### Phrase Match + +*Available as of v1.15.0* + +A match `phrase` condition also leverages [full-text index](/documentation/concepts/indexing/#full-text-index), to perform exact phrase comparisons. +It allows you to search for a specific token phrase within the text field. + +For example, the text `"quick brown fox"` will be matched by the query `"brown fox"`, but not by `"fox brown"`. + + + +If there is no full-text index for the field, the condition will work as exact substring match. + +```python +models.FieldCondition( + key="description", + match=models.MatchPhrase(phrase="brown fox"), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +MatchPhrase("description", "brown fox"); +``` + +```json +{ + "key": "description", + "match": { + "phrase": "brown fox" + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewMatchPhrase("description", "brown fox") +``` + +```typescript +{ + key: 'description', + match: {phrase: 'brown fox'} +} +``` + +```rust +use qdrant_client::qdrant::Condition; + +Condition::matches_phrase("description", "brown fox") +``` + +```java +import static io.qdrant.client.ConditionFactory.matchPhrase; + +matchPhrase("description", "brown fox"); +``` + +### Range + +```python +models.FieldCondition( + key="price", + range=models.Range( + gt=None, + gte=100.0, + lt=None, + lte=450.0, + ), +) +``` + +```csharp +using static Qdrant.Client.Grpc.Conditions; + +Range("price", new Qdrant.Client.Grpc.Range { Gte = 100.0, Lte = 450 }); +``` + +```json +{ + "key": "price", + "range": { + "gt": null, + "gte": 100.0, + "lt": null, + "lte": 450.0 + } +} +``` + +```go +import "github.com/qdrant/go-client/qdrant" + +qdrant.NewRange("price", &qdrant.Range{ + Gte: qdrant.PtrOf(100.0), + Lte: qdrant.PtrOf(450.0), +}) - podAnnotations: {} - podLabels: {} +``` - podSecurityContext: - runAsNonRoot: true - runAsUser: 10001 - runAsGroup: 20001 - fsGroup: 30001 +```typescript +{ + key: 'price', + range: { + gt: null, + gte: 100.0, + lt: null, + lte: 450.0 + } +} +``` - securityContext: - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 10001 - runAsGroup: 20001 - allowPrivilegeEscalation: false - seccompProfile: - type: RuntimeDefault +```rust +use qdrant_client::qdrant::{Condition, Range}; - service: - type: ClusterIP +Condition::range( + "price", + Range { + gt: None, + gte: Some(100.0), + lt: None, + lte: Some(450.0), + }, +) +``` - networkPolicy: - create: true +```java +import static io.qdrant.client.ConditionFactory.range; - resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
- # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi +import io.qdrant.client.grpc.Points.Range; - nodeSelector: {} +range("price", Range.newBuilder().setGte(100.0).setLte(450).build()); +``` - tolerations: [] +The `range` condition sets the range of possible values for stored payload values. +If several values are stored, at least one of them should match the condition. - affinity: {} +Comparisons that can be used: -``` +- `gt` - greater than +- `gte` - greater than or equal +- `lt` - less than +- `lte` - less than or equal + +Can be applied to [float](/documentation/concepts/payload/#float) and [integer](/documentation/concepts/payload/#integer) payloads. -##### Was this page useful? +### Datetime Range -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +The datetime range is a unique range condition, used for [datetime](/documentation/concepts/payload/#datetime) payloads, which supports RFC 3339 formats. +You do not need to convert dates to UNIX timestaps. During comparison, timestamps are parsed and converted to UTC. -Thank you for your feedback! 🙏 +_Available as of v1.8.0_ -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/configuration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +models.FieldCondition( + key="date", + range=models.DatetimeRange( + gt="2023-02-08T10:49:00Z", + gte=None, + lt=None, + lte="2024-01-31T10:14:31Z", + ), +) +``` -On this page: +```csharp +using Qdrant.Client.Grpc; -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/configuration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Conditions.DatetimeRange( + field: "date", + gt: new DateTime(2023, 2, 8, 10, 49, 0, DateTimeKind.Utc), + lte: new DateTime(2024, 1, 31, 10, 14, 31, DateTimeKind.Utc) +); +``` -× +```json +{ + "key": "date", + "range": { + "gt": "2023-02-08T10:49:00Z", + "gte": null, + "lt": null, + "lte": "2024-01-31 10:14:31Z" + } +} +``` -[Powered by](https://qdrant.tech/) +```go +import ( + "time" -<|page-13-lllmstxt|> -## sparse-vectors -- [Articles](https://qdrant.tech/articles/) -- What is a Sparse Vector? How to Achieve Vector-based Hybrid Search + "github.com/qdrant/go-client/qdrant" + "google.golang.org/protobuf/types/known/timestamppb" +) -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +qdrant.NewDatetimeRange("date", &qdrant.DatetimeRange{ + Gt: timestamppb.New(time.Date(2023, 2, 8, 10, 49, 0, 0, time.UTC)), + Lte: timestamppb.New(time.Date(2024, 1, 31, 10, 14, 31, 0, time.UTC)), +}) +``` -# What is a Sparse Vector? How to Achieve Vector-based Hybrid Search +```typescript +{ + key: 'date', + range: { + gt: '2023-02-08T10:49:00Z', + gte: null, + lt: null, + lte: '2024-01-31T10:14:31Z' + } +} +``` -Nirant Kasliwal +```rust +use qdrant_client::qdrant::{Condition, DatetimeRange, Timestamp}; -· +Condition::datetime_range( + "date", + DatetimeRange { + gt: Some(Timestamp::date_time(2023, 2, 8, 10, 49, 0).unwrap()), + gte: None, + lt: None, + lte: Some(Timestamp::date_time(2024, 1, 31, 10, 14, 31).unwrap()), + }, +) +``` -December 09, 2023 +```java +import static io.qdrant.client.ConditionFactory.datetimeRange; -![What is a Sparse Vector? 
How to Achieve Vector-based Hybrid Search](https://qdrant.tech/articles_data/sparse-vectors/preview/title.jpg) +import com.google.protobuf.Timestamp; +import io.qdrant.client.grpc.Points.DatetimeRange; +import java.time.Instant; -Think of a library with a vast index card system. Each index card only has a few keywords marked out (sparse vector) of a large possible set for each book (document). This is what sparse vectors enable for text. +long gt = Instant.parse("2023-02-08T10:49:00Z").getEpochSecond(); +long lte = Instant.parse("2024-01-31T10:14:31Z").getEpochSecond(); -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#what-are-sparse-and-dense-vectors) What are sparse and dense vectors? +datetimeRange("date", + DatetimeRange.newBuilder() + .setGt(Timestamp.newBuilder().setSeconds(gt)) + .setLte(Timestamp.newBuilder().setSeconds(lte)) + .build()); +``` -Sparse vectors are like the Marie Kondo of data—keeping only what sparks joy (or relevance, in this case). +### UUID Match -Consider a simplified example of 2 documents, each with 200 words. A dense vector would have several hundred non-zero values, whereas a sparse vector could have, much fewer, say only 20 non-zero values. +_Available as of v1.11.0_ -In this example: We assume it selects only 2 words or tokens from each document. The rest of the values are zero. This is why it’s called a sparse vector. +Matching of UUID values works similarly to the regular `match` condition for strings. +Functionally, it will work with `keyword` and `uuid` indexes exactly the same, but `uuid` index is more memory efficient. ```python -dense = [0.2, 0.3, 0.5, 0.7, ...] # several hundred floats -sparse = [{331: 0.5}, {14136: 0.7}] # 20 key value pairs - +models.FieldCondition( + key="uuid", + match=models.MatchValue(value="f47ac10b-58cc-4372-a567-0e02b2c3d479"), +) ``` -The numbers 331 and 14136 map to specific tokens in the vocabulary e.g. `['chocolate', 'icecream']`. The rest of the values are zero. This is why it’s called a sparse vector. +```csharp +using static Qdrant.Client.Grpc.Conditions; -The tokens aren’t always words though, sometimes they can be sub-words: `['ch', 'ocolate']` too. +MatchKeyword("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479"); +``` -They’re pivotal in information retrieval, especially in ranking and search systems. BM25, a standard ranking function used by search engines like [Elasticsearch](https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors), exemplifies this. BM25 calculates the relevance of documents to a given search query. +```json +{ + "key": "uuid", + "match": { + "value": "f47ac10b-58cc-4372-a567-0e02b2c3d479" + } +} +``` -BM25’s capabilities are well-established, yet it has its limitations. +```go +import "github.com/qdrant/go-client/qdrant" -BM25 relies solely on the frequency of words in a document and does not attempt to comprehend the meaning or the contextual importance of the words. Additionally, it requires the computation of the entire corpus’s statistics in advance, posing a challenge for large datasets. +qdrant.NewMatch("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479") +``` -Sparse vectors harness the power of neural networks to surmount these limitations while retaining the ability to query exact words and phrases. 
-They excel in handling large text data, making them crucial in modern data processing a and marking an advancement over traditional methods such as BM25. +```typescript +{ + key: 'uuid', + match: {value: 'f47ac10b-58cc-4372-a567-0e02b2c3d479'} +} +``` -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#understanding-sparse-vectors) Understanding sparse vectors +```rust +Condition::matches("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479".to_string()) +``` -Sparse Vectors are a representation where each dimension corresponds to a word or subword, greatly aiding in interpreting document rankings. This clarity is why sparse vectors are essential in modern search and recommendation systems, complimenting the meaning-rich embedding or dense vectors. +```java +matchKeyword("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479"); +``` -Dense vectors from models like OpenAI Ada-002 or Sentence Transformers contain non-zero values for every element. In contrast, sparse vectors focus on relative word weights per document, with most values being zero. This results in a more efficient and interpretable system, especially in text-heavy applications like search. +### Geo -Sparse Vectors shine in domains and scenarios where many rare keywords or specialized terms are present. -For example, in the medical domain, many rare terms are not present in the general vocabulary, so general-purpose dense vectors cannot capture the nuances of the domain. +#### Geo Bounding Box -| Feature | Sparse Vectors | Dense Vectors | -| --- | --- | --- | -| **Data Representation** | Majority of elements are zero | All elements are non-zero | -| **Computational Efficiency** | Generally higher, especially in operations involving zero elements | Lower, as operations are performed on all elements | -| **Information Density** | Less dense, focuses on key features | Highly dense, capturing nuanced relationships | -| **Example Applications** | Text search, Hybrid search | [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/), many general machine learning tasks | +```python +models.FieldCondition( + key="location", + geo_bounding_box=models.GeoBoundingBox( + bottom_right=models.GeoPoint( + lon=13.455868, + lat=52.495862, + ), + top_left=models.GeoPoint( + lon=13.403683, + lat=52.520711, + ), + ), +) +``` -Where do sparse vectors fail though? They’re not great at capturing nuanced relationships between words. For example, they can’t capture the relationship between “king” and “queen” as well as dense vectors. +```csharp +using static Qdrant.Client.Grpc.Conditions; -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#splade) SPLADE +GeoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868); +``` -Let’s check out [SPLADE](https://europe.naverlabs.com/research/computer-science/splade-a-sparse-bi-encoder-bert-based-model-achieves-effective-and-efficient-full-text-document-ranking/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors), an excellent way to make sparse vectors. Let’s look at some numbers first. 
Higher is better: +```json +{ + "key": "location", + "geo_bounding_box": { + "bottom_right": { + "lon": 13.455868, + "lat": 52.495862 + }, + "top_left": { + "lon": 13.403683, + "lat": 52.520711 + } + } +} +``` -| Model | MRR@10 (MS MARCO Dev) | Type | -| --- | --- | --- | -| BM25 | 0.184 | Sparse | -| TCT-ColBERT | 0.359 | Dense | -| doc2query-T5 [link](https://github.com/castorini/docTTTTTquery) | 0.277 | Sparse | -| SPLADE | 0.322 | Sparse | -| SPLADE-max | 0.340 | Sparse | -| SPLADE-doc | 0.322 | Sparse | -| DistilSPLADE-max | 0.368 | Sparse | +```go +import "github.com/qdrant/go-client/qdrant" -All numbers are from [SPLADEv2](https://arxiv.org/abs/2109.10086). MRR is [Mean Reciprocal Rank](https://www.wikiwand.com/en/Mean_reciprocal_rank#References), a standard metric for ranking. [MS MARCO](https://microsoft.github.io/MSMARCO-Passage-Ranking/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) is a dataset for evaluating ranking and retrieval for passages. +qdrant.NewGeoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868) +``` -SPLADE is quite flexible as a method, with regularization knobs that can be tuned to obtain [different models](https://github.com/naver/splade) as well: +```typescript +{ + key: 'location', + geo_bounding_box: { + bottom_right: { + lon: 13.455868, + lat: 52.495862 + }, + top_left: { + lon: 13.403683, + lat: 52.520711 + } + } +} +``` -> SPLADE is more a class of models rather than a model per se: depending on the regularization magnitude, we can obtain different models (from very sparse to models doing intense query/doc expansion) with different properties and performance. +```rust +use qdrant_client::qdrant::{Condition, GeoBoundingBox, GeoPoint}; -First, let’s look at how to create a sparse vector. Then, we’ll look at the concepts behind SPLADE. +Condition::geo_bounding_box( + "location", + GeoBoundingBox { + bottom_right: Some(GeoPoint { + lon: 13.455868, + lat: 52.495862, + }), + top_left: Some(GeoPoint { + lon: 13.403683, + lat: 52.520711, + }), + }, +) +``` -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#creating-a-sparse-vector) Creating a sparse vector +```java +import static io.qdrant.client.ConditionFactory.geoBoundingBox; -We’ll explore two different ways to create a sparse vector. The higher performance way to create a sparse vector from dedicated document and query encoders. We’ll look at a simpler approach – here we will use the same model for both document and query. We will get a dictionary of token ids and their corresponding weights for a sample text - representing a document. +geoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868); +``` -If you’d like to follow along, here’s a [Colab Notebook](https://colab.research.google.com/gist/NirantK/ad658be3abefc09b17ce29f45255e14e/splade-single-encoder.ipynb), [alternate link](https://gist.github.com/NirantK/ad658be3abefc09b17ce29f45255e14e) with all the code. +It matches with `location`s inside a rectangle with the coordinates of the upper left corner in `bottom_right` and the coordinates of the lower right corner in `top_left`. 
-### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#setting-up) Setting Up +#### Geo Radius ```python -from transformers import AutoModelForMaskedLM, AutoTokenizer - -model_id = "naver/splade-cocondenser-ensembledistil" +models.FieldCondition( + key="location", + geo_radius=models.GeoRadius( + center=models.GeoPoint( + lon=13.403683, + lat=52.520711, + ), + radius=1000.0, + ), +) +``` -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForMaskedLM.from_pretrained(model_id) +```csharp +using static Qdrant.Client.Grpc.Conditions; -text = """Arthur Robert Ashe Jr. (July 10, 1943 – February 6, 1993) was an American professional tennis player. He won three Grand Slam titles in singles and two in doubles.""" +GeoRadius("location", 52.520711, 13.403683, 1000.0f); +``` +```json +{ + "key": "location", + "geo_radius": { + "center": { + "lon": 13.403683, + "lat": 52.520711 + }, + "radius": 1000.0 + } +} ``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#computing-the-sparse-vector) Computing the sparse vector +```go +import "github.com/qdrant/go-client/qdrant" -```python -import torch +qdrant.NewGeoRadius("location", 52.520711, 13.403683, 1000.0) +``` -def compute_vector(text): - """ - Computes a vector from logits and attention mask using ReLU, log, and max operations. - """ - tokens = tokenizer(text, return_tensors="pt") - output = model(**tokens) - logits, attention_mask = output.logits, tokens.attention_mask - relu_log = torch.log(1 + torch.relu(logits)) - weighted_log = relu_log * attention_mask.unsqueeze(-1) - max_val, _ = torch.max(weighted_log, dim=1) - vec = max_val.squeeze() +```typescript +{ + key: 'location', + geo_radius: { + center: { + lon: 13.403683, + lat: 52.520711 + }, + radius: 1000.0 + } +} +``` - return vec, tokens +```rust +use qdrant_client::qdrant::{Condition, GeoPoint, GeoRadius}; -vec, tokens = compute_vector(text) -print(vec.shape) +Condition::geo_radius( + "location", + GeoRadius { + center: Some(GeoPoint { + lon: 13.403683, + lat: 52.520711, + }), + radius: 1000.0, + }, +) +``` + +```java +import static io.qdrant.client.ConditionFactory.geoRadius; +geoRadius("location", 52.520711, 13.403683, 1000.0f); ``` -You’ll notice that there are 38 tokens in the text based on this tokenizer. This will be different from the number of tokens in the vector. In a TF-IDF, we’d assign weights only to these tokens or words. In SPLADE, we assign weights to all the tokens in the vocabulary using this vector using our learned model. +It matches with `location`s inside a circle with the `center` at the center and a radius of `radius` meters. + +If several values are stored, at least one of them should match the condition. +These conditions can only be applied to payloads that match the [geo-data format](/documentation/concepts/payload/#geo). + +#### Geo Polygon +Geo Polygons search is useful for when you want to find points inside an irregularly shaped area, for example a country boundary or a forest boundary. A polygon always has an exterior ring and may optionally include interior rings. A lake with an island would be an example of an interior ring. If you wanted to find points in the water but not on the island, you would make an interior ring for the island. + +When defining a ring, you must pick either a clockwise or counterclockwise ordering for your points. The first and last point of the polygon must be the same. 
-## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#term-expansion-and-weights) Term expansion and weights +Currently, we only support unprojected global coordinates (decimal degrees longitude and latitude) and we are datum agnostic. ```python -def extract_and_map_sparse_vector(vector, tokenizer): - """ - Extracts non-zero elements from a given vector and maps these elements to their human-readable tokens using a tokenizer. The function creates and returns a sorted dictionary where keys are the tokens corresponding to non-zero elements in the vector, and values are the weights of these elements, sorted in descending order of weights. +models.FieldCondition( + key="location", + geo_polygon=models.GeoPolygon( + exterior=models.GeoLineString( + points=[ + models.GeoPoint( + lon=-70.0, + lat=-70.0, + ), + models.GeoPoint( + lon=60.0, + lat=-70.0, + ), + models.GeoPoint( + lon=60.0, + lat=60.0, + ), + models.GeoPoint( + lon=-70.0, + lat=60.0, + ), + models.GeoPoint( + lon=-70.0, + lat=-70.0, + ), + ] + ), + interiors=[ + models.GeoLineString( + points=[ + models.GeoPoint( + lon=-65.0, + lat=-65.0, + ), + models.GeoPoint( + lon=0.0, + lat=-65.0, + ), + models.GeoPoint( + lon=0.0, + lat=0.0, + ), + models.GeoPoint( + lon=-65.0, + lat=0.0, + ), + models.GeoPoint( + lon=-65.0, + lat=-65.0, + ), + ] + ) + ], + ), +) +``` - This function is useful in NLP tasks where you need to understand the significance of different tokens based on a model's output vector. It first identifies non-zero values in the vector, maps them to tokens, and sorts them by weight for better interpretability. +```csharp +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; - Args: - vector (torch.Tensor): A PyTorch tensor from which to extract non-zero elements. - tokenizer: The tokenizer used for tokenization in the model, providing the mapping from tokens to indices. +GeoPolygon( + field: "location", + exterior: new GeoLineString + { + Points = + { + new GeoPoint { Lat = -70.0, Lon = -70.0 }, + new GeoPoint { Lat = 60.0, Lon = -70.0 }, + new GeoPoint { Lat = 60.0, Lon = 60.0 }, + new GeoPoint { Lat = -70.0, Lon = 60.0 }, + new GeoPoint { Lat = -70.0, Lon = -70.0 } + } + }, + interiors: [ + new() + { + Points = + { + new GeoPoint { Lat = -65.0, Lon = -65.0 }, + new GeoPoint { Lat = 0.0, Lon = -65.0 }, + new GeoPoint { Lat = 0.0, Lon = 0.0 }, + new GeoPoint { Lat = -65.0, Lon = 0.0 }, + new GeoPoint { Lat = -65.0, Lon = -65.0 } + } + } + ] +); +``` - Returns: - dict: A sorted dictionary mapping human-readable tokens to their corresponding non-zero weights. 
- """ +```json - # Extract indices and values of non-zero elements in the vector - cols = vector.nonzero().squeeze().cpu().tolist() - weights = vector[cols].cpu().tolist() +{ + "key": "location", + "geo_polygon": { + "exterior": { + "points": [ + { "lon": -70.0, "lat": -70.0 }, + { "lon": 60.0, "lat": -70.0 }, + { "lon": 60.0, "lat": 60.0 }, + { "lon": -70.0, "lat": 60.0 }, + { "lon": -70.0, "lat": -70.0 } + ] + }, + "interiors": [ + { + "points": [ + { "lon": -65.0, "lat": -65.0 }, + { "lon": 0.0, "lat": -65.0 }, + { "lon": 0.0, "lat": 0.0 }, + { "lon": -65.0, "lat": 0.0 }, + { "lon": -65.0, "lat": -65.0 } + ] + } + ] + } +} +``` - # Map indices to tokens and create a dictionary - idx2token = {idx: token for token, idx in tokenizer.get_vocab().items()} - token_weight_dict = { - idx2token[idx]: round(weight, 2) for idx, weight in zip(cols, weights) - } +```go +import "github.com/qdrant/go-client/qdrant" - # Sort the dictionary by weights in descending order - sorted_token_weight_dict = { - k: v - for k, v in sorted( - token_weight_dict.items(), key=lambda item: item[1], reverse=True - ) - } +qdrant.NewGeoPolygon("location", + &qdrant.GeoLineString{ + Points: []*qdrant.GeoPoint{ + {Lat: -70, Lon: -70}, + {Lat: 60, Lon: -70}, + {Lat: 60, Lon: 60}, + {Lat: -70, Lon: 60}, + {Lat: -70, Lon: -70}, + }, + }, &qdrant.GeoLineString{ + Points: []*qdrant.GeoPoint{ + {Lat: -65, Lon: -65}, + {Lat: 0, Lon: -65}, + {Lat: 0, Lon: 0}, + {Lat: -65, Lon: 0}, + {Lat: -65, Lon: -65}, + }, + }) +``` - return sorted_token_weight_dict +```typescript +{ + key: "location", + geo_polygon: { + exterior: { + points: [ + { + lon: -70.0, + lat: -70.0 + }, + { + lon: 60.0, + lat: -70.0 + }, + { + lon: 60.0, + lat: 60.0 + }, + { + lon: -70.0, + lat: 60.0 + }, + { + lon: -70.0, + lat: -70.0 + } + ] + }, + interiors: [ + { + points: [ + { + lon: -65.0, + lat: -65.0 + }, + { + lon: 0, + lat: -65.0 + }, + { + lon: 0, + lat: 0 + }, + { + lon: -65.0, + lat: 0 + }, + { + lon: -65.0, + lat: -65.0 + } + ] + } + ] + } +} +``` -# Usage example -sorted_tokens = extract_and_map_sparse_vector(vec, tokenizer) -sorted_tokens +```rust +use qdrant_client::qdrant::{Condition, GeoLineString, GeoPoint, GeoPolygon}; +Condition::geo_polygon( + "location", + GeoPolygon { + exterior: Some(GeoLineString { + points: vec![ + GeoPoint { + lon: -70.0, + lat: -70.0, + }, + GeoPoint { + lon: 60.0, + lat: -70.0, + }, + GeoPoint { + lon: 60.0, + lat: 60.0, + }, + GeoPoint { + lon: -70.0, + lat: 60.0, + }, + GeoPoint { + lon: -70.0, + lat: -70.0, + }, + ], + }), + interiors: vec![GeoLineString { + points: vec![ + GeoPoint { + lon: -65.0, + lat: -65.0, + }, + GeoPoint { + lon: 0.0, + lat: -65.0, + }, + GeoPoint { lon: 0.0, lat: 0.0 }, + GeoPoint { + lon: -65.0, + lat: 0.0, + }, + GeoPoint { + lon: -65.0, + lat: -65.0, + }, + ], + }], + }, +) ``` -There will be 102 sorted tokens in total. This has expanded to include tokens that weren’t in the original text. This is the term expansion we will talk about next. - -Here are some terms that are added: “Berlin”, and “founder” - despite having no mention of Arthur’s race (which leads to Owen’s Berlin win) and his work as the founder of Arthur Ashe Institute for Urban Health. 
Here are the top few `sorted_tokens` with a weight of more than 1: +```java +import static io.qdrant.client.ConditionFactory.geoPolygon; -```python -{ - "ashe": 2.95, - "arthur": 2.61, - "tennis": 2.22, - "robert": 1.74, - "jr": 1.55, - "he": 1.39, - "founder": 1.36, - "doubles": 1.24, - "won": 1.22, - "slam": 1.22, - "died": 1.19, - "singles": 1.1, - "was": 1.07, - "player": 1.06, - "titles": 0.99, - ... -} +import io.qdrant.client.grpc.Points.GeoLineString; +import io.qdrant.client.grpc.Points.GeoPoint; +geoPolygon( + "location", + GeoLineString.newBuilder() + .addAllPoints( + List.of( + GeoPoint.newBuilder().setLon(-70.0).setLat(-70.0).build(), + GeoPoint.newBuilder().setLon(60.0).setLat(-70.0).build(), + GeoPoint.newBuilder().setLon(60.0).setLat(60.0).build(), + GeoPoint.newBuilder().setLon(-70.0).setLat(60.0).build(), + GeoPoint.newBuilder().setLon(-70.0).setLat(-70.0).build())) + .build(), + List.of( + GeoLineString.newBuilder() + .addAllPoints( + List.of( + GeoPoint.newBuilder().setLon(-65.0).setLat(-65.0).build(), + GeoPoint.newBuilder().setLon(0.0).setLat(-65.0).build(), + GeoPoint.newBuilder().setLon(0.0).setLat(0.0).build(), + GeoPoint.newBuilder().setLon(-65.0).setLat(0.0).build(), + GeoPoint.newBuilder().setLon(-65.0).setLat(-65.0).build())) + .build())); ``` -If you’re interested in using the higher-performance approach, check out the following models: - -1. [naver/efficient-splade-VI-BT-large-doc](https://huggingface.co/naver/efficient-splade-vi-bt-large-doc) -2. [naver/efficient-splade-VI-BT-large-query](https://huggingface.co/naver/efficient-splade-vi-bt-large-doc) +A match is considered any point location inside or on the boundaries of the given polygon's exterior but not inside any interiors. -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#why-splade-works-term-expansion) Why SPLADE works: term expansion +If several location values are stored for a point, then any of them matching will include that point as a candidate in the resultset. +These conditions can only be applied to payloads that match the [geo-data format](/documentation/concepts/payload/#geo). -Consider a query “solar energy advantages”. SPLADE might expand this to include terms like “renewable,” “sustainable,” and “photovoltaic,” which are contextually relevant but not explicitly mentioned. This process is called term expansion, and it’s a key component of SPLADE. +### Values count -SPLADE learns the query/document expansion to include other relevant terms. This is a crucial advantage over other sparse methods which include the exact word, but completely miss the contextually relevant ones. +In addition to the direct value comparison, it is also possible to filter by the amount of values. -This expansion has a direct relationship with what we can control when making a SPLADE model: Sparsity via Regularisation. The number of tokens (BERT wordpieces) we use to represent each document. If we use more tokens, we can represent more terms, but the vectors become denser. This number is typically between 20 to 200 per document. As a reference point, the dense BERT vector is 768 dimensions, OpenAI Embedding is 1536 dimensions, and the sparse vector is 30 dimensions. +For example, given the data: -For example, assume a 1M document corpus. Say, we use 100 sparse token ids + weights per document. Correspondingly, dense BERT vector would be 768M floats, the OpenAI Embedding would be 1.536B floats, and the sparse vector would be a maximum of 100M integers + 100M floats. 
This could mean a **10x reduction in memory usage**, which is a huge win for large-scale systems: +```json +[ + { "id": 1, "name": "product A", "comments": ["Very good!", "Excellent"] }, + { "id": 2, "name": "product B", "comments": ["meh", "expected more", "ok"] } +] +``` -| Vector Type | Memory (GB) | -| --- | --- | -| Dense BERT Vector | 6.144 | -| OpenAI Embedding | 12.288 | -| Sparse Vector | 1.12 | +We can perform the search only among the items with more than two comments: -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#how-splade-works-leveraging-bert) How SPLADE works: leveraging BERT +```python +models.FieldCondition( + key="comments", + values_count=models.ValuesCount(gt=2), +) +``` -SPLADE leverages a transformer architecture to generate sparse representations of documents and queries, enabling efficient retrieval. Let’s dive into the process. +```csharp +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; -The output logits from the transformer backbone are inputs upon which SPLADE builds. The transformer architecture can be something familiar like BERT. Rather than producing dense probability distributions, SPLADE utilizes these logits to construct sparse vectors—think of them as a distilled essence of tokens, where each dimension corresponds to a term from the vocabulary and its associated weight in the context of the given document or query. +ValuesCount("comments", new ValuesCount { Gt = 2 }); +``` -This sparsity is critical; it mirrors the probability distributions from a typical [Masked Language Modeling](http://jalammar.github.io/illustrated-bert/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) task but is tuned for retrieval effectiveness, emphasizing terms that are both: +```json +{ + "key": "comments", + "values_count": { + "gt": 2 + } +} +``` -1. Contextually relevant: Terms that represent a document well should be given more weight. -2. Discriminative across documents: Terms that a document has, and other documents don’t, should be given more weight. +```go +import "github.com/qdrant/go-client/qdrant" -The token-level distributions that you’d expect in a standard transformer model are now transformed into token-level importance scores in SPLADE. These scores reflect the significance of each term in the context of the document or query, guiding the model to allocate more weight to terms that are likely to be more meaningful for retrieval purposes. +qdrant.NewValuesCount("comments", &qdrant.ValuesCount{ + Gt: qdrant.PtrOf(uint64(2)), +}) +``` -The resulting sparse vectors are not only memory-efficient but also tailored for precise matching in the high-dimensional space of a search engine like Qdrant. +```typescript +{ + key: 'comments', + values_count: {gt: 2} +} +``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#interpreting-splade) Interpreting SPLADE +```rust +use qdrant_client::qdrant::{Condition, ValuesCount}; -A downside of dense vectors is that they are not interpretable, making it difficult to understand why a document is relevant to a query. +Condition::values_count( + "comments", + ValuesCount { + gt: Some(2), + ..Default::default() + }, +) +``` -SPLADE importance estimation can provide insights into the ‘why’ behind a document’s relevance to a query. By shedding light on which tokens contribute most to the retrieval score, SPLADE offers some degree of interpretability alongside performance, a rare feat in the realm of neural IR systems. 
For engineers working on search, this transparency is invaluable. +```java +import static io.qdrant.client.ConditionFactory.valuesCount; -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#known-limitations-of-splade) Known limitations of SPLADE +import io.qdrant.client.grpc.Points.ValuesCount; -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#pooling-strategy) Pooling strategy +valuesCount("comments", ValuesCount.newBuilder().setGt(2).build()); +``` -The switch to max pooling in SPLADE improved its performance on the MS MARCO and TREC datasets. However, this indicates a potential limitation of the baseline SPLADE pooling method, suggesting that SPLADE’s performance is sensitive to the choice of pooling strategy​​. +The result would be: -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#document-and-query-eecoder) Document and query Eecoder +```json +[{ "id": 2, "name": "product B", "comments": ["meh", "expected more", "ok"] }] +``` -The SPLADE model variant that uses a document encoder with max pooling but no query encoder reaches the same performance level as the prior SPLADE model. This suggests a limitation in the necessity of a query encoder, potentially affecting the efficiency of the model​​. +If stored value is not an array - it is assumed that the amount of values is equals to 1. -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#other-sparse-vector-methods) Other sparse vector methods +### Is Empty -SPLADE is not the only method to create sparse vectors. +Sometimes it is also useful to filter out records that are missing some value. +The `IsEmpty` condition may help you with that: -Essentially, sparse vectors are a superset of TF-IDF and BM25, which are the most popular text retrieval methods. -In other words, you can create a sparse vector using the term frequency and inverse document frequency (TF-IDF) to reproduce the BM25 score exactly. +```python +models.IsEmptyCondition( + is_empty=models.PayloadField(key="reports"), +) +``` -Additionally, attention weights from Sentence Transformers can be used to create sparse vectors. -This method preserves the ability to query exact words and phrases but avoids the computational overhead of query expansion used in SPLADE. +```csharp +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; -We will cover these methods in detail in a future article. +IsEmpty("reports"); +``` -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#leveraging-sparse-vectors-in-qdrant-for-hybrid-search) Leveraging sparse vectors in Qdrant for hybrid search +```json +{ + "is_empty": { + "key": "reports" + } +} +``` -Qdrant supports a separate index for Sparse Vectors. -This enables you to use the same collection for both dense and sparse vectors. -Each “Point” in Qdrant can have both dense and sparse vectors. +```go +import "github.com/qdrant/go-client/qdrant" -But let’s first take a look at how you can work with sparse vectors in Qdrant. +qdrant.NewIsEmpty("reports") +``` -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#practical-implementation-in-python) Practical implementation in Python +```typescript +{ + is_empty: { + key: "reports" + } +} +``` -Let’s dive into how Qdrant handles sparse vectors with an example. Here is what we will cover: +```rust +use qdrant_client::qdrant::Condition; -1. Setting Up Qdrant Client: Initially, we establish a connection with Qdrant using the QdrantClient. This setup is crucial for subsequent operations. +Condition::is_empty("reports") +``` -2. 
Creating a Collection with Sparse Vector Support: In Qdrant, a collection is a container for your vectors. Here, we create a collection specifically designed to support sparse vectors. This is done using the create\_collection method where we define the parameters for sparse vectors, such as setting the index configuration. +```java +import static io.qdrant.client.ConditionFactory.isEmpty; -3. Inserting Sparse Vectors: Once the collection is set up, we can insert sparse vectors into it. This involves defining the sparse vector with its indices and values, and then upserting this point into the collection. +isEmpty("reports"); +``` -4. Querying with Sparse Vectors: To perform a search, we first prepare a query vector. This involves computing the vector from a query text and extracting its indices and values. We then use these details to construct a query against our collection. +This condition will match all records where the field `reports` either does not exist, or has `null` or `[]` value. -5. Retrieving and Interpreting Results: The search operation returns results that include the id of the matching document, its score, and other relevant details. The score is a crucial aspect, reflecting the similarity between the query and the documents in the collection. + +### Is Null -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#1-set-up) 1\. Set up +It is not possible to test for `NULL` values with the match condition. +We have to use `IsNull` condition instead: ```python -# Qdrant client setup -client = QdrantClient(":memory:") +models.IsNullCondition( + is_null=models.PayloadField(key="reports"), +) +``` -# Define collection name -COLLECTION_NAME = "example_collection" +```csharp +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; -# Insert sparse vector into Qdrant collection -point_id = 1 # Assign a unique ID for the point +IsNull("reports"); +``` +```json +{ + "is_null": { + "key": "reports" + } +} ``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#2-create-a-collection-with-sparse-vector-support) 2\. Create a collection with sparse vector support +```go +import "github.com/qdrant/go-client/qdrant" -```python -client.create_collection( - collection_name=COLLECTION_NAME, - vectors_config={}, - sparse_vectors_config={ - "text": models.SparseVectorParams( - index=models.SparseIndexParams( - on_disk=False, - ) - ) - }, -) +qdrant.NewIsNull("reports") +``` +```typescript +{ + is_null: { + key: "reports" + } +} ``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#3-insert-sparse-vectors) 3\. Insert sparse vectors +```rust +use qdrant_client::qdrant::Condition; -Here, we see the process of inserting a sparse vector into the Qdrant collection. This step is key to building a dataset that can be quickly retrieved in the first stage of the retrieval process, utilizing the efficiency of sparse vectors. Since this is for demonstration purposes, we insert only one point with Sparse Vector and no dense vector. 
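
The upsert below references `indices` and `values` for the document vector. These are assumed to be extracted from the `vec` tensor returned by `compute_vector` earlier, for example like this (mirroring the query preparation shown later):

```python
# Non-zero dimensions of the document vector and their corresponding weights
indices = vec.nonzero().numpy().flatten()
values = vec.detach().numpy()[indices]
```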
+Condition::is_null("reports") +``` -```python -client.upsert( - collection_name=COLLECTION_NAME, - points=[\ - models.PointStruct(\ - id=point_id,\ - payload={}, # Add any additional payload if necessary\ - vector={\ - "text": models.SparseVector(\ - indices=indices.tolist(), values=values.tolist()\ - )\ - },\ - )\ - ], -) +```java +import static io.qdrant.client.ConditionFactory.isNull; +isNull("reports"); ``` -By upserting points with sparse vectors, we prepare our dataset for rapid first-stage retrieval, laying the groundwork for subsequent detailed analysis using dense vectors. Notice that we use “text” to denote the name of the sparse vector. +This condition will match all records where the field `reports` exists and has `NULL` value. -Those familiar with the Qdrant API will notice that the extra care taken to be consistent with the existing named vectors API – this is to make it easier to use sparse vectors in existing codebases. As always, you’re able to **apply payload filters**, shard keys, and other advanced features you’ve come to expect from Qdrant. To make things easier for you, the indices and values don’t have to be sorted before upsert. Qdrant will sort them when the index is persisted e.g. on disk. -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#4-query-with-sparse-vectors) 4\. Query with sparse vectors +### Has id -We use the same process to prepare a query vector as well. This involves computing the vector from a query text and extracting its indices and values. We then use these details to construct a query against our collection. +This type of query is not related to payload, but can be very useful in some situations. +For example, the user could mark some specific search results as irrelevant, or we want to search only among the specified points. ```python -# Preparing a query vector +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.HasIdCondition(has_id=[1, 3, 5, 7, 9, 11]), + ], + ), +) +``` -query_text = "Who was Arthur Ashe?" -query_vec, query_tokens = compute_vector(query_text) -query_vec.shape +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; -query_indices = query_vec.nonzero().numpy().flatten() -query_values = query_vec.detach().numpy()[query_indices] +var client = new QdrantClient("localhost", 6334); +await client.ScrollAsync(collectionName: "{collection_name}", filter: HasId([1, 3, 5, 7, 9, 11])); ``` -In this example, we use the same model for both document and query. This is not a requirement, but it’s a simpler approach. - -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#5-retrieve-and-interpret-results) 5\. Retrieve and interpret results - -After setting up the collection and inserting sparse vectors, the next critical step is retrieving and interpreting the results. This process involves executing a search query and then analyzing the returned results. 
+```go +import ( + "context" -```python -# Searching for similar documents -result = client.search( - collection_name=COLLECTION_NAME, - query_vector=models.NamedSparseVector( - name="text", - vector=models.SparseVector( - indices=query_indices, - values=query_values, - ), - ), - with_vectors=True, + "github.com/qdrant/go-client/qdrant" ) -result +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewHasID( + qdrant.NewIDNum(1), + qdrant.NewIDNum(3), + qdrant.NewIDNum(5), + qdrant.NewIDNum(7), + qdrant.NewIDNum(9), + qdrant.NewIDNum(11), + ), + }, + }, +}) +``` +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { "has_id": [1,3,5,7,9,11] } + ] + } + ... +} ``` -In the above code, we execute a search against our collection using the prepared sparse vector query. The `client.search` method takes the collection name and the query vector as inputs. The query vector is constructed using the `models.NamedSparseVector`, which includes the indices and values derived from the query text. This is a crucial step in efficiently retrieving relevant documents. +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + has_id: [1, 3, 5, 7, 9, 11], + }, + ], + }, +}); +``` -```python -ScoredPoint( - id=1, - version=0, - score=3.4292831420898438, - payload={}, - vector={ - "text": SparseVector( - indices=[2001, 2002, 2010, 2018, 2032, ...], - values=[\ - 1.0660614967346191,\ - 1.391068458557129,\ - 0.8903818726539612,\ - 0.2502821087837219,\ - ...,\ - ], - ) - }, -) +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +use qdrant_client::Qdrant; +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}") + .filter(Filter::must([Condition::has_id([1, 3, 5, 7, 9, 11])])), + ) + .await?; ``` -The result, as shown above, is a `ScoredPoint` object containing the ID of the retrieved document, its version, a similarity score, and the sparse vector. The score is a key element as it quantifies the similarity between the query and the document, based on their respective vectors. +```java +import java.util.List; -To understand how this scoring works, we use the familiar dot product method: +import static io.qdrant.client.ConditionFactory.hasId; +import static io.qdrant.client.PointIdFactory.id; -Similarity(Query,Document)=∑i∈IQueryi×Documenti +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; -This formula calculates the similarity score by multiplying corresponding elements of the query and document vectors and summing these products. This method is particularly effective with sparse vectors, where many elements are zero, leading to a computationally efficient process. The higher the score, the greater the similarity between the query and the document, making it a valuable metric for assessing the relevance of the retrieved documents. 
+client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust(hasId(List.of(id(1), id(3), id(5), id(7), id(9), id(11)))) + .build()) + .build()) + .get(); +``` -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#hybrid-search-combining-sparse-and-dense-vectors) Hybrid search: combining sparse and dense vectors +Filtered points would be: -By combining search results from both dense and sparse vectors, you can achieve a hybrid search that is both efficient and accurate. -Results from sparse vectors will guarantee, that all results with the required keywords are returned, -while dense vectors will cover the semantically similar results. +```json +[ + { "id": 1, "city": "London", "color": "green" }, + { "id": 3, "city": "London", "color": "blue" }, + { "id": 5, "city": "Moscow", "color": "green" } +] +``` -The mixture of dense and sparse results can be presented directly to the user, or used as a first stage of a two-stage retrieval process. -Let’s see how you can make a hybrid search query in Qdrant. +### Has vector -First, you need to create a collection with both dense and sparse vectors: +*Available as of v1.13.0* -```python -client.create_collection( - collection_name=COLLECTION_NAME, - vectors_config={ - "text-dense": models.VectorParams( - size=1536, # OpenAI Embeddings - distance=models.Distance.COSINE, - ) +This condition enables filtering by the presence of a given named vector on a point. + +For example, if we have two named vector in our collection. + +```http +PUT /collections/{collection_name} +{ + "vectors": { + "image": { + "size": 4, + "distance": "Dot" + }, + "text": { + "size": 8, + "distance": "Cosine" + } }, - sparse_vectors_config={ - "text-sparse": models.SparseVectorParams( - index=models.SparseIndexParams( - on_disk=False, - ) - ) + "sparse_vectors": { + "sparse-image": {}, + "sparse-text": {}, }, -) - +} ``` -Then, assuming you have upserted both dense and sparse vectors, you can query them together: +Some points in the collection might have all vectors, some might have only a subset of them. + + + +This is how you can search for points which have the dense `image` vector defined: ```python -query_text = "Who was Arthur Ashe?" +from qdrant_client import QdrantClient, models -# Compute sparse and dense vectors -query_indices, query_values = compute_sparse_vector(query_text) -query_dense_vector = compute_dense_vector(query_text) +client = QdrantClient(url="http://localhost:6333") -client.search_batch( - collection_name=COLLECTION_NAME, - requests=[\ - models.SearchRequest(\ - vector=models.NamedVector(\ - name="text-dense",\ - vector=query_dense_vector,\ - ),\ - limit=10,\ - ),\ - models.SearchRequest(\ - vector=models.NamedSparseVector(\ - name="text-sparse",\ - vector=models.SparseVector(\ - indices=query_indices,\ - values=query_values,\ - ),\ - ),\ - limit=10,\ - ),\ - ], +client.scroll( + collection_name="{collection_name}", + scroll_filter=models.Filter( + must=[ + models.HasVectorCondition(has_vector="image"), + ], + ), ) - ``` -The result will be a pair of result lists, one for dense and one for sparse vectors. 
+```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; -Having those results, there are several ways to combine them: +var client = new QdrantClient("localhost", 6334); + +await client.ScrollAsync(collectionName: "{collection_name}", filter: HasVector("image")); +``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#mixing-or-fusion) Mixing or fusion +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" +) -You can mix the results from both dense and sparse vectors, based purely on their relative scores. This is a simple and effective approach, but it doesn’t take into account the semantic similarity between the results. Among the [popular mixing methods](https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18) are: +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.Scroll(context.Background(), &qdrant.ScrollPoints{ + CollectionName: "{collection_name}", + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewHasVector( + "image", + ), + }, + }, +}) ``` -- Reciprocal Ranked Fusion (RRF) -- Relative Score Fusion (RSF) -- Distribution-Based Score Fusion (DBSF) +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { "has_vector": "image" } + ] + } +} ``` -![Relative Score Fusion](https://qdrant.tech/articles_data/sparse-vectors/mixture.png) +```typescript +client.scroll("{collection_name}", { + filter: { + must: [ + { + has_vector: "image", + }, + ], + }, +}); +``` -Relative Score Fusion +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +use qdrant_client::Qdrant; -[Ranx](https://github.com/AmenRa/ranx) is a great library for mixing results from different sources. +let client = Qdrant::from_url("http://localhost:6334").build()?; + +client + .scroll( + ScrollPointsBuilder::new("{collection_name}") + .filter(Filter::must([Condition::has_vector("image")])), + ) + .await?; +``` -### [Anchor](https://qdrant.tech/articles/sparse-vectors/\#re-ranking) Re-ranking +```java +import java.util.List; -You can use obtained results as a first stage of a two-stage retrieval process. In the second stage, you can re-rank the results from the first stage using a more complex model, such as [Cross-Encoders](https://www.sbert.net/examples/applications/cross-encoder/README.html) or services like [Cohere Rerank](https://txt.cohere.com/rerank/). +import static io.qdrant.client.ConditionFactory.hasVector; +import static io.qdrant.client.PointIdFactory.id; -And that’s it! You’ve successfully achieved hybrid search with Qdrant! +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#additional-resources) Additional resources +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust(hasVector("image")) + .build()) + .build()) + .get(); +``` -For those who want to dive deeper, here are the top papers on the topic most of which have code available: +<|page-10-lllmstxt|> +# Optimizer -1. Problem Motivation: [Sparse Overcomplete Word Vector Representations](https://ar5iv.org/abs/1506.02004?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) -2. 
[SPLADE v2: Sparse Lexical and Expansion Model for Information Retrieval](https://ar5iv.org/abs/2109.10086?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) -3. [SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking](https://ar5iv.org/abs/2107.05720?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) -4. Late Interaction - [ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction](https://ar5iv.org/abs/2112.01488?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) -5. [SparseEmbed: Learning Sparse Lexical Representations with Contextual Embeddings for Retrieval](https://research.google/pubs/pub52289/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) +It is much more efficient to apply changes in batches than perform each change individually, as many other databases do. Qdrant here is no exception. Since Qdrant operates with data structures that are not always easy to change, it is sometimes necessary to rebuild those structures completely. -**Why just read when you can try it out?** +Storage optimization in Qdrant occurs at the segment level (see [storage](/documentation/concepts/storage/)). +In this case, the segment to be optimized remains readable for the time of the rebuild. -We’ve packed an easy-to-use Colab for you on how to make a Sparse Vector: [Sparse Vectors Single Encoder Demo](https://colab.research.google.com/drive/1wa2Yr5BCOgV0MTOFFTude99BOXCLHXky?usp=sharing). Run it, tinker with it, and start seeing the magic unfold in your projects. We can’t wait to hear how you use it! +![Segment optimization](/docs/optimization.svg) -## [Anchor](https://qdrant.tech/articles/sparse-vectors/\#conclusion) Conclusion +The availability is achieved by wrapping the segment into a proxy that transparently handles data changes. +Changed data is placed in the copy-on-write segment, which has priority for retrieval and subsequent updates. -Alright, folks, let’s wrap it up. Better search isn’t a ’nice-to-have,’ it’s a game-changer, and Qdrant can get you there. +## Vacuum Optimizer -Got questions? Our [Discord community](https://qdrant.to/discord?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) is teeming with answers. +The simplest example of a case where you need to rebuild a segment repository is to remove points. +Like many other databases, Qdrant does not delete entries immediately after a query. +Instead, it marks records as deleted and ignores them for future queries. -If you enjoyed reading this, why not sign up for our [newsletter](https://qdrant.tech/subscribe/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) to stay ahead of the curve. +This strategy allows us to minimize disk access - one of the slowest operations. +However, a side effect of this strategy is that, over time, deleted records accumulate, occupy memory and slow down the system. -And, of course, a big thanks to you, our readers, for pushing us to make ranking better for everyone. +To avoid these adverse effects, Vacuum Optimizer is used. +It is used if the segment has accumulated too many deleted records. -##### Was this page useful? +The criteria for starting the optimizer are defined in the configuration file. 
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Here is an example of parameter values: -Thank you for your feedback! 🙏 +```yaml +storage: + optimizers: + # The minimal fraction of deleted vectors in a segment, required to perform segment optimization + deleted_threshold: 0.2 + # The minimal number of vectors in a segment, required to perform segment optimization + vacuum_min_vector_number: 1000 +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/sparse-vectors.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Merge Optimizer -On this page: +The service may require the creation of temporary segments. +Such segments, for example, are created as copy-on-write segments during optimization itself. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/sparse-vectors.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +It is also essential to have at least one small segment that Qdrant will use to store frequently updated data. +On the other hand, too many small segments lead to suboptimal search performance. -× +The merge optimizer constantly tries to reduce the number of segments if there +currently are too many. The desired number of segments is specified +with `default_segment_number` and defaults to the number of CPUs. The optimizer +may takes at least the three smallest segments and merges them into one. -[Powered by](https://qdrant.tech/) +Segments will not be merged if they'll exceed the maximum configured segment +size with `max_segment_size_kb`. It prevents creating segments that are too +large to efficiently index. Increasing this number may help to reduce the number +of segments if you have a lot of data, and can potentially improve search performance. -<|page-14-lllmstxt|> -## rag-chatbot-red-hat-openshift-haystack -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Private Chatbot for Interactive Learning +The criteria for starting the optimizer are defined in the configuration file. -# [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#private-chatbot-for-interactive-learning) Private Chatbot for Interactive Learning +Here is an example of parameter values: -| Time: 120 min | Level: Advanced | | | -| --- | --- | --- | --- | +```yaml +storage: + optimizers: + # Target amount of segments optimizer will try to keep. + # Real amount of segments may vary depending on multiple parameters: + # - Amount of stored points + # - Current write RPS + # + # It is recommended to select default number of segments as a factor of the number of search threads, + # so that each segment would be handled evenly by one of the threads. + # If `default_segment_number = 0`, will be automatically selected by the number of available CPUs + default_segment_number: 0 -With chatbots, companies can scale their training programs to accommodate a large workforce, delivering consistent and standardized learning experiences across departments, locations, and time zones. Furthermore, having already completed their online training, corporate employees might want to refer back old course materials. 
Most of this information is proprietary to the company, and manually searching through an entire library of materials takes time. However, a chatbot built on this knowledge can respond in the blink of an eye. + # Do not create segments larger this size (in KiloBytes). + # Large segments might require disproportionately long indexation times, + # therefore it makes sense to limit the size of segments. + # + # If indexation speed have more priority for your - make this parameter lower. + # If search speed is more important - make this parameter higher. + # Note: 1Kb = 1 vector of size 256 + # If not set, will be automatically selected considering the number of available CPUs. + max_segment_size_kb: null +``` -With a simple RAG pipeline, you can build a private chatbot. In this tutorial, you will combine open source tools inside of a closed infrastructure and tie them together with a reliable framework. This custom solution lets you run a chatbot without public internet access. You will be able to keep sensitive data secure without compromising privacy. +## Indexing Optimizer -![OpenShift](https://qdrant.tech/documentation/examples/student-rag-haystack-red-hat-openshift-hc/openshift-diagram.png)**Figure 1:** The LLM and Qdrant Hybrid Cloud are containerized as separate services. Haystack combines them into a RAG pipeline and exposes the API via Hayhooks. +Qdrant allows you to choose the type of indexes and data storage methods used depending on the number of records. +So, for example, if the number of points is less than 10000, using any index would be less efficient than a brute force scan. -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#components) Components +The Indexing Optimizer is used to implement the enabling of indexes and memmap storage when the minimal amount of records is reached. -To maintain complete data isolation, we need to limit ourselves to open-source tools and use them in a private environment, such as [Red Hat OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift). The pipeline will run internally and will be inaccessible from the internet. +The criteria for starting the optimizer are defined in the configuration file. -- **Dataset:** [Red Hat Interactive Learning Portal](https://developers.redhat.com/learn), an online library of Red Hat course materials. -- **LLM:** `mistralai/Mistral-7B-Instruct-v0.1`, deployed as a standalone service on OpenShift. -- **Embedding Model:** `BAAI/bge-base-en-v1.5`, lightweight embedding model deployed from within the Haystack pipeline -with [FastEmbed](https://github.com/qdrant/fastembed) -- **Vector DB:** [Qdrant Hybrid Cloud](https://hybrid-cloud.qdrant.tech/) running on OpenShift. -- **Framework:** [Haystack 2.x](https://haystack.deepset.ai/) to connect all and [Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks) to serve the app through HTTP endpoints. +Here is an example of parameter values: -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#procedure) Procedure +```yaml +storage: + optimizers: + # Maximum size (in kilobytes) of vectors to store in-memory per segment. + # Segments larger than this threshold will be stored as read-only memmaped file. + # Memmap storage is disabled by default, to enable it, set this threshold to a reasonable value. + # To disable memmap storage, set this to `0`. 
+ # Note: 1Kb = 1 vector of size 256 + memmap_threshold: 200000 -The [Haystack](https://haystack.deepset.ai/) framework leverages two pipelines, which combine our components sequentially to process data. + # Maximum size (in kilobytes) of vectors allowed for plain index, exceeding this threshold will enable vector indexing + # Default value is 20,000, based on . + # To disable vector indexing, set to `0`. + # Note: 1kB = 1 vector of size 256. + indexing_threshold_kb: 20000 +``` -1. The **Indexing Pipeline** will run offline in batches, when new data is added or updated. -2. The **Search Pipeline** will retrieve information from Qdrant and use an LLM to produce an answer. +In addition to the configuration file, you can also set optimizer parameters separately for each [collection](/documentation/concepts/collections/). -> **Note:** We will define the pipelines in Python and then export them to YAML format, so that [Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks) can run them as a web service. +Dynamic parameter updates may be useful, for example, for more efficient initial loading of points. You can disable indexing during the upload process with these settings and enable it immediately after it is finished. As a result, you will not waste extra computation resources on rebuilding the index. -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#prerequisites) Prerequisites +<|page-11-lllmstxt|> +# Storage -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#deploy-the-llm-to-openshift) Deploy the LLM to OpenShift +All data within one collection is divided into segments. +Each segment has its independent vector and payload storage as well as indexes. -Follow the steps in [Chapter 6. Serving large language models](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.5/html/working_on_data_science_projects/serving-large-language-models_serving-large-language-models#doc-wrapper). This will download the LLM from the [HuggingFace](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1), and deploy it to OpenShift using a _single model serving platform_. +Data stored in segments usually do not overlap. +However, storing the same point in different segments will not cause problems since the search contains a deduplication mechanism. -Your LLM service will have a URL, which you need to store as an environment variable. +The segments consist of vector and payload storages, vector and payload [indexes](/documentation/concepts/indexing/), and id mapper, which stores the relationship between internal and external ids. -shellpython +A segment can be `appendable` or `non-appendable` depending on the type of storage and index used. +You can freely add, delete and query data in the `appendable` segment. +With `non-appendable` segment can only read and delete data. -```shell -export INFERENCE_ENDPOINT_URL="http://mistral-service.default.svc.cluster.local" +The configuration of the segments in the collection can be different and independent of one another, but at least one `appendable' segment must be present in a collection. -``` +## Vector storage -```python -import os +Depending on the requirements of the application, Qdrant can use one of the data storage options. +The choice has to be made between the search speed and the size of the RAM used. 
-os.environ["INFERENCE_ENDPOINT_URL"] = "http://mistral-service.default.svc.cluster.local" +**In-memory storage** - Stores all vectors in RAM, has the highest speed since disk access is required only for persistence. -``` +**Memmap storage** - Creates a virtual address space associated with the file on disk. [Wiki](https://en.wikipedia.org/wiki/Memory-mapped_file). +Mmapped files are not directly loaded into RAM. Instead, they use page cache to access the contents of the file. +This scheme allows flexible use of available memory. With sufficient RAM, it is almost as fast as in-memory storage. -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#launch-qdrant-hybrid-cloud) Launch Qdrant Hybrid Cloud -Complete **How to Set Up Qdrant on Red Hat OpenShift**. When in Hybrid Cloud, your Qdrant instance is private and and its nodes run on the same OpenShift infrastructure as your other components. +### Configuring Memmap storage -Retrieve your Qdrant URL and API key and store them as environment variables: +There are two ways to configure the usage of memmap(also known as on-disk) storage: -shellpython +- Set up `on_disk` option for the vectors in the collection create API: -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +*Available as of v1.2.0* -``` ```python -os.environ["QDRANT_URL"] = "https://qdrant.example.com" -os.environ["QDRANT_API_KEY"] = "your-api-key" - -``` - -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#implementation) Implementation +from qdrant_client import QdrantClient, models -We will first create an indexing pipeline to add documents to the system. -Then, the search pipeline will retrieve relevant data from our documents. -After the pipelines are tested, we will export them to YAML files. +client = QdrantClient(url="http://localhost:6333") -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#indexing-pipeline) Indexing pipeline +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=768, distance=models.Distance.COSINE, on_disk=True + ), +) +``` -[Haystack 2.x](https://haystack.deepset.ai/) comes packed with a lot of useful components, from data fetching, through -HTML parsing, up to the vector storage. Before we start, there are a few Python packages that we need to install: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```shell -pip install haystack-ai \ - qdrant-client \ - qdrant-haystack \ - fastembed-haystack +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + "{collection_name}", + new VectorParams + { + Size = 768, + Distance = Distance.Cosine, + OnDisk = true + } +); ``` -Our environment is now ready, so we can jump right into the code. 
Let’s define an empty pipeline and gradually add -components to it: +```go +import ( + "context" -```python -from haystack import Pipeline + "github.com/qdrant/go-client/qdrant" +) -indexing_pipeline = Pipeline() +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), +}) ``` -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#data-fetching-and-conversion) Data fetching and conversion +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + } +} +``` -In this step, we will use Haystack’s `LinkContentFetcher` to download course content from a list of URLs and store it in Qdrant for retrieval. -As we don’t want to store raw HTML, this tool will extract text content from each webpage. Then, the fetcher will divide them into digestible chunks, since the documents might be pretty long. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Let’s start with data fetching and text conversion: +const client = new QdrantClient({ host: "localhost", port: 6333 }); -```python -from haystack.components.fetchers import LinkContentFetcher -from haystack.components.converters import HTMLToDocument +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + on_disk: true, + }, +}); +``` -fetcher = LinkContentFetcher() -converter = HTMLToDocument() +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::Qdrant; -indexing_pipeline.add_component("fetcher", fetcher) -indexing_pipeline.add_component("converter", converter) +let client = Qdrant::from_url("http://localhost:6334").build()?; +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)), + ) + .await?; ``` -Our pipeline knows there are two components, but they are not connected yet. We need to define the flow between them: +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; -```python -indexing_pipeline.connect("fetcher.streams", "converter.sources") +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .createCollectionAsync( + "{collection_name}", + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .get(); ``` -Each component has a set of inputs and outputs which might be combined in a directed graph. The definitions of the -inputs and outputs are usually provided in the documentation of the component. The `LinkContentFetcher` has the -following parameters: - -![Parameters of the LinkContentFetcher](https://qdrant.tech/documentation/examples/student-rag-haystack-red-hat-openshift-hc/haystack-link-content-fetcher.png) +This will create a collection with all vectors immediately stored in memmap storage. +This is the recommended way, in case your Qdrant instance operates with fast disks and you are working with large collections. 
-_Source: [https://docs.haystack.deepset.ai/docs/linkcontentfetcher](https://docs.haystack.deepset.ai/docs/linkcontentfetcher)_ -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#chunking-and-creating-the-embeddings) Chunking and creating the embeddings +- Set up `memmap_threshold` option. This option will set the threshold after which the segment will be converted to memmap storage. -We used `HTMLToDocument` to convert the HTML sources into `Document` instances of Haystack, which is a -base class containing some data to be queried. However, a single document might be too long to be processed by the -embedding model, and it also carries way too much information to make the search relevant. +There are two ways to do this: -Therefore, we need to split the document into smaller parts and convert them into embeddings. For this, we will use the -`DocumentSplitter` and `FastembedDocumentEmbedder` pointed to our `BAAI/bge-base-en-v1.5` model: +1. You can set the threshold globally in the [configuration file](/documentation/guides/configuration/). The parameter is called `memmap_threshold` (previously `memmap_threshold_kb`). +2. You can set the threshold for each collection separately during [creation](/documentation/concepts/collections/#create-collection) or [update](/documentation/concepts/collections/#update-collection-parameters). ```python -from haystack.components.preprocessors import DocumentSplitter -from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder - -splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=2) -embedder = FastembedDocumentEmbedder(model="BAAI/bge-base-en-v1.5") -embedder.warm_up() - -indexing_pipeline.add_component("splitter", splitter) -indexing_pipeline.add_component("embedder", embedder) +from qdrant_client import QdrantClient, models -indexing_pipeline.connect("converter.documents", "splitter.documents") -indexing_pipeline.connect("splitter.documents", "embedder.documents") +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000), +) ``` -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#writing-data-to-qdrant) Writing data to Qdrant +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -The splitter will be producing chunks with a maximum length of 5 sentences, with an overlap of 2 sentences. Then, these -smaller portions will be converted into embeddings. +var client = new QdrantClient("localhost", 6334); -Finally, we need to store our embeddings in Qdrant. 
+await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + optimizersConfig: new OptimizersConfigDiff { MemmapThreshold = 20000 } +); +``` -```python -from haystack.utils import Secret -from haystack_integrations.document_stores.qdrant import QdrantDocumentStore -from haystack.components.writers import DocumentWriter +```go +import ( + "context" -document_store = QdrantDocumentStore( - os.environ["QDRANT_URL"], - api_key=Secret.from_env_var("QDRANT_API_KEY"), - index="red-hat-learning", - return_embedding=True, - embedding_dim=768, + "github.com/qdrant/go-client/qdrant" ) -writer = DocumentWriter(document_store=document_store) -indexing_pipeline.add_component("writer", writer) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -indexing_pipeline.connect("embedder.documents", "writer.documents") +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + MaxSegmentSize: qdrant.PtrOf(uint64(20000)), + }, +}) +``` +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" + }, + "optimizers_config": { + "memmap_threshold": 20000 + } +} ``` -Our pipeline is now complete. Haystack comes with a handy visualization of the pipeline, so you can see and verify the -connections between the components. It is displayed in the Jupyter notebook, but you can also export it to a file: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```python -indexing_pipeline.draw("indexing_pipeline.png") +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + optimizers_config: { + memmap_threshold: 20000, + }, +}); ``` -![Structure of the indexing pipeline](https://qdrant.tech/documentation/examples/student-rag-haystack-red-hat-openshift-hc/indexing_pipeline.png) - -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#test-the-entire-pipeline) Test the entire pipeline - -We can finally run it on a list of URLs to index the content in Qdrant. 
We have a bunch of URLs to all the Red Hat -OpenShift Foundations course lessons, so let’s use them: - -```python -course_urls = [\ - "https://developers.redhat.com/learn/openshift/foundations-openshift",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:openshift-and-developer-sandbox",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:overview-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:use-terminal-window-within-red-hat-openshift-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-github-repository-using-openshift-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-repository-using-openshift-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-using-oc-cli-tool",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-using-oc-cli-tool",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:scale-applications-using-openshift-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:scale-applications-using-oc-cli-tool",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-using-oc-cli-tool",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-web-console",\ - "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:view-performance-information-using-openshift-web-console",\ -] +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -indexing_pipeline.run(data={ - "fetcher": { - "urls": course_urls, - } -}) +let client = Qdrant::from_url("http://localhost:6334").build()?; +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .optimizers_config(OptimizersConfigDiffBuilder::default().memmap_threshold(20000)), + ) + .await?; ``` -The execution might take a while, as the model needs to process all the documents. After the process is finished, we -should have all the documents stored in Qdrant, ready for search. 
You should see a short summary of processed documents: +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -```shell -{'writer': {'documents_written': 381}} +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder().setMemmapThreshold(20000).build()) + .build()) + .get(); ``` -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#search-pipeline) Search pipeline - -Our documents are now indexed and ready for search. The next pipeline is a bit simpler, but we still need to define a -few components. Let’s start again with an empty pipeline: - -```python -search_pipeline = Pipeline() +The rule of thumb to set the memmap threshold parameter is simple: -``` +- if you have a balanced use scenario - set memmap threshold the same as `indexing_threshold` (default is 20000). In this case the optimizer will not make any extra runs and will optimize all thresholds at once. +- if you have a high write load and low RAM - set memmap threshold lower than `indexing_threshold` to e.g. 10000. In this case the optimizer will convert the segments to memmap storage first and will only apply indexing after that. -Our second process takes user input, converts it into embeddings and then searches for the most relevant documents -using the query embedding. This might look familiar, but we arent working with `Document` instances -anymore, since the query only accepts raw text. Thus, some of the components will be different, especially the embedder, -as it has to accept a single string as an input and produce a single embedding as an output: +In addition, you can use memmap storage not only for vectors, but also for HNSW index. +To enable this, you need to set the `hnsw_config.on_disk` parameter to `true` during collection [creation](/documentation/concepts/collections/#create-a-collection) or [updating](/documentation/concepts/collections/#update-collection-parameters). 
```python -from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder -from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever +from qdrant_client import QdrantClient, models -query_embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") -query_embedder.warm_up() +client = QdrantClient(url="http://localhost:6333") -retriever = QdrantEmbeddingRetriever( - document_store=document_store, # The same document store as the one used for indexing - top_k=3, # Number of documents to return +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), + hnsw_config=models.HnswConfigDiff(on_disk=True), ) +``` -search_pipeline.add_component("query_embedder", query_embedder) -search_pipeline.add_component("retriever", retriever) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -search_pipeline.connect("query_embedder.embedding", "retriever.query_embedding") +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, + hnswConfig: new HnswConfigDiff { OnDisk = true } +); ``` -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#run-a-test-query) Run a test query - -If our goal was to just retrieve the relevant documents, we could stop here. Let’s try the current pipeline on a simple -query: +```go +import ( + "context" -```python -query = "How to install an application using the OpenShift web console?" + "github.com/qdrant/go-client/qdrant" +) -search_pipeline.run(data={ - "query_embedder": { - "text": query - } +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, }) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), + HnswConfig: &qdrant.HnswConfigDiff{ + OnDisk: qdrant.PtrOf(true), + }, +}) ``` -We set the `top_k` parameter to 3, so the retriever should return the three most relevant documents. Your output should look like this: - -```text +```http +PUT /collections/{collection_name} { - 'retriever': { - 'documents': [\ - Document(id=867b4aa4c37a91e72dc7ff452c47972c1a46a279a7531cd6af14169bcef1441b, content: 'Install a Node.js application from GitHub using the web console The following describes the steps r...', meta: {'content_type': 'text/html', 'source_id': 'f56e8f827dda86abe67c0ba3b4b11331d896e2d4f7b2b43c74d3ce973d07be0c', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-web-console'}, score: 0.9209432),\ - Document(id=0c74381c178597dd91335ebfde790d13bf5989b682d73bf5573c7734e6765af7, content: 'How to remove an application from OpenShift using the web console. 
In addition to providing the cap...', meta: {'content_type': 'text/html', 'source_id': '2a0759f3ce4a37d9f5c2af9c0ffcc80879077c102fb8e41e576e04833c9d24ce', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-repository-using-openshift-web-console'}, score: 0.9132109500000001),\ - Document(id=3e5f8923a34ab05611ef20783211e5543e880c709fd6534d9c1f63576edc4061, content: 'Path resource: Install an application from source code in a GitHub repository using the OpenShift w...', meta: {'content_type': 'text/html', 'source_id': 'a4c4cd62d07c0d9d240e3289d2a1cc0a3d1127ae70704529967f715601559089', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-github-repository-using-openshift-web-console'}, score: 0.912748935)\ - ] + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + }, + "hnsw_config": { + "on_disk": true } } - ``` -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#generating-the-answer) Generating the answer - -Retrieval should serve more than just documents. Therefore, we will need to use an LLM to generate exact answers to our question. -This is the final component of our second pipeline. - -Haystack will create a prompt which adds your documents to the model’s context. - -```python -from haystack.components.builders.prompt_builder import PromptBuilder -from haystack.components.generators import HuggingFaceTGIGenerator - -prompt_builder = PromptBuilder(""" -Given the following information, answer the question. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Context: -{% for document in documents %} - {{ document.content }} -{% endfor %} +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Question: {{ query }} -""") -llm = HuggingFaceTGIGenerator( - model="mistralai/Mistral-7B-Instruct-v0.1", - url=os.environ["INFERENCE_ENDPOINT_URL"], - generation_kwargs={ - "max_new_tokens": 1000, # Allow longer responses - }, -) +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + on_disk: true, + }, + hnsw_config: { + on_disk: true, + }, +}); +``` -search_pipeline.add_component("prompt_builder", prompt_builder) -search_pipeline.add_component("llm", llm) +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, HnswConfigDiffBuilder, + VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -search_pipeline.connect("retriever.documents", "prompt_builder.documents") -search_pipeline.connect("prompt_builder.prompt", "llm.prompt") +let client = Qdrant::from_url("http://localhost:6334").build()?; +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)) + .hnsw_config(HnswConfigDiffBuilder::default().on_disk(true)), + ) + .await?; ``` -The `PromptBuilder` is a Jinja2 template that will be filled with the documents and the query. The -`HuggingFaceTGIGenerator` connects to the LLM service and generates the answer. Let’s run the pipeline again: - -```python -query = "How to install an application using the OpenShift web console?" 
+```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.HnswConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -response = search_pipeline.run(data={ - "query_embedder": { - "text": query - }, - "prompt_builder": { - "query": query - }, -}) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .build()) + .setHnswConfig(HnswConfigDiff.newBuilder().setOnDisk(true).build()) + .build()) + .get(); ``` -The LLM may provide multiple replies, if asked to do so, so let’s iterate over and print them out: - -```python -for reply in response["llm"]["replies"]: - print(reply.strip()) +## Payload storage -``` +Qdrant supports two types of payload storages: InMemory and OnDisk. -In our case there is a single response, which should be the answer to the question: +InMemory payload storage is organized in the same way as in-memory vectors. +The payload data is loaded into RAM at service startup while disk and [Gridstore](/articles/gridstore-key-value-storage/) are used for persistence only. +This type of storage works quite fast, but it may require a lot of space to keep all the data in RAM, especially if the payload has large values attached - abstracts of text or even images. -```text -Answer: To install an application using the OpenShift web console, follow these steps: +In the case of large payload values, it might be better to use OnDisk payload storage. +This type of storage will read and write payload directly to RocksDB, so it won't require any significant amount of RAM to store. +The downside, however, is the access latency. +If you need to query vectors with some payload-based conditions - checking values stored on disk might take too much time. +In this scenario, we recommend creating a payload index for each field used in filtering conditions to avoid disk access. +Once you create the field index, Qdrant will preserve all values of the indexed field in RAM regardless of the payload storage type. -1. Select +Add on the left side of the web console. -2. Identify the container image to install. -3. Using your web browser, navigate to the Developer Sandbox for Red Hat OpenShift and select Start your Sandbox for free. -4. Install an application from source code stored in a GitHub repository using the OpenShift web console. +You can specify the desired type of payload storage with [configuration file](/documentation/guides/configuration/) or with collection parameter `on_disk_payload` during [creation](/documentation/concepts/collections/#create-collection) of the collection. -``` +## Versioning -Our final search pipeline might also be visualized, so we can see how the components are glued together: +To ensure data integrity, Qdrant performs all data changes in 2 stages. +In the first step, the data is written to the Write-ahead-log(WAL), which orders all operations and assigns them a sequential number. 
-```python -search_pipeline.draw("search_pipeline.png") +Once a change has been added to the WAL, it will not be lost even if a power loss occurs. +Then the changes go into the segments. +Each segment stores the last version of the change applied to it as well as the version of each individual point. +If the new change has a sequential number less than the current version of the point, the updater will ignore the change. +This mechanism allows Qdrant to safely and efficiently restore the storage from the WAL in case of an abnormal shutdown. -``` +<|page-12-lllmstxt|> +# Indexing -![Structure of the search pipeline](https://qdrant.tech/documentation/examples/student-rag-haystack-red-hat-openshift-hc/search_pipeline.png) +A key feature of Qdrant is the effective combination of vector and traditional indexes. It is essential to have this because for vector search to work effectively with filters, having a vector index only is not enough. In simpler terms, a vector index speeds up vector search, and payload indexes speed up filtering. -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#deployment) Deployment +The indexes in the segments exist independently, but the parameters of the indexes themselves are configured for the whole collection. -The pipelines are now ready, and we can export them to YAML. Hayhooks will use these files to run the -pipelines as HTTP endpoints. To do this, specify both file paths and your environment variables. +Not all segments automatically have indexes. +Their necessity is determined by the [optimizer](/documentation/concepts/optimizer/) settings and depends, as a rule, on the number of stored points. -> Note: The indexing pipeline might be run inside your ETL tool, but search should be definitely exposed as an HTTP endpoint. +## Payload Index -Let’s run it on the local machine: +Payload index in Qdrant is similar to the index in conventional document-oriented databases. +This index is built for a specific field and type, and is used for quick point requests by the corresponding filtering condition. -```shell -pip install hayhooks +The index is also used to accurately estimate the filter cardinality, which helps the [query planning](/documentation/concepts/search/#query-planning) choose a search strategy. -``` +Creating an index requires additional computational resources and memory, so choosing fields to be indexed is essential. Qdrant does not make this choice but grants it to the user. -First of all, we need to save the pipelines to the YAML file: +To mark a field as indexable, you can use the following: ```python -with open("search-pipeline.yaml", "w") as fp: - search_pipeline.dump(fp) - -``` - -And now we are able to run the Hayhooks service: - -```shell -hayhooks run - +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema="keyword", +) ``` -The command should start the service on the default port, so you can access it at `http://localhost:1416`. 
The pipeline -is not deployed yet, but we can do it with just another command: +```csharp +using Qdrant.Client; -```shell -hayhooks deploy search-pipeline.yaml +var client = new QdrantClient("localhost", 6334); +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index" +); ``` -Once it’s finished, you should be able to see the OpenAPI documentation at -[http://localhost:1416/docs](http://localhost:1416/docs), and test the newly created endpoint. - -![Search pipeline in the OpenAPI documentation](https://qdrant.tech/documentation/examples/student-rag-haystack-red-hat-openshift-hc/hayhooks-openapi.png) +```go +import ( + "context" -Our search is now accessible through the HTTP endpoint, so we can integrate it with any other service. We can even -control the other parameters, like the number of documents to return: + "github.com/qdrant/go-client/qdrant" +) -```shell -curl -X 'POST' \ - 'http://localhost:1416/search-pipeline' \ - -H 'Accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "llm": { - }, - "prompt_builder": { - "query": "How can I remove an application?" - }, - "query_embedder": { - "text": "How can I remove an application?" - }, - "retriever": { - "top_k": 5 - } -}' +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), +}) ``` -The response should be similar to the one we got in the Python before: - -```json +```http +PUT /collections/{collection_name}/index { - "llm": { - "replies": [\ - "\n\nAnswer: You can remove an application running in OpenShift by right-clicking on the circular graphic representing the application in Topology view and selecting the Delete Application text from the dialog that appears when you click the graphic’s outer ring. Alternatively, you can use the oc CLI tool to delete an installed application using the oc delete all command."\ - ], - "meta": [\ - {\ - "model": "mistralai/Mistral-7B-Instruct-v0.1",\ - "index": 0,\ - "finish_reason": "eos_token",\ - "usage": {\ - "completion_tokens": 75,\ - "prompt_tokens": 642,\ - "total_tokens": 717\ - }\ - }\ - ] - } + "field_name": "name_of_the_field_to_index", + "field_schema": "keyword" } - ``` -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/\#next-steps) Next steps - -- In this example, [Red Hat OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift) is the infrastructure of choice for proprietary chatbots. [Read more](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.8) about how to host AI projects in their [extensive documentation](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.8). +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: "keyword", +}); +``` -- [Haystack’s documentation](https://docs.haystack.deepset.ai/docs/kubernetes) describes [how to deploy the Hayhooks service in a Kubernetes\\ -environment](https://docs.haystack.deepset.ai/docs/kubernetes), so you can easily move it to your own OpenShift infrastructure. 
+```rust +use qdrant_client::qdrant::{CreateFieldIndexCollectionBuilder, FieldType}; -- If you are just getting started and need more guidance on Qdrant, read the [quickstart](https://qdrant.tech/documentation/quick-start/) or try out our [beginner tutorial](https://qdrant.tech/documentation/tutorials/neural-search/). +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Keyword, + ) + .wait(true), + ) + .await?; +``` +```java +import io.qdrant.client.grpc.Collections.PayloadSchemaType; -##### Was this page useful? +client.createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Keyword, + null, + true, + null, + null); +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +You can use dot notation to specify a nested field for indexing. Similar to specifying [nested filters](/documentation/concepts/filtering/#nested-key). -Thank you for your feedback! 🙏 +Available field types are: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-red-hat-openshift-haystack.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +* `keyword` - for [keyword](/documentation/concepts/payload/#keyword) payload, affects [Match](/documentation/concepts/filtering/#match) filtering conditions. +* `integer` - for [integer](/documentation/concepts/payload/#integer) payload, affects [Match](/documentation/concepts/filtering/#match) and [Range](/documentation/concepts/filtering/#range) filtering conditions. +* `float` - for [float](/documentation/concepts/payload/#float) payload, affects [Range](/documentation/concepts/filtering/#range) filtering conditions. +* `bool` - for [bool](/documentation/concepts/payload/#bool) payload, affects [Match](/documentation/concepts/filtering/#match) filtering conditions (available as of v1.4.0). +* `geo` - for [geo](/documentation/concepts/payload/#geo) payload, affects [Geo Bounding Box](/documentation/concepts/filtering/#geo-bounding-box) and [Geo Radius](/documentation/concepts/filtering/#geo-radius) filtering conditions. +* `datetime` - for [datetime](/documentation/concepts/payload/#datetime) payload, affects [Range](/documentation/concepts/filtering/#range) filtering conditions (available as of v1.8.0). +* `text` - a special kind of index, available for [keyword](/documentation/concepts/payload/#keyword) / string payloads, affects [Full Text search](/documentation/concepts/filtering/#full-text-match) filtering conditions. Read more about [text index configuration](#full-text-index) +* `uuid` - a special type of index, similar to `keyword`, but optimized for [UUID values](/documentation/concepts/payload/#uuid). +Affects [Match](/documentation/concepts/filtering/#match) filtering conditions. (available as of v1.11.0) + +Payload index may occupy some additional memory, so it is recommended to only use the index for those fields that are used in filtering conditions. +If you need to filter by many fields and the memory limits do not allow for indexing all of them, it is recommended to choose the field that limits the search result the most. +As a rule, the more different values a payload value has, the more efficiently the index will be used. 
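+To illustrate the dot-notation rule mentioned above, here is a minimal sketch (Python client; `metadata.category` is a hypothetical nested field used only for illustration) of indexing a nested key by its dotted path:
+
+```python
+from qdrant_client import QdrantClient
+
+client = QdrantClient(url="http://localhost:6333")
+
+# "metadata.category" is a placeholder: the dotted path addresses the
+# "category" key nested inside the "metadata" payload object.
+client.create_payload_index(
+    collection_name="{collection_name}",
+    field_name="metadata.category",
+    field_schema="keyword",
+)
+```
+
+Filtering then uses the same dotted key, as described on the nested filters page linked above.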
-On this page: +### Parameterized index -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-red-hat-openshift-haystack.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +*Available as of v1.8.0* -× +We've added a parameterized variant to the `integer` index, which allows +you to fine-tune indexing and search performance. -[Powered by](https://qdrant.tech/) +Both the regular and parameterized `integer` indexes use the following flags: -<|page-15-lllmstxt|> -## qdrant-1.7.x -- [Articles](https://qdrant.tech/articles/) -- Qdrant 1.7.0 has just landed! +- `lookup`: enables support for direct lookup using + [Match](/documentation/concepts/filtering/#match) filters. +- `range`: enables support for + [Range](/documentation/concepts/filtering/#range) filters. -[Back to Qdrant Articles](https://qdrant.tech/articles/) +The regular `integer` index assumes both `lookup` and `range` are `true`. In +contrast, to configure a parameterized index, you would set only one of these +filters to `true`: -# Qdrant 1.7.0 has just landed! +| `lookup` | `range` | Result | +|----------|---------|-----------------------------| +| `true` | `true` | Regular integer index | +| `true` | `false` | Parameterized integer index | +| `false` | `true` | Parameterized integer index | +| `false` | `false` | No integer index | -Kacper Ɓukawski +The parameterized index can enhance performance in collections with millions +of points. We encourage you to try it out. If it does not enhance performance +in your use case, you can always restore the regular `integer` index. -· +Note: If you set `"lookup": true` with a range filter, that may lead to +significant performance issues. -December 10, 2023 +For example, the following code sets up a parameterized integer index which +supports only range filters: -![Qdrant 1.7.0 has just landed!](https://qdrant.tech/articles_data/qdrant-1.7.x/preview/title.jpg) +```python +from qdrant_client import QdrantClient, models -Please welcome the long-awaited [Qdrant 1.7.0 release](https://github.com/qdrant/qdrant/releases/tag/v1.7.0). Except for a handful of minor fixes and improvements, this release brings some cool brand-new features that we are excited to share! -The latest version of your favorite vector search engine finally supports **sparse vectors**. That’s the feature many of you requested, so why should we ignore it? -We also decided to continue our journey with [vector similarity beyond search](https://qdrant.tech/articles/vector-similarity-beyond-search/). The new Discovery API covers some utterly new use cases. We’re more than excited to see what you will build with it! -But there is more to it! Check out what’s new in **Qdrant 1.7.0**! +client = QdrantClient(url="http://localhost:6333") -1. Sparse vectors: do you want to use keyword-based search? Support for sparse vectors is finally here! -2. Discovery API: an entirely new way of using vectors for restricted search and exploration. -3. User-defined sharding: you can now decide which points should be stored on which shard. -4. Snapshot-based shard transfer: a new option for moving shards between nodes. +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.IntegerIndexParams( + type=models.IntegerIndexType.INTEGER, + lookup=False, + range=True, + ), +) +``` -Do you see something missing? 
Your feedback drives the development of Qdrant, so do not hesitate to [join our Discord community](https://qdrant.to/discord) and help us build the best vector search engine out there! +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -## [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#new-features) New features +var client = new QdrantClient("localhost", 6334); -Qdrant 1.7.0 brings a bunch of new features. Let’s take a closer look at them! +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index", + schemaType: PayloadSchemaType.Integer, + indexParams: new PayloadIndexParams + { + IntegerIndexParams = new() + { + Lookup = false, + Range = true + } + } +); +``` -### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#sparse-vectors) Sparse vectors +```go +import ( + "context" -Traditional keyword-based search mechanisms often rely on algorithms like TF-IDF, BM25, or comparable methods. While these techniques internally utilize vectors, they typically involve sparse vector representations. In these methods, the **vectors are predominantly filled with zeros, containing a relatively small number of non-zero values**. -Those sparse vectors are theoretically high dimensional, definitely way higher than the dense vectors used in semantic search. However, since the majority of dimensions are usually zeros, we store them differently and just keep the non-zero dimensions. + "github.com/qdrant/go-client/qdrant" +) -Until now, Qdrant has not been able to handle sparse vectors natively. Some were trying to convert them to dense vectors, but that was not the best solution or a suggested way. We even wrote a piece with [our thoughts on building a hybrid search](https://qdrant.tech/articles/hybrid-search/), and we encouraged you to use a different tool for keyword lookup. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Things have changed since then, as so many of you wanted a single tool for sparse and dense vectors. And responding to this [popular](https://github.com/qdrant/qdrant/issues/1678) [demand](https://github.com/qdrant/qdrant/issues/1135), we’ve now introduced sparse vectors! +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeInteger.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsInt( + &qdrant.IntegerIndexParams{ + Lookup: false, + Range: true, + }), +}) +``` -If you’re coming across the topic of sparse vectors for the first time, our [Brief History of Search](https://qdrant.tech/documentation/overview/vector-search/) explains the difference between sparse and dense vectors. +```http +PUT /collections/{collection_name}/index +{ + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "integer", + "lookup": false, + "range": true + } +} +``` -Check out the [sparse vectors article](https://qdrant.tech/articles/sparse-vectors/) and [sparse vectors index docs](https://qdrant.tech/documentation/concepts/indexing/#sparse-vector-index) for more details on what this new index means for Qdrant users. 
+```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#discovery-api) Discovery API +const client = new QdrantClient({ host: "localhost", port: 6333 }); -The recently launched [Discovery API](https://qdrant.tech/documentation/concepts/explore/#discovery-api) extends the range of scenarios for leveraging vectors. While its interface mirrors the [Recommendation API](https://qdrant.tech/documentation/concepts/explore/#recommendation-api), it focuses on refining the search parameters for greater precision. -The concept of ‘context’ refers to a collection of positive-negative pairs that define zones within a space. Each pair effectively divides the space into positive or negative segments. This concept guides the search operation to prioritize points based on their inclusion within positive zones or their avoidance of negative zones. Essentially, the search algorithm favors points that fall within multiple positive zones or steer clear of negative ones. +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "integer", + lookup: false, + range: true, + }, +}); +``` -The Discovery API can be used in two ways - either with or without the target point. The first case is called a **discovery search**, while the second is called a **context search**. +```rust +use qdrant_client::qdrant::{ + payload_index_params::IndexParams, CreateFieldIndexCollectionBuilder, FieldType, + IntegerIndexParams, PayloadIndexParams, +}; +use qdrant_client::Qdrant; -#### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#discovery-search) Discovery search +let client = Qdrant::from_url("http://localhost:6334").build()?; -_Discovery search_ is an operation that uses a target point to find the most relevant points in the collection, while performing the search in the preferred areas only. That is basically a search operation with more control over the search space. +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Integer, + ) + .field_index_params(PayloadIndexParams { + index_params: Some(IndexParams::IntegerIndexParams(IntegerIndexParams { + lookup: false, + range: true, + })), + }), + ) + .await?; +``` -![Discovery search visualization](https://qdrant.tech/articles_data/qdrant-1.7.x/discovery-search.png) +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.IntegerIndexParams; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; -Please refer to the [Discovery API documentation on discovery search](https://qdrant.tech/documentation/concepts/explore/#discovery-search) for more details and the internal mechanics of the operation. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -#### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#context-search) Context search +client + .createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Integer, + PayloadIndexParams.newBuilder() + .setIntegerIndexParams( + IntegerIndexParams.newBuilder().setLookup(false).setRange(true).build()) + .build(), + null, + null, + null) + .get(); +``` -The mode of _context search_ is similar to the discovery search, but it does not use a target point. 
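+The inverse configuration from the table above, a lookup-only parameterized index for fields that are only ever used with `Match` conditions, can be set up the same way with the two flags swapped. This is a minimal sketch mirroring the Python example above:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Lookup-only variant: direct Match lookups are supported, Range filters are not.
+client.create_payload_index(
+    collection_name="{collection_name}",
+    field_name="name_of_the_field_to_index",
+    field_schema=models.IntegerIndexParams(
+        type=models.IntegerIndexType.INTEGER,
+        lookup=True,
+        range=False,
+    ),
+)
+```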
Instead, the `context` is used to navigate the [HNSW graph](https://arxiv.org/abs/1603.09320) towards preferred zones. It is expected that the results in that mode will be diverse, and not centered around one point. -_Context Search_ could serve as a solution for individuals seeking a more exploratory approach to navigate the vector space. +### On-disk payload index -![Context search visualization](https://qdrant.tech/articles_data/qdrant-1.7.x/context-search.png) +*Available as of v1.11.0* -### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#user-defined-sharding) User-defined sharding +By default all payload-related structures are stored in memory. In this way, the vector index can quickly access payload values during search. +As latency in this case is critical, it is recommended to keep hot payload indexes in memory. -Qdrant’s collections are divided into shards. A single **shard** is a self-contained store of points, which can be moved between nodes. Up till now, the points were distributed among shards by using a consistent hashing algorithm, so that shards were managing non-intersecting subsets of points. -The latter one remains true, but now you can define your own sharding and decide which points should be stored on which shard. Sounds cool, right? But why would you need that? Well, there are multiple scenarios in which you may want to use custom sharding. For example, you may want to store some points on a dedicated node, or you may want to store points from the same user on the same shard and +There are, however, cases when payload indexes are too large or rarely used. In those cases, it is possible to store payload indexes on disk. -While the existing behavior is still the default one, you can now define the shards when you create a collection. Then, you can assign each point to a shard by providing a `shard_key` in the `upsert` operation. What’s more, you can also search over the selected shards only, by providing the `shard_key` parameter in the search operation. + -```http -POST /collections/my_collection/points/search -{ - "vector": [0.29, 0.81, 0.75, 0.11], - "shard_key": ["cats", "dogs"], - "limit": 10, - "with_payload": true, -} +To configure on-disk payload index, you can use the following index parameters: +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="payload_field_name", + field_schema=models.KeywordIndexParams( + type=models.KeywordIndexType.KEYWORD, + on_disk=True, + ), +) ``` -If you want to know more about the user-defined sharding, please refer to the [sharding documentation](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding). - -### [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#snapshot-based-shard-transfer) Snapshot-based shard transfer - -That’s a really more in depth technical improvement for the distributed mode users, that we implemented a new options the shard transfer mechanism. The new approach is based on the snapshot of the shard, which is transferred to the target node. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Moving shards is required for dynamical scaling of the cluster. Your data can migrate between nodes, and the way you move it is crucial for the performance of the whole system. The good old `stream_records` method (still the default one) transmits all the records between the machines and indexes them on the target node. -In the case of moving the shard, it’s necessary to recreate the HNSW index each time. 
However, with the introduction of the new `snapshot` approach, the snapshot itself, inclusive of all data and potentially quantized content, is transferred to the target node. This comprehensive snapshot includes the entire index, enabling the target node to seamlessly load it and promptly begin handling requests without the need for index recreation. +var client = new QdrantClient("localhost", 6334); -There are multiple scenarios in which you may prefer one over the other. Please check out the docs of the [shard transfer method](https://qdrant.tech/documentation/guides/distributed_deployment/#shard-transfer-method) for more details and head-to-head comparison. As for now, the old `stream_records` method is still the default one, but we may decide to change it in the future. +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "payload_field_name", + schemaType: PayloadSchemaType.Keyword, + indexParams: new PayloadIndexParams + { + KeywordIndexParams = new KeywordIndexParams + { + OnDisk = true + } + } +); +``` -## [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#minor-improvements) Minor improvements +```go +import ( + "context" -Beyond introducing new features, Qdrant 1.7.0 enhances performance and addresses various minor issues. Here’s a rundown of the key improvements: + "github.com/qdrant/go-client/qdrant" +) -1. Improvement of HNSW Index Building on High CPU Systems ( [PR#2869](https://github.com/qdrant/qdrant/pull/2869)). +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -2. Improving [Search Tail Latencies](https://github.com/qdrant/qdrant/pull/2931): improvement for high CPU systems with many parallel searches, directly impacting the user experience by reducing latency. +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsKeyword( + &qdrant.KeywordIndexParams{ + OnDisk: qdrant.PtrOf(true), + }), +}) +``` -3. [Adding Index for Geo Map Payloads](https://github.com/qdrant/qdrant/pull/2768): index for geo map payloads can significantly improve search performance, especially for applications involving geographical data. +```http +PUT /collections/{collection_name}/index +{ + "field_name": "payload_field_name", + "field_schema": { + "type": "keyword", + "on_disk": true + } +} +``` -4. Stability of Consensus on Big High Load Clusters: enhancing the stability of consensus in large, high-load environments is critical for ensuring the reliability and scalability of the system ( [PR#3013](https://github.com/qdrant/qdrant/pull/3013), [PR#3026](https://github.com/qdrant/qdrant/pull/3026), [PR#2942](https://github.com/qdrant/qdrant/pull/2942), [PR#3103](https://github.com/qdrant/qdrant/pull/3103), [PR#3054](https://github.com/qdrant/qdrant/pull/3054)). +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "payload_field_name", + field_schema: { + type: "keyword", + on_disk: true + }, +}); +``` -5. Configurable Timeout for Searches: allowing users to configure the timeout for searches provides greater flexibility and can help optimize system performance under different operational conditions ( [PR#2748](https://github.com/qdrant/qdrant/pull/2748), [PR#2771](https://github.com/qdrant/qdrant/pull/2771)). 
+```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + KeywordIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; +let client = Qdrant::from_url("http://localhost:6334").build()?; -## [Anchor](https://qdrant.tech/articles/qdrant-1.7.x/\#release-notes) Release notes +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "payload_field_name", + FieldType::Keyword, + ) + .field_index_params( + KeywordIndexParamsBuilder::default() + .on_disk(true), + ), +); +``` -[Our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.7.0) are a place to go if you are interested in more details. Please remember that Qdrant is an open source project, so feel free to [contribute](https://github.com/qdrant/qdrant/issues)! +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.KeywordIndexParams; -##### Was this page useful? +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +client + .createPayloadIndexAsync( + "{collection_name}", + "payload_field_name", + PayloadSchemaType.Keyword, + PayloadIndexParams.newBuilder() + .setKeywordIndexParams( + KeywordIndexParams.newBuilder() + .setOnDisk(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -Thank you for your feedback! 🙏 +Payload index on-disk is supported for the following types: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.7.x.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +* `keyword` +* `integer` +* `float` +* `datetime` +* `uuid` +* `text` +* `geo` -On this page: +The list will be extended in future versions. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.7.x.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Tenant Index -× +*Available as of v1.11.0* -[Powered by](https://qdrant.tech/) +Many vector search use-cases require multitenancy. In a multi-tenant scenario the collection is expected to contain multiple subsets of data, where each subset belongs to a different tenant. -<|page-16-lllmstxt|> -## logging-monitoring -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- Logging & Monitoring +Qdrant supports efficient multi-tenant search by enabling [special configuration](/documentation/guides/multiple-partitions/) vector index, which disables global search and only builds sub-indexes for each tenant. -# [Anchor](https://qdrant.tech/documentation/private-cloud/logging-monitoring/\#configuring-logging--monitoring-in-qdrant-private-cloud) Configuring Logging & Monitoring in Qdrant Private Cloud + -## [Anchor](https://qdrant.tech/documentation/private-cloud/logging-monitoring/\#logging) Logging +However, knowing that the collection contains multiple tenants unlocks more opportunities for optimization. +To optimize storage in Qdrant further, you can enable tenant indexing for payload fields. 
-You can access the logs with kubectl or the Kubernetes log management tool of your choice. For example: +This option will tell Qdrant which fields are used for tenant identification and will allow Qdrant to structure storage for faster search of tenant-specific data. +One example of such optimization is localizing tenant-specific data closer on disk, which will reduce the number of disk reads during search. -```bash -kubectl -n qdrant-private-cloud logs -l app=qdrant,cluster-id=a7d8d973-0cc5-42de-8d7b-c29d14d24840 +To enable tenant index for a field, you can use the following index parameters: +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="payload_field_name", + field_schema=models.KeywordIndexParams( + type=models.KeywordIndexType.KEYWORD, + is_tenant=True, + ), +) ``` -**Configuring log levels:** You can configure log levels for the databases individually through the QdrantCluster spec. Example: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.11.3" - size: 1 - resources: - cpu: 100m - memory: "1Gi" - storage: "2Gi" - config: - log_level: "DEBUG" +var client = new QdrantClient("localhost", 6334); +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "payload_field_name", + schemaType: PayloadSchemaType.Keyword, + indexParams: new PayloadIndexParams + { + KeywordIndexParams = new KeywordIndexParams + { + IsTenant = true + } + } +); ``` -### [Anchor](https://qdrant.tech/documentation/private-cloud/logging-monitoring/\#integrating-with-a-log-management-system) Integrating with a log management system +```go +import ( + "context" -You can integrate the logs into any log management system that supports Kubernetes. There are no Qdrant specific configurations necessary. Just configure the agents of your system to collect the logs from all Pods in the Qdrant namespace. + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsKeyword( + &qdrant.KeywordIndexParams{ + IsTenant: qdrant.PtrOf(true), + }), +}) +``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/logging-monitoring/\#monitoring) Monitoring +```http +PUT /collections/{collection_name}/index +{ + "field_name": "payload_field_name", + "field_schema": { + "type": "keyword", + "is_tenant": true + } +} +``` -The Qdrant Cloud console gives you access to basic metrics about CPU, memory and disk usage of your Qdrant clusters. 
+```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "payload_field_name", + field_schema: { + type: "keyword", + is_tenant: true + }, +}); +``` -If you want to integrate the Qdrant metrics into your own monitoring system, you can instruct it to scrape the following endpoints that provide metrics in a Prometheus/OpenTelemetry compatible format: +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + KeywordIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; -- `/metrics` on port 6333 of every Qdrant database Pod, this provides metrics about each the database and its internals itself -- `/metrics` on port 9290 of the Qdrant Operator Pod, this provides metrics about the Operator, as well as the status of Qdrant Clusters and Snapshots -- For metrics about the state of Kubernetes resources like Pods and PersistentVolumes within the Qdrant Hybrid Cloud namespace, we recommend using [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) +let client = Qdrant::from_url("http://localhost:6334").build()?; -### [Anchor](https://qdrant.tech/documentation/private-cloud/logging-monitoring/\#grafana-dashboard) Grafana dashboard +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "payload_field_name", + FieldType::Keyword, + ) + .field_index_params( + KeywordIndexParamsBuilder::default() + .is_tenant(true), + ), +); +``` -If you scrape the above metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.KeywordIndexParams; -![Grafa dashboard](https://qdrant.tech/documentation/cloud/cloud-grafana-dashboard.png) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -##### Was this page useful? +client + .createPayloadIndexAsync( + "{collection_name}", + "payload_field_name", + PayloadSchemaType.Keyword, + PayloadIndexParams.newBuilder() + .setKeywordIndexParams( + KeywordIndexParams.newBuilder() + .setIsTenant(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Tenant optimization is supported for the following datatypes: -Thank you for your feedback! 🙏 +* `keyword` +* `uuid` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/logging-monitoring.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Principal Index -On this page: +*Available as of v1.11.0* -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/logging-monitoring.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Similar to the tenant index, the principal index is used to optimize storage for faster search, assuming that the search request is primarily filtered by the principal field. 
-× +A good example of a use case for the principal index is time-related data, where each point is associated with a timestamp. In this case, the principal index can be used to optimize storage for faster search with time-based filters. -[Powered by](https://qdrant.tech/) +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="timestamp", + field_schema=models.IntegerIndexParams( + type=models.IntegerIndexType.INTEGER, + is_principal=True, + ), +) +``` -<|page-17-lllmstxt|> -## administration -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Administration +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -# [Anchor](https://qdrant.tech/documentation/guides/administration/\#administration) Administration +var client = new QdrantClient("localhost", 6334); -Qdrant exposes administration tools which enable to modify at runtime the behavior of a qdrant instance without changing its configuration manually. +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "timestamp", + schemaType: PayloadSchemaType.Integer, + indexParams: new PayloadIndexParams + { + IntegerIndexParams = new IntegerIndexParams + { + IsPrincipal = true + } + } +); +``` -## [Anchor](https://qdrant.tech/documentation/guides/administration/\#locking) Locking +```go +import ( + "context" -A locking API enables users to restrict the possible operations on a qdrant process. -It is important to mention that: + "github.com/qdrant/go-client/qdrant" +) -- The configuration is not persistent therefore it is necessary to lock again following a restart. -- Locking applies to a single node only. It is necessary to call lock on all the desired nodes in a distributed deployment setup. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Lock request sample: +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeInteger.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsInt( + &qdrant.IntegerIndexParams{ + IsPrincipal: qdrant.PtrOf(true), + }), +}) +``` ```http -POST /locks +PUT /collections/{collection_name}/index { - "error_message": "write is forbidden", - "write": true + "field_name": "timestamp", + "field_schema": { + "type": "integer", + "is_principal": true + } } - ``` -Write flags enables/disables write lock. -If the write lock is set to true, qdrant doesn’t allow creating new collections or adding new data to the existing storage. -However, deletion operations or updates are not forbidden under the write lock. -This feature enables administrators to prevent a qdrant process from using more disk space while permitting users to search and delete unnecessary data. - -You can optionally provide the error message that should be used for error responses to users. - -## [Anchor](https://qdrant.tech/documentation/guides/administration/\#recovery-mode) Recovery mode +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "timestamp", + field_schema: { + type: "integer", + is_principal: true + }, +}); +``` -_Available as of v1.2.0_ +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + IntegerIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; -Recovery mode can help in situations where Qdrant fails to start repeatedly. 
-When starting in recovery mode, Qdrant only loads collection metadata to prevent -going out of memory. This allows you to resolve out of memory situations, for -example, by deleting a collection. After resolving Qdrant can be restarted -normally to continue operation. +let client = Qdrant::from_url("http://localhost:6334").build()?; -In recovery mode, collection operations are limited to -[deleting](https://qdrant.tech/documentation/concepts/collections/#delete-collection) a -collection. That is because only collection metadata is loaded during recovery. +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "timestamp", + FieldType::Integer, + ) + .field_index_params( + IntegerIndexParamsBuilder::default() + .is_principal(true), + ), +); +``` -To enable recovery mode with the Qdrant Docker image you must set the -environment variable `QDRANT_ALLOW_RECOVERY_MODE=true`. The container will try -to start normally first, and restarts in recovery mode if initialisation fails -due to an out of memory error. This behavior is disabled by default. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.IntegerIndexParams; -If using a Qdrant binary, recovery mode can be enabled by setting a recovery -message in an environment variable, such as -`QDRANT__STORAGE__RECOVERY_MODE="My recovery message"`. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -## [Anchor](https://qdrant.tech/documentation/guides/administration/\#strict-mode) Strict mode +client + .createPayloadIndexAsync( + "{collection_name}", + "timestamp", + PayloadSchemaType.Integer, + PayloadIndexParams.newBuilder() + .setIntegerIndexParams( + KeywordIndexParams.newBuilder() + .setIsPrincipa(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -_Available as of v1.13.0_ +Principal optimization is supported for following types: -Strict mode is a feature to restrict certain type of operations on the collection in order to protect it. +* `integer` +* `float` +* `datetime` -The goal is to prevent inefficient usage patterns that could overload the collections. -This configuration ensures a more predictible and responsive service when you do not have control over the queries that are being executed. +## Full-text index -Here is a non exhaustive list of operations that can be restricted using strict mode: +Qdrant supports full-text search for string payload. +Full-text index allows you to filter points by the presence of a word or a phrase in the payload field. -- Preventing querying non indexed payload which can be very slow -- Maximum number of filtering conditions in a query -- Maximum batch size when inserting vectors -- Maximum collection size (in terms of vectors or payload size) +Full-text index configuration is a bit more complex than other indexes, as you can specify the tokenization parameters. +Tokenization is the process of splitting a string into tokens, which are then indexed in the inverted index. -See [schema definitions](https://api.qdrant.tech/api-reference/collections/create-collection#request.body.strict_mode_config) for all the `strict_mode_config` parameters. +See [Full Text match](/documentation/concepts/filtering/#full-text-match) for examples of querying with a full-text index. 
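+As a quick illustration of the kind of query such an index accelerates, here is a minimal sketch with the Python client; the field name, query vector, and query text are placeholders, and the full set of options lives on the filtering page linked above:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Assumes a `text` payload index already exists on "name_of_the_field_to_index".
+hits = client.query_points(
+    collection_name="{collection_name}",
+    query=[0.2, 0.1, 0.9, 0.7],  # placeholder query vector
+    query_filter=models.Filter(
+        must=[
+            models.FieldCondition(
+                key="name_of_the_field_to_index",
+                match=models.MatchText(text="good cheap"),
+            )
+        ]
+    ),
+    limit=10,
+)
+```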
-Upon crossing a limit, the server will return a client side error with the information about the limit that was crossed. +To create a full-text index, you can use the following: -As part of the config, the `enabled` field act as a toggle to enable or disable the strict mode dynamically. +```python +from qdrant_client import QdrantClient, models -The `strict_mode_config` can be enabled when [creating](https://qdrant.tech/documentation/guides/administration/#create-a-collection) a collection, for instance below to activate the `unindexed_filtering_retrieve` limit. +client = QdrantClient(url="http://localhost:6333") -Setting `unindexed_filtering_retrieve` to false prevents the usage of filtering on a non indexed payload key. +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.TextIndexParams( + type="text", + tokenizer=models.TokenizerType.WORD, + min_token_len=2, + max_token_len=10, + lowercase=True, + ), +) +``` -httpbashpythontypescriptrustjavacsharpgo +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```http -PUT /collections/{collection_name} -{ - "strict_mode_config": { - "enabled": true, - "unindexed_filtering_retrieve": false - } -} +var client = new QdrantClient("localhost", 6334); +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index", + schemaType: PayloadSchemaType.Text, + indexParams: new PayloadIndexParams + { + TextIndexParams = new TextIndexParams + { + Tokenizer = TokenizerType.Word, + MinTokenLen = 2, + MaxTokenLen = 10, + Lowercase = true + } + } +); ``` -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "strict_mode_config": { - "enabled":" true, - "unindexed_filtering_retrieve": false - } - }' - -``` +```go +import ( + "context" -```python -from qdrant_client import QdrantClient, models + "github.com/qdrant/go-client/qdrant" +) -client = QdrantClient(url="http://localhost:6333") +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -client.create_collection( - collection_name="{collection_name}", - strict_mode_config=models.StrictModeConfig(enabled=True, unindexed_filtering_retrieve=false), -) +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeText.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsText( + &qdrant.TextIndexParams{ + Tokenizer: qdrant.TokenizerType_Whitespace, + MinTokenLen: qdrant.PtrOf(uint64(2)), + MaxTokenLen: qdrant.PtrOf(uint64(10)), + Lowercase: qdrant.PtrOf(true), + }), +}) +``` +```http +PUT /collections/{collection_name}/index +{ + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "text", + "tokenizer": "word", + "min_token_len": 2, + "max_token_len": 10, + "lowercase": true + } +} ``` ```typescript @@ -2920,48 +18030,116 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createCollection("{collection_name}", { - strict_mode_config: { - enabled: true, - unindexed_filtering_retrieve: false, +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "text", + tokenizer: "word", + min_token_len: 2, + max_token_len: 10, + lowercase: true, }, }); - ``` ```rust 
+use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + TextIndexParamsBuilder, + FieldType, + TokenizerType, +}; use qdrant_client::Qdrant; -use qdrant_client::qdrant::{CreateCollectionBuilder, StrictModeConfigBuilder}; let client = Qdrant::from_url("http://localhost:6334").build()?; +let text_index_params = TextIndexParamsBuilder::new(TokenizerType::Word) + .min_token_len(2) + .max_token_len(10) + .lowercase(true); + client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .strict_config_mode(StrictModeConfigBuilder::default().enabled(true).unindexed_filtering_retrieve(false)), + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Text, + ).field_index_params(text_index_params.build()), ) .await?; - ``` ```java import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.StrictModeCOnfig; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.TextIndexParams; +import io.qdrant.client.grpc.Collections.TokenizerType; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setStrictModeConfig( - StrictModeConfig.newBuilder().setEnabled(true).setUnindexedFilteringRetrieve(false).build()) - .build()) + .createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Text, + PayloadIndexParams.newBuilder() + .setTextIndexParams( + TextIndexParams.newBuilder() + .setTokenizer(TokenizerType.Word) + .setMinTokenLen(2) + .setMaxTokenLen(10) + .setLowercase(true) + .build()) + .build(), + null, + null, + null) .get(); +``` + +### Tokenizers + +Tokenizers are algorithms used to split text into smaller units called tokens, which are then indexed and searched in a full-text index. +In the context of Qdrant, tokenizers determine how string payloads are broken down for efficient searching and filtering. +The choice of tokenizer affects how queries match the indexed text, supporting different languages, word boundaries, and search behaviours such as prefix or phrase matching. + +Available tokenizers are: + +* `word` - splits the string into words, separated by spaces, punctuation marks, and special characters. +* `whitespace` - splits the string into words, separated by spaces. +* `prefix` - splits the string into words, separated by spaces, punctuation marks, and special characters, and then creates a prefix index for each word. For example: `hello` will be indexed as `h`, `he`, `hel`, `hell`, `hello`. +* `multilingual` - a special type of tokenizer based on multiple packages like [charabia](https://github.com/meilisearch/charabia) and [vaporetto](https://github.com/daac-tools/vaporetto) to deliver fast and accurate tokenization for a large variety of languages. It allows proper tokenization and lemmatization for multiple languages, including those with non-Latin alphabets and non-space delimiters. See the [charabia documentation](https://github.com/meilisearch/charabia) for a full list of supported languages and normalization options. 
Note: For the Japanese language, Qdrant relies on the `vaporetto` project, which has much less overhead compared to `charabia`, while maintaining comparable performance. +### Stemmer + +A **stemmer** is an algorithm used in text processing to reduce words to their root or base form, known as the "stem." For example, the words "running", "runner and "runs" can all be reduced to the stem "run." +When configuring a full-text index in Qdrant, you can specify a stemmer to be used for a particular language. This enables the index to recognize and match different inflections or derivations of a word. + +Qdrant provides an implementation of [Snowball stemmer](https://snowballstem.org/), a widely used and performant variant for some of the most popular languages. +For the list of supported languages, please visit the [rust-stemmers repository](https://github.com/qdrant/rust-stemmers). + +Here is an example of full-text Index configuration with Snowball stemmer: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.TextIndexParams( + type="text", + tokenizer=models.TokenizerType.WORD, + stemmer=models.SnowballParams( + type=models.Snowball.SNOWBALL, + language=models.SnowballLanguage.ENGLISH + ) + ), +) ``` ```csharp @@ -2970,112 +18148,191 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - strictModeConfig: new StrictModeConfig { enabled = true, unindexed_filtering_retrieve = false } +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index", + schemaType: PayloadSchemaType.Text, + indexParams: new PayloadIndexParams + { + TextIndexParams = new TextIndexParams + { + Tokenizer = TokenizerType.Word, + Stemmer = new StemmingAlgorithm + { + Snowball = new SnowballParams + { + Language = "english" + } + } + } + } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - StrictModeConfig: &qdrant.StrictModeConfig{ - Enabled: qdrant.PtrOf(true), - IndexingThreshold: qdrant.PtrOf(false), - }, +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeText.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsText( + &qdrant.TextIndexParams{ + Tokenizer: qdrant.TokenizerType_Word, + Stemmer: qdrant.NewStemmingAlgorithmSnowball(&qdrant.SnowballParams{ + Language: "english", + }), + }), }) - ``` -Or activate it later on an existing collection through the [collection update](https://qdrant.tech/documentation/guides/administration/#update-collection-parameters) API: - -httpbashpythontypescriptrustjavacsharpgo - ```http -PATCH /collections/{collection_name} +PUT /collections/{collection_name}/index { - "strict_mode_config": { - "enabled": true, - "unindexed_filtering_retrieve": false + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "text", + "tokenizer": "word", + "stemmer": { + 
"type": "snowball", + "language": "english" + } } } - ``` -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "strict_mode_config": { - "enabled": true, - "unindexed_filtering_retrieve": false - } - }' +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "text", + tokenizer: "word", + stemmer: { + type: "snowball", + language: "english" + } + } +}); ``` -```python -from qdrant_client import QdrantClient, models +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + TextIndexParamsBuilder, + FieldType, + TokenizerType, +}; +use qdrant_client::Qdrant; -client = QdrantClient(url="http://localhost:6333") +let client = Qdrant::from_url("http://localhost:6334").build()?; -client.update_collection( - collection_name="{collection_name}", - strict_mode_config=models.StrictModeConfig(enabled=True, unindexed_filtering_retrieve=False), -) +let text_index_params = TextIndexParamsBuilder::new(TokenizerType::Word) + .snowball_stemmer("english".to_string()); +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "{field_name}", + FieldType::Text, + ).field_index_params(text_index_params.build()), + ) + .await?; ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.SnowballParams; +import io.qdrant.client.grpc.Collections.StemmingAlgorithm; +import io.qdrant.client.grpc.Collections.TextIndexParams; +import io.qdrant.client.grpc.Collections.TokenizerType; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Text, + PayloadIndexParams.newBuilder() + .setTextIndexParams( + TextIndexParams.newBuilder() + .setTokenizer(TokenizerType.Word) + .setStemmer( + StemmingAlgorithm.newBuilder() + .setSnowball( + SnowballParams.newBuilder().setLanguage("english").build()) + .build()) + .build()) + .build(), + true, + null, + null) + .get(); +``` -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### Stopwords -client.updateCollection("{collection_name}", { - strict_mode_config: { - enabled: true, - unindexed_filtering_retrieve: false, - }, -}); +Stopwords are common words (such as "the", "is", "at", "which", and "on") that are often filtered out during text processing because they carry little meaningful information for search and retrieval tasks. -``` +In Qdrant, you can specify a list of stopwords to be ignored during full-text indexing and search. This helps simplify search queries and improves relevance. -```rust -use qdrant_client::qdrant::{StrictModeConfigBuilder, UpdateCollectionBuilder}; +You can configure stopwords based on predefined languages, as well as extend existing stopword lists with custom words. 
-client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}").strict_mode_config( - StrictModeConfigBuilder::default().enabled(true).unindexed_filtering_retrieve(false), - ), - ) - .await?; +Here is an example of configuring a full-text index with custom stopwords: -``` -```java -import io.qdrant.client.grpc.Collections.StrictModeConfigBuilder; -import io.qdrant.client.grpc.Collections.UpdateCollection; +```python +from qdrant_client import QdrantClient, models -client.updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setStrictModeConfig( - StrictModeConfig.newBuilder().setEnabled(true).setUnindexedFilteringRetrieve(false).build()) - .build()); +client = QdrantClient(url="http://localhost:6333") + +# Simple +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.TextIndexParams( + type="text", + tokenizer=models.TokenizerType.WORD, + stopwords=models.Language.ENGLISH, + ), +) +# Explicit +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.TextIndexParams( + type="text", + tokenizer=models.TokenizerType.WORD, + stopwords=models.StopwordsSet( + languages=[ + models.Language.ENGLISH, + models.Language.SPANISH, + ], + custom=[ + "example" + ] + ), + ), +) ``` ```csharp @@ -3084,70 +18341,82 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - strictModeConfig: new StrictModeConfig { Enabled = true, UnindexedFilteringRetrieve = false } +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index", + schemaType: PayloadSchemaType.Text, + indexParams: new PayloadIndexParams + { + TextIndexParams = new TextIndexParams + { + Tokenizer = TokenizerType.Word, + Stopwords = new StopwordsSet + { + Languages = { "english", "spanish" }, + Custom = { "example" } + } + } + } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - StrictModeConfig: &qdrant.StrictModeConfig{ - Enabled: qdrant.PtrOf(true), - UnindexedFilteringRetrieve: qdrant.PtrOf(false), - }, +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeText.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsText( + &qdrant.TextIndexParams{ + Tokenizer: qdrant.TokenizerType_Word, + Stopwords: &qdrant.StopwordsSet{ + Languages: []string{"english", "spanish"}, + Custom: []string{"example"}, + }, + }), }) - ``` -To disable completely strict mode on an existing collection use: - -httpbashpythontypescriptrustjavacsharpgo - ```http -PATCH /collections/{collection_name} +// Simple +PUT collections/{collection_name}/index { - "strict_mode_config": { - "enabled": false + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "text", + "tokenizer": "word", + "stopwords": "english" } } -``` - -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: 
application/json' \ - --data-raw '{ - "strict_mode_config": { - "enabled": false, +// Explicit +PUT collections/{collection_name}/index +{ + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "text", + "tokenizer": "word", + "stopwords": { + "languages": [ + "english", + "spanish" + ], + "custom": [ + "example" + ] + } } - }' - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.update_collection( - collection_name="{collection_name}", - strict_mode_config=models.StrictModeConfig(enabled=False), -) - +} ``` ```typescript @@ -3155,38 +18424,144 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.updateCollection("{collection_name}", { - strict_mode_config: { - enabled: false, + +// Simple +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "text", + tokenizer: "word", + stopwords: "english" }, }); +// Explicit +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "text", + tokenizer: "word", + stopwords: { + languages: [ + "english", + "spanish" + ], + custom: [ + "example" + ] + } + }, +}); ``` ```rust -use qdrant_client::qdrant::{StrictModeConfigBuilder, UpdateCollectionBuilder}; +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + TextIndexParamsBuilder, + FieldType, + TokenizerType, + StopwordsSet, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; + +// Simple +let text_index_params = TextIndexParamsBuilder::new(TokenizerType::Word) + .stopwords_language("english".to_string()); client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}").strict_mode_config( - StrictModeConfigBuilder::default().enabled(false), - ), + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Text, + ).field_index_params(text_index_params.build()), ) .await?; +// Explicit +let text_index_params = TextIndexParamsBuilder::new(TokenizerType::Word) + .stopwords(StopwordsSet { + languages: vec![ + "english".to_string(), + "spanish".to_string(), + ], + custom: vec!["example".to_string()], + }); + +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "{field_name}", + FieldType::Text, + ).field_index_params(text_index_params.build()), + ) + .await?; ``` ```java -import io.qdrant.client.grpc.Collections.StrictModeConfigBuilder; -import io.qdrant.client.grpc.Collections.UpdateCollection; +import java.util.List; -client.updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setStrictModeConfig( - StrictModeConfig.newBuilder().setEnabled(false).build()) - .build()); +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.StopwordsSet; +import io.qdrant.client.grpc.Collections.TextIndexParams; +import io.qdrant.client.grpc.Collections.TokenizerType; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Text, + 
PayloadIndexParams.newBuilder() + .setTextIndexParams( + TextIndexParams.newBuilder() + .setTokenizer(TokenizerType.Word) + .setStopwords( + StopwordsSet.newBuilder() + .addAllLanguages(List.of("english", "spanish")) + .addAllCustom(List.of("example")) + .build()) + .build()) + .build(), + true, + null, + null) + .get(); +``` + +### Phrase Search + +Phrase search in Qdrant allows you to find documents or points where a specific sequence of words appears together, in the same order, within a text payload field. +This is useful when you want to match exact phrases rather than individual words scattered throughout the text. + +When using a full-text index with phrase search enabled, you can perform phrase search by enclosing the desired phrase in double quotes in your filter query. +For example, searching for `"machine learning"` will only return results where the words "machine" and "learning" appear together as a phrase, not just anywhere in the text. + +For efficient phrase search, Qdrant requires building an additional data structure, so it needs to be configured during the creation of the full-text index: +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_payload_index( + collection_name="{collection_name}", + field_name="name_of_the_field_to_index", + field_schema=models.TextIndexParams( + type="text", + tokenizer=models.TokenizerType.WORD, + lowercase=True, + phrase_matching=True, + ), +) ``` ```csharp @@ -3195,15873 +18570,18026 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - strictModeConfig: new StrictModeConfig { Enabled = false } +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "name_of_the_field_to_index", + schemaType: PayloadSchemaType.Text, + indexParams: new PayloadIndexParams + { + TextIndexParams = new TextIndexParams + { + Tokenizer = TokenizerType.Word, + Lowercase = true, + PhraseMatching = true + } + } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - StrictModeConfig: &qdrant.StrictModeConfig{ - Enabled: qdrant.PtrOf(false), - }, +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "name_of_the_field_to_index", + FieldType: qdrant.FieldType_FieldTypeText.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParamsText( + &qdrant.TextIndexParams{ + Tokenizer: qdrant.TokenizerType_Whitespace, + Lowercase: qdrant.PtrOf(true), + PhraseMatching: qdrant.PtrOf(true), + }), }) - ``` -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/administration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
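Once a field is indexed with `phrase_matching` enabled (the remaining client examples for this configuration continue below), phrases can be matched inside a filter at query time. The following is a minimal sketch rather than an excerpt from the reference: it assumes a qdrant-client release that exposes the `MatchPhrase` condition described on the Phrase Match page, and the query vector is only a placeholder.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Returns only points whose indexed text field contains the words
# "machine learning" next to each other and in this order.
hits = client.query_points(
    collection_name="{collection_name}",
    query=[0.2, 0.1, 0.9, 0.7],  # placeholder query vector
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="name_of_the_field_to_index",
                match=models.MatchPhrase(phrase="machine learning"),
            )
        ]
    ),
    limit=3,
)
```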
- -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/administration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-18-lllmstxt|> -## new-recommendation-api -- [Articles](https://qdrant.tech/articles/) -- Deliver Better Recommendations with Qdrant’s new API - -[Back to Qdrant Articles](https://qdrant.tech/articles/) - -# Deliver Better Recommendations with Qdrant’s new API - -Kacper Ɓukawski - -· - -October 25, 2023 - -![Deliver Better Recommendations with Qdrant’s new API](https://qdrant.tech/articles_data/new-recommendation-api/preview/title.jpg) - -The most popular use case for vector search engines, such as Qdrant, is Semantic search with a single query vector. Given the -query, we can vectorize (embed) it and find the closest points in the index. But [Vector Similarity beyond Search](https://qdrant.tech/articles/vector-similarity-beyond-search/) -does exist, and recommendation systems are a great example. Recommendations might be seen as a multi-aim search, where we want -to find items close to positive and far from negative examples. This use of vector databases has many applications, including -recommendation systems for e-commerce, content, or even dating apps. - -Qdrant has provided the [Recommendation API](https://qdrant.tech/documentation/concepts/search/#recommendation-api) for a while, and with the latest release, [Qdrant 1.6](https://github.com/qdrant/qdrant/releases/tag/v1.6.0), -we’re glad to give you more flexibility and control over the Recommendation API. -Here, we’ll discuss some internals and show how they may be used in practice. - -### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#recap-of-the-old-recommendations-api) Recap of the old recommendations API - -The previous [Recommendation API](https://qdrant.tech/documentation/concepts/search/#recommendation-api) in Qdrant came with some limitations. First of all, it was required to pass vector IDs for -both positive and negative example points. If you wanted to use vector embeddings directly, you had to either create a new point -in a collection or mimic the behaviour of the Recommendation API by using the [Search API](https://qdrant.tech/documentation/concepts/search/#search-api). -Moreover, in the previous releases of Qdrant, you were always asked to provide at least one positive example. This requirement -was based on the algorithm used to combine multiple samples into a single query vector. It was a simple, yet effective approach. -However, if the only information you had was that your user dislikes some items, you couldn’t use it directly. - -Qdrant 1.6 brings a more flexible API. You can now provide both IDs and vectors of positive and negative examples. You can even -combine them within a single request. That makes the new implementation backward compatible, so you can easily upgrade an existing -Qdrant instance without any changes in your code. And the default behaviour of the API is still the same as before. However, we -extended the API, so **you can now choose the strategy of how to find the recommended points**. 
- ```http -POST /collections/{collection_name}/points/recommend +PUT /collections/{collection_name}/index { - "positive": [100, 231], - "negative": [718, [0.2, 0.3, 0.4, 0.5]], - "filter": { - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ] - }, - "strategy": "average_vector", - "limit": 3 + "field_name": "name_of_the_field_to_index", + "field_schema": { + "type": "text", + "tokenizer": "word", + "lowercase": true, + "phrase_matching": true + } } - ``` -There are two key changes in the request. First of all, we can adjust the strategy of search and set it to `average_vector` (the -default) or `best_score`. Moreover, we can pass both IDs ( `718`) and embeddings ( `[0.2, 0.3, 0.4, 0.5]`) as both positive and -negative examples. - -## [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#hnsw-ann-example-and-strategy) HNSW ANN example and strategy - -Let’s start with an example to help you understand the [HNSW graph](https://qdrant.tech/articles/filtrable-hnsw/). Assume you want -to travel to a small city on another continent: - -1. You start from your hometown and take a bus to the local airport. -2. Then, take a flight to one of the closest hubs. -3. From there, you have to take another flight to a hub on your destination continent. -4. Hopefully, one last flight to your destination city. -5. You still have one more leg on local transport to get to your final address. - -This journey is similar to the HNSW graph’s use in Qdrant’s approximate nearest neighbours search. - -![Transport network](https://qdrant.tech/articles_data/new-recommendation-api/example-transport-network.png) - -HNSW is a multilayer graph of vectors (embeddings), with connections based on vector proximity. The top layer has the least -points, and the distances between those points are the biggest. The deeper we go, the more points we have, and the distances -get closer. The graph is built in a way that the points are connected to their closest neighbours at every layer. - -All the points from a particular layer are also in the layer below, so switching the search layer while staying in the same -location is possible. In the case of transport networks, the top layer would be the airline hubs, well-connected but with big -distances between the airports. Local airports, along with railways and buses, with higher density and smaller distances, make -up the middle layers. Lastly, our bottom layer consists of local means of transport, which is the densest and has the smallest -distances between the points. - -You don’t have to check all the possible connections when you travel. You select an intercontinental flight, then a local one, -and finally a bus or a taxi. All the decisions are made based on the distance between the points. - -The search process in HNSW is also based on similarly traversing the graph. Start from the entry point in the top layer, find -its closest point and then use that point as the entry point into the next densest layer. This process repeats until we reach -the bottom layer. Visited points and distances to the original query vector are kept in memory. If none of the neighbours of -the current point is better than the best match, we can stop the traversal, as this is a local minimum. We start at the biggest -scale, and then gradually zoom in. - -In this oversimplified example, we assumed that the distance between the points is the only factor that matters. 
In reality, we -might want to consider other criteria, such as the ticket price, or avoid some specific locations due to certain restrictions. -That means, there are various strategies for choosing the best match, which is also true in the case of vector recommendations. -We can use different approaches to determine the path of traversing the HNSW graph by changing how we calculate the score of a -candidate point during traversal. The default behaviour is based on pure distance, but Qdrant 1.6 exposes two strategies for the -recommendation API. - -### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#average-vector) Average vector - -The default strategy, called `average_vector` is the previous one, based on the average of positive and negative examples. It -simplifies the recommendations process and converts it into a single vector search. It supports both point IDs and vectors as -parameters. For example, you can get recommendations based on past interactions with existing points combined with query vector -embedding. Internally, that mechanism is based on the averages of positive and negative examples and was calculated with the -following formula: - -average vector=avg(positive vectors)+(avg(positive vectors)−avg(negative vectors)) - -The `average_vector` converts the problem of recommendations into a single vector search. - -### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#the-new-hotness---best-score) The new hotness - Best score - -The new strategy is called `best_score`. It does not rely on averages and is more flexible. It allows you to pass just negative -samples and uses a slightly more sophisticated algorithm under the hood. - -The best score is chosen at every step of HNSW graph traversal. We separately calculate the distance between a traversed point -and every positive and negative example. In the case of the best score strategy, **there is no single query vector anymore, but a** -**bunch of positive and negative queries**. As a result, for each sample in the query, we have a set of distances, one for each -sample. In the next step, we simply take the best scores for positives and negatives, creating two separate values. Best scores -are just the closest distances of a query to positives and negatives. The idea is: **if a point is closer to any negative than to** -**any positive example, we do not want it**. We penalize being close to the negatives, so instead of using the similarity value -directly, we check if it’s closer to positives or negatives. The following formula is used to calculate the score of a traversed -potential point: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```rust -if best_positive_score > best_negative_score { - score = best_positive_score -} else { - score = -(best_negative_score * best_negative_score) -} +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createPayloadIndex("{collection_name}", { + field_name: "name_of_the_field_to_index", + field_schema: { + type: "text", + tokenizer: "word", + lowercase: true, + phrase_matching: true, + }, +}); ``` -If the point is closer to the negatives, we penalize it by taking the negative squared value of the best negative score. For a -closer negative, the score of the candidate point will always be lower or equal to zero, making the chances of choosing that point -significantly lower. However, if the best negative score is higher than the best positive score, we still prefer those that are -further away from the negatives. 
That procedure effectively **pulls the traversal procedure away from the negative examples**. - -If you want to know more about the internals of HNSW, you can check out the article about the -[Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/) that covers the topic thoroughly. - -## [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#food-discovery-demo) Food Discovery demo - -Our [Food Discovery demo](https://qdrant.tech/articles/food-discovery-demo/) is an application built on top of the new [Recommendation API](https://qdrant.tech/documentation/concepts/search/#recommendation-api). -It allows you to find a meal based on liked and disliked photos. There are some updates, enabled by the new Qdrant release: - -- **Ability to include multiple textual queries in the recommendation request.** Previously, we only allowed passing a single -query to solve the cold start problem. Right now, you can pass multiple queries and mix them with the liked/disliked photos. -This became possible because of the new flexibility in parameters. We can pass both point IDs and embedding vectors in the same -request, and user queries are obviously not a part of the collection. -- **Switch between the recommendation strategies.** You can now choose between the `average_vector` and the `best_score` scoring -algorithm. - -### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#differences-between-the-strategies) Differences between the strategies - -The UI of the Food Discovery demo allows you to switch between the strategies. The `best_vector` is the default one, but with just -a single switch, you can see how the results differ when using the previous `average_vector` strategy. - -If you select just a single positive example, both algorithms work identically. - -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#one-positive-example) One positive example - -The difference only becomes apparent when you start adding more examples, especially if you choose some negatives. - -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#one-positive-and-one-negative-example) One positive and one negative example - -The more likes and dislikes we add, the more diverse the results of the `best_score` strategy will be. In the old strategy, there -is just a single vector, so all the examples are similar to it. The new one takes into account all the examples separately, making -the variety richer. - -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#multiple-positive-and-negative-examples) Multiple positive and negative examples - -Choosing the right strategy is dataset-dependent, and the embeddings play a significant role here. Thus, it’s always worth trying -both of them and comparing the results in a particular case. - -#### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#handling-the-negatives-only) Handling the negatives only - -In the case of our Food Discovery demo, passing just the negative images can work as an outlier detection mechanism. While the dataset -was supposed to contain only food photos, this is not actually true. A simple way to find these outliers is to pass in food item photos -as negatives, leading to the results being the most “unlike” food images. In our case you will see pill bottles and books. 
- -**The `average_vector` strategy still requires providing at least one positive example!** However, since cosine distance is set up -for the collection used in the demo, we faked it using [a trick described in the previous article](https://qdrant.tech/articles/food-discovery-demo/#negative-feedback-only). -In a nutshell, if you only pass negative examples, their vectors will be averaged, and the negated resulting vector will be used as -a query to the search endpoint. - -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#negatives-only) Negatives only - -Still, both methods return different results, so they each have their place depending on the questions being asked and the datasets -being used. +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + TextIndexParamsBuilder, + FieldType, + TokenizerType, +}; +use qdrant_client::Qdrant; -#### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#challenges-with-multimodality) Challenges with multimodality +let client = Qdrant::from_url("http://localhost:6334").build()?; -Food Discovery uses the [CLIP embeddings model](https://huggingface.co/sentence-transformers/clip-ViT-B-32), which is multimodal, -allowing both images and texts encoded into the same vector space. Using this model allows for image queries, text queries, or both of -them combined. We utilized that mechanism in the updated demo, allowing you to pass the textual queries to filter the results further. +let text_index_params = TextIndexParamsBuilder::new(TokenizerType::Word) + .phrase_matching(true) + .lowercase(true); -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#a-single-text-query) A single text query +client + .create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "name_of_the_field_to_index", + FieldType::Text, + ).field_index_params(text_index_params.build()), + ) + .await?; +``` -Text queries might be mixed with the liked and disliked photos, so you can combine them in a single request. However, you might be -surprised by the results achieved with the new strategy, if you start adding the negative examples. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.TextIndexParams; +import io.qdrant.client.grpc.Collections.TokenizerType; -##### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#a-single-text-query-with-negative-example) A single text query with negative example +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -This is an issue related to the embeddings themselves. Our dataset contains a bunch of image embeddings that are pretty close to each -other. On the other hand, our text queries are quite far from most of the image embeddings, but relatively close to some of them, so the -text-to-image search seems to work well. When all query items come from the same domain, such as only text, everything works fine. -However, if we mix positive text and negative image embeddings, the results of the `best_score` are overwhelmed by the negative samples, -which are simply closer to the dataset embeddings. If you experience such a problem, the `average_vector` strategy might be a better -choice. 
+client + .createPayloadIndexAsync( + "{collection_name}", + "name_of_the_field_to_index", + PayloadSchemaType.Text, + PayloadIndexParams.newBuilder() + .setTextIndexParams( + TextIndexParams.newBuilder() + .setTokenizer(TokenizerType.Word) + .setLowercase(true) + .setPhraseMatching(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -### [Anchor](https://qdrant.tech/articles/new-recommendation-api/\#check-out-the-demo) Check out the demo +See [Phrase Match](/documentation/concepts/filtering/#phrase-match) for examples of querying phrases with a full-text index. -The [Food Discovery Demo](https://food-discovery.qdrant.tech/) is available online, so you can test and see the difference. -This is an open source project, so you can easily deploy it on your own. The source code is available in the [GitHub repository](https://github.com/qdrant/demo-food-discovery/) and the [README](https://github.com/qdrant/demo-food-discovery/blob/main/README.md) describes the process of setting it up. -Since calculating the embeddings takes a while, we precomputed them and exported them as a [snapshot](https://storage.googleapis.com/common-datasets-snapshots/wolt-clip-ViT-B-32.snapshot), -which might be easily imported into any Qdrant instance. [Qdrant Cloud is the easiest way to start](https://cloud.qdrant.io/), though! -##### Was this page useful? +## Vector Index -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +A vector index is a data structure built on vectors through a specific mathematical model. +Through the vector index, we can efficiently query several vectors similar to the target vector. -Thank you for your feedback! 🙏 +Qdrant currently only uses HNSW as a dense vector index. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/new-recommendation-api.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +[HNSW](https://arxiv.org/abs/1603.09320) (Hierarchical Navigable Small World Graph) is a graph-based indexing algorithm. It builds a multi-layer navigation structure for an image according to certain rules. In this structure, the upper layers are more sparse and the distances between nodes are farther. The lower layers are denser and the distances between nodes are closer. The search starts from the uppermost layer, finds the node closest to the target in this layer, and then enters the next layer to begin another search. After multiple iterations, it can quickly approach the target position. -On this page: +In order to improve performance, HNSW limits the maximum degree of nodes on each layer of the graph to `m`. In addition, you can use `ef_construct` (when building an index) or `ef` (when searching targets) to specify a search range. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/new-recommendation-api.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The corresponding parameters could be configured in the configuration file: -× +```yaml +storage: + # Default parameters of HNSW Index. Could be overridden for each collection or named vector individually + hnsw_index: + # Number of edges per node in the index graph. + # Larger the value - more accurate the search, more space required. + m: 16 + # Number of neighbours to consider during the index building. 
+ # Larger the value - more accurate the search, more time required to build index. + ef_construct: 100 + # Minimal size threshold (in KiloBytes) below which full-scan is preferred over HNSW search. + # This measures the total size of vectors being queried against. + # When the maximum estimated amount of points that a condition satisfies is smaller than + # `full_scan_threshold_kb`, the query planner will use full-scan search instead of HNSW index + # traversal for better performance. + # Note: 1Kb = 1 vector of size 256 + full_scan_threshold: 10000 -[Powered by](https://qdrant.tech/) +``` -<|page-19-lllmstxt|> -## hybrid-search-llamaindex-jinaai -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Chat With Product PDF Manuals Using Hybrid Search +And so in the process of creating a [collection](/documentation/concepts/collections/). The `ef` parameter is configured during [the search](/documentation/concepts/search/) and by default is equal to `ef_construct`. -# [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#chat-with-product-pdf-manuals-using-hybrid-search) Chat With Product PDF Manuals Using Hybrid Search +HNSW is chosen for several reasons. +First, HNSW is well-compatible with the modification that allows Qdrant to use filters during a search. +Second, it is one of the most accurate and fastest algorithms, according to [public benchmarks](https://github.com/erikbern/ann-benchmarks). -| Time: 120 min | Level: Advanced | Output: [GitHub](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-DO-LlamaIndex-Jina-v2.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/infoslack/qdrant-example/blob/main/HC-demo/HC-DO-LlamaIndex-Jina-v2.ipynb) | -| --- | --- | --- | --- | +*Available as of v1.1.1* -With the proliferation of digital manuals and the increasing demand for quick and accurate customer support, having a chatbot capable of efficiently parsing through complex PDF documents and delivering precise information can be a game-changer for any business. +The HNSW parameters can also be configured on a collection and named vector +level by setting [`hnsw_config`](/documentation/concepts/indexing/#vector-index) to fine-tune search +performance. -In this tutorial, we’ll walk you through the process of building a RAG-based chatbot, designed specifically to assist users with understanding the operation of various household appliances. -We’ll cover the essential steps required to build your system, including data ingestion, natural language understanding, and response generation for customer support use cases. +## Sparse Vector Index -## [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#components) Components +*Available as of v1.7.0* -- **Embeddings:** Jina Embeddings, served via the [Jina Embeddings API](https://jina.ai/embeddings/#apiform) -- **Database:** [Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/), deployed in a managed Kubernetes cluster on [DigitalOcean\\ -(DOKS)](https://www.digitalocean.com/products/kubernetes) -- **LLM:** [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) language model on HuggingFace -- **Framework:** [LlamaIndex](https://www.llamaindex.ai/) for extended RAG functionality and [Hybrid Search support](https://docs.llamaindex.ai/en/stable/examples/vector_stores/qdrant_hybrid/). 
-- **Parser:** [LlamaParse](https://github.com/run-llama/llama_parse) as a way to parse complex documents with embedded objects such as tables and figures. +Sparse vectors in Qdrant are indexed with a special data structure, which is optimized for vectors that have a high proportion of zeroes. In some ways, this indexing method is similar to the inverted index, which is used in text search engines. -![Architecture diagram](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/architecture-diagram.png) +- A sparse vector index in Qdrant is exact, meaning it does not use any approximation algorithms. +- All sparse vectors added to the collection are immediately indexed in the mutable version of a sparse index. -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#procedure) Procedure +With Qdrant, you can benefit from a more compact and efficient immutable sparse index, which is constructed during the same optimization process as the dense vector index. -Retrieval Augmented Generation (RAG) combines search with language generation. An external information retrieval system is used to identify documents likely to provide information relevant to the user’s query. These documents, along with the user’s request, are then passed on to a text-generating language model, producing a natural response. +This approach is particularly useful for collections storing both dense and sparse vectors. -This method enables a language model to respond to questions and access information from a much larger set of documents than it could see otherwise. The language model only looks at a few relevant sections of the documents when generating responses, which also helps to reduce inexplicable errors. +To configure a sparse vector index, create a collection with the following parameters: -## [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#heading) +```python +from qdrant_client import QdrantClient, models -[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/), powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility. +client = QdrantClient(url="http://localhost:6333") -## [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#prerequisites) Prerequisites +client.create_collection( + collection_name="{collection_name}", + vectors_config={}, + sparse_vectors_config={ + "text": models.SparseVectorParams( + index=models.SparseIndexParams(on_disk=False), + ) + }, +) +``` -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#deploying-qdrant-hybrid-cloud-on-digitalocean) Deploying Qdrant Hybrid Cloud on DigitalOcean +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -[DigitalOcean Kubernetes (DOKS)](https://www.digitalocean.com/products/kubernetes) is a managed Kubernetes service that lets you deploy Kubernetes clusters without the complexities of handling the control plane and containerized infrastructure. Clusters are compatible with standard Kubernetes toolchains and integrate natively with DigitalOcean Load Balancers and volumes. +var client = new QdrantClient("localhost", 6334); -1. 
To start using managed Kubernetes on DigitalOcean, follow the [platform-specific documentation](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#digital-ocean). -2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/). -3. Once it’s deployed, you should have a running Qdrant cluster with an API key. +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + sparseVectorsConfig: ("splade-model-name", new SparseVectorParams{ + Index = new SparseIndexConfig { + OnDisk = false, + } + }) +); +``` -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#development-environment) Development environment +```go +import ( + "context" -Then, install all dependencies: + "github.com/qdrant/go-client/qdrant" +) -```python -!pip install -U \ - llama-index \ - llama-parse \ - python-dotenv \ - llama-index-embeddings-jinaai \ - llama-index-llms-huggingface \ - llama-index-vector-stores-qdrant \ - "huggingface_hub[inference]" \ - datasets +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "splade-model-name": { + Index: &qdrant.SparseIndexConfig{ + OnDisk: qdrant.PtrOf(false), + }}, + }), +}) ``` -Set up secret key values on `.env` file: - -```bash -JINAAI_API_KEY -HF_INFERENCE_API_KEY -LLAMA_CLOUD_API_KEY -QDRANT_HOST -QDRANT_API_KEY - +```http +PUT /collections/{collection_name} +{ + "sparse_vectors": { + "text": { + "index": { + "on_disk": false + } + } + } +} ``` -Load all environment variables: +```typescript +import { QdrantClient, Schemas } from "@qdrant/js-client-rest"; -```python -import os -from dotenv import load_dotenv -load_dotenv('./.env') +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + sparse_vectors: { + "splade-model-name": { + index: { + on_disk: false + } + } + } +}); ``` -## [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#implementation) Implementation +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, SparseIndexConfigBuilder, SparseVectorParamsBuilder, + SparseVectorsConfigBuilder, +}; +use qdrant_client::Qdrant; -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#connect-jina-embeddings-and-mixtral-llm) Connect Jina Embeddings and Mixtral LLM +let client = Qdrant::from_url("http://localhost:6334").build()?; -LlamaIndex provides built-in support for the [Jina Embeddings API](https://jina.ai/embeddings/#apiform). To use it, you need to initialize the `JinaEmbedding` object with your API Key and model name. +let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); -For the LLM, you need wrap it in a subclass of `llama_index.llms.CustomLLM` to make it compatible with LlamaIndex. 
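// The builder accumulates one `SparseVectorParams` entry per named sparse vector;
// `on_disk(true)` keeps this particular index on disk instead of fully in memory.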
+sparse_vectors_config.add_named_vector_params( + "splade-model-name", + SparseVectorParamsBuilder::default() + .index(SparseIndexConfigBuilder::default().on_disk(true)), +); -```python -# connect embeddings -from llama_index.embeddings.jinaai import JinaEmbedding +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(sparse_vectors_config), + ) + .await?; +``` -jina_embedding_model = JinaEmbedding( - model="jina-embeddings-v2-base-en", - api_key=os.getenv("JINAAI_API_KEY"), -) +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -# connect LLM -from llama_index.llms.huggingface import HuggingFaceInferenceAPI +import io.qdrant.client.grpc.Collections; -mixtral_llm = HuggingFaceInferenceAPI( - model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1", - token=os.getenv("HF_INFERENCE_API_KEY"), -) +QdrantClient client = new QdrantClient( + QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -``` +client.createCollectionAsync( + Collections.CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setSparseVectorsConfig( + Collections.SparseVectorConfig.newBuilder().putMap( + "splade-model-name", + Collections.SparseVectorParams.newBuilder() + .setIndex( + Collections.SparseIndexConfig + .newBuilder() + .setOnDisk(false) + .build() + ).build() + ).build() + ).build() +).get(); -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#prepare-data-for-rag) Prepare data for RAG +```` -This example will use household appliance manuals, which are generally available as PDF documents. -LlamaPar -In the `data` folder, we have three documents, and we will use it to extract the textual content from the PDF and use it as a knowledge base in a simple RAG. +The following parameters may affect performance: -The free LlamaIndex Cloud plan is sufficient for our example: +- `on_disk: true` - The index is stored on disk, which lets you save memory. This may slow down search performance. +- `on_disk: false` - The index is still persisted on disk, but it is also loaded into memory for faster search. -```python -import nest_asyncio -nest_asyncio.apply() -from llama_parse import LlamaParse +Unlike a dense vector index, a sparse vector index does not require a predefined vector size. It automatically adjusts to the size of the vectors added to the collection. -llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY") +**Note:** A sparse vector index only supports dot-product similarity searches. It does not support other distance metrics. -llama_parse_documents = LlamaParse(api_key=llamaparse_api_key, result_type="markdown").load_data([\ - "data/DJ68-00682F_0.0.pdf",\ - "data/F500E_WF80F5E_03445F_EN.pdf",\ - "data/O_ME4000R_ME19R7041FS_AA_EN.pdf"\ -]) +### IDF Modifier -``` +*Available as of v1.10.0* -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#store-data-into-qdrant) Store data into Qdrant +For many search algorithms, it is important to consider how often an item occurs in a collection. +Intuitively speaking, the less frequently an item appears in a collection, the more important it is in a search. -The code below does the following: +This is also known as the Inverse Document Frequency (IDF). It is used in text search engines to rank search results based on the rarity of a word in a collection. 
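As a rough illustration (toy numbers, not taken from the original page), consider a collection of $N = 1000$ documents and the BM25-style weighting Qdrant applies, given further below. A token that occurs in only $n = 5$ documents receives a much larger weight than one that occurs in $n = 500$ of them:

$$
\ln \left(\frac{1000 - 5 + 0.5}{5 + 0.5} + 1\right) \approx 5.2,
\qquad
\ln \left(\frac{1000 - 500 + 0.5}{500 + 0.5} + 1\right) \approx 0.69
$$

so rare tokens end up dominating the score of a sparse query once the modifier is enabled.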
-- create a vector store with Qdrant client; -- get an embedding for each chunk using Jina Embeddings API; -- combines `sparse` and `dense` vectors for hybrid search; -- stores all data into Qdrant; +IDF depends on the currently stored documents and therefore can't be pre-computed in the sparse vectors in streaming inference mode. +In order to support IDF in the sparse vector index, Qdrant provides an option to modify the sparse vector query with the IDF statistics automatically. -Hybrid search with Qdrant must be enabled from the beginning - we can simply set `enable_hybrid=True`. +The only requirement is to enable the IDF modifier in the collection configuration: ```python -# By default llamaindex uses OpenAI models -# setting embed_model to Jina and llm model to Mixtral -from llama_index.core import Settings -Settings.embed_model = jina_embedding_model -Settings.llm = mixtral_llm - -from llama_index.core import VectorStoreIndex, StorageContext -from llama_index.vector_stores.qdrant import QdrantVectorStore -import qdrant_client - -client = qdrant_client.QdrantClient( - url=os.getenv("QDRANT_HOST"), - api_key=os.getenv("QDRANT_API_KEY") -) +from qdrant_client import QdrantClient, models -vector_store = QdrantVectorStore( - client=client, collection_name="demo", enable_hybrid=True, batch_size=20 -) -Settings.chunk_size = 512 +client = QdrantClient(url="http://localhost:6333") -storage_context = StorageContext.from_defaults(vector_store=vector_store) -index = VectorStoreIndex.from_documents( - documents=llama_parse_documents, - storage_context=storage_context +client.create_collection( + collection_name="{collection_name}", + vectors_config={}, + sparse_vectors_config={ + "text": models.SparseVectorParams( + modifier=models.Modifier.IDF, + ), + }, ) - ``` -### [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#prepare-a-prompt) Prepare a prompt - -Here we will create a custom prompt template. This prompt asks the LLM to use only the context information retrieved from Qdrant. When querying with hybrid mode, we can set `similarity_top_k` and `sparse_top_k` separately: - -- `sparse_top_k` represents how many nodes will be retrieved from each dense and sparse query. -- `similarity_top_k` controls the final number of returned nodes. In the above setting, we end up with 10 nodes. - -Then, we assemble the query engine using the prompt. - -```python -from llama_index.core import PromptTemplate +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -qa_prompt_tmpl = ( - "Context information is below.\n" - "-------------------------------" - "{context_str}\n" - "-------------------------------" - "Given the context information and not prior knowledge," - "answer the query. Please be concise, and complete.\n" - "If the context does not contain an answer to the query," - "respond with \"I don't know!\"." 
- "Query: {query_str}\n" - "Answer: " -) -qa_prompt = PromptTemplate(qa_prompt_tmpl) +var client = new QdrantClient("localhost", 6334); -from llama_index.core.retrievers import VectorIndexRetriever -from llama_index.core.query_engine import RetrieverQueryEngine -from llama_index.core import get_response_synthesizer -from llama_index.core import Settings -Settings.embed_model = jina_embedding_model -Settings.llm = mixtral_llm +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + sparseVectorsConfig: ("text", new SparseVectorParams { + Modifier = Modifier.Idf, + }) +); +``` -# retriever -retriever = VectorIndexRetriever( - index=index, - similarity_top_k=2, - sparse_top_k=12, - vector_store_query_mode="hybrid" -) +```go +import ( + "context" -# response synthesizer -response_synthesizer = get_response_synthesizer( - llm=mixtral_llm, - text_qa_template=qa_prompt, - response_mode="compact", + "github.com/qdrant/go-client/qdrant" ) -# query engine -query_engine = RetrieverQueryEngine( - retriever=retriever, - response_synthesizer=response_synthesizer, -) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + SparseVectorsConfig: qdrant.NewSparseVectorsConfig( + map[string]*qdrant.SparseVectorParams{ + "text": { + Modifier: qdrant.Modifier_Idf.Enum(), + }, + }), +}) ``` -## [Anchor](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/\#run-a-test-query) Run a test query - -Now you can ask questions and receive answers based on the data: +```http +PUT /collections/{collection_name} +{ + "sparse_vectors": { + "text": { + "modifier": "idf" + } + } +} +``` -**Question** +```typescript +import { QdrantClient, Schemas } from "@qdrant/js-client-rest"; -```python -result = query_engine.query("What temperature should I use for my laundry?") -print(result.response) +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + sparse_vectors: { + "text": { + modifier: "idf" + } + } +}); ``` -**Answer** +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Modifier, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, +}; +use qdrant_client::{Qdrant, QdrantError}; -```text -The water temperature is set to 70 ˚C during the Eco Drum Clean cycle. You cannot change the water temperature. However, the temperature for other cycles is not specified in the context. +let client = Qdrant::from_url("http://localhost:6334").build()?; + +let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); +sparse_vectors_config.add_named_vector_params( + "text", + SparseVectorParamsBuilder::default().modifier(Modifier::Idf), +); +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(sparse_vectors_config), + ) + .await?; ``` -And that’s it! Feel free to scale this up to as many documents and complex PDFs as you like. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Modifier; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; -##### Was this page useful? 
+QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setSparseVectorsConfig( + SparseVectorConfig.newBuilder() + .putMap("text", SparseVectorParams.newBuilder().setModifier(Modifier.Idf).build())) + .build()) + .get(); +``` -Thank you for your feedback! 🙏 +Qdrant uses the following formula to calculate the IDF modifier: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/hybrid-search-llamaindex-jinaai.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +$$ +\text{IDF}(q_i) = \ln \left(\frac{N - n(q_i) + 0.5}{n(q_i) + 0.5}+1\right) +$$ -On this page: +Where: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/hybrid-search-llamaindex-jinaai.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +- `N` is the total number of documents in the collection. +- `n` is the number of documents containing non-zero values for the given vector element. -× +## Filtrable Index -[Powered by](https://qdrant.tech/) +Separately, a payload index and a vector index cannot solve the problem of search using the filter completely. -<|page-20-lllmstxt|> -## fastembed-rerankers -- [Documentation](https://qdrant.tech/documentation/) -- [Fastembed](https://qdrant.tech/documentation/fastembed/) -- Reranking with FastEmbed +In the case of weak filters, you can use the HNSW index as it is. In the case of stringent filters, you can use the payload index and complete rescore. +However, for cases in the middle, this approach does not work well. -# [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#how-to-use-rerankers-with-fastembed) How to use rerankers with FastEmbed +On the one hand, we cannot apply a full scan on too many vectors. On the other hand, the HNSW graph starts to fall apart when using too strict filters. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#rerankers) Rerankers +![HNSW fail](/docs/precision_by_m.png) -A reranker is a model that improves the ordering of search results. A subset of documents is initially retrieved using a fast, simple method (e.g., BM25 or dense embeddings). Then, a reranker – a more powerful, precise, but slower and heavier model – re-evaluates this subset to refine document relevance to the query. +![hnsw graph](/docs/graph.gif) -Rerankers analyze token-level interactions between the query and each document in depth, making them expensive to use but precise in defining relevance. They trade speed for accuracy, so they are best used on a limited candidate set rather than the entire corpus. +You can find more information on why this happens in our [blog post](https://blog.vasnetsov.com/posts/categorical-hnsw/). +Qdrant solves this problem by extending the HNSW graph with additional edges based on the stored payload values. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#goal-of-this-tutorial) Goal of this Tutorial +Extra edges allow you to efficiently search for nearby vectors using the HNSW index and apply filters as you search in the graph. 
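To make this concrete, here is a minimal sketch (not part of the original page) of a search that combines the HNSW index with a payload filter; the `city` keyword field and the query vector are placeholder assumptions:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Index the payload field so the filterable HNSW graph can take it into account.
client.create_payload_index(
    collection_name="{collection_name}",
    field_name="city",
    field_schema="keyword",
)

# The payload condition is checked while the graph is traversed,
# not as a separate post-filtering step over the full result set.
hits = client.query_points(
    collection_name="{collection_name}",
    query=[0.2, 0.1, 0.9, 0.7],  # placeholder query vector
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="city", match=models.MatchValue(value="London"))
        ]
    ),
    limit=3,
)
```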
-It’s common to use [cross-encoder](https://sbert.net/examples/applications/cross-encoder/README.html) models as rerankers. This tutorial uses [Jina Reranker v2 Base Multilingual](https://jina.ai/news/jina-reranker-v2-for-agentic-rag-ultra-fast-multilingual-function-calling-and-code-search/) (licensed under CC-BY-NC-4.0) – a cross-encoder reranker supported in FastEmbed. +This approach minimizes the overhead on condition checks since you only need to calculate the conditions for a small fraction of the points involved in the search. -We use the `all-MiniLM-L6-v2` dense embedding model (also supported in FastEmbed) as a first-stage retriever and then refine results with `Jina Reranker v2`. +<|page-13-lllmstxt|> +# Snapshots -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#setup) Setup +*Available as of v0.8.4* -Install `qdrant-client` with `fastembed`. +Snapshots are `tar` archive files that contain data and configuration of a specific collection on a specific node at a specific time. In a distributed setup, when you have multiple nodes in your cluster, you must create snapshots for each node separately when dealing with a single collection. -```python -pip install "qdrant-client[fastembed]>=1.14.1" +This feature can be used to archive data or easily replicate an existing deployment. For disaster recovery, Qdrant Cloud users may prefer to use [Backups](/documentation/cloud/backups/) instead, which are physical disk-level copies of your data. -``` +A collection level snapshot only contains data within that collection, including the collection configuration, all points and payloads. Collection aliases are not included and can be migrated or recovered [separately](/documentation/concepts/collections/#collection-aliases). -Import cross-encoders and text embeddings for the first-stage retrieval. +For a step-by-step guide on how to use snapshots, see our [tutorial](/documentation/tutorials/create-snapshot/). -```python -from fastembed import TextEmbedding -from fastembed.rerank.cross_encoder import TextCrossEncoder +## Create snapshot -``` + -You can list the cross-encoder rerankers supported in FastEmbed using the following command. +To create a new snapshot for an existing collection: ```python -TextCrossEncoder.list_supported_models() - -``` - -This command displays the available models, including details such as output embedding dimensions, model description, model size, model sources, and model file. 
+from qdrant_client import QdrantClient -Avaliable models - -```python -[{'model': 'Xenova/ms-marco-MiniLM-L-6-v2',\ - 'size_in_GB': 0.08,\ - 'sources': {'hf': 'Xenova/ms-marco-MiniLM-L-6-v2'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'MiniLM-L-6-v2 model optimized for re-ranking tasks.',\ - 'license': 'apache-2.0'},\ - {'model': 'Xenova/ms-marco-MiniLM-L-12-v2',\ - 'size_in_GB': 0.12,\ - 'sources': {'hf': 'Xenova/ms-marco-MiniLM-L-12-v2'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'MiniLM-L-12-v2 model optimized for re-ranking tasks.',\ - 'license': 'apache-2.0'},\ - {'model': 'BAAI/bge-reranker-base',\ - 'size_in_GB': 1.04,\ - 'sources': {'hf': 'BAAI/bge-reranker-base'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'BGE reranker base model for cross-encoder re-ranking.',\ - 'license': 'mit'},\ - {'model': 'jinaai/jina-reranker-v1-tiny-en',\ - 'size_in_GB': 0.13,\ - 'sources': {'hf': 'jinaai/jina-reranker-v1-tiny-en'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'Designed for blazing-fast re-ranking with 8K context length and fewer parameters than jina-reranker-v1-turbo-en.',\ - 'license': 'apache-2.0'},\ - {'model': 'jinaai/jina-reranker-v1-turbo-en',\ - 'size_in_GB': 0.15,\ - 'sources': {'hf': 'jinaai/jina-reranker-v1-turbo-en'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'Designed for blazing-fast re-ranking with 8K context length.',\ - 'license': 'apache-2.0'},\ - {'model': 'jinaai/jina-reranker-v2-base-multilingual',\ - 'size_in_GB': 1.11,\ - 'sources': {'hf': 'jinaai/jina-reranker-v2-base-multilingual'},\ - 'model_file': 'onnx/model.onnx',\ - 'description': 'A multi-lingual reranker model for cross-encoder re-ranking with 1K context length and sliding window',\ - 'license': 'cc-by-nc-4.0'}] # some of the fields are omitted for brevity +client = QdrantClient(url="http://localhost:6333") +client.create_snapshot(collection_name="{collection_name}") ``` -Now, load the first-stage retriever and reranker. +```csharp +using Qdrant.Client; -```python -encoder_name = "sentence-transformers/all-MiniLM-L6-v2" -dense_embedding_model = TextEmbedding(model_name=encoder_name) -reranker = TextCrossEncoder(model_name='jinaai/jina-reranker-v2-base-multilingual') +var client = new QdrantClient("localhost", 6334); +await client.CreateSnapshotAsync("{collection_name}"); ``` -The model files will be fetched and downloaded, with progress displayed. - -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#embed--index-data-for-the-first-stage-retrieval) Embed & index data for the first-stage retrieval - -We will vectorize a toy movie description dataset using the `all-MiniLM-L6-v2` model and save the embeddings in Qdrant for first-stage retrieval. +```go +import ( + "context" -Then, we will use a cross-encoder reranking model to rerank a small subset of data retrieved in the first stage. + "github.com/qdrant/go-client/qdrant" +) -Movie description dataset - -```python -descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. 
The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.",\ - "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.",\ - "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.",\ - "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.",\ - "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.",\ - "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.",\ - "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.",\ - "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.",\ - "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.",\ - "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.",\ - "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).",\ - "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.",\ - "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.",\ - "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.",\ - "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.",\ - "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.",\ - "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.",\ - "Story of 40-man Turkish task force who must defend a relay station.",\ - "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.",\ - "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateSnapshot(context.Background(), "{collection_name}") ``` -```python -descriptions_embeddings = list( - dense_embedding_model.embed(descriptions) -) - +```http +POST /collections/{collection_name}/snapshots ``` -Let’s upload the embeddings to Qdrant. - -Qdrant Client offers a simple in-memory mode, allowing you to experiment locally with small data volumes. 
- -Alternatively, you can use [a free cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) in Qdrant Cloud for experiments. - -```python -from qdrant_client import QdrantClient, models +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -client = QdrantClient(":memory:") # Qdrant is running from RAM. +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createSnapshot("{collection_name}"); ``` -Let’s create a [collection](https://qdrant.tech/documentation/concepts/collections/) with our movie data. +```rust +use qdrant_client::Qdrant; -```python -client.create_collection( - collection_name="movies", - vectors_config={ - "embedding": models.VectorParams( - size=client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2"), - distance=models.Distance.COSINE - ) - } -) +let client = Qdrant::from_url("http://localhost:6334").build()?; +client.create_snapshot("{collection_name}").await?; ``` -And upload the embeddings to it. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -```python -client.upload_points( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload={"description": description},\ - vector={"embedding": vector}\ - )\ - for idx, (description, vector) in enumerate(\ - zip(descriptions, descriptions_embeddings)\ - )\ - ], -) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.createSnapshotAsync("{collection_name}").get(); ``` -Upload with implicit embeddings computation - -```python -client.upload_points( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload={"description": description},\ - vector={"embedding": models.Document(text=description, model=encoder_name)},\ - )\ - for idx, description in enumerate(descriptions)\ - ], -) - -``` +This is a synchronous operation for which a `tar` archive file will be generated into the `snapshot_path`. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#first-stage-retrieval) First-stage retrieval +### Delete snapshot -Let’s see how relevant the results will be using only an `all-MiniLM-L6-v2`-based dense retriever. +*Available as of v1.0.0* ```python -query = "A story about a strong historically significant female figure." -query_embedded = list(dense_embedding_model.query_embed(query))[0] - -initial_retrieval = client.query_points( - collection_name="movies", - using="embedding", - query=query_embedded, - with_payload=True, - limit=10 -) - -description_hits = [] -for i, hit in enumerate(initial_retrieval.points): - print(f'Result number {i+1} is \"{hit.payload["description"]}\"') - description_hits.append(hit.payload["description"]) - -``` - -Query points with implicit embeddings computation +from qdrant_client import QdrantClient -```python -query = "A story about a strong historically significant female figure." 
+client = QdrantClient(url="http://localhost:6333") -initial_retrieval = client.query_points( - collection_name="movies", - using="embedding", - query=models.Document(text=query, model=encoder_name), - with_payload=True, - limit=10 +client.delete_snapshot( + collection_name="{collection_name}", snapshot_name="{snapshot_name}" ) - ``` -The result is as follows: +```csharp +using Qdrant.Client; -```bash -Result number 1 is "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent." -Result number 2 is "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household." -... -Result number 9 is "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre." -Result number 10 is "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions." +var client = new QdrantClient("localhost", 6334); +await client.DeleteSnapshotAsync(collectionName: "{collection_name}", snapshotName: "{snapshot_name}"); ``` -We can see that the description of _“The Messenger: The Story of Joan of Arc”_, which is the most fitting, appears 10th in the results. - -Let’s try refining the order of the retrieved subset with `Jina Reranker v2`. It takes a query and a set of documents (movie descriptions) as input and calculates a relevance score based on token-level interactions between the query and each document. +```go +import ( + "context" -```python -new_scores = list( - reranker.rerank(query, description_hits) -) # returns scores between query and each document + "github.com/qdrant/go-client/qdrant" +) -ranking = [\ - (i, score) for i, score in enumerate(new_scores)\ -] # saving document indices -ranking.sort( - key=lambda x: x[1], reverse=True -) # sorting them in order of relevance defined by reranker +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -for i, rank in enumerate(ranking): - print(f'''Reranked result number {i+1} is \"{description_hits[rank[0]]}\"''') +client.DeleteSnapshot(context.Background(), "{collection_name}", "{snapshot_name}") +``` +```http +DELETE /collections/{collection_name}/snapshots/{snapshot_name} ``` -The reranker moves the desired movie to the first position based on relevance. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```bash -Reranked result number 1 is "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions." -Reranked result number 2 is "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household." -... -Reranked result number 9 is "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop." -Reranked result number 10 is "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre." 
+const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.deleteSnapshot("{collection_name}", "{snapshot_name}"); ``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/\#conclusion) Conclusion - -Rerankers refine search results by reordering retrieved candidates through deeper semantic analysis. For efficiency, they should be applied **only to a small subset of retrieved results**. +```rust +use qdrant_client::qdrant::DeleteSnapshotRequestBuilder; +use qdrant_client::Qdrant; -Balance speed and accuracy in search by leveraging the power of rerankers! +let client = Qdrant::from_url("http://localhost:6334").build()?; -##### Was this page useful? +client + .delete_snapshot(DeleteSnapshotRequestBuilder::new( + "{collection_name}", + "{snapshot_name}", + )) + .await?; +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -Thank you for your feedback! 🙏 +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-rerankers.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +client.deleteSnapshotAsync("{collection_name}", "{snapshot_name}").get(); +``` -On this page: +## List snapshot -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-rerankers.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +List of snapshots for a collection: -× +```python +from qdrant_client import QdrantClient -[Powered by](https://qdrant.tech/) +client = QdrantClient(url="http://localhost:6333") -<|page-21-lllmstxt|> -## late-interaction-models -- [Articles](https://qdrant.tech/articles/) -- Any\* Embedding Model Can Become a Late Interaction Model... If You Give It a Chance! +client.list_snapshots(collection_name="{collection_name}") +``` -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +```csharp +using Qdrant.Client; -# Any\* Embedding Model Can Become a Late Interaction Model... If You Give It a Chance! +var client = new QdrantClient("localhost", 6334); -Kacper Ɓukawski +await client.ListSnapshotsAsync("{collection_name}"); +``` -· +```go +import ( + "context" -August 14, 2024 + "github.com/qdrant/go-client/qdrant" +) -![Any* Embedding Model Can Become a Late Interaction Model... If You Give It a Chance!](https://qdrant.tech/articles_data/late-interaction-models/preview/title.jpg) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -\\* At least any open-source model, since you need access to its internals. +client.ListSnapshots(context.Background(), "{collection_name}") +``` -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#you-can-adapt-dense-embedding-models-for-late-interaction) You Can Adapt Dense Embedding Models for Late Interaction +```http +GET /collections/{collection_name}/snapshots +``` -Qdrant 1.10 introduced support for multi-vector representations, with late interaction being a prominent example of this model. 
In essence, both documents and queries are represented by multiple vectors, and identifying the most relevant documents involves calculating a score based on the similarity between the corresponding query and document embeddings. If you’re not familiar with this paradigm, our updated [Hybrid Search](https://qdrant.tech/articles/hybrid-search/) article explains how multi-vector representations can enhance retrieval quality. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -**Figure 1:** We can visualize late interaction between corresponding document-query embedding pairs. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -![Late interaction model](https://qdrant.tech/articles_data/late-interaction-models/late-interaction.png) +client.listSnapshots("{collection_name}"); +``` -There are many specialized late interaction models, such as [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/), but **it appears that regular dense embedding models can also be effectively utilized in this manner**. +```rust +use qdrant_client::Qdrant; -> In this study, we will demonstrate that standard dense embedding models, traditionally used for single-vector representations, can be effectively adapted for late interaction scenarios using output token embeddings as multi-vector representations. +let client = Qdrant::from_url("http://localhost:6334").build()?; -By testing out retrieval with Qdrant’s multi-vector feature, we will show that these models can rival or surpass specialized late interaction models in retrieval performance, while offering lower complexity and greater efficiency. This work redefines the potential of dense models in advanced search pipelines, presenting a new method for optimizing retrieval systems. +client.list_snapshots("{collection_name}").await?; +``` -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#understanding-embedding-models) Understanding Embedding Models +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -The inner workings of embedding models might be surprising to some. The model doesn’t operate directly on the input text; instead, it requires a tokenization step to convert the text into a sequence of token identifiers. Each token identifier is then passed through an embedding layer, which transforms it into a dense vector. Essentially, the embedding layer acts as a lookup table that maps token identifiers to dense vectors. These vectors are then fed into the transformer model as input. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -**Figure 2:** The tokenization step, which takes place before vectors are added to the transformer model. +client.listSnapshotAsync("{collection_name}").get(); +``` -![Input token embeddings](https://qdrant.tech/articles_data/late-interaction-models/input-embeddings.png) +## Retrieve snapshot -The input token embeddings are context-free and are learned during the model’s training process. This means that each token always receives the same embedding, regardless of its position in the text. At this stage, the token embeddings are unaware of the context in which they appear. It is the transformer model’s role to contextualize these embeddings. + -Much has been discussed about the role of attention in transformer models, but in essence, this mechanism is responsible for capturing cross-token relationships. 
Each transformer module takes a sequence of token embeddings as input and produces a sequence of output token embeddings. Both sequences are of the same length, with each token embedding being enriched by information from the other token embeddings at the current step. +To download a specified snapshot from a collection as a file: -**Figure 3:** The mechanism that produces a sequence of output token embeddings. +```http +GET /collections/{collection_name}/snapshots/{snapshot_name} +``` -![Output token embeddings](https://qdrant.tech/articles_data/late-interaction-models/output-embeddings.png) +```shell +curl 'http://{qdrant-url}:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot' \ + -H 'api-key: ********' \ + --output 'filename.snapshot' +``` -**Figure 4:** The final step performed by the embedding model is pooling the output token embeddings to generate a single vector representation of the input text. +## Restore snapshot -![Pooling](https://qdrant.tech/articles_data/late-interaction-models/pooling.png) + -There are several pooling strategies, but regardless of which one a model uses, the output is always a single vector representation, which inevitably loses some information about the input. It’s akin to giving someone detailed, step-by-step directions to the nearest grocery store versus simply pointing in the general direction. While the vague direction might suffice in some cases, the detailed instructions are more likely to lead to the desired outcome. +Snapshots can be restored in three possible ways: -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#using-output-token-embeddings-for-multi-vector-representations) Using Output Token Embeddings for Multi-Vector Representations +1. [Recovering from a URL or local file](#recover-from-a-url-or-local-file) (useful for restoring a snapshot file that is on a remote server or already stored on the node) +3. [Recovering from an uploaded file](#recover-from-an-uploaded-file) (useful for migrating data to a new cluster) +3. [Recovering during start-up](#recover-during-start-up) (useful when running a self-hosted single-node Qdrant instance) -We often overlook the output token embeddings, but the fact is—they also serve as multi-vector representations of the input text. So, why not explore their use in a multi-vector retrieval model, similar to late interaction models? +Regardless of the method used, Qdrant will extract the shard data from the snapshot and properly register shards in the cluster. +If there are other active replicas of the recovered shards in the cluster, Qdrant will replicate them to the newly recovered node by default to maintain data consistency. -### [Anchor](https://qdrant.tech/articles/late-interaction-models/\#experimental-findings) Experimental Findings +### Recover from a URL or local file -We conducted several experiments to determine whether output token embeddings could be effectively used in place of traditional late interaction models. The results are quite promising. 
+*Available as of v0.11.3* -| Dataset | Model | Experiment | NDCG@10 | -| --- | --- | --- | --- | -| SciFact | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.70928 | -| `colbert-ir/colbertv2.0` | late interaction model | 0.69579 | -| `all-MiniLM-L6-v2` | single dense vector representation | 0.64508 | -| output token embeddings | 0.70724 | -| `BAAI/bge-small-en` | single dense vector representation | 0.68213 | -| output token embeddings | 0.73696 | -| | -| NFCorpus | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.34166 | -| `colbert-ir/colbertv2.0` | late interaction model | 0.35036 | -| `all-MiniLM-L6-v2` | single dense vector representation | 0.31594 | -| output token embeddings | 0.35779 | -| `BAAI/bge-small-en` | single dense vector representation | 0.29696 | -| output token embeddings | 0.37502 | -| | -| ArguAna | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.47271 | -| `colbert-ir/colbertv2.0` | late interaction model | 0.44534 | -| `all-MiniLM-L6-v2` | single dense vector representation | 0.50167 | -| output token embeddings | 0.45997 | -| `BAAI/bge-small-en` | single dense vector representation | 0.58857 | -| output token embeddings | 0.57648 | +This method of recovery requires the snapshot file to be downloadable from a URL or exist as a local file on the node (like if you [created the snapshot](#create-snapshot) on this node previously). If instead you need to upload a snapshot file, see the next section. -The [source code for these experiments is open-source](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_all_exact.py) and utilizes [`beir-qdrant`](https://github.com/kacperlukawski/beir-qdrant), an integration of Qdrant with the [BeIR library](https://github.com/beir-cellar/beir). While this package is not officially maintained by the Qdrant team, it may prove useful for those interested in experimenting with various Qdrant configurations to see how they impact retrieval quality. All experiments were conducted using Qdrant in exact search mode, ensuring the results are not influenced by approximate search. +To recover from a URL or local file use the [snapshot recovery endpoint](https://api.qdrant.tech/master/api-reference/snapshots/recover-from-snapshot). This endpoint accepts either a URL like `https://example.com` or a [file URI](https://en.wikipedia.org/wiki/File_URI_scheme) like `file:///tmp/snapshot-2022-10-10.snapshot`. If the target collection does not exist, it will be created. -Even the simple `all-MiniLM-L6-v2` model can be applied in a late interaction model fashion, resulting in a positive impact on retrieval quality. However, the best results were achieved with the `BAAI/bge-small-en` model, which outperformed both sparse and late interaction models. +```python +from qdrant_client import QdrantClient -It’s important to note that ColBERT has not been trained on BeIR datasets, making its performance fully out of domain. Nevertheless, the `all-MiniLM-L6-v2` [training dataset](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#training-data) also lacks any BeIR data, yet it still performs remarkably well. +client = QdrantClient(url="http://qdrant-node-2:6333") -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#comparative-analysis-of-dense-vs-late-interaction-models) Comparative Analysis of Dense vs. 
Late Interaction Models +client.recover_snapshot( + "{collection_name}", + "http://qdrant-node-1:6333/collections/collection_name/snapshots/snapshot-2022-10-10.snapshot", +) +``` -The retrieval quality speaks for itself, but there are other important factors to consider. +```http +PUT /collections/{collection_name}/snapshots/recover +{ + "location": "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot" +} +``` -The traditional dense embedding models we tested are less complex than late interaction or sparse models. With fewer parameters, these models are expected to be faster during inference and more cost-effective to maintain. Below is a comparison of the models used in the experiments: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -| Model | Number of parameters | -| --- | --- | -| `prithivida/Splade_PP_en_v1` | 109,514,298 | -| `colbert-ir/colbertv2.0` | 109,580,544 | -| `BAAI/bge-small-en` | 33,360,000 | -| `all-MiniLM-L6-v2` | 22,713,216 | +const client = new QdrantClient({ host: "localhost", port: 6333 }); -One argument against using output token embeddings is the increased storage requirements compared to ColBERT-like models. For instance, the `all-MiniLM-L6-v2` model produces 384-dimensional output token embeddings, which is three times more than the 128-dimensional embeddings generated by ColBERT-like models. This increase not only leads to higher memory usage but also impacts the computational cost of retrieval, as calculating distances takes more time. Mitigating this issue through vector compression would make a lot of sense. +client.recoverSnapshot("{collection_name}", { + location: "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot", +}); +``` -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#exploring-quantization-for-multi-vector-representations) Exploring Quantization for Multi-Vector Representations + -Binary quantization is generally more effective for high-dimensional vectors, making the `all-MiniLM-L6-v2` model, with its relatively low-dimensional outputs, less ideal for this approach. However, scalar quantization appeared to be a viable alternative. The table below summarizes the impact of quantization on retrieval quality. +### Recover from an uploaded file -| Dataset | Model | Experiment | NDCG@10 | -| --- | --- | --- | --- | -| SciFact | `all-MiniLM-L6-v2` | output token embeddings | 0.70724 | -| output token embeddings (uint8) | 0.70297 | -| | -| NFCorpus | `all-MiniLM-L6-v2` | output token embeddings | 0.35779 | -| output token embeddings (uint8) | 0.35572 | +The snapshot file can also be uploaded as a file and restored using the [recover from uploaded snapshot](https://api.qdrant.tech/master/api-reference/snapshots/recover-from-uploaded-snapshot). This endpoint accepts the raw snapshot data in the request body. If the target collection does not exist, it will be created. -It’s important to note that quantization doesn’t always preserve retrieval quality at the same level, but in this case, scalar quantization appears to have minimal impact on retrieval performance. The effect is negligible, while the memory savings are substantial. 
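For readers who want to see how the uint8 rows above could be produced, here is a minimal sketch (not the benchmark's actual code) of enabling scalar quantization on a multivector collection with the Python client. The collection name is a placeholder, and 384 matches the `all-MiniLM-L6-v2` output token embeddings.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Placeholder collection name; size matches all-MiniLM-L6-v2 output token embeddings
client.create_collection(
    collection_name="token-embeddings-uint8",
    vectors_config=models.VectorParams(
        size=384,
        distance=models.Distance.COSINE,
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
        # Store vector components as 8-bit integers instead of float32
        quantization_config=models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                always_ram=True,
            )
        ),
    ),
)
```

Query code does not need to change; quantization only affects how the stored vectors are encoded.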
+```bash +curl -X POST 'http://{qdrant-url}:6333/collections/{collection_name}/snapshots/upload?priority=snapshot' \ + -H 'api-key: ********' \ + -H 'Content-Type:multipart/form-data' \ + -F 'snapshot=@/path/to/snapshot-2022-10-10.snapshot' +``` -We managed to maintain the original quality while using four times less memory. Additionally, a quantized vector requires 384 bytes, compared to ColBERT’s 512 bytes. This results in a 25% reduction in memory usage, with retrieval quality remaining nearly unchanged. +This method is typically used to migrate data from one cluster to another, so we recommend setting the [priority](#snapshot-priority) to "snapshot" for that use-case. -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#practical-application-enhancing-retrieval-with-dense-models) Practical Application: Enhancing Retrieval with Dense Models +### Recover during start-up -If you’re using one of the sentence transformer models, the output token embeddings are calculated by default. While a single vector representation is more efficient in terms of storage and computation, there’s no need to discard the output token embeddings. According to our experiments, these embeddings can significantly enhance retrieval quality. You can store both the single vector and the output token embeddings in Qdrant, using the single vector for the initial retrieval step and then reranking the results with the output token embeddings. + -**Figure 5:** A single model pipeline that relies solely on the output token embeddings for reranking. +If you have a single-node deployment, you can recover any collection at start-up and it will be immediately available. +Restoring snapshots is done through the Qdrant CLI at start-up time via the `--snapshot` argument which accepts a list of pairs such as `:` -![Single model reranking](https://qdrant.tech/articles_data/late-interaction-models/single-model-reranking.png) +For example: -To demonstrate this concept, we implemented a simple reranking pipeline in Qdrant. This pipeline uses a dense embedding model for the initial oversampled retrieval and then relies solely on the output token embeddings for the reranking step. +```bash +./qdrant --snapshot /snapshots/test-collection-archive.snapshot:test-collection --snapshot /snapshots/test-collection-archive.snapshot:test-copy-collection +``` -### [Anchor](https://qdrant.tech/articles/late-interaction-models/\#single-model-retrieval-and-reranking-benchmarks) Single Model Retrieval and Reranking Benchmarks +The target collection **must** be absent otherwise the program will exit with an error. -Our tests focused on using the same model for both retrieval and reranking. The reported metric is NDCG@10. In all tests, we applied an oversampling factor of 5x, meaning the retrieval step returned 50 results, which were then narrowed down to 10 during the reranking step. Below are the results for some of the BeIR datasets: +If you wish instead to overwrite an existing collection, use the `--force_snapshot` flag with caution. 
-| Dataset | `all-miniLM-L6-v2` | `BAAI/bge-small-en` | -| --- | --- | --- | -| dense embeddings only | dense + reranking | dense embeddings only | dense + reranking | -| --- | --- | --- | --- | -| SciFact | 0.64508 | 0.70293 | 0.68213 | 0.73053 | -| NFCorpus | 0.31594 | 0.34297 | 0.29696 | 0.35996 | -| ArguAna | 0.50167 | 0.45378 | 0.58857 | 0.57302 | -| Touche-2020 | 0.16904 | 0.19693 | 0.13055 | 0.19821 | -| TREC-COVID | 0.47246 | 0.6379 | 0.45788 | 0.53539 | -| FiQA-2018 | 0.36867 | 0.41587 | 0.31091 | 0.39067 | +### Snapshot priority -The source code for the benchmark is publicly available, and [you can find it in the repository of the `beir-qdrant` package](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_reranking.py). +When recovering a snapshot to a non-empty node, there may be conflicts between the snapshot data and the existing data. The "priority" setting controls how Qdrant handles these conflicts. The priority setting is important because different priorities can give very +different end results. The default priority may not be best for all situations. -Overall, adding a reranking step using the same model typically improves retrieval quality. However, the quality of various late interaction models is [often reported based on their reranking performance when BM25 is used for the initial retrieval](https://huggingface.co/mixedbread-ai/mxbai-colbert-large-v1#1-reranking-performance). This experiment aimed to demonstrate how a single model can be effectively used for both retrieval and reranking, and the results are quite promising. +The available snapshot recovery priorities are: -Now, let’s explore how to implement this using the new Query API introduced in Qdrant 1.10. +- `replica`: _(default)_ prefer existing data over the snapshot. +- `snapshot`: prefer snapshot data over existing data. +- `no_sync`: restore snapshot without any additional synchronization. -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#setting-up-qdrant-for-late-interaction) Setting Up Qdrant for Late Interaction +To recover a new collection from a snapshot, you need to set +the priority to `snapshot`. With `snapshot` priority, all data from the snapshot +will be recovered onto the cluster. With `replica` priority _(default)_, you'd +end up with an empty collection because the collection on the cluster did not +contain any points and that source was preferred. -The new Query API in Qdrant 1.10 enables the construction of even more complex retrieval pipelines. We can use the single vector created after pooling for the initial retrieval step and then rerank the results using the output token embeddings. +`no_sync` is for specialized use cases and is not commonly used. It allows +managing shards and transferring shards between clusters manually without any +additional synchronization. Using it incorrectly will leave your cluster in a +broken state. 
-Assuming the collection is named `my-collection` and is configured to store two named vectors: `dense-vector` and `output-token-embeddings`, here’s how such a collection could be created in Qdrant: +To recover from a URL, you specify an additional parameter in the request body: ```python from qdrant_client import QdrantClient, models -client = QdrantClient("http://localhost:6333") +client = QdrantClient(url="http://qdrant-node-2:6333") -client.create_collection( - collection_name="my-collection", - vectors_config={ - "dense-vector": models.VectorParams( - size=384, - distance=models.Distance.COSINE, - ), - "output-token-embeddings": models.VectorParams( - size=384, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - ), - } +client.recover_snapshot( + "{collection_name}", + "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot", + priority=models.SnapshotPriority.SNAPSHOT, ) +``` +```bash +curl -X POST 'http://qdrant-node-1:6333/collections/{collection_name}/snapshots/upload?priority=snapshot' \ + -H 'api-key: ********' \ + -H 'Content-Type:multipart/form-data' \ + -F 'snapshot=@/path/to/snapshot-2022-10-10.snapshot' ``` -Both vectors are of the same size since they are produced by the same `all-MiniLM-L6-v2` model. +```http +PUT /collections/{collection_name}/snapshots/recover +{ + "location": "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot", + "priority": "snapshot" +} +``` -```python -from sentence_transformers import SentenceTransformer +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -model = SentenceTransformer("all-MiniLM-L6-v2") +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.recoverSnapshot("{collection_name}", { + location: "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot", + priority: "snapshot" +}); ``` -Now, instead of using the search API with just a single dense vector, we can create a reranking pipeline. First, we retrieve 50 results using the dense vector, and then we rerank them using the output token embeddings to obtain the top 10 results. +## Snapshots for the whole storage -```python -query = "What else can be done with just all-MiniLM-L6-v2 model?" +*Available as of v0.8.5* -client.query_points( - collection_name="my-collection", - prefetch=[\ - # Prefetch the dense embeddings of the top-50 documents\ - models.Prefetch(\ - query=model.encode(query).tolist(),\ - using="dense-vector",\ - limit=50,\ - )\ - ], - # Rerank the top-50 documents retrieved by the dense embedding model - # and return just the top-10. Please note we call the same model, but - # we ask for the token embeddings by setting the output_value parameter. - query=model.encode(query, output_value="token_embeddings").tolist(), - using="output-token-embeddings", - limit=10, -) +Sometimes it might be handy to create snapshot not just for a single collection, but for the whole storage, including collection aliases. +Qdrant provides a dedicated API for that as well. It is similar to collection-level snapshots, but does not require `collection_name`. 
-``` + -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#try-the-experiment-yourself) Try the Experiment Yourself + -In a real-world scenario, you might take it a step further by first calculating the token embeddings and then performing pooling to obtain the single vector representation. This approach allows you to complete everything in a single pass. +### Create full storage snapshot + +```python +from qdrant_client import QdrantClient -The simplest way to start experimenting with building complex reranking pipelines in Qdrant is by using the forever-free cluster on [Qdrant Cloud](https://cloud.qdrant.io/) and reading [Qdrant’s documentation](https://qdrant.tech/documentation/). +client = QdrantClient(url="http://localhost:6333") -The [source code for these experiments is open-source](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_all_exact.py) and uses [`beir-qdrant`](https://github.com/kacperlukawski/beir-qdrant), an integration of Qdrant with the [BeIR library](https://github.com/beir-cellar/beir). +client.create_full_snapshot() +``` -## [Anchor](https://qdrant.tech/articles/late-interaction-models/\#future-directions-and-research-opportunities) Future Directions and Research Opportunities +```csharp +using Qdrant.Client; -The initial experiments using output token embeddings in the retrieval process have yielded promising results. However, we plan to conduct further benchmarks to validate these findings and explore the incorporation of sparse methods for the initial retrieval. Additionally, we aim to investigate the impact of quantization on multi-vector representations and its effects on retrieval quality. Finally, we will assess retrieval speed, a crucial factor for many applications. +var client = new QdrantClient("localhost", 6334); -##### Was this page useful? +await client.CreateFullSnapshotAsync(); +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```go +import ( + "context" -Thank you for your feedback! 🙏 + "github.com/qdrant/go-client/qdrant" +) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/late-interaction-models.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -On this page: +client.CreateFullSnapshot(context.Background()) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/late-interaction-models.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```http +POST /snapshots +``` -× +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -[Powered by](https://qdrant.tech/) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -<|page-22-lllmstxt|> -## huggingface-datasets -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Load a HuggingFace Dataset +client.createFullSnapshot(); +``` -# [Anchor](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/\#load-and-search-hugging-face-datasets-with-qdrant) Load and Search Hugging Face Datasets with Qdrant +```rust +use qdrant_client::Qdrant; -[Hugging Face](https://huggingface.co/) provides a platform for sharing and using ML models and -datasets. [Qdrant](https://huggingface.co/Qdrant) also publishes datasets along with the -embeddings that you can use to practice with Qdrant and build your applications based on semantic -search. **Please [let us know](https://qdrant.to/discord) if you’d like to see a specific dataset!** +let client = Qdrant::from_url("http://localhost:6334").build()?; -## [Anchor](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/\#arxiv-titles-instructorxl-embeddings) arxiv-titles-instructorxl-embeddings +client.create_full_snapshot().await?; +``` -[This dataset](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) contains -embeddings generated from the paper titles only. Each vector has a payload with the title used to -create it, along with the DOI (Digital Object Identifier). +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -```json -{ - "title": "Nash Social Welfare for Indivisible Items under Separable, Piecewise-Linear Concave Utilities", - "DOI": "1612.05191" -} +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.createFullSnapshotAsync().get(); ``` -You can find a detailed description of the dataset in the [Practice Datasets](https://qdrant.tech/documentation/datasets/#journal-article-titles) -section. If you prefer loading the dataset from a Qdrant snapshot, it also linked there. +### Delete full storage snapshot -Loading the dataset is as simple as using the `load_dataset` function from the `datasets` library: +*Available as of v1.0.0* ```python -from datasets import load_dataset +from qdrant_client import QdrantClient -dataset = load_dataset("Qdrant/arxiv-titles-instructorxl-embeddings") +client = QdrantClient(url="http://localhost:6333") +client.delete_full_snapshot(snapshot_name="{snapshot_name}") ``` -The dataset contains 2,250,000 vectors. This is how you can check the list of the features in the dataset: +```csharp +using Qdrant.Client; -```python -dataset.features +var client = new QdrantClient("localhost", 6334); +await client.DeleteFullSnapshotAsync("{snapshot_name}"); ``` -### [Anchor](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/\#streaming-the-dataset) Streaming the dataset +```go +import ( + "context" -Dataset streaming lets you work with a dataset without downloading it. 
The data is streamed as -you iterate over the dataset. You can read more about it in the [Hugging Face\\ -documentation](https://huggingface.co/docs/datasets/stream). + "github.com/qdrant/go-client/qdrant" +) -```python -from datasets import load_dataset +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -dataset = load_dataset( - "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True -) +client.DeleteFullSnapshot(context.Background(), "{snapshot_name}") +``` +```http +DELETE /snapshots/{snapshot_name} ``` -### [Anchor](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/\#loading-the-dataset-into-qdrant) Loading the dataset into Qdrant +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -You can load the dataset into Qdrant using the [Python SDK](https://github.com/qdrant/qdrant-client). -The embeddings are already precomputed, so you can store them in a collection, that we’re going -to create in a second: +const client = new QdrantClient({ host: "localhost", port: 6333 }); -```python -from qdrant_client import QdrantClient, models +client.deleteFullSnapshot("{snapshot_name}"); +``` -client = QdrantClient("http://localhost:6333") +```rust +use qdrant_client::Qdrant; -client.create_collection( - collection_name="arxiv-titles-instructorxl-embeddings", - vectors_config=models.VectorParams( - size=768, - distance=models.Distance.COSINE, - ), -) +let client = Qdrant::from_url("http://localhost:6334").build()?; +client.delete_full_snapshot("{snapshot_name}").await?; ``` -It is always a good idea to use batching, while loading a large dataset, so let’s do that. -We are going to need a helper function to split the dataset into batches: - -```python -from itertools import islice +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -def batched(iterable, n): - iterator = iter(iterable) - while batch := list(islice(iterator, n)): - yield batch +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.deleteFullSnapshotAsync("{snapshot_name}").get(); ``` -If you are a happy user of Python 3.12+, you can use the [`batched` function from the `itertools`](https://docs.python.org/3/library/itertools.html#itertools.batched) package instead. - -No matter what Python version you are using, you can use the `upsert` method to load the dataset, -batch by batch, into Qdrant: +### List full storage snapshots ```python -batch_size = 100 - -for batch in batched(dataset, batch_size): - ids = [point.pop("id") for point in batch] - vectors = [point.pop("vector") for point in batch] +from qdrant_client import QdrantClient - client.upsert( - collection_name="arxiv-titles-instructorxl-embeddings", - points=models.Batch( - ids=ids, - vectors=vectors, - payloads=batch, - ), - ) +client = QdrantClient("localhost", port=6333) +client.list_full_snapshots() ``` -Your collection is ready to be used for search! Please [let us know using Discord](https://qdrant.to/discord) -if you would like to see more datasets published on Hugging Face hub. - -##### Was this page useful? +```csharp +using Qdrant.Client; -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +var client = new QdrantClient("localhost", 6334); -Thank you for your feedback! 🙏 +await client.ListFullSnapshotsAsync(); +``` -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/huggingface-datasets.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```go +import ( + "context" -On this page: + "github.com/qdrant/go-client/qdrant" +) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/huggingface-datasets.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -× +client.ListFullSnapshots(context.Background()) +``` -[Powered by](https://qdrant.tech/) +```http +GET /snapshots +``` -<|page-23-lllmstxt|> -## reranking-hybrid-search -- [Documentation](https://qdrant.tech/documentation/) -- [Advanced tutorials](https://qdrant.tech/documentation/advanced-tutorials/) -- Reranking in Hybrid Search +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#reranking-hybrid-search-results-with-qdrant-vector-database) Reranking Hybrid Search Results with Qdrant Vector Database +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Hybrid search combines dense and sparse retrieval to deliver precise and comprehensive results. By adding reranking with ColBERT, you can further refine search outputs for maximum relevance. +client.listFullSnapshots(); +``` -In this guide, we’ll show you how to implement hybrid search with reranking in Qdrant, leveraging dense, sparse, and late interaction embeddings to create an efficient, high-accuracy search system. Let’s get started! +```rust +use qdrant_client::Qdrant; -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#overview) Overview +let client = Qdrant::from_url("http://localhost:6334").build()?; -Let’s start by breaking down the architecture: +client.list_full_snapshots().await?; +``` -![image3.png](https://qdrant.tech/documentation/examples/reranking-hybrid-search/image3.png) +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -Processing Dense, Sparse, and Late Interaction Embeddings in Vector Databases (VDB) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#ingestion-stage) Ingestion Stage +client.listFullSnapshotAsync().get(); +``` -Here’s how we’re going to set up the advanced hybrid search. The process is similar to what we did earlier but with a few powerful additions: +### Download full storage snapshot -1. **Documents**: Just like before, we start with the raw input—our set of documents that need to be indexed for search. -2. **Dense Embeddings**: We’ll generate dense embeddings for each document, just like in the basic search. These embeddings capture the deeper, semantic meanings behind the text. -3. **Sparse Embeddings**: This is where it gets interesting. Alongside dense embeddings, we’ll create sparse embeddings using more traditional, keyword-based methods. Specifically, we’ll use BM25, a probabilistic retrieval model. BM25 ranks documents based on how relevant their terms are to a given query, taking into account how often terms appear, document length, and how common the term is across all documents. 
It’s perfect for keyword-heavy searches. -4. **Late Interaction Embeddings**: Now, we add the magic of ColBERT. ColBERT uses a two-stage approach. First, it generates contextualized embeddings for both queries and documents using BERT, and then it performs late interaction—matching those embeddings efficiently using a dot product to fine-tune relevance. This step allows for deeper, contextual understanding, making sure you get the most precise results. -5. **Vector Database**: All of these embeddings—dense, sparse, and late interaction—are stored in a vector database like Qdrant. This allows you to efficiently search, retrieve, and rerank your documents based on multiple layers of relevance. + -![image2.png](https://qdrant.tech/documentation/examples/reranking-hybrid-search/image2.png) +```http +GET /snapshots/{snapshot_name} +``` -Query Retrieval and Reranking Process in Search Systems +## Restore full storage snapshot -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#retrieval-stage) Retrieval Stage +Restoring snapshots can only be done through the Qdrant CLI at startup time. -Now, let’s talk about how we’re going to pull the best results once the user submits a query: +For example: -1. **User’s Query**: The user enters a query, and that query is transformed into multiple types of embeddings. We’re talking about representations that capture both the deeper meaning (dense) and specific keywords (sparse). -2. **Embeddings**: The query gets converted into various embeddings—some for understanding the semantics (dense embeddings) and others for focusing on keyword matches (sparse embeddings). -3. **Hybrid Search**: Our hybrid search uses both dense and sparse embeddings to find the most relevant documents. The dense embeddings ensure we capture the overall meaning of the query, while sparse embeddings make sure we don’t miss out on those key, important terms. -4. **Rerank**: Once we’ve got a set of documents, the final step is reranking. This is where late interaction embeddings come into play, giving you results that are not only relevant but tuned to your query by prioritizing the documents that truly meet the user’s intent. +```bash +./qdrant --storage-snapshot /snapshots/full-snapshot-2022-07-18-11-20-51.snapshot +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#implementation) Implementation +## Storage -Let’s see it in action in this section. +Created, uploaded and recovered snapshots are stored as `.snapshot` files. By +default, they're stored on the [local file system](#local-file-system). You may +also configure to use an [S3 storage](#s3) service for them. -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#additional-setup) Additional Setup +### Local file system -This time around, we’re using FastEmbed—a lightweight Python library designed for generating embeddings, and it supports popular text models right out of the box. First things first, you’ll need to install it: +By default, snapshots are stored at `./snapshots` or at `/qdrant/snapshots` when +using our Docker image. -```python -pip install fastembed +The target directory can be controlled through the [configuration](/documentation/guides/configuration/): +```yaml +storage: + # Specify where you want to store snapshots. + snapshots_path: ./snapshots ``` -* * * +Alternatively you may use the environment variable `QDRANT__STORAGE__SNAPSHOTS_PATH=./snapshots`. 
-Here are the models we’ll be pulling from FastEmbed: +*Available as of v1.3.0* -```python -from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding +While a snapshot is being created, temporary files are placed in the configured +storage directory by default. In case of limited capacity or a slow +network attached disk, you can specify a separate location for temporary files: +```yaml +storage: + # Where to store temporary files + temp_path: /tmp ``` -* * * - -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#ingestion) Ingestion +### S3 -As before, we’ll convert our documents into embeddings, but thanks to FastEmbed, the process is even more straightforward because all the models you need are conveniently available in one location. +*Available as of v1.10.0* -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#embeddings) Embeddings +Rather than storing snapshots on the local file system, you may also configure +to store snapshots in an S3-compatible storage service. To enable this, you must +configure it in the [configuration](/documentation/guides/configuration/) file. -First, let’s load the models we need: +For example, to configure for AWS S3: -```python -dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2") -bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25") -late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0") +```yaml +storage: + snapshots_config: + # Use 's3' to store snapshots on S3 + snapshots_storage: s3 -``` + s3_config: + # Bucket name + bucket: your_bucket_here -* * * + # Bucket region (e.g. eu-central-1) + region: your_bucket_region_here -Now, let’s convert our documents into embeddings: + # Storage access key + # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__ACCESS_KEY` environment variable. + access_key: your_access_key_here -```python -dense_embeddings = list(dense_embedding_model.embed(doc for doc in documents)) -bm25_embeddings = list(bm25_embedding_model.embed(doc for doc in documents)) -late_interaction_embeddings = list(late_interaction_embedding_model.embed(doc for doc in documents)) + # Storage secret key + # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__SECRET_KEY` environment variable. + secret_key: your_secret_key_here + # S3-Compatible Storage URL + # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__ENDPOINT_URL` environment variable. + endpoint_url: your_url_here ``` -* * * +Apart from Snapshots, Qdrant also provides the [Qdrant Migration Tool](https://github.com/qdrant/migration) that supports: +- Migration between Qdrant Cloud instances. +- Migrating vectors from other providers into Qdrant. +- Migrating from Qdrant OSS to Qdrant Cloud. -Since we’re dealing with multiple types of embeddings (dense, sparse, and late interaction), we’ll need to store them in a collection that supports a multi-vector setup. The previous collection we created won’t work here, so we’ll create a new one designed specifically for handling these different types of embeddings. +Follow our [migration guide](/documentation/database-tutorials/migration/) to learn how to effectively use the Qdrant Migration tool. 
-### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#create-collection) Create Collection +<|page-14-lllmstxt|> +# How to Get Started with Qdrant Locally -Now, we’re setting up a new collection in Qdrant for our hybrid search with the right configurations to handle all the different vector types we’re working with. +In this short example, you will use the Python Client to create a Collection, load data into it and run a basic search query. -Here’s how you do it: + -```python -from qdrant_client.models import Distance, VectorParams, models +## Download and run -client.create_collection( - "hybrid-search", - vectors_config={ - "all-MiniLM-L6-v2": models.VectorParams( - size=len(dense_embeddings[0]), - distance=models.Distance.COSINE, - ), - "colbertv2.0": models.VectorParams( - size=len(late_interaction_embeddings[0][0]), - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM, - ), - hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking - ), - }, - sparse_vectors_config={ - "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF - ) - } -) +First, download the latest Qdrant image from Dockerhub: +```bash +docker pull qdrant/qdrant ``` -* * * - -What’s happening here? We’re creating a collection called “hybrid-search”, and we’re configuring it to handle: - -- **Dense embeddings** from the model all-MiniLM-L6-v2 using cosine distance for comparisons. -- **Late interaction embeddings** from colbertv2.0, also using cosine distance, but with a multivector configuration to use the maximum similarity comparator. Note that we set `m=0` in the `colbertv2.0` vector to prevent indexing since it’s not needed for reranking. -- **Sparse embeddings** from BM25 for keyword-based searches. They use `dot_product` for similarity calculation. - -This setup ensures that all the different types of vectors are stored and compared correctly for your hybrid search. - -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#upsert-data) Upsert Data - -Next, we need to insert the documents along with their multiple embeddings into the **hybrid-search** collection: +Then, run the service: -```python -from qdrant_client.models import PointStruct -points = [] -for idx, (dense_embedding, bm25_embedding, late_interaction_embedding, doc) in enumerate(zip(dense_embeddings, bm25_embeddings, late_interaction_embeddings, documents)): +```bash +docker run -p 6333:6333 -p 6334:6334 \ + -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \ + qdrant/qdrant +``` + - point = PointStruct( - id=idx, - vector={ - "all-MiniLM-L6-v2": dense_embedding, - "bm25": bm25_embedding.as_object(), - "colbertv2.0": late_interaction_embedding, - }, - payload={"document": doc} - ) - points.append(point) +Under the default configuration all data will be stored in the `./qdrant_storage` directory. This will also be the only directory that both the Container and the host machine can both see. 
-operation_info = client.upsert( - collection_name="hybrid-search", - points=points -) +Qdrant is now accessible: -``` +- REST API: [localhost:6333](http://localhost:6333) +- Web UI: [localhost:6333/dashboard](http://localhost:6333/dashboard) +- GRPC API: [localhost:6334](http://localhost:6334) -Upload with implicit embeddings computation +## Initialize the client ```python -from qdrant_client.models import PointStruct -points = [] +from qdrant_client import QdrantClient -for idx, doc in enumerate(documents): - point = PointStruct( - id=idx, - vector={ - "all-MiniLM-L6-v2": models.Document(text=doc, model="sentence-transformers/all-MiniLM-L6-v2"), - "bm25": models.Document(text=doc, model="Qdrant/bm25"), - "colbertv2.0": models.Document(text=doc, model="colbert-ir/colbertv2.0"), - }, - payload={"document": doc} - ) - points.append(point) +client = QdrantClient(url="http://localhost:6333") +``` -operation_info = client.upsert( - collection_name="hybrid-search", - points=points -) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; +const client = new QdrantClient({ host: "localhost", port: 6333 }); ``` -* * * - -This code pulls everything together by creating a list of **PointStruct** objects, each containing the embeddings and corresponding documents. +```rust +use qdrant_client::Qdrant; -For each document, it adds: +// The Rust client uses Qdrant's gRPC interface +let client = Qdrant::from_url("http://localhost:6334").build()?; +``` -- **Dense embeddings** for the deep, semantic meaning. -- **BM25 embeddings** for powerful keyword-based search. -- **ColBERT embeddings** for precise contextual interactions. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -Once that’s done, the points are uploaded into our **“hybrid-search”** collection using the upsert method, ensuring everything’s in place. +// The Java client uses Qdrant's gRPC interface +QdrantClient client = new QdrantClient( + QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#retrieval) Retrieval +```csharp +using Qdrant.Client; -For retrieval, it’s time to convert the user’s query into the required embeddings. Here’s how you can do it: +// The C# client uses Qdrant's gRPC interface +var client = new QdrantClient("localhost", 6334); +``` -```python -dense_vectors = next(dense_embedding_model.query_embed(query)) -sparse_vectors = next(bm25_embedding_model.query_embed(query)) -late_vectors = next(late_interaction_embedding_model.query_embed(query)) +```go +import "github.com/qdrant/go-client/qdrant" +// The Go client uses Qdrant's gRPC interface +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) ``` -* * * + -The real magic of hybrid search lies in the **prefetch** parameter. This lets you run multiple sub-queries in one go, combining the power of dense and sparse embeddings. Here’s how to set it up, after which we execute the hybrid search: +## Create a collection + +You will be storing all of your vector data in a Qdrant collection. Let's call it `test_collection`. This collection will be using a dot product distance metric to compare vectors. 
```python -prefetch = [\ - models.Prefetch(\ - query=dense_vectors,\ - using="all-MiniLM-L6-v2",\ - limit=20,\ - ),\ - models.Prefetch(\ - query=models.SparseVector(**sparse_vectors.as_object()),\ - using="bm25",\ - limit=20,\ - ),\ - ] +from qdrant_client.models import Distance, VectorParams +client.create_collection( + collection_name="test_collection", + vectors_config=VectorParams(size=4, distance=Distance.DOT), +) ``` -* * * - -This code kicks off a hybrid search by running two sub-queries: +```typescript +await client.createCollection("test_collection", { + vectors: { size: 4, distance: "Dot" }, +}); +``` -- One using dense embeddings from “all-MiniLM-L6-v2” to capture the semantic meaning of the query. -- The other using sparse embeddings from BM25 for strong keyword matching. +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, VectorParamsBuilder}; -Each sub-query is limited to 20 results. These sub-queries are bundled together using the prefetch parameter, allowing them to run in parallel. +client + .create_collection( + CreateCollectionBuilder::new("test_collection") + .vectors_config(VectorParamsBuilder::new(4, Distance::Dot)), + ) + .await?; +``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#rerank) Rerank +```java +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; -Now that we’ve got our initial hybrid search results, it’s time to rerank them using late interaction embeddings for maximum precision. Here’s how you can do it: +client.createCollectionAsync("test_collection", + VectorParams.newBuilder().setDistance(Distance.Dot).setSize(4).build()).get(); +``` -```python -results = client.query_points( - "hybrid-search", - prefetch=prefetch, - query=late_vectors, - using="colbertv2.0", - with_payload=True, - limit=10, -) +```csharp +using Qdrant.Client.Grpc; +await client.CreateCollectionAsync(collectionName: "test_collection", vectorsConfig: new VectorParams +{ + Size = 4, Distance = Distance.Dot +}); ``` -Query points with implicit embeddings computation +```go +import ( + "context" -```python -prefetch = [\ - models.Prefetch(\ - query=models.Document(text=query, model="sentence-transformers/all-MiniLM-L6-v2"),\ - using="all-MiniLM-L6-v2",\ - limit=20,\ - ),\ - models.Prefetch(\ - query=models.Document(text=query, model="Qdrant/bm25"),\ - using="bm25",\ - limit=20,\ - ),\ - ] -results = client.query_points( - "hybrid-search", - prefetch=prefetch, - query=models.Document(text=query, model="colbert-ir/colbertv2.0"), - using="colbertv2.0", - with_payload=True, - limit=10, + "github.com/qdrant/go-client/qdrant" ) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 4, + Distance: qdrant.Distance_Cosine, + }), +}) ``` -* * * +## Add vectors -Let’s look at how the positions change after applying reranking. Notice how some documents shift in rank based on their relevance according to the late interaction embeddings. +Let's now add a few vectors with a payload. Payloads are other data you want to associate with the vector: -| | **Document** | **First Query Rank** | **Second Query Rank** | **Rank Change** | -| --- | --- | --- | --- | --- | -| | In machine learning, feature scaling is the process of normalizing the range of independent variables or features. 
The goal is to ensure that all features contribute equally to the model, especially in algorithms like SVM or k-nearest neighbors where distance calculations matter. | 1 | 1 | No Change | -| | Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. This is particularly important for gradient descent-based algorithms where features with larger scales could disproportionately impact the cost function. | 2 | 6 | Moved Down | -| | Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling, which ensures that features with larger numerical ranges don’t dominate the learning process. | 3 | 4 | Moved Down | -| | Data preprocessing steps, including feature scaling, can significantly impact the performance of machine learning models, making it a crucial part of the modeling pipeline. | 5 | 2 | Moved Up | +```python +from qdrant_client.models import PointStruct -Great! We’ve now explored how reranking works and successfully implemented it. +operation_info = client.upsert( + collection_name="test_collection", + wait=True, + points=[ + PointStruct(id=1, vector=[0.05, 0.61, 0.76, 0.74], payload={"city": "Berlin"}), + PointStruct(id=2, vector=[0.19, 0.81, 0.75, 0.11], payload={"city": "London"}), + PointStruct(id=3, vector=[0.36, 0.55, 0.47, 0.94], payload={"city": "Moscow"}), + PointStruct(id=4, vector=[0.18, 0.01, 0.85, 0.80], payload={"city": "New York"}), + PointStruct(id=5, vector=[0.24, 0.18, 0.22, 0.44], payload={"city": "Beijing"}), + PointStruct(id=6, vector=[0.35, 0.08, 0.11, 0.44], payload={"city": "Mumbai"}), + ], +) -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#best-practices-in-reranking) Best Practices in Reranking +print(operation_info) +``` -Reranking can dramatically improve the relevance of search results, especially when combined with hybrid search. Here are some best practices to keep in mind: +```typescript +const operationInfo = await client.upsert("test_collection", { + wait: true, + points: [ + { id: 1, vector: [0.05, 0.61, 0.76, 0.74], payload: { city: "Berlin" } }, + { id: 2, vector: [0.19, 0.81, 0.75, 0.11], payload: { city: "London" } }, + { id: 3, vector: [0.36, 0.55, 0.47, 0.94], payload: { city: "Moscow" } }, + { id: 4, vector: [0.18, 0.01, 0.85, 0.80], payload: { city: "New York" } }, + { id: 5, vector: [0.24, 0.18, 0.22, 0.44], payload: { city: "Beijing" } }, + { id: 6, vector: [0.35, 0.08, 0.11, 0.44], payload: { city: "Mumbai" } }, + ], +}); -- **Implement Hybrid Reranking**: Blend keyword-based (sparse) and vector-based (dense) search results for a more comprehensive ranking system. -- **Continuous Testing and Monitoring**: Regularly evaluate your reranking models to avoid overfitting and make timely adjustments to maintain performance. -- **Balance Relevance and Latency**: Reranking can be computationally expensive, so aim for a balance between relevance and speed. Therefore, the first step is to retrieve the relevant documents and then use reranking on it. +console.debug(operationInfo); +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/\#conclusion) Conclusion +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -Reranking is a powerful tool that boosts the relevance of search results, especially when combined with hybrid search methods. 
While it can add some latency due to its complexity, applying it to a smaller, pre-filtered subset of results ensures both speed and relevance. +let points = vec![ + PointStruct::new(1, vec![0.05, 0.61, 0.76, 0.74], [("city", "Berlin".into())]), + PointStruct::new(2, vec![0.19, 0.81, 0.75, 0.11], [("city", "London".into())]), + PointStruct::new(3, vec![0.36, 0.55, 0.47, 0.94], [("city", "Moscow".into())]), + // ..truncated +]; -Qdrant offers an easy-to-use API to get started with your own search engine, so if you’re ready to dive in, sign up for free at [Qdrant Cloud](https://qdrant.tech/) and start building +let response = client + .upsert_points(UpsertPointsBuilder::new("test_collection", points).wait(true)) + .await?; -##### Was this page useful? +dbg!(response); +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```java +import java.util.List; +import java.util.Map; -Thank you for your feedback! 🙏 +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/reranking-hybrid-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Points.UpdateResult; -On this page: +UpdateResult operationInfo = + client + .upsertAsync( + "test_collection", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) + .putAllPayload(Map.of("city", value("Berlin"))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors(vectors(0.19f, 0.81f, 0.75f, 0.11f)) + .putAllPayload(Map.of("city", value("London"))) + .build(), + PointStruct.newBuilder() + .setId(id(3)) + .setVectors(vectors(0.36f, 0.55f, 0.47f, 0.94f)) + .putAllPayload(Map.of("city", value("Moscow"))) + .build())) + // Truncated + .get(); -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/reranking-hybrid-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +System.out.println(operationInfo); +``` -× +```csharp +using Qdrant.Client.Grpc; -[Powered by](https://qdrant.tech/) +var operationInfo = await client.UpsertAsync(collectionName: "test_collection", points: new List +{ + new() + { + Id = 1, + Vectors = new float[] + { + 0.05f, 0.61f, 0.76f, 0.74f + }, + Payload = { + ["city"] = "Berlin" + } + }, + new() + { + Id = 2, + Vectors = new float[] + { + 0.19f, 0.81f, 0.75f, 0.11f + }, + Payload = { + ["city"] = "London" + } + }, + new() + { + Id = 3, + Vectors = new float[] + { + 0.36f, 0.55f, 0.47f, 0.94f + }, + Payload = { + ["city"] = "Moscow" + } + }, + // Truncated +}); -<|page-24-lllmstxt|> -## code-search -- [Documentation](https://qdrant.tech/documentation/) -- [Advanced tutorials](https://qdrant.tech/documentation/advanced-tutorials/) -- Search Through Your Codebase +Console.WriteLine(operationInfo); +``` -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#navigate-your-codebase-with-semantic-search-and-qdrant) Navigate Your Codebase with Semantic Search and Qdrant +```go +import ( + "context" + "fmt" -| Time: 45 min | Level: Intermediate | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/qdrant/examples/blob/master/code-search/code-search.ipynb) | | -| --- | --- | --- | --- | + "github.com/qdrant/go-client/qdrant" +) -You too can enrich your applications with Qdrant semantic search. In this -tutorial, we describe how you can use Qdrant to navigate a codebase, to help -you find relevant code snippets. As an example, we will use the [Qdrant](https://github.com/qdrant/qdrant) -source code itself, which is mostly written in Rust. +operationInfo, err := client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "test_collection", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), + Payload: qdrant.NewValueMap(map[string]any{"city": "Berlin"}), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectors(0.19, 0.81, 0.75, 0.11), + Payload: qdrant.NewValueMap(map[string]any{"city": "London"}), + }, + { + Id: qdrant.NewIDNum(3), + Vectors: qdrant.NewVectors(0.36, 0.55, 0.47, 0.94), + Payload: qdrant.NewValueMap(map[string]any{"city": "Moscow"}), + }, + // Truncated + }, +}) +if err != nil { + panic(err) +} +fmt.Println(operationInfo) +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#the-approach) The approach +**Response:** -We want to search codebases using natural semantic queries, and searching for code based on similar logic. You can set up these tasks with embeddings: +```python +operation_id=0 status= +``` -1. General usage neural encoder for Natural Language Processing (NLP), in our case -`sentence-transformers/all-MiniLM-L6-v2`. -2. Specialized embeddings for code-to-code similarity search. We use the -`jina-embeddings-v2-base-code` model. +```typescript +{ operation_id: 0, status: 'completed' } +``` -To prepare our code for `all-MiniLM-L6-v2`, we preprocess the code to text that -more closely resembles natural language. The Jina embeddings model supports a -variety of standard programming languages, so there is no need to preprocess the -snippets. We can use the code as is. +```rust +PointsOperationResponse { + result: Some( + UpdateResult { + operation_id: Some( + 0, + ), + status: Completed, + }, + ), + time: 0.00094027, +} +``` -NLP-based search is based on function signatures, but code search may return -smaller pieces, such as loops. So, if we receive a particular function signature -from the NLP model and part of its implementation from the code model, we merge -the results and highlight the overlap. +```java +operation_id: 0 +status: Completed +``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#data-preparation) Data preparation +```csharp +{ "operationId": "0", "status": "Completed" } +``` -Chunking the application sources into smaller parts is a non-trivial task. In -general, functions, class methods, structs, enums, and all the other language-specific -constructs are good candidates for chunks. They are big enough to -contain some meaningful information, but small enough to be processed by -embedding models with a limited context window. You can also use docstrings, -comments, and other metadata can be used to enrich the chunks with additional -information. 
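To make the idea of function-level chunking concrete, here is a deliberately naive sketch that cuts a Rust source string at function definitions. It is only an illustration of the concept; the tutorial itself relies on a language-aware toolchain (rust-analyzer and LSIF, described below), not on regexes like this.

```python
import re

def naive_rust_function_chunks(source: str) -> list[str]:
    # Find the start of each (possibly pub/async) function definition and
    # slice the file between consecutive starts. Real pipelines should use
    # a proper parser instead of a regex.
    starts = [m.start() for m in re.finditer(r"(?m)^\s*(?:pub\s+)?(?:async\s+)?fn\s+\w+", source)]
    if not starts:
        return [source]
    starts.append(len(source))
    return [source[a:b].strip() for a, b in zip(starts, starts[1:])]
```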
+```go +operation_id:0 status:Acknowledged +``` -![Code chunking strategy](https://qdrant.tech/documentation/tutorials/code-search/data-chunking.png) +## Run a query -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#parsing-the-codebase) Parsing the codebase +Let's ask a basic question - Which of our stored vectors are most similar to the query vector `[0.2, 0.1, 0.9, 0.7]`? -While our example uses Rust, you can use our approach with any other language. -You can parse code with a [Language Server Protocol](https://microsoft.github.io/language-server-protocol/) ( **LSP**) -compatible tool. You can use an LSP to build a graph of the codebase, and then extract chunks. -We did our work with the [rust-analyzer](https://rust-analyzer.github.io/). -We exported the parsed codebase into the [LSIF](https://microsoft.github.io/language-server-protocol/specifications/lsif/0.4.0/specification/) -format, a standard for code intelligence data. Next, we used the LSIF data to -navigate the codebase and extract the chunks. For details, see our [code search\\ -demo](https://github.com/qdrant/demo-code-search). +```python +search_result = client.query_points( + collection_name="test_collection", + query=[0.2, 0.1, 0.9, 0.7], + with_payload=False, + limit=3 +).points -We then exported the chunks into JSON documents with not only the code itself, -but also context with the location of the code in the project. For example, see -the description of the `await_ready_for_timeout` function from the `IsReady` -struct in the `common` module: +print(search_result) +``` -```json -{ - "name":"await_ready_for_timeout", - "signature":"fn await_ready_for_timeout (& self , timeout : Duration) -> bool", - "code_type":"Function", - "docstring":"= \" Return `true` if ready, `false` if timed out.\"", - "line":44, - "line_from":43, - "line_to":51, - "context":{ - "module":"common", - "file_path":"lib/collection/src/common/is_ready.rs", - "file_name":"is_ready.rs", - "struct_name":"IsReady", - "snippet":" /// Return `true` if ready, `false` if timed out.\n pub fn await_ready_for_timeout(&self, timeout: Duration) -> bool {\n let mut is_ready = self.value.lock();\n if !*is_ready {\n !self.condvar.wait_for(&mut is_ready, timeout).timed_out()\n } else {\n true\n }\n }\n" - } -} +```typescript +let searchResult = await client.query( + "test_collection", { + query: [0.2, 0.1, 0.9, 0.7], + limit: 3 +}); +console.debug(searchResult.points); ``` -You can examine the Qdrant structures, parsed in JSON, in the [`structures.jsonl`\\ -file](https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl) -in our Google Cloud Storage bucket. Download it and use it as a source of data for our code search. 
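If you prefer to fetch the file from Python rather than with the wget command used below, a small sketch with `requests` works as well (the tutorial itself only uses the shell command; this is just a convenience):

```python
import requests

url = "https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl"

response = requests.get(url, timeout=60)
response.raise_for_status()

with open("structures.jsonl", "wb") as fp:
    fp.write(response.content)
```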
+```rust +use qdrant_client::qdrant::QueryPointsBuilder; -```shell -wget https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl +let search_result = client + .query( + QueryPointsBuilder::new("test_collection") + .query(vec![0.2, 0.1, 0.9, 0.7]) + ) + .await?; +dbg!(search_result); ``` -Next, load the file and parse the lines into a list of dictionaries: +```java +import java.util.List; -```python -import json +import io.qdrant.client.grpc.Points.ScoredPoint; +import io.qdrant.client.grpc.Points.QueryPoints; -structures = [] -with open("structures.jsonl", "r") as fp: - for i, row in enumerate(fp): - entry = json.loads(row) - structures.append(entry) +import static io.qdrant.client.QueryFactory.nearest; +List searchResult = + client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("test_collection") + .setLimit(3) + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .build()).get(); + +System.out.println(searchResult); ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#code-to-natural-language-conversion) Code to _natural language_ conversion +```csharp +var searchResult = await client.QueryAsync( + collectionName: "test_collection", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + limit: 3, +); -Each programming language has its own syntax which is not a part of the natural -language. Thus, a general-purpose model probably does not understand the code -as is. We can, however, normalize the data by removing code specifics and -including additional context, such as module, class, function, and file name. -We took the following steps: +Console.WriteLine(searchResult); +``` -1. Extract the signature of the function, method, or other code construct. -2. Divide camel case and snake case names into separate words. -3. Take the docstring, comments, and other important metadata. -4. Build a sentence from the extracted data using a predefined template. -5. Remove the special characters and replace them with spaces. +```go +import ( + "context" + "fmt" -As input, expect dictionaries with the same structure. Define a `textify` -function to do the conversion. We’ll use an `inflection` library to convert -with different naming conventions. + "github.com/qdrant/go-client/qdrant" +) -```shell -pip install inflection +searchResult, err := client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "test_collection", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), +}) +if err != nil { + panic(err) +} +fmt.Println(searchResult) ``` -Once all dependencies are installed, we define the `textify` function: - -```python -import inflection -import re +**Response:** -from typing import Dict, Any +```json +[ + { + "id": 4, + "version": 0, + "score": 1.362, + "payload": null, + "vector": null + }, + { + "id": 1, + "version": 0, + "score": 1.273, + "payload": null, + "vector": null + }, + { + "id": 3, + "version": 0, + "score": 1.208, + "payload": null, + "vector": null + } +] +``` -def textify(chunk: Dict[str, Any]) -> str: - # Get rid of all the camel case / snake case - # - inflection.underscore changes the camel case to snake case - # - inflection.humanize converts the snake case to human readable form - name = inflection.humanize(inflection.underscore(chunk["name"])) - signature = inflection.humanize(inflection.underscore(chunk["signature"])) +The results are returned in decreasing similarity order. Note that payload and vector data is missing in these results by default. 
+See [payload and vector in the result](/documentation/concepts/search/#payload-and-vector-in-the-result) on how to enable it. - # Check if docstring is provided - docstring = "" - if chunk["docstring"]: - docstring = f"that does {chunk['docstring']} " +## Add a filter - # Extract the location of that snippet of code - context = ( - f"module {chunk['context']['module']} " - f"file {chunk['context']['file_name']}" - ) - if chunk["context"]["struct_name"]: - struct_name = inflection.humanize( - inflection.underscore(chunk["context"]["struct_name"]) - ) - context = f"defined in struct {struct_name} {context}" +We can narrow down the results further by filtering by payload. Let's find the closest results that include "London". - # Combine all the bits and pieces together - text_representation = ( - f"{chunk['code_type']} {name} " - f"{docstring}" - f"defined as {signature} " - f"{context}" - ) +```python +from qdrant_client.models import Filter, FieldCondition, MatchValue - # Remove any special characters and concatenate the tokens - tokens = re.split(r"\W", text_representation) - tokens = filter(lambda x: x, tokens) - return " ".join(tokens) +search_result = client.query_points( + collection_name="test_collection", + query=[0.2, 0.1, 0.9, 0.7], + query_filter=Filter( + must=[FieldCondition(key="city", match=MatchValue(value="London"))] + ), + with_payload=True, + limit=3, +).points +print(search_result) ``` -Now we can use `textify` to convert all chunks into text representations: - -```python -text_representations = list(map(textify, structures)) +```typescript +searchResult = await client.query("test_collection", { + query: [0.2, 0.1, 0.9, 0.7], + filter: { + must: [{ key: "city", match: { value: "London" } }], + }, + with_payload: true, + limit: 3, +}); +console.debug(searchResult); ``` -This is how the `await_ready_for_timeout` function description appears: +```rust +use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder}; -```text -Function Await ready for timeout that does Return true if ready false if timed out defined as Fn await ready for timeout self timeout duration bool defined in struct Is ready module common file is_ready rs +let search_result = client + .query( + QueryPointsBuilder::new("test_collection") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .filter(Filter::must([Condition::matches( + "city", + "London".to_string(), + )])) + .with_payload(true), + ) + .await?; +dbg!(search_result); ``` -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#ingestion-pipeline) Ingestion pipeline - -Next, we’ll build a pipeline for vectorizing the data and set up a semantic search mechanism for both embedding models. - -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#building-qdrant-collection) Building Qdrant collection - -We use the `qdrant-client` library with the `fastembed` extra to interact with the Qdrant server and generate vector embeddings locally. Let’s install it: +```java +import static io.qdrant.client.ConditionFactory.matchKeyword; -```shell -pip install "qdrant-client[fastembed]" +List searchResult = + client.queryAsync(QueryPoints.newBuilder() + .setCollectionName("test_collection") + .setLimit(3) + .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London"))) + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setWithPayload(enable(true)) + .build()).get(); +System.out.println(searchResult); ``` -Of course, we need a running Qdrant server for vector search. 
If you need one, -you can [use a local Docker container](https://qdrant.tech/documentation/quick-start/) -or deploy it using the [Qdrant Cloud](https://cloud.qdrant.io/). -You can use either to follow this tutorial. Configure the connection parameters: +```csharp +using static Qdrant.Client.Grpc.Conditions; -```python -QDRANT_URL = "https://my-cluster.cloud.qdrant.io:6333" # http://localhost:6333 for local instance -QDRANT_API_KEY = "THIS_IS_YOUR_API_KEY" # None for local instance +var searchResult = await client.QueryAsync( + collectionName: "test_collection", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + filter: MatchKeyword("city", "London"), + limit: 3, + payloadSelector: true +); +Console.WriteLine(searchResult); ``` -Then use the library to create a collection: - -```python -from qdrant_client import QdrantClient, models +```go +import ( + "context" + "fmt" -client = QdrantClient(QDRANT_URL, api_key=QDRANT_API_KEY) -client.create_collection( - "qdrant-sources", - vectors_config={ - "text": models.VectorParams( - size=client.get_embedding_size( - model_name="sentence-transformers/all-MiniLM-L6-v2" - ), - distance=models.Distance.COSINE, - ), - "code": models.VectorParams( - size=client.get_embedding_size( - model_name="jinaai/jina-embeddings-v2-base-code" - ), - distance=models.Distance.COSINE, - ), - }, + "github.com/qdrant/go-client/qdrant" ) -``` - -Our newly created collection is ready to accept the data. Let’s upload the embeddings: +searchResult, err := client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "test_collection", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, + }, + WithPayload: qdrant.NewWithPayload(true), +}) +if err != nil { + panic(err) +} -```python -import uuid +fmt.Println(searchResult) +``` -# Extract the code snippets from the structures to a separate list -code_snippets = [\ - structure["context"]["snippet"] for structure in structures\ -] +**Response:** -points = [\ - models.PointStruct(\ - id=uuid.uuid4().hex,\ - vector={\ - "text": models.Document(\ - text=text, model="sentence-transformers/all-MiniLM-L6-v2"\ - ),\ - "code": models.Document(\ - text=code, model="jinaai/jina-embeddings-v2-base-code"\ - ),\ - },\ - payload=structure,\ - )\ - for text, code, structure in zip(text_representations, code_snippets, structures)\ +```json +[ + { + "id": 2, + "version": 0, + "score": 0.871, + "payload": { + "city": "London" + }, + "vector": null + } ] - -# Note: This might take a while since inference happens implicitly. -# Parallel processing can help. -# But too many processes may trigger swap memory and hurt performance. -client.upload_points("qdrant-sources", points=points, batch_size=64) - ``` -Internally, `qdrant-client` uses [FastEmbed](https://github.com/qdrant/fastembed) to implicitly convert our documents into their vector representations. -The uploaded points are immediately available for search. Next, query the -collection to find relevant code snippets. + -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#querying-the-codebase) Querying the codebase +You have just conducted vector search. You loaded vectors into a database and queried the database with a vector of your own. Qdrant found the closest results and presented you with a similarity score. -We use one of the models to search the collection. Start with text embeddings. -Run the following query “ _How do I count points in a collection?_”. 
Review the -results. +## Next steps -```python -query = "How do I count points in a collection?" +Now you know how Qdrant works. Getting started with [Qdrant Cloud](/documentation/cloud/quickstart-cloud/) is just as easy. [Create an account](https://qdrant.to/cloud) and use our SaaS completely free. We will take care of infrastructure maintenance and software updates. -hits = client.query_points( - "qdrant-sources", - query=models.Document(text=query, model="sentence-transformers/all-MiniLM-L6-v2"), - using="text", - limit=5, -).points +To move onto some more complex examples of vector search, read our [Tutorials](/documentation/tutorials/) and create your own app with the help of our [Examples](/documentation/examples/). -``` +**Note:** There is another way of running Qdrant locally. If you are a Python developer, we recommend that you try Local Mode in [Qdrant Client](https://github.com/qdrant/qdrant-client), as it only takes a few moments to get setup. -Now, review the results. The following table lists the module, the file name -and score. Each line includes a link to the signature, as a code block from -the file. +<|page-15-lllmstxt|> +## Hidden Structure -| module | file\_name | score | signature | -| --- | --- | --- | --- | -| toc | point\_ops.rs | 0.59448624 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/storage/src/content_manager/toc/point_ops.rs#L120) | -| operations | types.rs | 0.5493385 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub struct CountRequestInternal`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/operations/types.rs#L831) | -| collection\_manager | segments\_updater.rs | 0.5121002 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub(crate) fn upsert_points<'a, T>`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection_manager/segments_updater.rs#L339) | -| collection | point\_ops.rs | 0.5063539 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection/point_ops.rs#L213) | -| map\_index | mod.rs | 0.49973983 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn get_points_with_value_count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/map_index/mod.rs#L88) | +When working with large collections of documents, images, or other arrays of unstructured data, it often becomes useful to understand the big picture. +Examining data points individually is not always the best way to grasp the structure of the data. -It seems we were able to find some relevant code structures. 
Let’s try the same with the code embeddings: +{{< figure src="/articles_data/distance-based-exploration/no-context-data.png" alt="Data visualization" caption="Datapoints without context, pretty much useless" >}} -```python -hits = client.query_points( - "qdrant-sources", - query=models.Document(text=query, model="jinaai/jina-embeddings-v2-base-code"), - using="code", - limit=5, -).points +As numbers in a table obtain meaning when plotted on a graph, visualising distances (similar/dissimilar) between unstructured data items can reveal hidden structures and patterns. -``` +{{< figure src="/articles_data/distance-based-exploration/data-on-chart.png" alt="Data visualization" caption="Vizualized chart, very intuitive" >}} +There are many tools to investigate data similarity, and Qdrant's [1.12 release](https://qdrant.tech/blog/qdrant-1.12.x/) made it much easier to start this investigation. With the new [Distance Matrix API](/documentation/concepts/explore/#distance-matrix), Qdrant handles the most computationally expensive part of the process—calculating the distances between data points. -Output: +In many implementations, the distance matrix calculation was part of the clustering or visualization processes, requiring either brute-force computation or building a temporary index. With Qdrant, however, the data is already indexed, and the distance matrix can be computed relatively cheaply. -| module | file\_name | score | signature | -| --- | --- | --- | --- | -| field\_index | geo\_index.rs | 0.73278356 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | -| numeric\_index | mod.rs | 0.7254976 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | -| map\_index | mod.rs | 0.7124739 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | -| map\_index | mod.rs | 0.7124739 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L429) | -| fixtures | payload\_context\_fixture.rs | 0.706204 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | +In this article, we will explore several methods for data exploration using the Distance Matrix API. -While the scores retrieved by different models are not comparable, but we can -see that the results are different. Code and text embeddings can capture -different aspects of the codebase. We can use both models to query the collection -and then combine the results to get the most relevant code snippets, from a single batch request. 
+## Dimensionality Reduction -```python -responses = client.query_batch_points( - collection_name="qdrant-sources", - requests=[\ - models.QueryRequest(\ - query=models.Document(\ - text=query, model="sentence-transformers/all-MiniLM-L6-v2"\ - ),\ - using="text",\ - with_payload=True,\ - limit=5,\ - ),\ - models.QueryRequest(\ - query=models.Document(\ - text=query, model="jinaai/jina-embeddings-v2-base-code"\ - ),\ - using="code",\ - with_payload=True,\ - limit=5,\ - ),\ - ], -) +Initially, we might want to visualize an entire dataset, or at least a large portion of it, at a glance. However, high-dimensional data cannot be directly visualized. We must apply dimensionality reduction techniques to convert data into a lower-dimensional representation while preserving important data properties. -results = [response.points for response in responses] +In this article, we will use [UMAP](https://github.com/lmcinnes/umap) as our dimensionality reduction algorithm. -``` +Here is a **very** simplified but intuitive explanation of UMAP: -Output: +1. *Randomly generate points in 2D space*: Assign a random 2D point to each high-dimensional point. +2. *Compute distance matrix for high-dimensional points*: Calculate distances between all pairs of points. +3. *Compute distance matrix for 2D points*: Perform similarly to step 2. +4. *Match both distance matrices*: Adjust 2D points to minimize differences. -| module | file\_name | score | signature | -| --- | --- | --- | --- | -| toc | point\_ops.rs | 0.59448624 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/storage/src/content_manager/toc/point_ops.rs#L120) | -| operations | types.rs | 0.5493385 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub struct CountRequestInternal`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/operations/types.rs#L831) | -| collection\_manager | segments\_updater.rs | 0.5121002 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub(crate) fn upsert_points<'a, T>`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection_manager/segments_updater.rs#L339) | -| collection | point\_ops.rs | 0.5063539 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection/point_ops.rs#L213) | -| map\_index | mod.rs | 0.49973983 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn get_points_with_value_count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/map_index/mod.rs#L88) | -| field\_index | geo\_index.rs | 0.73278356 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | -| numeric\_index | mod.rs | 0.7254976 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | -| map\_index | mod.rs | 0.7124739 | 
[![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | -| map\_index | mod.rs | 0.7124739 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L429) | -| fixtures | payload\_context\_fixture.rs | 0.706204 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | +{{< figure src="/articles_data/distance-based-exploration/umap.png" alt="UMAP" caption="Canonical example of UMAP results, [source](https://github.com/lmcinnes/umap?tab=readme-ov-file#performance-and-examples)" >}} -This is one example of how you can use different models and combine the results. -In a real-world scenario, you might run some reranking and deduplication, as -well as additional processing of the results. +UMAP preserves the relative distances between high-dimensional points; the actual coordinates are not essential. If we already have the distance matrix, step 2 can be skipped entirely. -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#code-search-demo) Code search demo +Let's use Qdrant to calculate the distance matrix and apply UMAP. +We will use one of the default datasets perfect for experimenting in Qdrant--[Midjourney Styles dataset](https://midlibrary.io/). -Our [Code search demo](https://code-search.qdrant.tech/) uses the following process: +Use this command to download and import the dataset into Qdrant: -1. The user sends a query. -2. Both models vectorize that query simultaneously. We get two different -vectors. -3. Both vectors are used in parallel to find relevant snippets. We expect -5 examples from the NLP search and 20 examples from the code search. -4. Once we retrieve results for both vectors, we merge them in one of the -following scenarios: -1. If both methods return different results, we prefer the results from - the general usage model (NLP). -2. If there is an overlap between the search results, we merge overlapping - snippets. +```http +PUT /collections/midlib/snapshots/recover +{ + "location": "http://snapshots.qdrant.io/midlib.snapshot" +} +``` -In the screenshot, we search for `flush of wal`. The result -shows relevant code, merged from both models. Note the highlighted -code in lines 621-629. It’s where both models agree. +
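The merging step described in the demo process above can be sketched in plain Python. This is only an illustration of the idea (detecting overlap by file path and line range, and otherwise keeping the NLP results), not the actual implementation used by the demo:

```python
def merge_results(nlp_hits: list[dict], code_hits: list[dict]) -> list[dict]:
    # Each hit is assumed to be a payload shaped like the structures above,
    # with "line_from", "line_to" and "context" -> "file_path" fields.
    merged = []
    for nlp_hit in nlp_hits:
        combined = dict(nlp_hit)
        for code_hit in code_hits:
            same_file = (
                nlp_hit["context"]["file_path"] == code_hit["context"]["file_path"]
            )
            overlap = (
                same_file
                and nlp_hit["line_from"] <= code_hit["line_to"]
                and code_hit["line_from"] <= nlp_hit["line_to"]
            )
            if overlap:
                # Overlapping snippets are merged into one wider line range.
                combined["line_from"] = min(combined["line_from"], code_hit["line_from"])
                combined["line_to"] = max(combined["line_to"], code_hit["line_to"])
        merged.append(combined)
    return merged
```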
+We also need to prepare our python enviroment: -![Results from both models, with overlap](https://qdrant.tech/documentation/tutorials/code-search/code-search-demo-example.png) +```bash +pip install umap-learn seaborn matplotlib qdrant-client +``` -Now you see semantic code intelligence, in action. +Import the necessary libraries: -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#grouping-the-results) Grouping the results +```python +# Used to talk to Qdrant +from qdrant_client import QdrantClient +# Package with original UMAP implementation +from umap import UMAP +# Python implementation for sparse matrices +from scipy.sparse import csr_matrix +# For vizualization +import seaborn as sns +``` -You can improve the search results, by grouping them by payload properties. -In our case, we can group the results by the module. If we use code embeddings, -we can see multiple results from the `map_index` module. Let’s group the -results and assume a single result per module: +Establish connection to Qdrant: ```python -results = client.query_points_groups( - collection_name="qdrant-sources", - using="code", - query=models.Document(text=query, model="jinaai/jina-embeddings-v2-base-code"), - group_by="context.module", - limit=5, - group_size=1, -) - +client = QdrantClient("http://localhost:6333") ``` -Output: +
-| module | file\_name | score | signature | -| --- | --- | --- | --- | -| field\_index | geo\_index.rs | 0.73278356 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | -| numeric\_index | mod.rs | 0.7254976 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | -| map\_index | mod.rs | 0.7124739 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | -| fixtures | payload\_context\_fixture.rs | 0.706204 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | -| hnsw\_index | graph\_links.rs | 0.6998417 | [![](https://qdrant.tech/documentation/tutorials/code-search/github-mark.png)`fn num_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/hnsw_index/graph_links.rs#L477) | +After this is done, we can compute the distance matrix: -With the grouping feature, we get more diverse results. +```python -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/code-search/\#summary) Summary +# Request distances matrix from Qdrant +# `_offsets` suffix defines a format of the output matrix. +result = client.search_matrix_offsets( + collection_name="midlib", + sample=1000, # Select a subset of the data, as the whole dataset might be too large + limit=20, # For performance reasons, limit the number of closest neighbors to consider +) -This tutorial demonstrates how to use Qdrant to navigate a codebase. For an -end-to-end implementation, review the [code search\\ -notebook](https://colab.research.google.com/github/qdrant/examples/blob/master/code-search/code-search.ipynb) and the -[code-search-demo](https://github.com/qdrant/demo-code-search). You can also check out [a running version of the code\\ -search demo](https://code-search.qdrant.tech/) which exposes Qdrant codebase for search with a web interface. +# Convert distances matrix to python-native format +matrix = csr_matrix( + (result.scores, (result.offsets_row, result.offsets_col)) +) -##### Was this page useful? +# Make the matrix symmetric, as UMAP expects it. +# Distance matrix is always symmetric, but qdrant only computes half of it. +matrix = matrix + matrix.T +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Now we can apply UMAP to the distance matrix: -Thank you for your feedback! 🙏 +```python +umap = UMAP( + metric="precomputed", # We provide ready-made distance matrix + n_components=2, # output dimension + n_neighbors=20, # Same as the limit in the search_matrix_offsets +) -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/code-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +vectors_2d = umap.fit_transform(matrix) +``` -On this page: +That's all that is needed to get the 2d representation of the data. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/code-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +{{< figure src="/articles_data/distance-based-exploration/umap-midlib.png" alt="UMAP on Midlib" caption="UMAP applied to Midlib dataset" >}} -× + -[Powered by](https://qdrant.tech/) +UMAP isn't the only algorithm compatible with our distance matrix API. For example, `scikit-learn` also offers: -<|page-25-lllmstxt|> -## product-quantization -- [Articles](https://qdrant.tech/articles/) -- Product Quantization in Vector Search \| Qdrant +- [Isomap](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html) - Non-linear dimensionality reduction through Isometric Mapping. +- [SpectralEmbedding](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html) - Forms an affinity matrix given by the specified function and applies spectral decomposition to the corresponding graph Laplacian. +- [TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) - well-known algorithm for dimensionality reduction. -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +## Clustering -# Product Quantization in Vector Search \| Qdrant +Another approach to data structure understanding is clustering--grouping similar items. -Kacper Ɓukawski +*Note that there's no universally best clustering criterion or algorithm.* -· +{{< figure src="/articles_data/distance-based-exploration/clustering.png" alt="Clustering" caption="Clustering example, [source](https://scikit-learn.org/)" width="80%" >}} -May 30, 2023 +Many clustering algorithms accept precomputed distance matrix as input, so we can use the same distance matrix we calculated before. -![Product Quantization in Vector Search | Qdrant](https://qdrant.tech/articles_data/product-quantization/preview/title.jpg) +Let's consider a simple example of clustering the Midlib dataset with **KMeans algorithm**. -# [Anchor](https://qdrant.tech/articles/product-quantization/\#product-quantization-demystified-streamlining-efficiency-in-data-management) Product Quantization Demystified: Streamlining Efficiency in Data Management +From [scikit-learn.cluster documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) we know that `fit()` method of KMeans algorithm prefers as an input: -Qdrant 1.1.0 brought the support of [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/), -a technique of reducing the memory footprint by even four times, by using `int8` to represent -the values that would be normally represented by `float32`. -The memory usage in [vector search](https://qdrant.tech/solutions/) might be reduced even further! Please welcome **Product** -**Quantization**, a brand-new feature of Qdrant 1.2.0! +> `X : {array-like, sparse matrix} of shape (n_samples, n_features)`: +> Training instances to cluster. 
It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. If a sparse matrix is passed, a copy will be made if it’s not in CSR format. -## [Anchor](https://qdrant.tech/articles/product-quantization/\#what-is-product-quantization) What is Product Quantization? -Product Quantization converts floating-point numbers into integers like every other quantization -method. However, the process is slightly more complicated than [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/) and is more customizable, so you can find the sweet spot between memory usage and search precision. This article -covers all the steps required to perform Product Quantization and the way it’s implemented in Qdrant. +So we can re-use `matrix` from the previous example: -## [Anchor](https://qdrant.tech/articles/product-quantization/\#how-does-product-quantization-work) How Does Product Quantization Work? -Let’s assume we have a few vectors being added to the collection and that our optimizer decided -to start creating a new segment. +```python +from sklearn.cluster import KMeans -![A list of raw vectors](https://qdrant.tech/articles_data/product-quantization/raw-vectors.png) +# Initialize KMeans with 10 clusters +kmeans = KMeans(n_clusters=10) -### [Anchor](https://qdrant.tech/articles/product-quantization/\#cutting-the-vector-into-pieces) Cutting the vector into pieces +# Generate index of the cluster each sample belongs to +cluster_labels = kmeans.fit_predict(matrix) +``` -First of all, our vectors are going to be divided into **chunks** aka **subvectors**. The number -of chunks is configurable, but as a rule of thumb - the lower it is, the higher the compression rate. -That also comes with reduced search precision, but in some cases, you may prefer to keep the memory -usage as low as possible. +With this simple code, we have clustered the data into 10 clusters, while the main CPU-intensive part of the process was done by Qdrant. -![A list of chunked vectors](https://qdrant.tech/articles_data/product-quantization/chunked-vectors.png) +{{< figure src="/articles_data/distance-based-exploration/clustering-midlib.png" alt="Clustering on Midlib" caption="Clustering applied to Midlib dataset" >}} -Qdrant API allows choosing the compression ratio from 4x up to 64x. In our example, we selected 16x, -so each subvector will consist of 4 floats (16 bytes), and it will eventually be represented by -a single byte. -### [Anchor](https://qdrant.tech/articles/product-quantization/\#clustering) Clustering +
+How to plot this chart -The chunks of our vectors are then used as input for clustering. Qdrant uses the K-means algorithm, -with K=256. It was selected a priori, as this is the maximum number of values a single byte -represents. As a result, we receive a list of 256 centroids for each chunk and assign each of them -a unique id. **The clustering is done separately for each group of chunks.** +```python +sns.scatterplot( + # Coordinates obtained from UMAP + x=vectors_2d[:, 0], y=vectors_2d[:, 1], + # Color datapoints by cluster + hue=cluster_labels, + palette=sns.color_palette("pastel", 10), + legend="full", +) +``` +
-![Clustered chunks of vectors](https://qdrant.tech/articles_data/product-quantization/chunks-clustering.png) -Each chunk of a vector might now be mapped to the closest centroid. That’s where we lose the precision, -as a single point will only represent a whole subspace. Instead of using a subvector, we can store -the id of the closest centroid. If we repeat that for each chunk, we can approximate the original -embedding as a vector of subsequent ids of the centroids. The dimensionality of the created vector -is equal to the number of chunks, in our case 2. +## Graphs -![A new vector built from the ids of the centroids](https://qdrant.tech/articles_data/product-quantization/vector-of-ids.png) +Clustering and dimensionality reduction both aim to provide a more transparent overview of the data. +However, they share a common characteristic - they require a training step before the results can be visualized. -### [Anchor](https://qdrant.tech/articles/product-quantization/\#full-process) Full process +This also implies that introducing new data points necessitates re-running the training step, which may be computationally expensive. -All those steps build the following pipeline of Product Quantization: +Graphs offer an alternative approach to data exploration, enabling direct, interactive visualization of relationships between data points. +In a graph representation, each data point is a node, and similarities between data points are represented as edges connecting the nodes. -![Full process of Product Quantization](https://qdrant.tech/articles_data/product-quantization/full-process.png) +Such a graph can be rendered in real-time using [force-directed layout](https://en.wikipedia.org/wiki/Force-directed_graph_drawing) algorithms, which aim to minimize the system's energy by repositioning nodes dynamically--the more similar the data points are, the stronger the edges between them. -## [Anchor](https://qdrant.tech/articles/product-quantization/\#measuring-the-distance) Measuring the distance +Adding new data points to the graph is as straightforward as inserting new nodes and edges without the need to re-run any training steps. -Vector search relies on the distances between the points. Enabling Product Quantization slightly changes -the way it has to be calculated. The query vector is divided into chunks, and then we figure the overall -distance as a sum of distances between the subvectors and the centroids assigned to the specific id of -the vector we compare to. We know the coordinates of the centroids, so that’s easy. +In practice, rendering a graph for an entire dataset at once may be computationally expensive and overwhelming for the user. Therefore, let's explore a few strategies to address this issue. -![Calculating the distance of between the query and the stored vector](https://qdrant.tech/articles_data/product-quantization/distance-calculation.png) +### Expanding from a single node -#### [Anchor](https://qdrant.tech/articles/product-quantization/\#qdrant-implementation) Qdrant implementation +This is the simplest approach, where we start with a single node and expand the graph by adding the most similar nodes to the graph. -Search operation requires calculating the distance to multiple points. Since we calculate the -distance to a finite set of centroids, those might be precomputed and reused. Qdrant creates -a lookup table for each query, so it can then simply sum up several terms to measure the -distance between a query and all the centroids. 
+{{< figure src="/articles_data/distance-based-exploration/graph.gif" alt="Graph" caption="Graph representation of the data" >}} -| | Centroid 0 | Centroid 1 | 
 |
-| --- | --- | --- | --- |
-| **Chunk 0** | 0.14213 | 0.51242 | 
 |
-| **Chunk 1** | 0.08421 | 0.00142 | 
 |
-| **
** | 
 | 
 | 

 | + -## [Anchor](https://qdrant.tech/articles/product-quantization/\#product-quantization-benchmarks) Product Quantization Benchmarks +### Sampling from a collection -Product Quantization comes with a cost - there are some additional operations to perform so -that the performance might be reduced. However, memory usage might be reduced drastically as -well. As usual, we did some benchmarks to give you a brief understanding of what you may expect. +Expanding a single node works well if you want to explore neighbors of a single point, but what if you want to explore the whole dataset? +If your dataset is small enough, you can render relations for all the data points at once. But it is a rare case in practice. -Again, we reused the same pipeline as in [the other benchmarks we published](https://qdrant.tech/benchmarks/). We -selected [Arxiv-titles-384-angular-no-filters](https://github.com/qdrant/ann-filtering-benchmark-datasets) -and [Glove-100](https://github.com/erikbern/ann-benchmarks/) datasets to measure the impact -of Product Quantization on precision and time. Both experiments were launched with EF=128. -The results are summarized in the tables: +Instead, we can sample a subset of the data and render the graph for this subset. +This way, we can get a good overview of the data without overwhelming the user with too much information. -#### [Anchor](https://qdrant.tech/articles/product-quantization/\#glove-100) Glove-100 +Let's try to do so in [Qdrant's Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool): -| | Original | 1D clusters | 2D clusters | 3D clusters | -| --- | --- | --- | --- | --- | -| Mean precision | 0.7158 | 0.7143 | 0.6731 | 0.5854 | -| Mean search time | 2336 ”s | 2750 ”s | 2597 ”s | 2534 ”s | -| Compression | x1 | x4 | x8 | x12 | -| Upload & indexing time | 147 s | 339 s | 217 s | 178 s | +```json +{ + "limit": 5, # node neighbors to consider + "sample": 100 # nodes +} +``` -Product Quantization increases both indexing and searching time. The higher the compression ratio, -the lower the search precision. The main benefit is undoubtedly the reduced usage of memory. +{{< figure src="/articles_data/distance-based-exploration/graph-sampled.png" alt="Graph" caption="Graph representation of the data ([Qdrant's Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool))">}} -#### [Anchor](https://qdrant.tech/articles/product-quantization/\#arxiv-titles-384-angular-no-filters) Arxiv-titles-384-angular-no-filters +This graph captures some high-level structure of the data, but as you might have noticed, it is quite noisy. +This is because the differences in similarities are relatively small, and they might be overwhelmed by the stretches and compressions of the force-directed layout algorithm. -| | Original | 1D clusters | 2D clusters | 4D clusters | 8D clusters | -| --- | --- | --- | --- | --- | --- | -| Mean precision | 0.9837 | 0.9677 | 0.9143 | 0.8068 | 0.6618 | -| Mean search time | 2719 ”s | 4134 ”s | 2947 ”s | 2175 ”s | 2053 ”s | -| Compression | x1 | x4 | x8 | x16 | x32 | -| Upload & indexing time | 332 s | 921 s | 597 s | 481 s | 474 s | +To make the graph more readable, let's concentrate on the most important similarities and build a so called [Minimum/Maximum Spanning Tree](https://en.wikipedia.org/wiki/Minimum_spanning_tree). -It turns out that in some cases, Product Quantization may not only reduce the memory usage, -but also the search time. 
+```json +{ + "limit": 5, + "sample": 100, + "tree": true +} +``` -## [Anchor](https://qdrant.tech/articles/product-quantization/\#product-quantization-vs-scalar-quantization) Product Quantization vs Scalar Quantization +{{< figure src="/articles_data/distance-based-exploration/spanning-tree.png" alt="Graph" caption="Spanning tree of the graph ([Qdrant's Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool))" width="80%" >}} -Compared to [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/), Product Quantization offers a higher compression rate. However, this comes with considerable trade-offs in accuracy, and at times, in-RAM search speed. +This algorithm will only keep the most important edges and remove the rest while keeping the graph connected. +By doing so, we can reveal clusters of the data and the most important relations between them. -Product Quantization tends to be favored in certain specific scenarios: +In some sense, this is similar to hierarchical clustering, but with the ability to interactively explore the data. +Another analogy might be a dynamically constructed mind map. -- Deployment in a low-RAM environment where the limiting factor is the number of disk reads rather than the vector comparison itself -- Situations where the dimensionality of the original vectors is sufficiently high -- Cases where indexing speed is not a critical factor -In circumstances that do not align with the above, Scalar Quantization should be the preferred choice. + -Thank you for your feedback! 🙏 +## Conclusion -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/product-quantization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Vector similarity goes beyond looking up the nearest neighbors--it provides a powerful tool for data exploration. +Many algorithms can construct human-readable data representations, and Qdrant makes using them easy. -On this page: +Several data exploration instruments are available in the Qdrant Web UI ([Visualization and Graph Exploration Tools](https://qdrant.tech/articles/web-ui-gsoc/)), and for more advanced use cases, you could directly utilise our distance matrix API. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/product-quantization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Try it with your data and see what hidden structures you can reveal! -× +<|page-16-lllmstxt|> +Finding enough time to study all the modern solutions while keeping your production running is rarely feasible. +Dense retrievers, hybrid retrievers, late interaction
 How do they work, and where do they fit best? +If only we could compare retrievers as easily as products on Amazon! -[Powered by](https://qdrant.tech/) +We explored the most popular modern sparse neural retrieval models and broke them down for you. +By the end of this article, you’ll have a clear understanding of the current landscape in sparse neural retrieval and how to navigate through complex, math-heavy research papers with sky-high NDCG scores without getting overwhelmed. -<|page-26-lllmstxt|> -## platforms -- [Documentation](https://qdrant.tech/documentation/) -- Platforms +[The first part](#sparse-neural-retrieval-evolution) of this article is theoretical, comparing different approaches used in +modern sparse neural retrieval.\ +[The second part](#splade-in-qdrant) is more practical, showing how the best model in modern sparse neural retrieval, `SPLADE++`, +can be used in Qdrant and recommendations on when to choose sparse neural retrieval for your solutions. -## [Anchor](https://qdrant.tech/documentation/platforms/\#platform-integrations) Platform Integrations +## Sparse Neural Retrieval: As If Keyword-Based Retrievers Understood Meaning -| Platform | Description | -| --- | --- | -| [Apify](https://qdrant.tech/documentation/platforms/apify/) | Platform to build web scrapers and automate web browser tasks. | -| [Bubble](https://qdrant.tech/documentation/platforms/bubble/) | Development platform for application development with a no-code interface | -| [BuildShip](https://qdrant.tech/documentation/platforms/buildship/) | Low-code visual builder to create APIs, scheduled jobs, and backend workflows. | -| [DocsGPT](https://qdrant.tech/documentation/platforms/docsgpt/) | Tool for ingesting documentation sources and enabling conversations and queries. | -| [Keboola](https://qdrant.tech/documentation/platforms/keboola/) | Data operations platform that unifies data sources, transformations, and ML deployments. | -| [Kotaemon](https://qdrant.tech/documentation/platforms/kotaemon/) | Open-source & customizable RAG UI for chatting with your documents. | -| [Make](https://qdrant.tech/documentation/platforms/make/) | Cloud platform to build low-code workflows by integrating various software applications. | -| [Mulesoft Anypoint](https://qdrant.tech/documentation/platforms/mulesoft/) | Integration platform to connect applications, data, and devices across environments. | -| [N8N](https://qdrant.tech/documentation/platforms/n8n/) | Platform for node-based, low-code workflow automation. | -| [Pipedream](https://qdrant.tech/documentation/platforms/pipedream/) | Platform for connecting apps and developing event-driven automation. | -| [Portable.io](https://qdrant.tech/documentation/platforms/portable/) | Cloud platform for developing and deploying ELT transformations. | -| [PrivateGPT](https://qdrant.tech/documentation/platforms/privategpt/) | Tool to ask questions about your documents using local LLMs emphasising privacy. | -| [Rivet](https://qdrant.tech/documentation/platforms/rivet/) | A visual programming environment for building AI agents with LLMs. | -| [ToolJet](https://qdrant.tech/documentation/platforms/tooljet/) | A low-code platform for business apps that connect to DBs, cloud storages and more. | -| [Vectorize](https://qdrant.tech/documentation/platforms/vectorize/) | Platform to automate data extraction, RAG evaluation, deploy RAG pipelines. | +**Keyword-based (lexical) retrievers** like BM25 provide a good explainability. 
+If a document matches a query, it’s easy to understand why: query terms are present in the document,
+and if these are rare terms, they are more important for retrieval.
+![Keyword-based (Lexical) Retrieval](/articles_data/modern-sparse-neural-retrieval/LexicalRetrievers.png)
+Thanks to their mechanism of exact term matching, they are super fast at retrieval.
+A simple **inverted index**, which maps back from a term to a list of documents where this term occurs, saves time on checking millions of documents.
+![Inverted Index](/articles_data/modern-sparse-neural-retrieval/InvertedIndex.png)
+Lexical retrievers are still a strong baseline in retrieval tasks.
+However, by design, they’re unable to bridge **vocabulary** and **semantic mismatch** gaps.
+Imagine searching for a “*tasty cheese*” in an online store and not having a chance to get “*Gouda*” or “*Brie*” in your shopping basket.
+**Dense retrievers**, based on machine learning models which encode documents and queries in dense vector representations,
+are capable of bridging this gap and finding you “*a piece of Gouda*”.
+![Dense Retrieval](/articles_data/modern-sparse-neural-retrieval/DenseRetrievers.png)
+However, explainability here suffers: why is this query representation close to this document representation?
+Why, when searching for “*cheese*”, are we also offered “*mouse traps*”? What does each number in this vector representation mean?
+Which one of them is capturing the cheesiness?
+Without a solid understanding, balancing result quality and resource consumption becomes challenging.
+Since, hypothetically, any document could match a query, relying on an inverted index with exact matching isn’t feasible.
+This doesn’t mean dense retrievers are inherently slower. However, lexical retrieval has been around long enough to inspire several effective architectural choices, which are often worth reusing.
-<|page-27-lllmstxt|>
-## qdrant-cluster-management
+Sooner or later, somebody was bound to come along and say,
+“*Wait, but what if I want something time-proof like BM25 but with semantic understanding?*”
-# [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#managing-a-qdrant-cluster) Managing a Qdrant Cluster
+## Sparse Neural Retrieval Evolution
-The most minimal QdrantCluster configuration is:
+Imagine searching for a “*flabbergasting murder*” story.
+“*Flabbergasting*” is a rarely used word, so a keyword-based retriever, for example, BM25, will assign huge importance to it.
+Consequently, there is a high chance that a text unrelated to any crimes but mentioning something “*flabbergasting*” will pop up in the top results.
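+To see why the rare word wins, it helps to look at BM25’s IDF component in isolation. The snippet below is only an illustrative sketch with made-up document frequencies, not code from any particular retriever:
+
+```python
+import math
+
+# Hypothetical collection statistics, chosen only to illustrate the effect.
+N = 10_000                                                  # documents in the collection
+document_frequency = {"flabbergasting": 3, "murder": 1_200}
+
+def bm25_idf(df: int, n: int) -> float:
+    # A common BM25 IDF formula: the rarer the term, the larger the weight.
+    return math.log(1 + (n - df + 0.5) / (df + 0.5))
+
+for term, df in document_frequency.items():
+    print(f"{term}: idf = {bm25_idf(df, N):.2f}")
+# With these made-up counts, "flabbergasting" gets ~7.96 while "murder" gets ~2.12,
+# so a single occurrence of the rare word can dominate the final score.
+```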
-```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.11.3" - size: 1 - resources: - cpu: 100m - memory: "1Gi" - storage: "2Gi" +What if we could instead of relying on term frequency in a document as a proxy of term’s importance as it happens in BM25, +directly predict a term’s importance? The goal is for rare but non-impactful terms to be assigned a much smaller weight than important terms with the same frequency, while both would be equally treated in the BM25 scenario. -``` +How can we determine if one term is more important than another? +Word impact is related to its meaning, and its meaning can be derived from its context (words which surround this particular word). +That’s how dense contextual embedding models come into the picture. + +All the sparse retrievers are based on the idea of taking a model which produces contextual dense vector representations for terms +and teaching it to produce sparse ones. Very often, +[Bidirectional Encoder Representations from the Transformers (BERT)](https://huggingface.co/docs/transformers/en/model_doc/bert) is used as a +base model, and a very simple trainable neural network is added on top of it to sparsify the representations out. +Training this small neural network is usually done by sampling from the [MS MARCO](https://microsoft.github.io/msmarco/) dataset a query, +relevant and irrelevant to it documents and shifting the parameters of the neural network in the direction of relevancy. -The `id` should be unique across all Qdrant clusters in the same namespace, the `name` must follow the above pattern and the `cluster-id` and `customer-id` labels are mandatory. -There are lots more configuration options to configure scheduling, security, networking, and more. For full details see the [Qdrant Private Cloud API Reference](https://qdrant.tech/documentation/private-cloud/api-reference/). +### The Pioneer Of Sparse Neural Retrieval -## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#scaling-a-cluster) Scaling a Cluster +![Deep Contextualized Term Weighting (DeepCT)](/articles_data/modern-sparse-neural-retrieval/DeepCT.png) +The authors of one of the first sparse retrievers, the [`Deep Contextualized Term Weighting framework (DeepCT)`](https://arxiv.org/pdf/1910.10687), +predict an integer word’s impact value separately for each unique word in a document and a query. +They use a linear regression model on top of the contextual representations produced by the basic BERT model, the model's output is rounded. -To scale a cluster, update the CPU, memory and storage resources in the QdrantCluster spec. The Qdrant operator will automatically adjust the cluster configuration. This operation is highly available on a multi-node cluster with replicated collections. +When documents are uploaded into a database, the importance of words in a document is predicted by a trained linear regression model +and stored in the inverted index in the same way as term frequencies in BM25 retrievers. +Then, the retrieval process is identical to the BM25 one. 
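+As a rough mental model of this retrieval scheme (illustrative only, not the actual DeepCT code, with hand-written impact scores standing in for the regression model’s predictions), the inverted index stores a predicted impact per term per document instead of a term frequency, and query-time scoring stays a plain sum over matched postings:
+
+```python
+from collections import defaultdict
+
+# Hypothetical per-document term impacts; a real system would predict these
+# with the trained BERT + linear regression head at indexing time.
+predicted_impacts = {
+    "doc1": {"murder": 7, "detective": 5, "flabbergasting": 1},
+    "doc2": {"flabbergasting": 4, "party": 6},
+}
+
+# Inverted index: term -> list of (doc_id, impact), exactly where BM25 keeps term frequencies.
+inverted_index = defaultdict(list)
+for doc_id, term_impacts in predicted_impacts.items():
+    for term, impact in term_impacts.items():
+        inverted_index[term].append((doc_id, impact))
+
+def search(query_terms):
+    # Retrieval is BM25-like: walk the postings of the query terms and sum the stored weights.
+    scores = defaultdict(float)
+    for term in query_terms:
+        for doc_id, impact in inverted_index.get(term, []):
+            scores[doc_id] += impact
+    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
+
+print(search(["flabbergasting", "murder"]))  # doc1 ranks first despite the rare word
+```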
-## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#upgrading-the-qdrant-version) Upgrading the Qdrant version
+***Why is DeepCT not a perfect solution?*** To train linear regression, the authors needed to provide the true value (**ground truth**)
+of each word’s importance so the model could “see” what the right answer should be.
+This score is hard to define in a way that truly expresses query-document relevancy.
+Which score should the most relevant word to a query get when this word is taken from a five-page document? The second most relevant word? The third?
-To upgrade the Qdrant version of a database cluster, update the `version` field in the QdrantCluster spec. The Qdrant operator will automatically upgrade the cluster to the new version. The upgrade process is highly available on a multi-node cluster with replicated collections.
+### Sparse Neural Retrieval on Relevance Objective
-Note, that you should not skip minor versions when upgrading. For example, if you are running version `v1.11.3`, you can upgrade to `v1.11.5` or `v1.12.6`, but not directly to `v1.13.0`.
+![DeepImpact](/articles_data/modern-sparse-neural-retrieval/DeepImpact.png)
+It’s much easier to define whether a document as a whole is relevant or irrelevant to a query.
+That’s why the [`DeepImpact`](https://arxiv.org/pdf/2104.12016) Sparse Neural Retriever authors directly used the relevancy between a query and a document as a training objective.
+They take BERT’s contextualized embeddings of the document’s words, transform them through a simple 2-layer neural network into a single scalar
+score, and sum these scores up for each word overlapping with the query.
+The training objective is to make this score reflect the relevance between the query and the document.
-## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#exposing-a-cluster) Exposing a Cluster
+***Why is DeepImpact not a perfect solution?***
+When converting texts into dense vector representations,
+the BERT model does not work at the word level. Sometimes, it breaks words into parts.
+For example, the word “*vector*” will be processed by BERT as one piece, but a word BERT hasn’t seen before will be cut into pieces,
+[as “Qdrant” turns into “Q”, “#dra” and “#nt”](https://huggingface.co/spaces/Xenova/the-tokenizer-playground).
-By default, a QdrantCluster will be exposed through an internal `ClusterIP` service. To expose the cluster to the outside world, you can create a `NodePort` service, a `LoadBalancer` service or an `Ingress` resource.
+The DeepImpact model (like the DeepCT model) takes the first piece BERT produces for a word and discards the rest.
+However, what can one find searching for “*Q*” instead of “*Qdrant*”?
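+You can observe this behaviour with the WordPiece tokenizer that ships with any BERT-based checkpoint. The snippet below reuses the SPLADE++ vocabulary referenced later in this article; the exact sub-word pieces shown in the comment are an assumption and depend on the vocabulary:
+
+```python
+from tokenizers import Tokenizer
+
+# Any BERT-style WordPiece tokenizer will do; this repository is used again further below.
+tokenizer = Tokenizer.from_pretrained("Qdrant/SPLADE_PP_en_v1")
+
+encoding = tokenizer.encode("Qdrant is a vector search engine")
+print(encoding.tokens)
+# Expect something like ['[CLS]', 'q', '##dra', '##nt', 'is', 'a', 'vector', 'search', 'engine', '[SEP]'].
+# A model that keeps only the first piece of each word effectively searches for "q" instead of "Qdrant".
+```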
-This is an example on how to create a QdrantCluster with a `LoadBalancer` service: +### Know Thine Tokenization -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.11.3" - size: 1 - resources: - cpu: 100m - memory: "1Gi" - storage: "2Gi" - service: - type: LoadBalancer - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: nlb +![Term Independent Likelihood MoDEl v2 (TILDE v2)](/articles_data/modern-sparse-neural-retrieval/TILDEv2.png) +To solve the problems of DeepImpact's architecture, the [`Term Independent Likelihood MoDEl (TILDEv2)`](https://arxiv.org/pdf/2108.08513) model generates +sparse encodings on a level of BERT’s representations, not on words level. Aside from that, its authors use the identical architecture +to the DeepImpact model. -``` +***Why is TILDEv2 not a perfect solution?*** +A single scalar importance score value might not be enough to capture all distinct meanings of a word. +**Homonyms** (pizza, cocktail, flower, and female name “*Margherita*”) are one of the troublemakers in information retrieval. -Especially if you create a LoadBalancer Service, you may need to provide annotations for the loadbalancer configration. Please refer to the documention of your cloud provider for more details. +### Sparse Neural Retriever Which Understood Homonyms -Examples: +![COntextualized Inverted List (COIL)](/articles_data/modern-sparse-neural-retrieval/COIL.png) -- [AWS EKS LoadBalancer annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/) -- [Azure AKS Public LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/load-balancer-standard) -- [Azure AKS Internal LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/internal-lb) -- [GCP GKE LoadBalancer annotations](https://cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer-parameters) +If one value for the term importance score is insufficient, we could describe the term’s importance in a vector form! +Authors of the [`COntextualized Inverted List (COIL)`](https://arxiv.org/pdf/2104.07186) model based their work on this idea. +Instead of squeezing 768-dimensional BERT’s contextualised embeddings into one value, +they down-project them (through the similar “relevance” training objective) to 32 dimensions. +Moreover, not to miss a detail, they also encode the query terms as vectors. -## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#authentication-and-authorization) Authentication and Authorization +For each vector representing a query token, COIL finds the closest match (using the maximum dot product) vector of the same token in a document. +So, for example, if we are searching for “*Revolut bank \*” and a document in a database has the sentence +“*Vivid bank \ was moved to the bank of Amstel \*”, out of two “banks”, +the first one will have a bigger value of a dot product with a “*bank*” in the query, and it will count towards the final score. +The final relevancy score of a document is a sum of scores of query terms matched. + +***Why is COIL not a perfect solution?*** This way of defining the importance score captures deeper semantics; +more meaning comes with more values used to describe it. 
+However, storing 32-dimensional vectors for every term is far more expensive, +and an inverted index does not work as-is with this architecture. -Authentication information is provided by Kubernetes secrets. +### Back to the Roots -One way to create a secret is with kubectl: +![Universal COntextualized Inverted List (UniCOIL)](/articles_data/modern-sparse-neural-retrieval/UNICOIL.png) +[`Universal COntextualized Inverted List (UniCOIL)`](https://arxiv.org/pdf/2106.14807), made by the authors of COIL as a follow-up, goes back to producing a scalar value as the importance score +rather than a vector, leaving unchanged all other COIL design decisions. \ +It optimizes resources consumption but the deep semantics understanding tied to COIL architecture is again lost. -```shell -kubectl create secret generic qdrant-api-key --from-literal=api-key=your-secret-api-key --from-literal=read-only-api-key=your-secret-read-only-api-key --namespace qdrant-private-cloud +## Did we Solve the Vocabulary Mismatch Yet? -``` +With the retrieval based on the exact matching, +however sophisticated the methods to predict term importance are, we can’t match relevant documents which have no query terms in them. +If you’re searching for “*pizza*” in a book of recipes, you won’t find “*Margherita*”. -The resulting secret will look like this: +A way to solve this problem is through the so-called **document expansion**. +Let’s append words which could be in a potential query searching for this document. +So, the “*Margherita*” document becomes “*Margherita pizza*”. Now, exact matching on “*pizza*” will work! -```yaml -apiVersion: v1 -data: - api-key: ... - read-only-api-key: ... -kind: Secret -metadata: - name: qdrant-api-key - namespace: qdrant-private-cloud -type: kubernetes.io/generic +![Document Expansion](/articles_data/modern-sparse-neural-retrieval/DocumentExpansion.png) -``` +There are two types of document expansion that are used in sparse neural retrieval: +**external** (one model is responsible for expansion, another one for retrieval) and **internal** (all is done by a single model). -You can reference the secret in the QdrantCluster spec: +### External Document Expansion +External document expansion uses a **generative model** (Mistral 7B, Chat-GPT, and Claude are all generative models, +generating words based on the input text) to compose additions to documents before converting them to sparse representations +and applying exact matching methods. -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.11.3" - size: 1 - resources: - cpu: 100m - memory: "1Gi" - storage: "2Gi" - config: - service: - api_key: - secretKeyRef: - name: qdrant-api-key - key: api-key - read_only_api_key: - secretKeyRef: - name: qdrant-api-key - key: read-only-api-key - jwt_rbac: true +#### External Document Expansion with docT5query -``` +![External Document Expansion with docT5query](/articles_data/modern-sparse-neural-retrieval/docT5queryDocumentExpansion.png) +[`docT5query`](https://github.com/castorini/docTTTTTquery) is the most used document expansion model. +It is based on the [Text-to-Text Transfer Transformer (T5)](https://huggingface.co/docs/transformers/en/model_doc/t5) model trained to +generate top-k possible queries for which the given document would be an answer. 
+These predicted short queries (up to ~50-60 words) can have repetitions in them, +so it also contributes to the frequency of the terms if the term frequency is considered by the retriever. -If you set the `jwt_rbac` flag, you will also be able to create granular [JWT tokens for role based access control](https://qdrant.tech/documentation/guides/security/#granular-access-control-with-jwt). +The problem with docT5query expansion is a very long inference time, as with any generative model: +it can generate only one token per run, and it spends a fair share of resources on it. -### [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#configuring-tls-for-database-access) Configuring TLS for Database Access +#### External Document Expansion with Term Independent Likelihood MODel (TILDE) -If you want to configure TLS for accessing your Qdrant database, there are two options: +![External Document Expansion with Term Independent Likelihood MODel (TILDE)](/articles_data/modern-sparse-neural-retrieval/TILDEDocumentExpansion.png) -- You can offload TLS at the ingress or loadbalancer level. -- You can configure TLS directly in the Qdrant database. +[`Term Independent Likelihood MODel (TILDE)`](https://github.com/ielab/TILDE) is an external expansion method that reduces the passage expansion time compared to +docT5query by 98%. It uses the assumption that words in texts are independent of each other +(as if we were inserting in our speech words without paying attention to their order), which allows for the parallelisation of document expansion. -If you want to configure TLS directly in the Qdrant database, you can provide this as a secret. +Instead of predicting queries, TILDE predicts the most likely terms to see next after reading a passage’s text +(**query likelihood paradigm**). TILDE takes the probability distribution of all tokens in a BERT vocabulary based on the document’s text +and appends top-k of them to the document without repetitions. -To create such a secret, you can use `kubectl`: +***Problems of external document expansion:*** External document expansion might not be feasible in many production scenarios where there’s not enough time or compute to expand each and every +document you want to store in a database and then additionally do all the calculations needed for retrievers. +To solve this problem, a generation of models was developed which do everything in one go, expanding documents “internally”. -```shell - kubectl create secret tls qdrant-tls --cert=mydomain.com.crt --key=mydomain.com.key --namespace the-qdrant-namespace +### Internal Document Expansion -``` +Let’s assume we don’t care about the context of query terms, so we can treat them as independent words that we combine in random order to get +the result. Then, for each contextualized term in a document, we are free to pre-compute how this term affects every word in our vocabulary. -The resulting secret will look like this: +For each document, a vector of the vocabulary length is created. To fill this vector in, for each word in the vocabulary, it is checked if the +influence of any document term on it is big enough to consider it. Otherwise, the vocabulary word’s score in a document vector will be zero. 
+For example, by pre-computing vectors for the document “*pizza Margherita*” on a vocabulary of the 50,000 most used English words,
+for this small document of two words, we will get a 50,000-dimensional vector of zeros, where the non-zero values will be for “*pizza*”, “*pizzeria*”,
+“*flower*”, “*woman*”, “*girl*”, “*Margherita*”, “*cocktail*” and “*pizzaiolo*”.
-```yaml
-apiVersion: v1
-data:
-  tls.crt: ...
-  tls.key: ...
-kind: Secret
-metadata:
-  name: qdrant-tls
-  namespace: the-qdrant-namespace
-type: kubernetes.io/tls
+### Sparse Neural Retriever with Internal Document Expansion
+![Sparse Transformer Matching (SPARTA)](/articles_data/modern-sparse-neural-retrieval/SPARTA.png)
-```
+The authors of the [`Sparse Transformer Matching (SPARTA)`](https://arxiv.org/pdf/2009.13013) model use the BERT model and its vocabulary (around 30,000 tokens).
+For each token in the BERT vocabulary, they find the maximum dot product between it and the contextualized tokens of a document
+and learn a threshold for what counts as a considerable (non-zero) effect.
+Then, at inference time, the only thing left to do is to sum up the scores of the query tokens in that document.
-You can reference the secret in the QdrantCluster spec:
+***Why is SPARTA not a perfect solution?*** Trained on the MS MARCO dataset, many sparse neural retrievers, including SPARTA,
+show good results on MS MARCO test data, but when it comes to generalisation (working with other data), they
+[could perform worse than BM25](https://arxiv.org/pdf/2307.10488).
-```yaml
-apiVersion: qdrant.io/v1
-kind: QdrantCluster
-metadata:
-  name: test-cluster
-spec:
-  id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840"
-  version: "v1.11.3"
-  size: 1
-  resources:
-    cpu: 100m
-    memory: "1Gi"
-    storage: "2Gi"
-  config:
-    service:
-      enable_tls: true
-      tls:
-        cert:
-          secretKeyRef:
-            name: qdrant-tls
-            key: tls.crt
-        key:
-          secretKeyRef:
-            name: qdrant-tls
-            key: tls.key
+### State-of-the-Art of Modern Sparse Neural Retrieval
-```
+![Sparse Lexical and Expansion Model Plus Plus (SPLADE++)](/articles_data/modern-sparse-neural-retrieval/SPLADE++.png)
+The authors of the [`Sparse Lexical and Expansion Model (SPLADE)`](https://arxiv.org/pdf/2109.10086) family of models added dense-model training tricks to the
+internal document expansion idea, which made the retrieval quality noticeably better.
-### [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#configuring-tls-for-inter-cluster-communication) Configuring TLS for Inter-cluster Communication
+- The SPARTA model is not sparse enough by construction, so the authors of the SPLADE family of models introduced explicit **sparsity regularisation**,
+preventing the model from producing too many non-zero values.
+- The SPARTA model mostly uses the BERT model as-is, without any additional neural network to capture the specificity of the information retrieval problem,
+so SPLADE models introduce a trainable neural network on top of BERT with an architecture chosen to fit the task.
+- Finally, the SPLADE family of models uses **knowledge distillation**: learning from a bigger
+(and therefore much slower, less production-friendly) model how to predict good representations.
-_Available as of Operator v2.2.0_
+One of the latest versions of the SPLADE family of models is [`SPLADE++`](https://arxiv.org/pdf/2205.04733). \
+SPLADE++, as opposed to the SPARTA model, expands not only documents but also queries at inference time.
+We’ll demonstrate this in the next section.
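+Before moving on to using SPLADE++ in practice, here is a rough sketch of the internal-expansion scoring idea described above. It uses random arrays in place of a trained model, so the shapes and the thresholding step are the point, not the numbers:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+vocab_size, dim, doc_len = 30_000, 32, 12                  # toy sizes; real models use BERT outputs
+vocab_embeddings = rng.normal(size=(vocab_size, dim))      # one vector per vocabulary token
+doc_token_embeddings = rng.normal(size=(doc_len, dim))     # contextualized tokens of one document
+
+# SPARTA-style matching: each vocabulary token keeps its best (max dot product) match
+# against the document's contextualized tokens.
+best_match = (vocab_embeddings @ doc_token_embeddings.T).max(axis=1)
+
+# A fixed cut-off standing in for the learned threshold that zeroes out weak effects.
+threshold = 16.0
+document_vector = np.maximum(best_match - threshold, 0.0)
+
+print(f"non-zero dimensions: {(document_vector > 0).sum()} / {vocab_size}")
+```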
-If you want to encrypt communication between Qdrant nodes, you need to enable TLS by providing -certificate, key, and root CA certificate used for generating the former. +## SPLADE++ in Qdrant +In Qdrant, you can use [`SPLADE++`](https://arxiv.org/pdf/2205.04733) easily with our lightweight library for embeddings called [FastEmbed](https://qdrant.tech/documentation/fastembed/). +#### Setup +Install `FastEmbed`. -Similar to the instruction stated in the previous section, you need to create a secret: +```python +pip install fastembed +``` -```shell - kubectl create secret generic qdrant-p2p-tls \ - --from-file=tls.crt=qdrant-nodes.crt \ - --from-file=tls.key=qdrant-nodes.key \ - --from-file=ca.crt=root-ca.crt - --namespace the-qdrant-namespace +Import sparse text embedding models supported in FastEmbed. +```python +from fastembed import SparseTextEmbedding ``` -The resulting secret will look like this: +You can list all sparse text embedding models currently supported. -```yaml -apiVersion: v1 -data: - tls.crt: ... - tls.key: ... - ca.crt: ... -kind: Secret -metadata: - name: qdrant-p2p-tls - namespace: the-qdrant-namespace -type: Opaque +```python +SparseTextEmbedding.list_supported_models() +``` +
+Output with a list of supported models +```bash +[{'model': 'prithivida/Splade_PP_en_v1', + 'vocab_size': 30522, + 'description': 'Independent Implementation of SPLADE++ Model for English', + 'size_in_GB': 0.532, + 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}, + 'model_file': 'model.onnx'}, + {'model': 'prithvida/Splade_PP_en_v1', + 'vocab_size': 30522, + 'description': 'Independent Implementation of SPLADE++ Model for English', + 'size_in_GB': 0.532, + 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}, + 'model_file': 'model.onnx'}, + {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions', + 'vocab_size': 30522, + 'description': 'Light sparse embedding model, which assigns an importance score to each token in the text', + 'size_in_GB': 0.09, + 'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'}, + 'model_file': 'model.onnx', + 'additional_files': ['stopwords.txt'], + 'requires_idf': True}, + {'model': 'Qdrant/bm25', + 'description': 'BM25 as sparse embeddings meant to be used with Qdrant', + 'size_in_GB': 0.01, + 'sources': {'hf': 'Qdrant/bm25'}, + 'model_file': 'mock.file', + 'additional_files': ['arabic.txt', + 'azerbaijani.txt', + 'basque.txt', + 'bengali.txt', + 'catalan.txt', + 'chinese.txt', + 'danish.txt', + 'dutch.txt', + 'english.txt', + 'finnish.txt', + 'french.txt', + 'german.txt', + 'greek.txt', + 'hebrew.txt', + 'hinglish.txt', + 'hungarian.txt', + 'indonesian.txt', + 'italian.txt', + 'kazakh.txt', + 'nepali.txt', + 'norwegian.txt', + 'portuguese.txt', + 'romanian.txt', + 'russian.txt', + 'slovene.txt', + 'spanish.txt', + 'swedish.txt', + 'tajik.txt', + 'turkish.txt'], + 'requires_idf': True}] ``` +
-You can reference the secret in the QdrantCluster spec: +Load SPLADE++. +```python +sparse_model_name = "prithivida/Splade_PP_en_v1" +sparse_model = SparseTextEmbedding(model_name=sparse_model_name) +``` +The model files will be fetched and downloaded, with progress showing. -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: test-cluster - labels: - cluster-id: "my-cluster" - customer-id: "acme-industries" -spec: - id: "my-cluster" - version: "v1.13.3" - size: 2 - resources: - cpu: 100m - memory: "1Gi" - storage: "2Gi" - config: - service: - enable_tls: true - tls: - caCert: - secretKeyRef: - name: qdrant-p2p-tls - key: ca.crt - cert: - secretKeyRef: - name: qdrant-p2p-tls - key: tls.crt - key: - secretKeyRef: - name: qdrant-p2p-tls - key: tls.key +#### Embed data +We will use a toy movie description dataset. +
+ Movie description dataset + +```python +descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.", + "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.", + "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.", + "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.", + "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.", + "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.", + "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.", + "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.", + "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.", + "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.", + "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).", + "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.", + "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.", + "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.", + "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.", + "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.", + "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.", + "Story of 40-man Turkish task force who must defend a relay station.", + "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.", + "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] ``` +
-## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#gpu-support) GPU support +Embed movie descriptions with SPLADE++. -Starting with Qdrant 1.13 and private-cloud version 1.6.1 you can create a cluster that uses GPUs to accelarate indexing. +```python +sparse_descriptions = list(sparse_model.embed(descriptions)) +``` +You can check how a sparse vector generated by SPLADE++ looks in Qdrant. -As a prerequisite, you need to have a Kubernetes cluster with GPU support. You can check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) for generic information on GPUs and Kubernetes, or the documentation of your specific Kubernetes distribution. +```python +sparse_descriptions[0] +``` -Examples: +It is stored as **indices** of BERT tokens, weights of which are non-zero, and **values** of these weights. -- [AWS EKS GPU support](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html) -- [Azure AKS GPU support](https://docs.microsoft.com/en-us/azure/aks/gpu-cluster) -- [GCP GKE GPU support](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus) -- [Vultr Kubernetes GPU support](https://blogs.vultr.com/whats-new-vultr-q2-2023) +```bash +SparseEmbedding( + values=array([1.57449973, 0.90787691, ..., 1.21796167, 1.1321187]), + indices=array([ 1040, 2001, ..., 28667, 29137]) +) +``` +#### Upload Embeddings to Qdrant +Install `qdrant-client` -Once you have a Kubernetes cluster with GPU support, you can create a QdrantCluster with GPU support: +```python +pip install qdrant-client +``` -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.13.4" - size: 1 - resources: - cpu: 2 - memory: "8Gi" - storage: "40Gi" - gpu: - gpuType: "nvidia" +Qdrant Client has a simple in-memory mode that allows you to experiment locally on small data volumes. +Alternatively, you could use for experiments [a free tier cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) +in Qdrant Cloud. +```python +from qdrant_client import QdrantClient, models +qdrant_client = QdrantClient(":memory:") # Qdrant is running from RAM. ``` -Once the cluster Pod has started, you can check in the logs if the GPU is detected: +Now, let's create a [collection](https://qdrant.tech/documentation/concepts/collections/) in which could upload our sparse SPLADE++ embeddings. \ +For that, we will use the [sparse vectors](https://qdrant.tech/documentation/concepts/vectors/#sparse-vectors) representation supported in Qdrant. -```shell -$ kubectl logs qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-0 +```python +qdrant_client.create_collection( + collection_name="movies", + vectors_config={}, + sparse_vectors_config={ + "film_description": models.SparseVectorParams(), + }, +) +``` +To make this collection human-readable, let's save movie metadata (name, description and movie's length) together with an embeddings. +
+ Movie metadata + +```python +metadata = [{"movie_name": "The Passion of Joan of Arc", "movie_watch_time_min": 114, "movie_description": "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions."}, +{"movie_name": "Sherlock Jr.", "movie_watch_time_min": 45, "movie_description": "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch."}, +{"movie_name": "Heat", "movie_watch_time_min": 170, "movie_description": "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist."}, +{"movie_name": "Kagemusha", "movie_watch_time_min": 162, "movie_description": "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place."}, +{"movie_name": "Kubo and the Two Strings", "movie_watch_time_min": 101, "movie_description": "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past."}, +{"movie_name": "Sardar Udham", "movie_watch_time_min": 164, "movie_description": "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre."}, +{"movie_name": "Paprika", "movie_watch_time_min": 90, "movie_description": "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it."}, +{"movie_name": "After Hours", "movie_watch_time_min": 97, "movie_description": "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop."}, +{"movie_name": "Udta Punjab", "movie_watch_time_min": 148, "movie_description": "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline."}, +{"movie_name": "Philomena", "movie_watch_time_min": 98, "movie_description": "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent."}, +{"movie_name": "Neon Genesis Evangelion: The End of Evangelion", "movie_watch_time_min": 87, "movie_description": "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995)."}, +{"movie_name": "The Dirty Dozen", "movie_watch_time_min": 150, "movie_description": "During World War II, a rebellious U.S. 
Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers."}, +{"movie_name": "Toy Story 3", "movie_watch_time_min": 103, "movie_description": "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home."}, +{"movie_name": "Edge of Tomorrow", "movie_watch_time_min": 113, "movie_description": "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies."}, +{"movie_name": "Some Like It Hot", "movie_watch_time_min": 121, "movie_description": "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in."}, +{"movie_name": "Snow White and the Seven Dwarfs", "movie_watch_time_min": 83, "movie_description": "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household."}, +{"movie_name": "It Happened One Night", "movie_watch_time_min": 105, "movie_description": "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops."}, +{"movie_name": "Nefes: Vatan Sagolsun", "movie_watch_time_min": 128, "movie_description": "Story of 40-man Turkish task force who must defend a relay station."}, +{"movie_name": "This Is Spinal Tap", "movie_watch_time_min": 82, "movie_description": "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour."}, +{"movie_name": "Let the Right One In", "movie_watch_time_min": 114, "movie_description": "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."}] +``` +
-Starting initializing for pod 0 - _ _ - __ _ __| |_ __ __ _ _ __ | |_ - / _` |/ _` | '__/ _` | '_ \| __| -| (_| | (_| | | | (_| | | | | |_ - \__, |\__,_|_| \__,_|_| |_|\__| - |_| +Upload embedded descriptions with movie metadata into the collection. -Version: 1.13.4, build: 7abc6843 -Access web UI at http://localhost:6333/dashboard +```python +qdrant_client.upsert( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload=metadata[idx], + vector={ + "film_description": models.SparseVector( + indices=vector.indices, + values=vector.values + ) + }, + ) + for idx, vector in enumerate(sparse_descriptions) + ], +) +``` -2025-03-14T10:25:30.509636Z INFO gpu::instance: Found GPU device: NVIDIA A16-2Q -2025-03-14T10:25:30.509679Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 15.0.7, 256 bits) -2025-03-14T10:25:30.509734Z INFO gpu::device: Create GPU device NVIDIA A16-2Q -... + -``` +
+ Implicitly generate sparse vectors (Click to expand) -For more GPU configuration options, see the [Qdrant Private Cloud API Reference](https://qdrant.tech/documentation/private-cloud/api-reference/). +```python +qdrant_client.upsert( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload=metadata[idx], + vector={ + "film_description": models.Document( + text=description, model=sparse_model_name + ) + }, + ) + for idx, description in enumerate(descriptions) + ], +) +``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/\#ephemeral-snapshot-volumes) Ephemeral Snapshot Volumes +
-If you do not [create snapshots](https://api.qdrant.tech/api-reference/snapshots/create-snapshot), or there is no need -to keep them available after cluster restart, the snapshot storage classname can be set to `emptyDir`: +#### Querying +Let’s query our collection! -```yaml -apiVersion: qdrant.io/v1 -kind: QdrantCluster -metadata: - name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 - labels: - cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - customer-id: "acme-industries" -spec: - id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" - version: "v1.13.4" - size: 1 - resources: - cpu: 2 - memory: "8Gi" - storage: "40Gi" - storageClassNames: - snapshots: emptyDir +```python +query_embedding = list(sparse_model.embed("A movie about music"))[0] +response = qdrant_client.query_points( + collection_name="movies", + query=models.SparseVector(indices=query_embedding.indices, values=query_embedding.values), + using="film_description", + limit=1, + with_vectors=True, + with_payload=True +) +print(response) ``` -See [Kubernetes docs on emptyDir volumes](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) for more details, -on how k8s node ephemeral storage is allocated and used. +
+ Implicitly generate sparse vectors (Click to expand) -##### Was this page useful? +```python +response = qdrant_client.query_points( + collection_name="movies", + query=models.Document(text="A movie about music", model=sparse_model_name), + using="film_description", + limit=1, + with_vectors=True, + with_payload=True, +) +print(response) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +
-Thank you for your feedback! 🙏 +Output looks like this: +```bash +points=[ScoredPoint( + id=18, + version=0, + score=9.6779785, + payload={ + 'movie_name': 'This Is Spinal Tap', + 'movie_watch_time_min': 82, + 'movie_description': "Spinal Tap, one of England's loudest bands, + is chronicled by film director Marty DiBergi on what proves to be a fateful tour." + }, + vector={ + 'film_description': SparseVector( + indices=[1010, 2001, ..., 25316, 25517], + values=[0.49717945, 0.19760133, ..., 1.2124698, 0.58689135]) + }, + shard_key=None, + order_value=None +)] +``` +As you can see, there are no overlapping words in the query and a description of a found movie, +even though the answer fits the query, and yet we’re working with **exact matching**. \ +This is possible due to the **internal expansion** of the query and the document that SPLADE++ does. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/qdrant-cluster-management.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +#### Internal Expansion by SPLADE++ -On this page: +Let’s check how did SPLADE++ expand the query and the document we got as an answer. \ +For that, we will need to use the HuggingFace library called [Tokenizers](https://huggingface.co/docs/tokenizers/en/index). +With it, we will be able to decode back to human-readable format **indices** of words in a vocabulary SPLADE++ uses. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/qdrant-cluster-management.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Firstly we will need to install this library. -× +```python +pip install tokenizers +``` -[Powered by](https://qdrant.tech/) +Then, let's write a function which will decode SPLADE++ sparse embeddings and return words SPLADE++ uses for encoding the input. \ +We would like to return them in the descending order based on the weight (**impact score**), SPLADE++ assigned them. -<|page-28-lllmstxt|> -## cloud-account-setup -- [Documentation](https://qdrant.tech/documentation/) -- Account Setup +```python +from tokenizers import Tokenizer -# [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#setting-up-a-qdrant-cloud-account) Setting up a Qdrant Cloud Account +tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1') -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#registration) Registration +def get_tokens_and_weights(sparse_embedding, tokenizer): + token_weight_dict = {} + for i in range(len(sparse_embedding.indices)): + token = tokenizer.decode([sparse_embedding.indices[i]]) + weight = sparse_embedding.values[i] + token_weight_dict[token] = weight -There are different ways to register for a Qdrant Cloud account: + # Sort the dictionary by weights + token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)) + return token_weight_dict +``` -- With an email address and passwordless login via email -- With a Google account -- With a GitHub account -- By connection an enterprise SSO solution +Firstly, we apply our function to the query. -Every account is tied to an email address. You can invite additional users to your account and manage their permissions. 
+```python +query_embedding = list(sparse_model.embed("A movie about music"))[0] +print(get_tokens_and_weights(query_embedding, tokenizer)) +``` +That’s how SPLADE++ expanded the query: +```bash +{ + "music": 2.764289617538452, + "movie": 2.674748420715332, + "film": 2.3489091396331787, + "musical": 2.276120901107788, + "about": 2.124547004699707, + "movies": 1.3825485706329346, + "song": 1.2893378734588623, + "genre": 0.9066758751869202, + "songs": 0.8926399946212769, + "a": 0.8900706768035889, + "musicians": 0.5638002157211304, + "sound": 0.49310919642448425, + "musician": 0.46415239572525024, + "drama": 0.462990403175354, + "tv": 0.4398191571235657, + "book": 0.38950803875923157, + "documentary": 0.3758136034011841, + "hollywood": 0.29099565744400024, + "story": 0.2697228491306305, + "nature": 0.25306591391563416, + "concerning": 0.205053448677063, + "game": 0.1546829640865326, + "rock": 0.11775632947683334, + "definition": 0.08842901140451431, + "love": 0.08636035025119781, + "soundtrack": 0.06807517260313034, + "religion": 0.053535860031843185, + "filmed": 0.025964470580220222, + "sounds": 0.0004048719711136073 +} +``` -### [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#email-registration) Email registration +Then, we apply our function to the answer. +```python +query_embedding = list(sparse_model.embed("A movie about music"))[0] -1. Register for a [Cloud account](https://cloud.qdrant.io/signup) with your email, Google or GitHub credentials. +response = qdrant_client.query_points( + collection_name="movies", + query=models.SparseVector(indices=query_embedding.indices, values=query_embedding.values), + using="film_description", + limit=1, + with_vectors=True, + with_payload=True +) -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#inviting-additional-users-to-an-account) Inviting additional users to an account +print(get_tokens_and_weights(response.points[0].vector['film_description'], tokenizer)) +``` -You can invite additional users to your account, and manage their permissions on the **Account -> Access Management** page in the Qdrant Cloud Console. +
+ Implicitly generate sparse vectors (Click to expand) -![Invitations](https://qdrant.tech/documentation/cloud/invitations.png) +```python +response = qdrant_client.query_points( + collection_name="movies", + query=models.Document(text="A movie about music", model=sparse_model_name), + using="film_description", + limit=1, + with_vectors=True, + with_payload=True, +) -Invited users will receive an email with an invitation link to join Qdrant Cloud. Once they signed up, they can accept the invitation from the Overview page. +print(get_tokens_and_weights(response.points[0].vector["film_description"], tokenizer)) +``` -![Accepting invitation](https://qdrant.tech/documentation/cloud/accept-invitation.png) +
+ +And that's how SPLADE++ expanded the answer. + +```python +{'spinal': 2.6548674, 'tap': 2.534881, 'marty': 2.223297, '##berg': 2.0402722, +'##ful': 2.0030282, 'fate': 1.935915, 'loud': 1.8381964, 'spine': 1.7507898, +'di': 1.6161551, 'bands': 1.5897619, 'band': 1.589473, 'uk': 1.5385966, 'tour': 1.4758654, +'chronicle': 1.4577943, 'director': 1.4423795, 'england': 1.4301306, '##est': 1.3025658, +'taps': 1.2124698, 'film': 1.1069428, '##berger': 1.1044296, 'tapping': 1.0424755, 'best': 1.0327196, +'louder': 0.9229055, 'music': 0.9056678, 'directors': 0.8887502, 'movie': 0.870712, 'directing': 0.8396196, +'sound': 0.83609974, 'genre': 0.803052, 'dave': 0.80212915, 'wrote': 0.7849579, 'hottest': 0.7594193, 'filmed': 0.750105, +'english': 0.72807616, 'who': 0.69502294, 'tours': 0.6833075, 'club': 0.6375339, 'vertebrae': 0.58689135, 'chronicles': 0.57296354, +'dance': 0.57278687, 'song': 0.50987065, ',': 0.49717945, 'british': 0.4971719, 'writer': 0.495709, 'directed': 0.4875775, +'cork': 0.475757, '##i': 0.47122696, '##band': 0.46837863, 'most': 0.44112885, '##liest': 0.44084555, 'destiny': 0.4264851, +'prove': 0.41789067, 'is': 0.40306947, 'famous': 0.40230379, 'hop': 0.3897451, 'noise': 0.38770816, '##iest': 0.3737782, +'comedy': 0.36903998, 'sport': 0.35883865, 'quiet': 0.3552795, 'detail': 0.3397654, 'fastest': 0.30345848, 'filmmaker': 0.3013101, +'festival': 0.28146765, '##st': 0.28040633, 'tram': 0.27373192, 'well': 0.2599603, 'documentary': 0.24368097, 'beat': 0.22953634, +'direction': 0.22925079, 'hardest': 0.22293334, 'strongest': 0.2018861, 'was': 0.19760133, 'oldest': 0.19532987, +'byron': 0.19360808, 'worst': 0.18397793, 'touring': 0.17598206, 'rock': 0.17319143, 'clubs': 0.16090117, +'popular': 0.15969758, 'toured': 0.15917331, 'trick': 0.1530599, 'celebrity': 0.14458777, 'musical': 0.13888633, +'filming': 0.1363699, 'culture': 0.13616633, 'groups': 0.1340591, 'ski': 0.13049376, 'venue': 0.12992987, +'style': 0.12853126, 'history': 0.12696269, 'massage': 0.11969914, 'theatre': 0.11673525, 'sounds': 0.108338095, +'visit': 0.10516077, 'editing': 0.078659914, 'death': 0.066746496, 'massachusetts': 0.055702563, 'stuart': 0.0447934, +'romantic': 0.041140396, 'pamela': 0.03561337, 'what': 0.016409796, 'smallest': 0.010815808, 'orchestra': 0.0020691194} +``` +Due to the expansion both the query and the document overlap in “*music*”, “*film*”, “*sounds*”, +and others, so **exact matching** works. -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#switching-between-accounts) Switching between accounts +## Key Takeaways: When to Choose Sparse Neural Models for Retrieval +Sparse Neural Retrieval makes sense: -If you have access to multiple accounts, you can switch between accounts with the account switcher on the top menu bar of the Qdrant Cloud Console. +- In areas where keyword matching is crucial but BM25 is insufficient for initial retrieval, semantic matching (e.g., synonyms, homonyms) adds significant value. This is especially true in fields such as medicine, academia, law, and e-commerce, where brand names and serial numbers play a critical role. Dense retrievers tend to return many false positives, while sparse neural retrieval helps narrow down these false positives. -![Switching between accounts](https://qdrant.tech/documentation/cloud/account-switcher.png) +- Sparse neural retrieval can be a valuable option for scaling, especially when working with large datasets. 
It leverages exact matching using an inverted index, which can be fast depending on the nature of your data. -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#light--dark-mode) Light & Dark Mode +- If you’re using traditional retrieval systems, sparse neural retrieval is compatible with them and helps bridge the semantic gap. -The Qdrant Cloud Console supports light and dark mode. You can switch between the two modes in the _Settings_ menu, by clicking on your account picture in the top right corner. +<|page-17-lllmstxt|> +## Introduction -![Light & Dark Mode](https://qdrant.tech/documentation/cloud/light-dark-mode.png) +Hi everyone! I’m Huong (Celine) Hoang, and I’m thrilled to share my experience working at Qdrant this summer as part of their Summer of Code 2024 program. During my internship, I worked on integrating cross-encoders into the FastEmbed library for re-ranking tasks. This enhancement widened the capabilities of the Qdrant ecosystem, enabling developers to build more context-aware search applications, such as question-answering systems, using Qdrant's suite of libraries. -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#account-settings) Account settings +This project was both technically challenging and rewarding, pushing me to grow my skills in handling large-scale ONNX (Open Neural Network Exchange) model integrations, tokenization, and more. Let me take you through the journey, the lessons learned, and where things are headed next. -You can configure your account settings in the Qdrant Cloud Console on the **Account -> Settings** page. +## Project Overview -The following functionality is available. +Qdrant is well known for its vector search capabilities, but my task was to go one step further — introducing cross-encoders for re-ranking. Traditionally, the FastEmbed library would generate embeddings, but cross-encoders don’t do that. Instead, they provide a list of scores based on how well a query matches a list of documents. This kind of re-ranking is critical when you want to refine search results and bring the most relevant answers to the top. -### [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#renaming-an-account) Renaming an account +The project revolved around creating a new input-output scheme: text data to scores. For this, I designed a family of classes to support ONNX models. Some of the key models I worked with included Xenova/ms-marco-MiniLM-L-6-v2, Xenova/ms-marco-MiniLM-L-12-v2, and BAAI/bge-reranker, all designed for re-ranking tasks. -If you use multiple accounts for different purposes, it is a good idea to give them descriptive names, for example _Development_, _Production_, _Testing_. You can also choose which account should be the default one, when you log in. +An important point to mention is that FastEmbed is a minimalistic library: it doesn’t have heavy dependencies like PyTorch or TensorFlow, and as a result, it is lightweight, occupying far less storage space. 
-![Account management](https://qdrant.tech/documentation/cloud/account-management.png) +Below is a diagram that represents the overall workflow for this project, detailing the key steps from user interaction to the final output validation: -### [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#deleting-an-account) Deleting an account +{{< figure src="/articles_data/cross-encoder-integration-gsoc/rerank-workflow.png" caption="Search workflow with reranking" alt="Search workflow with reranking" >}} -When you delete an account, all database clusters and associated data will be deleted. +## Technical Challenges -![Delete Account](https://qdrant.tech/documentation/cloud/account-delete.png) +### 1. Building a New Input-Output Scheme -## [Anchor](https://qdrant.tech/documentation/cloud-account-setup/\#enterprise-single-sign-on-sso) Enterprise Single-Sign-On (SSO) +FastEmbed already had support for embeddings, but re-ranking with cross-encoders meant building a completely new family of classes. These models accept a query and a set of documents, then return a list of relevance scores. For that, I created the base classes like `TextCrossEncoderBase` and `OnnxCrossEncoder`, taking inspiration from existing text embedding models. -Qdrant Cloud supports Enterprise Single-Sign-On for Premium Tier customers. The following providers are supported: +One thing I had to ensure was that the new class hierarchy was user-friendly. Users should be able to work with cross-encoders without needing to know the complexities of the underlying models. For instance, they should be able to just write: -- Active Directory/LDAP -- ADFS -- Azure Active Directory Native -- Google Workspace -- OpenID Connect -- Okta -- PingFederate -- SAML -- Azure Active Directory +```python +from fastembed.rerank.cross_encoder import TextCrossEncoder -Enterprise Sign-On is available as an add-on for [Premium Tier](https://qdrant.tech/documentation/cloud/premium/) customers. If you are interested in using SSO, please [contact us](https://qdrant.tech/contact-us/). +encoder = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-6-v2") +scores = encoder.rerank(query, documents) +``` -##### Was this page useful? +Meanwhile, behind the scenes, we manage all the model loading, tokenization, and scoring. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### 2. Handling Tokenization for Cross-Encoders -Thank you for your feedback! 🙏 +Cross-encoders require careful tokenization because they need to distinguish between the query and the documents. This is done using token type IDs, which help the model differentiate between the two. To implement this, I configured the tokenizer to handle pairs of inputs—concatenating the query with each document and assigning token types accordingly. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-account-setup.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Efficient tokenization is critical to ensure the performance of the models, and I optimized it specifically for ONNX models. -On this page: +### 3. 
Model Loading and Integration -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-account-setup.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +One of the most rewarding parts of the project was integrating the ONNX models into the FastEmbed library. ONNX models need to be loaded into a runtime environment that efficiently manages the computations. -× +While PyTorch is a common framework for these types of tasks, FastEmbed exclusively supports ONNX models, making it both lightweight and efficient. I focused on extensive testing to ensure that the ONNX models performed equivalently to their PyTorch counterparts, ensuring users could trust the results. -[Powered by](https://qdrant.tech/) +I added support for batching as well, allowing users to re-rank large sets of documents without compromising speed. -<|page-29-lllmstxt|> -## rag-contract-management-stackit-aleph-alpha -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Region-Specific Contract Management System +### 4. Debugging and Code Reviews -# [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#region-specific-contract-management-system) Region-Specific Contract Management System +During the project, I encountered a number of challenges, including issues with model configurations, tokenizers, and test cases. With the help of my mentor, George Panchuk, I was able to resolve these issues and improve my understanding of best practices, particularly around code readability, maintainability, and style. -| Time: 90 min | Level: Advanced | | | -| --- | --- | --- | --- | +One notable lesson was the importance of keeping the code organized and maintainable, with a strong focus on readability. This included properly structuring modules and ensuring the entire codebase followed a clear, consistent style. -Contract management benefits greatly from Retrieval Augmented Generation (RAG), streamlining the handling of lengthy business contract texts. With AI assistance, complex questions can be asked and well-informed answers generated, facilitating efficient document management. This proves invaluable for businesses with extensive relationships, like shipping companies, construction firms, and consulting practices. Access to such contracts is often restricted to authorized team members due to security and regulatory requirements, such as GDPR in Europe, necessitating secure storage practices. +### 5. Testing and Validation +To ensure the accuracy and performance of the models, I conducted extensive testing. I compared the output of ONNX models with their PyTorch counterparts, ensuring the conversion to ONNX was correct. A key part of this process was rigorous testing to verify the outputs and identify potential issues, such as incorrect conversions or bugs in our implementation. -Companies want their data to be kept and processed within specific geographical boundaries. For that reason, this RAG-centric tutorial focuses on dealing with a region-specific cloud provider. You will set up a contract management system using [Aleph Alpha’s](https://aleph-alpha.com/) embeddings and LLM. You will host everything on [STACKIT](https://www.stackit.de/), a German business cloud provider. On this platform, you will run Qdrant Hybrid Cloud as well as the rest of your RAG application. This setup will ensure that your data is stored and processed in Germany. 
+For instance, a test to validate the model's output was structured as follows: +```python +def test_rerank(): + is_ci = os.getenv("CI") -![Architecture diagram](https://qdrant.tech/documentation/examples/contract-management-stackit-aleph-alpha/architecture-diagram.png) + for model_desc in TextCrossEncoder.list_supported_models(): + if not is_ci and model_desc["size_in_GB"] > 1: + continue -## [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#components) Components + model_name = model_desc["model"] + model = TextCrossEncoder(model_name=model_name) -A contract management platform is not a simple CLI tool, but an application that should be available to all team -members. It needs an interface to upload, search, and manage the documents. Ideally, the system should be -integrated with org’s existing stack, and the permissions/access controls inherited from LDAP or Active -Directory. - -> **Note:** In this tutorial, we are going to build a solid foundation for such a system. However, it is up to your organization’s setup to implement the entire solution. - -- **Dataset** \- a collection of documents, using different formats, such as PDF or DOCx, scraped from internet -- **Asymmetric semantic embeddings** \- [Aleph Alpha embedding](https://docs.aleph-alpha.com/api/pharia-inference/semantic-embed/) to -convert the queries and the documents into vectors -- **Large Language Model** \- the [Luminous-extended-control\\ -model](https://docs.aleph-alpha.com/api/pharia-inference/available-models/), but you can play with a different one from the -Luminous family -- **Qdrant Hybrid Cloud** \- a knowledge base to store the vectors and search over the documents -- **STACKIT** \- a [German business cloud](https://www.stackit.de/) to run the Qdrant Hybrid Cloud and the application -processes - -We will implement the process of uploading the documents, converting them into vectors, and storing them in Qdrant. -Then, we will build a search interface to query the documents and get the answers. All that, assuming the user -interacts with the system with some set of permissions, and can only access the documents they are allowed to. + query = "What is the capital of France?" + documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] + scores = np.array(model.rerank(query, documents)) -## [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#prerequisites) Prerequisites + canonical_scores = CANONICAL_SCORE_VALUES[model_name] + assert np.allclose( + scores, canonical_scores, atol=1e-3 + ), f"Model: {model_name}, Scores: {scores}, Expected: {canonical_scores}" +``` -### [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#aleph-alpha-account) Aleph Alpha account +The `CANONICAL_SCORE_VALUES` were retrieved directly from the result of applying the original PyTorch models to the same input -Since you will be using Aleph Alpha’s models, [sign up](https://aleph-alpha.com/) with their managed service and obtain an API token. Once you have it ready, store it as an environment variable: +## Outcomes and Future Improvements -shellpython +By the end of my project, I successfully added cross-encoders to the FastEmbed library, allowing users to re-rank search results based on relevance scores. This enhancement opens up new possibilities for applications that rely on contextual ranking, such as search engines and recommendation systems. 
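+
+To make the outcome concrete, here is a minimal re-ranking sketch in the spirit of the snippets above; the query and candidate texts are invented, and the first-stage candidate list is assumed to come from a vector search such as Qdrant:
+
+```python
+from fastembed.rerank.cross_encoder import TextCrossEncoder
+
+query = "How do I enable on-disk storage?"
+# Candidates returned by a first-stage retriever (placeholder texts).
+candidates = [
+    "Set on_disk=True in the vector parameters when creating a collection.",
+    "Payload indexes speed up filtering on keyword fields.",
+    "Collections group points that share the same vector dimensionality.",
+]
+
+# Second stage: score every candidate against the query and re-order.
+encoder = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-6-v2")
+scores = list(encoder.rerank(query, candidates))
+for text, score in sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True):
+    print(f"{score:.3f}  {text}")
+```
+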
+This functionality will be available as of FastEmbed `0.4.0`. -```shell -export ALEPH_ALPHA_API_KEY="" +Some areas for future improvements include: +- Expanding Model Support: We could add more cross-encoder models, especially from the sentence transformers library, to give users more options. +- Parallelization: Optimizing batch processing to handle even larger datasets could further improve performance. +- Custom Tokenization: For models with non-standard tokenization, like BAAI/bge-reranker, more specific tokenizer configurations could be added. -``` +## Overall Experience and Wrapping Up -```python -import os +Looking back, this internship has been an incredibly valuable experience. I’ve grown not only as a developer but also as someone who can take on complex projects and see them through from start to finish. The Qdrant team has been so supportive, especially during the debugging and review stages. I’ve learned so much about model integration, ONNX, and how to build tools that are user-friendly and scalable. -os.environ["ALEPH_ALPHA_API_KEY"] = "" +One key takeaway for me is the importance of understanding the user experience. It’s not just about getting the models to work but making sure they are easy to use and integrate into real-world applications. This experience has solidified my passion for building solutions that truly make an impact, and I’m excited to continue working on projects like this in the future. -``` +Thank you for taking the time to read about my journey with Qdrant and the FastEmbed library. I’m excited to see how this work will continue to improve search experiences for users! -### [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#qdrant-hybrid-cloud-on-stackit) Qdrant Hybrid Cloud on STACKIT +<|page-18-lllmstxt|> +## What Is a Vector Database? -Please refer to our documentation to see [how to deploy Qdrant Hybrid Cloud on\\ -STACKIT](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#stackit). Once you finish the deployment, you will -have the API endpoint to interact with the Qdrant server. Let’s store it in the environment variable as well: +![vector-database-architecture](/articles_data/what-is-a-vector-database/vector-database-1.jpeg) -shellpython +Most of the millions of terabytes of data we generate each day is **unstructured**. Think of the meal photos you snap, the PDFs shared at work, or the podcasts you save but may never listen to. None of it fits neatly into rows and columns. -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +Unstructured data lacks a strict format or schema, making it challenging for conventional databases to manage. Yet, this unstructured data holds immense potential for **AI**, **machine learning**, and **modern search engines**. -``` +> A [Vector Database](https://qdrant.tech/qdrant-vector-database/) is a specialized system designed to efficiently handle high-dimensional vector data. It excels at indexing, querying, and retrieving this data, enabling advanced analysis and similarity searches that traditional databases cannot easily perform. -```python -os.environ["QDRANT_URL"] = "https://qdrant.example.com" -os.environ["QDRANT_API_KEY"] = "your-api-key" +### The Challenge with Traditional Databases -``` +Traditional [OLTP](https://www.ibm.com/topics/oltp) and [OLAP](https://www.ibm.com/topics/olap) databases have been the backbone of data storage for decades. 
They are great at managing structured data with well-defined schemas, like `name`, `address`, `phone number`, and `purchase history`. -Qdrant will be running on a specific URL and access will be restricted by the API key. Make sure to store them both as environment variables as well: +Structure of OLTP and OLAP databases -_Optional:_ Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). +But when data can't be easily categorized, like the content inside a PDF file, things start to get complicated. -```shell -export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY="your-api-key" -export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" +You can always store the PDF file as raw data, perhaps with some metadata attached. However, the database still wouldn’t be able to understand what's inside the document, categorize it, or even search for the information that it contains. -``` +Also, this applies to more than just PDF documents. Think about the vast amounts of text, audio, and image data you generate every day. If a database can’t grasp the **meaning** of this data, how can you search for or find relationships within the data? -## [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#implementation) Implementation +Structure of a Vector Database -To build the application, we can use the official SDKs of Aleph Alpha and Qdrant. However, to streamline the process -let’s use [LangChain](https://python.langchain.com/docs/get_started/introduction). This framework is already integrated with both services, so we can focus our efforts on -developing business logic. +Vector databases allow you to understand the **context** or **conceptual similarity** of unstructured data by representing them as vectors, enabling advanced analysis and retrieval based on data similarity. -### [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#qdrant-collection) Qdrant collection +## When to Use a Vector Database -Aleph Alpha embeddings are high dimensional vectors by default, with a dimensionality of `5120`. However, a pretty -unique feature of that model is that they might be compressed to a size of `128`, with a small drop in accuracy -performance (4-6%, according to the docs). Qdrant can store even the original vectors easily, and this sounds like a -good idea to enable [Binary Quantization](https://qdrant.tech/documentation/guides/quantization/#binary-quantization) to save space and -make the retrieval faster. Let’s create a collection with such settings: +Not sure if you should use a vector database or a traditional database? This chart may help. 
-```python -from qdrant_client import QdrantClient, models +| **Feature** | **OLTP Database** | **OLAP Database** | **Vector Database** | +|---------------------|--------------------------------------|--------------------------------------------|--------------------------------------------| +| **Data Structure** | Rows and columns | Rows and columns | Vectors | +| **Type of Data** | Structured | Structured/Partially Unstructured | Unstructured | +| **Query Method** | SQL-based (Transactional Queries) | SQL-based (Aggregations, Analytical Queries) | Vector Search (Similarity-Based) | +| **Storage Focus** | Schema-based, optimized for updates | Schema-based, optimized for reads | Context and Semantics | +| **Performance** | Optimized for high-volume transactions | Optimized for complex analytical queries | Optimized for unstructured data retrieval | +| **Use Cases** | Inventory, order processing, CRM | Business intelligence, data warehousing | Similarity search, recommendations, RAG, anomaly detection, etc. | -client = QdrantClient( - location=os.environ["QDRANT_URL"], - api_key=os.environ["QDRANT_API_KEY"], -) -client.create_collection( - collection_name="contracts", - vectors_config=models.VectorParams( - size=5120, - distance=models.Distance.COSINE, - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ) - ) - ), -) -``` +## What Is a Vector? -We are going to use the `contracts` collection to store the vectors of the documents. The `always_ram` flag is set to -`True` to keep the quantized vectors in RAM, which will speed up the search process. We also wanted to restrict access -to the individual documents, so only users with the proper permissions can see them. In Qdrant that should be solved by -adding a payload field that defines who can access the document. We’ll call this field `roles` and set it to an array -of strings with the roles that can access the document. +![vector-database-vector](/articles_data/what-is-a-vector-database/vector-database-7.jpeg) -```python -client.create_payload_index( - collection_name="contracts", - field_name="metadata.roles", - field_schema=models.PayloadSchemaType.KEYWORD, -) +When a machine needs to process unstructured data - an image, a piece of text, or an audio file, it first has to translate that data into a format it can work with: **vectors**. -``` +> A **vector** is a numerical representation of data that can capture the **context** and **semantics** of data. -Since we use Langchain, the `roles` field is a nested field of the `metadata`, so we have to define it as -`metadata.roles`. The schema says that the field is a keyword, which means it is a string or an array of strings. We are -going to use the name of the customers as the roles, so the access control will be based on the customer name. +When you deal with unstructured data, traditional databases struggle to understand its meaning. However, a vector can translate that data into something a machine can process. For example, a vector generated from text can represent relationships and meaning between words, making it possible for a machine to compare and understand their context. -### [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#ingestion-pipeline) Ingestion pipeline +There are three key elements that define a vector in a vector database: the **ID**, the **dimensions**, and the **payload**. These components work together to represent a vector effectively within the system. 
Together, they form a **point**, which is the core unit of data stored and retrieved in a vector database. -Semantic search systems rely on high-quality data as their foundation. With the [unstructured integration of Langchain](https://python.langchain.com/docs/integrations/providers/unstructured), ingestion of various document formats like PDFs, Microsoft Word files, and PowerPoint presentations becomes effortless. However, it’s crucial to split the text intelligently to avoid converting entire documents into vectors; instead, they should be divided into meaningful chunks. Subsequently, the extracted documents are converted into vectors using Aleph Alpha embeddings and stored in the Qdrant collection. +Representation of a Point in Qdrant -Let’s start by defining the components and connecting them together: +Each one of these parts plays an important role in how vectors are stored, retrieved, and interpreted. Let's see how. -```python -embeddings = AlephAlphaAsymmetricSemanticEmbedding( - model="luminous-base", - aleph_alpha_api_key=os.environ["ALEPH_ALPHA_API_KEY"], - normalize=True, -) +### 1. The ID: Your Vector’s Unique Identifier -qdrant = Qdrant( - client=client, - collection_name="contracts", - embeddings=embeddings, -) +Just like in a relational database, each vector in a vector database gets a unique ID. Think of it as your vector’s name tag, a **primary key** that ensures the vector can be easily found later. When a vector is added to the database, the ID is created automatically. -``` +While the ID itself doesn't play a part in the similarity search (which operates on the vector's numerical data), it is essential for associating the vector with its corresponding "real-world" data, whether that’s a document, an image, or a sound file. -Now it’s high time to index our documents. Each of the documents is a separate file, and we also have to know the -customer name to set the access control properly. There might be several roles for a single document, so let’s keep them -in a list. +After a search is performed and similar vectors are found, their IDs are returned. These can then be used to **fetch additional details or metadata** tied to the result. -```python -documents = { - "data/Data-Processing-Agreement_STACKIT_Cloud_version-1.2.pdf": ["stackit"], - "data/langchain-terms-of-service.pdf": ["langchain"], -} +### 2. The Dimensions: The Core Representation of the Data -``` +At the core of every vector is a set of numbers, which together form a representation of the data in a **multi-dimensional** space. -This is how the documents might look like: +#### From Text to Vectors: How Does It Work? -![Example of the indexed document](https://qdrant.tech/documentation/examples/contract-management-stackit-aleph-alpha/indexed-document.png) +These numbers are generated by **embedding models**, such as deep learning algorithms, and capture the essential patterns or relationships within the data. That's why the term **embedding** is often used interchangeably with vector when referring to the output of these models. -Each has to be split into chunks first; there is no silver bullet. Our chunking algorithm will be simple and based on -recursive splitting, with the maximum chunk size of 500 characters and the overlap of 100 characters. +To represent textual data, for example, an embedding will encapsulate the nuances of language, such as semantics and context within its dimensions. 
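+
+As a rough sketch of this step, an embedding model turns raw text into such a vector; the snippet below uses the FastEmbed library, and the model name is just one commonly supported default rather than a requirement:
+
+```python
+from fastembed import TextEmbedding
+
+# Turn raw text into dense vectors with an embedding model.
+model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
+sentences = ["I love vector similarity", "Vector search is great"]
+vectors = list(model.embed(sentences))
+
+print(len(vectors[0]))  # dimensionality of the embedding, e.g. 384 for this model
+print(vectors[0][:5])   # the first few numbers of the first vector
+```
+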
-```python -from langchain_text_splitters import RecursiveCharacterTextSplitter +Creation of a vector based on a sentence with an embedding model -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=100, -) +For that reason, when comparing two similar sentences, their embeddings will turn out to be very similar, because they have similar **linguistic elements**. -``` +Comparison of the embeddings of 2 similar sentences -Now we can iterate over the documents, split them into chunks, convert them into vectors with Aleph Alpha embedding -model, and store them in the Qdrant. +That’s the beauty of embeddings. Tthe complexity of the data is distilled into something that can be compared across a multi-dimensional space. -```python -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +### 3. The Payload: Adding Context with Metadata -for document_path, roles in documents.items(): - document_loader = UnstructuredFileLoader(file_path=document_path) +Sometimes you're going to need more than just numbers to fully understand or refine a search. While the dimensions capture the essence of the data, the payload holds **metadata** for structured information. - # Unstructured loads each file into a single Document object - loaded_documents = document_loader.load() - for doc in loaded_documents: - doc.metadata["roles"] = roles +It could be textual data like descriptions, tags, categories, or it could be numerical values like dates or prices. This extra information is vital when you want to filter or rank search results based on criteria that aren’t directly encoded in the vector. - # Chunks will have the same metadata as the original document - document_chunks = text_splitter.split_documents(loaded_documents) +> This metadata is invaluable when you need to apply additional **filters** or **sorting** criteria. - # Add the documents to the Qdrant collection - qdrant.add_documents(document_chunks, batch_size=20) +For example, if you’re searching for a picture of a dog, the vector helps the database find images that are visually similar. But let's say you want results showing only images taken within the last year, or those tagged with “vacation.” -``` +Filtering Example -Our collection is filled with data, and we can start searching over it. In a real-world scenario, the ingestion process -should be automated and triggered by the new documents uploaded to the system. Since we already use Qdrant Hybrid Cloud -running on Kubernetes, we can easily deploy the ingestion pipeline as a job to the same environment. On STACKIT, you -probably use the [STACKIT Kubernetes Engine (SKE)](https://www.stackit.de/en/product/kubernetes/) and launch it in a -container. The [Compute Engine](https://www.stackit.de/en/product/stackit-compute-engine/) is also an option, but -everything depends on the specifics of your organization. +The payload can help you narrow down those results by ignoring vectors that doesn't match your query vector filtering criteria. If you want the full picture of how filtering works in Qdrant, check out our [Complete Guide to Filtering.](https://qdrant.tech/articles/vector-search-filtering/) -### [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#search-application) Search application +## The Architecture of a Vector Database -Specialized Document Management Systems have a lot of features, but semantic search is not yet a standard. 
We are going -to build a simple search mechanism which could be possibly integrated with the existing system. The search process is -quite simple: we convert the query into a vector using the same Aleph Alpha model, and then search for the most similar -documents in the Qdrant collection. The access control is also applied, so the user can only see the documents they are -allowed to. +A vector database is made of multiple different entities and relations. Let's understand a bit of what's happening here: +Architecture Diagram of a Vector Database -We start with creating an instance of the LLM of our choice, and set the maximum number of tokens to 200, as the default -value is 64, which might be too low for our purposes. +### Collections -```python -from langchain.llms.aleph_alpha import AlephAlpha +A [collection](https://qdrant.tech/documentation/concepts/collections/) is essentially a group of **vectors** (or “[points](https://qdrant.tech/documentation/concepts/points/)”) that are logically grouped together **based on similarity or a specific task**. Every vector within a collection shares the same dimensionality and can be compared using a single metric. Avoid creating multiple collections unless necessary; instead, consider techniques like **sharding** for scaling across nodes or **multitenancy** for handling different use cases within the same infrastructure. -llm = AlephAlpha( - model="luminous-extended-control", - aleph_alpha_api_key=os.environ["ALEPH_ALPHA_API_KEY"], - maximum_tokens=200, -) +### Distance Metrics -``` +These metrics defines how similarity between vectors is calculated. The choice of distance metric is made when creating a collection and the right choice depends on the type of data you’re working with and how the vectors were created. Here are the three most common distance metrics: -Then, we can glue the components together and build the search process. `RetrievalQA` is a class that takes implements -the Question Retrieval process, with a specified retriever and Large Language Model. The instance of `Qdrant` might be -converted into a retriever, with additional filter that will be passed to the `similarity_search` method. The filter -is created as [in a regular Qdrant query](https://qdrant.tech/documentation/concepts/filtering/), with the `roles` field set to the -user’s roles. +- **Euclidean Distance:** The straight-line path. It’s like measuring the physical distance between two points in space. Pick this one when the actual distance (like spatial data) matters. -```python -user_roles = ["stackit", "aleph-alpha"] +- **Cosine Similarity:** This one is about the angle, not the length. It measures how two vectors point in the same direction, so it works well for text or documents when you care more about meaning than magnitude. For example, if two things are *similar*, *opposite*, or *unrelated*: -qdrant_retriever = qdrant.as_retriever( - search_kwargs={ - "filter": models.Filter( - must=[\ - models.FieldCondition(\ - key="metadata.roles",\ - match=models.MatchAny(any=user_roles)\ - )\ - ] - ) - } -) +Cosine Similarity Example -``` +- **Dot Product:** This looks at how much two vectors align. It’s popular in recommendation systems where you're interested in how much two things “agree” with each other. -We set the user roles to `stackit` and `aleph-alpha`, so the user can see the documents that are accessible to these -customers, but not to the others. 
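+
+To make the difference tangible, here is a tiny, self-contained comparison of the three metrics on toy vectors (plain NumPy, numbers chosen purely for illustration):
+
+```python
+import numpy as np
+
+a = np.array([1.0, 2.0, 3.0])
+b = np.array([2.0, 4.0, 6.0])  # same direction as a, twice the length
+
+euclidean = np.linalg.norm(a - b)                         # sensitive to magnitude
+cosine = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))  # direction only -> 1.0 here
+dot = a @ b                                               # mixes direction and magnitude
+
+print(euclidean, cosine, dot)  # ~3.74, 1.0, 28.0
+```
+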
The final step is to create the `RetrievalQA` instance and use it to search over the -documents, with the custom prompt. +### RAM-Based and Memmap Storage -```python -from langchain.prompts import PromptTemplate -from langchain.chains.retrieval_qa.base import RetrievalQA +By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative. -prompt_template = """ -Question: {question} -Answer the question using the Source. If there's no answer, say "NO ANSWER IN TEXT". +Memmap allows you to store vectors **on disk**, yet still access them efficiently by mapping the data directly into memory if you have enough RAM. To enable it, you only need to set `"on_disk": true` when you are **creating a collection:** -Source: {context} +```python +from qdrant_client import QdrantClient, models -### Response: -""" -prompt = PromptTemplate( - template=prompt_template, input_variables=["context", "question"] -) +client = QdrantClient(url='http://localhost:6333') -retrieval_qa = RetrievalQA.from_chain_type( - llm=llm, - chain_type="stuff", - retriever=qdrant_retriever, - return_source_documents=True, - chain_type_kwargs={"prompt": prompt}, +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=768, distance=models.Distance.COSINE, on_disk=True + ), ) +``` -response = retrieval_qa.invoke({"query": "What are the rules of performing the audit?"}) -print(response["result"]) +For other configurations like `hnsw_config.on_disk` or `memmap_threshold`, see the Qdrant documentation for [Storage.](https://qdrant.tech/documentation/concepts/storage/) -``` +### SDKs -Output: +Qdrant offers a range of SDKs. You can use the programming language you're most comfortable with, whether you're coding in [Python](https://github.com/qdrant/qdrant-client), [Go](https://github.com/qdrant/go-client), [Rust](https://github.com/qdrant/rust-client), [Javascript/Typescript](https://github.com/qdrant/qdrant-js), [C#](https://github.com/qdrant/qdrant-dotnet) or [Java](https://github.com/qdrant/java-client). -```text -The rules for performing the audit are as follows: +## The Core Functionalities of Vector Databases -1. The Customer must inform the Contractor in good time (usually at least two weeks in advance) about any and all circumstances related to the performance of the audit. -2. The Customer is entitled to perform one audit per calendar year. Any additional audits may be performed if agreed with the Contractor and are subject to reimbursement of expenses. -3. If the Customer engages a third party to perform the audit, the Customer must obtain the Contractor's consent and ensure that the confidentiality agreements with the third party are observed. -4. The Contractor may object to any third party deemed unsuitable. +![vector-database-functions](/articles_data/what-is-a-vector-database/vector-database-3.jpeg) -``` +When you think of a traditional database, the operations are familiar: you **create**, **read**, **update**, and **delete** records. These are the fundamentals. And guess what? In many ways, vector databases work the same way, but the operations are translated for the complexity of vectors. -There are some other parameters that might be tuned to optimize the search process. 
The `k` parameter defines how many -documents should be returned, but Langchain allows us also to control the retrieval process by choosing the type of the -search operation. The default is `similarity`, which is just vector search, but we can also use `mmr` which stands for -Maximal Marginal Relevance. It is a technique to diversify the search results, so the user gets the most relevant -documents, but also the most diverse ones. The `mmr` search is slower, but might be more user-friendly. +### 1. Indexing: HNSW Index and Sending Data to Qdrant -Our search application is ready, and we can deploy it to the same environment as the ingestion pipeline on STACKIT. The -same rules apply here, so you can use the SKE or the Compute Engine, depending on the specifics of your organization. +Indexing your vectors is like creating an entry in a traditional database. But for vector databases, this step is very important. Vectors need to be indexed in a way that makes them easy to search later on. -## [Anchor](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/\#next-steps) Next steps +**HNSW** (Hierarchical Navigable Small World) is a powerful indexing algorithm that most vector databases rely on to organize vectors for fast and efficient search. -We built a solid foundation for the contract management system, but there is still a lot to do. If you want to make the -system production-ready, you should consider implementing the mechanism into your existing stack. If you have any -questions, feel free to ask on our [Discord community](https://qdrant.to/discord). +It builds a multi-layered graph, where each vector is a node and connections represent similarity. The higher layers connect broadly similar vectors, while lower layers link vectors that are closely related, making searches progressively more refined as they go deeper. -##### Was this page useful? +Indexing Data with the HNSW algorithm -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +When you run a search, HNSW starts at the top, quickly narrowing down the search by hopping between layers. It focuses only on relevant vectors as it goes deeper, refining the search with each step. -Thank you for your feedback! 🙏 +### 1.1 Payload Indexing -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-contract-management-stackit-aleph-alpha.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +In Qdrant, indexing is modular. You can configure indexes for **both vectors and payloads independently**. The payload index is responsible for optimizing filtering based on metadata. Each payload index is built for a specific field and allows you to quickly filter vectors based on specific conditions. -On this page: +Searching Data with the HNSW algorithm -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-contract-management-stackit-aleph-alpha.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +You need to build the payload index for **each field** you'd like to search. The magic here is in the combination: HNSW finds similar vectors, and the payload index makes sure only the ones that fit your criteria come through. 
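+
+Creating such an index is a single call per field; in this sketch the collection and field names are only placeholders:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Index the payload field you plan to filter on.
+client.create_payload_index(
+    collection_name="product_collection",
+    field_name="category",
+    field_schema=models.PayloadSchemaType.KEYWORD,
+)
+```
+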
Learn more about Qdrant's [Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/) and why it was built like this. -× +> Combining [full-text search](https://qdrant.tech/documentation/concepts/indexing/#full-text-index) with vector-based search gives you even more versatility. You can simultaneously search for conceptually similar documents while ensuring specific keywords are present, all within the same query. -[Powered by](https://qdrant.tech/) +### 2. Searching: Approximate Nearest Neighbors (ANN) Search -<|page-30-lllmstxt|> -## automate-filtering-with-llms -- [Documentation](https://qdrant.tech/documentation/) -- [Search precision](https://qdrant.tech/documentation/search-precision/) -- Automate filtering with LLMs +Similarity search allows you to search by **meaning**. This way you can do searches such as similar songs that evoke the same mood, finding images that match your artistic vision, or even exploring emotional patterns in text. -# [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#automate-filtering-with-llms) Automate filtering with LLMs +Similar words grouped together -Our [complete guide to filtering in vector search](https://qdrant.tech/articles/vector-search-filtering/) describes why filtering is -important, and how to implement it with Qdrant. However, applying filters is easier when you build an application -with a traditional interface. Your UI may contain a form with checkboxes, sliders, and other elements that users can -use to set their criteria. But what if you want to build a RAG-powered application with just the conversational -interface, or even voice commands? In this case, you need to automate the filtering process! +The way it works is, when the user queries the database, this query is also converted into a vector. The algorithm quickly identifies the area of the graph likely to contain vectors closest to the **query vector**. -LLMs seem to be particularly good at this task. They can understand natural language and generate structured output -based on it. In this tutorial, we’ll show you how to use LLMs to automate filtering in your vector search application. +Approximate Nearest Neighbors (ANN) Search Graph -## [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#few-notes-on-qdrant-filters) Few notes on Qdrant filters +The search then moves down progressively narrowing down to more closely related and relevant vectors. Once the closest vectors are identified at the bottom layer, these points translate back to actual data, representing your **top-scored documents**. -Qdrant Python SDK defines the models using [Pydantic](https://docs.pydantic.dev/latest/). This library is de facto -standard for data validation and serialization in Python. It allows you to define the structure of your data using -Python type hints. 
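+
+As a minimal sketch of what this search looks like through the Python client (recent client versions expose `query_points`; the collection name, vector values, and payload field below are placeholders):
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+query_vector = [0.12, -0.07, 0.45]  # embedding of the user's query (placeholder values)
+
+results = client.query_points(
+    collection_name="product_collection",
+    query=query_vector,
+    query_filter=models.Filter(  # optional payload filter
+        must=[models.FieldCondition(key="category", match=models.MatchValue(value="shoes"))]
+    ),
+    limit=5,  # number of top-scored documents to return
+)
+for point in results.points:
+    print(point.id, point.score, point.payload)
+```
+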
For example, our `Filter` model is defined as follows: +Here's a high-level overview of this process: -```python -class Filter(BaseModel, extra="forbid"): - should: Optional[Union[List["Condition"], "Condition"]] = Field( - default=None, description="At least one of those conditions should match" - ) - min_should: Optional["MinShould"] = Field( - default=None, description="At least minimum amount of given conditions should match" - ) - must: Optional[Union[List["Condition"], "Condition"]] = Field(default=None, description="All conditions must match") - must_not: Optional[Union[List["Condition"], "Condition"]] = Field( - default=None, description="All conditions must NOT match" - ) +Vector Database Searching Functionality -``` +### 3. Updating Vectors: Real-Time and Bulk Adjustments -Qdrant filters may be nested, and you can express even the most complex conditions using the `must`, `should`, and -`must_not` notation. +Data isn't static, and neither are vectors. Keeping your vectors up to date is crucial for maintaining relevance in your searches. -## [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#structured-output-from-llms) Structured output from LLMs +Vector updates don’t always need to happen instantly, but when they do, Qdrant handles real-time modifications efficiently with a simple API call: -It isn’t an uncommon practice to use LLMs to generate structured output. It is primarily useful if their output is -intended for further processing by a different application. For example, you can use LLMs to generate SQL queries, -JSON objects, and most importantly, Qdrant filters. Pydantic got adopted by the LLM ecosystem quite well, so there is -plenty of libraries which uses Pydantic models to define the structure of the output for the Language Models. +```python +client.upsert( + collection_name='product_collection', + points=[PointStruct(id=product_id, vector=new_vector, payload=new_payload)] +) +``` -One of the interesting projects in this area is [Instructor](https://python.useinstructor.com/) that allows you to -play with different LLM providers and restrict their output to a specific structure. Let’s install the library and -already choose a provider we’ll use in this tutorial: +For large-scale changes, like re-indexing vectors after a model update, batch updating allows you to update multiple vectors in one operation without impacting search performance: -```shell -pip install "instructor[anthropic]" +```python +batch_of_updates = [ + PointStruct(id=product_id_1, vector=updated_vector_1, payload=new_payload_1), + PointStruct(id=product_id_2, vector=updated_vector_2, payload=new_payload_2), + # Add more points... +] +client.upsert( + collection_name='product_collection', + points=batch_of_updates +) ``` -Anthropic is not the only option out there, as Instructor supports many other providers including OpenAI, Ollama, -Llama, Gemini, Vertex AI, Groq, Litellm and others. You can choose the one that fits your needs the best, or the one -you already use in your RAG. +### 4. Deleting Vectors: Managing Outdated and Duplicate Data -## [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#using-instructor-to-generate-qdrant-filters) Using Instructor to generate Qdrant filters +Efficient vector management is key to keeping your searches accurate and your database lean. 
Deleting vectors that represent outdated or irrelevant data, such as expired products, old news articles, or archived profiles, helps maintain both performance and relevance. -Instructor has some helper methods to decorate the LLM APIs, so you can interact with them as if you were using their -normal SDKs. In case of Anthropic, you just pass an instance of `Anthropic` class to the `from_anthropic` function: +In Qdrant, removing vectors is straightforward, requiring only the vector IDs to be specified: ```python -import instructor -from anthropic import Anthropic - -anthropic_client = instructor.from_anthropic( - client=Anthropic( - api_key="YOUR_API_KEY", - ) +client.delete( + collection_name='data_collection', + points_selector=[point_id_1, point_id_2] ) - ``` +You can use deletion to remove outdated data, clean up duplicates, and manage the lifecycle of vectors by automatically deleting them after a set period to keep your dataset relevant and focused. -A decorated client slightly modifies the original API, so you can pass the `response_model` parameter to the -`.messages.create` method. This parameter should be a Pydantic model that defines the structure of the output. In case -of Qdrant filters, it should be a `Filter` model: +## Dense vs. Sparse Vectors -```python -from qdrant_client import models +![vector-database-dense-sparse](/articles_data/what-is-a-vector-database/vector-database-4.jpeg) -qdrant_filter = anthropic_client.messages.create( - model="claude-3-5-sonnet-latest", - response_model=models.Filter, - max_tokens=1024, - messages=[\ - {\ - "role": "user",\ - "content": "red T-shirt"\ - }\ - ], -) +Now that you understand what vectors are and how they are created, let's learn more about the two possible types of vectors you can use: **dense** or **sparse**. The main difference between the two are: -``` +### 1. Dense Vectors -The output of this code will be a Pydantic model that represents a Qdrant filter. Surprisingly, there is no need to pass -additional instructions to already figure out that the user wants to filter by the color and the type of the product. -Here is how the output looks like: +Dense vectors are, quite literally, dense with information. Every element in the vector contributes to the **semantic meaning**, **relationships** and **nuances** of the data. A dense vector representation of this sentence might look like this: -```python -Filter( - should=None, - min_should=None, - must=[\ - FieldCondition(\ - key="color",\ - match=MatchValue(value="red"),\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - ),\ - FieldCondition(\ - key="type",\ - match=MatchValue(value="t-shirt"),\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - )\ - ], - must_not=None -) +Representation of a Dense Vector -``` +Each number holds weight. Together, they convey the overall meaning of the sentence, and are better for identifying contextually similar items, even if the words don’t match exactly. -Obviously, giving the model complete freedom to generate the filter may lead to unexpected results, or no results at -all. Your collection probably has payloads with a specific structure, so it doesn’t make sense to use anything else. -Moreover, **it’s considered a good practice to filter by the fields that have been indexed**. That’s why it makes sense -to automatically determine the indexed fields and restrict the output to them. +### 2. 
Sparse Vectors -### [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#restricting-the-available-fields) Restricting the available fields +Sparse vectors operate differently. They focus only on the essentials. In most sparse vectors, a large number of elements are zeros. When a feature or token is present, it’s marked—otherwise, zero. -Qdrant collection info contains a list of the indexes created on a particular collection. You can use this information -to automatically determine the fields that can be used for filtering. Here is how you can do it: +In the image, you can see a sentence, *“I love Vector Similarity,”* broken down into tokens like *“i,” “love,” “vector”* through tokenization. Each token is assigned a unique `ID` from a large vocabulary. For example, *“i”* becomes `193`, and *“vector”* becomes `15012`. -```python -from qdrant_client import QdrantClient +How Sparse Vectors are Created -client = QdrantClient("http://localhost:6333") -collection_info = client.get_collection(collection_name="test_filter") -indexes = collection_info.payload_schema -print(indexes) +Sparse vectors, are used for **exact matching** and specific token-based identification. The values on the right, such as `193: 0.04` and `9182: 0.12`, are the scores or weights for each token, showing how relevant or important each token is in the context. The final result is a sparse vector: +```json +{ + 193: 0.04, + 9182: 0.12, + 15012: 0.73, + 6731: 0.69, + 454: 0.21 +} ``` -Output: +Everything else in the vector space is assumed to be zero. -```python -{ - "city.location": PayloadIndexInfo( - data_type=PayloadSchemaType.GEO, - ... - ), - "city.name": PayloadIndexInfo( - data_type=PayloadSchemaType.KEYWORD, - ... - ), - "color": PayloadIndexInfo( - data_type=PayloadSchemaType.KEYWORD, - ... - ), - "fabric": PayloadIndexInfo( - data_type=PayloadSchemaType.KEYWORD, - ... - ), - "price": PayloadIndexInfo( - data_type=PayloadSchemaType.FLOAT, - ... - ), -} +Sparse vectors are ideal for tasks like **keyword search** or **metadata filtering**, where you need to check for the presence of specific tokens without needing to capture the full meaning or context. They suited for exact matches within the **data itself**, rather than relying on external metadata, which is handled by payload filtering. -``` +## Benefits of Hybrid Search -Our LLM should know the names of the fields it can use, but also their type, as e.g., range filtering only makes sense -for numerical fields, and geo filtering on non-geo fields won’t yield anything meaningful. You can pass this information -as a part of the prompt to the LLM, so let’s encode it as a string: +![vector-database-get-started](/articles_data/what-is-a-vector-database/vector-database-5.jpeg) -```python -formatted_indexes = "\n".join([\ - f"- {index_name} - {index.data_type.name}"\ - for index_name, index in indexes.items()\ -]) -print(formatted_indexes) +Sometimes context alone isn’t enough. Sometimes you need precision, too. Dense vectors are fantastic when you need to retrieve results based on the context or meaning behind the data. Sparse vectors are useful when you also need **keyword or specific attribute matching**. -``` +> With hybrid search you don’t have to choose one over the othe and use both to get searches that are more **relevant** and **filtered**. -Output: +To achieve this balance, Qdrant uses **normalization** and **fusion** techniques to blend results from multiple search methods. 
One common approach is **Reciprocal Rank Fusion (RRF)**, where results from different methods are merged, giving higher importance to items ranked highly by both methods. This ensures that the best candidates, whether identified through dense or sparse vectors, appear at the top of the results. -```text -- fabric - KEYWORD -- city.name - KEYWORD -- color - KEYWORD -- price - FLOAT -- city.location - GEO +Qdrant combines dense and sparse vector results through a process of **normalization** and **fusion**. -``` +Hybrid Search API - How it works -**It’s a good idea to cache the list of the available fields and their types**, as they are not supposed to change -often. Our interactions with the LLM should be slightly different now: +### How to Use Hybrid Search in Qdrant +Qdrant makes it easy to implement hybrid search through its Query API. Here’s how you can make it happen in your own project: -```python -qdrant_filter = anthropic_client.messages.create( - model="claude-3-5-sonnet-latest", - response_model=models.Filter, - max_tokens=1024, - messages=[\ - {\ - "role": "user",\ - "content": (\ - "color is red"\ - f"\n{formatted_indexes}\n"\ - )\ - }\ - ], -) +Hybrid Query Example + +**Example Hybrid Query:** Let’s say a researcher is looking for papers on NLP, but the paper must specifically mention "transformers" in the content: +```json +search_query = { + "vector": query_vector, # Dense vector for semantic search + "filter": { # Filtering for specific terms + "must": [ + {"key": "text", "match": "transformers"} # Exact keyword match in the paper + ] + } +} ``` -Output: +In this query the dense vector search finds papers related to the broad topic of NLP and the sparse vector filtering ensures that the papers specifically mention “transformers”. -```python -Filter( - should=None, - min_should=None, - must=FieldCondition( - key="color", - match=MatchValue(value="red"), - range=None, - geo_bounding_box=None, - geo_radius=None, - geo_polygon=None, - values_count=None - ), - must_not=None -) +This is just a simple example and there's so much more you can do with it. See our complete [article on Hybrid Search](https://qdrant.tech/articles/hybrid-search/) guide to see what's happening behind the scenes and all the possibilities when building a hybrid search system. -``` +## Quantization: Get 40x Faster Results -The same query, restricted to the available fields, now generates better criteria, as it doesn’t try to filter by the -fields that don’t exist in the collection. +![vector-database-architecture](/articles_data/what-is-a-vector-database/vector-database-2.jpeg) -### [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#testing-the-llm-output) Testing the LLM output +As your vector dataset grow larger, so do the computational demands of searching through it. -Although the LLMs are quite powerful, they are not perfect. If you plan to automate filtering, it makes sense to run -some tests to see how well they perform. Especially edge cases, like queries that cannot be expressed as filters. Let’s -see how the LLM will handle the following query: +Quantized vectors are much smaller and easier to compare. With methods like [**Binary Quantization**](https://qdrant.tech/articles/binary-quantization/), you can see **search speeds improve by up to 40x while memory usage decreases by 32x**. Improvements that can be decicive when dealing with large datasets or needing low-latency results. 
-```python -qdrant_filter = anthropic_client.messages.create( - model="claude-3-5-sonnet-latest", - response_model=models.Filter, - max_tokens=1024, - messages=[\ - {\ - "role": "user",\ - "content": (\ - "fruit salad with no more than 100 calories"\ - f"\n{formatted_indexes}\n"\ - )\ - }\ - ], -) +It works by converting high-dimensional vectors, which typically use `4 bytes` per dimension, into binary representations, using just `1 bit` per dimension. Values above zero become "1", and everything else becomes "0". -``` + Binary Quantization example -Output: +Quantization reduces data precision, and yes, this does lead to some loss of accuracy. However, for binary quantization, **OpenAI embeddings** achieves this performance improvement at a cost of only 5% of accuracy. If you apply techniques like **oversampling** and **rescoring**, this loss can be brought down even further. + +However, binary quantization isn’t the only available option. Techniques like [**Scalar Quantization**](https://qdrant.tech/documentation/guides/quantization/#scalar-quantization) and [**Product Quantization**](https://qdrant.tech/documentation/guides/quantization/#product-quantization) are also popular alternatives when optimizing vector compression. + +You can set up your chosen quantization method using the `quantization_config` parameter when creating a new collection: ```python -Filter( - should=None, - min_should=None, - must=FieldCondition( - key="price", - match=None, - range=Range(lt=None, gt=None, gte=None, lte=100.0), - geo_bounding_box=None, - geo_radius=None, - geo_polygon=None, - values_count=None +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + size=1536, + distance=models.Distance.COSINE ), - must_not=None -) + # Choose your preferred quantization method + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, # Store the quantized vectors in RAM for faster access + ), + ), +) ``` +You can store original vectors on disk within the `vectors_config` by setting `on_disk=True` to save RAM space, while keeping quantized vectors in RAM for faster access -Surprisingly, the LLM extracted the calorie information from the query and generated a filter based on the price field. -It somehow extracts any numerical information from the query and tries to match it with the available fields. - -Generally, giving model some more guidance on how to interpret the query may lead to better results. Adding a system -prompt that defines the rules for the query interpretation may help the model to do a better job. Here is how you can -do it: +We recommend checking out our [Vector Quantization guide](https://qdrant.tech/articles/what-is-vector-quantization/) for a full breakdown of methods and tips on **optimizing performance** for your specific use case. -```python -SYSTEM_PROMPT = """ -You are extracting filters from a text query. Please follow the following rules: -1. Query is provided in the form of a text enclosed in tags. -2. Available indexes are put at the end of the text in the form of a list enclosed in tags. -3. You cannot use any field that is not available in the indexes. -4. Generate a filter only if you are certain that user's intent matches the field name. -5. Prices are always in USD. -6. It's better not to generate a filter than to generate an incorrect one. 
-""" +## Distributed Deployment -qdrant_filter = anthropic_client.messages.create( - model="claude-3-5-sonnet-latest", - response_model=models.Filter, - max_tokens=1024, - messages=[\ - {\ - "role": "user",\ - "content": SYSTEM_PROMPT.strip(),\ - },\ - {\ - "role": "assistant",\ - "content": "Okay, I will follow all the rules."\ - },\ - {\ - "role": "user",\ - "content": (\ - "fruit salad with no more than 100 calories"\ - f"\n{formatted_indexes}\n"\ - )\ - }\ - ], -) +When thinking about scaling, the key factors to consider are **fault tolerance**, **load balancing**, and **availability**. One node, no matter how powerful, can only take you so far. Eventually, you'll need to spread the workload across multiple machines to ensure the system remains fast and stable. -``` +### Sharding: Distributing Data Across Nodes -Current output: +In a distributed Qdrant cluster, data is split into smaller units called **shards**, which are distributed across different nodes. which helps balance the load and ensures that queries can be processed in parallel. -```python -Filter( - should=None, - min_should=None, - must=None, - must_not=None -) +Each collection—a group of related data points—can be split into non-overlapping subsets, which are then managed by different nodes. -``` + Distributed vector database with sharding and Raft consensus -### [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#handling-complex-queries) Handling complex queries +**Raft Consensus** ensures that all the nodes stay in sync and have a consistent view of the data. Each node knows where every shard is, and Raft ensures that all nodes are in sync. If one node fails, the others know where the missing data is located and can take over. -We have a bunch of indexes created on the collection, and it is quite interesting to see how the LLM will handle more -complex queries. For example, let’s see how it will handle the following query: +By default, the number of shards in your Qdrant system matches the number of nodes in your cluster. But if you need more control, you can choose the `shard_number` manually when creating a collection. ```python -qdrant_filter = anthropic_client.messages.create( - model="claude-3-5-sonnet-latest", - response_model=models.Filter, - max_tokens=1024, - messages=[\ - {\ - "role": "user",\ - "content": SYSTEM_PROMPT.strip(),\ - },\ - {\ - "role": "assistant",\ - "content": "Okay, I will follow all the rules."\ - },\ - {\ - "role": "user",\ - "content": (\ - ""\ - "white T-shirt available no more than 30 miles from London, "\ - "but not in the city itself, below $15.70, not made from polyester"\ - "\n"\ - "\n"\ - f"{formatted_indexes}\n"\ - ""\ - )\ - },\ - ], +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), + shard_number=4, # Custom number of shards ) - ``` -It might be surprising, but Anthropic Claude is able to generate even such complex filters. 
Here is the output: +There are two main types of sharding: -```python -Filter( - should=None, - min_should=None, - must=[\ - FieldCondition(\ - key="color",\ - match=MatchValue(value="white"),\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - ),\ - FieldCondition(\ - key="city.location",\ - match=None,\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=GeoRadius(\ - center=GeoPoint(lon=-0.1276, lat=51.5074),\ - radius=48280.0\ - ),\ - geo_polygon=None,\ - values_count=None\ - ),\ - FieldCondition(\ - key="price",\ - match=None,\ - range=Range(lt=15.7, gt=None, gte=None, lte=None),\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - )\ - ], must_not=[\ - FieldCondition(\ - key="city.name",\ - match=MatchValue(value="London"),\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - ),\ - FieldCondition(\ - key="fabric",\ - match=MatchValue(value="polyester"),\ - range=None,\ - geo_bounding_box=None,\ - geo_radius=None,\ - geo_polygon=None,\ - values_count=None\ - )\ - ] -) +1. **Automatic Sharding:** Points (vectors) are automatically distributed across shards using consistent hashing. Each shard contains non-overlapping subsets of the data. +2. **User-defined Sharding:** Specify how points are distributed, enabling more control over your data organization, especially for use cases like **multitenancy**, where each tenant (a user, client, or organization) has their own isolated data. -``` +Each shard is divided into **segments**. They are a smaller storage unit within a shard, storing a subset of vectors and their associated payloads (metadata). When a query is executed, it targets the only relevant segments, processing them in parallel. -The model even knows the coordinates of London and uses them to generate the geo filter. It isn’t the best idea to -rely on the model to generate such complex filters, but it’s quite impressive that it can do it. +Segments act as smaller storage units within a shard -## [Anchor](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/\#further-steps) Further steps +### Replication: High Availability and Data Integrity -Real production systems would rather require more testing and validation of the LLM output. Building a ground truth -dataset with the queries and the expected filters would be a good idea. You can use this dataset to evaluate the model -performance and to see how it behaves in different scenarios. +You don’t want a single failure to take down your system, right? Replication keeps multiple copies of the same data across different nodes to ensure **high availability**. -##### Was this page useful? +In Qdrant, **Replica Sets** manage these copies of shards across different nodes. If one replica becomes unavailable, others are there to take over and keep the system running. Whether the data is local or remote is mainly influenced by how you've configured the cluster. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + Replica Set and Replication diagram -Thank you for your feedback! 🙏 +When a query is made, if the relevant data is stored locally, the local shard handles the operation. If the data is on a remote shard, it’s retrieved via gRPC. + +You can control how many copies you want with the `replication_factor`. 
For example, creating a collection with 4 shards and a replication factor of 2 will result in 8 physical shards distributed across the cluster: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/search-precision/automate-filtering-with-llms.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), + shard_number=4, + replication_factor=2, +) +``` -On this page: +We recommend using sharding and replication together so that your data is both split across nodes and replicated for availability. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/search-precision/automate-filtering-with-llms.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +For more details on features like **user-defined sharding, node failure recovery**, and **consistency guarantees**, see our guide on [Distributed Deployment.](https://qdrant.tech/documentation/guides/distributed_deployment/) -× +## Multitenancy: Data Isolation for Multi-Tenant Architectures -[Powered by](https://qdrant.tech/) +![vector-database-get-started](/articles_data/what-is-a-vector-database/vector-database-6.png) -<|page-31-lllmstxt|> -## sitemap.xml -https://qdrant.tech/articles/distance-based-exploration/2025-03-11T14:27:31+01:00https://qdrant.tech/articles/modern-sparse-neural-retrieval/2025-05-15T19:37:07+05:30https://qdrant.tech/articles/cross-encoder-integration-gsoc/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/what-is-a-vector-database/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/what-is-vector-quantization/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/vector-search-resource-optimization/2025-05-09T12:38:02+05:30https://qdrant.tech/articles/vector-search-filtering/2025-01-06T10:45:10+01:00https://qdrant.tech/articles/immutable-data-structures/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/minicoil/2025-05-13T18:20:11+02:00https://qdrant.tech/articles/search-feedback-loop/2025-04-01T12:23:31+02:00https://qdrant.tech/articles/dedicated-vector-search/2025-02-18T12:54:36-05:00https://qdrant.tech/articles/late-interaction-models/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/indexing-optimization/2025-03-24T19:51:41+01:00https://qdrant.tech/articles/gridstore-key-value-storage/2025-02-05T09:42:23-05:00https://qdrant.tech/articles/agentic-rag/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/hybrid-search/2025-01-03T10:53:26+01:00https://qdrant.tech/articles/what-is-rag-in-ai/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/bm42/2025-04-10T12:02:16+02:00https://qdrant.tech/articles/qdrant-1.8.x/2024-07-07T18:34:56-07:00https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/2025-05-15T19:33:44+05:30https://qdrant.tech/articles/rag-is-dead/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/binary-quantization-openai/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/multitenancy/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/data-privacy/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/discovery-search/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/what-are-embeddings/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/sparse-vectors/2025-03-04T22:08:36+01:00https://qdr
ant.tech/articles/qdrant-1.7.x/2024-10-05T03:39:41+05:30https://qdrant.tech/articles/new-recommendation-api/2024-03-07T20:31:05+01:00https://qdrant.tech/articles/dedicated-service/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/fastembed/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/geo-polygon-filter-gsoc/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/binary-quantization/2025-04-10T09:21:38-03:00https://qdrant.tech/articles/food-discovery-demo/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/web-ui-gsoc/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/dimension-reduction-qsoc/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/search-as-you-type/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/vector-similarity-beyond-search/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/serverless/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/database-tutorials/bulk-upload/2025-03-25T21:43:45-03:00https://qdrant.tech/benchmarks/benchmarks-intro/2024-06-27T12:40:08+02:00https://qdrant.tech/documentation/faq/qdrant-fundamentals/2025-05-02T10:37:48+02:00https://qdrant.tech/documentation/search-precision/reranking-semantic-search/2025-05-21T15:27:35+08:00https://qdrant.tech/documentation/cloud-rbac/role-management/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/beginner-tutorials/search-beginners/2025-04-25T19:32:48+03:00https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/2025-03-10T22:19:22+01:00https://qdrant.tech/documentation/private-cloud/private-cloud-setup/2025-06-03T09:48:32+02:00https://qdrant.tech/documentation/overview/vector-search/2024-10-05T03:39:41+05:30https://qdrant.tech/articles/qdrant-1.3.x/2024-03-07T20:31:05+01:00https://qdrant.tech/benchmarks/single-node-speed-benchmark/2024-06-17T22:01:23+02:00https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022/2024-01-11T19:41:06+05:30https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/2025-05-27T18:00:51+02:00https://qdrant.tech/documentation/beginner-tutorials/neural-search/2024-11-18T15:26:15-08:00https://qdrant.tech/documentation/private-cloud/configuration/2025-03-21T16:37:49+01:00https://qdrant.tech/documentation/database-tutorials/create-snapshot/2025-06-12T09:02:54+03:00https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/2025-06-16T17:51:31+02:00https://qdrant.tech/documentation/data-ingestion-beginners/2025-05-15T20:16:43+05:30https://qdrant.tech/documentation/faq/database-optimization/2024-10-05T03:39:41+05:30https://qdrant.tech/documentation/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/2025-06-10T11:40:10+03:00https://qdrant.tech/documentation/database-tutorials/large-scale-search/2025-03-24T14:27:15-03:00https://qdrant.tech/documentation/fastembed/fastembed-quickstart/2024-08-06T15:42:27-07:00https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/2025-06-05T14:05:27+03:00https://qdrant.tech/documentation/advanced-tutorials/code-search/2025-05-15T19:33:03+05:30https://qdrant.tech/documentation/agentic-rag-crewai-zoom/2025-04-09T12:55:16+02:00https://qdrant.tech/documentation/cloud-rbac/user-management/2025-05-02T18:40:38+02:00https://qdrant.tech/articles/io\_uring/2024-12-20T13:10:51+01:00https://qdrant.tech/benchmarks/filtered-search-intro/2024-01-11T19:41:06+05:30https://qdrant.tech/documentation/agentic-rag-langgraph/2025-05-15T19:37:07+05:30https://qdrant.tech/documentation/advanced-tutorials/collaborativ
e-filtering/2024-11-18T15:26:15-08:00https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/2024-12-23T12:11:13+01:00https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/2025-04-26T13:30:39+03:00https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/2024-11-18T15:26:15-08:00https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/2025-06-16T17:51:31+02:00https://qdrant.tech/documentation/cloud-rbac/permission-reference/2025-06-13T08:39:21+02:00https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/2025-04-26T18:10:19+03:00https://qdrant.tech/documentation/overview/2025-04-26T22:59:20-07:00https://qdrant.tech/articles/product-quantization/2025-02-04T13:55:26+01:00https://qdrant.tech/benchmarks/filtered-search-benchmark/2024-01-11T19:41:06+05:30https://qdrant.tech/documentation/agentic-rag-camelai-discord/2025-04-09T12:55:16+02:00https://qdrant.tech/documentation/private-cloud/backups/2024-09-05T15:17:16+02:00https://qdrant.tech/documentation/database-tutorials/async-api/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/cloud-quickstart/2025-05-29T08:51:37-04:00https://qdrant.tech/documentation/quickstart/2025-01-20T10:08:10+01:00https://qdrant.tech/documentation/private-cloud/logging-monitoring/2025-02-11T18:21:40+01:00https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/2024-11-18T15:26:15-08:00https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/2025-02-11T18:21:40+01:00https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/2025-01-28T15:29:08+01:00https://qdrant.tech/articles/scalar-quantization/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/interfaces/2024-11-21T17:41:45+05:30https://qdrant.tech/documentation/private-cloud/api-reference/2025-06-03T09:48:32+02:00https://qdrant.tech/documentation/private-cloud/changelog/2025-06-03T09:48:32+02:00https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/2024-11-18T15:42:18-08:00https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/2025-04-30T22:48:05+05:30https://qdrant.tech/documentation/guides/installation/2025-05-02T10:37:48+02:00https://qdrant.tech/documentation/multimodal-search/2025-04-09T12:55:16+02:00https://qdrant.tech/documentation/fastembed/fastembed-splade/2025-04-25T19:38:33+03:00https://qdrant.tech/articles/seed-round/2024-03-07T20:31:05+01:00https://qdrant.tech/articles/langchain-integration/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/rag-deepseek/2025-04-26T13:02:13+03:00https://qdrant.tech/documentation/web-ui/2024-11-20T23:14:39+05:30https://qdrant.tech/documentation/fastembed/fastembed-colbert/2025-06-19T16:21:03+04:00https://qdrant.tech/articles/chatgpt-plugin/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/memory-consumption/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/qa-with-cohere-and-qdrant/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/qdrant-1.2.x/2024-03-07T20:31:05+01:00https://qdrant.tech/articles/dataset-quality/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/concepts/2024-11-14T18:59:28+01:00https://qdrant.tech/documentation/fastembed/fastembed-rerankers/2025-04-26T13:20:52+03:00https://qdrant.tech/articles/faq-question-answering/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/why-rust/2024-09-05T13:07:07-07:00https://qdrant.tech/articles/embedding-recycler/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/cars-recognition/2024-12-20T13:10:51+01:00https:/
/qdrant.tech/documentation/guides/administration/2025-05-19T15:01:52+02:00https://qdrant.tech/benchmarks/benchmark-faq/2024-01-11T19:41:06+05:30https://qdrant.tech/documentation/guides/running-with-gpu/2025-03-20T15:19:07+01:00https://qdrant.tech/articles/vector-search-manuals/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/guides/capacity-planning/2024-10-05T03:39:41+05:30https://qdrant.tech/documentation/fastembed/2025-05-27T18:00:51+02:00https://qdrant.tech/documentation/guides/optimize/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/cloud-getting-started/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/guides/multiple-partitions/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/qdrant-mcp-server/2025-05-27T18:00:51+02:00https://qdrant.tech/documentation/cloud-account-setup/2025-05-02T18:40:38+02:00https://qdrant.tech/documentation/cloud-rbac/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/cloud/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/hybrid-cloud/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/beginner-tutorials/2024-11-18T15:26:15-08:00https://qdrant.tech/documentation/advanced-tutorials/2025-02-07T18:51:10-05:00https://qdrant.tech/documentation/private-cloud/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/cloud-pricing-payments/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/2025-06-19T11:54:06+03:00https://qdrant.tech/documentation/data-management/2025-05-31T21:49:18+02:00https://qdrant.tech/documentation/examples/llama-index-multitenancy/2024-04-11T13:13:14-07:00https://qdrant.tech/documentation/database-tutorials/2025-06-11T19:02:35+03:00https://qdrant.tech/documentation/embeddings/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/cloud-premium/2025-05-02T16:53:21+02:00https://qdrant.tech/articles/metric-learning-tips/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/cloud/create-cluster/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/frameworks/2025-05-19T21:17:24+05:30https://qdrant.tech/articles/qdrant-internals/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/observability/2024-11-14T18:59:28+01:00https://qdrant.tech/documentation/platforms/2025-05-14T07:24:10-04:00https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/2024-05-15T18:01:28+02:00https://qdrant.tech/documentation/examples/cohere-rag-connector/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/send-data/2024-11-14T18:59:28+01:00https://qdrant.tech/documentation/examples/2025-06-19T11:54:06+03:00https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/2024-04-15T17:41:39-07:00https://qdrant.tech/documentation/cloud-api/2025-06-06T09:56:35+02:00https://qdrant.tech/documentation/cloud-tools/2024-11-19T17:56:47-08:00https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/datasets/2024-11-14T18:59:28+01:00https://qdrant.tech/articles/detecting-coffee-anomalies/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/triplet-loss/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/cloud/authentication/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/concepts/collections/2025-04-07T00:40:39+02:00https://qdrant.tech/articles/data-exploration/2024-12-20T13:10:51+01:00https://qdrant.tech/docum
entation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/2024-04-15T19:50:07-07:00https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/2025-05-15T19:37:07+05:30https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/2024-08-23T22:48:27+05:30https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/cloud/cluster-access/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/support/2025-04-08T10:25:18+02:00https://qdrant.tech/documentation/send-data/databricks/2024-07-29T21:03:45+05:30https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/2024-08-13T13:38:38+03:00https://qdrant.tech/articles/machine-learning/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/concepts/points/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/concepts/vectors/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/concepts/payload/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/2024-07-22T17:09:17-07:00https://qdrant.tech/articles/neural-search-tutorial/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/rag-and-genai/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/cloud/cluster-scaling/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/concepts/search/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/concepts/explore/2025-06-12T10:45:50-04:00https://qdrant.tech/documentation/cloud/cluster-monitoring/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/cloud/cluster-upgrades/2025-05-02T16:53:21+02:00https://qdrant.tech/documentation/concepts/hybrid-queries/2025-04-23T11:15:58+02:00https://qdrant.tech/articles/filtrable-hnsw/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/concepts/filtering/2025-06-09T18:30:19+03:30https://qdrant.tech/articles/practicle-examples/2024-12-20T13:10:51+01:00https://qdrant.tech/documentation/cloud/backups/2025-05-02T16:53:21+02:00https://qdrant.tech/articles/qdrant-0-11-release/2022-12-06T13:12:27+01:00https://qdrant.tech/articles/qdrant-0-10-release/2024-05-15T18:01:28+02:00https://qdrant.tech/documentation/concepts/optimizer/2024-11-27T16:59:34+01:00https://qdrant.tech/documentation/concepts/storage/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/concepts/indexing/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/guides/distributed\_deployment/2025-02-03T17:33:39+06:00https://qdrant.tech/documentation/concepts/snapshots/2025-06-12T09:02:54+03:00https://qdrant.tech/documentation/guides/quantization/2025-04-07T00:40:39+02:00https://qdrant.tech/documentation/guides/monitoring/2025-02-11T18:21:40+01:00https://qdrant.tech/documentation/guides/configuration/2025-02-04T11:00:51+01:00https://qdrant.tech/documentation/guides/security/2025-01-20T16:32:23+01:00https://qdrant.tech/documentation/guides/usage-statistics/2024-12-03T17:03:30+01:00https://qdrant.tech/documentation/guides/common-errors/2025-05-27T12:04:07+02:00https://qdrant.tech/documentation/database-tutorials/migration/2025-06-11T18:57:35+03:00https://qdrant.tech/blog/hybrid-cloud-vultr/2024-05-21T10:11:09+02:00https://qdrant.tech/articles/quantum-quantization/2023-07-13T01:45:36+02:00https://qdrant.tech/blog/hybrid-cloud-stackit/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-scaleway/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-red-hat-openshift/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-ovhclo
ud/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-llamaindex/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-langchain/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-jinaai/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-haystack/2024-09-24T14:30:20-04:00https://qdrant.tech/blog/hybrid-cloud-digitalocean/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-aleph-alpha/2025-02-04T13:55:26+01:00https://qdrant.tech/blog/hybrid-cloud-airbyte/2025-02-04T13:55:26+01:00https://qdrant.tech/documentation/observability/openllmetry/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/observability/openlit/2024-08-15T08:50:37+05:30https://qdrant.tech/blog/case-study-lettria-v2/2025-06-16T22:38:02-07:00https://qdrant.tech/2025-06-19T16:21:03+04:00https://qdrant.tech/blog/beta-database-migration-tool/2025-06-18T11:55:05-04:00https://qdrant.tech/blog/case-study-lawme/2025-06-11T09:42:37-07:00https://qdrant.tech/blog/case-study-convosearch/2025-06-10T09:54:12-07:00https://qdrant.tech/blog/legal-tech-builders-guide/2025-06-13T15:44:13-07:00https://qdrant.tech/blog/soc-2-type-ii-hipaa/2025-06-17T16:48:22-07:00https://qdrant.tech/blog/n8n-node/2025-06-09T15:38:39+02:00https://qdrant.tech/blog/datatalks-course/2025-06-05T09:19:05-04:00https://qdrant.tech/blog/case-study-qovery/2025-05-27T11:19:41-07:00https://qdrant.tech/blog/case-study-tripadvisor/2025-05-13T23:15:13-07:00https://qdrant.tech/blog/case-study-aracor/2025-05-13T11:23:13-07:00https://qdrant.tech/blog/case-study-garden-intel/2025-05-09T11:56:26-07:00https://qdrant.tech/blog/product-ui-changes/2025-05-08T09:28:12-04:00https://qdrant.tech/blog/case-study-pariti/2025-05-01T10:05:43-07:00https://qdrant.tech/articles/vector-search-production/2025-04-30T17:47:55+02:00https://qdrant.tech/blog/case-study-dust-v2/2025-05-08T11:45:46-07:00https://qdrant.tech/blog/case-study-sayone/2025-04-29T09:15:10-07:00https://qdrant.tech/blog/superlinked-multimodal-search/2025-04-24T14:10:50+02:00https://qdrant.tech/blog/qdrant-1.14.x/2025-05-02T15:26:42-03:00https://qdrant.tech/blog/case-study-pathwork/2025-05-16T09:10:33-07:00https://qdrant.tech/blog/case-study-lyzr/2025-05-16T09:10:33-07:00https://qdrant.tech/blog/case-study-mixpeek/2025-05-16T09:10:33-07:00https://qdrant.tech/blog/qdrant-n8n-beyond-simple-similarity-search/2025-04-08T11:38:52+02:00https://qdrant.tech/blog/satellite-vector-broadcasting/2025-04-01T08:09:34+02:00https://qdrant.tech/blog/case-study-hubspot/2025-05-16T09:10:33-07:00https://qdrant.tech/blog/webinar-vibe-coding-rag/2025-03-21T16:36:29+01:00https://qdrant.tech/blog/case-study-deutsche-telekom/2025-04-03T08:09:56-04:00https://qdrant.tech/blog/enterprise-vector-search/2025-04-07T15:17:30-04:00https://qdrant.tech/blog/metadata-deasy-labs/2025-02-24T15:04:44-03:00https://qdrant.tech/blog/webinar-crewai-qdrant-obsidian/2025-01-24T16:10:16+01:00https://qdrant.tech/blog/qdrant-1.13.x/2025-01-24T04:19:54-05:00https://qdrant.tech/blog/static-embeddings/2025-01-17T14:53:25+01:00https://qdrant.tech/blog/case-study-voiceflow/2024-12-10T10:26:56-08:00https://qdrant.tech/blog/facial-recognition/2024-12-03T20:56:40-08:00https://qdrant.tech/blog/colpali-qdrant-optimization/2024-11-30T18:57:48-03:00https://qdrant.tech/blog/rag-evaluation-guide/2025-02-18T21:01:07+05:30https://qdrant.tech/blog/case-study-qatech/2024-11-21T16:42:35-08:00https://qdrant.tech/blog/qdrant-colpali/2024-11-06T17:18:48-08:00https://qdrant.tech/blog/case-study-sprinklr/2024-10-18T09:03:19-07:00https://qdran
t.tech/blog/qdrant-1.12.x/2024-10-08T19:49:58-07:00https://qdrant.tech/blog/qdrant-deeplearning-ai-course/2024-10-07T12:25:14-07:00https://qdrant.tech/blog/qdrant-for-startups-launch/2024-10-02T19:07:16+05:30https://qdrant.tech/blog/case-study-shakudo/2025-03-13T17:47:05+01:00https://qdrant.tech/blog/qdrant-relari/2024-09-17T15:53:48-07:00https://qdrant.tech/blog/case-study-nyris/2024-09-23T14:05:33-07:00https://qdrant.tech/blog/case-study-kern/2024-09-23T14:05:33-07:00https://qdrant.tech/blog/qdrant-1.11.x/2024-08-16T00:01:23+02:00https://qdrant.tech/blog/case-study-kairoswealth/2024-09-11T14:59:00-07:00https://qdrant.tech/blog/qdrant-1.10.x/2024-07-16T22:00:30+05:30https://qdrant.tech/blog/community-highlights-1/2024-06-21T02:34:01-03:00https://qdrant.tech/blog/cve-2024-3829-response/2024-06-10T12:42:49-04:00https://qdrant.tech/blog/qdrant-soc2-type2-audit/2024-08-29T19:19:43+05:30https://qdrant.tech/blog/qdrant-stars-announcement/2024-10-05T03:39:41+05:30https://qdrant.tech/blog/qdrant-cpu-intel-benchmark/2024-10-08T12:41:46-07:00https://qdrant.tech/blog/qsoc24-interns-announcement/2024-05-08T18:04:46-03:00https://qdrant.tech/articles/semantic-cache-ai-data-retrieval/2024-12-20T13:10:51+01:00https://qdrant.tech/blog/are-you-vendor-locked/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/case-study-visua/2024-05-01T17:59:13-07:00https://qdrant.tech/blog/qdrant-1.9.x/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud-launch-partners/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/hybrid-cloud/2024-05-21T10:11:09+02:00https://qdrant.tech/blog/rag-advancements-challenges/2024-04-12T14:45:02+00:00https://qdrant.tech/blog/building-search-rag-open-api/2024-04-12T14:23:42+00:00https://qdrant.tech/blog/gen-ai-and-vector-search/2024-07-07T19:32:50-07:00https://qdrant.tech/blog/teaching-vector-db-at-scale/2024-04-09T11:06:17+00:00https://qdrant.tech/blog/meow-with-cheshire-cat/2024-04-09T11:05:51+00:00https://qdrant.tech/blog/cve-2024-2221-response/2024-08-15T17:31:04+02:00https://qdrant.tech/blog/fastllm-announcement/2024-04-01T04:13:26-07:00https://qdrant.tech/blog/virtualbrain-best-rag/2024-09-20T10:12:14-04:00https://qdrant.tech/blog/youtube-without-paying-cent/2024-03-27T12:44:32+00:00https://qdrant.tech/blog/azure-marketplace/2024-10-05T03:39:41+05:30https://qdrant.tech/blog/real-time-news-distillation-rag/2024-03-25T08:49:27+00:00https://qdrant.tech/blog/insight-generation-platform/2024-03-25T08:51:56+00:00https://qdrant.tech/blog/llm-as-a-judge/2024-03-19T15:05:24+00:00https://qdrant.tech/blog/vector-search-vector-recommendation/2024-03-19T14:22:15+00:00https://qdrant.tech/blog/using-qdrant-and-langchain/2024-05-15T18:01:28+02:00https://qdrant.tech/blog/iris-agent-qdrant/2024-03-06T09:17:19-08:00https://qdrant.tech/blog/case-study-dailymotion/2024-03-07T20:31:05+01:00https://qdrant.tech/blog/comparing-qdrant-vs-pinecone-vector-databases/2025-02-04T13:55:26+01:00https://qdrant.tech/blog/what-is-vector-similarity/2024-09-05T13:07:07-07:00https://qdrant.tech/blog/dspy-vs-langchain/2025-05-15T19:37:07+05:30https://qdrant.tech/blog/qdrant-summer-of-code-24/2024-03-14T18:24:32+01:00https://qdrant.tech/blog/dust-and-qdrant/2024-09-20T10:19:38-04:00https://qdrant.tech/blog/bitter-lesson-generative-language-model/2024-01-29T16:31:02+00:00https://qdrant.tech/blog/indexify-content-extraction-engine/2024-03-07T18:59:29+00:00https://qdrant.tech/blog/qdrant-x-dust-vector-search/2024-07-07T19:40:44-07:00https://qdrant.tech/blog/series-a-funding-round/2024-10-08T12:41:46-07:00https://qdrant.t
ech/blog/qdrant-cloud-on-microsoft-azure/2024-03-07T20:31:05+01:00https://qdrant.tech/blog/qdrant-benchmarks-2024/2024-03-07T20:31:05+01:00https://qdrant.tech/blog/navigating-challenges-innovations/2024-05-21T09:57:56+02:00https://qdrant.tech/blog/open-source-vector-search-engine-vector-database/2024-07-07T19:36:05-07:00https://qdrant.tech/blog/vector-image-search-rag/2024-01-25T17:51:08+01:00https://qdrant.tech/blog/semantic-search-vector-database/2024-07-07T19:46:08-07:00https://qdrant.tech/blog/llm-complex-search-copilot/2024-01-10T11:42:02+00:00https://qdrant.tech/blog/entity-matching-qdrant/2024-01-10T11:37:51+00:00https://qdrant.tech/blog/fast-embed-models/2024-01-22T10:15:56-08:00https://qdrant.tech/blog/human-language-ai-models/2024-01-10T10:31:15+00:00https://qdrant.tech/blog/binary-quantization/2024-01-10T10:26:06+00:00https://qdrant.tech/blog/qdrant-unstructured/2024-03-07T20:31:05+01:00https://qdrant.tech/blog/qdrant-n8n/2024-03-07T20:31:05+01:00https://qdrant.tech/blog/vector-search-and-applications-record/2024-09-06T13:14:12+02:00https://qdrant.tech/blog/cohere-embedding-v3/2024-09-06T13:14:12+02:00https://qdrant.tech/blog/case-study-pienso/2024-04-10T17:59:48-07:00https://qdrant.tech/blog/case-study-bloop/2024-07-18T19:11:22-07:00https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/2024-09-18T15:57:29-07:00https://qdrant.tech/articles/storing-multiple-vectors-per-object-in-qdrant/2024-12-20T13:10:51+01:00https://qdrant.tech/articles/batch-vector-search-with-qdrant/2024-12-20T13:10:51+01:00https://qdrant.tech/blog/qdrant-supports-arm-architecture/2024-01-16T22:02:52+05:30https://qdrant.tech/about-us/2024-05-21T09:57:56+02:00https://qdrant.tech/data-analysis-anomaly-detection/2024-08-29T10:01:03-04:00https://qdrant.tech/advanced-search/2024-08-21T16:31:41-07:00https://qdrant.tech/ai-agents/2025-02-12T08:47:39-06:00https://qdrant.tech/e-commerce/2025-05-22T20:23:57+02:00https://qdrant.tech/documentation/data-management/airbyte/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/embeddings/aleph-alpha/2024-11-28T08:54:13+05:30https://qdrant.tech/get\_anonymous\_id/2025-03-05T11:26:52+00:00https://qdrant.tech/documentation/data-management/airflow/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/data-management/nifi/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/data-management/spark/2025-03-06T10:23:24+05:30https://qdrant.tech/documentation/platforms/apify/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/autogen/2024-11-20T11:50:06+05:30https://qdrant.tech/documentation/embeddings/bedrock/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/frameworks/lakechain/2024-10-17T11:42:14+05:30https://qdrant.tech/about-us/about-us-resources/2025-05-30T14:14:31+03:00https://qdrant.tech/brand-resources/2024-06-17T16:56:32+03:00https://qdrant.tech/documentation/platforms/bubble/2024-08-15T08:50:37+05:30https://qdrant.tech/security/bug-bounty-program/2025-03-28T09:40:53+01:00https://qdrant.tech/documentation/build/2024-11-18T14:53:02-08:00https://qdrant.tech/documentation/platforms/buildship/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/camel/2024-12-20T13:31:09+05:30https://qdrant.tech/documentation/frameworks/cheshire-cat/2025-01-24T11:47:11+01:00https://qdrant.tech/documentation/data-management/cocoindex/2025-04-20T23:11:21-07:00https://qdrant.tech/documentation/data-management/cognee/2025-05-31T22:06:39+02:00https://qdrant.tech/documentation/embeddings/cohere/2025-02-19T10:27:39+03:00ht
tps://qdrant.tech/community/2025-01-07T11:56:39-06:00https://qdrant.tech/documentation/data-management/confluent/2024-08-15T08:50:37+05:30https://qdrant.tech/contact-us/2025-03-13T17:47:05+01:00https://qdrant.tech/legal/credits/2022-04-25T15:19:19+02:00https://qdrant.tech/documentation/frameworks/crewai/2025-02-27T09:21:41+01:00https://qdrant.tech/customers/2024-06-17T16:56:32+03:00https://qdrant.tech/documentation/frameworks/dagster/2025-04-15T18:20:05+05:30https://qdrant.tech/documentation/observability/datadog/2024-10-31T05:56:39+05:30https://qdrant.tech/documentation/frameworks/deepeval/2025-04-24T16:09:40+08:00https://qdrant.tech/documentation/data-management/dlt/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/docarray/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/platforms/docsgpt/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/frameworks/dsrag/2024-11-27T17:59:33+05:30https://qdrant.tech/documentation/frameworks/dynamiq/2025-03-24T10:22:45+02:00https://qdrant.tech/articles/ecosystem/2024-12-20T13:10:51+01:00https://qdrant.tech/enterprise-solutions/2024-08-20T14:08:09-04:00https://qdrant.tech/documentation/frameworks/feast/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/frameworks/fifty-one/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/genkit/2024-10-05T03:39:41+05:30https://qdrant.tech/documentation/data-management/fondant/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/embeddings/gemini/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/frameworks/haystack/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/honeyhive/2025-05-09T04:07:10-03:00https://qdrant.tech/hospitality-and-travel/2025-05-21T18:13:48+02:00https://qdrant.tech/legal/impressum/2024-02-28T17:57:34+01:00https://qdrant.tech/documentation/data-management/fluvio/2024-09-15T21:31:35+05:30https://qdrant.tech/documentation/platforms/rivet/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/embeddings/jina-embeddings/2024-11-28T08:54:13+05:30https://qdrant.tech/about-us/about-us-get-started/2025-05-30T14:14:31+03:00https://qdrant.tech/documentation/platforms/keboola/2025-05-14T07:24:10-04:00https://qdrant.tech/documentation/platforms/kotaemon/2024-11-07T03:37:15+05:30https://qdrant.tech/documentation/frameworks/langchain/2024-08-29T19:19:43+05:30https://qdrant.tech/documentation/frameworks/langchain-go/2024-11-04T16:55:24+01:00https://qdrant.tech/documentation/frameworks/langchain4j/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/langgraph/2024-11-20T19:27:09+05:30https://qdrant.tech/legal-tech/2025-04-24T18:13:38+02:00https://qdrant.tech/documentation/frameworks/llama-index/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/platforms/make/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/mastra/2024-12-20T13:30:42+05:30https://qdrant.tech/documentation/frameworks/mem0/2024-10-05T13:55:10+05:30https://qdrant.tech/documentation/frameworks/nlweb/2025-05-19T21:26:59+05:30https://qdrant.tech/documentation/data-management/mindsdb/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/embeddings/mistral/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/embeddings/mixedbread/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/embeddings/mixpeek/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/platforms/n8n/2025-06-06T22:10:24+05:30https://qdrant.tech/documentation/frameworks/neo4j-graphrag/2024-11-07T02:58:58+05:30https://qdrant.t
ech/documentation/embeddings/nomic/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/embeddings/nvidia/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/embeddings/ollama/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/embeddings/openai/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/frameworks/openai-agents/2025-04-30T14:10:48+05:30https://qdrant.tech/about-us/about-us-engineering-culture/2025-05-30T14:14:31+03:00https://qdrant.tech/documentation/frameworks/pandas-ai/2025-02-18T21:01:07+05:30https://qdrant.tech/partners/2024-06-17T16:56:32+03:00https://qdrant.tech/documentation/frameworks/canopy/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/platforms/pipedream/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/platforms/portable/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/platforms/powerapps/2025-01-10T21:05:50+05:30https://qdrant.tech/documentation/embeddings/premai/2024-11-28T08:54:13+05:30https://qdrant.tech/pricing/2024-08-20T12:47:35-07:00https://qdrant.tech/legal/privacy-policy/2025-06-19T13:22:43+02:00https://qdrant.tech/private-cloud/2024-05-21T09:57:56+02:00https://qdrant.tech/documentation/platforms/privategpt/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/cloud-tools/pulumi/2024-11-19T18:01:59-08:00https://qdrant.tech/articles/2024-12-20T13:10:51+01:00https://qdrant.tech/blog/2024-05-21T09:57:56+02:00https://qdrant.tech/cloud/2024-08-20T11:44:59-07:00https://qdrant.tech/demo/2024-09-06T13:14:12+02:00https://qdrant.tech/qdrant-for-startups/2024-09-30T18:44:08+02:00https://qdrant.tech/hybrid-cloud/2024-05-21T10:11:09+02:00https://qdrant.tech/stars/2024-06-17T16:56:32+03:00https://qdrant.tech/qdrant-vector-database/2024-08-29T08:43:52-04:00https://qdrant.tech/rag/rag-evaluation-guide/2024-09-16T18:43:11+02:00https://qdrant.tech/rag/2024-08-20T11:45:42-07:00https://qdrant.tech/documentation/frameworks/ragbits/2024-11-07T08:29:10+05:30https://qdrant.tech/recommendations/2024-08-20T12:49:28-07:00https://qdrant.tech/documentation/data-management/redpanda/2024-08-15T22:23:17+05:30https://qdrant.tech/documentation/frameworks/rig-rs/2024-11-07T08:04:53+05:30https://qdrant.tech/documentation/platforms/mulesoft/2025-01-10T21:16:11+05:30https://qdrant.tech/documentation/frameworks/semantic-router/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/smolagents/2025-01-04T22:43:37+05:30https://qdrant.tech/documentation/embeddings/snowflake/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/frameworks/solon/2025-04-15T18:20:05+05:30https://qdrant.tech/documentation/frameworks/spring-ai/2024-08-29T19:19:43+05:30https://qdrant.tech/documentation/frameworks/dspy/2025-06-16T17:32:35+03:00https://qdrant.tech/subscribe-confirmation/2023-12-26T11:53:00+00:00https://qdrant.tech/subscribe/2025-02-04T13:55:26+01:00https://qdrant.tech/documentation/frameworks/superduper/2024-11-27T17:46:12+05:30https://qdrant.tech/documentation/frameworks/sycamore/2024-10-17T11:40:28+05:30https://qdrant.tech/legal/terms\_and\_conditions/2021-12-10T10:29:52+01:00https://qdrant.tech/documentation/cloud-tools/terraform/2024-11-19T18:01:59-08:00https://qdrant.tech/documentation/frameworks/testcontainers/2025-04-24T18:47:10+10:00https://qdrant.tech/documentation/platforms/tooljet/2025-03-06T14:58:05+05:30https://qdrant.tech/documentation/embeddings/twelvelabs/2025-01-07T21:51:22+05:30https://qdrant.tech/documentation/frameworks/txtai/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/data-management/unstruct
ured/2025-02-18T21:01:07+05:30https://qdrant.tech/documentation/embeddings/upstage/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/frameworks/vanna-ai/2024-08-15T08:50:37+05:30https://qdrant.tech/documentation/frameworks/mirror-security/2025-02-21T09:20:59+05:30https://qdrant.tech/benchmarks/2023-02-16T18:40:22+04:00https://qdrant.tech/use-cases/2024-09-04T08:01:21-07:00https://qdrant.tech/documentation/platforms/vectorize/2025-02-05T06:14:34-05:00https://qdrant.tech/documentation/embeddings/voyage/2024-11-28T08:54:13+05:30https://qdrant.tech/documentation/cloud-intro/2025-05-02T16:53:21+02:00 +Sharding efficiently distributes data across nodes, while replication guarantees redundancy and fault tolerance. But what happens when you’ve got multiple clients or user groups, and you need to keep their data isolated within the same infrastructure? - +**Multitenancy** allows you to keep data for different tenants (users, clients, or organizations) isolated within a single cluster. Instead of creating separate collections for `Tenant 1` and `Tenant 2`, you store their data in the same collection but tag each vector with a `group_id` to identify which tenant it belongs to. - +Multitenancy dividing data between 2 tenants -https://qdrant.tech/articles/distance-based-exploration/ +In the backend, Qdrant can store `Tenant 1`’s data in Shard 1 located in Canada (perhaps for compliance reasons like GDPR), while `Tenant 2`’s data is stored in Shard 2 located in Germany. The data will be physically separated but still within the same infrastructure. -2025-03-11T14:27:31+01:00 +To implement this, you tag each vector with a tenant-specific `group_id` during the upsert operation: -... +```python +client.upsert( + collection_name="tenant_data", + points=[models.PointStruct( + id=2, + payload={"group_id": "tenant_1"}, + vector=[0.1, 0.9, 0.1] + )], + shard_key_selector="canada" +) +``` - +Each tenant’s data remains isolated while still benefiting from the shared infrastructure. Optimizing for data privacy, compliance with local regulations, and scalability, without the need to create excessive collections or maintain separate clusters for each tenant. - +If you want to learn more about working with a multitenant setup in Qdrant, you can check out our [Multitenancy and Custom Sharding dedicated guide.](https://qdrant.tech/articles/multitenancy/) -https://qdrant.tech/articles/modern-sparse-neural-retrieval/ +## Data Security and Access Control -2025-05-15T19:37:07+05:30 +A common security risk in vector databases is the possibility of **embedding inversion attacks**, where attackers could reconstruct the original data from embeddings. There are many layers of protection you can use to secure your instance that are very important before getting your vector database into production. -... +For quick security in simpler use cases, you can use the **API key authentication**. To enable it, set up the API key in the configuration or environment variable. - +```yaml +service: + api_key: your_secret_api_key_here + enable_tls: true # Make sure to enable TLS to protect the API key from being exposed +``` - +Once this is set up, remember to include the API key in all your requests: -https://qdrant.tech/articles/cross-encoder-integration-gsoc/ +```python +from qdrant_client import QdrantClient -2024-12-20T13:10:51+01:00 +client = QdrantClient( + url="https://localhost:6333", + api_key="your_secret_api_key_here" +) +``` -... 
+In more advanced setups, Qdrant uses **JWT (JSON Web Tokens)** to enforce **Role-Based Access Control (RBAC)**. - +RBAC defines roles and assigns permissions, while JWT securely encodes these roles into tokens. Each request is validated against the user's JWT, ensuring they can only access or modify data based on their assigned permissions. - +You can easily setup you access tokens and secure access to sensitive data through the **Qdrant Web UI:** -https://qdrant.tech/articles/what-is-a-vector-database/ +Qdrant Web UI for generating a new access token. -2024-12-20T13:10:51+01:00 +By default, Qdrant instances are **unsecured**, so it's important to configure security measures before moving to production. To learn more about how to configure security for your Qdrant instance and other advanced options, please check out the [official Qdrant documentation on security.](https://qdrant.tech/documentation/guides/security/) -... +## Time to Experiment - +As we've seen in this article, a vector database is definitely not **just** a database as we traditionally know it. It opens up a world of possibilities, from advanced similarity search to hybrid search that allows content retrieval with both context and precision. - +But there’s no better way to learn than by doing. Try building a [semantic search engine](https://qdrant.tech/documentation/tutorials/search-beginners/) or experiment deploying a [hybrid search service](https://qdrant.tech/documentation/tutorials/hybrid-search-fastembed/) from zero. You'll realize there are endless ways you can take advantage of vectors. -https://qdrant.tech/articles/what-is-vector-quantization/ +| **Use Case** | **How It Works** | **Examples** | +|-----------------------------------|------------------------------------------------------------------------------------------------------|-----------------------------------------------------------| +| **Similarity Search** | Finds similar data points using vector distances | Find similar product images, retrieve documents based on themes, discover related topics | +| **Anomaly Detection** | Identifies outliers based on deviations in vector space | Detect unusual user behavior in banking, spot irregular patterns | +| **Recommendation Systems** | Uses vector embeddings to learn and model user preferences | Personalized movie or music recommendations, e-commerce product suggestions | +| **RAG (Retrieval-Augmented Generation)** | Combines vector search with large language models (LLMs) for contextually relevant answers | Customer support, auto-generate summaries of documents, research reports | +| **Multimodal Search** | Search across different types of data like text, images, and audio in a single query. | Search for products with a description and image, retrieve images based on audio or text | +| **Voice & Audio Recognition** | Uses vector representations to recognize and retrieve audio content | Speech-to-text transcription, voice-controlled smart devices, identify and categorize sounds | +| **Knowledge Graph Augmentation** | Links unstructured data to concepts in knowledge graphs using vectors | Link research papers to related studies, connect customer reviews to product features, organize patents by innovation trends| -2024-12-20T13:10:51+01:00 -... +You can also watch our video tutorial and get started with Qdrant to generate semantic search results and recommendations from a sample dataset. + + - +Phew! I hope you found some of the concepts here useful. 
If you have any questions feel free to send them in our [Discord Community](https://discord.com/invite/qdrant) where our team will be more than happy to help you out! - +> Remember, don't get lost in vector space! 🚀 -https://qdrant.tech/articles/vector-search-resource-optimization/ +<|page-19-lllmstxt|> +Vector quantization is a data compression technique used to reduce the size of high-dimensional data. Compressing vectors reduces memory usage while maintaining nearly all of the essential information. This method allows for more efficient storage and faster search operations, particularly in large datasets. -2025-05-09T12:38:02+05:30 +When working with high-dimensional vectors, such as embeddings from providers like OpenAI, a single 1536-dimensional vector requires **6 KB of memory**. -... +1536-dimensional vector size is 6 KB - +With 1 million vectors needing around 6 GB of memory, as your dataset grows to multiple **millions of vectors**, the memory and processing demands increase significantly. - +To understand why this process is so computationally demanding, let's take a look at the nature of the [HNSW index](https://qdrant.tech/documentation/concepts/indexing/#vector-index). -https://qdrant.tech/articles/vector-search-filtering/ +The **HNSW (Hierarchical Navigable Small World) index** organizes vectors in a layered graph, connecting each vector to its nearest neighbors. At each layer, the algorithm narrows down the search area until it reaches the lower layers, where it efficiently finds the closest matches to the query. -2025-01-06T10:45:10+01:00 +HNSW Search visualization -... +Each time a new vector is added, the system must determine its position in the existing graph, a process similar to searching. This makes both inserting and searching for vectors complex operations. - +One of the key challenges with the HNSW index is that it requires a lot of **random reads** and **sequential traversals** through the graph. This makes the process computationally expensive, especially when you're dealing with millions of high-dimensional vectors. - +The system has to jump between various points in the graph in an unpredictable manner. This unpredictability makes optimization difficult, and as the dataset grows, the memory and processing requirements increase significantly. -https://qdrant.tech/articles/immutable-data-structures/ +HNSW Search visualization -2024-12-20T13:10:51+01:00 +Since vectors need to be stored in **fast storage** like **RAM** or **SSD** for low-latency searches, as the size of the data grows, so does the cost of storing and processing it efficiently. -... +**Quantization** offers a solution by compressing vectors to smaller memory sizes, making the process more efficient. - +There are several methods to achieve this, and here we will focus on three main ones: - +Types of Quantization: 1. Scalar Quantization, 2. Product Quantization, 3. Binary Quantization -https://qdrant.tech/articles/minicoil/ +## 1. What is Scalar Quantization? -2025-05-13T18:20:11+02:00 +![](/articles_data/what-is-vector-quantization/astronaut-mars.jpg) -... +In Qdrant, each dimension is represented by a `float32` value, which uses **4 bytes** of memory. When using [Scalar Quantization](https://qdrant.tech/documentation/guides/quantization/#scalar-quantization), we map our vectors to a range that the smaller `int8` type can represent. An `int8` is only **1 byte** and can represent 256 values (from -128 to 127, or 0 to 255). This results in a **75% reduction** in memory size. 
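As a rough sketch of how such a mapping can work (an illustrative affine formula; Qdrant's actual implementation also takes quantile bounds into account, as described further below), a float value x from a known range [x_min, x_max] can be converted to an `int8` value like this:

$$
x_{\mathrm{int8}} = \operatorname{round}\left(\frac{x - x_{\min}}{x_{\max} - x_{\min}} \times 255\right) - 128
$$

Under this sketch, the -1.0 to 1.0 example below maps -1.0 to -128, 0.0 to roughly 0, and 1.0 to 127.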
- +For example, if our data lies in the range of -1.0 to 1.0, Scalar Quantization will transform these values to a range that `int8` can represent, i.e., within -128 to 127. The system **maps** the `float32` values into this range. - +Here's a simple linear example of what this process looks like: -https://qdrant.tech/articles/search-feedback-loop/ +![Scalar Quantization example](/articles_data/what-is-vector-quantization/scalar-quant.png) -2025-04-01T12:23:31+02:00 +To set up Scalar Quantization in Qdrant, you need to include the `quantization_config` section when creating or updating a collection: -... +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 128, + "distance": "Cosine" + }, + "quantization_config": { + "scalar": { + "type": "int8", + "quantile": 0.99, + "always_ram": true + } + } +} +``` + +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=128, distance=models.Distance.COSINE), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) +``` - +The `quantile` parameter is used to calculate the quantization bounds. For example, if you specify a `0.99` quantile, the most extreme 1% of values will be excluded from the quantization bounds. - +This parameter only affects the resulting precision, not the memory footprint. You can adjust it if you experience a significant decrease in search quality. -https://qdrant.tech/articles/dedicated-vector-search/ +Scalar Quantization is a great choice if you're looking to boost search speed and compression without losing much accuracy. It also slightly improves performance, as distance calculations (such as dot product or cosine similarity) using `int8` values are computationally simpler than using `float32` values. -2025-02-18T12:54:36-05:00 +While the performance gains of Scalar Quantization may not match those achieved with Binary Quantization (which we'll discuss later), it remains an excellent default choice when Binary Quantization isn’t suitable for your use case. -... +## 2. What is Binary Quantization? - +![Astronaut in surreal white environment](/articles_data/what-is-vector-quantization/astronaut-white-surreal.jpg) - +[Binary Quantization](https://qdrant.tech/documentation/guides/quantization/#binary-quantization) is an excellent option if you're looking to **reduce memory** usage while also achieving a significant **boost in speed**. It works by converting high-dimensional vectors into simple binary (0 or 1) representations. -https://qdrant.tech/articles/late-interaction-models/ +- Values greater than zero are converted to 1. +- Values less than or equal to zero are converted to 0. -2024-12-20T13:10:51+01:00 +Let's consider our initial example of a 1536-dimensional vector that requires **6 KB** of memory (4 bytes for each `float32` value). -... +After Binary Quantization, each dimension is reduced to 1 bit (1/8 byte), so the memory required is: - +$$ +\frac{1536 \text{ dimensions}}{8 \text{ bits per byte}} = 192 \text{ bytes} +$$ - +This leads to a **32x** memory reduction. -https://qdrant.tech/articles/indexing-optimization/ +Binary Quantization example -2025-03-24T19:51:41+01:00 +Qdrant automates the Binary Quantization process during indexing. As vectors are added to your collection, each 32-bit floating-point component is converted into a binary value according to the configuration you define. -... 
+Here’s how you can set it up: - +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": { + "binary": { + "always_ram": true + } + } +} +``` - +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, + ), + ), +) +``` -https://qdrant.tech/articles/gridstore-key-value-storage/ +Binary Quantization is by far the quantization method that provides the most significant processing **speed gains** compared to Scalar and Product Quantizations. This is because the binary representation allows the system to use highly optimized CPU instructions, such as [XOR](https://en.wikipedia.org/wiki/XOR_gate#:~:text=XOR%20represents%20the%20inequality%20function,the%20other%20but%20not%20both%22) and [Popcount](https://en.wikipedia.org/wiki/Hamming_weight), for fast distance computations. -2025-02-05T09:42:23-05:00 +It can speed up search operations by **up to 40x**, depending on the dataset and hardware. -... +Not all models are equally compatible with Binary Quantization, and in the comparison above, we are only using models that are compatible. Some models may experience a greater loss in accuracy when quantized. We recommend using Binary Quantization with models that have **at least 1024 dimensions** to minimize accuracy loss. - +The models that have shown the best compatibility with this method include: - +- **OpenAI text-embedding-ada-002** (1536 dimensions) +- **Cohere AI embed-english-v2.0** (4096 dimensions) -https://qdrant.tech/articles/agentic-rag/ +These models demonstrate minimal accuracy loss while still benefiting from substantial speed and memory gains. -2024-12-20T13:10:51+01:00 +Even though Binary Quantization is incredibly fast and memory-efficient, the trade-offs are in **precision** and **model compatibility**, so you may need to ensure search quality using techniques like oversampling and rescoring. -... +If you're interested in exploring Binary Quantization in more detail—including implementation examples, benchmark results, and usage recommendations—check out our dedicated article on [Binary Quantization - Vector Search, 40x Faster](https://qdrant.tech/articles/binary-quantization/). - +## 3. What is Product Quantization? - +![](/articles_data/what-is-vector-quantization/astronaut-centroids.jpg) -https://qdrant.tech/articles/hybrid-search/ +[Product Quantization](https://qdrant.tech/documentation/guides/quantization/#product-quantization) is a method used to compress high-dimensional vectors by representing them with a smaller set of representative points. -2025-01-03T10:53:26+01:00 +The process begins by splitting the original high-dimensional vectors into smaller **sub-vectors.** Each sub-vector represents a segment of the original vector, capturing different characteristics of the data. -... +Creation of the Sub-vector - +For each sub-vector, a separate **codebook** is created, representing regions in the data space where common patterns occur. - +The codebook in Qdrant is trained automatically during the indexing process. As vectors are added to the collection, Qdrant uses your specified quantization settings in the `quantization_config` to build the codebook and quantize the vectors. 
Here’s how you can set it up: -https://qdrant.tech/articles/what-is-rag-in-ai/ +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1024, + "distance": "Cosine" + }, + "quantization_config": { + "product": { + "compression": "x32", + "always_ram": true + } + } +} +``` -2024-12-20T13:10:51+01:00 -... +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1024, distance=models.Distance.COSINE), + quantization_config=models.ProductQuantization( + product=models.ProductQuantizationConfig( + compression=models.CompressionRatio.X32, + always_ram=True, + ), + ), +) +``` - +Each region in the codebook is defined by a **centroid**, which serves as a representative point summarizing the characteristics of that region. Instead of treating every single data point as equally important, we can group similar sub-vectors together and represent them with a single centroid that captures the general characteristics of that group. - +The centroids used in Product Quantization are determined using the **[K-means clustering algorithm](https://en.wikipedia.org/wiki/K-means_clustering)**. -https://qdrant.tech/articles/bm42/ +Codebook and Centroids example -2025-04-10T12:02:16+02:00 +Qdrant always selects **K = 256** as the number of centroids in its implementation, based on the fact that 256 is the maximum number of unique values that can be represented by a single byte. -... +This makes the compression process efficient because each centroid index can be stored in a single byte. - +The original high-dimensional vectors are quantized by mapping each sub-vector to the nearest centroid in its respective codebook. - +Vectors being mapped to their corresponding centroids example -https://qdrant.tech/articles/qdrant-1.8.x/ +The compressed vector stores the index of the closest centroid for each sub-vector. -2024-07-07T18:34:56-07:00 +Here’s how a 1024-dimensional vector, originally taking up 4096 bytes, is reduced to just 128 bytes by representing it as 128 indexes, each pointing to the centroid of a sub-vector: -... +Product Quantization example - +After setting up quantization and adding your vectors, you can perform searches as usual. Qdrant will automatically use the quantized vectors, optimizing both speed and memory usage. Optionally, you can enable rescoring for better accuracy. - -https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/ +```http +POST /collections/{collection_name}/points/search +{ + "query": [0.22, -0.01, -0.98, 0.37], + "params": { + "quantization": { + "rescore": true + } + }, + "limit": 10 +} +``` -2025-05-15T19:33:44+05:30 +```python +client.query_points( + collection_name="my_collection", + query_vector=[0.22, -0.01, -0.98, 0.37], # Your query vector + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + rescore=True # Enables rescoring with original vectors + ) + ), + limit=10 # Return the top 10 results +) +``` +Product Quantization can significantly reduce memory usage, potentially offering up to **64x** compression in certain configurations. However, it's important to note that this level of compression can lead to a noticeable drop in quality. -... +If your application requires high precision or real-time performance, Product Quantization may not be the best choice. However, if **memory savings** are critical and some accuracy loss is acceptable, it could still be an ideal solution. 
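To make the codebook idea concrete, here is a toy sketch of the same mechanics using scikit-learn's `KMeans`. It is for intuition only: Qdrant trains its codebooks internally during indexing, and the sizes here are scaled down so the example runs quickly.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
train = rng.normal(size=(2000, 64)).astype(np.float32)  # toy training vectors

n_subvectors, k = 8, 256               # 8 sub-vectors of 8 dims, 256 centroids each
sub_dim = train.shape[1] // n_subvectors

# One codebook (set of centroids) per sub-vector position
codebooks = [
    KMeans(n_clusters=k, n_init=4, random_state=0).fit(
        train[:, i * sub_dim : (i + 1) * sub_dim]
    )
    for i in range(n_subvectors)
]

# Encode a vector: replace each sub-vector with the index of its nearest centroid
vector = rng.normal(size=(1, 64)).astype(np.float32)
codes = np.array(
    [
        codebooks[i].predict(vector[:, i * sub_dim : (i + 1) * sub_dim])[0]
        for i in range(n_subvectors)
    ],
    dtype=np.uint8,
)

print(vector.nbytes, codes.nbytes)  # 256 bytes of float32 -> 8 bytes of centroid indexes
```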
- +Here’s a comparison of speed, accuracy, and compression for all three methods, adapted from [Qdrant's documentation](https://qdrant.tech/documentation/guides/quantization/#how-to-choose-the-right-quantization-method): - +| Quantization method | Accuracy | Speed | Compression | +|---------------------|----------|------------|-------------| +| Scalar | 0.99 | up to x2 | 4 | +| Product | 0.7 | 0.5 | up to 64 | +| Binary | 0.95* | up to x40 | 32 | -https://qdrant.tech/articles/rag-is-dead/ +\* - for compatible models -2024-12-20T13:10:51+01:00 +For a more in-depth understanding of the benchmarks you can expect, check out our dedicated article on [Product Quantization in Vector Search](https://qdrant.tech/articles/product-quantization/). -... +## Rescoring, Oversampling, and Reranking - +When we use quantization methods like Scalar, Binary, or Product Quantization, we're compressing our vectors to save memory and improve performance. However, this compression removes some detail from the original vectors. - +This can slightly reduce the accuracy of our similarity searches because the quantized vectors are approximations of the original data. To mitigate this loss of accuracy, you can use **oversampling** and **rescoring**, which help improve the accuracy of the final search results. -https://qdrant.tech/articles/binary-quantization-openai/ +The original vectors are never deleted during this process, and you can easily switch between quantization methods or parameters by updating the collection configuration at any time. -2024-12-20T13:10:51+01:00 +Here’s how the process works, step by step: -... +### 1. Initial Quantized Search - +When you perform a search, Qdrant retrieves the top candidates using the quantized vectors based on their similarity to the query vector, as determined by the quantized data. This step is fast because we're using the quantized vectors. - +ANN Search with Quantization -https://qdrant.tech/articles/multitenancy/ +### 2. Oversampling -2024-12-20T13:10:51+01:00 +Oversampling is a technique that helps compensate for any precision lost due to quantization. Since quantization simplifies vectors, some relevant matches could be missed in the initial search. To avoid this, you can **retrieve more candidates**, increasing the chances that the most relevant vectors make it into the final results. -... +You can control the number of extra candidates by setting an `oversampling` parameter. For example, if your desired number of results (`limit`) is 4 and you set an `oversampling` factor of 2, Qdrant will retrieve 8 candidates (4 × 2). - +ANN Search with Quantization and Oversampling - +You can adjust the oversampling factor to control how many extra vectors Qdrant includes in the initial pool. More candidates mean a better chance of obtaining high-quality top-K results, especially after rescoring with the original vectors. -https://qdrant.tech/articles/data-privacy/ +### 3. Rescoring with Original Vectors -2024-12-20T13:10:51+01:00 +After oversampling to gather more potential matches, each candidate is re-evaluated based on additional criteria to ensure higher accuracy and relevance to the query. -... +The rescoring process **maps** the quantized vectors to their corresponding original vectors, allowing you to consider factors like context, metadata, or additional relevance that wasn’t included in the initial search, leading to more accurate results. 
- +![Rescoring with Original Vectors](/articles_data/what-is-vector-quantization/rescoring.png) - +During rescoring, one of the lower-ranked candidates from oversampling might turn out to be a better match than some of the original top-K candidates. -https://qdrant.tech/articles/discovery-search/ +Even though rescoring uses the original, larger vectors, the process remains much faster because only a very small number of vectors are read. The initial quantized search already identifies the specific vectors to read, rescore, and rerank. -2024-12-20T13:10:51+01:00 +### 4. Reranking -... +With the new similarity scores from rescoring, **reranking** is where the final top-K candidates are determined based on the updated similarity scores. - +For example, in our case with a limit of 4, a candidate that ranked 6th in the initial quantized search might improve its score after rescoring because the original vectors capture more context or metadata. As a result, this candidate could move into the final top 4 after reranking, replacing a less relevant option from the initial search. - +Reranking with Original Vectors -https://qdrant.tech/articles/what-are-embeddings/ +Here's how you can set it up: -2024-12-20T13:10:51+01:00 +```http +POST /collections/{collection_name}/points/search -... - +{ + "query": [0.22, -0.01, -0.98, 0.37], + "params": { + "quantization": { + "rescore": true, + "oversampling": 2 + } + }, + "limit": 4 +} +``` - +```python +client.query_points( + collection_name="my_collection", + query_vector=[0.22, -0.01, -0.98, 0.37], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + rescore=True, # Enables rescoring with original vectors + oversampling=2 # Retrieves extra candidates for rescoring + ) + ), + limit=4 # Desired number of final results +) +``` -https://qdrant.tech/articles/sparse-vectors/ +You can adjust the `oversampling` factor to find the right balance between search speed and result accuracy. -2025-03-04T22:08:36+01:00 +If quantization is impacting performance in an application that requires high accuracy, combining oversampling with rescoring is a great choice. However, if you need faster searches and can tolerate some loss in accuracy, you might choose to use oversampling without rescoring, or adjust the oversampling factor to a lower value. -... +## Distributing Resources Between Disk & Memory - +Qdrant stores both the quantized and original vectors. When you enable quantization, both the original and quantized vectors are stored in RAM by default. You can move the original vectors to disk to significantly reduce RAM usage and lower system costs. Simply enabling quantization is not enough—you need to explicitly move the original vectors to disk by setting `on_disk=True`. - +Here’s an example configuration: -https://qdrant.tech/articles/qdrant-1.7.x/ +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1536, + "distance": "Cosine", + "on_disk": true # Move original vectors to disk + }, + "quantization_config": { + "binary": { + "always_ram": true # Store only quantized vectors in RAM + } + } +} +``` -2024-10-05T03:39:41+05:30 +```python +client.update_collection( + collection_name="my_collection", + vectors_config=models.VectorParams( + size=1536, + distance=models.Distance.COSINE, + on_disk=True # Move original vectors to disk + ), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True # Store only quantized vectors in RAM + ) + ) +) +``` -... 
+Without explicitly setting `on_disk=True`, you won't see any RAM savings, even with quantization enabled. So, make sure to configure both storage and quantization options based on your memory and performance needs. If your storage has high disk latency, consider disabling rescoring to maintain speed. - +### Speeding Up Rescoring with io_uring - +When dealing with large collections of quantized vectors, frequent disk reads are required to retrieve both original and compressed data for rescoring operations. While `mmap` helps with efficient I/O by reducing user-to-kernel transitions, rescoring can still be slowed down when working with large datasets on disk due to the need for frequent disk reads. -https://qdrant.tech/articles/new-recommendation-api/ +On Linux-based systems, `io_uring` allows multiple disk operations to be processed in parallel, significantly reducing I/O overhead. This optimization is particularly effective during rescoring, where multiple vectors need to be re-evaluated after the initial search. With io_uring, Qdrant can retrieve and rescore vectors from disk in the most efficient way, improving overall search performance. -2024-03-07T20:31:05+01:00 +When you perform vector quantization and store data on disk, Qdrant often needs to access multiple vectors in parallel. Without io_uring, this process can be slowed down due to the system’s limitations in handling many disk accesses. -... +To enable `io_uring` in Qdrant, add the following to your storage configuration: - +```yaml +storage: + async_scorer: true # Enable io_uring for async storage +``` - +Without this configuration, Qdrant will default to using `mmap` for disk I/O operations. -https://qdrant.tech/articles/dedicated-service/ +For more information and benchmarks comparing io_uring with traditional I/O approaches like mmap, check out [Qdrant's io_uring implementation article.](https://qdrant.tech/articles/io_uring/) -2024-12-20T13:10:51+01:00 +## Performance of Quantized vs. Non-Quantized Data -... +Qdrant uses the quantized vectors by default if they are available. If you want to evaluate how quantization affects your search results, you can temporarily disable it to compare results from quantized and non-quantized searches. To do this, set `ignore: true` in the query: - +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.22, -0.01, -0.98, 0.37], + "params": { + "quantization": { + "ignore": true, + } + }, + "limit": 4 +} +``` - +```python +client.query_points( + collection_name="{collection_name}", + query=[0.22, -0.01, -0.98, 0.37], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + ignore=True + ) + ), +) +``` +### Switching Between Quantization Methods -https://qdrant.tech/articles/fastembed/ +Not sure if you’ve chosen the right quantization method? In Qdrant, you have the flexibility to remove quantization and rely solely on the original vectors, adjust the quantization type, or change compression parameters at any time without affecting your original vectors. -2024-12-20T13:10:51+01:00 +To switch to binary quantization and adjust the compression rate, for example, you can update the collection’s quantization configuration using the `update_collection` method: -... 
+```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": { + "binary": { + "always_ram": true, + "compression_rate": 0.8 # Set the new compression rate + } + } +} +``` - - +```python +client.update_collection( + collection_name="my_collection", + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, # Store only quantized vectors in RAM + compression_rate=0.8 # Set the new compression rate + ) + ), +) +``` -https://qdrant.tech/articles/geo-polygon-filter-gsoc/ +If you decide to **turn off quantization** and use only the original vectors, you can remove the quantization settings entirely with `quantization_config=None`: -2024-12-20T13:10:51+01:00 +```http +PUT /collections/my_collection +{ + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": null # Remove quantization and use original vectors only +} +``` -... +```python +client.update_collection( + collection_name="my_collection", + quantization_config=None # Remove quantization and rely on original vectors only +) +``` +## Wrapping Up - +![](/articles_data/what-is-vector-quantization/astronaut-running.jpg) - +Quantization methods like Scalar, Product, and Binary Quantization offer powerful ways to optimize memory usage and improve search performance when dealing with large datasets of high-dimensional vectors. Each method comes with its own trade-offs between memory savings, computational speed, and accuracy. -https://qdrant.tech/articles/binary-quantization/ +Here are some final thoughts to help you choose the right quantization method for your needs: -2025-04-10T09:21:38-03:00 +| **Quantization Method** | **Key Features** | **When to Use** | +|--------------------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------------| +| **Binary Quantization** | ‱ **Fastest method and most memory-efficient**
• Up to **40x** faster search and **32x** reduced memory footprint | • Use with tested models like OpenAI's `text-embedding-ada-002` and Cohere's `embed-english-v2.0`
• When speed and memory efficiency are critical | +| **Scalar Quantization** | • **Minimal loss of accuracy**
• Up to **4x** reduced memory footprint | • Safe default choice for most applications.
• Offers a good balance between accuracy, speed, and compression. | +| **Product Quantization** | • **Highest compression ratio**
• Up to **64x** reduced memory footprint | • When minimizing memory usage is the top priority
• Acceptable if some loss of accuracy and slower indexing is tolerable | -... +### Learn More -
+If you want to learn more about improving accuracy, memory efficiency, and speed when using quantization in Qdrant, we have a dedicated [Quantization tips](https://qdrant.tech/documentation/guides/quantization/#quantization-tips) section in our docs with practical tips you can use to enhance your results. - +Learn more about optimizing real-time precision with oversampling in Binary Quantization by watching this interview with Qdrant's CTO, Andrey Vasnetsov: -https://qdrant.tech/articles/food-discovery-demo/ +
+ +
-2024-12-20T13:10:51+01:00 +Stay up-to-date on the latest in [vector search](/advanced-search/) and quantization, share your projects, ask questions, [join our vector search community](https://discord.com/invite/qdrant)! -... +<|page-20-lllmstxt|> +## What’s in This Guide? -
+[**Resource Management Strategies:**](#storage-disk-vs-ram) If you are trying to scale your app on a budget - this is the guide for you. We will show you how to avoid wasting compute resources and get the maximum return on your investment. - +[**Performance Improvement Tricks:**](#configure-indexing-for-faster-searches) We’ll dive into advanced techniques like indexing, compression, and partitioning. Our tips will help you get better results at scale, while reducing total resource expenditure. -https://qdrant.tech/articles/web-ui-gsoc/ +[**Query Optimization Methods:**](#query-optimization) Improving your vector database setup isn’t just about saving costs. We’ll show you how to build search systems that deliver consistently high precision while staying adaptable. -2024-12-20T13:10:51+01:00 +_________________________________________________________________________ -... +#### Remember: Optimization is a Balancing Act - +In this guide, we will show you how to use Qdrant’s features to meet your performance needs. +However - there are resource tradeoffs and you can't have it all. +It is up to you to choose the optimization strategy that best fits your goals. - +optimization -https://qdrant.tech/articles/dimension-reduction-qsoc/ +Let's take a look at some common goals and optimization strategies: -2024-12-20T13:10:51+01:00 +| Intended Result | Optimization Strategy | +|--------------------------------|------------------------------| +| [**High Search Precision + Low Memory Expenditure**](/documentation/guides/optimize/#1-high-speed-search-with-low-memory-usage) | [**On-Disk Indexing**](/documentation/guides/optimize/#1-high-speed-search-with-low-memory-usage) | +| [**Low Memory Expenditure + Fast Search Speed**](/documentation/guides/quantization/) | [**Quantization**](/documentation/guides/quantization/) | +| [**High Search Precision + Fast Search Speed**](/documentation/guides/optimize/#3-high-precision-with-high-speed-search) | [**RAM Storage + Quantization**](/documentation/guides/optimize/#3-high-precision-with-high-speed-search) | +| [**Balance Latency vs Throughput**](/documentation/guides/optimize/#balancing-latency-and-throughput) | [**Segment Configuration**](/documentation/guides/optimize/#balancing-latency-and-throughput) | -... +After this article, check out the code samples in our docs on [**Qdrant’s Optimization Methods**](/documentation/guides/optimize/). - +--- - +## Configure Indexing for Faster Searches -https://qdrant.tech/articles/search-as-you-type/ +![indexing](/articles_data/vector-search-resource-optimization/index.png) -2024-12-20T13:10:51+01:00 +A vector index is the central location where Qdrant calculates vector similarity. It is the backbone of your search process, retrieving relevant results from vast amounts of data. -... +Qdrant uses the [**HNSW (Hierarchical Navigable Small World Graph) algorithm**](/documentation/concepts/indexing/#vector-index) as its dense vector index, which is both powerful and scalable. - +**Figure 2:** A sample HNSW vector index with three layers. Follow the blue arrow on the top layer to see how a query travels throughout the database index. The closest result is on the bottom level, nearest to the gray query point. - +hnsw -https://qdrant.tech/articles/vector-similarity-beyond-search/ +#### Vector Index Optimization Parameters -2024-12-20T13:10:51+01:00 +Working with massive datasets that contain billions of vectors demands significant resources—and those resources come with a price. 
While Qdrant provides reasonable defaults, tailoring them to your specific use case can unlock optimal performance. Here’s what you need to know. -... +The following parameters give you the flexibility to fine-tune Qdrant’s performance for your specific workload. You can modify them directly in Qdrant's [**configuration**](https://qdrant.tech/documentation/guides/configuration/) files or at the collection and named vector levels for more granular control. - +**Figure 3:** A description of three key HNSW parameters. - +hnsw-parameters -https://qdrant.tech/articles/serverless/ +#### 1. The `m` parameter determines edges per node -2025-02-18T21:01:07+05:30 +This controls the number of edges in the graph. A higher value enhances search accuracy but demands more memory and build time. Fine-tune this to balance memory usage and precision. + +#### 2. The `ef_construct` parameter controls the index build range -... +This parameter sets how many neighbors are considered during index construction. A larger value improves the accuracy of the index but increases the build time. Use this to customize your indexing speed versus quality. - +You need to set both the `m` and `ef parameters` as you create the collection: - +```python +client.update_collection( + collection_name="{collection_name}", + vectors_config={ + "my_vector": models.VectorParamsDiff( + hnsw_config=models.HnswConfigDiff( + m=32, + ef_construct=123, + ), + ), + } +) +``` -https://qdrant.tech/documentation/database-tutorials/bulk-upload/ +#### 3. The `ef` parameter updates vector search range -2025-03-25T21:43:45-03:00 +This determines how many neighbors are evaluated during a search query. You can adjust this to balance query speed and accuracy. + +The `ef` parameter is configured during the search process: -... +```python +client.query_points( + collection_name="{collection_name}", + query=[...] + search_params=models.SearchParams(hnsw_ef=128, exact=False), +) +``` +--- +These are just the basics of HNSW. Learn More about [**Indexing**](/documentation/concepts/indexing/). - +--- - +## Data Compression Techniques +![compression](/articles_data/vector-search-resource-optimization/compress.png) -https://qdrant.tech/benchmarks/benchmarks-intro/ +Efficient data compression is a cornerstone of resource optimization in vector databases. By reducing memory usage, you can achieve faster query performance without sacrificing too much accuracy. -2024-06-27T12:40:08+02:00 +One powerful technique is [**quantization**](/documentation/guides/quantization/), which transforms high-dimensional vectors into compact representations while preserving relative similarity. Let’s explore the quantization options available in Qdrant. -... +#### Scalar Quantization - +Scalar quantization strikes an excellent balance between compression and performance, making it the go-to choice for most use cases. - +This method minimizes the number of bits used to represent each vector component. For instance, Qdrant compresses 32-bit floating-point values (**float32**) into 8-bit unsigned integers (**uint8**), slashing memory usage by an impressive 75%. -https://qdrant.tech/documentation/faq/qdrant-fundamentals/ +**Figure 4:** The top example shows a float32 vector with a size of 40 bytes. Converting it to int8 format reduces its size by a factor of four, while maintaining approximate similarity relationships between vectors. The loss in precision compared to the original representation is typically negligible for most practical applications. 
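As a rough illustration of that mapping, here is a simplified sketch using a signed 8-bit range. Qdrant derives the actual bounds from your data (and the `quantile` setting), so treat this purely as intuition:

```python
import numpy as np

rng = np.random.default_rng(0)
vector = rng.uniform(-1.0, 1.0, size=10).astype(np.float32)  # ten float32 components = 40 bytes

# Simplified symmetric mapping of the observed range onto 8-bit integers
scale = 127.0 / np.abs(vector).max()
quantized = np.round(vector * scale).astype(np.int8)
restored = quantized / scale  # approximate reconstruction

print(vector.nbytes, quantized.nbytes)  # 40 bytes -> 10 bytes (4x smaller)
print(np.abs(vector - restored).max())  # reconstruction error stays small
```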
-2025-05-02T10:37:48+02:00 +scalar-quantization -... +#### Benefits of Scalar Quantization: - +| Benefit | Description | +|----------------------------------|------------------------------------------------------------------------------------------------------------------| +| **Memory usage will drop** | Compression cuts memory usage by a factor of 4. Qdrant compresses 32-bit floating-point values (float32) into 8-bit unsigned integers (uint8). | +| **Accuracy loss is minimal** | Converting from float32 to uint8 introduces a small loss in precision. Typical error rates remain below 1%, making this method highly efficient. | +| **Best for specific use cases** | To be used with high-dimensional vectors where minor accuracy losses are acceptable. | - +#### Set it up as you create the collection: -https://qdrant.tech/documentation/search-precision/reranking-semantic-search/ +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) +``` +When working with Qdrant, you can fine-tune the quantization configuration to optimize precision, memory usage, and performance. Here’s what the key configuration options include: -2025-05-21T15:27:35+08:00 +| Configuration Option | Description | +|----------------------|------------------------------------------------------------------------------------------------------------------| +| `type` | Specifies the quantized vector type (currently supports only int8). | +| `quantile` | Sets bounds for quantization, excluding outliers. For example, 0.99 excludes the top 1% of extreme values to maintain better accuracy. | +| `always_ram ` | Keeps quantized vectors in RAM to speed up searches. | -... +Adjust these settings to strike the right balance between precision and efficiency for your specific workload. - +--- +Learn More about [**Scalar Quantization**](/documentation/guides/quantization/) - +--- -https://qdrant.tech/documentation/cloud-rbac/role-management/ +#### Binary Quantization -2025-05-02T16:53:21+02:00 +**Binary quantization** takes scalar quantization to the next level by compressing each vector component into just **a single bit**. This method achieves unparalleled memory efficiency and query speed, reducing memory usage by a factor of 32 and enabling searches up to 40x faster. -... +#### **Benefits of Binary Quantization:** - +Binary quantization is ideal for large-scale datasets and compatible embedding models, where compression and speed are paramount. - +**Figure 5:** This method causes maximum compression. It reduces memory usage by 32x and speeds up searches by up to 40x. -https://qdrant.tech/documentation/beginner-tutorials/search-beginners/ +binary-quantization -2025-04-25T19:32:48+03:00 +| Benefit | Description | +|----------------------------------|------------------------------------------------------------------------------------------------------------------| +| **Efficient similarity calculations** | Emulates Hamming distance through dot product comparisons, making it fast and effective. | +| **Perfect for high-dimensional vectors** | Works well with embedding models like OpenAI’s text-embedding-ada-002 or Cohere’s embed-english-v3.0. | +| **Precision management** | Consider rescoring or oversampling to offset precision loss. | -... 
+Here’s how you can enable binary quantization in Qdrant: - +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, + ), + ), +) +``` +> By default, quantized vectors load like original vectors unless you set `always_ram` to `True` for instant access and faster queries. - +--- +Learn more about [**Binary Quantization**](/documentation/guides/quantization/) -https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/ +--- -2025-03-10T22:19:22+01:00 +## Scaling the Database -... +![sharding](/articles_data/vector-search-resource-optimization/shards.png) - +Efficiently managing large datasets in distributed systems like Qdrant requires smart strategies for data isolation. **Multitenancy** and **Sharding** are essential tools to help you handle high volumes of user-specific data while maintaining performance and scalability. - +#### Multitenancy -https://qdrant.tech/documentation/private-cloud/private-cloud-setup/ +**Multitenancy** is a software architecture where multiple independent users (or tenants) share the same resources or environment. In Qdrant, a single collection with logical partitioning is often the most efficient setup for multitenant use cases. -2025-06-03T09:48:32+02:00 +**Figure 5:** Each individual vector is assigned a specific payload that denotes which tenant it belongs to. This is how a large number of different tenants can share a single Qdrant collection. -... +multitenancy - +**Why Choose Multitenancy?** - +- **Logical Isolation**: Ensures each tenant’s data remains separate while residing in the same collection. +- **Minimized Overhead**: Reduces resource consumption compared to maintaining separate collections for each user. +- **Scalability**: Handles high user volumes without compromising performance. -https://qdrant.tech/documentation/overview/vector-search/ +Here’s how you can implement multitenancy efficiently in Qdrant: -2024-10-05T03:39:41+05:30 +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="group_id", + field_schema=models.KeywordIndexParams( + type="keyword", + is_tenant=True, + ), +) +``` -... +Creating a keyword payload index, with the `is_tenant` parameter set to `True`, modifies the way the vectors will be logically stored. Storage structure will be organized to co-locate vectors of the same tenant together. - +Now, each point stored in Qdrant should have the `group_id` payload attribute set: - +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={"group_id": "user_1"}, + vector=[0.9, 0.1, 0.1], + ), -https://qdrant.tech/articles/qdrant-1.3.x/ + models.PointStruct( + id=2, + payload={"group_id": "user_2"}, + vector=[0.5, 0.9, 0.4], + ) + ] +) +``` -2024-03-07T20:31:05+01:00 +--- -... +To ensure proper data isolation in a multitenant environment, you can assign a unique identifier, such as a **group_id**, to each vector. This approach ensures that each user's data remains segregated, allowing users to access only their own data. You can further enhance this setup by applying filters during queries to restrict access to the relevant data. 
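For example, a per-tenant search can attach that filter directly. This sketch assumes the `group_id` values from the upsert above and a locally running instance:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Search only within user_1's data by filtering on the group_id payload field
client.query_points(
    collection_name="{collection_name}",
    query=[0.9, 0.1, 0.1],
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="group_id",
                match=models.MatchValue(value="user_1"),
            )
        ]
    ),
    limit=10,
)
```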
- +--- +Learn More about [**Multitenancy**](/documentation/guides/multiple-partitions/) - +--- -https://qdrant.tech/benchmarks/single-node-speed-benchmark/ +#### Sharding -2024-06-17T22:01:23+02:00 +Sharding is a critical strategy in Qdrant for splitting collections into smaller units, called **shards**, to efficiently distribute data across multiple nodes. It’s a powerful tool for improving scalability and maintaining performance in large-scale systems. -... +#### User-Defined Sharding: - +**User-Defined Sharding** allows you to take control of data placement by specifying a shard key. This feature is particularly useful in multi-tenant setups, as it enables the isolation of each tenant’s data within separate shards, ensuring better organization and enhanced data security. - +**Figure 6:** Users can both upsert and query shards that are relevant to them, all within the same collection. Regional sharding can help avoid cross-continental traffic. -https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022/ +user-defined-sharding -2024-01-11T19:41:06+05:30 +**Example:** -... +```python +client.create_collection( + collection_name="my_custom_sharded_collection", + shard_number=1, + sharding_method=models.ShardingMethod.CUSTOM +) +client.create_shard_key("my_custom_sharded_collection", "tenant_id") +``` - +--- - +When implementing user-defined sharding in Qdrant, two key parameters are critical to achieving efficient data distribution: -https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/ +1. **Shard Key**: + + The shard key determines how data points are distributed across shards. For example, using a key like `tenant_id` allows you to control how Qdrant partitions the data. Each data point added to the collection will be assigned to a shard based on the value of this key, ensuring logical isolation of data. + +2. **Shard Number**: + + This defines the total number of physical shards for each shard key, influencing resource allocation and query performance. + +Here’s how you can add a data point to a collection with user-defined sharding: -2025-05-27T18:00:51+02:00 +```python +client.upsert( + collection_name="my_custom_sharded_collection", + points=[ + models.PointStruct( + id=1111, + vector=[0.1, 0.2, 0.3] + ) + ], + shard_key_selector="tenant_1" +) +``` -... +--- - +This code assigns the point to a specific shard based on the `tenant_1` shard key, ensuring proper data placement. - +Here’s how to choose the shard_number: -https://qdrant.tech/documentation/beginner-tutorials/neural-search/ +| Recommendation | Description | +|---------------------------------|------------------------------------------------------------------------------------------------------------------| +| **Match Shards to Nodes** | The number of shards should align with the number of nodes in your cluster to balance resource utilization and query performance. | +| **Plan for Scalability** | Start with at least **2 shards per node** to allow room for future growth. | +| **Future-Proofing** | Starting with around **12 shards** is a good rule of thumb. This setup allows your system to scale seamlessly from 1 to 12 nodes without requiring re-sharding. | -2024-11-18T15:26:15-08:00 +Learn more about [**Sharding in Distributed Deployment**](/documentation/guides/distributed_deployment/) -... +--- - +## Query Optimization +![qdrant](/articles_data/vector-search-resource-optimization/query.png) +Improving vector database performance is critical when dealing with large datasets and complex queries. 
By leveraging techniques like **filtering**, **batch processing**, **reranking**, **rescoring**, and **oversampling**, so you can ensure fast response times and maintain efficiency even at scale. - +#### Filtering -https://qdrant.tech/documentation/private-cloud/configuration/ +Filtering allows you to select only the required fields in your query results. By limiting the output size, you can significantly reduce response time and improve performance. -2025-03-21T16:37:49+01:00 +The filterable vector index is Qdrant's solves pre and post-filtering problems by adding specialized links to the search graph. It aims to maintain the speed advantages of vector search while allowing for precise filtering, addressing the inefficiencies that can occur when applying filters after the vector search. -... +**Example:** - +```python +results = client.search( + collection_name="my_collection", + query_vector=[0.1, 0.2, 0.3], + query_filter=models.Filter(must=[ + models.FieldCondition( + key="category", + match=models.MatchValue(value="my-category-name"), + ) + ]), + limit=10, +) +``` +**Figure 7:** The filterable vector index adds specialized links to the search graph to speed up traversal. - +![filterable-vector-index](/articles_data/vector-search-resource-optimization/filterable-vector-index.png) -https://qdrant.tech/documentation/database-tutorials/create-snapshot/ +[**Filterable vector index**](/documentation/concepts/indexing/): This technique builds additional links **(orange)** between leftover data points. The filtered points which stay behind are now traversible once again. Qdrant uses special category-based methods to connect these data points. -2025-06-12T09:02:54+03:00 +--- +Read more about [**Filtering Docs**](/documentation/concepts/filtering/) and check out the [**Complete Filtering Guide**](/articles/vector-search-filtering/). -... +--- +#### Batch Processing - +Batch processing consolidates multiple operations into a single execution cycle, reducing request overhead and enhancing throughput. It’s an effective strategy for both data insertion and query execution. - +batch-processing -https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/ +**Batch Insertions**: Instead of inserting vectors individually, group them into medium-sized batches to minimize the number of database requests and the overhead of frequent writes. -2025-06-16T17:51:31+02:00 +**Example:** -... +```python +vectors = [ + [.1, .0, .0, .0], + [.0, .1, .0, .0], + [.0, .0, .1, .0], + [.0, .0, .0, .1], + 
 +] +client.upload_collection( + collection_name="test_collection", + vectors=vectors, +) +``` - +This reduces write operations and ensures faster data ingestion. - +**Batch Queries**: Similarly, you can batch multiple queries together rather than executing them one by one. This reduces the number of round trips to the database, optimizing performance and reducing latency. -https://qdrant.tech/documentation/data-ingestion-beginners/ +**Example:** -2025-05-15T20:16:43+05:30 +```python +results = client.search_batch( + collection_name="test_collection", + requests=[ + SearchRequest( + vector=[0., 0., 2., 0.], + limit=1, + ), + SearchRequest( + vector=[0., 0., 0., 0.01], + with_vector=True, + limit=2, + ) + ] +) +``` -... +Batch queries are particularly useful when processing a large number of similar queries or when handling multiple user requests simultaneously. - +___ - +#### Hybrid Search -https://qdrant.tech/documentation/faq/database-optimization/ +Hybrid search combines **keyword filtering** with **vector similarity search**, enabling faster and more precise results. Keywords help narrow down the dataset quickly, while vector similarity ensures semantic accuracy. This search method combines [**dense and sparse vectors**](/documentation/concepts/vectors/). -2024-10-05T03:39:41+05:30 +Hybrid search in Qdrant uses both fusion and reranking. The former is about combining the results from different search methods, based solely on the scores returned by each method. That usually involves some normalization, as the scores returned by different methods might be in different ranges. -... +**Figure 8**: Hybrid Search Architecture - +hybrid-search - +After that, there is a formula that takes the relevancy measures and calculates the final score that we use later on to reorder the documents. Qdrant has built-in support for the Reciprocal Rank Fusion method, which is the de facto standard in the field. -https://qdrant.tech/documentation/ +--- +Learn more about [**Hybrid Search**](/articles/hybrid-search/) and read out [**Hybrid Queries docs**](/documentation/concepts/hybrid-queries/). -2024-12-20T13:10:51+01:00 +--- -... +#### Oversampling - +Oversampling is a technique that helps compensate for any precision lost due to quantization. Since quantization simplifies vectors, some relevant matches could be missed in the initial search. To avoid this, you can **retrieve more candidates**, increasing the chances that the most relevant vectors make it into the final results. - +You can control the number of extra candidates by setting an `oversampling` parameter. For example, if your desired number of results (`limit`) is 4 and you set an `oversampling` factor of 2, Qdrant will retrieve 8 candidates (4 × 2). -https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/ +You can adjust the oversampling factor to control how many extra vectors Qdrant includes in the initial pool. More candidates mean a better chance of obtaining high-quality top-K results, especially after rescoring with the original vectors. -2025-06-10T11:40:10+03:00 +--- +Learn more about [**Oversampling**](/articles/what-is-vector-quantization/#2-oversampling). -... +--- - +#### Rescoring - +After oversampling to gather more potential matches, each candidate is re-evaluated based on additional criteria to ensure higher accuracy and relevance to the query. 
-https://qdrant.tech/documentation/database-tutorials/large-scale-search/ +The rescoring process maps the quantized vectors to their corresponding original vectors, allowing you to consider factors like context, metadata, or additional relevance that wasn’t included in the initial search, leading to more accurate results. -2025-03-24T14:27:15-03:00 +**Example of Rescoring and Oversampling:**: -... +```python +client.query_points( + collection_name="my_collection", + query_vector=[0.22, -0.01, -0.98, 0.37], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + rescore=True, # Enables rescoring with original vectors + oversampling=2 # Retrieves extra candidates for rescoring + ) + ), + limit=4 # Desired number of final results +) +``` +--- +Learn more about [**Rescoring**](/articles/what-is-vector-quantization/#3-rescoring-with-original-vectors). - +--- - +#### Reranking -https://qdrant.tech/documentation/fastembed/fastembed-quickstart/ +Reranking adjusts the order of search results based on additional criteria, ensuring the most relevant results are prioritized. -2024-08-06T15:42:27-07:00 +This method is about taking the results from different search methods and reordering them based on some additional processing using the content of the documents, not just the scores. This processing may rely on an additional neural model, such as a cross-encoder which would be inefficient enough to be used on the whole dataset. -... +reranking - +These methods are practically applicable only when used on a smaller subset of candidates returned by the faster search methods. Late interaction models, such as ColBERT, are way more efficient in this case, as they can be used to rerank the candidates without the need to access all the documents in the collection. - +**Example:** -https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/ +```python +client.query_points( + "collection-name", + prefetch=prefetch, # Previous results + query=late_vectors, # Colbert converted query + using="colbertv2.0", + with_payload=True, + limit=10, +) +``` +___ +Learn more about [**Reranking**](/documentation/search-precision/reranking-hybrid-search/#rerank). -2025-06-05T14:05:27+03:00 +--- -... +## Storage: Disk vs RAM +![disk](/articles_data/vector-search-resource-optimization/disk.png) - +| Storage | Description | +|--------------|------------------------------------------------------------------------------------------------------------------| +| **RAM** | Crucial for fast access to frequently used data, such as indexed vectors. The amount of RAM required can be estimated based on your dataset size and dimensionality. For example, storing **1 million vectors with 1024 dimensions** would require approximately **5.72 GB of RAM**. | +| **Disk** | Suitable for less frequently accessed data, such as payloads and non-critical information. Disk-backed storage reduces memory demands but can introduce slight latency. | - +#### Which Disk Type? -https://qdrant.tech/documentation/advanced-tutorials/code-search/ +**Local SSDs** are recommended for optimal performance, as they provide the fastest query response times with minimal latency. While network-attached storage is also viable, it typically introduces additional latency that can affect performance, so local SSDs are preferred when possible, particularly for workloads requiring high-speed random access. -2025-05-15T19:33:03+05:30 +#### Memory Management for Vectors and Payload -... 
+As your data scales, effective resource management becomes crucial to keeping costs low while ensuring your application remains reliable and performant. One of the key areas to focus on is **memory management**. - +Understanding how Qdrant handles memory can help you make informed decisions about scaling your vector database. Qdrant supports two main methods for storing vectors: - +#### 1. In-Memory Storage -https://qdrant.tech/documentation/agentic-rag-crewai-zoom/ +- **How it works**: All data is stored in RAM, providing the fastest access times for queries and operations. +- **When to use it**: This setup is ideal for applications where performance is critical, and your RAM capacity can accommodate all data. +- **Advantages**: Maximum speed for queries and updates. +- **Limitations**: RAM usage can become a bottleneck as your dataset grows. -2025-04-09T12:55:16+02:00 +#### 2. Memmap Storage -... +- **How it works**: Instead of loading all data into memory, memmap storage maps data files directly to a virtual address space on disk. The system's page cache handles data access, making it highly efficient. +- **When to use it**: Perfect for storing large collections that exceed your available RAM while still maintaining near in-memory performance when enough RAM is available. +- **Advantages**: Balances performance and memory usage, allowing you to work with datasets larger than your physical RAM. +- **Limitations**: Slightly slower than pure in-memory storage but significantly more scalable. - +To enable memmap vector storage in Qdrant, you can set the **on_disk** parameter to `true` when creating or updating a collection. - +```python +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams( + 
 + on_disk=True + ) +) +``` -https://qdrant.tech/documentation/cloud-rbac/user-management/ +To do the same for payloads: -2025-05-02T18:40:38+02:00 +```python +client.create_collection( + collection_name="{collection_name}", + on_disk_payload= True +) +``` -... +The general guideline for selecting a storage method in Qdrant is to use **InMemory storage** when high performance is a priority, and sufficient RAM is available to accommodate the dataset. This approach ensures the fastest access speeds by keeping data readily accessible in memory. - +However, for larger datasets or scenarios where memory is limited, **Memmap** and **OnDisk storage** are more suitable. These methods significantly reduce memory usage by storing data on disk while leveraging advanced techniques like page caching and indexing to maintain efficient and relatively fast data access. - +## Monitoring the Database -https://qdrant.tech/articles/io\_uring/ +![monitoring](/articles_data/vector-search-resource-optimization/monitor.png) -2024-12-20T13:10:51+01:00 +Continuous monitoring is essential for maintaining system health and identifying potential issues before they escalate. Tools like **Prometheus** and **Grafana** are widely used to achieve this. -... +- **Prometheus**: An open-source monitoring and alerting toolkit, Prometheus collects and stores metrics in a time-series database. It scrapes metrics from predefined endpoints and supports powerful querying and visualization capabilities. +- **Grafana**: Often paired with Prometheus, Grafana provides an intuitive interface for visualizing metrics and creating interactive dashboards. - +Qdrant exposes metrics in the **Prometheus/OpenMetrics** format through the /metrics endpoint. Prometheus can scrape this endpoint to monitor various aspects of the Qdrant system. - +For a local Qdrant instance, the metrics endpoint is typically available at: -https://qdrant.tech/benchmarks/filtered-search-intro/ +```python +http://localhost:6333/metrics +``` -2024-01-11T19:41:06+05:30 +--- -... +Here are some important metrics to monitor: - +| **Metric Name** | | **Meaning** | +| --- | --- | --- | +| collections_total | | Total number of collections | +| collections_vector_total | | Total number of vectors in all collections | +| rest_responses_avg_duration_seconds | | Average response duration in REST API | +| grpc_responses_avg_duration_seconds | | Average response duration in gRPC API | +| rest_responses_fail_total | | Total number of failed responses (REST) | + +Read more about [**Qdrant Open Source Monitoring**](/documentation/guides/monitoring/) and [**Qdrant Cloud Monitoring**](/documentation/cloud/cluster-monitoring/) for managed clusters. +_________________________________________________________________________ + +## Recap: When Should You Optimize? +![solutions](/articles_data/vector-search-resource-optimization/solutions.png) + +| Scenario | Description | +|-----------------------------------|------------------------------------------------------------------------------------------------------------------| +| **When You Scale Up** | As data grows and the request surge, optimizing resource usage ensures your systems stay responsive and cost-efficient, even under heavy loads. | +| **If Facing Budget Constraints** | Strike the perfect balance between performance and cost, cutting unnecessary expenses while maintaining essential capabilities. 
| +| **You Need Better Performance** | If you’re noticing slow query speeds, latency issues, or frequent timeouts, it’s time to fine-tune your resource allocation. | +| **When System Stability is Paramount** | To manage high-traffic environments you will need to prevent crashes or failures caused by resource exhaustion. | - +## Get the Cheatsheet -https://qdrant.tech/documentation/agentic-rag-langgraph/ +Want to download a printer-friendly version of this guide? [**Download it now.**](https://try.qdrant.tech/resource-optimization-guide). -2025-05-15T19:37:07+05:30 +[![downloadable vector search resource optimization guide](/articles_data/vector-search-resource-optimization/downloadable-guide.jpg)](https://try.qdrant.tech/resource-optimization-guide) -... +<|page-21-lllmstxt|> +Imagine you sell computer hardware. To help shoppers easily find products on your website, you need to have a **user-friendly [search engine](https://qdrant.tech)**. - +![vector-search-ecommerce](/articles_data/vector-search-filtering/vector-search-ecommerce.png) - + If you’re selling computers and have extensive data on laptops, desktops, and accessories, your search feature should guide customers to the exact device they want - or at least a **very similar** match. -https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/ +When storing data in Qdrant, each product is a point, consisting of an `id`, a `vector` and `payload`: -2024-11-18T15:26:15-08:00 +```json +{ + "id": 1, + "vector": [0.1, 0.2, 0.3, 0.4], + "payload": { + "price": 899.99, + "category": "laptop" + } +} +``` +The `id` is a unique identifier for the point in your collection. The `vector` is a mathematical representation of similarity to other points in the collection. +Finally, the `payload` holds metadata that directly describes the point. -... +Though we may not be able to decipher the vector, we are able to derive additional information about the item from its metadata, In this specific case, **we are looking at a data point for a laptop that costs $899.99**. - +## What is filtering? - +When searching for the perfect computer, your customers may end up with results that are mathematically similar to the search entry, but not exact. For example, if they are searching for **laptops under $1000**, a simple [vector search](/advanced-search/) without constraints might still show other laptops over $1000. -https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/ +This is why [semantic search](/advanced-search/) alone **may not be enough**. In order to get the exact result, you would need to enforce a payload filter on the `price`. Only then can you be sure that the search results abide by the chosen characteristic. -2024-12-23T12:11:13+01:00 +> This is called **filtering** and it is one of the key features of [vector databases](https://qdrant.tech). -... +Here is how a **filtered vector search** looks behind the scenes. We'll cover its mechanics in the following section. - +```http +POST /collections/online_store/points/search +{ + "vector": [ 0.2, 0.1, 0.9, 0.7 ], + "filter": { + "must": [ + { + "key": "category", + "match": { "value": "laptop" } + }, + { + "key": "price", + "range": { + "gt": null, + "gte": null, + "lt": null, + "lte": 1000 + } + } + ] + }, + "limit": 3, + "with_payload": true, + "with_vector": false +} +``` - +The filtered result will be a combination of the semantic search and the filtering conditions imposed upon the query. 
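If you use the Python client, roughly the same request can be expressed as follows (a sketch that assumes a locally running instance):

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Same filtered search as the HTTP example above: laptops at $1000 or less
results = client.query_points(
    collection_name="online_store",
    query=[0.2, 0.1, 0.9, 0.7],
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="category", match=models.MatchValue(value="laptop")),
            models.FieldCondition(key="price", range=models.Range(lte=1000)),
        ]
    ),
    limit=3,
    with_payload=True,
    with_vectors=False,
)
```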
In the following pages, we will show that **filtering is a key practice in vector search for two reasons:** -https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/ +1. With filtering in Qdrant, you can **dramatically increase search precision**. More on this in the next section.
+2. Filtering helps control resources and **reduce compute use**. More on this in [**Payload Indexing**](#filtering-with-the-payload-index). -2025-04-26T13:30:39+03:00 +## What you will learn in this guide: -... +In [vector search](/advanced-search/), filtering and sorting are more interdependent than they are in traditional databases. While databases like SQL use commands such as `WHERE` and `ORDER BY`, the interplay between these processes in vector search is a bit more complex. -
+Most people use default settings and build vector search apps that aren't properly configured or even setup for precise retrieval. In this guide, we will show you how to **use filtering to get the most out of vector search** with some basic and advanced strategies that are easy to implement. - +#### Remember to run all tutorial code in Qdrant's Dashboard -https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/ +The easiest way to reach that "Hello World" moment is to [**try filtering in a live cluster**](/documentation/quickstart-cloud/). Our interactive tutorial will show you how to create a cluster, add data and try some filtering clauses. -2024-11-18T15:26:15-08:00 +![qdrant-filtering-tutorial](/articles_data/vector-search-filtering/qdrant-filtering-tutorial.png) -... +## Qdrant's approach to filtering - +Qdrant follows a specific method of searching and filtering through dense vectors. - +Let's take a look at this **3-stage diagram**. In this case, we are trying to find the nearest neighbour to the query vector **(green)**. Your search journey starts at the bottom **(orange)**. -https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/ +By default, Qdrant connects all your data points within the [**vector index**](/documentation/concepts/indexing/). After you [**introduce filters**](/documentation/concepts/filtering/), some data points become disconnected. Vector search can't cross the grayed out area and it won't reach the nearest neighbor. +How can we bridge this gap? -2025-06-16T17:51:31+02:00 +**Figure 1:** How Qdrant maintains a filterable vector index. +![filterable-vector-index](/articles_data/vector-search-filtering/filterable-vector-index.png) -... +[**Filterable vector index**](/documentation/concepts/indexing/): This technique builds additional links **(orange)** between leftover data points. The filtered points which stay behind are now traversible once again. Qdrant uses special category-based methods to connect these data points. - +### Qdrant's approach vs traditional filtering methods - +![stepping-lens](/articles_data/vector-search-filtering/stepping-lens.png) -https://qdrant.tech/documentation/cloud-rbac/permission-reference/ +The filterable vector index is Qdrant's solves pre and post-filtering problems by adding specialized links to the search graph. It aims to maintain the speed advantages of vector search while allowing for precise filtering, addressing the inefficiencies that can occur when applying filters after the vector search. -2025-06-13T08:39:21+02:00 +#### Pre-filtering -... +In pre-filtering, a search engine first narrows down the dataset based on chosen metadata values, and then searches within that filtered subset. This reduces unnecessary computation over a dataset that is potentially much larger. - +The choice between pre-filtering and using the filterable HNSW index depends on filter cardinality. When metadata cardinality is too low, the filter becomes restrictive and it can disrupt the connections within the graph. This leads to fragmented search paths (as in **Figure 1**). When the semantic search process begins, it won’t be able to travel to those locations. - +However, Qdrant still benefits from pre-filtering **under certain conditions**. In cases of low cardinality, Qdrant's query planner stops using HNSW and switches over to the payload index alone. This makes the search process much cheaper and faster than if using HNSW. 
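That cheaper path is only available when the payload fields used in filters are indexed. Here is a minimal sketch, reusing the field names from the laptop example:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Index the payload fields you filter on, so the query planner can rely on them
client.create_payload_index(
    collection_name="online_store",
    field_name="category",
    field_schema=models.PayloadSchemaType.KEYWORD,
)
client.create_payload_index(
    collection_name="online_store",
    field_name="price",
    field_schema=models.PayloadSchemaType.FLOAT,
)
```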
-https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/ +**Figure 2:** On the user side, this is how filtering looks. We start with five products with different prices. First, the $1000 price **filter** is applied, narrowing down the selection of laptops. Then, a vector search finds the relevant **results** within this filtered set. -2025-04-26T18:10:19+03:00 +![pre-filtering-vector-search](/articles_data/vector-search-filtering/pre-filtering.png) -... +In conclusion, pre-filtering is efficient in specific cases when you use small datasets with low cardinality metadata. However, pre-filtering should not be used over large datasets as it breaks too many links in the HNSW graph, causing lower accuracy. - +#### Post-filtering - +In post-filtering, a search engine first looks for similar vectors and retrieves a larger set of results. Then, it applies filters to those results based on metadata. The problem with post-filtering becomes apparent when using low-cardinality filters. -https://qdrant.tech/documentation/overview/ +> When you apply a low-cardinality filter after performing a vector search, you often end up discarding a large portion of the results that the vector search returned. -2025-04-26T22:59:20-07:00 +**Figure 3:** In the same example, we have five laptops. First, the vector search finds the top two relevant **results**, but they may not meet the price match. When the $1000 price **filter** is applied, other potential results are discarded. -... +![post-filtering-vector-search](/articles_data/vector-search-filtering/post-filtering.png) - +The system will waste computational resources by first finding similar vectors and then discarding many that don't meet the filter criteria. You're also limited to filtering only from the initial set of [vector search](/advanced-search/) results. If your desired items aren't in this initial set, you won't find them, even if they exist in the database. - +## Basic filtering example: ecommerce and laptops -https://qdrant.tech/articles/product-quantization/ +We know that there are three possible laptops that suit our price point. +Let's see how Qdrant's filterable vector index works and why it is the best method of capturing all available results. -2025-02-04T13:55:26+01:00 +First, add five new laptops to your online store. Here is a sample input: -... +```python +laptops = [ + (1, [0.1, 0.2, 0.3, 0.4], {"price": 899.99, "category": "laptop"}), + (2, [0.2, 0.3, 0.4, 0.5], {"price": 1299.99, "category": "laptop"}), + (3, [0.3, 0.4, 0.5, 0.6], {"price": 799.99, "category": "laptop"}), + (4, [0.4, 0.5, 0.6, 0.7], {"price": 1099.99, "category": "laptop"}), + (5, [0.5, 0.6, 0.7, 0.8], {"price": 949.99, "category": "laptop"}) +] +``` - +The four-dimensional vector can represent features like laptop CPU, RAM or battery life, but that isn’t specified. The payload, however, specifies the exact price and product category. - +Now, set the filter to "price is less than $1000": -https://qdrant.tech/benchmarks/filtered-search-benchmark/ +```json +{ + "key": "price", + "range": { + "gt": null, + "gte": null, + "lt": null, + "lte": 1000 + } +} +``` -2024-01-11T19:41:06+05:30 +When a price filter of equal/less than $1000 is applied, vector search returns the following results: -... 
+```json +[ + { + "id": 3, + "score": 0.9978443564622781, + "payload": { + "price": 799.99, + "category": "laptop" + } + }, + { + "id": 1, + "score": 0.9938079894227599, + "payload": { + "price": 899.99, + "category": "laptop" + } + }, + { + "id": 5, + "score": 0.9903751498208603, + "payload": { + "price": 949.99, + "category": "laptop" + } + } +] +``` - +As you can see, Qdrant's filtering method has a greater chance of capturing all possible search results. - +This specific example uses the `range` condition for filtering. Qdrant, however, offers many other possible ways to structure a filter -https://qdrant.tech/documentation/agentic-rag-camelai-discord/ +**For detailed usage examples, [filtering](/documentation/concepts/filtering/) docs are the best resource.** -2025-04-09T12:55:16+02:00 +### Scrolling instead of searching -... +You don't need to use our `search` and `query` APIs to filter through data. The `scroll` API is another option that lets you retrieve lists of points which meet the filters. - +If you aren't interested in finding similar points, you can simply list the ones that match a given filter. While search gives you the most similar points based on some query vector, scroll will give you all points matching your filter not considering similarity. - +In Qdrant, scrolling is used to iteratively **retrieve large sets of points from a collection**. It is particularly useful when you’re dealing with a large number of points and don’t want to load them all at once. Instead, Qdrant provides a way to scroll through the points **one page at a time**. -https://qdrant.tech/documentation/private-cloud/backups/ +You start by sending a scroll request to Qdrant with specific conditions like filtering by payload, vector search, or other criteria. -2024-09-05T15:17:16+02:00 +Let's retrieve a list of top 10 laptops ordered by price in the store: -... +```http +POST /collections/online_store/points/scroll +{ + "filter": { + "must": [ + { + "key": "category", + "match": { + "value": "laptop" + } + } + ] + }, + "limit": 10, + "with_payload": true, + "with_vector": false, + "order_by": [ + { + "key": "price", + } + ] +} +``` +The response contains a batch of points that match the criteria and a reference (offset or next page token) to retrieve the next set of points. - +> [**Scrolling**](/documentation/concepts/points/#scroll-points) is designed to be efficient. It minimizes the load on the server and reduces memory consumption on the client side by returning only manageable chunks of data at a time. - +#### Available filtering conditions -https://qdrant.tech/documentation/database-tutorials/async-api/ +| **Condition** | **Usage** | **Condition** | **Usage** | +|-----------------------|------------------------------------------|-----------------------|------------------------------------------| +| **Match** | Exact value match. | **Range** | Filter by value range. | +| **Match Any** | Match multiple values. | **Datetime Range** | Filter by date range. | +| **Match Except** | Exclude specific values. | **UUID Match** | Filter by unique ID. | +| **Nested Key** | Filter by nested data. | **Geo** | Filter by location. | +| **Nested Object** | Filter by nested objects. | **Values Count** | Filter by element count. | +| **Full Text Match** | Search in text fields. | **Is Empty** | Filter empty fields. | +| **Has ID** | Filter by unique ID. | **Is Null** | Filter null values. 
| -2025-02-18T21:01:07+05:30 +> All clauses and conditions are outlined in Qdrant's [filtering](/documentation/concepts/filtering/) documentation. -... +#### Filtering clauses to remember - +| **Clause** | **Description** | **Clause** | **Description** | +|---------------------|-------------------------------------------------------|---------------------|-------------------------------------------------------| +| **Must** | Includes items that meet the condition (similar to `AND`). | **Should** | Filters if at least one condition is met (similar to `OR`). | +| **Must Not** | Excludes items that meet the condition (similar to `NOT`). | **Clauses Combination** | Combines multiple clauses to refine filtering
(similar to `AND`). | - +## Advanced filtering example: dinosaur diets -https://qdrant.tech/documentation/cloud-quickstart/ +![advanced-payload-filtering](/articles_data/vector-search-filtering/advanced-payload-filtering.png) -2025-05-29T08:51:37-04:00 +We can also use nested filtering to query arrays of objects within the payload. In this example, we have two points. They each represent a dinosaur with a list of food preferences (diet) that indicate what type of food they like or dislike: -... +```json +[ + { + "id": 1, + "dinosaur": "t-rex", + "diet": [ + { "food": "leaves", "likes": false}, + { "food": "meat", "likes": true} + ] + }, + { + "id": 2, + "dinosaur": "diplodocus", + "diet": [ + { "food": "leaves", "likes": true}, + { "food": "meat", "likes": false} + ] + } +] +``` +To ensure that both conditions are applied to the same array element (e.g., food = meat and likes = true must refer to the same diet item), you need to use a nested filter. - +Nested filters are used to apply conditions within an array of objects. They ensure that the conditions are evaluated per array element, rather than across all elements. - +```http +POST /collections/dinosaurs/points/scroll +{ + "filter": { + "must": [ + { + "key": "diet[].food", + "match": { + "value": "meat" + } + }, + { + "key": "diet[].likes", + "match": { + "value": true + } + } + ] + } +} +``` -https://qdrant.tech/documentation/quickstart/ +```python +client.scroll( + collection_name="dinosaurs", + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="diet[].food", match=models.MatchValue(value="meat") + ), + models.FieldCondition( + key="diet[].likes", match=models.MatchValue(value=True) + ), + ], + ), +) +``` -2025-01-20T10:08:10+01:00 +```typescript +client.scroll("dinosaurs", { + filter: { + must: [ + { + key: "diet[].food", + match: { value: "meat" }, + }, + { + key: "diet[].likes", + match: { value: true }, + }, + ], + }, +}); +``` -... +```rust +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; - +client + .scroll( + ScrollPointsBuilder::new("dinosaurs").filter(Filter::must([ + Condition::matches("diet[].food", "meat".to_string()), + Condition::matches("diet[].likes", true), + ])), + ) + .await?; +``` - +```java +import java.util.List; -https://qdrant.tech/documentation/private-cloud/logging-monitoring/ +import static io.qdrant.client.ConditionFactory.match; +import static io.qdrant.client.ConditionFactory.matchKeyword; -2025-02-11T18:21:40+01:00 +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; -... +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); - +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("dinosaurs") + .setFilter( + Filter.newBuilder() + .addAllMust( + List.of(matchKeyword("diet[].food", "meat"), match("diet[].likes", true))) + .build()) + .build()) + .get(); +``` - +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; -https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/ +var client = new QdrantClient("localhost", 6334); -2024-11-18T15:26:15-08:00 +await client.ScrollAsync( + collectionName: "dinosaurs", + filter: MatchKeyword("diet[].food", "meat") & Match("diet[].likes", true) +); +``` -... 
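+Whichever client you use above, this scroll request returns **both** dinosaurs. Schematically (trimmed to ids and names; not the exact response envelope):
+```json
+[
+  { "id": 1, "payload": { "dinosaur": "t-rex" } },
+  { "id": 2, "payload": { "dinosaur": "diplodocus" } }
+]
+```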
+This happens because both points are matching the two conditions: - +- the "t-rex" matches food=meat on `diet[1].food` and likes=true on `diet[1].likes` +- the "diplodocus" matches food=meat on `diet[1].food` and likes=true on `diet[0].likes` - +To retrieve only the points where the conditions apply to a specific element within an array (such as the point with id 1 in this example), you need to use a nested object filter. -https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/ +Nested object filters enable querying arrays of objects independently, ensuring conditions are checked within individual array elements. -2025-02-11T18:21:40+01:00 +This is done by using the `nested` condition type, which consists of a payload key that targets an array and a filter to apply. The key should reference an array of objects and can be written with or without bracket notation (e.g., "data" or "data[]"). -... +```http +POST /collections/dinosaurs/points/scroll +{ + "filter": { + "must": [{ + "nested": { + "key": "diet", + "filter":{ + "must": [ + { + "key": "food", + "match": { + "value": "meat" + } + }, + { + "key": "likes", + "match": { + "value": true + } + } + ] + } + } + }] + } +} +``` - +```python +client.scroll( + collection_name="dinosaurs", + scroll_filter=models.Filter( + must=[ + models.NestedCondition( + nested=models.Nested( + key="diet", + filter=models.Filter( + must=[ + models.FieldCondition( + key="food", match=models.MatchValue(value="meat") + ), + models.FieldCondition( + key="likes", match=models.MatchValue(value=True) + ), + ] + ), + ) + ) + ], + ), +) +``` - +```typescript +client.scroll("dinosaurs", { + filter: { + must: [ + { + nested: { + key: "diet", + filter: { + must: [ + { + key: "food", + match: { value: "meat" }, + }, + { + key: "likes", + match: { value: true }, + }, + ], + }, + }, + }, + ], + }, +}); +``` -https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/ +```rust +use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; -2025-01-28T15:29:08+01:00 +client + .scroll( + ScrollPointsBuilder::new("dinosaurs").filter(Filter::must([NestedCondition { + key: "diet".to_string(), + filter: Some(Filter::must([ + Condition::matches("food", "meat".to_string()), + Condition::matches("likes", true), + ])), + } + .into()])), + ) + .await?; +``` -... +```java +import java.util.List; - +import static io.qdrant.client.ConditionFactory.match; +import static io.qdrant.client.ConditionFactory.matchKeyword; +import static io.qdrant.client.ConditionFactory.nested; - +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; -https://qdrant.tech/articles/scalar-quantization/ +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("dinosaurs") + .setFilter( + Filter.newBuilder() + .addMust( + nested( + "diet", + Filter.newBuilder() + .addAllMust( + List.of( + matchKeyword("food", "meat"), match("likes", true))) + .build())) + .build()) + .build()) + .get(); +``` -2024-12-20T13:10:51+01:00 +```csharp +using Qdrant.Client; +using static Qdrant.Client.Grpc.Conditions; -... +var client = new QdrantClient("localhost", 6334); - +await client.ScrollAsync( + collectionName: "dinosaurs", + filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) +); +``` - +The matching logic is adjusted to operate at the level of individual elements within an array in the payload, rather than on all array elements together. 
-https://qdrant.tech/documentation/interfaces/ +Nested filters function as though each element of the array is evaluated separately. The parent document will be considered a match if at least one array element satisfies all the nested filter conditions. -2024-11-21T17:41:45+05:30 +## Other creative uses for filters -... +You can use filters to retrieve data points without knowing their `id`. You can search through data and manage it, solely by using filters. Let's take a look at some creative uses for filters: - +| Action | Description | Action | Description | +|--------|-------------|--------|-------------| +| [Delete Points](/documentation/concepts/points/#delete-points) | Deletes all points matching the filter. | [Set Payload](/documentation/concepts/payload/#set-payload) | Adds payload fields to all points matching the filter. | +| [Scroll Points](/documentation/concepts/points/#scroll-points) | Lists all points matching the filter. | [Update Payload](/documentation/concepts/payload/#overwrite-payload) | Updates payload fields for points matching the filter. | +| [Order Points](/documentation/concepts/points/#order-points-by-payload-key) | Lists all points, sorted by the filter. | [Delete Payload](/documentation/concepts/payload/#delete-payload-keys) | Deletes fields for points matching the filter. | +| [Count Points](/documentation/concepts/points/#counting-points) | Totals the points matching the filter. | | | - +## Filtering with the payload index -https://qdrant.tech/documentation/private-cloud/api-reference/ +![vector-search-filtering-vector-search](/articles_data/vector-search-filtering/scanning-lens.png) -2025-06-03T09:48:32+02:00 +When you start working with Qdrant, your data is by default organized in a vector index. +In addition to this, we recommend adding a secondary data structure - **the payload index**. -... +Just how the vector index organizes vectors, the payload index will structure your metadata. - +**Figure 4:** The payload index is an additional data structure that supports vector search. A payload index (in green) organizes candidate results by cardinality, so that semantic search (in red) can traverse the vector index quickly. - +![payload-index-vector-search](/articles_data/vector-search-filtering/payload-index-vector-search.png) -https://qdrant.tech/documentation/private-cloud/changelog/ +On its own, semantic searching over terabytes of data can take up lots of RAM. [**Filtering**](/documentation/concepts/filtering/) and [**Indexing**](/documentation/concepts/indexing/) are two easy strategies to reduce your compute usage and still get the best results. Remember, this is only a guide. For an exhaustive list of filtering options, you should read the [filtering documentation](/documentation/concepts/filtering/). -2025-06-03T09:48:32+02:00 +Here is how you can create a single index for a metadata field "category": -... +```http +PUT /collections/computers/index +{ + "field_name": "category", + "field_schema": "keyword" +} +``` +```python +from qdrant_client import QdrantClient - +client = QdrantClient(url="http://localhost:6333") - +client.create_payload_index( + collection_name="computers", + field_name="category", + field_schema="keyword", +) +``` +Once you mark a field indexable, **you don't need to do anything else**. Qdrant will handle all optimizations in the background. -https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/ +#### Why should you index metadata? 
-2024-11-18T15:42:18-08:00 +![payload-index-filtering](/articles_data/vector-search-filtering/payload-index-filtering.png) -... +The payload index acts as a secondary data structure that speeds up retrieval. Whenever you run vector search with a filter, Qdrant will consult a payload index - if there is one. - + - +As your dataset grows in complexity, Qdrant takes up additional resources to go through all data points. Without a proper data structure, the search can take longer - or run out of resources. -https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/ +#### Payload indexing helps evaluate the most restrictive filters -2025-04-30T22:48:05+05:30 +The payload index is also used to accurately estimate **filter cardinality**, which helps the query planning choose a search strategy. **Filter cardinality** refers to the number of distinct values that a filter can match within a dataset. Qdrant's search strategy can switch from **HNSW search** to **payload index-based search** if the cardinality is too low. -... +**How it affects your queries:** Depending on the filter used in the search - there are several possible scenarios for query execution. Qdrant chooses one of the query execution options depending on the available indexes, the complexity of the conditions and the cardinality of the filtering result. - +- The planner estimates the cardinality of a filtered result before selecting a strategy. +- Qdrant retrieves points using the **payload index** if cardinality is below threshold. +- Qdrant uses the **filterable vector index** if the cardinality is above a threshold - + -https://qdrant.tech/documentation/guides/installation/ +#### What happens if you don't use payload indexes? -2025-05-02T10:37:48+02:00 +When using filters while querying, Qdrant needs to estimate cardinality of those filters to define a proper query plan. If you don't create a payload index, Qdrant will not be able to do this. It may end up choosing a sub-optimal way of searching causing extremely slow search times or low accuracy results. -... +If you only rely on **searching for the nearest vector**, Qdrant will have to go through the entire vector index. It will calculate similarities against each vector in the collection, relevant or not. Alternatively, when you filter with the help of a payload index, the HSNW algorithm won't have to evaluate every point. Furthermore, the payload index will help HNSW construct the graph with additional links. - +## How does the payload index look? - +A payload index is similar to conventional document-oriented databases. It connects metadata fields with their corresponding point id’s for quick retrieval. -https://qdrant.tech/documentation/multimodal-search/ +In this example, you are indexing all of your computer hardware inside of the `computers` collection. Let’s take a look at a sample payload index for the field `category`. -2025-04-09T12:55:16+02:00 +```json +Payload Index by keyword: ++------------+-------------+ +| category | id | ++------------+-------------+ +| laptop | 1, 4, 7 | +| desktop | 2, 5, 9 | +| speakers | 3, 6, 8 | +| keyboard | 10, 11 | ++------------+-------------+ +``` +When fields are properly indexed, the search engine roughly knows where it can start its journey. It can start looking up points that contain relevant metadata, and it doesn’t need to scan the entire dataset. This reduces the engine’s workload by a lot. As a result, query results are faster and the system can easily scale. -... 
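+For instance (a sketch that reuses the `computers` collection from this section; the query vector is made up, and a recent `qdrant-client` with the `query_points` API is assumed), you can index the numeric `price` field as well and then pass the filter together with the query; Qdrant consults the payload index for you:
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# A numeric index makes range filters (e.g. price <= 1000) cheap to evaluate.
+client.create_payload_index(
+    collection_name="computers",
+    field_name="price",
+    field_schema="float",
+)
+
+# Filtered vector search: the planner estimates the filter's cardinality from the
+# payload index and picks either the filterable HNSW graph or a plain index scan.
+results = client.query_points(
+    collection_name="computers",
+    query=[0.1, 0.2, 0.3, 0.4],
+    query_filter=models.Filter(
+        must=[
+            models.FieldCondition(key="category", match=models.MatchValue(value="laptop")),
+            models.FieldCondition(key="price", range=models.Range(lte=1000)),
+        ]
+    ),
+    limit=3,
+)
+```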
+> You may create as many payload indexes as you want, and we recommend you do so for each field that you filter by. - +If your users are often filtering by **laptop** when looking up a product **category**, indexing all computer metadata will speed up retrieval and make the results more precise. - +#### Different types of payload indexes -https://qdrant.tech/documentation/fastembed/fastembed-splade/ +| Index Type | Description | +|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Full-text Index](/documentation/concepts/indexing/#full-text-index) | Enables efficient text search in large datasets. | +| [Tenant Index](/documentation/concepts/indexing/#tenant-index) | For data isolation and retrieval efficiency in multi-tenant architectures. | +| [Principal Index](/documentation/concepts/indexing/#principal-index) | Manages data based on primary entities like users or accounts. | +|[On-Disk Index](/documentation/concepts/indexing/#on-disk-payload-index) | Stores indexes on disk to manage large datasets without memory usage. | +| [Parameterized Index](/documentation/concepts/indexing/#parameterized-index) | Allows for dynamic querying, where the index can adapt based on different parameters or conditions provided by the user. Useful for numeric data like prices or timestamps. | -2025-04-25T19:38:33+03:00 +### Indexing payloads in multitenant setups -... +Some applications need to have data segregated, whereby different users need to see different data inside of the same program. When setting up storage for such a complex application, many users think they need multiple databases for segregated users. - +We see this quite often. Users very frequently make the mistake of creating a separate collection for each tenant inside of the same cluster. This can quickly exhaust the cluster’s resources. Running vector search through too many collections can start using up too much RAM. You may start seeing out-of-memory (OOM) errors and degraded performance. - +To mitigate this, we offer extensive support for multitenant systems, so that you can build an entire global application in one single Qdrant collection. -https://qdrant.tech/articles/seed-round/ +When creating or updating a collection, you can mark a metadata field as indexable. To mark `user_id` as a tenant in a shared collection, do the following: -2024-03-07T20:31:05+01:00 +```http +PUT /collections/{collection_name}/index +{ + "field_name": "user_id", + "field_schema": { + "type": "keyword", + "is_tenant": true + } +} +``` +Additionally, we offer a way of organizing data efficiently by means of the tenant index. This is another variant of the payload index that makes tenant data more accessible. This time, the request will specify the field as a tenant. This means that you can mark various customer types and user id’s as `is_tenant: true`. -... +Read more about setting up [tenant defragmentation](/documentation/concepts/indexing/?q=tenant#tenant-index) in multitenant environments, - +## Key takeaways in filtering and indexing +![best-practices](/articles_data/vector-search-filtering/best-practices.png) - +### Filtering with float-point (decimal) numbers +If you filter by the float data type, your search precision may be limited and inaccurate. -https://qdrant.tech/articles/langchain-integration/ +Float Datatype numbers have a decimal point and are 64 bits in size. 
Here is an example: -2024-12-20T13:10:51+01:00 +```json +{ + "price": 11.99 +} +``` -... +When you filter for a specific float number, such as 11.99, you may get a different result, like 11.98 or 12.00. With decimals, numbers are rounded differently, so logically identical values may appear different. Unfortunately, searching for exact matches can be unreliable in this case. - +To avoid inaccuracies, use a different filtering method. We recommend that you try Range Based Filtering instead of exact matches. This method accounts for minor variations in data, and it boosts performance - especially with large datasets. - +Here is a sample JSON range filter for values greater than or equal to 11.99 and less than or equal to the same number. This will retrieve any values within the range of 11.99, including those with additional decimal places. -https://qdrant.tech/documentation/rag-deepseek/ +```json +{ + "key": "price", + "range": { + "gt": null, + "gte": 11.99, + "lt": null, + "lte": 11.99 + } +} +``` +### Working with pagination in queries -2025-04-26T13:02:13+03:00 +When you're implementing pagination in filtered queries, indexing becomes even more critical. When paginating results, you often need to exclude items you've already seen. This is typically managed by applying filters that specify which IDs should not be included in the next set of results. -... +However, an interesting aspect of Qdrant's data model is that a single point can have multiple values for the same field, such as different color options for a product. This means that during filtering, an ID might appear multiple times if it matches on different values of the same field. - +Proper indexing ensures that these queries are efficient, preventing duplicate results and making pagination smoother. - +## Conclusion: Real-life use cases of filtering -https://qdrant.tech/documentation/web-ui/ +Filtering in a [vector database](https://qdrant.tech) like Qdrant can significantly enhance search capabilities by enabling more precise and efficient retrieval of data. -2024-11-20T23:14:39+05:30 +As a conclusion to this guide, let's look at some real-life use cases where filtering is crucial: -... +| **Use Case** | **Vector Search** | **Filtering** | +|--------------------------------------|------------------------------------------------------------------|-------------------------------------------------------------------------| +| [E-Commerce Product Search](/advanced-search/) | Search for products by style or visual similarity | Filter by price, color, brand, size, ratings | +| [Recommendation Systems](/recommendations/) | Recommend similar content (e.g., movies, songs) | Filter by release date, genre, etc. (e.g., movies after 2020) | +| [Geospatial Search in Ride-Sharing](/articles/geo-polygon-filter-gsoc/)| Find similar drivers or delivery partners | Filter by rating, distance radius, vehicle type | +| [Fraud & Anomaly Detection](/data-analysis-anomaly-detection/) | Detect transactions similar to known fraud cases | Filter by amount, time, location | - +#### Before you go - all the code is in Qdrant's Dashboard - +The easiest way to reach that "Hello World" moment is to [**try filtering in a live cluster**](/documentation/quickstart-cloud/). Our interactive tutorial will show you how to create a cluster, add data and try some filtering clauses. 
-https://qdrant.tech/documentation/fastembed/fastembed-colbert/ +**It's all in your free cluster!** -2025-06-19T16:21:03+04:00 +[![qdrant-hybrid-cloud](/docs/homepage/cloud-cta.png)](https://qdrant.to/cloud) -... +<|page-22-lllmstxt|> +## Data Structures 101 - +Those who took programming courses might remember that there is no such thing as a universal data structure. +Some structures are good at accessing elements by index (like arrays), while others shine in terms of insertion efficiency (like linked lists). - +{{< figure src="/articles_data/immutable-data-structures/hardware-optimized.png" alt="Hardware-optimized data structure" caption="Hardware-optimized data structure" width="80%" >}} -https://qdrant.tech/articles/chatgpt-plugin/ +However, when we move from theoretical data structures to real-world systems, and particularly in performance-critical areas such as [vector search](/use-cases/), things become more complex. [Big-O notation](https://en.wikipedia.org/wiki/Big_O_notation) provides a good abstraction, but it doesn’t account for the realities of modern hardware: cache misses, memory layout, disk I/O, and other low-level considerations that influence actual performance. -2024-12-20T13:10:51+01:00 +> From the perspective of hardware efficiency, the ideal data structure is a contiguous array of bytes that can be read sequentially in a single thread. This scenario allows hardware optimizations like prefetching, caching, and branch prediction to operate at their best. -... +However, real-world use cases require more complex structures to perform various operations like insertion, deletion, and search. +These requirements increase complexity and introduce performance trade-offs. - +### Mutability - +One of the most significant challenges when working with data structures is ensuring **mutability — the ability to change the data structure after it’s created**, particularly with fast update operations. -https://qdrant.tech/articles/memory-consumption/ +Let’s consider a simple example: we want to iterate over items in sorted order. +Without a mutability requirement, we can use a simple array and sort it once. +This is very close to our ideal scenario. We can even put the structure on disk - which is trivial for an array. -2024-12-20T13:10:51+01:00 +However, if we need to insert an item into this array, **things get more complicated**. +Inserting into a sorted array requires shifting all elements after the insertion point, which leads to linear time complexity for each insertion, which is not acceptable for many applications. -... +To handle such cases, more complex structures like [B-trees](https://en.wikipedia.org/wiki/B-tree) come into play. B-trees are specifically designed to optimize both insertion and read operations for large data sets. However, they sacrifice the raw speed of array reads for better insertion performance. - +Here’s a benchmark that illustrates the difference between iterating over a plain array and a BTreeSet in Rust: - +```rust +use std::collections::BTreeSet; +use rand::Rng; -https://qdrant.tech/articles/qa-with-cohere-and-qdrant/ +fn main() { + // Benchmark plain vector VS btree in a task of iteration over all elements + let mut rand = rand::thread_rng(); + let vector: Vec<_> = (0..1000000).map(|_| rand.gen::()).collect(); + let btree: BTreeSet<_> = vector.iter().copied().collect(); -2024-12-20T13:10:51+01:00 + { + let mut sum = 0; + for el in vector { + sum += el; + } + } // Elapsed: 850.924”s -... 
+ { + let mut sum = 0; + for el in btree { + sum += el; + } + } // Elapsed: 5.213025ms, ~6x slower - +} +``` - +[Vector databases](https://qdrant.tech/), like Qdrant, have to deal with a large variety of data structures. +If we could make them immutable, it would significantly improve performance and optimize memory usage. -https://qdrant.tech/articles/qdrant-1.2.x/ +## How Does Immutability Help? -2024-03-07T20:31:05+01:00 +A large part of the immutable advantage comes from the fact that we know the exact data we need to put into the structure even before we start building it. +The simplest example is a sorted array: we would know exactly how many elements we have to put into the array so we can allocate the exact amount of memory once. -... +More complex data structures might require additional statistics to be collected before the structure is built. +A Qdrant-related example of this is [Scalar Quantization](/articles/scalar-quantization/#conversion-to-integers): in order to select proper quantization levels, we have to know the distribution of the data. - +{{< figure src="/articles_data/immutable-data-structures/quantization-quantile.png" alt="Scalar Quantization Quantile" caption="Scalar Quantization Quantile" width="70%" >}} - -https://qdrant.tech/articles/dataset-quality/ +Computing this distribution requires knowing all the data in advance, but once we have it, applying scalar quantization is a simple operation. -2024-12-20T13:10:51+01:00 +Let's take a look at a non-exhaustive list of data structures and potential improvements we can get from making them immutable: -... +|Function| Mutable Data Structure | Immutable Alternative | Potential improvements | +|----|------|------|------------------------| +| Read by index | Array | Fixed chunk of memory | Allocate exact amount of memory | +| Vector Storage | Array or Arrays | Memory-mapped file | Offload data to disk | +| Read sorted ranges| B-Tree | Sorted Array | Store all data close, avoid cache misses | +| Read by key | Hash Map | Hash Map with Perfect Hashing | Avoid hash collisions | +| Get documents by keyword | Inverted Index | Inverted Index with Sorted
and BitPacked Postings | Less memory usage, faster search | +| Vector Search | HNSW graph | HNSW graph with payload-aware connections | Better precision with filters | +| Tenant Isolation | Vector Storage | Defragmented Vector Storage | Faster access to on-disk data | -
- +For more info on payload-aware connections in HNSW, read our [previous article](/articles/filtrable-hnsw/). -https://qdrant.tech/documentation/concepts/ +This time around, we will focus on the latest additions to Qdrant: +- **the immutable hash map with perfect hashing** +- **defragmented vector storage**. -2024-11-14T18:59:28+01:00 +### Perfect Hashing -... +A hash table is one of the most commonly used data structures implemented in almost every programming language, including Rust. +It provides fast access to elements by key, with an average time complexity of O(1) for read and write operations. - +There is, however, the assumption that should be satisfied for the hash table to work efficiently: *hash collisions should not cause too much overhead*. +In a hash table, each key is mapped to a "bucket," a slot where the value is stored. +When different keys map to the same bucket, a collision occurs. - +In regular mutable hash tables, minimization of collisions is achieved by: -https://qdrant.tech/documentation/fastembed/fastembed-rerankers/ +* making the number of buckets bigger so the probability of collision is lower +* using a linked list or a tree to store multiple elements with the same hash -2025-04-26T13:20:52+03:00 +However, these strategies have overheads, which become more significant if we consider using high-latency storage like disk. -... +Indeed, every read operation from disk is several orders of magnitude slower than reading from RAM, so we want to know the correct location of the data from the first attempt. - +In order to achieve this, we can use a so-called minimal perfect hash function (MPHF). +This special type of hash function is constructed specifically for a given set of keys, and it guarantees no collisions while using minimal amount of buckets. - +In Qdrant, we decided to use *fingerprint-based minimal perfect hash function* implemented in the [ph crate 🩀](https://crates.io/crates/ph) by [Piotr Beling](https://dl.acm.org/doi/10.1145/3596453). +According to our benchmarks, using the perfect hash function does introduce some overhead in terms of hashing time, but it significantly reduces the time for the whole operation: -https://qdrant.tech/articles/faq-question-answering/ +| Volume | `ph::Function` | `std::hash::Hash` | `HashMap::get`| +|--------|----------------|-------------------|---------------| +| 1000 | 60ns | ~20ns | 34ns | +| 100k | 90ns | ~20ns | 220ns | +| 10M | 238ns | ~20ns | 500ns | -2024-12-20T13:10:51+01:00 +Even thought the absolute time for hashing is higher, the time for the whole operation is lower, because PHF guarantees no collisions. +The difference is even more significant when we consider disk read time, which +might up to several milliseconds (10^6 ns). -... +PHF RAM size scales linearly for `ph::Function`: 3.46 kB for 10k elements, 119MB for 350M elements. +The construction time required to build the hash function is surprisingly low, and we only need to do it once: - +| Volume | `ph::Function` (construct) | PHF size | Size of int64 keys (for reference) | +|--------|----------------------------|----------|------------------------------------| +| 1M | 52ms | 0.34Mb | 7.62Mb | +| 100M | 7.4s | 33.7Mb | 762.9Mb | - +The usage of PHF in Qdrant lets us minimize the latency of cold reads, which is especially important for large-scale multi-tenant systems. With PHF, it is enough to read a single page from a disk to get the exact location of the data. 
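+To make the idea concrete (a purely illustrative toy "hash and displace" construction in Python, not Qdrant's implementation; Qdrant uses the fingerprint-based MPHF from the `ph` crate), the point of an MPHF is that every known key gets its own slot, so a lookup is exactly one probe:
+```python
+import hashlib
+
+def _h(key: str, seed: int, m: int) -> int:
+    """Hash `key` into [0, m), using `seed` to pick an independent hash function."""
+    digest = hashlib.blake2b(key.encode(), key=seed.to_bytes(8, "little")).digest()
+    return int.from_bytes(digest[:8], "little") % m
+
+def build_mphf(keys: list[str]) -> list[int]:
+    """Pick one seed per first-level bucket so that every key lands in a unique slot."""
+    m = len(keys)
+    buckets: list[list[str]] = [[] for _ in range(m)]
+    for key in keys:
+        buckets[_h(key, 0, m)].append(key)
+    seeds, taken = [0] * m, [False] * m
+    # Place the largest buckets first: they are the hardest to fit.
+    for b in sorted(range(m), key=lambda i: -len(buckets[i])):
+        if not buckets[b]:
+            continue
+        seed = 1
+        while True:
+            slots = [_h(key, seed, m) for key in buckets[b]]
+            if len(set(slots)) == len(slots) and not any(taken[s] for s in slots):
+                break
+            seed += 1
+        for s in slots:
+            taken[s] = True
+        seeds[b] = seed
+    return seeds
+
+def slot(key: str, seeds: list[int]) -> int:
+    """Lookup is a single probe: first-level bucket -> per-bucket seed -> final slot."""
+    m = len(seeds)
+    return _h(key, seeds[_h(key, 0, m)], m)
+
+keys = ["fruit bat", "baseball bat", "vampire bat", "laptop", "desktop"]
+seeds = build_mphf(keys)
+assert sorted(slot(k, seeds) for k in keys) == list(range(len(keys)))  # minimal and perfect
+```
+Real implementations are far more compact and cache-friendly, but the guarantee is the same: no collisions for the indexed key set, so a cold read never needs a second disk access.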
-https://qdrant.tech/articles/why-rust/ +### Defragmentation -2024-09-05T13:07:07-07:00 +When you read data from a disk, you almost never read a single byte. Instead, you read a page, which is a fixed-size chunk of data. +On many systems, the page size is 4KB, which means that every read operation will read 4KB of data, even if you only need a single byte. -... +Vector search, on the other hand, requires reading a lot of small vectors, which might create a large overhead. +It is especially noticeable if we use binary quantization, where the size of even large OpenAI 1536d vectors is compressed down to **192 bytes**. - +{{< figure src="/articles_data/immutable-data-structures/page-vector.png" alt="Overhead when reading a single vector" caption="Overhead when reading single vector" width="80%" >}} - +That means if the vectors we access during the search are randomly scattered across the disk, we will have to read 4KB for each vector, which is 20 times more than the actual data size. -https://qdrant.tech/articles/embedding-recycler/ +There is, however, a simple way to avoid this overhead: **defragmentation**. +If we knew some additional information about the data, we could combine all relevant vectors into a single page. -2024-12-20T13:10:51+01:00 +{{< figure src="/articles_data/immutable-data-structures/defragmentation.png" alt="Defragmentation" caption="Defragmentation" width="70%" >}} -... +This additional information is available to Qdrant via the [payload index](/documentation/concepts/indexing/#payload-index). - +By specifying the payload index, which is going to be used for filtering most of the time, we can put all vectors with the same payload together. +This way, reading a single page will also read nearby vectors, which will be used in the search. - +This approach is especially efficient for [multi-tenant systems](/documentation/guides/multiple-partitions/), where only a small subset of vectors is actively used for search. +The capacity of such a deployment is typically defined by the size of the hot subset, which is much smaller than the total number of vectors. -https://qdrant.tech/articles/cars-recognition/ +> Grouping relevant vectors together allows us to optimize the size of the hot subset by avoiding caching of irrelevant data. +The following benchmark data compares RPS for defragmented and non-defragmented storage: -2024-12-20T13:10:51+01:00 +| % of hot subset | Tenant Size (vectors) | RPS, Non-defragmented | RPS, Defragmented | +|-----------------|-----------------------|-----------------------|-------------------| +| 2.5% | 50k | 1.5 | 304 | +| 12.5% | 50k | 0.47 | 279 | +| 25% | 50k | 0.4 | 63 | +| 50% | 50k | 0.3 | 8 | +| 2.5% | 5k | 56 | 490 | +| 12.5% | 5k | 5.8 | 488 | +| 25% | 5k | 3.3 | 490 | +| 50% | 5k | 3.1 | 480 | +| 75% | 5k | 2.9 | 130 | +| 100% | 5k | 2.7 | 95 | -... - +**Dataset size:** 2M 768d vectors (~6Gb Raw data), binary quantization, 650Mb of RAM limit. +All benchmarks are made with minimal RAM allocation to demonstrate disk cache efficiency. - +As you can see, the biggest impact is on the small tenant size, where defragmentation allows us to achieve **100x more RPS**. +Of course, the real-world impact of defragmentation depends on the specific workload and the size of the hot subset, but enabling this feature can significantly improve the performance of Qdrant. 
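+As a minimal sketch with the Python client (the collection name and `user_id` payload field are assumptions), defragmentation by tenant is driven by marking the field as a tenant when you create its payload index:
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Tenant-aware keyword index: vectors that share the same user_id are grouped
+# (defragmented) together on disk, so a tenant's hot subset spans fewer pages.
+client.create_payload_index(
+    collection_name="shared_collection",
+    field_name="user_id",
+    field_schema=models.KeywordIndexParams(
+        type=models.KeywordIndexType.KEYWORD,
+        is_tenant=True,
+    ),
+)
+```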
-https://qdrant.tech/documentation/guides/administration/ +Please find more details on how to enable defragmentation in the [indexing documentation](/documentation/concepts/indexing/#tenant-index). -2025-05-19T15:01:52+02:00 -... +## Updating Immutable Data Structures - +One may wonder how Qdrant allows updating collection data if everything is immutable. +Indeed, [Qdrant API](https://api.qdrant.tech) allows the change of any vector or payload at any time, so from the user's perspective, the whole collection is mutable at any time. - +As it usually happens with every decent magic trick, the secret is disappointingly simple: not all data in Qdrant is immutable. +In Qdrant, storage is divided into segments, which might be either mutable or immutable. +New data is always written to the mutable segment, which is later converted to the immutable one by the optimization process. -https://qdrant.tech/benchmarks/benchmark-faq/ +{{< figure src="/articles_data/immutable-data-structures/optimization.png" alt="Optimization process" caption="Optimization process" width="80%" >}} -2024-01-11T19:41:06+05:30 +If we need to update the data in the immutable or currenly optimized segment, instead of changing the data in place, we perform a copy-on-write operation, move the data to the mutable segment, and update it there. -... +Data in the original segment is marked as deleted, and later vacuumed by the optimization process. - +## Downsides and How to Compensate - +While immutable data structures are great for read-heavy operations, they come with trade-offs: -https://qdrant.tech/documentation/guides/running-with-gpu/ +- **Higher update costs:** Immutable structures are less efficient for updates. The amortized time complexity might be the same as mutable structures, but the constant factor is higher. +- **Rebuilding overhead:** In some cases, we may need to rebuild indices or structures for the same data more than once. +- **Read-heavy workloads:** Immutability assumes a search-heavy workload, which is typical for search engines but not for all applications. -2025-03-20T15:19:07+01:00 +In Qdrant, we mitigate these downsides by allowing the user to adapt the system to their specific workload. +For example, changing the default size of the segment might help to reduce the overhead of rebuilding indices. -... +In extreme cases, multi-segment storage can act as a single segment, falling back to the mutable data structure when needed. - +## Conclusion - +Immutable data structures, while tricky to implement correctly, offer significant performance gains, especially for read-heavy systems like search engines. They allow us to take full advantage of hardware optimizations, reduce memory overhead, and improve cache performance. -https://qdrant.tech/articles/vector-search-manuals/ +In Qdrant, the combination of techniques like perfect hashing and defragmentation brings further benefits, making our vector search operations faster and more efficient. While there are trade-offs, the flexibility of Qdrant’s architecture — including segment-based storage — allows us to balance the best of both worlds. -2024-12-20T13:10:51+01:00 +<|page-23-lllmstxt|> +Have you ever heard of sparse neural retrieval? If so, have you used it in production? -... +It's a field with excellent potential -- who wouldn't want to use an approach that combines the strengths of dense and term-based text retrieval? Yet it's not so popular. Is it due to the common curse of *“What looks good on paper is not going to work in practice”?*? 
- +This article describes our path towards sparse neural retrieval *as it should be* -- lightweight term-based retrievers capable of distinguishing word meanings. - +Learning from the mistakes of previous attempts, we created **miniCOIL**, a new sparse neural candidate to take BM25's place in hybrid searches. We're happy to share it with you and are awaiting your feedback. -https://qdrant.tech/documentation/guides/capacity-planning/ +## The Good, the Bad and the Ugly -2024-10-05T03:39:41+05:30 +Sparse neural retrieval is not so well known, as opposed to methods it's based on -- term-based and dense retrieval. Their weaknesses motivated this field's development, guiding its evolution. Let's follow its path. -... +{{< figure src="/articles_data/minicoil/models_evolution.png" alt="Retrievers evolution" caption="Retrievers evolution" width="100%" >}} - +### Term-based Retrieval - +Term-based retrieval usually treats text as a bag of words. These words play roles of different importance, contributing to the overall relevance score between a document and a query. -https://qdrant.tech/documentation/fastembed/ +Famous **BM25** estimates words' contribution based on their: +1. Importance in a particular text -- Term Frequency (TF) based. +2. Significance within the whole corpus -- Inverse Document Frequency (IDF) based. -2025-05-27T18:00:51+02:00 +It also has several parameters reflecting typical text length in the corpus, the exact meaning of which you can check in [our detailed breakdown of the BM25 formula](https://qdrant.tech/articles/bm42/#why-has-bm25-stayed-relevant-for-so-long). -... +Precisely defining word importance within a text is nontrivial. - +BM25 is built on the idea that term importance can be defined statistically. +This isn't far from the truth in long texts, where frequent repetition of a certain word signals that the text is related to this concept. In very short texts -- say, chunks for Retrieval Augmented Generation (RAG) -- it's less applicable, with TF of 0 or 1. We approached fixing it in our [BM42 modification of BM25 algorithm.](https://qdrant.tech/articles/bm42/) - +Yet there is one component of a word's importance for retrieval, which is not considered in BM25 at all -- word meaning. The same words have different meanings in different contexts, and it affects the text's relevance. Think of *"fruit **bat**"* and *"baseball **bat**"*—the same importance in the text, different meanings. -https://qdrant.tech/documentation/guides/optimize/ +### Dense Retrieval -2025-04-07T00:40:39+02:00 +How to capture the meaning? Bag-of-words models like BM25 assume that words are placed in a text independently, while linguists say: -... +> "You shall know a word by the company it keeps" - John Rupert Firth - +This idea, together with the motivation to numerically express word relationships, powered the development of the second branch of retrieval -- dense vectors. Transformer models with attention mechanisms solved the challenge of distinguishing word meanings within text context, making it a part of relevance matching in retrieval. - +Yet dense retrieval didn't (and can't) become a complete replacement for term-based retrieval. Dense retrievers are capable of broad semantic similarity searches, yet they lack precision when we need results including a specific keyword. 
-https://qdrant.tech/documentation/cloud-getting-started/ +It's a fool's errand -- trying to make dense retrievers do exact matching, as they're built in a paradigm where every word matches every other word semantically to some extent, and this semantic similarity depends on the training data of a particular model. -2025-05-02T16:53:21+02:00 +### Sparse Neural Retrieval -... +So, on one side, we have weak control over matching, sometimes leading to too broad retrieval results, and on the other—lightweight, explainable and fast term-based retrievers like BM25, incapable of capturing semantics. - +Of course, we want the best of both worlds, fused in one model, no drawbacks included. Sparse neural retrieval evolution was pushed by this desire. - +- Why **sparse**? Term-based retrieval can operate on sparse vectors, where each word in the text is assigned a non-zero value (its importance in this text). +- Why **neural**? Instead of deriving an importance score for a word based on its statistics, let's use machine learning models capable of encoding words' meaning. -https://qdrant.tech/documentation/guides/multiple-partitions/ +**So why is it not widely used?** +{{< figure src="/articles_data/minicoil/models_problems.png" alt="Problems of modern sparse neural retrievers" caption="Problems of modern sparse neural retrievers" width="100%" >}} -2025-04-07T00:40:39+02:00 +The detailed history of sparse neural retrieval makes for [a whole other article](https://qdrant.tech/articles/modern-sparse-neural-retrieval/). Summing a big part of it up, there were many attempts to map a word representation produced by a dense encoder to a single-valued importance score, and most of them never saw the real world outside of research papers (**DeepImpact**, **TILDEv2**, **uniCOIL**). -... +Trained end-to-end on a relevance objective, most of the **sparse encoders** estimated word importance well only for a particular domain. Their out-of-domain accuracy, on datasets they hadn't "seen" during training, [was worse than BM25.](https://arxiv.org/pdf/2307.10488) - +The SOTA of sparse neural retrieval is **SPLADE** -- (Sparse Lexical and Expansion Model). This model has made its way into retrieval systems - you can [use SPLADE++ in Qdrant with FastEmbed](https://qdrant.tech/documentation/fastembed/fastembed-splade/). - +Yet there's a catch. The "expansion" part of SPLADE's name refers to a technique that combats against another weakness of term-based retrieval -- **vocabulary mismatch**. While dense encoders can successfully connect related terms like "fruit bat" and "flying fox", term-based retrieval fails at this task. -https://qdrant.tech/documentation/qdrant-mcp-server/ +SPLADE solves this problem by **expanding documents and queries with additional fitting terms**. However, it leads to SPLADE inference becoming heavy. Additionally, produced representations become not-so-sparse (so, consequently, not lightweight) and far less explainable as expansion choices are made by machine learning models. -2025-05-27T18:00:51+02:00 +> "Big man in a suit of armor. Take that off, what are you?" -... 
+Experiments showed that SPLADE without its term expansion tells the same old story of sparse encoders — [it performs worse than BM25.](https://arxiv.org/pdf/2307.10488) - +## Eyes on the Prize: Usable Sparse Neural Retrieval - +Striving for perfection on specific benchmarks, the sparse neural retrieval field either produced models performing worse than BM25 out-of-domain(ironically, [trained with BM25-based hard negatives](https://arxiv.org/pdf/2307.10488)) or models based on heavy document expansion, lowering sparsity. -https://qdrant.tech/documentation/cloud-account-setup/ +To be usable in production, the minimal criteria a sparse neural retriever should meet are: -2025-05-02T18:40:38+02:00 +- **Producing lightweight sparse representations (it's in the name!).** Inheriting the perks of term-based retrieval, it should be lightweight and simple. For broader semantic search, there are dense retrievers. +- **Being better than BM25 at ranking in different domains.** The goal is a term-based retriever capable of distinguishing word meanings — what BM25 can't do — preserving BM25's out-of-domain, time-proven performance. -... +{{< figure src="/articles_data/minicoil/minicoil.png" alt="The idea behind miniCOIL" caption="The idea behind miniCOIL" width="100%" >}} - +### Inspired by COIL - +One of the attempts in the field of Sparse Neural Retrieval — [Contextualized Inverted Lists (COIL)](https://qdrant.tech/articles/modern-sparse-neural-retrieval/#sparse-neural-retriever-which-understood-homonyms) — stands out with its approach to term weights encoding. -https://qdrant.tech/documentation/cloud-rbac/ +Instead of squishing high-dimensional token representations (usually 768-dimensional BERT embeddings) into a single number, COIL authors project them to smaller vectors of 32 dimensions. They propose storing these vectors in **inverted lists** of an **inverted index** (used in term-based retrieval) as is and comparing vector representations through dot product. -2025-05-02T16:53:21+02:00 +This approach captures deeper semantics, a single number simply cannot convey all the nuanced meanings a word can have. -... +Despite this advantage, COIL failed to gain widespread adoption for several key reasons: - +- Inverted indexes are usually not designed to store vectors and perform vector operations. +- Trained end-to-end with a relevance objective on [MS MARCO dataset](https://microsoft.github.io/msmarco/), COIL's performance is heavily domain-bound. +- Additionally, COIL operates on tokens, reusing BERT's tokenizer. However, working at a word level is far better for term-based retrieval. Imagine we want to search for a *"retriever"* in our documentation. COIL will break it down into `re`, `#trie`, and `#ver` 32-dimensional vectors and match all three parts separately -- not so convenient. - +However, COIL representations allow distinguishing homographs, a skill BM25 lacks. The best ideas don't start from zero. We propose an approach **built on top of COIL, keeping in mind what needs fixing**: -https://qdrant.tech/documentation/cloud/ +1. We should **abandon end-to-end training on a relevance objective** to get a model performant on out-of-domain data. There is not enough data to train a model able to generalize. +2. We should **keep representations sparse and reusable in a classic inverted index**. +3. We should **fix tokenization**. 
This problem is the easiest one to solve, as it was already done in several sparse neural retrievers, and [we also learned to do it in our BM42](https://qdrant.tech/articles/bm42/#wordpiece-retokenization). -2025-05-02T16:53:21+02:00 +### Standing on the Shoulders of BM25 -... +BM25 has been a decent baseline across various domains for many years -- and for a good reason. So why discard a time-proven formula? - +Instead of training our sparse neural retriever to assign words' importance scores, let's add a semantic COIL-inspired component to BM25 formula. - +$$ +\text{score}(D,Q) = \sum_{i=1}^{N} \text{IDF}(q_i) \cdot \text{Importance}^{q_i}_{D} \cdot {\color{YellowGreen}\text{Meaning}^{q_i \times d_j}} \text{, where term } d_j \in D \text{ equals } q_i +$$ -https://qdrant.tech/documentation/hybrid-cloud/ +Then, if we manage to capture a word's meaning, our solution alone could work like BM25 combined with a semantically aware reranker -- or, in other words: -2025-05-02T16:53:21+02:00 +- It could see the difference between homographs; +- When used with word stems, it could distinguish parts of speech. -... +{{< figure src="/articles_data/minicoil/examples.png" alt="Meaning component" caption="Meaning component" width="100%" >}} - +And if our model stumbles upon a word it hasn't "seen" during training, we can just fall back to the original BM25 formula! - +### Bag-of-words in 4D -https://qdrant.tech/documentation/beginner-tutorials/ +COIL uses 32 values to describe one term. Do we need this many? How many words with 32 separate meanings could we name without additional research? -2024-11-18T15:26:15-08:00 +Yet, even if we use fewer values in COIL representations, the initial problem of dense vectors not fitting into a classical inverted index persists. +Unless... We perform a simple trick! -... +{{< figure src="/articles_data/minicoil/bow_4D.png" alt="miniCOIL vectors to sparse representation" caption="miniCOIL vectors to sparse representation" width="80%" >}} - +Imagine a bag-of-words sparse vector. Every word from the vocabulary takes up one cell. If the word is present in the encoded text — we assign some weight; if it isn't — it equals zero. - +If we have a mini COIL vector describing a word's meaning, for example, in 4D semantic space, we could just dedicate 4 consecutive cells for word in the sparse vector, one cell per "meaning" dimension. If we don't, we could fall back to a classic one-cell description with a pure BM25 score. -https://qdrant.tech/documentation/advanced-tutorials/ +**Such representations can be used in any standard inverted index.** -2025-02-07T18:51:10-05:00 +## Training miniCOIL -... +Now, we're coming to the part where we need to somehow get this low-dimensional encapsulation of a word's meaning -- **a miniCOIL vector**. - +We want to work smarter, not harder, and rely as much as possible on time-proven solutions. Dense encoders are good at encoding a word's meaning in its context, so it would be convenient to reuse their output. Moreover, we could kill two birds with one stone if we wanted to add miniCOIL to hybrid search -- where dense encoder inference is done regardless. - +### Reducing Dimensions -https://qdrant.tech/documentation/private-cloud/ +Dense encoder outputs are high-dimensional, so we need to perform **dimensionality reduction, which should preserve the word's meaning in context**. 
The goal is to: -2025-05-02T16:53:21+02:00 +- Avoid relevance objective and dependence on labelled datasets; +- Find a target capturing spatial relations between word’s meanings; +- Use the simplest architecture possible. -... +### Training Data - +We want miniCOIL vectors to be comparable according to a word's meaning — *fruit **bat*** and *vampire **bat*** should be closer to each other in low-dimensional vector space than to *baseball **bat***. So, we need something to calibrate on when reducing the dimensionality of words' contextualized representations. - +It's said that a word's meaning is hidden in the surrounding context or, simply put, in any texts that include this word. In bigger texts, we risk the word's meaning blending out. So, let's work at the sentence level and assume that sentences sharing one word should cluster in a way that each cluster contains sentences where this word is used in one specific meaning. -https://qdrant.tech/documentation/cloud-pricing-payments/ +If that's true, we could encode various sentences with a sophisticated dense encoder and form a reusable spatial relations target for input dense encoders. It's not a big problem to find lots of textual data containing frequently used words when we have datasets like the [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext), spanning the whole web. With this amount of data available, we could afford generalization and domain independence, which is hard to achieve with the relevance objective. -2025-05-02T16:53:21+02:00 +#### It's Going to Work, I Bat -... +Let’s test our assumption and take a look at the word *“bat”*. - +We took several thousand sentences with this word, which we sampled from [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext) and vectorized with a [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) encoder. The goal was to check if we could distinguish any clusters containing sentences where *“bat”* shares the same meaning. - +{{< figure src="/articles_data/minicoil/bat.png" alt="Sentences with \"bat\" in 2D" caption="Sentences with \"bat\" in 2D.
A very important observation: *Looks like a bat*:)" width="80%" >}} -https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/ +The result had two big clusters related to *"bat"* as an animal and *"bat"* as sports equipment, and two smaller ones related to fluttering motion and the verb used in sports. Seems like it could work! -2025-06-19T11:54:06+03:00 +### Architecture and Training Objective -... +Let's continue dealing with *"bats"*. -
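The sanity check above can be sketched in a few lines of code: embed sentences that contain *"bat"* and see whether they group by meaning. This is a toy sketch, not the exact setup behind the figure; `sentence-transformers`, PCA, and k-means are used purely for illustration, and the handful of sentences stands in for the OpenWebText sample.

```python
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

sentences = [
    "The fruit bat sleeps upside down in the cave.",
    "A vampire bat feeds mostly at night.",
    "He swung the baseball bat and hit a home run.",
    "She bought a new cricket bat before the season.",
]

# Embed each sentence with a dense encoder (illustrative model choice).
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
embeddings = model.encode(sentences, normalize_embeddings=True)

# Group sentences into candidate "meaning" clusters and project to 2D for inspection.
labels = KMeans(n_clusters=2, n_init=10).fit_predict(embeddings)
points_2d = PCA(n_components=2).fit_transform(embeddings)

for sentence, label, point in zip(sentences, labels, points_2d):
    print(label, point.round(2), sentence)
```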
+We have a training pool of sentences containing the word *"bat"* in different meanings. Using a dense encoder of choice, we get a contextualized embedding of *"bat"* from each sentence and learn to compress it into a low-dimensional miniCOIL *"bat"* space, guided by [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) sentence embeddings. - +We're dealing with only one word, so it should be enough to use just one linear layer for dimensionality reduction, with a [`Tanh activation`](https://pytorch.org/docs/stable/generated/torch.nn.Tanh.html) on top, mapping values of compressed vectors to (-1, 1) range. The activation function choice is made to align miniCOIL representations with dense encoder ones, which are mainly compared through `cosine similarity`. -https://qdrant.tech/documentation/data-management/ +{{< figure src="/articles_data/minicoil/miniCOIL_one_word.png" alt="miniCOIL architecture on a word level" caption="miniCOIL architecture on a word level" width="100%" >}} -2025-05-31T21:49:18+02:00 +As a training objective, we can select the minimization of [triplet loss](https://qdrant.tech/articles/triplet-loss/), where triplets are picked and aligned based on distances between [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) sentence embeddings. We rely on the confidence (size of the margin) of [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) to guide our *"bat"* miniCOIL compression. -... +{{< figure src="/articles_data/minicoil/training_objective.png" alt="miniCOIL training" caption="miniCOIL training" width="80%" >}} - + - +#### Eating Elephant One Bite at a Time -https://qdrant.tech/documentation/examples/llama-index-multitenancy/ +Now, we have the full idea of how to train miniCOIL for one word. How do we scale to a whole vocabulary? -2024-04-11T13:13:14-07:00 +What if we keep it simple and continue training a model per word? It has certain benefits: -... +1. Extremely simple architecture: even one layer per word can suffice. +2. Super fast and easy training process. +3. Cheap and fast inference due to the simple architecture. +4. Flexibility to discover and tune underperforming words. +5. Flexibility to extend and shrink the vocabulary depending on the domain and use case. - +Then we could train all the words we're interested in and simply combine (stack) all models into one big miniCOIL. - +{{< figure src="/articles_data/minicoil/miniCOIL_full.png" alt="miniCOIL model" caption="miniCOIL model" width="100%" >}} -https://qdrant.tech/documentation/database-tutorials/ +### Implementation Details -2025-06-11T19:02:35+03:00 +The code of the training approach sketched above is open-sourced [in this repository](https://github.com/qdrant/miniCOIL). -... +Here are the specific characteristics of the miniCOIL model we trained based on this approach: - +| Component | Description | +|:---|:---| +| **Input Dense Encoder** | [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) (512 dimensions) | +| **miniCOIL Vectors Size** | 4 dimensions | +| **miniCOIL Vocabulary** | List of 30,000 of the most common English words, cleaned of stop words and words shorter than 3 letters, [taken from here](https://github.com/arstgit/high-frequency-vocabulary/tree/master). Words are stemmed to align miniCOIL with our BM25 implementation. 
| +| **Training Data** | 40 million sentences — a random subset of the [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext). To make triplet sampling convenient, we uploaded sentences and their [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) embeddings to Qdrant and built a [full-text payload index](https://qdrant.tech/documentation/concepts/indexing/#full-text-index) on sentences with a tokenizer of type `word`. | +| **Training Data per Word** | We sample 8000 sentences per word and form triplets with a margin of at least **0.1**.
Additionally, we apply **augmentation** — take a sentence and cut out the target word plus its 1–3 neighbours. We reuse the same similarity score between original and augmented sentences for simplicity. | +| **Training Parameters** | **Epochs**: 60
**Optimizer**: Adam with a learning rate of 1e-4
**Validation set**: 20% | - +Each word was **trained on just one CPU**, and it took approximately fifty seconds per word to train. +We included this `minicoil-v1` version in the [v0.7.0 release of our FastEmbed library](https://github.com/qdrant/fastembed). -https://qdrant.tech/documentation/embeddings/ +You can check an example of `minicoil-v1` usage with FastEmbed in the [HuggingFace card](https://huggingface.co/Qdrant/minicoil-v1). -2024-11-28T08:54:13+05:30 + -... +## Results - +### Validation Loss - +Input transformer [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) approximates the “role model” transformer [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) context relations with a (measured though triplets) quality of 83%. That means that in 17% of cases, [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) will take a sentence triplet from [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) and embed it in a way that the negative example from the perspective of `mxbai` will be closer to the anchor than the positive one. -https://qdrant.tech/documentation/cloud-premium/ +The validation loss we obtained, depending on the miniCOIL vector size (4, 8, or 16), demonstrates miniCOIL correctly distinguishing from 76% (60 failed triplets on average per batch of size 256) to 85% (38 failed triplets on average per batch of size 256) triplets respectively. -2025-05-02T16:53:21+02:00 +{{< figure src="/articles_data/minicoil/validation_loss.png" alt="Validation loss" caption="Validation loss" width="80%" >}} -... +### Benchmarking +The benchmarking code is open-sourced in [this repository](https://github.com/qdrant/mini-coil-demo/tree/master/minicoil_demo). - +To check our 4D miniCOIL version performance in different domains, we, ironically, chose a subset of the same [BEIR datasets](https://github.com/beir-cellar/beir), high benchmark values on which became an end in itself for many sparse neural retrievers. Yet the difference is that **miniCOIL wasn't trained on BEIR datasets and shouldn't be biased towards them**. - +We're testing our 4D miniCOIL model versus [our BM25 implementation](https://huggingface.co/Qdrant/bm25). BEIR datasets are indexed to Qdrant using the following parameters for both methods: +- `k = 1.2`, `b = 0.75` default values recommended to use with BM25 scoring; +- `avg_len` estimated on 50,000 documents from a respective dataset. -https://qdrant.tech/articles/metric-learning-tips/ + -2024-12-20T13:10:51+01:00 +We compare models based on the `NDCG@10` metric, as we're interested in the ranking performance of miniCOIL compared to BM25. Both retrieve the same subset of indexed documents based on exact matches, but miniCOIL should ideally rank this subset better based on its semantics understanding. -... +The result on several domains we tested is the following: - +| Dataset | BM25 (NDCG@10) | MiniCOIL (NDCG@10) | +|:-----------|:--------------|:------------------| +| MS MARCO | 0.237 | **0.244** | +| NQ | 0.304 | **0.319** | +| Quora | 0.784 | **0.802** | +| FiQA-2018 | 0.252 | **0.257** | +| HotpotQA | **0.634** | 0.633 | - +We can see miniCOIL performing slightly better than BM25 in four out of five tested domains. It shows that **we're moving in the right direction**. -https://qdrant.tech/documentation/cloud/create-cluster/ + -2025-05-02T16:53:21+02:00 +## Key Takeaways -... 
+This article describes our attempt to make a lightweight sparse neural retriever that is able to generalize to out-of-domain data. Sparse neural retrieval has a lot of potential, and we hope to see it gain more traction. - +### Why is this Approach Useful? - +This approach to training sparse neural retrievers: -https://qdrant.tech/documentation/frameworks/ +1. Doesn’t rely on a relevance objective because it is trained in a self-supervised way, so it doesn’t need labeled datasets to scale. +2. Builds on the proven BM25 formula, simply adding a semantic component to it. +3. Creates lightweight sparse representations that fit into a standard inverted index. +4. Fully reuses the outputs of dense encoders, making it adaptable to different models. This also makes miniCOIL a cheap upgrade for hybrid search solutions. +5. Uses an extremely simple model architecture, with one trainable layer per word in miniCOIL’s vocabulary. This results in very fast training and inference. Also, this word-level training makes it easy to expand miniCOIL’s vocabulary for a specific use case. -2025-05-19T21:17:24+05:30 +### The Right Tool for the Right Job -... +When are miniCOIL retrievers applicable? - +If you need precise term matching but BM25-based retrieval doesn't meet your needs, ranking higher documents with words of the right form but the wrong semantical meaning. - +Say you're implementing search in your documentation. In this use case, keywords-based search prevails, but BM25 won't account for different context-based meanings of these keywords. For example, if you're searching for a *"data **point**"* in our documentation, you'd prefer to see *"a **point** is a record in Qdrant"* ranked higher than *floating **point** precision*, and here miniCOIL-based retrieval is an alternative to consider. -https://qdrant.tech/articles/qdrant-internals/ +Additionally, miniCOIL fits nicely as a part of a hybrid search, as it enhances sparse retrieval without any noticeable increase in resource consumption, directly reusing contextual word representations produced by a dense encoder. -2024-12-20T13:10:51+01:00 +To sum up, miniCOIL should work as if BM25 understood the meaning of words and ranked documents based on this semantic knowledge. It operates only on exact matches, so if you aim for documents semantically similar to the query but expressed in different words, dense encoders are the way to go. -... +### What's Next? - +We will continue working on improving our approach -- both in-depth, searching for ways to improve the model's quality, and in-width, extending it to various dense encoders and languages beyond English. - +And we would love to share this road to usable sparse neural retrieval with you! -https://qdrant.tech/documentation/observability/ +<|page-24-lllmstxt|> +> A problem well stated is a problem half solved. -2024-11-14T18:59:28+01:00 +This quote applies as much to life as it does to information retrieval. -... +With a well-formulated query, retrieving the relevant document becomes trivial. +In reality, however, most users struggle to precisely define what they are searching for. - +While users may struggle to formulate a perfect request — especially in unfamiliar topics — they can easily judge whether a retrieved answer is relevant or not. - +**Relevance is a powerful feedback mechanism for a retrieval system** to iteratively refine results in the direction of user interest. 
-https://qdrant.tech/documentation/platforms/ +In 2025, with social media flooded with daily AI breakthroughs, it almost seems like information retrieval is solved, agents can iteratively adjust their search queries while assessing the relevance. -2025-05-14T07:24:10-04:00 +Of course, there's a catch: these models still rely on retrieval systems (*RAG isn't dead yet, despite daily predictions of its demise*). +They receive only a handful of top-ranked results provided by a far simpler and cheaper retriever. +As a result, the success of guided retrieval still mainly depends on the retrieval system itself. -... +So, we should find a way of effectively and efficiently incorporating relevance feedback directly into a retrieval system. +In this article, we'll explore the approaches proposed in the research literature and try to answer the following question: - +*If relevance feedback in search is so widely studied and praised as effective, why is it practically not used in dedicated vector search solutions?* - +## Dismantling the Relevance Feedback -https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/ +Both industry and academia tend to reinvent the wheel here and there. +So, we first took some time to study and categorize different methods — just in case there was something we could plug directly into Qdrant. +The resulting taxonomy isn't set in stone, but we aim to make it useful. -2024-05-15T18:01:28+02:00 +{{
}} -... +### Pseudo-Relevance Feedback (PRF) - +Pseudo-Relevance feedback takes the top-ranked documents from the initial retrieval results and treats them as relevant. This approach might seem naive, but it provides a noticeable performance boost in lexical retrieval while being relatively cheap to compute. - +### Binary Relevance Feedback -https://qdrant.tech/documentation/examples/cohere-rag-connector/ +The most straightforward way to gather feedback is to ask users directly if document is relevant. +There are two main limitations to this approach: -2025-02-18T21:01:07+05:30 +First, users are notoriously reluctant to provide feedback. Did you know that [Google once had](https://en.wikipedia.org/wiki/Google_SearchWiki#:~:text=SearchWiki%20was%20a%20Google%20Search,for%20a%20given%20search%20query) an upvote/downvote mechanism on search results but removed it because almost no one used it? -... +Second, even if users are willing to provide feedback, no relevant documents might be present in the initial retrieval results. In this case, the user can't provide a meaningful signal. - +Instead of asking users, we can ask a smart model to provide binary relevance judgements, but this would limit its potential to generate granular judgements. - +### Re-scored Relevance Feedback -https://qdrant.tech/documentation/send-data/ +We can also apply more sophisticated methods to extract relevance feedback from the top-ranked documents - machine learning models can provide a relevance score for each document. -2024-11-14T18:59:28+01:00 +The obvious concern here is twofold: +1. How accurately can the automated judge determine relevance (or irrelevance)? +2. How cost-efficient is it? After all, you can’t expect GPT-4o to re-rank thousands of documents for every user query — unless you’re filthy rich. -... +Nevertheless, automated re-scored feedback could be a scalable way to improve search when explicit binary feedback is not accessible. - +## Has the Problem Already Been Solved? - +Digging through research materials, we expected anything else but to discover that the first relevance feedback study dates back [*sixty years*](https://sigir.org/files/museum/pub-08/XXIII-1.pdf). +In the midst of the neural search bubble, it's easy to forget that lexical (term-based) retrieval has been around for decades. Naturally, research in that field has had enough time to develop. -https://qdrant.tech/documentation/examples/ +**Neural search** — aka [vector search](https://qdrant.tech/articles/neural-search-tutorial/) — gained traction in the industry around 5 years ago. Hence, vector-specific relevance feedback techniques might still be in their early stages, awaiting production-grade validation and industry adoption. -2025-06-19T11:54:06+03:00 +As a [dedicated vector search engine](https://qdrant.tech/articles/dedicated-vector-search/), we would like to be these adopters. +Our focus is neural search, but approaches in both lexical and neural retrieval seem worth exploring, as cross-field studies are always insightful, with the potential to reuse well-established methods of one field in another. -... +We found some interesting methods applicable to neural search solutions and additionally revealed a **gap in the neural search-based relevance feedback approaches**. Stick around, and we'll share our findings! - +## Two Ways to Approach the Problem - +Retrieval as a recipe can be broken down into three main ingredients: +1. Query +2. Documents +3. Similarity scoring between them. 
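To make these ingredients concrete before we split the approaches, here is a toy sketch in plain NumPy (cosine similarity stands in for whatever scoring function a real system uses): relevance feedback can refine either the query vector or the scoring function before the next retrieval iteration.

```python
import numpy as np

def cosine(query_vec: np.ndarray, doc_vec: np.ndarray) -> float:
    # Ingredient 3: similarity scoring between a query and a document.
    return float(np.dot(query_vec, doc_vec) /
                 (np.linalg.norm(query_vec) * np.linalg.norm(doc_vec)))

def retrieve(query_vec: np.ndarray, doc_vecs: np.ndarray, top_k: int = 10) -> np.ndarray:
    # Ingredient 1 (the query) meets ingredient 2 (the documents) here.
    # Relevance feedback can refine either `query_vec` or the scoring
    # function itself before the next retrieval round.
    scores = np.array([cosine(query_vec, d) for d in doc_vecs])
    return np.argsort(-scores)[:top_k]
```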
-https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/ +{{
}} -2025-02-18T21:01:07+05:30 +Query formulation is a subjective process – it can be done in infinite configurations, making the relevance of a document unpredictable until the query is formulated and submitted to the system. -... +So, adapting documents (or the search index) to relevance feedback would require per-request dynamic changes, which is impractical, considering that modern retrieval systems store billions of documents. - +Thus, approaches for incorporating relevance feedback in search fall into two categories: **refining a query** and **refining the similarity scoring function** between the query and documents. - -https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/ +## Query Refinement -2024-04-15T17:41:39-07:00 +There are several ways to refine a query based on relevance feedback. +Globally, we prefer to distinguish between two approaches: modifying the query as text and modifying the vector representation of the query. -... +{{
}} - +### Query As Text - +In **term-based retrieval**, an intuitive way to improve a query would be to **expand it with relevant terms**. It resembled the "*aha, so that's what it's called*" stage in the discovery search. -https://qdrant.tech/documentation/cloud-api/ +Before the deep learning era of this century, expansion terms were mainly selected using statistical or probabilistic models. The idea was to: -2025-06-06T09:56:35+02:00 +1. Either extract the **most frequent** terms from (pseudo-)relevant documents; +2. Or the **most specific** ones (for example, according to IDF); +3. Or the **most probable** ones (most likely to be in query according to a relevance set). -... +Well-known methods of those times come from the family of [Relevance Models](https://sigir.org/wp-content/uploads/2017/06/p260.pdf), where terms for expansion are chosen based on their probability in pseudo-relevant documents (how often terms appear) and query terms likelihood given those pseudo-relevant documents - how strongly these pseudo-relevant documents match the query. - +The most famous one, `RM3` – interpolation of expansion terms probability with their probability in a query – is still appearing in papers of the last few years as a (noticeably decent) baseline in term-based retrieval, usually as part of [anserini](https://github.com/castorini/anserini). - +{{
}} -https://qdrant.tech/documentation/cloud-tools/ +With the time approaching the modern machine learning era, [multiple](https://aclanthology.org/2020.findings-emnlp.424.pdf) [studies](https://dl.acm.org/doi/10.1145/1390334.1390377) began claiming that these traditional ways of query expansion are not as effective as they could be. -2024-11-19T17:56:47-08:00 +Started with simple classifiers based on hand-crafted features, this trend naturally led to use the famous [BERT (Bidirectional encoder representations from transformers)](https://huggingface.co/docs/transformers/model_doc/bert). For example, `BERT-QE` (Query Expansion) authors came up with this schema: -... +1. Get pseudo-relevance feedback from the finetuned BERT reranker (~10 documents); +2. Chunk these pseudo-relevant documents (~100 words) and score query-chunk relevance with the same reranker; +3. Expand the query with the most relevant chunks; +4. Rerank 1000 documents with the reranker using the expanded query. - +This approach significantly outperformed BM25 + RM3 baseline in experiments (+11% NDCG@20). However, it required **11.01x** more computation than just using BERT for reranking, and reranking 1000 documents with BERT would take around 9 seconds alone. - +Query term expansion can *hypothetically* work for neural retrieval as well. New terms might shift the query vector closer to that of the desired document. However, [this approach isn’t guaranteed to succeed](https://dl.acm.org/doi/10.1145/3570724). Neural search depends entirely on embeddings, and how those embeddings are generated — consequently, how similar query and document vectors are — depends heavily on the model’s training. -https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/ +It definitely works if **query refining is done by a model operating in the same vector space**, which typically requires offline training of a retriever. +The goal is to extend the query encoder input to also include feedback documents, producing an adjusted query embedding. Examples include [`ANCE-PRF`](https://arxiv.org/pdf/2108.13454) and [`ColBERT-PRF`](https://dl.acm.org/doi/10.1145/3572405) – ANCE and ColBERT fine-tuned extensions. -2025-02-18T21:01:07+05:30 +{{
}} -... +The reason why you’re most probably not familiar with these models – their absence in the industry – is that their **training** itself is a **high upfront cost**, and even though it was “paid”, these models [struggle with generalization](https://arxiv.org/abs/2108.13454), performing poorly on out-of-domain tasks (datasets they haven’t seen during training). +Additionally, feeding an attention-based model a lengthy input (query + documents) is not a good practice in production settings (attention is quadratic in the input length), where time and money are crucial decision factors. - +Alternatively, one could skip a step — and work directly with vectors. - +### Query As Vector -https://qdrant.tech/documentation/datasets/ +Instead of modifying the initial query, a more scalable approach is to directly adjust the query vector. +It is easily applicable across modalities and suitable for both lexical and neural retrieval. -2024-11-14T18:59:28+01:00 +Although vector search has become a trend in recent years, its core principles have existed in the field for decades. For example, the SMART retrieval system used by [Rocchio](https://sigir.org/files/museum/pub-08/XXIII-1.pdf) in 1965 for his relevance feedback experiments operated on bag-of-words vector representations of text. -... +{{
}} - +**Rocchio’s idea** — to update the query vector by adding a difference between the centroids of relevant and non-relevant documents — seems to translate well to modern dual-encoder-based dense retrieval systems. +Researchers seem to agree: a study from 2022 demonstrated that the [parametrized version of Rocchio’s method](https://arxiv.org/pdf/2108.11044) in dense retrieval consistently improves Recall@1000 by 1–5%, while keeping query processing time suitable for production — around 170 ms. - +However, parameters (centroids and query weights) in the dense retrieval version of Rocchio’s method must be tuned for each dataset and, ideally, also for each request. -https://qdrant.tech/articles/detecting-coffee-anomalies/ +#### Gradient Descent-Based Methods -2024-12-20T13:10:51+01:00 +An efficient way of doing so on the fly remained an open question until the introduction of a **gradient-descent-based generalization of Rocchio’s method**: [`Test-Time Optimization of Query Representations (TOUR)`](https://arxiv.org/pdf/2205.12680). +TOUR adapts a query vector over multiple iterations of retrieval and reranking (*retrieve → rerank → gradient descent step*), guided by a reranker’s relevance judgments. -... +{{
}} - +The next iteration of gradient-based query refinement methods – [`ReFit`](https://arxiv.org/abs/2305.11744), proposed in 2024 – is a lighter, production-friendly alternative to TOUR, limiting the *retrieve → rerank → gradient descent* sequence to a single iteration. The retriever’s query vector is updated by matching the retriever’s similarity score distribution over the feedback documents to the cross-encoder’s, via [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). ReFit is model- and language-independent and stably improves the Recall@100 metric by 2–3%. - +{{
}} -https://qdrant.tech/articles/triplet-loss/ +Gradient descent-based methods seem like a production-viable option, an alternative to finetuning the retriever (distilling it from a reranker). -2024-12-20T13:10:51+01:00 +Indeed, they don't require training in advance and are compatible with any re-ranking model. -... +However, a few limitations baked into these methods have prevented broader adoption in the industry. - +The gradient descent-based methods modify elements of the query vector as if they were model parameters; therefore, +they require a substantial number of feedback documents to converge to a stable solution. - +On top of that, the gradient descent-based methods are sensitive to the choice of hyperparameters, leading to **query drift**, where the query may drift entirely away from the user's intent. - +## Similarity Scoring -https://qdrant.tech/documentation/cloud/authentication/ +{{
}} -2025-05-02T16:53:21+02:00 +Another family of approaches is built around the idea of incorporating relevance feedback directly into the similarity scoring function. +It might be desirable in cases where we want to preserve the original query intent, but still adjust the similarity score based on relevance feedback. -... +In **lexical retrieval**, this can be as simple as boosting documents that share more terms with those judged as relevant. - +Its **neural search counterpart** is a [`k-nearest neighbors-based method`](https://aclanthology.org/2022.emnlp-main.614.pdf) that adjusts the query-document similarity score by adding the sum of similarities between the candidate document and all known relevant examples. +This technique yields a significant improvement, around 5.6 percentage points in NDCG@20, but it requires explicitly labelled (by users) feedback documents to be effective. - +In experiments, the knn-based method is treated as a reranker. In all other papers, we also found that adjusting similarity scores based on relevance feedback is centred around [reranking](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/) – **training or finetuning rerankers to become relevance feedback-aware**. +Typically, experiments include cross-encoders, though [simple classifiers are also an option](https://arxiv.org/pdf/1904.08861). +These methods generally involve rescoring a broader set of documents retrieved during an initial search, guided by feedback from a smaller top-ranked subset. It is not a similarity matching function adjustment per se but rather a similarity scoring model adjustment. -https://qdrant.tech/documentation/concepts/collections/ +Methods typically fall into two categories: +1. **Training rerankers offline** to ingest relevance feedback as an additional input at inference time, [as here](https://aclanthology.org/D18-1478.pdf) — again, attention-based models and lengthy inputs: a production-deadly combination. +2. **Finetuning rerankers** on relevance feedback from the first retrieval stage, [as BaumgĂ€rtner et al. did](https://aclanthology.org/2022.emnlp-main.614.pdf), finetuning bias parameters of a small cross-encoder per query on 2k, k={2, 4, 8} feedback documents. -2025-04-07T00:40:39+02:00 +The biggest limitation here is that these reranker-based methods cannot retrieve relevant documents beyond those returned in the initial search, and using rerankers on thousands of documents in production is a no-go – it’s too expensive. +Ideally, to avoid that, a similarity scoring function updated with relevance feedback should be used directly in the second retrieval iteration. However, in every research paper we’ve come across, retrieval systems are **treated as black boxes** — ingesting queries, returning results, and offering no built-in mechanism to modify scoring. -... +## So, what are the takeaways? - +Pseudo Relevance Feedback (PRF) is known to improve the effectiveness of lexical retrievers. Several PRF-based approaches – mainly query terms expansion-based – are successfully integrated into traditional retrieval systems. At the same time, there are **no known industry-adopted analogues in neural (vector) search dedicated solutions**; neural search-compatible methods remain stuck in research papers. - +The gap we noticed while studying the field is that researchers have **no direct access to retrieval systems**, forcing them to design wrappers around the black-box-like retrieval oracles. 
This is sufficient for query-adjusting methods but not for similarity scoring function adjustment. -https://qdrant.tech/articles/data-exploration/ +Perhaps relevance feedback methods haven't made it into the neural search systems for trivial reasons — like no one having the time to find the right balance between cost and efficiency. -2024-12-20T13:10:51+01:00 +Getting it to work in a production setting means experimenting, building interfaces, and adapting architectures. Simply put, it needs to look worth it. And unlike 2D vector math, high-dimensional vector spaces are anything but intuitive. The curse of dimensionality is real. So is query drift. Even methods that make perfect sense on paper might not work in practice. -... +A real-world solution should be simple. Maybe just a little bit smarter than a rule-based approach, but still practical. It shouldn't require fine-tuning thousands of parameters or feeding paragraphs of text into transformers. **And for it to be effective, it needs to be integrated directly into the retrieval system itself.** - +<|page-25-lllmstxt|> +Any problem with even a bit of complexity requires a specialized solution. You can use a Swiss Army knife to open a bottle or poke a hole in a cardboard box, but you will need an axe to chop wood — the same goes for software. - +In this article, we will describe the unique challenges vector search poses and why a dedicated solution is the best way to tackle them. -https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/ +## Vectors +![vectors](/articles_data/dedicated-vector-search/image1.jpg) -2024-04-15T19:50:07-07:00 +Let's look at the central concept of vector databases — [**vectors**](/documentation/concepts/vectors/). -... +Vectors (also known as embeddings) are high-dimensional representations of various data points — texts, images, videos, etc. Many state-of-the-art (SOTA) embedding models generate representations of over 1,500 dimensions. When it comes to state-of-the-art PDF retrieval, the representations can reach [**over 100,000 dimensions per page**](/documentation/advanced-tutorials/pdf-retrieval-at-scale/). - +This brings us to the first challenge of vector search — vectors are heavy. - +### Vectors are Heavy -https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/ +To put this in perspective, consider one million records stored in a relational database. It's a relatively small amount of data for modern databases, which a free tier of many cloud providers could easily handle. -2025-05-15T19:37:07+05:30 +Now, generate a 1536-dimensional embedding with OpenAI's `text-embedding-ada-002` model from each record, and you are looking at around **6GB of storage**. As a result, vector search workloads, especially if not optimized, will quickly dominate the main use cases of a non-vector database. -... +Having vectors as a part of a main database is a potential issue for another reason — vectors are always a transformation of other data. - +### Vectors are a Transformation - +Vectors are obtained from some other source-of-truth data. They can be restored if lost with the same embedding model previously used. At the same time, even small changes in that model can shift the geometry of the vector space, so if you update or change the embedding model, you need to update and reindex all the data to maintain accurate vector comparisons. 
-https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/ +If coupled with the main database, this update process can lead to significant complications and even unavailability of the whole system. -2024-08-23T22:48:27+05:30 + -... +However, vectors have positive properties as well. One of the most important is that vectors are fixed-size. - +### Vectors are Fixed-Size - +Embedding models are designed to produce vectors of a fixed size. We have to use it to our advantage. -https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/ +For fast search, vectors need to be instantly accessible. Whether in [**RAM or disk**](/documentation/concepts/storage/), vectors should be stored in a format that allows quick access and comparison. This is essential, as vector comparison is a very hot operation in vector search workloads. It is often performed thousands of times per search query, so even a small overhead can lead to a significant slowdown. -2025-02-18T21:01:07+05:30 +For dedicated storage, vectors' fixed size comes as a blessing. Knowing how much space one data point needs, we don't have to deal with the usual overhead of locating data — the location of elements in storage is straightforward to calculate. -... +Everything becomes far less intuitive if vectors are stored together with other data types, for example, texts or JSONs. The size of a single data point is not fixed anymore, so accessing it becomes non-trivial, especially if data is added, updated, and deleted over time. - +{{
}} - +**Storing vectors together with other types of data, we lose all the benefits of their characteristics**; however, we fully "enjoy" their drawbacks, polluting the storage with an extremely heavy transformation of data already existing in that storage. -https://qdrant.tech/documentation/cloud/cluster-access/ +## Vector Search +![vector-search](/articles_data/dedicated-vector-search/image2.jpg) -2025-05-02T16:53:21+02:00 +Unlike traditional databases that serve as data stores, **vector databases are more like search engines**. They are designed to be **scalable**, always **available**, and capable of delivering high-speed search results even under heavy loads. Just as Google or Bing can handle billions of queries at once, vector databases are designed for scenarios where rapid, high-throughput, low-latency retrieval is a must. -... +{{
}} - +### Pick Any Two - +Distributed systems are perfect for scalability — horizontal scaling in these systems allows you to add more machines as needed. In the world of distributed systems, one well-known principle — the **CAP theorem** — illustrates that you cannot have it all. The theorem states that a distributed system can guarantee only two out of three properties: **Consistency**, **Availability**, and **Partition Tolerance**. -https://qdrant.tech/documentation/support/ +As network partitions are inevitable in any real-world distributed system, all modern distributed databases are designed with partition tolerance in mind, forcing a trade-off between **consistency** (providing the most up-to-date data) and **availability** (remaining responsive). -2025-04-08T10:25:18+02:00 + -... +There are two main design philosophies for databases in this context: - +### ACID: Prioritizing Consistency - +The ACID model ensures that every transaction (a group of operations treated as a single unit, such as transferring money between accounts) is executed fully or not at all (reverted), leaving the database in a valid state. When a system is distributed, achieving ACID properties requires complex coordination between nodes. Each node must communicate and agree on the state of a transaction, which can **limit system availability** — if a node is uncertain about the state of another, it may refuse to process a transaction until consistency is assured. This coordination also makes **scaling more challenging**. -https://qdrant.tech/documentation/send-data/databricks/ +Financial institutions use ACID-compliant databases when dealing with money transfers, where even a momentary discrepancy in an account balance is unacceptable. -2024-07-29T21:03:45+05:30 +### BASE: Prioritizing Availability -... +On the other hand, the BASE model favors high availability and partition tolerance. BASE systems distribute data and workload across multiple nodes, enabling them to respond to read and write requests immediately. They operate under the principle of **eventual consistency** — although data may be temporarily out-of-date, the system will converge on a consistent state given time. - +Social media platforms, streaming services, and search engines all benefit from the BASE approach. For these applications, having immediate responsiveness is more critical than strict consistency. - +### BASEd Vector Search -https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/ +Considering the specifics of vector search — its nature demanding availability & scalability — it should be served on BASE-oriented architecture. This choice is made due to the need for horizontal scaling, high availability, low latency, and high throughput. For example, having BASE-focused architecture allows us to [**easily manage resharding**](/documentation/cloud/cluster-scaling/#resharding). -2024-08-13T13:38:38+03:00 +A strictly consistent transactional approach also loses its attractiveness when we remember that vectors are heavy transformations of data at our disposal — what's the point in limiting data protection mechanisms if we can always restore vectorized data through a transformation? -... +## Vector Index +![vector-index](/articles_data/dedicated-vector-search/image3.jpg) - +[**Vector search**](/documentation/concepts/search/) relies on high-dimensional vector mathematics, making it computationally heavy at scale. A brute-force similarity search would require comparing a query against every vector in the database. 
In a database with 100 million 1536-dimensional vectors, performing 100 million comparisons per query is unfeasible for production scenarios. Instead of a brute-force approach, vector databases have specialized approximate nearest neighbour (ANN) indexes that balance search precision and speed. These indexes require carefully designed architectures to make their maintenance in production feasible. - +{{< figure src=/articles_data/dedicated-vector-search/hnsw.png caption="HNSW Index" width=80% >}} -https://qdrant.tech/articles/machine-learning/ +One of the most popular vector indexes is **HNSW (Hierarchical Navigable Small World)**, which we picked for its capability to provide high search speed and accuracy simultaneously. High performance came with a cost — implementing it in production is non-trivial due to several challenges, so to make it shine, the whole system architecture has to be structured around it, serving the capricious index. -2024-12-20T13:10:51+01:00 +### Index Complexity -... +[**HNSW**](/documentation/concepts/indexing/) is structured as a multi-layered graph. When a new data point is inserted, the algorithm must compare it to existing nodes across several layers to index it. As the number of vectors grows, these comparisons will noticeably slow down the construction process, making updates increasingly time-consuming. The indexing operation can quickly become the bottleneck in the system, slowing down search requests. - +Building an HNSW monolith means limiting the scalability of your solution — its size has to be capped, as its construction time scales **non-linearly** with the number of elements. To keep the construction process feasible and ensure it doesn't affect the search time, we came up with a layered architecture that breaks down all data management into small units called **segments**. - +{{
}} -https://qdrant.tech/documentation/concepts/points/ +Each segment isolates a subset of vectorized corpora and supports all collection-level operations on it, from searching to indexing, for example segments build their own index on the subset of data available to them. For users working on a collection level, the specifics of segmentation are unnoticeable. The search results they get span the whole collection, as sub-results are gathered from segments and then merged & deduplicated. -2025-04-07T00:40:39+02:00 +By balancing between size and number of segments, we can ensure the right balance between search speed and indexing time, making the system flexible for different workloads. -... +### Immutability - +With index maintenance divided between segments, Qdrant can ensure high performance even during heavy load, and additional optimizations secure that further. These optimizations come from an idea that working with immutable structures introduces plenty of benefits: the possibility of using internally fixed sized lists (so no dynamic updates), ordering stored data accordingly to access patterns (so no unpredictable random accesses). With this in mind, to optimize search speed and memory management further, we use a strategy that combines and manages [**mutable and immutable segments**](/articles/immutable-data-structures/). - +| | | +|---------------------|-------------| +| **Mutable Segments** | These are used for quickly ingesting new data and handling changes (updates) to existing data. | +| **Immutable Segments** | Once a mutable segment reaches a certain size, an optimization process converts it into an immutable segment, constructing an HNSW index – you could [**read about these optimizers here**](/documentation/concepts/optimizer/#optimizer) in detail. This immutability trick allowed us, for example, to ensure effective [**tenant isolation**](/documentation/concepts/indexing/#tenant-index). | -https://qdrant.tech/documentation/concepts/vectors/ +Immutable segments are an implementation detail transparent for users — they can delete vectors at any time, while additions and updates are applied to a mutable segment instead. This combination of mutability and immutability allows search and indexing to smoothly run simultaneously, even under heavy loads. This approach minimizes the performance impact of indexing time and allows on-the-fly configuration changes on a collection level (such as enabling or disabling data quantization) without downtimes. -2025-04-07T00:40:39+02:00 +### Filterable Index -... +Vector search wasn't historically designed for filtering — imposing strict constraints on results. It's inherently fuzzy; every document is, to some extent, both similar and dissimilar to any query — there's no binary "*fits/doesn't fit*" segregation. As a result, vector search algorithms weren't originally built with filtering in mind. - +At the same time, filtering is unavoidable in many vector search applications, such as [**e-commerce search/recommendations**](/recommendations/). Searching for a Christmas present, you might want to filter out everything over 100 euros while still benefiting from the vector search's semantic nature. - +In many vector search solutions, filtering is approached in two ways: **pre-filtering** (computes a binary mask for all vectors fitting the condition before running HNSW search) or **post-filtering** (running HNSW as usual and then filtering the results). 
-https://qdrant.tech/documentation/concepts/payload/ +| | | | +|----|------------------|---------| +| ❌ | **Pre-filtering** | Has the linear complexity of computing the vector mask and becomes a bottleneck for large datasets. | +| ❌ | **Post-filtering** | The problem with **post-filtering** is tied to vector search "*everything fits and doesn't at the same time*" nature: imagine a low-cardinality filter that leaves only a few matching elements in the database. If none of them are similar enough to the query to appear in the top-X retrieved results, they'll all be filtered out. | -2025-04-07T00:40:39+02:00 +Qdrant [**took filtering in vector search further**](/articles/vector-search-filtering/), recognizing the limitations of pre-filtering & post-filtering strategies. We developed an adaptation of HNSW — [**filterable HNSW**](/articles/filtrable-hnsw/) — that also enables **in-place filtering** during graph traversal. To make this possible, we condition HNSW index construction on possible filtering conditions reflected by [**payload indexes**](/documentation/concepts/indexing/#payload-index) (inverted indexes built on vectors' [**metadata**](/documentation/concepts/payload/)). -... +**Qdrant was designed with a vector index being a central component of the system.** That made it possible to organize optimizers, payload indexes and other components around the vector index, unlocking the possibility of building a filterable HNSW. - +{{
}} - +In general, optimizing vector search requires a custom, finely tuned approach to data and index management that secures high performance even as data grows and changes dynamically. This specialized architecture is the key reason why **dedicated vector databases will always outperform general-purpose databases in production settings**. -https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/ +## Vector Search Beyond RAG -2024-07-22T17:09:17-07:00 +{{
}} -... +Many discussions about the purpose of vector databases focus on Retrieval-Augmented Generation (RAG) — or its more advanced variant, agentic RAG — where vector databases are used as a knowledge source to retrieve context for large language models (LLMs). This is a legitimate use case, however, the hype wave of RAG solutions has overshadowed the broader potential of vector search, which goes [**beyond augmenting generative AI**](/articles/vector-similarity-beyond-search/). - +### Discovery - +The strength of vector search lies in its ability to facilitate [**discovery**](/articles/discovery-search/). Vector search allows you to refine your choices as you search rather than starting with a fixed query. Say, [**you're ordering food not knowing exactly what you want**](/articles/food-discovery-demo/) — just that it should contain meat & not a burger, or that it should be meat with cheese & not tacos. Instead of searching for a specific dish, vector search helps you navigate options based on similarity and dissimilarity, guiding you toward something that matches your taste without requiring you to define it upfront. -https://qdrant.tech/articles/neural-search-tutorial/ +### Recommendations -2024-12-20T13:10:51+01:00 +Vector search is perfect for [**recommendations**](/documentation/concepts/explore/#recommendation-api). Imagine browsing for a new book or movie. Instead of searching for an exact match, you might look for stories that capture a certain mood or theme but differ in key aspects from what you already know. For example, you may [**want a film featuring wizards without the familiar feel of the "Harry Potter" series**](https://www.youtube.com/watch?v=O5mT8M7rqQQ). This flexibility is possible because vector search is not tied to the binary "match/not match" concept but operates on distances in a vector space. -... +### Big Unstructured Data Analysis - +Vector search nature makes it also ideal for [**big unstructured data analysis**](https://www.youtube.com/watch?v=_BQTnXpuH-E), for instance, anomaly detection. In large, unstructured, and often unlabelled datasets, vector search can help identify clusters and outliers by analyzing distance relationships between data points. - +### Fundamentally Different -https://qdrant.tech/articles/rag-and-genai/ +**Vector search beyond RAG isn't just another feature — it's a fundamental shift in how we interact with data**. Dedicated solutions integrate these capabilities natively and are designed from the ground up to handle high-dimensional math and (dis-)similarity-based retrieval. In contrast, databases with vector extensions are built around a different data paradigm, making it impossible to efficiently support advanced vector search capabilities. -2024-12-20T13:10:51+01:00 +Even if you want to retrofit these capabilities, it's not just a matter of adding a new feature — it's a structural problem. Supporting advanced vector search requires **dedicated interfaces** that enable flexible usage of vector search from multi-stage filtering to dynamic exploration of high-dimensional spaces. -... +When the underlying architecture wasn't initially designed for this kind of interaction, integrating interfaces is a **software engineering team nightmare**. You end up breaking existing assumptions, forcing inefficient workarounds, and often introducing backwards-compatibility problems. It's why attempts to patch vector search onto traditional databases won't match the efficiency of purpose-built systems. 
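As an illustration of the kind of dedicated interface discussed here, this is roughly how a recommendation-by-example query looks in Qdrant's Python client. It is a sketch, assuming qdrant-client 1.10+ and a running instance; the collection name and point IDs are made up.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# "More like these two dishes, less like that one": recommendation by example,
# expressed directly against the vector index rather than bolted on afterwards.
results = client.query_points(
    collection_name="dishes",
    query=models.RecommendQuery(
        recommend=models.RecommendInput(positive=[17, 23], negative=[8])
    ),
    limit=5,
)

for point in results.points:
    print(point.id, point.score)
```

The positive and negative examples are combined inside the engine itself, so there is no client-side post-processing of a plain similarity search.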
- +## Making Vector Search State-of-the-Art +![vector-search-state-of-the-art](/articles_data/dedicated-vector-search/image4.jpg) - +Now, let's shift focus to another key advantage of dedicated solutions — their ability to keep up with state-of-the-art solutions in the field. -https://qdrant.tech/documentation/cloud/cluster-scaling/ +[**Vector databases**](/qdrant-vector-database/) are purpose-built for vector retrieval, and as a result, they offer cutting-edge features that are often critical for AI businesses relying on vector search. Vector database engineers invest significant time and effort into researching and implementing the most optimal ways to perform vector search. Many of these innovations come naturally to vector-native architectures, while general-purpose databases with added vector capabilities may struggle to adapt and replicate these benefits efficiently. -2025-05-02T16:53:21+02:00 +Consider some of the advanced features implemented in Qdrant: -... +- [**GPU-Accelerated Indexing**](/blog/qdrant-1.13.x/#gpu-accelerated-indexing) + By offloading index construction tasks to the GPU, Qdrant can significantly speed up the process of data indexing while keeping costs low. This becomes especially valuable when working with large datasets in hot data scenarios. + + GPU acceleration in Qdrant is a custom solution developed by an enthusiast from our core team. It's vendor-free and natively supports all Qdrant's unique architectural features, from FIlterable HNSW to multivectors. - +- [**Multivectors**](/documentation/concepts/vectors/?q=multivectors#multivectors) + Some modern embedding models produce an entire matrix (a list of vectors) as output rather than a single vector. Qdrant supports multivectors natively. + + This feature is critical when using state-of-the-art retrieval models such as [**ColBERT**](/documentation/fastembed/fastembed-colbert/), ColPali, or ColQwen. For instance, ColPali and ColQwen produce multivector outputs, and supporting them natively is crucial for [**state-of-the-art (SOTA) PDF-retrieval**](/documentation/advanced-tutorials/pdf-retrieval-at-scale/). - +In addition to that, we continuously look for improvements in: -https://qdrant.tech/documentation/concepts/search/ +| | | +|----------------------------------|-------------| +| **Memory Efficiency & Compression** | Techniques such as [**quantization**](documentation/guides/quantization/) and [**HNSW compression**](/blog/qdrant-1.13.x/#hnsw-graph-compression) to reduce storage requirements | +| **Retrieval Algorithms** | Support for the latest retrieval algorithms, including [**sparse neural retrieval**](/articles/modern-sparse-neural-retrieval/), [**hybrid search**](/documentation/concepts/hybrid-queries/) methods, and [**re-rankers**](/documentation/fastembed/fastembed-rerankers/). | +| **Vector Data Analysis & Visualization** | Tools like the [**distance matrix API**](/blog/qdrant-1.12.x/#distance-matrix-api-for-data-insights) provide insights into vectorized data, and a [**Web UI**](/blog/qdrant-1.11.x/#web-ui-search-quality-tool) allows for intuitive exploration of data. | +| **Search Speed & Scalability** | Includes optimizations for [**multi-tenant environments**](/articles/multitenancy/) to ensure efficient and scalable search. | -2025-04-07T00:40:39+02:00 +**These advancements are not just incremental improvements — they define the difference between a system optimized for vector search and one that accommodates it.** -... 
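For a flavour of how one of these features surfaces in practice, here is a hedged sketch of creating a multivector collection with the Python client (qdrant-client 1.10+ assumed; the collection name and per-token vector size are illustrative).

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Each point stores a matrix of per-token vectors (e.g. ColBERT-style output);
# stored matrices are compared with the MaxSim operator used by late interaction models.
client.create_collection(
    collection_name="colbert_docs",
    vectors_config=models.VectorParams(
        size=128,  # per-token vector size of the chosen model
        distance=models.Distance.COSINE,
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
    ),
)
```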
+Staying at the cutting edge of vector search is not just about performance — it's also about keeping pace with an evolving AI landscape. - +## Summing up +![conclusion-vector-search](/articles_data/dedicated-vector-search/image5.jpg) - +When it comes to vector search, there's a clear distinction between using a dedicated vector search solution and extending a database to support vector operations. -https://qdrant.tech/documentation/concepts/explore/ +**For small-scale applications or prototypes handling up to a million data points, a non-optimized architecture might suffice.** However, as the volume of vectors grows, an unoptimized solution will quickly become a bottleneck — slowing down search operations and limiting scalability. Dedicated vector search solutions are engineered from the ground up to handle massive amounts of high-dimensional data efficiently. -2025-06-12T10:45:50-04:00 +State-of-the-art (SOTA) vector search evolves rapidly. If you plan to build on the latest advances, using a vector extension will eventually hold you back. Dedicated vector search solutions integrate these features natively, ensuring that you benefit from continuous innovations without compromising performance. -... +The power of vector search extends into areas such as big data analysis, recommendation systems, and discovery-based applications, and to support these vector search capabilities, a dedicated solution is needed. - +### When to Choose a Dedicated Database over an Extension: - +- **High-Volume, Real-Time Search**: Ideal for applications with many simultaneous users who require fast, continuous access to search results—think search engines, e-commerce recommendations, social media, or media streaming services. +- **Dynamic, Unstructured Data**: Perfect for scenarios where data is continuously evolving and where the goal is to discover insights from data patterns. +- **Innovative Applications**: If you're looking to implement advanced use cases such as recommendation engines, hybrid search solutions, or exploratory data analysis where traditional exact or token-based searches hold short. -https://qdrant.tech/documentation/cloud/cluster-monitoring/ +Investing in a dedicated vector search engine will deliver the performance and flexibility necessary for success if your application relies on vector search at scale, keeps up with trends, or requires more than just a simple small-scale similarity search. -2025-05-02T16:53:21+02:00 +<|page-26-lllmstxt|> +\* At least any open-source model, since you need access to its internals. -... +## You Can Adapt Dense Embedding Models for Late Interaction - +Qdrant 1.10 introduced support for multi-vector representations, with late interaction being a prominent example of this model. In essence, both documents and queries are represented by multiple vectors, and identifying the most relevant documents involves calculating a score based on the similarity between the corresponding query and document embeddings. If you're not familiar with this paradigm, our updated [Hybrid Search](/articles/hybrid-search/) article explains how multi-vector representations can enhance retrieval quality. - +**Figure 1:** We can visualize late interaction between corresponding document-query embedding pairs. 
-https://qdrant.tech/documentation/cloud/cluster-upgrades/ +![Late interaction model](/articles_data/late-interaction-models/late-interaction.png) -2025-05-02T16:53:21+02:00 +There are many specialized late interaction models, such as [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/), but **it appears that regular dense embedding models can also be effectively utilized in this manner**. -... +> In this study, we will demonstrate that standard dense embedding models, traditionally used for single-vector representations, can be effectively adapted for late interaction scenarios using output token embeddings as multi-vector representations. - +By testing out retrieval with Qdrant’s multi-vector feature, we will show that these models can rival or surpass specialized late interaction models in retrieval performance, while offering lower complexity and greater efficiency. This work redefines the potential of dense models in advanced search pipelines, presenting a new method for optimizing retrieval systems. - +## Understanding Embedding Models -https://qdrant.tech/documentation/concepts/hybrid-queries/ +The inner workings of embedding models might be surprising to some. The model doesn’t operate directly on the input text; instead, it requires a tokenization step to convert the text into a sequence of token identifiers. Each token identifier is then passed through an embedding layer, which transforms it into a dense vector. Essentially, the embedding layer acts as a lookup table that maps token identifiers to dense vectors. These vectors are then fed into the transformer model as input. -2025-04-23T11:15:58+02:00 +**Figure 2:** The tokenization step, which takes place before vectors are added to the transformer model. -... +![Input token embeddings](/articles_data/late-interaction-models/input-embeddings.png) - +The input token embeddings are context-free and are learned during the model’s training process. This means that each token always receives the same embedding, regardless of its position in the text. At this stage, the token embeddings are unaware of the context in which they appear. It is the transformer model’s role to contextualize these embeddings. - +Much has been discussed about the role of attention in transformer models, but in essence, this mechanism is responsible for capturing cross-token relationships. Each transformer module takes a sequence of token embeddings as input and produces a sequence of output token embeddings. Both sequences are of the same length, with each token embedding being enriched by information from the other token embeddings at the current step. -https://qdrant.tech/articles/filtrable-hnsw/ +**Figure 3:** The mechanism that produces a sequence of output token embeddings. -2024-12-20T13:10:51+01:00 +![Output token embeddings](/articles_data/late-interaction-models/output-embeddings.png) -... +**Figure 4:** The final step performed by the embedding model is pooling the output token embeddings to generate a single vector representation of the input text. - +![Pooling](/articles_data/late-interaction-models/pooling.png) - +There are several pooling strategies, but regardless of which one a model uses, the output is always a single vector representation, which inevitably loses some information about the input. It’s akin to giving someone detailed, step-by-step directions to the nearest grocery store versus simply pointing in the general direction. 
While the vague direction might suffice in some cases, the detailed instructions are more likely to lead to the desired outcome.

## Using Output Token Embeddings for Multi-Vector Representations

We often overlook the output token embeddings, but the fact is—they also serve as multi-vector representations of the input text. So, why not explore their use in a multi-vector retrieval model, similar to late interaction models?

### Experimental Findings

We conducted several experiments to determine whether output token embeddings could be effectively used in place of traditional late interaction models. The results are quite promising.
| Dataset | Model | Experiment | NDCG@10 |
|----------|------------------------------|------------------------------------|---------|
| SciFact | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.70928 |
| SciFact | `colbert-ir/colbertv2.0` | late interaction model | 0.69579 |
| SciFact | `all-MiniLM-L6-v2` | single dense vector representation | 0.64508 |
| SciFact | `all-MiniLM-L6-v2` | output token embeddings | 0.70724 |
| SciFact | `BAAI/bge-small-en` | single dense vector representation | 0.68213 |
| SciFact | `BAAI/bge-small-en` | output token embeddings | 0.73696 |
| NFCorpus | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.34166 |
| NFCorpus | `colbert-ir/colbertv2.0` | late interaction model | 0.35036 |
| NFCorpus | `all-MiniLM-L6-v2` | single dense vector representation | 0.31594 |
| NFCorpus | `all-MiniLM-L6-v2` | output token embeddings | 0.35779 |
| NFCorpus | `BAAI/bge-small-en` | single dense vector representation | 0.29696 |
| NFCorpus | `BAAI/bge-small-en` | output token embeddings | 0.37502 |
| ArguAna | `prithivida/Splade_PP_en_v1` | sparse vectors | 0.47271 |
| ArguAna | `colbert-ir/colbertv2.0` | late interaction model | 0.44534 |
| ArguAna | `all-MiniLM-L6-v2` | single dense vector representation | 0.50167 |
| ArguAna | `all-MiniLM-L6-v2` | output token embeddings | 0.45997 |
| ArguAna | `BAAI/bge-small-en` | single dense vector representation | 0.58857 |
| ArguAna | `BAAI/bge-small-en` | output token embeddings | 0.57648 |
The [source code for these experiments is open-source](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_all_exact.py) and utilizes [`beir-qdrant`](https://github.com/kacperlukawski/beir-qdrant), an integration of Qdrant with the [BeIR library](https://github.com/beir-cellar/beir). While this package is not officially maintained by the Qdrant team, it may prove useful for those interested in experimenting with various Qdrant configurations to see how they impact retrieval quality. All experiments were conducted using Qdrant in exact search mode, ensuring the results are not influenced by approximate search.

Even the simple `all-MiniLM-L6-v2` model can be applied in a late interaction model fashion, resulting in a positive impact on retrieval quality. However, the best results were achieved with the `BAAI/bge-small-en` model, which outperformed both sparse and late interaction models.

It's important to note that ColBERT has not been trained on BeIR datasets, making its performance fully out of domain. Nevertheless, the `all-MiniLM-L6-v2` [training dataset](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#training-data) also lacks any BeIR data, yet it still performs remarkably well.
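For context, the multi-vector comparison behind these numbers is the MaxSim operation, which Qdrant exposes as the `MAX_SIM` comparator used later in this article: each query token embedding is matched with its most similar document token embedding, and those similarities are summed. A minimal NumPy sketch of that scoring (the function name and the cosine normalization are our own illustration, not part of the benchmark code):

```python
import numpy as np

def max_sim(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    """Late interaction (MaxSim) score between two multi-vector representations.

    query_tokens: (num_query_tokens, dim); doc_tokens: (num_doc_tokens, dim).
    """
    # Normalize both sides so the dot product becomes cosine similarity.
    query_tokens = query_tokens / np.linalg.norm(query_tokens, axis=1, keepdims=True)
    doc_tokens = doc_tokens / np.linalg.norm(doc_tokens, axis=1, keepdims=True)
    similarities = query_tokens @ doc_tokens.T
    # For each query token, keep its best-matching document token, then sum.
    return float(similarities.max(axis=1).sum())
```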
## Comparative Analysis of Dense vs. Late Interaction Models

The retrieval quality speaks for itself, but there are other important factors to consider.

The traditional dense embedding models we tested are less complex than late interaction or sparse models. With fewer parameters, these models are expected to be faster during inference and more cost-effective to maintain. Below is a comparison of the models used in the experiments:

| Model | Number of parameters |
|------------------------------|----------------------|
| `prithivida/Splade_PP_en_v1` | 109,514,298 |
| `colbert-ir/colbertv2.0` | 109,580,544 |
| `BAAI/bge-small-en` | 33,360,000 |
| `all-MiniLM-L6-v2` | 22,713,216 |

One argument against using output token embeddings is the increased storage requirements compared to ColBERT-like models. For instance, the `all-MiniLM-L6-v2` model produces 384-dimensional output token embeddings, which is three times more than the 128-dimensional embeddings generated by ColBERT-like models. This increase not only leads to higher memory usage but also impacts the computational cost of retrieval, as calculating distances takes more time. Mitigating this issue through vector compression would make a lot of sense.

## Exploring Quantization for Multi-Vector Representations

Binary quantization is generally more effective for high-dimensional vectors, making the `all-MiniLM-L6-v2` model, with its relatively low-dimensional outputs, less ideal for this approach. However, scalar quantization appeared to be a viable alternative. The table below summarizes the impact of quantization on retrieval quality.
| Dataset | Model | Experiment | NDCG@10 |
|----------|--------------------|----------------------------------|---------|
| SciFact | `all-MiniLM-L6-v2` | output token embeddings | 0.70724 |
| SciFact | `all-MiniLM-L6-v2` | output token embeddings (uint8) | 0.70297 |
| NFCorpus | `all-MiniLM-L6-v2` | output token embeddings | 0.35779 |
| NFCorpus | `all-MiniLM-L6-v2` | output token embeddings (uint8) | 0.35572 |
It’s important to note that quantization doesn’t always preserve retrieval quality at the same level, but in this case, scalar quantization appears to have minimal impact on retrieval performance. The effect is negligible, while the memory savings are substantial.

We managed to maintain the original quality while using four times less memory. Additionally, a quantized vector requires 384 bytes, compared to ColBERT’s 512 bytes. This results in a 25% reduction in memory usage, with retrieval quality remaining nearly unchanged.
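The uint8 variant in the table maps onto Qdrant's scalar quantization. As a rough sketch of how it could be enabled with the Python client, reusing the multivector layout created later in this article (the collection name here is purely illustrative):

```python
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")

client.create_collection(
    collection_name="my-collection-quantized",  # illustrative name
    vectors_config={
        "output-token-embeddings": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    },
    # Store the token embeddings as int8 (roughly 4x smaller than float32) and keep
    # the quantized vectors in RAM so scoring stays fast.
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,
            always_ram=True,
        )
    ),
)
```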
## Practical Application: Enhancing Retrieval with Dense Models

If you’re using one of the sentence transformer models, the output token embeddings are calculated by default. While a single vector representation is more efficient in terms of storage and computation, there’s no need to discard the output token embeddings. According to our experiments, these embeddings can significantly enhance retrieval quality. You can store both the single vector and the output token embeddings in Qdrant, using the single vector for the initial retrieval step and then reranking the results with the output token embeddings.

**Figure 5:** A single model pipeline that relies solely on the output token embeddings for reranking.

![Single model reranking](/articles_data/late-interaction-models/single-model-reranking.png)

To demonstrate this concept, we implemented a simple reranking pipeline in Qdrant. This pipeline uses a dense embedding model for the initial oversampled retrieval and then relies solely on the output token embeddings for the reranking step.

### Single Model Retrieval and Reranking Benchmarks

Our tests focused on using the same model for both retrieval and reranking. The reported metric is NDCG@10. In all tests, we applied an oversampling factor of 5x, meaning the retrieval step returned 50 results, which were then narrowed down to 10 during the reranking step. Below are the results for some of the BeIR datasets:
| Dataset | `all-MiniLM-L6-v2` (dense embeddings only) | `all-MiniLM-L6-v2` (dense + reranking) | `BAAI/bge-small-en` (dense embeddings only) | `BAAI/bge-small-en` (dense + reranking) |
|-------------|---------|---------|---------|---------|
| SciFact | 0.64508 | 0.70293 | 0.68213 | 0.73053 |
| NFCorpus | 0.31594 | 0.34297 | 0.29696 | 0.35996 |
| ArguAna | 0.50167 | 0.45378 | 0.58857 | 0.57302 |
| Touche-2020 | 0.16904 | 0.19693 | 0.13055 | 0.19821 |
| TREC-COVID | 0.47246 | 0.6379 | 0.45788 | 0.53539 |
| FiQA-2018 | 0.36867 | 0.41587 | 0.31091 | 0.39067 |
The source code for the benchmark is publicly available, and [you can find it in the repository of the `beir-qdrant` package](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_reranking.py).

Overall, adding a reranking step using the same model typically improves retrieval quality. However, the quality of various late interaction models is [often reported based on their reranking performance when BM25 is used for the initial retrieval](https://huggingface.co/mixedbread-ai/mxbai-colbert-large-v1#1-reranking-performance). This experiment aimed to demonstrate how a single model can be effectively used for both retrieval and reranking, and the results are quite promising.
Now, let’s explore how to implement this using the new Query API introduced in Qdrant 1.10.

## Setting Up Qdrant for Late Interaction

The new Query API in Qdrant 1.10 enables the construction of even more complex retrieval pipelines. We can use the single vector created after pooling for the initial retrieval step and then rerank the results using the output token embeddings.

Assuming the collection is named `my-collection` and is configured to store two named vectors: `dense-vector` and `output-token-embeddings`, here’s how such a collection could be created in Qdrant:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")

client.create_collection(
    collection_name="my-collection",
    vectors_config={
        "dense-vector": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
        ),
        "output-token-embeddings": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    }
)
```

Both vectors are of the same size since they are produced by the same `all-MiniLM-L6-v2` model.

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
```

Now, instead of using the search API with just a single dense vector, we can create a reranking pipeline. First, we retrieve 50 results using the dense vector, and then we rerank them using the output token embeddings to obtain the top 10 results.

```python
query = "What else can be done with just all-MiniLM-L6-v2 model?"

client.query_points(
    collection_name="my-collection",
    prefetch=[
        # Prefetch the dense embeddings of the top-50 documents
        models.Prefetch(
            query=model.encode(query).tolist(),
            using="dense-vector",
            limit=50,
        )
    ],
    # Rerank the top-50 documents retrieved by the dense embedding model
    # and return just the top-10. Please note we call the same model, but
    # we ask for the token embeddings by setting the output_value parameter.
    query=model.encode(query, output_value="token_embeddings").tolist(),
    using="output-token-embeddings",
    limit=10,
)
```

## Try the Experiment Yourself

In a real-world scenario, you might take it a step further by first calculating the token embeddings and then performing pooling to obtain the single vector representation. This approach allows you to complete everything in a single pass.

The simplest way to start experimenting with building complex reranking pipelines in Qdrant is by using the forever-free cluster on [Qdrant Cloud](https://cloud.qdrant.io/) and reading [Qdrant's documentation](/documentation/).

The [source code for these experiments is open-source](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_all_exact.py) and uses [`beir-qdrant`](https://github.com/kacperlukawski/beir-qdrant), an integration of Qdrant with the [BeIR library](https://github.com/beir-cellar/beir).

## Future Directions and Research Opportunities

The initial experiments using output token embeddings in the retrieval process have yielded promising results.
However, we plan to conduct further benchmarks to validate these findings and explore the incorporation of sparse methods for the initial retrieval. Additionally, we aim to investigate the impact of quantization on multi-vector representations and its effects on retrieval quality. Finally, we will assess retrieval speed, a crucial factor for many applications. -2025-06-12T09:02:54+03:00 +<|page-27-lllmstxt|> +# Optimizing Memory Consumption During Bulk Uploads -... +Efficient memory management is a constant challenge when you’re dealing with **large-scale vector data**. In high-volume ingestion scenarios, even seemingly minor configuration choices can significantly impact stability and performance. - +Let’s take a look at the best practices and recommendations to help you optimize memory usage during bulk uploads in Qdrant. We'll cover scenarios with both **dense** and **sparse** vectors, helping your deployments remain performant even under high load and avoiding out-of-memory errors. - -https://qdrant.tech/documentation/guides/quantization/ +## Indexing for dense vs. sparse vectors -2025-04-07T00:40:39+02:00 +**Dense vectors** -... +Qdrant employs an **HNSW-based index** for fast similarity search on dense vectors. By default, HNSW is built or updated once the number of **unindexed** vectors in a segment exceeds a set `indexing_threshold`. Although it delivers excellent query speed, building or updating the HNSW graph can be **resource-intensive** if it occurs frequently or across many small segments. - +**Sparse vectors** - +Sparse vectors use an **inverted index**. This index is updated at the **time of upsertion**, meaning you cannot disable or postpone it for sparse vectors. In most cases, its overhead is smaller than that of building an HNSW graph, but you should still be aware that each upsert triggers a sparse index update. -https://qdrant.tech/documentation/guides/monitoring/ + -2025-02-11T18:21:40+01:00 +## Bulk upload configuration for dense vectors -... +When performing high-volume vector ingestion, you have **two primary options** for handling indexing overhead. You should choose one depending on your specific workload and memory constraints: - +- **Disable HNSW indexing** - +To reduce memory and CPU pressure during bulk ingestion, you can **disable HNSW indexing entirely** by setting `"m": 0`. +For dense vectors, the `m` parameter defines how many edges each node in the HNSW graph can have. +This way, no dense vector index will be built, preventing unnecessary CPU usage during ingestion. -https://qdrant.tech/documentation/guides/configuration/ +**Figure 1:** A description of three key HNSW parameters. -2025-02-04T11:00:51+01:00 + -... - +```json +PATCH /collections/your_collection +{ + "hnsw_config": { + "m": 0 + } +} +``` - +**After ingestion is complete**, you can **re-enable HNSW** by setting `m` back to a production value (commonly 16 or 32). +Remember that search won't use HNSW until the index is built, so search performance may be slower during this period. -https://qdrant.tech/documentation/guides/security/ +- **Disabling optimizations completely** -2025-01-20T16:32:23+01:00 +The `indexing_threshold` tells Qdrant how many unindexed dense vectors can accumulate in a segment before building the HNSW graph. Setting `"indexing_threshold"=0` defers indexing entirely, keeping **ingestion speed at maximum**. However, this means uploaded vectors are not moved to disk while uploading, which can lead to **high RAM usage**. -... 
+```json +PATCH /collections/your_collection +{ + "optimizer_config": { + "indexing_threshold": 0 + } +} +``` + - +After bulk ingestion, set `indexing_threshold` to a positive value to ensure vectors are indexed and searchable via HNSW. **Vectors will not be searchable via HNSW until indexing is performed.** - +Small thresholds (e.g., 100) mean more frequent indexing, which can still be costly if many segments exist. Larger thresholds (e.g., 10000) delay indexing to batch more vectors at once, potentially using more RAM at the moment of index build, but fewer builds overall. -https://qdrant.tech/documentation/guides/usage-statistics/ +Between these two approaches, we generally recommend disabling HNSW (`"m"=0`) during bulk ingestion to keep memory usage predictable. Using `indexing_threshold=0` can be an alternative, but only if your system has enough memory to accommodate the unindexed vectors in RAM. -2024-12-03T17:03:30+01:00 +--- -... +## On-Disk storage in Qdrant - +By default, Qdrant keeps **vectors**, **payload data**, and **indexes** in memory to ensure low-latency queries. However, in large-scale or memory-constrained scenarios, you can configure some or all of them to be stored on-disk. This helps reduce RAM usage at the cost of potential increases in query latency, particularly for cold reads. - +**When to use on-disk**: +- You have **very large** or **rarely used** payload data or indexes, and freeing up RAM is worth potential I/O overhead. +- Your dataset doesn’t fit comfortably in available memory. +- You want to reduce memory pressure. +- You can tolerate slower queries if it ensures the system remains stable under heavy loads. -https://qdrant.tech/documentation/guides/common-errors/ +--- -2025-05-27T12:04:07+02:00 +## Memmap storage and segmentation -... +Qdrant uses **memory-mapped files** (segments) to store data on-disk. Rather than loading all vectors into RAM, Qdrant maps each segment into its address space, paging data in and out on demand. This helps keep the active RAM footprint lower, because data can be paged out if memory pressure is high. But each segment still incurs overhead (metadata, page table entries, etc.). - +During **high-volume ingestion**, you can accumulate dozens of small segments. Qdrant’s **optimizer** can later merge these into fewer, larger segments, reducing per-segment overhead and lowering total memory usage. - +When you create a collection with `"on_disk": true`, Qdrant will store newly inserted vectors in memmap storage from the start. For example: -https://qdrant.tech/documentation/database-tutorials/migration/ +```json +PATCH /collections/your_collection +{ + "vectors": { + "on_disk": true + } +} +``` -2025-06-11T18:57:35+03:00 +This approach immediately places all incoming vectors on disk, which can be very efficient in case of bulk ingestion. -... +However, **vector data and indexes are stored separately**, so enabling `on_disk` for vectors does not automatically store their indexes on disk. To fully optimize memory usage, you may need to configure **both vector storage and index storage** independently. 
- +For dense vectors, you can enable on-disk storage for both the **vector data** and the **HNSW index**: - +```json +PATCH /collections/your_collection +{ + "vectors": { + "on_disk": true + }, + "hnsw_config": { + "on_disk": true + } +} +``` +For sparse vectors, you need to enable `on_disk` for both the vector data and the sparse index separately: -https://qdrant.tech/blog/hybrid-cloud-vultr/ +```json +PATCH /collections/your_collection +{ + "sparse_vectors": { + "text": { + "on_disk": true, + "index": { + "on_disk": true + } + } + } +} +``` -2024-05-21T10:11:09+02:00 +--- -... +## **Best practices for high-volume vector ingestion** - +Bulk ingestion can lead to high memory consumption and even out-of-memory (OOM) errors. **If you’re experiencing out-of-memory errors with your current setup**, scaling up temporarily (increasing available RAM) will provide a buffer while you adjust Qdrant’s configuration for more a efficient data ingestion. - +The key here is to control indexing overhead. Let’s walk through the best practices for high-volume vector ingestion in a constrained-memory environment. -https://qdrant.tech/articles/quantum-quantization/ +### 1. Store vector data on disk immediately -2023-07-13T01:45:36+02:00 +The most effective way to reduce memory usage is to store vector data on disk right from the start using `on_disk: true`. This prevents RAM from being overloaded with raw vectors before optimization kicks in. -... +```json +PATCH /collections/your_collection +{ + "vectors": { + "on_disk": true + } +} +``` - +Previously, vector data had to be held in RAM until optimizers could move it to disk, which caused significant memory pressure. Now, by writing vectors to disk directly, memory overhead is significantly reduced, making bulk ingestion much more efficient. - +### 2. Disable HNSW for dense vectors (`m=0`) -https://qdrant.tech/blog/hybrid-cloud-stackit/ +During an **initial bulk load**, you can **disable** dense indexing by setting `"m": 0.` This ensures Qdrant won’t build an HNSW graph for incoming vectors, avoiding unnecessary memory and CPU usage. -2024-05-21T10:11:09+02:00 +```json +PATCH /collections/your_collection +{ + "hnsw_config": { + "m": 0 + }, + "optimizer_config": { + "indexing_threshold": 10000 + } +} +``` -... + - - +### 3. Let the optimizer run **after** bulk uploads -https://qdrant.tech/blog/hybrid-cloud-scaleway/ +Qdrant’s optimizers continuously restructure data to improve search efficiency. However, during a bulk upload, this can lead to excessive data movement and overhead as segments are constantly reorganized while new data is still arriving. -2024-05-21T10:11:09+02:00 +To avoid this, **upload all data first**, then allow the optimizer to process everything in one go. This minimizes redundant operations and ensures a more efficient segment structure. -... +### **4. Wait for indexation to clear up memory** - +Before performing additional operations, **allow Qdrant to finish any ongoing indexing**. Large indexing jobs can keep memory usage high until they fully complete. - +Monitor Qdrant logs or metrics to confirm when indexing finishes—once that happens, memory consumption should drop as intermediate data structures are freed. -https://qdrant.tech/blog/hybrid-cloud-red-hat-openshift/ +### 5. Re-enable HNSW post-ingestion -2024-05-21T10:11:09+02:00 +After the ingestion phase is over and memory usage has stabilized, re-enable HNSW for dense vectors by setting `m` back to a production value (commonly `16` or `32`): -... 
+```json +PATCH /collections/your_collection +{ + "hnsw_config": { + "m": 16 + } +} +``` + - +### 5. Enable quantization - +If you had planned to store all dense vectors on disk, be aware that searches can slow down drastically due to frequent disk I/O while memory pressure is high. A more balanced approach is **scalar quantization**: compress vectors (e.g., to `int8`) so they fit in RAM without occupying as much space as full floating-point values. -https://qdrant.tech/blog/hybrid-cloud-ovhcloud/ +```json +PATCH /collections/your_collection +{ + "quantization_config": { + "scalar": { + "type": "int8", + "always_ram": true + } + } +} +``` +Quantized vectors remain **in-memory** yet consume less space, preserving much of the performance advantage of RAM-based search. Learn more about [vector quantization](https://qdrant.tech/articles/what-is-vector-quantization/). -2024-05-21T10:11:09+02:00 +### Conclusion -... +High-volume vector ingestion can place significant memory demands on Qdrant, especially if dense vectors are indexed in real time. By following these tips, you can substantially reduce the risk of out-of-memory errors and maintain stable performance in a memory-limited environment. - +As always, monitor your system’s behavior. Review logs, watch metrics, and keep an eye on memory usage. Each workload is different, so it’s wise to fine-tune Qdrant’s parameters according to your hardware and data scale. - +<|page-28-lllmstxt|> +## Why We Built Our Own Storage Engine -https://qdrant.tech/blog/hybrid-cloud-llamaindex/ +Databases need a place to store and retrieve data. That’s what Qdrant's [**key-value storage**](https://en.wikipedia.org/wiki/Key–value_database) does—it links keys to values. -2024-05-21T10:11:09+02:00 +When we started building Qdrant, we needed to pick something ready for the task. So we chose [**RocksDB**](https://rocksdb.org) as our embedded key-value store. +
It is mature, reliable, and well-documented.
Over time, we ran into issues. Its architecture requires compaction (it uses an [LSMT](https://en.wikipedia.org/wiki/Log-structured_merge-tree)), which caused random latency spikes. It handles generic keys, while we only use it for sequential IDs. Having lots of configuration options makes it versatile, but accurately tuning it was a headache. Finally, interoperating with C++ slowed us down (although we will still support it for quite some time 😭).
While there are already some good options written in Rust that we could leverage, we needed something custom. Nothing out there fit our needs in the way we wanted. We didn’t require generic keys. We wanted full control over when and which data was written and flushed. Our system already has crash recovery mechanisms built-in. Online compaction isn’t a priority, since we already have optimizers for that. And debugging misconfigurations was not a great use of our time.

So we built our own storage. As of [**Qdrant Version 1.13**](/blog/qdrant-1.13.x/), we are using Gridstore for **payload and sparse vector storages**.
Simple, efficient, and designed just for Qdrant.
#### In this article, you’ll learn about:
- **How Gridstore works** – a deep dive into its architecture and mechanics.
- **Why we built it this way** – the key design decisions that shaped it.
- **Rigorous testing** – how we ensured the new storage is production-ready.
- **Performance benchmarks** – official metrics that demonstrate its efficiency.

**Our first challenge?** Figuring out the best way to handle sequential keys and variable-sized data.

## Gridstore Architecture: Three Main Components

![gridstore](/articles_data/gridstore-key-value-storage/gridstore-2.png)
+Gridstore’s architecture is built around three key components that enable fast lookups and efficient space management: +| Component | Description | +|----------------------------|-----------------------------------------------------------------------------------------------| +| The Data Layer | Stores values in fixed-sized blocks and retrieves them using a pointer-based lookup system. | +| The Mask Layer | Uses a bitmask to track which blocks are in use and which are available. | +| The Gaps Layer | Manages block availability at a higher level, allowing for quick space allocation. | - +### 1. The Data Layer for Fast Retrieval +At the core of Gridstore is **The Data Layer**, which is designed to store and retrieve values quickly based on their keys. This layer allows us to do efficient reads and lets us store variable-sized data. The main two components of this layer are **The Tracker** and **The Data Grid**. -https://qdrant.tech/blog/hybrid-cloud-jinaai/ +Since internal IDs are always sequential integers (0, 1, 2, 3, 4, ...), the tracker is an array of pointers, where each pointer tells the system exactly where a value starts and how long it is. -2024-05-21T10:11:09+02:00 +{{< figure src="/articles_data/gridstore-key-value-storage/data-layer.png" alt="The Data Layer" caption="The Data Layer uses an array of pointers to quickly retrieve data." >}} -... +This makes lookups incredibly fast. For example, finding key 3 is just a matter of jumping to the third position in the tracker, and following the pointer to find the value in the data grid. - +However, because values are of variable size, the data itself is stored separately in a grid of fixed-sized blocks, which are grouped into larger page files. The fixed size of each block is usually 128 bytes. When inserting a value, Gridstore allocates one or more consecutive blocks to store it, ensuring that each block only holds data from a single value. - +### 2. The Mask Layer Reuses Space +**The Mask Layer** helps Gridstore handle updates and deletions without the need for expensive data compaction. Instead of maintaining complex metadata for each block, Gridstore tracks usage with a bitmask, where each bit represents a block, with 1 for used, 0 for free. -https://qdrant.tech/blog/hybrid-cloud-haystack/ +{{< figure src="/articles_data/gridstore-key-value-storage/mask-layer.png" alt="The Mask Layer" caption="The bitmask efficiently tracks block usage." >}} -2024-09-24T14:30:20-04:00 +This makes it easy to determine where new values can be written. When a value is removed, it gets soft-deleted at its pointer, and the corresponding blocks in the bitmask are marked as available. Similarly, when updating a value, the new version is written elsewhere, and the old blocks are freed at the bitmask. -... +This approach ensures that Gridstore doesn’t waste space. As the storage grows, however, scanning for available blocks in the entire bitmask can become computationally expensive. - +### 3. The Gaps Layer for Effective Updates +To further optimize update handling, Gridstore introduces **The Gaps Layer**, which provides a higher-level view of block availability. - +Instead of scanning the entire bitmask, Gridstore splits the bitmask into regions and keeps track of the largest contiguous free space within each region, known as **The Region Gap**. By also storing the leading and trailing gaps of each region, the system can efficiently combine multiple regions when needed for storing large values. 
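Gridstore itself is implemented in Rust inside Qdrant; purely to make the bookkeeping idea above concrete, here is a toy sketch in Python. The region size, names, and structure are our own illustration, not the actual implementation:

```python
def region_gaps(bitmask: list[bool], region_size: int = 64) -> list[dict]:
    """Summarize a block bitmask (True = used, False = free) per region:
    leading gap, trailing gap, and the largest run of free blocks."""
    regions = []
    for start in range(0, len(bitmask), region_size):
        region = bitmask[start:start + region_size]
        largest = current = 0
        for used in region:
            current = 0 if used else current + 1
            largest = max(largest, current)
        # Leading/trailing gaps let neighbouring regions be combined for large values.
        leading = next((i for i, used in enumerate(region) if used), len(region))
        trailing = next((i for i, used in enumerate(reversed(region)) if used), len(region))
        regions.append({"leading": leading, "trailing": trailing, "largest": largest})
    return regions
```

Finding room for a new value then only requires scanning these per-region summaries; the full bitmask is touched only inside the few regions that look promising.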
-https://qdrant.tech/blog/hybrid-cloud-digitalocean/ +{{< figure src="/articles_data/gridstore-key-value-storage/architecture.png" alt="The Gaps Layer" caption="The complete architecture of Gridstore" >}} -2024-05-21T10:11:09+02:00 +This layered approach allows Gridstore to locate available space quickly, scaling down the work required for scans while keeping memory overhead minimal. With this system, finding storage space for new values requires scanning only a tiny fraction of the total metadata, making updates and insertions highly efficient, even in large segments. -... +Given the default configuration, the gaps layer is scoped out in a millionth fraction of the actual storage size. This means that for each 1GB of data, the gaps layer only requires scanning 6KB of metadata. With this mechanism, the other operations can be executed in virtually constant-time complexity. - +## Gridstore in Production: Maintaining Data Integrity +![gridstore](/articles_data/gridstore-key-value-storage/gridstore-1.png) - +Gridstore’s architecture introduces multiple interdependent structures that must remain in sync to ensure data integrity: +- **The Data Layer** holds the data and associates each key with its location in storage, including page ID, block offset, and the size of its value. +- **The Mask Layer** keeps track of which blocks are occupied and which are free. +- **The Gaps Layer** provides an indexed view of free blocks for efficient space allocation. -https://qdrant.tech/blog/hybrid-cloud-aleph-alpha/ +Every time a new value is inserted or an existing value is updated, all these components need to be modified in a coordinated way. -2025-02-04T13:55:26+01:00 +### When Things Break in Real Life +Real-world systems don’t operate in a vacuum. Failures happen: software bugs cause unexpected crashes, memory exhaustion forces processes to terminate, disks fail to persist data reliably, and power losses can interrupt operations at any moment. -... +*The critical question is: what happens if a failure occurs while updating these structures?* - +If one component is updated but another isn’t, the entire system could become inconsistent. Worse, if an operation is only partially written to disk, it could lead to orphaned data, unusable space, or even data corruption. - +### Stability Through Idempotency: Recovering With WAL +To guard against these risks, Qdrant relies on a [**Write-Ahead Log (WAL)**](/documentation/concepts/storage/). Before committing an operation, Qdrant ensures that it is at least recorded in the WAL. If a crash happens before all updates are flushed, the system can safely replay operations from the log. -https://qdrant.tech/blog/hybrid-cloud-airbyte/ +This recovery mechanism introduces another essential property: [**idempotence**](https://en.wikipedia.org/wiki/Idempotence). -2025-02-04T13:55:26+01:00 +The storage system must be designed so that reapplying the same operation after a failure leads to the same final state as if the operation had been applied just once. -... +### The Grand Solution: Lazy Updates +To achieve this, **Gridstore completes updates lazily**, prioritizing the most critical part of the write: the data itself. +| | +|-----------------------------------------------------------------------------------------------------------------------------| +| 👉 Instead of immediately updating all metadata structures, it writes the new value first while keeping lightweight pending changes in a buffer. 
| +| 👉 The system only finalizes these updates when explicitly requested, ensuring that a crash never results in marking data as deleted before the update has been safely persisted. | +| 👉 In the worst-case scenario, Gridstore may need to write the same data twice, leading to a minor space overhead, but it will never corrupt the storage by overwriting valid data. | - +## How We Tested the Final Product +![gridstore](/articles_data/gridstore-key-value-storage/gridstore-3.png) - +### First... Model Testing -https://qdrant.tech/documentation/observability/openllmetry/ +Gridstore can be tested efficiently using model testing, which compares its behavior to a simple in-memory hash map. Since Gridstore should function like a persisted hash map, this method quickly detects inconsistencies. -2024-08-15T08:50:37+05:30 +The process is straightforward: +1. Initialize a Gridstore instance and an empty hash map. +2. Run random operations (put, delete, update) on both. +3. Verify that results match after each operation. +4. Compare all keys and values to ensure consistency. -... +This approach provides high test coverage, exposing issues like incorrect persistence or faulty deletions. Running large-scale model tests ensures Gridstore remains reliable in real-world use. - +Here is a naive way to generate operations in Rust. - +```rust -https://qdrant.tech/documentation/observability/openlit/ +enum Operation { + Put(PointOffset, Payload), + Delete(PointOffset), + Update(PointOffset, Payload), +} -2024-08-15T08:50:37+05:30 +impl Operation { + fn random(rng: &mut impl Rng, max_point_offset: u32) -> Self { + let point_offset = rng.random_range(0..=max_point_offset); + let operation = rng.gen_range(0..3); + match operation { + 0 => { + let size_factor = rng.random_range(1..10); + let payload = random_payload(rng, size_factor); + Operation::Put(point_offset, payload) + } + 1 => Operation::Delete(point_offset), + 2 => { + let size_factor = rng.random_range(1..10); + let payload = random_payload(rng, size_factor); + Operation::Update(point_offset, payload) + } + _ => unreachable!(), + } + } +} +``` +Model testing is a high-value way to catch bugs, especially when your system mimics a well-defined component like a hash map. If your component behaves the same as another one, using model testing brings a lot of value for a bit of effort. -... +We could have tested against RocksDB, but simplicity matters more. A simple hash map lets us run massive test sequences quickly, exposing issues faster. - +For even sharper debugging, Property-Based Testing adds automated test generation and shrinking. It pinpoints failures with minimalized test cases, making bug hunting faster and more effective. - +### Crash Testing: Can Gridstore Handle the Pressure? -https://qdrant.tech/blog/case-study-lettria-v2/ +Designing for crash resilience is one thing, and proving it works under stress is another. To push Qdrant’s data integrity to the limit, we built [**Crasher**](https://github.com/qdrant/crasher), a test bench that brutally kills and restarts Qdrant while it handles a heavy update workload. -2025-06-16T22:38:02-07:00 +Crasher runs a loop that continuously writes data, then randomly crashes Qdrant. On each restart, Qdrant replays its [**Write-Ahead Log (WAL)**](/documentation/concepts/storage/), and we verify if data integrity holds. Possible anomalies include: +- Missing data (points, vectors, or payloads) +- Corrupt payload values -... 
+This aggressive yet simple approach has uncovered real-world issues when run for extended periods. While we also use chaos testing for distributed setups, Crasher excels at fast, repeatable failure testing in a local environment. - +## Testing Gridstore Performance: Benchmarks +![gridstore](/articles_data/gridstore-key-value-storage/gridstore-4.png) - +To measure the impact of our new storage engine, we used [**Bustle, a key-value storage benchmarking framework**](https://github.com/jonhoo/bustle), to compare Gridstore against RocksDB. We tested three workloads: -https://qdrant.tech/ +| Workload Type | Operation Distribution | +|------------------------------|-----------------------------------| +| Read-heavy | 95% reads | +| Insert-heavy | 80% inserts | +| Update-heavy | 50% updates -2025-06-19T16:21:03+04:00 +#### The results speak for themselves: -... +Average latency for all kinds of workloads is lower across the board, particularly for inserts. - +![image.png](/articles_data/gridstore-key-value-storage/1.png) - +This shows a clear boost in performance. As we can see, the investment in Gridstore is paying off. -https://qdrant.tech/blog/beta-database-migration-tool/ +### End-to-End Benchmarking -2025-06-18T11:55:05-04:00 +Now, let’s test the impact on a real Qdrant instance. So far, we’ve only integrated Gridstore for [**payloads**](/documentation/concepts/payload/) and [**sparse vectors**](/documentation/concepts/vectors/#sparse-vectors), but even this partial switch should show noticeable improvements. -... +For benchmarking, we used our in-house [**bfb tool**](https://github.com/qdrant/bfb) to generate a workload. Our configuration: - +```json +bfb -n 2000000 --max-id 1000000 \ + --sparse-vectors 0.02 \ + --set-payload \ + --on-disk-payload \ + --dim 1 \ + --sparse-dim 5000 \ + --bool-payloads \ + --keywords 100 \ + --float-payloads true \ + --int-payloads 100000 \ + --text-payloads \ + --text-payload-length 512 \ + --skip-field-indices \ + --jsonl-updates ./rps.jsonl +``` +This benchmark upserts 1 million points twice. Each point has: +- A medium to large payload +- A tiny dense vector (dense vectors use a different storage type) +- A sparse vector - +--------------------------- +#### Additional configuration: -https://qdrant.tech/blog/case-study-lawme/ +1. The test we conducted updated payload data separately in another request. -2025-06-11T09:42:37-07:00 +2. There were no payload indices, which ensured we measured pure ingestion speed. -... +3. Finally, we gathered request latency metrics for analysis. - +--------------------------- - +We ran this against Qdrant 1.12.6, toggling between the old and new storage backends. -https://qdrant.tech/blog/case-study-convosearch/ +### Final Result -2025-06-10T09:54:12-07:00 +Data ingestion is **twice as fast and with a smoother throughput** — a massive win! 😍 -... +![image.png](/articles_data/gridstore-key-value-storage/2.png) - +We optimized for speed, and it paid off—but what about storage size? +- Gridstore: 2333MB +- RocksDB: 2319MB - +Strictly speaking, RocksDB is slightly smaller, but the difference is negligible compared to the 2x faster ingestion and more stable throughput. A small trade-off for a big performance gain! -https://qdrant.tech/blog/legal-tech-builders-guide/ +## Trying Out Gridstore -2025-06-13T15:44:13-07:00 +Gridstore represents a significant advancement in how Qdrant manages its **key-value storage** needs. It offers great performance and streamlined updates tailored specifically for our use case. 
We have managed to achieve faster, more reliable data ingestion while maintaining data integrity, even under heavy workloads and unexpected failures. It is already used as a storage backend for on-disk payloads and sparse vectors. -... +👉 It’s important to note that Gridstore remains tightly integrated with Qdrant and, as such, has not been released as a standalone crate. - +Its API is still evolving, and we are focused on refining it within our ecosystem to ensure maximum stability and performance. That said, we recognize the value this innovation could bring to the wider Rust community. In the future, once the API stabilizes and we decouple it enough from Qdrant, we will consider publishing it as a contribution to the community ❀. - +For now, Gridstore continues to drive improvements in Qdrant, demonstrating the benefits of a custom-tailored storage engine designed with modern demands in mind. Stay tuned for further updates and potential community releases as we keep pushing the boundaries of performance and reliability. -https://qdrant.tech/blog/soc-2-type-ii-hipaa/ +
Simple, efficient, and designed just for Qdrant.
<|page-29-lllmstxt|>

Standard [Retrieval Augmented Generation](/articles/what-is-rag-in-ai/) follows a predictable, linear path: receive
a query, retrieve relevant documents, and generate a response. In many cases that might be enough to solve a particular
problem. In the worst case scenario, your LLM will simply decide not to answer the question, because the context does not
provide enough information.

![Standard, linear RAG pipeline](/articles_data/agentic-rag/linear-rag.png)
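For contrast with the agentic systems discussed below, such a linear pipeline fits in a single function. The following is a rough sketch under our own assumptions: the collection name, the payload field, and the model choices are illustrative, and any embedding model or LLM could be used instead.

```python
from openai import OpenAI
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

qdrant = QdrantClient("http://localhost:6333")
encoder = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative embedding model
llm = OpenAI()

def linear_rag(question: str) -> str:
    # 1. Retrieve: embed the query and fetch the most similar documents.
    hits = qdrant.query_points(
        collection_name="knowledge-base",  # illustrative collection name
        query=encoder.encode(question).tolist(),
        limit=5,
        with_payload=True,
    ).points
    context = "\n\n".join(hit.payload["text"] for hit in hits)  # assumes a "text" payload field
    # 2. Generate: a single LLM call with the retrieved context, and we are done.
    completion = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Answer using only this context:\n{context}"},
            {"role": "user", "content": question},
        ],
    )
    return completion.choices[0].message.content
```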
+On the other hand, we have agents. These systems are given more freedom to act, and can take multiple non-linear steps +to achieve a certain goal. There isn't a single definition of what an agent is, but in general, it is an application +that uses LLM and usually some tools to communicate with the outside world. LLMs are used as decision-makers which +decide what action to take next. Actions can be anything, but they are usually well-defined and limited to a certain +set of possibilities. One of these actions might be to query a vector database, like Qdrant, to retrieve relevant +documents, if the context is not enough to make a decision. However, RAG is just a single tool in the agent's arsenal. - +![AI Agent](/articles_data/agentic-rag/ai-agent.png) -https://qdrant.tech/blog/n8n-node/ +## Agentic RAG: Combining RAG with Agents -2025-06-09T15:38:39+02:00 +Since the agent definition is vague, the concept of **Agentic RAG** is also not well-defined. In general, it refers to +the combination of RAG with agents. This allows the agent to use external knowledge sources to make decisions, and +primarily to decide when the external knowledge is needed. We can describe a system as Agentic RAG if it breaks the +linear flow of a standard RAG system, and gives the agent the ability to take multiple steps to achieve a goal. -... +A simple router that chooses a path to follow is often described as the simplest form of an agent. Such a system has +multiple paths with conditions describing when to take a certain path. In the context of Agentic RAG, the agent can +decide to query a vector database if the context is not enough to answer, or skip the query if it's enough, or when the +question refers to common knowledge. Alternatively, there might be multiple collections storing different kinds of +information, and the agent can decide which collection to query based on the context. The key factor is that the +decision of choosing a path is made by the LLM, which is the core of the agent. A routing agent never comes back to the +previous step, so it's ultimately just a conditional decision-making system. - +![Routing Agent](/articles_data/agentic-rag/routing-agent.png) - +However, routing is just the beginning. Agents can be much more complex, and extreme forms of agents can have complete +freedom to act. In such cases, the agent is given a set of tools and can autonomously decide which ones to use, how to +use them, and in which order. LLMs are asked to plan and execute actions, and the agent can take multiple steps to +achieve a goal, including taking steps back if needed. Such a system does not have to follow a DAG structure (Directed +Acyclic Graph), and can have loops that help to self-correct the decisions made in the past. An agentic RAG system +built in that manner can have tools not only to query a vector database, but also to play with the query, summarize the +results, or even generate new data to answer the question. Options are endless, but there are some common patterns +that can be observed in the wild. -https://qdrant.tech/blog/datatalks-course/ +![Autonomous Agent](/articles_data/agentic-rag/autonomous-agent.png) -2025-06-05T09:19:05-04:00 +### Solving Information Retrieval Problems with LLMs -... +Generally speaking, tools exposed in an agentic RAG system are used to solve information retrieval problems which are +not new to the search community. LLMs have changed how we approach these problems, but the core of the problem remains +the same. 
What kind of tools can you consider using in an agentic RAG system? Here are some examples:

- **Querying a vector database** - the most common tool used in agentic RAG systems. It allows the agent to retrieve
  relevant documents based on the query.
- **Query expansion** - a tool that can be used to improve the query. It can be used to add synonyms, correct typos, or
  even to generate new queries based on the original one.
  ![Query expansion example](/articles_data/agentic-rag/query-expansion.png)
- **Extracting filters** - vector search alone is sometimes not enough. In many cases, you might want to narrow down
  the results based on specific parameters. This extraction process can automatically identify relevant conditions from
  the query. Otherwise, your users would have to manually define these search constraints.
  ![Extracting filters](/articles_data/agentic-rag/extracting-filters.png)
- **Quality judgement** - knowing the quality of the results for a given query can be used to decide whether they are
  good enough to answer, or whether the agent should take another step to improve them. Alternatively, the agent can
  admit that it failed to provide a good response.
  ![Quality judgement](/articles_data/agentic-rag/quality-judgement.png)

These are just some of the examples, but the list is not exhaustive. For example, your LLM could possibly play with
Qdrant search parameters or choose different methods to query it. An example? If your users are searching using some
specific keywords, you may prefer sparse vectors to dense vectors, as they are more efficient in such cases. In that
case, you have to arm your agent with tools to decide when to use sparse vectors and when to use dense vectors. An
agent aware of the collection structure can make such decisions easily.

Each of these tools might be a separate agent on its own, and multi-agent systems are not uncommon. In such cases,
agents can communicate with each other, and one agent can decide to use another agent to solve a particular problem.
Another useful component of an agentic RAG system is a human in the loop, who can correct the agent's decisions or
steer it in the right direction.

## Where are Agents Used?

Agents are an interesting concept, but since they heavily rely on LLMs, they are not applicable to all problems. Large
Language Models are expensive to run and tend to be slow, which in many cases makes them not worth the cost. Standard
RAG involves just a single call to the LLM, and the response is generated in a predictable way. Agents, on the other
hand, can take multiple steps, and the latency experienced by the user adds up. In many cases, that is not acceptable.
Agentic RAG is probably not that widely applicable in ecommerce search, where the user expects a quick response, but
might be fine for customer support, where the user is willing to wait a bit longer for a better answer.

## Which Framework is Best?

There are lots of frameworks available to build agents, and choosing the best one is not easy. It depends on your
existing stack or the tools you are familiar with. Some of the most popular LLM libraries have already drifted towards
the agent paradigm and now offer tools to build agents. There are, however, some tools built primarily for agent
development, so let's focus on them.
### LangGraph

Developed by the LangChain team, LangGraph seems like a natural extension for those who already use LangChain for
building their RAG systems and would like to start with agentic RAG.

Surprisingly, LangGraph has nothing to do with Large Language Models on its own. It's a framework for building
graph-based applications in which each **node** is a step of the workflow. Each node takes an application **state** as
an input, and produces a modified state as an output. The state is then passed to the next node, and so on. **Edges**
between the nodes might be conditional, which makes branching possible. Contrary to some DAG-based tools (e.g., Apache
Airflow), LangGraph allows for loops in the graph, which makes it possible to implement cyclic workflows, so an agent
can achieve self-reflection and self-correction. Theoretically, LangGraph can be used to build any kind of application
in a graph-based manner, not only LLM agents.

Some of the strengths of LangGraph include:

- **Persistence** - the state of the workflow graph is stored as a checkpoint. That happens at each so-called super-step
  (which is a single sequential node of a graph). It enables replaying certain steps of the workflow, fault tolerance,
  and human-in-the-loop interactions. This mechanism also acts as a **short-term memory**, accessible in the context of
  a particular workflow execution.
- **Long-term memory** - LangGraph also has a concept of memories that are shared between different workflow runs.
  However, this mechanism has to be explicitly handled by our nodes. **Qdrant with its semantic search capabilities is
  often used as a long-term memory layer**.
- **Multi-agent support** - while there is no separate concept of multi-agent systems in LangGraph, it's possible to
  create such an architecture by building a graph that includes multiple agents and some kind of supervisor that
  decides which agent to use in a given situation. Since a node can be anything, it can be another agent as well.

Some other interesting features of LangGraph include the ability to visualize the graph, automate the retries of failed
steps, and include human-in-the-loop interactions.

A minimal example of an agentic RAG could improve the user query, e.g. by fixing typos, expanding it with synonyms, or
even generating a new query based on the original one. The agent could then retrieve documents from a vector database
based on the improved query, and generate a response. The LangGraph app implementing this approach could look like this:

```python
from typing import Sequence
from typing_extensions import TypedDict, Annotated
from langchain_core.messages import BaseMessage
from langgraph.constants import START, END
from langgraph.graph import add_messages, StateGraph


class AgentState(TypedDict):
    # The state of the agent includes at least the messages exchanged between the agent(s)
    # and the user. It is, however, possible to include other information in the state, as
    # it depends on the specific agent.
    messages: Annotated[Sequence[BaseMessage], add_messages]


def improve_query(state: AgentState):
    ...

def retrieve_documents(state: AgentState):
    ...

def generate_response(state: AgentState):
    ...
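# (The three node bodies above are left out in the article. In practice, each function
# would typically call an LLM, Qdrant, or both, and return a partial state update such
# as {"messages": [new_message]}, which LangGraph merges into AgentState through the
# add_messages reducer declared above.)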
- +# Building a graph requires defining nodes and building the flow between them with edges. +builder = StateGraph(AgentState) - +builder.add_node("improve_query", improve_query) +builder.add_node("retrieve_documents", retrieve_documents) +builder.add_node("generate_response", generate_response) -https://qdrant.tech/blog/product-ui-changes/ +builder.add_edge(START, "improve_query") +builder.add_edge("improve_query", "retrieve_documents") +builder.add_edge("retrieve_documents", "generate_response") +builder.add_edge("generate_response", END) -2025-05-08T09:28:12-04:00 +# Compiling the graph performs some checks and prepares the graph for execution. +compiled_graph = builder.compile() -... +# Compiled graph might be invoked with the initial state to start. +compiled_graph.invoke({ + "messages": [ + ("user", "Why Qdrant is the best vector database out there?"), + ] +}) +``` - +Each node of the process is just a Python function that does certain operation. You can call an LLM of your choice +inside of them, if you want to, but there is no assumption about the messages being created by any AI. **LangGraph +rather acts as a runtime that launches these functions in a specific order, and passes the state between them**. While +[LangGraph](https://www.langchain.com/langgraph) integrates well with the LangChain ecosystem, it can be used +independently. For teams looking for additional support and features, there's also a commercial offering called +LangGraph Platform. The framework is available for both Python and JavaScript environments, making it possible to be +used in different tech stacks. - +### CrewAI -https://qdrant.tech/blog/case-study-pariti/ +CrewAI is another popular choice for building agents, including agentic RAG. It's a high-level framework that assumes +there are some LLM-based agents working together to achieve a common goal. That's where the "crew" in CrewAI comes from. +CrewAI is designed with multi-agent systems in mind. Contrary to LangGraph, the developer does not create a graph of +processing, but defines agents and their roles within the crew. -2025-05-01T10:05:43-07:00 +Some of the key concepts of CrewAI include: -... +- **Agent** - a unit that has a specific role and goal, controlled by an LLM. It can optionally use some external tools + to communicate with the outside world, but generally steered by prompt we provide to the LLM. +- **Process** - currently either sequential or hierarchical. It defines how the task will be executed by the agents. + In a sequential process, agents are executed one after another, while in a hierarchical process, agent is selected + by the manager agent, which is responsible for making decisions about which agent to use in a given situation. +- **Roles and goals** - each agent has a certain role within the crew, and the goal it should aim to achieve. These are + set when we define an agent and are used to make decisions about which agent to use in a given situation. +- **Memory** - an extensive memory system consists of short-term memory, long-term memory, entity memory, and contextual + memory that combines the other three. There is also user memory for preferences and personalization. **This is where + Qdrant comes into play, as it might be used as a long-term memory layer.** - +CrewAI provides a rich set of tools integrated into the framework. That may be a huge advantage for those who want to +combine RAG with e.g. code execution, or image generation. 
The ecosystem is rich, however brining your own tools is +not a big deal, as CrewAI is designed to be extensible. - +A simple agentic RAG application implemented in CrewAI could look like this: -https://qdrant.tech/articles/vector-search-production/ +```python +from crewai import Crew, Agent, Task +from crewai.memory.entity.entity_memory import EntityMemory +from crewai.memory.short_term.short_term_memory import ShortTermMemory +from crewai.memory.storage.rag_storage import RAGStorage -2025-04-30T17:47:55+02:00 +class QdrantStorage(RAGStorage): + ... -... +response_generator_agent = Agent( + role="Generate response based on the conversation", + goal="Provide the best response, or admit when the response is not available.", + backstory=( + "I am a response generator agent. I generate " + "responses based on the conversation." + ), + verbose=True, +) - +query_reformulation_agent = Agent( + role="Reformulate the query", + goal="Rewrite the query to get better results. Fix typos, grammar, word choice, etc.", + backstory=( + "I am a query reformulation agent. I reformulate the " + "query to get better results." + ), + verbose=True, +) - +task = Task( + description="Let me know why Qdrant is the best vector database out there.", + expected_output="3 bullet points", + agent=response_generator_agent, +) -https://qdrant.tech/blog/case-study-dust-v2/ +crew = Crew( + agents=[response_generator_agent, query_reformulation_agent], + tasks=[task], + memory=True, + entity_memory=EntityMemory(storage=QdrantStorage("entity")), + short_term_memory=ShortTermMemory(storage=QdrantStorage("short-term")), +) +crew.kickoff() +``` -2025-05-08T11:45:46-07:00 +*Disclaimer: QdrantStorage is not a part of the CrewAI framework, but it's taken from the Qdrant documentation on [how +to integrate Qdrant with CrewAI](https://qdrant.tech/documentation/frameworks/crewai/).* -... +Although it's not a technical advantage, CrewAI has a [great documentation](https://docs.crewai.com/introduction). The +framework is available for Python, and it's easy to get started with it. CrewAI also has a commercial offering, CrewAI +Enterprise, which provides a platform for building and deploying agents at scale. - +### AutoGen - +AutoGen emphasizes multi-agent architectures as a fundamental design principle. The framework requires at least two +agents in any system to really call an application agentic - typically an assistant and a user proxy exchange messages +to achieve a common goal. Sequential chat with more than two agents is also supported, as well as group chat and nested +chat for internal dialogue. However, AutoGen does not assume there is a structured state that is passed between the +agents, and the chat conversation is the only way to communicate between them. -https://qdrant.tech/blog/case-study-sayone/ +There are many interesting concepts in the framework, some of them even quite unique: -2025-04-29T09:15:10-07:00 +- **Tools/functions** - external components that can be used by agents to communicate with the outside world. They are + defined as Python callables, and can be used for any external interaction we want to allow the agent to do. Type + annotations are used to define the input and output of the tools, and Pydantic models are supported for more complex + type schema. AutoGen supports only OpenAI-compatible tool call API for the time being. +- **Code executors** - built-in code executors include local command, Docker command, and Jupyter. 
An agent can write + and launch code, so theoretically the agents can do anything that can be done in Python. None of the other frameworks + made code generation and execution that prominent. Code execution being the first-class citizen in AutoGen is an + interesting concept. -... +Each AutoGen agent uses at least one of the components: human-in-the-loop, code executor, tool executor, or LLM. +A simple agentic RAG, based on the conversation of two agents which can retrieve documents from a vector database, +or improve the query, could look like this: - +```python +from os import environ - +from autogen import ConversableAgent +from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent +from qdrant_client import QdrantClient -https://qdrant.tech/blog/superlinked-multimodal-search/ +client = QdrantClient(...) -2025-04-24T14:10:50+02:00 +response_generator_agent = ConversableAgent( + name="response_generator_agent", + system_message=( + "You answer user questions based solely on the provided context. You ask to retrieve relevant documents for " + "your query, or reformulate the query, if it is incorrect in some way." + ), + description="A response generator agent that can answer your queries.", + llm_config={"config_list": [{"model": "gpt-4", "api_key": environ.get("OPENAI_API_KEY")}]}, + human_input_mode="NEVER", +) -... +user_proxy = RetrieveUserProxyAgent( + name="retrieval_user", + llm_config={"config_list": [{"model": "gpt-4", "api_key": environ.get("OPENAI_API_KEY")}]}, + human_input_mode="NEVER", + retrieve_config={ + "task": "qa", + "chunk_token_size": 2000, + "vector_db": "qdrant", + "db_config": {"client": client}, + "get_or_create": True, + "overwrite": True, + }, +) - +result = user_proxy.initiate_chat( + response_generator_agent, + message=user_proxy.message_generator, + problem="Why Qdrant is the best vector database out there?", + max_turns=10, +) +``` - +For those new to agent development, AutoGen offers AutoGen Studio, a low-code interface for prototyping agents. While +not intended for production use, it significantly lowers the barrier to entry for experimenting with agent +architectures. -https://qdrant.tech/blog/qdrant-1.14.x/ +![AutoGen Studio](/articles_data/agentic-rag/autogen-studio.png) -2025-05-02T15:26:42-03:00 +It's worth noting that AutoGen is currently undergoing significant updates, with version 0.4.x in development +introducing substantial API changes compared to the stable 0.2.x release. While the framework currently has limited +built-in persistence and state management capabilities, these features may evolve in future releases. -... +### OpenAI Swarm - +Unliked the other frameworks described in this article, OpenAI Swarm is an educational project, and it's not ready for +production use. It's worth mentioning, though, as it's pretty lightweight and easy to get started with. OpenAI Swarm +is an experimental framework for orchestrating multi-agent workflows that focuses on agent coordination through direct +handoffs rather than complex orchestration patterns. - +With that setup, **agents** are just exchanging messages in a chat, optionally calling some Python functions to +communicate with external services, or handing off the conversation to another agent, if the other one seems to be more +suitable to answer the question. Each agent has a certain role, defined by the instructions we have to define. +We have to decide which LLM will a particular agent use, and a set of functions it can call. 
For example, **a retrieval +agent could use a vector database to retrieve documents**, and return the results to the next agent. That means, there +should be a function that performs the semantic search on its behalf, but the model will decide how the query should +look like. -https://qdrant.tech/blog/case-study-pathwork/ +Here is how a similar agentic RAG application, implemented in OpenAI Swarm, could look like: -2025-05-16T09:10:33-07:00 +```python +from swarm import Swarm, Agent -... +client = Swarm() - +def retrieve_documents(query: str) -> list[str]: + """ + Retrieve documents based on the query. + """ + ... - +def transfer_to_query_improve_agent(): + return query_improve_agent -https://qdrant.tech/blog/case-study-lyzr/ +query_improve_agent = Agent( + name="Query Improve Agent", + instructions=( + "You are a search expert that takes user queries and improves them to get better results. You fix typos and " + "extend queries with synonyms, if needed. You never ask the user for more information." + ), +) -2025-05-16T09:10:33-07:00 +response_generation_agent = Agent( + name="Response Generation Agent", + instructions=( + "You take the whole conversation and generate a final response based on the chat history. " + "If you don't have enough information, you can retrieve the documents from the knowledge base or " + "reformulate the query by transferring to other agent. You never ask the user for more information. " + "You have to always be the last participant of each conversation." + ), + functions=[retrieve_documents, transfer_to_query_improve_agent], +) -... +response = client.run( + agent=response_generation_agent, + messages=[ + { + "role": "user", + "content": "Why Qdrant is the best vector database out there?" + } + ], +) +``` - +Even though we don't explicitly define the graph of processing, the agents can still decide to hand off the processing +to a different agent. There is no concept of a state, so everything relies on the messages exchanged between different +components. - +OpenAI Swarm does not focus on integration with external tools, and **if you would like to integrate semantic search +with Qdrant, you would have to implement it fully yourself**. Obviously, the library is tightly coupled with OpenAI +models, and while using some other ones is possible, it requires some additional work like setting up proxy that will +adjust the interface to OpenAI API. -https://qdrant.tech/blog/case-study-mixpeek/ +### The winner? -2025-05-16T09:10:33-07:00 +Choosing the best framework for your agentic RAG system depends on your existing stack, team expertise, and the +specific requirements of your project. All the described tools are strong contenders, and they are developed at rapid +pace. It's worth keeping an eye on all of them, as they are likely to evolve and improve over time. Eventually, you +should be able to build the same processes with any of them, but some of them may be more suitable in a specific +ecosystem of the tools you want your agent to interact with. -... +There are, however, some important factors to consider when choosing a framework for your agentic RAG system: - +- **Human-in-the-loop** - even though we aim to build autonomous agents, it's often important to include the feedback + from the human, so our agents cannot perform malicious actions. +- **Observability** - how easy it is to debug the system, and how easy it is to understand what's happening inside. + Especially important, since we are dealing with lots of LLM prompts. 
- +Still, choosing the right toolkit depends on the state of your project, and the specific requirements you have. If you +want to integrate your agent with number of external tools, CrewAI might be the best choice, as the set of +out-of-the-box integrations is the biggest. However, LangGraph integrates well with LangChain, so if you are familiar +with that ecosystem, it may suit you better. -https://qdrant.tech/blog/qdrant-n8n-beyond-simple-similarity-search/ +All the frameworks have different approaches to building agents, so it's worth experimenting with all of them to see +which one fits your needs the best. LangGraph and CrewAI are more mature and have more features, while AutoGen and +OpenAI Swarm are more lightweight and more experimental. However, **none of the existing frameworks solves all the +mentioned Information Retrieval problems**, so you still have to build your own tools to fill the gaps. -2025-04-08T11:38:52+02:00 +## Building Agentic RAG with Qdrant -... +No matter which framework you choose, Qdrant is a great tool to build agentic RAG systems. Please check out [our +integrations](/documentation/frameworks/) to choose the best one for your use case and preferences. The easiest way to +start using Qdrant is to use our managed service, [Qdrant Cloud](https://cloud.qdrant.io). A free 1GB cluster is +available for free, so you can start building your agentic RAG system in minutes. - +### Further Reading - +See how Qdrant integrates with: -https://qdrant.tech/blog/satellite-vector-broadcasting/ +- [Autogen](https://qdrant.tech/documentation/frameworks/autogen/) +- [CrewAI](https://qdrant.tech/documentation/frameworks/crewai/) +- [LangGraph](https://qdrant.tech/documentation/frameworks/langgraph/) +- [Swarm](https://qdrant.tech/documentation/frameworks/swarm/) -2025-04-01T08:09:34+02:00 +<|page-30-lllmstxt|> +It's been over a year since we published the original article on how to build a hybrid +search system with Qdrant. The idea was straightforward: combine the results from different search methods to improve +retrieval quality. Back in 2023, you still needed to use an additional service to bring lexical search +capabilities and combine all the intermediate results. Things have changed since then. Once we introduced support for +sparse vectors, [the additional search service became obsolete](/articles/sparse-vectors/), but you were still +required to combine the results from different methods on your end. -... +**Qdrant 1.10 introduces a new Query API that lets you build a search system by combining different search methods +to improve retrieval quality**. Everything is now done on the server side, and you can focus on building the best search +experience for your users. In this article, we will show you how to utilize the new [Query +API](/documentation/concepts/search/#query-api) to build a hybrid search system. - +## Introducing the new Query API - +At Qdrant, we believe that vector search capabilities go well beyond a simple search for nearest neighbors. +That's why we provided separate methods for different search use cases, such as `search`, `recommend`, or `discover`. +With the latest release, we are happy to introduce the new Query API, which combines all of these methods into a single +endpoint and also supports creating nested multistage queries that can be used to build complex search pipelines. 
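+
+To give a sense of the unified endpoint, here is a minimal, illustrative sketch (the collection name and
+vector values are placeholders): the same `query_points` method covers both a plain nearest-neighbor
+query and a recommendation-style query.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient("http://localhost:6333")
+
+# Plain nearest-neighbor search through the unified endpoint.
+client.query_points(
+    collection_name="my-collection",
+    query=[0.2, 0.1, 0.9, 0.7],  # placeholder query vector
+    limit=10,
+)
+
+# The same endpoint accepts recommendation-style queries as well.
+client.query_points(
+    collection_name="my-collection",
+    query=models.RecommendQuery(
+        recommend=models.RecommendInput(positive=[100], negative=[718]),
+    ),
+    limit=10,
+)
+```
+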
-https://qdrant.tech/blog/case-study-hubspot/ +If you are an existing Qdrant user, you probably have a running search mechanism that you want to improve, whether sparse +or dense. Doing any changes should be preceded by a proper evaluation of its effectiveness. -2025-05-16T09:10:33-07:00 +## How effective is your search system? -... +None of the experiments makes sense if you don't measure the quality. How else would you compare which method works +better for your use case? The most common way of doing that is by using the standard metrics, such as `precision@k`, +`MRR`, or `NDCG`. There are existing libraries, such as [ranx](https://amenra.github.io/ranx/), that can help you with +that. We need to have the ground truth dataset to calculate any of these, but curating it is a separate task. - +```python +from ranx import Qrels, Run, evaluate - +# Qrels, or query relevance judgments, keep the ground truth data +qrels_dict = { "q_1": { "d_12": 5, "d_25": 3 }, + "q_2": { "d_11": 6, "d_22": 1 } } -https://qdrant.tech/blog/webinar-vibe-coding-rag/ +# Runs are built from the search results +run_dict = { "q_1": { "d_12": 0.9, "d_23": 0.8, "d_25": 0.7, + "d_36": 0.6, "d_32": 0.5, "d_35": 0.4 }, + "q_2": { "d_12": 0.9, "d_11": 0.8, "d_25": 0.7, + "d_36": 0.6, "d_22": 0.5, "d_35": 0.4 } } -2025-03-21T16:36:29+01:00 +# We need to create both objects, and then we can evaluate the run against the qrels +qrels = Qrels(qrels_dict) +run = Run(run_dict) -... +# Calculating the NDCG@5 metric is as simple as that +evaluate(qrels, run, "ndcg@5") +``` - +## Available embedding options with Query API - +Support for multiple vectors per point is nothing new in Qdrant, but introducing the Query API makes it even +more powerful. The 1.10 release supports the multivectors, allowing you to treat embedding lists +as a single entity. There are many possible ways of utilizing this feature, and the most prominent one is the support +for late interaction models, such as [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/). Instead of having a single embedding for each document or query, this +family of models creates a separate one for each token of text. In the search process, the final score is calculated +based on the interaction between the tokens of the query and the document. Contrary to cross-encoders, document +embedding might be precomputed and stored in the database, which makes the search process much faster. If you are +curious about the details, please check out [the article about ColBERT, written by our friends from Jina +AI](https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/). -https://qdrant.tech/blog/case-study-deutsche-telekom/ +![Late interaction](/articles_data/hybrid-search/late-interaction.png) -2025-04-03T08:09:56-04:00 +Besides multivectors, you can use regular dense and sparse vectors, and experiment with smaller data types to reduce +memory use. Named vectors can help you store different dimensionalities of the embeddings, which is useful if you +use multiple models to represent your data, or want to utilize the Matryoshka embeddings. -... +![Multiple vectors per point](/articles_data/hybrid-search/multiple-vectors.png) - +There is no single way of building a hybrid search. The process of designing it is an exploratory exercise, where you +need to test various setups and measure their effectiveness. Building a proper search experience is a +complex task, and it's better to keep it data-driven, not just rely on the intuition. 
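+
+As an illustration only (the vector names match the prefetch examples below, while the sizes are
+assumptions), a collection combining Matryoshka embeddings at several dimensionalities, a full-precision
+dense vector, a uint8 variant, and a sparse vector could be configured like this:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient("http://localhost:6333")
+
+client.create_collection(
+    collection_name="my-collection",
+    vectors_config={
+        # Matryoshka embeddings stored at several truncation lengths
+        "matryoshka-64dim": models.VectorParams(size=64, distance=models.Distance.COSINE),
+        "matryoshka-128dim": models.VectorParams(size=128, distance=models.Distance.COSINE),
+        "matryoshka-256dim": models.VectorParams(size=256, distance=models.Distance.COSINE),
+        # Full-precision dense vectors and a smaller uint8 variant
+        "dense": models.VectorParams(size=1536, distance=models.Distance.COSINE),
+        "dense-uint8": models.VectorParams(
+            size=1536,
+            distance=models.Distance.COSINE,
+            datatype=models.Datatype.UINT8,
+        ),
+    },
+    sparse_vectors_config={
+        "sparse": models.SparseVectorParams(),
+    },
+)
+```
+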
- +## Fusion vs reranking -https://qdrant.tech/blog/enterprise-vector-search/ +We can, distinguish two main approaches to building a hybrid search system: fusion and reranking. The former is about +combining the results from different search methods, based solely on the scores returned by each method. That usually +involves some normalization, as the scores returned by different methods might be in different ranges. After that, there +is a formula that takes the relevancy measures and calculates the final score that we use later on to reorder the +documents. Qdrant has built-in support for the Reciprocal Rank Fusion method, which is the de facto standard in the +field. -2025-04-07T15:17:30-04:00 +![Fusion](/articles_data/hybrid-search/fusion.png) -... +Reranking, on the other hand, is about taking the results from different search methods and reordering them based on +some additional processing using the content of the documents, not just the scores. This processing may rely on an +additional neural model, such as a cross-encoder which would be inefficient enough to be used on the whole dataset. +These methods are practically applicable only when used on a smaller subset of candidates returned by the faster search +methods. Late interaction models, such as ColBERT, are way more efficient in this case, as they can be used to rerank +the candidates without the need to access all the documents in the collection. - +![Reranking](/articles_data/hybrid-search/reranking.png) - +### Why not a linear combination? -https://qdrant.tech/blog/metadata-deasy-labs/ +It's often proposed to use full-text and vector search scores to form a linear combination formula to rerank +the results. So it goes like this: -2025-02-24T15:04:44-03:00 +```final_score = 0.7 * vector_score + 0.3 * full_text_score``` -... +However, we didn't even consider such a setup. Why? Those scores don't make the problem linearly separable. We used +the BM25 score along with cosine vector similarity to use both of them as points coordinates in 2-dimensional space. The +chart shows how those points are distributed: - +![A distribution of both Qdrant and BM25 scores mapped into 2D space.](/articles_data/hybrid-search/linear-combination.png) - +*A distribution of both Qdrant and BM25 scores mapped into 2D space. It clearly shows relevant and non-relevant +objects are not linearly separable in that space, so using a linear combination of both scores won't give us +a proper hybrid search.* -https://qdrant.tech/blog/webinar-crewai-qdrant-obsidian/ +Both relevant and non-relevant items are mixed. **None of the linear formulas would be able to distinguish +between them.** Thus, that's not the way to solve it. -2025-01-24T16:10:16+01:00 +## Building a hybrid search system in Qdrant -... +Ultimately, **any search mechanism might also be a reranking mechanism**. You can prefetch results with sparse vectors +and then rerank them with the dense ones, or the other way around. Or, if you have Matryoshka embeddings, you can start +with oversampling the candidates with the dense vectors of the lowest dimensionality and then gradually reduce the +number of candidates by reranking them with the higher-dimensional embeddings. Nothing stops you from +combining both fusion and reranking. - +Let's go a step further and build a hybrid search mechanism that combines the results from the +Matryoshka embeddings, dense vectors, and sparse vectors and then reranks them with the late interaction model. 
In the +meantime, we will introduce additional reranking and fusion steps. - +![Complex search pipeline](/articles_data/hybrid-search/complex-search-pipeline.png) -https://qdrant.tech/blog/qdrant-1.13.x/ +Our search pipeline consists of two branches, each of them responsible for retrieving a subset of documents that +we eventually want to rerank with the late interaction model. Let's connect to Qdrant first and then build the search +pipeline. -2025-01-24T04:19:54-05:00 +```python +from qdrant_client import QdrantClient, models -... +client = QdrantClient("http://localhost:6333") +``` - +All the steps utilizing Matryoshka embeddings might be specified in the Query API as a nested structure: - +```python +# The first branch of our search pipeline retrieves 25 documents +# using the Matryoshka embeddings with multistep retrieval. +matryoshka_prefetch = models.Prefetch( + prefetch=[ + models.Prefetch( + prefetch=[ + # The first prefetch operation retrieves 100 documents + # using the Matryoshka embeddings with the lowest + # dimensionality of 64. + models.Prefetch( + query=[0.456, -0.789, ..., 0.239], + using="matryoshka-64dim", + limit=100, + ), + ], + # Then, the retrieved documents are re-ranked using the + # Matryoshka embeddings with the dimensionality of 128. + query=[0.456, -0.789, ..., -0.789], + using="matryoshka-128dim", + limit=50, + ) + ], + # Finally, the results are re-ranked using the Matryoshka + # embeddings with the dimensionality of 256. + query=[0.456, -0.789, ..., 0.123], + using="matryoshka-256dim", + limit=25, +) +``` -https://qdrant.tech/blog/static-embeddings/ +Similarly, we can build the second branch of our search pipeline, which retrieves the documents using the dense and +sparse vectors and performs the fusion of them using the Reciprocal Rank Fusion method: -2025-01-17T14:53:25+01:00 +```python +# The second branch of our search pipeline also retrieves 25 documents, +# but uses the dense and sparse vectors, with their results combined +# using the Reciprocal Rank Fusion. +sparse_dense_rrf_prefetch = models.Prefetch( + prefetch=[ + models.Prefetch( + prefetch=[ + # The first prefetch operation retrieves 100 documents + # using dense vectors using integer data type. Retrieval + # is faster, but quality is lower. + models.Prefetch( + query=[7, 63, ..., 92], + using="dense-uint8", + limit=100, + ) + ], + # Integer-based embeddings are then re-ranked using the + # float-based embeddings. Here we just want to retrieve + # 25 documents. + query=[-1.234, 0.762, ..., 1.532], + using="dense", + limit=25, + ), + # Here we just add another 25 documents using the sparse + # vectors only. + models.Prefetch( + query=models.SparseVector( + indices=[125, 9325, 58214], + values=[-0.164, 0.229, 0.731], + ), + using="sparse", + limit=25, + ), + ], + # RRF is activated below, so there is no need to specify the + # query vector here, as fusion is done on the scores of the + # retrieved documents. + query=models.FusionQuery( + fusion=models.Fusion.RRF, + ), +) +``` -... +The second branch could have already been called hybrid, as it combines the results from the dense and sparse vectors +with fusion. However, nothing stops us from building even more complex search pipelines. - +Here is how the target call to the Query API would look like in Python: - -https://qdrant.tech/blog/case-study-voiceflow/ +```python +client.query_points( + "my-collection", + prefetch=[ + matryoshka_prefetch, + sparse_dense_rrf_prefetch, + ], + # Finally rerank the results with the late interaction model. 
It only + # considers the documents retrieved by all the prefetch operations above. + # Return 10 final results. + query=[ + [1.928, -0.654, ..., 0.213], + [-1.197, 0.583, ..., 1.901], + ..., + [0.112, -1.473, ..., 1.786], + ], + using="late-interaction", + with_payload=False, + limit=10, +) +``` -2024-12-10T10:26:56-08:00 +The options are endless, the new Query API gives you the flexibility to experiment with different setups. **You +rarely need to build such a complex search pipeline**, but it's good to know that you can do that if needed. -... + - +## Lessons learned: multi-vector representations - +Many of you have already started building hybrid search systems and reached out to us with questions and feedback. +We've seen many different approaches, however one recurring idea was to utilize **multi-vector representations with +ColBERT-style models as a reranking step**, after retrieving candidates with single-vector dense and/or sparse methods. +This reflects the latest trends in the field, as single-vector methods are still the most efficient, but multivectors +capture the nuances of the text better. -https://qdrant.tech/blog/facial-recognition/ +![Reranking with late interaction models](/articles_data/hybrid-search/late-interaction-reranking.png) -2024-12-03T20:56:40-08:00 +Assuming you never use late interaction models for retrieval alone, but only for reranking, this setup comes with a +hidden cost. By default, each configured dense vector of the collection will have a corresponding HNSW graph created. +Even, if it is a multi-vector. -... +```python +from qdrant_client import QdrantClient, models - +client = QdrantClient(...) +client.create_collection( + collection_name="my-collection", + vectors_config={ + "dense": models.VectorParams(...), + "late-interaction": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + ) + }, + sparse_vectors_config={ + "sparse": models.SparseVectorParams(...) + }, +) +``` - +Reranking will never use the created graph, as all the candidates are already retrieved. Multi-vector ranking will only +be applied to the candidates retrieved by the previous steps, so no search operation is needed. HNSW becomes redundant +while still the indexing process has to be performed, and in that case, it will be quite heavy. ColBERT-like models +create hundreds of embeddings for each document, so the overhead is significant. **To avoid it, you can disable the HNSW +graph creation for this kind of model**: -https://qdrant.tech/blog/colpali-qdrant-optimization/ +```python +client.create_collection( + collection_name="my-collection", + vectors_config={ + "dense": models.VectorParams(...), + "late-interaction": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + hnsw_config=models.HnswConfigDiff( + m=0, # Disable HNSW graph creation + ), + ) + }, + sparse_vectors_config={ + "sparse": models.SparseVectorParams(...) + }, +) +``` -2024-11-30T18:57:48-03:00 +You won't notice any difference in the search performance, but the use of resources will be significantly lower when you +upload the embeddings to the collection. -... +## Some anecdotal observations - +Neither of the algorithms performs best in all cases. In some cases, keyword-based search +will be the winner and vice-versa. 
The following table shows some interesting examples we could find in the
+[WANDS](https://github.com/wayfair/WANDS) dataset during experimentation:
+
+| Query                            | BM25 Search                                | Vector Search                           |
+|----------------------------------|--------------------------------------------|-----------------------------------------|
+| cybersport desk                  | desk ❌                                     | gaming desk ✅                           |
+| plates for icecream              | "eat" plates on wood wall décor ❌          | alicyn 8.5 '' melamine dessert plate ✅  |
+| kitchen table with a thick board | craft kitchen acacia wood cutting board ❌  | industrial solid wood dining table ✅    |
+| wooden bedside table             | 30 '' bedside table lamp ❌                 | portable bedside end table ✅            |

-https://qdrant.tech/blog/rag-evaluation-guide/
+Also examples where keyword-based search did better:

-2025-02-18T21:01:07+05:30
+
+| Query                   | BM25 Search                        | Vector Search            |
+|-------------------------|------------------------------------|--------------------------|
+| computer chair          | vibrant computer task chair ✅      | office chair ❌           |
+| 64.2 inch console table | cervantez 64.2 '' console table ✅  | 69.5 '' console table ❌  |
+ +## Try the New Query API in Qdrant 1.10 + +The new Query API introduced in Qdrant 1.10 is a game-changer for building hybrid search systems. You don't need any +additional services to combine the results from different search methods, and you can even create more complex pipelines +and serve them directly from Qdrant. -... +Our webinar on *Building the Ultimate Hybrid Search* takes you through the process of building a hybrid search system +with Qdrant Query API. If you missed it, you can [watch the recording](https://www.youtube.com/watch?v=LAZOxqzceEU), or +[check the notebooks](https://github.com/qdrant/workshop-ultimate-hybrid-search). -
+
- +If you have any questions or need help with building your hybrid search system, don't hesitate to reach out to us on +[Discord](https://qdrant.to/discord). -https://qdrant.tech/blog/case-study-qatech/ +<|page-31-lllmstxt|> +> Retrieval-augmented generation (RAG) integrates external information retrieval into the process of generating responses by Large Language Models (LLMs). It searches a database for information beyond its pre-trained knowledge base, significantly improving the accuracy and relevance of the generated responses. -2024-11-21T16:42:35-08:00 +Language models have exploded on the internet ever since ChatGPT came out, and rightfully so. They can write essays, code entire programs, and even make memes (though we’re still deciding on whether that's a good thing). -... +But as brilliant as these chatbots become, they still have **limitations** in tasks requiring external knowledge and factual information. Yes, it can describe the honeybee's waggle dance in excruciating detail. But they become far more valuable if they can generate insights from **any data** that we provide, rather than just their original training data. Since retraining those large language models from scratch costs millions of dollars and takes months, we need better ways to give our existing LLMs access to our custom data. - +While you could be more creative with your prompts, it is only a short-term solution. LLMs can consider only a **limited** amount of text in their responses, known as a [context window](https://www.hopsworks.ai/dictionary/context-window-for-llms). Some models like GPT-3 can see up to around 12 pages of text (that’s 4,096 tokens of context). That’s not good enough for most knowledge bases. - +![How a RAG works](/articles_data/what-is-rag-in-ai/how-rag-works.jpg) -https://qdrant.tech/blog/qdrant-colpali/ +The image above shows how a basic RAG system works. Before forwarding the question to the LLM, we have a layer that searches our knowledge base for the "relevant knowledge" to answer the user query. Specifically, in this case, the spending data from the last month. Our LLM can now generate a **relevant non-hallucinated** response about our budget. -2024-11-06T17:18:48-08:00 +As your data grows, you’ll need [efficient ways](https://qdrant.tech/rag/rag-evaluation-guide/) to identify the most relevant information for your LLM's limited memory. This is where you’ll want a proper way to store and retrieve the specific data you’ll need for your query, without needing the LLM to remember it. -... +**Vector databases** store information as **vector embeddings**. This format supports efficient similarity searches to retrieve relevant data for your query. For example, Qdrant is specifically designed to perform fast, even in scenarios dealing with billions of vectors. - +This article will focus on RAG systems and architecture. If you’re interested in learning more about vector search, we recommend the following articles: [What is a Vector Database?](/articles/what-is-a-vector-database/) and [What are Vector Embeddings?](/articles/what-are-embeddings/). - -https://qdrant.tech/blog/case-study-sprinklr/ +## RAG architecture -2024-10-18T09:03:19-07:00 +At its core, a RAG architecture includes the **retriever** and the **generator**. Let's start by understanding what each of these components does. -... - +### The Retriever - +When you ask a question to the retriever, it uses **similarity search** to scan through a vast knowledge base of vector embeddings. 
It then pulls out the most **relevant** vectors to help answer that query. There are a few different techniques it can use to know what’s relevant: -https://qdrant.tech/blog/qdrant-1.12.x/ -2024-10-08T19:49:58-07:00 +#### How indexing works in RAG retrievers -... +The indexing process organizes the data into your vector database in a way that makes it easily searchable. This allows the RAG to access relevant information when responding to a query. - +![How indexing works](/articles_data/what-is-rag-in-ai/how-indexing-works.jpg) - +As shown in the image above, here’s the process: -https://qdrant.tech/blog/qdrant-deeplearning-ai-course/ -2024-10-07T12:25:14-07:00 -... +* Start with a _loader_ that gathers _documents_ containing your data. These documents could be anything from articles and books to web pages and social media posts. +* Next, a _splitter_ divides the documents into smaller chunks, typically sentences or paragraphs. +* This is because RAG models work better with smaller pieces of text. In the diagram, these are _document snippets_. +* Each text chunk is then fed into an _embedding machine_. This machine uses complex algorithms to convert the text into [vector embeddings](/articles/what-are-embeddings/). - +All the generated vector embeddings are stored in a knowledge base of indexed information. This supports efficient retrieval of similar pieces of information when needed. - -https://qdrant.tech/blog/qdrant-for-startups-launch/ +#### Query vectorization -2024-10-02T19:07:16+05:30 +Once you have vectorized your knowledge base you can do the same to the user query. When the model sees a new query, it uses the same preprocessing and embedding techniques. This ensures that the query vector is compatible with the document vectors in the index. -... +![How retrieval works](/articles_data/what-is-rag-in-ai/how-retrieval-works.jpg) - +#### Retrieval of relevant documents - +When the system needs to find the most relevant documents or passages to answer a query, it utilizes vector similarity techniques. **Vector similarity** is a fundamental concept in machine learning and natural language processing (NLP) that quantifies the resemblance between vectors, which are mathematical representations of data points. -https://qdrant.tech/blog/case-study-shakudo/ +The system can employ different vector similarity strategies depending on the type of vectors used to represent the data: -2025-03-13T17:47:05+01:00 -... +##### Sparse vector representations - +A sparse vector is characterized by a high dimensionality, with most of its elements being zero. - +The classic approach is **keyword search**, which scans documents for the exact words or phrases in the query. The search creates sparse vector representations of documents by counting word occurrences and inversely weighting common words. Queries with rarer words get prioritized. -https://qdrant.tech/blog/qdrant-relari/ -2024-09-17T15:53:48-07:00 +![Sparse vector representation](/articles_data/what-is-rag-in-ai/sparse-vectors.jpg) -... - +[TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) (Term Frequency-Inverse Document Frequency) and [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) are two classic related algorithms. They're simple and computationally efficient. However, they can struggle with synonyms and don't always capture semantic similarities. - +If you’re interested in going deeper, refer to our article on [Sparse Vectors](/articles/sparse-vectors/). 
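+
+As a rough illustration (the indices and weights below are made up), a sparse vector stores only its
+non-zero dimensions, typically a vocabulary position and a TF-IDF/BM25-style weight per term:
+
+```python
+from qdrant_client import models
+
+# Only the non-zero dimensions are kept: vocabulary positions of the terms
+# and their weights (e.g. produced by a BM25-style scoring model).
+sparse_vector = models.SparseVector(
+    indices=[6, 912, 4350],
+    values=[1.75, 0.42, 2.31],
+)
+```
+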
-https://qdrant.tech/blog/case-study-nyris/ -2024-09-23T14:05:33-07:00 +##### Dense vector embeddings -... +This approach uses large language models like [BERT](https://en.wikipedia.org/wiki/BERT_(language_model)) to encode the query and passages into dense vector embeddings. These models are compact numerical representations that capture semantic meaning. Vector databases like Qdrant store these embeddings, allowing retrieval based on **semantic similarity** rather than just keywords using distance metrics like cosine similarity. - +This allows the retriever to match based on semantic understanding rather than just keywords. So if I ask about "compounds that cause BO," it can retrieve relevant info about "molecules that create body odor" even if those exact words weren't used. We explain more about it in our [What are Vector Embeddings](/articles/what-are-embeddings/) article. - -https://qdrant.tech/blog/case-study-kern/ +#### Hybrid search -2024-09-23T14:05:33-07:00 +However, neither keyword search nor vector search are always perfect. Keyword search may miss relevant information expressed differently, while vector search can sometimes struggle with specificity or neglect important statistical word patterns. Hybrid methods aim to combine the strengths of different techniques. -... - +![Hybrid search overview](/articles_data/what-is-rag-in-ai/hybrid-search.jpg) - -https://qdrant.tech/blog/qdrant-1.11.x/ +Some common hybrid approaches include: -2024-08-16T00:01:23+02:00 -... - +* Using keyword search to get an initial set of candidate documents. Next, the documents are re-ranked/re-scored using semantic vector representations. +* Starting with semantic vectors to find generally topically relevant documents. Next, the documents are filtered/re-ranked e based on keyword matches or other metadata. +* Considering both semantic vector closeness and statistical keyword patterns/weights in a combined scoring model. +* Having multiple stages were different techniques. One example: start with an initial keyword retrieval, followed by semantic re-ranking, then a final re-ranking using even more complex models. - +When you combine the powers of different search methods in a complementary way, you can provide higher quality, more comprehensive results. Check out our article on [Hybrid Search](/articles/hybrid-search/) if you’d like to learn more. -https://qdrant.tech/blog/case-study-kairoswealth/ -2024-09-11T14:59:00-07:00 +### The Generator -... +With the top relevant passages retrieved, it's now the generator's job to produce a final answer by synthesizing and expressing that information in natural language. - +The LLM is typically a model like GPT, BART or T5, trained on massive datasets to understand and generate human-like text. It now takes not only the query (or question) as input but also the relevant documents or passages that the retriever identified as potentially containing the answer to generate its response. - -https://qdrant.tech/blog/qdrant-1.10.x/ +![How a Generator works](/articles_data/what-is-rag-in-ai/how-generation-works.png) -2024-07-16T22:00:30+05:30 -... +The retriever and generator don't operate in isolation. The image bellow shows how the output of the retrieval feeds the generator to produce the final generated response. - - +![The entire architecture of a RAG system](/articles_data/what-is-rag-in-ai/rag-system.jpg) -https://qdrant.tech/blog/community-highlights-1/ -2024-06-21T02:34:01-03:00 +## Where is RAG being used? -... 
+Because of their more knowledgeable and contextual responses, we can find RAG models being applied in many areas today, especially those who need factual accuracy and knowledge depth. - - +### Real-World Applications: -https://qdrant.tech/blog/cve-2024-3829-response/ +**Question answering:** This is perhaps the most prominent use case for RAG models. They power advanced question-answering systems that can retrieve relevant information from large knowledge bases and then generate fluent answers. -2024-06-10T12:42:49-04:00 +**Language generation:** RAG enables more factual and contextualized text generation for contextualized text summarization from multiple sources -... +**Data-to-text generation:** By retrieving relevant structured data, RAG models can generate product/business intelligence reports from databases or describing insights from data visualizations and charts - +**Multimedia understanding:** RAG isn't limited to text - it can retrieve multimodal information like images, video, and audio to enhance understanding. Answering questions about images/videos by retrieving relevant textual context. - -https://qdrant.tech/blog/qdrant-soc2-type2-audit/ +## Creating your first RAG chatbot with Langchain, Groq, and OpenAI -2024-08-29T19:19:43+05:30 +Are you ready to create your own RAG chatbot from the ground up? We have a video explaining everything from the beginning. Daniel Romero’s will guide you through: -... - - +* Setting up your chatbot +* Preprocessing and organizing data for your chatbot's use +* Applying vector similarity search algorithms +* Enhancing the efficiency and response quality -https://qdrant.tech/blog/qdrant-stars-announcement/ +After building your RAG chatbot, you'll be able to [evaluate its performance](https://qdrant.tech/rag/rag-evaluation-guide/) against that of a chatbot powered solely by a Large Language Model (LLM). -2024-10-05T03:39:41+05:30 +
-... -
- +## What’s next? -https://qdrant.tech/blog/qdrant-cpu-intel-benchmark/ +Have a RAG project you want to bring to life? Join our [Discord community](https://discord.gg/qdrant) where we’re always sharing tips and answering questions on vector search and retrieval. -2024-10-08T12:41:46-07:00 +Learn more about how to properly evaluate your RAG responses: [Evaluating Retrieval Augmented Generation - a framework for assessment](https://superlinked.com/vectorhub/evaluating-retrieval-augmented-generation-a-framework-for-assessment). -... +<|page-32-lllmstxt|> + - - +For the last 40 years, BM25 has served as the standard for search engines. +It is a simple yet powerful algorithm that has been used by many search engines, including Google, Bing, and Yahoo. -https://qdrant.tech/blog/qsoc24-interns-announcement/ +Though it seemed that the advent of vector search would diminish its influence, it did so only partially. +The current state-of-the-art approach to retrieval nowadays tries to incorporate BM25 along with embeddings into a hybrid search system. -2024-05-08T18:04:46-03:00 +However, the use case of text retrieval has significantly shifted since the introduction of RAG. +Many assumptions upon which BM25 was built are no longer valid. -... +For example, the typical length of documents and queries vary significantly between traditional web search and modern RAG systems. - +In this article, we will recap what made BM25 relevant for so long and why alternatives have struggled to replace it. Finally, we will discuss BM42, as the next step in the evolution of lexical search. - +## Why has BM25 stayed relevant for so long? -https://qdrant.tech/articles/semantic-cache-ai-data-retrieval/ +To understand why, we need to analyze its components. -2024-12-20T13:10:51+01:00 +The famous BM25 formula is defined as: -... +$$ +\text{score}(D,Q) = \sum_{i=1}^{N} \text{IDF}(q_i) \times \frac{f(q_i, D) \cdot (k_1 + 1)}{f(q_i, D) + k_1 \cdot \left(1 - b + b \cdot \frac{|D|}{\text{avgdl}}\right)} +$$ - +Let's simplify this to gain a better understanding. - +- The $score(D, Q)$ - means that we compute the score for each pair of document $D$ and query $Q$. -https://qdrant.tech/blog/are-you-vendor-locked/ +- The $\sum_{i=1}^{N}$ - means that each of $N$ terms in the query contribute to the final score as a part of the sum. -2024-05-21T10:11:09+02:00 +- The $\text{IDF}(q_i)$ - is the inverse document frequency. The more rare the term $q_i$ is, the more it contributes to the score. A simplified formula for this is: -... +$$ +\text{IDF}(q_i) = \frac{\text{Number of documents}}{\text{Number of documents with } q_i} +$$ - +It is fair to say that the `IDF` is the most important part of the BM25 formula. +`IDF` selects the most important terms in the query relative to the specific document collection. +So intuitively, we can interpret the `IDF` as **term importance within the corpora**. - +That explains why BM25 is so good at handling queries, which dense embeddings consider out-of-domain. -https://qdrant.tech/blog/case-study-visua/ +The last component of the formula can be intuitively interpreted as **term importance within the document**. +This might look a bit complicated, so let's break it down. 
-2024-05-01T17:59:13-07:00 +$$ +\text{Term importance in document }(q_i) = \color{red}\frac{f(q_i, D)\color{gray} \cdot \color{blue}(k_1 + 1) \color{gray} }{\color{red}f(q_i, D)\color{gray} + \color{blue}k_1\color{gray} \cdot \left(1 - \color{blue}b\color{gray} + \color{blue}b\color{gray} \cdot \frac{|D|}{\text{avgdl}}\right)} +$$ -... +- The $\color{red}f(q_i, D)\color{gray}$ - is the frequency of the term $q_i$ in the document $D$. Or in other words, the number of times the term $q_i$ appears in the document $D$. +- The $\color{blue}k_1\color{gray}$ and $\color{blue}b\color{gray}$ are the hyperparameters of the BM25 formula. In most implementations, they are constants set to $k_1=1.5$ and $b=0.75$. Those constants define relative implications of the term frequency and the document length in the formula. +- The $\frac{|D|}{\text{avgdl}}$ - is the relative length of the document $D$ compared to the average document length in the corpora. The intuition befind this part is following: if the token is found in the smaller document, it is more likely that this token is important for this document. - +#### Will BM25 term importance in the document work for RAG? - +As we can see, the *term importance in the document* heavily depends on the statistics within the document. Moreover, statistics works well if the document is long enough. +Therefore, it is suitable for searching webpages, books, articles, etc. -https://qdrant.tech/blog/qdrant-1.9.x/ +However, would it work as well for modern search applications, such as RAG? Let's see. -2024-05-21T10:11:09+02:00 +The typical length of a document in RAG is much shorter than that of web search. In fact, even if we are working with webpages and articles, we would prefer to split them into chunks so that +a) Dense models can handle them and +b) We can pinpoint the exact part of the document which is relevant to the query -... +As a result, the document size in RAG is small and fixed. - +That effectively renders the term importance in the document part of the BM25 formula useless. +The term frequency in the document is always 0 or 1, and the relative length of the document is always 1. - +So, the only part of the BM25 formula that is still relevant for RAG is `IDF`. Let's see how we can leverage it. -https://qdrant.tech/blog/hybrid-cloud-launch-partners/ +## Why SPLADE is not always the answer -2024-05-21T10:11:09+02:00 +Before discussing our new approach, let's examine the current state-of-the-art alternative to BM25 - SPLADE. -... +The idea behind SPLADE is interesting—what if we let a smart, end-to-end trained model generate a bag-of-words representation of the text for us? +It will assign all the weights to the tokens, so we won't need to bother with statistics and hyperparameters. +The documents are then represented as a sparse embedding, where each token is represented as an element of the sparse vector. - +And it works in academic benchmarks. Many papers report that SPLADE outperforms BM25 in terms of retrieval quality. +This performance, however, comes at a cost. - +* **Inappropriate Tokenizer**: To incorporate transformers for this task, SPLADE models require using a standard transformer tokenizer. These tokenizers are not designed for retrieval tasks. For example, if the word is not in the (quite limited) vocabulary, it will be either split into subwords or replaced with a `[UNK]` token. This behavior works well for language modeling but is completely destructive for retrieval tasks. 
-https://qdrant.tech/blog/hybrid-cloud/ +* **Expensive Token Expansion**: In order to compensate the tokenization issues, SPLADE uses *token expansion* technique. This means that we generate a set of similar tokens for each token in the query. There are a few problems with this approach: + - It is computationally and memory expensive. We need to generate more values for each token in the document, which increases both the storage size and retrieval time. + - It is not always clear where to stop with the token expansion. The more tokens we generate, the more likely we are to get the relevant one. But simultaneously, the more tokens we generate, the more likely we are to get irrelevant results. + - Token expansion dilutes the interpretability of the search. We can't say which tokens were used in the document and which were generated by the token expansion. -2024-05-21T10:11:09+02:00 +* **Domain and Language Dependency**: SPLADE models are trained on specific corpora. This means that they are not always generalizable to new or rare domains. As they don't use any statistics from the corpora, they cannot adapt to the new domain without fine-tuning. -... +* **Inference Time**: Additionally, currently available SPLADE models are quite big and slow. They usually require a GPU to make the inference in a reasonable time. - +At Qdrant, we acknowledge the aforementioned problems and are looking for a solution. +Our idea was to combine the best of both worlds - the simplicity and interpretability of BM25 and the intelligence of transformers while avoiding the pitfalls of SPLADE. - +And here is what we came up with. -https://qdrant.tech/blog/rag-advancements-challenges/ +## The best of both worlds -2024-04-12T14:45:02+00:00 +As previously mentioned, `IDF` is the most important part of the BM25 formula. In fact it is so important, that we decided to build its calculation into the Qdrant engine itself. +Check out our latest [release notes](https://github.com/qdrant/qdrant/releases/tag/v1.10.0). This type of separation allows streaming updates of the sparse embeddings while keeping the `IDF` calculation up-to-date. -... +As for the second part of the formula, *the term importance within the document* needs to be rethought. - +Since we can't rely on the statistics within the document, we can try to use the semantics of the document instead. +And semantics is what transformers are good at. Therefore, we only need to solve two problems: - +- How does one extract the importance information from the transformer? +- How can tokenization issues be avoided? -https://qdrant.tech/blog/building-search-rag-open-api/ -2024-04-12T14:23:42+00:00 +### Attention is all you need -... +Transformer models, even those used to generate embeddings, generate a bunch of different outputs. +Some of those outputs are used to generate embeddings. - +Others are used to solve other kinds of tasks, such as classification, text generation, etc. - +The one particularly interesting output for us is the attention matrix. -https://qdrant.tech/blog/gen-ai-and-vector-search/ +{{< figure src="/articles_data/bm42/attention-matrix.png" alt="Attention matrix" caption="Attention matrix" width="60%" >}} -2024-07-07T19:32:50-07:00 +The attention matrix is a square matrix, where each row and column corresponds to the token in the input sequence. +It represents the importance of each token in the input sequence for each other. -... 
+The classical transformer models are trained to predict masked tokens in the context, so the attention weights define which context tokens influence the masked token most. - +Apart from regular text tokens, the transformer model also has a special token called `[CLS]`. This token represents the whole sequence in the classification tasks, which is exactly what we need. - +By looking at the attention row for the `[CLS]` token, we can get the importance of each token in the document for the whole document. -https://qdrant.tech/blog/teaching-vector-db-at-scale/ -2024-04-09T11:06:17+00:00 +```python +sentences = "Hello, World - is the starting point in most programming languages" -... +features = transformer.tokenize(sentences) - +# ... - +attentions = transformer.auto_model(**features, output_attentions=True).attentions -https://qdrant.tech/blog/meow-with-cheshire-cat/ +weights = torch.mean(attentions[-1][0,:,0], axis=0) +# â–Č â–Č â–Č â–Č +# │ │ │ └─── [CLS] token is the first one +# │ │ └─────── First item of the batch +# │ └────────── Last transformer layer +# └────────────────────────── Average all 6 attention heads -2024-04-09T11:05:51+00:00 +for weight, token in zip(weights, tokens): + print(f"{token}: {weight}") -... +# [CLS] : 0.434 // Filter out the [CLS] token +# hello : 0.039 +# , : 0.039 +# world : 0.107 // <-- The most important token +# - : 0.033 +# is : 0.024 +# the : 0.031 +# starting : 0.054 +# point : 0.028 +# in : 0.018 +# most : 0.016 +# programming : 0.060 // <-- The third most important token +# languages : 0.062 // <-- The second most important token +# [SEP] : 0.047 // Filter out the [SEP] token - +``` - -https://qdrant.tech/blog/cve-2024-2221-response/ +The resulting formula for the BM42 score would look like this: -2024-08-15T17:31:04+02:00 +$$ +\text{score}(D,Q) = \sum_{i=1}^{N} \text{IDF}(q_i) \times \text{Attention}(\text{CLS}, q_i) +$$ -... - +Note that classical transformers have multiple attention heads, so we can get multiple importance vectors for the same document. The simplest way to combine them is to simply average them. - +These averaged attention vectors make up the importance information we were looking for. +The best part is, one can get them from any transformer model, without any additional training. +Therefore, BM42 can support any natural language as long as there is a transformer model for it. -https://qdrant.tech/blog/fastllm-announcement/ +In our implementation, we use the `sentence-transformers/all-MiniLM-L6-v2` model, which gives a huge boost in the inference speed compared to the SPLADE models. In practice, any transformer model can be used. +It doesn't require any additional training, and can be easily adapted to work as BM42 backend. -2024-04-01T04:13:26-07:00 -... +### WordPiece retokenization - +The final piece of the puzzle we need to solve is the tokenization issue. In order to get attention vectors, we need to use native transformer tokenization. +But this tokenization is not suitable for the retrieval tasks. What can we do about it? - +Actually, the solution we came up with is quite simple. We reverse the tokenization process after we get the attention vectors. -https://qdrant.tech/blog/virtualbrain-best-rag/ +Transformers use [WordPiece](https://huggingface.co/learn/nlp-course/en/chapter6/6) tokenization. +In case it sees the word, which is not in the vocabulary, it splits it into subwords. -2024-09-20T10:12:14-04:00 +Here is how that looks: -... 
+```text +"unbelievable" -> ["un", "##believ", "##able"] +``` - +What can merge the subwords back into the words. Luckily, the subwords are marked with the `##` prefix, so we can easily detect them. +Since the attention weights are normalized, we can simply sum the attention weights of the subwords to get the attention weight of the word. - +After that, we can apply the same traditional NLP techniques, as -https://qdrant.tech/blog/youtube-without-paying-cent/ +- Removing of the stop-words +- Removing of the punctuation +- Lemmatization -2024-03-27T12:44:32+00:00 +In this way, we can significantly reduce the number of tokens, and therefore minimize the memory footprint of the sparse embeddings. We won't simultaneously compromise the ability to match (almost) exact tokens. -... +## Practical examples - - +| Trait | BM25 | SPLADE | BM42 | +|-------------------------|--------------|--------------|--------------| +| Interpretability | High ✅ | Ok 🆗 | High ✅ | +| Document Inference speed| Very high ✅ | Slow 🐌 | High ✅ | +| Query Inference speed | Very high ✅ | Slow 🐌 | Very high ✅ | +| Memory footprint | Low ✅ | High ❌ | Low ✅ | +| In-domain accuracy | Ok 🆗 | High ✅ | High ✅ | +| Out-of-domain accuracy | Ok 🆗 | Low ❌ | Ok 🆗 | +| Small documents accuracy| Low ❌ | High ✅ | High ✅ | +| Large documents accuracy| High ✅ | Low ❌ | Ok 🆗 | +| Unknown tokens handling | Yes ✅ | Bad ❌ | Yes ✅ | +| Multi-lingual support | Yes ✅ | No ❌ | Yes ✅ | +| Best Match | Yes ✅ | No ❌ | Yes ✅ | -https://qdrant.tech/blog/azure-marketplace/ -2024-10-05T03:39:41+05:30 +Starting from Qdrant v1.10.0, BM42 can be used in Qdrant via FastEmbed inference. -... +Let's see how you can setup a collection for hybrid search with BM42 and [jina.ai](https://jina.ai/embeddings/) dense embeddings. - +```http +PUT collections/my-hybrid-collection +{ + "vectors": { + "jina": { + "size": 768, + "distance": "Cosine" + } + }, + "sparse_vectors": { + "bm42": { + "modifier": "idf" // <--- This parameter enables the IDF calculation + } + } +} +``` - +```python +from qdrant_client import QdrantClient, models -https://qdrant.tech/blog/real-time-news-distillation-rag/ +client = QdrantClient() -2024-03-25T08:49:27+00:00 +client.create_collection( + collection_name="my-hybrid-collection", + vectors_config={ + "jina": models.VectorParams( + size=768, + distance=models.Distance.COSINE, + ) + }, + sparse_vectors_config={ + "bm42": models.SparseVectorParams( + modifier=models.Modifier.IDF, + ) + } +) +``` -... +The search query will retrieve the documents with both dense and sparse embeddings and combine the scores +using the Reciprocal Rank Fusion (RRF) algorithm. - +```python +from fastembed import SparseTextEmbedding, TextEmbedding - +query_text = "best programming language for beginners?" -https://qdrant.tech/blog/insight-generation-platform/ +model_bm42 = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions") +model_jina = TextEmbedding(model_name="jinaai/jina-embeddings-v2-base-en") -2024-03-25T08:51:56+00:00 +sparse_embedding = list(model_bm42.query_embed(query_text))[0] +dense_embedding = list(model_jina.query_embed(query_text))[0] -... 
+client.query_points(
+    collection_name="my-hybrid-collection",
+    prefetch=[
+        models.Prefetch(query=sparse_embedding.as_object(), using="bm42", limit=10),
+        models.Prefetch(query=dense_embedding.tolist(), using="jina", limit=10),
+    ],
+    query=models.FusionQuery(fusion=models.Fusion.RRF), # <--- Combine the scores
+    limit=10
+)
-
+```
-
+### Benchmarks
-https://qdrant.tech/blog/llm-as-a-judge/
+To prove the point further, we have conducted some benchmarks to highlight the cases where BM42 outperforms BM25.
+Please note that we didn't intend to make an exhaustive evaluation, as we are presenting a new approach, not a new model.
-2024-03-19T15:05:24+00:00
+For our experiments, we chose the [quora](https://huggingface.co/datasets/BeIR/quora) dataset, which represents a question-deduplication task ~~the Question-Answering task~~.
-...
-
+A typical example from the dataset is the following:
-
+```text
+{"_id": "109", "text": "How GST affects the CAs and tax officers?"}
+{"_id": "110", "text": "Why can't I do my homework?"}
+{"_id": "111", "text": "How difficult is it get into RSI?"}
+```
-https://qdrant.tech/blog/vector-search-vector-recommendation/
+As you can see, the texts are pretty short, so there are not many statistics to rely on.
-2024-03-19T14:22:15+00:00
+After encoding with BM42, the average vector size is only **5.6 elements per document**.
-...
+With `datatype: uint8` available in Qdrant, the total size of the sparse vector index is about **13MB** for ~530k documents.
-
+As a reference point, we use:
-
+- BM25 with tantivy
+- the [sparse vector BM25 implementation](https://github.com/qdrant/bm42_eval/blob/master/index_bm25_qdrant.py) with the same preprocessing pipeline as for BM42: tokenization, stop-words removal, and lemmatization
-https://qdrant.tech/blog/using-qdrant-and-langchain/
+|                      | BM25 (tantivy)    | BM25 (Sparse) | BM42     |
+|----------------------|-------------------|---------------|----------|
+| ~~Precision @ 10~~ * | ~~0.45~~          | ~~0.45~~      | ~~0.49~~ |
+| Recall @ 10          | ~~0.71~~ **0.89** | 0.83          | 0.85     |
-2024-05-15T18:01:28+02:00
-...
+ \* - values were corrected after the publication due to a mistake in the evaluation script.
-
+
-
+To make our benchmarks transparent, we have published the scripts we used for the evaluation: see the [github repo](https://github.com/qdrant/bm42_eval).
-https://qdrant.tech/blog/iris-agent-qdrant/
-2024-03-06T09:17:19-08:00
+Please note that neither BM25 nor BM42 works well on its own in a production environment.
+Best results are achieved with a combination of sparse and dense embeddings in a hybrid approach.
+In this scenario, the two models are complementary to each other.
+The sparse model is responsible for exact token matching, while the dense model is responsible for semantic matching.
-...
+Some more advanced models might outperform the default `sentence-transformers/all-MiniLM-L6-v2` model we were using.
+We encourage developers involved in training embedding models to include a way to extract attention weights and contribute to the BM42 backend.
-
+## Fostering curiosity and experimentation
-
+Despite all of its advantages, BM42 is not always a silver bullet.
+For large documents without chunks, BM25 might still be a better choice.
-https://qdrant.tech/blog/case-study-dailymotion/
+There might be a smarter way to extract the importance information from the transformer. There could be a better method to weigh IDF against attention scores.
-2024-03-07T20:31:05+01:00
+Qdrant does not specialize in model training.
Our core project is the search engine itself. However, we understand that we are not operating in a vacuum. By introducing BM42, we are stepping up to empower our community with novel tools for experimentation. -... +We truly believe that the sparse vectors method is at exact level of abstraction to yield both powerful and flexible results. - +Many of you are sharing your recent Qdrant projects in our [Discord channel](https://discord.com/invite/qdrant). Feel free to try out BM42 and let us know what you come up with. - +<|page-33-lllmstxt|> +# Unlocking Next-Level Search: Exploring Qdrant 1.8.0's Advanced Search Capabilities -https://qdrant.tech/blog/comparing-qdrant-vs-pinecone-vector-databases/ +[Qdrant 1.8.0 is out!](https://github.com/qdrant/qdrant/releases/tag/v1.8.0). +This time around, we have focused on Qdrant's internals. Our goal was to optimize performance so that your existing setup can run faster and save on compute. Here is what we've been up to: -2025-02-04T13:55:26+01:00 +- **Faster [sparse vectors](https://qdrant.tech/articles/sparse-vectors/):** [Hybrid search](https://qdrant.tech/articles/hybrid-search/) is up to 16x faster now! +- **CPU resource management:** You can allocate CPU threads for faster indexing. +- **Better indexing performance:** We optimized text [indexing](https://qdrant.tech/documentation/concepts/indexing/) on the backend. -... +## Faster search with sparse vectors - +Search throughput is now up to 16 times faster for sparse vectors. If you are [using Qdrant for hybrid search](/articles/sparse-vectors/), this means that you can now handle up to sixteen times as many queries. This improvement comes from extensive backend optimizations aimed at increasing efficiency and capacity. - +What this means for your setup: -https://qdrant.tech/blog/what-is-vector-similarity/ +- **Query speed:** The time it takes to run a search query has been significantly reduced. +- **Search capacity:** Qdrant can now handle a much larger volume of search requests. +- **User experience:** Results will appear faster, leading to a smoother experience for the user. +- **Scalability:** You can easily accommodate rapidly growing users or an expanding dataset. -2024-09-05T13:07:07-07:00 +### Sparse vectors benchmark -... +Performance results are publicly available for you to test. Qdrant's R&D developed a dedicated [open-source benchmarking tool](https://github.com/qdrant/sparse-vectors-benchmark) just to test sparse vector performance. - +A real-life simulation of sparse vector queries was run against the [NeurIPS 2023 dataset](https://big-ann-benchmarks.com/neurips23.html). All tests were done on an 8 CPU machine on Azure. - +Latency (y-axis) has dropped significantly for queries. You can see the before/after here: -https://qdrant.tech/blog/dspy-vs-langchain/ +![dropping latency](/articles_data/qdrant-1.8.x/benchmark.png) +**Figure 1:** Dropping latency in sparse vector search queries across versions 1.7-1.8. -2025-05-15T19:37:07+05:30 +The colors within both scatter plots show the frequency of results. The red dots show that the highest concentration is around 2200ms (before) and 135ms (after). This tells us that latency for sparse vector queries dropped by about a factor of 16. Therefore, the time it takes to retrieve an answer with Qdrant is that much shorter. -... +This performance increase can have a dramatic effect on hybrid search implementations. 
[Read more about how to set this up.](/articles/sparse-vectors/) - +FYI, sparse vectors were released in [Qdrant v.1.7.0](/articles/qdrant-1.7.x/#sparse-vectors). They are stored using a different index, so first [check out the documentation](/documentation/concepts/indexing/#sparse-vector-index) if you want to try an implementation. - +## CPU resource management -https://qdrant.tech/blog/qdrant-summer-of-code-24/ +Indexing is Qdrant’s most resource-intensive process. Now you can account for this by allocating compute use specifically to indexing. You can assign a number CPU resources towards indexing and leave the rest for search. As a result, indexes will build faster, and search quality will remain unaffected. -2024-03-14T18:24:32+01:00 +This isn't mandatory, as Qdrant is by default tuned to strike the right balance between indexing and search. However, if you wish to define specific CPU usage, you will need to do so from `config.yaml`. -... +This version introduces a `optimizer_cpu_budget` parameter to control the maximum number of CPUs used for indexing. - +> Read more about `config.yaml` in the [configuration file](/documentation/guides/configuration/). - +```yaml +# CPU budget, how many CPUs (threads) to allocate for an optimization job. +optimizer_cpu_budget: 0 +``` -https://qdrant.tech/blog/dust-and-qdrant/ +- If left at 0, Qdrant will keep 1 or more CPUs unallocated - depending on CPU size. +- If the setting is positive, Qdrant will use this exact number of CPUs for indexing. +- If the setting is negative, Qdrant will subtract this number of CPUs from the available CPUs for indexing. -2024-09-20T10:19:38-04:00 +For most users, the default `optimizer_cpu_budget` setting will work well. We only recommend you use this if your indexing load is significant. -... +Our backend leverages dynamic CPU saturation to increase indexing speed. For that reason, the impact on search query performance ends up being minimal. Ultimately, you will be able to strike the best possible balance between indexing times and search performance. - +This configuration can be done at any time, but it requires a restart of Qdrant. Changing it affects both existing and new collections. - +> **Note:** This feature is not configurable on [Qdrant Cloud](https://qdrant.to/cloud). -https://qdrant.tech/blog/bitter-lesson-generative-language-model/ +## Better indexing for text data -2024-01-29T16:31:02+00:00 +In order to [minimize your RAM expenditure](https://qdrant.tech/articles/memory-consumption/), we have developed a new way to index specific types of data. Please keep in mind that this is a backend improvement, and you won't need to configure anything. -... +> Going forward, if you are indexing immutable text fields, we estimate a 10% reduction in RAM loads. Our benchmark result is based on a system that uses 64GB of RAM. If you are using less RAM, this reduction might be higher than 10%. - +Immutable text fields are static and do not change once they are added to Qdrant. These entries usually represent some type of attribute, description or tag. Vectors associated with them can be indexed more efficiently, since you don’t need to re-index them anymore. Conversely, mutable fields are dynamic and can be modified after their initial creation. Please keep in mind that they will continue to require additional RAM. - +This approach ensures stability in the [vector search](https://qdrant.tech/documentation/overview/vector-search/) index, with faster and more consistent operations. 
We achieved this by setting up a field index which helps minimize what is stored. To improve search performance we have also optimized the way we load documents for searches with a text field index. Now our backend loads documents mostly sequentially and in increasing order. -https://qdrant.tech/blog/indexify-content-extraction-engine/ -2024-03-07T18:59:29+00:00 +## Minor improvements and new features -... +Beyond these enhancements, [Qdrant v1.8.0](https://github.com/qdrant/qdrant/releases/tag/v1.8.0) adds and improves on several smaller features: - +1. **Order points by payload:** In addition to searching for semantic results, you might want to retrieve results by specific metadata (such as price). You can now use Scroll API to [order points by payload key](/documentation/concepts/points/#order-points-by-payload-key). +2. **Datetime support:** We have implemented [datetime support for the payload index](/documentation/concepts/filtering/#datetime-range). Prior to this, if you wanted to search for a specific datetime range, you would have had to convert dates to UNIX timestamps. ([PR#3320](https://github.com/qdrant/qdrant/issues/3320)) +3. **Check collection existence:** You can check whether a collection exists via the `/exists` endpoint to the `/collections/{collection_name}`. You will get a true/false response. ([PR#3472](https://github.com/qdrant/qdrant/pull/3472)). +4. **Find points** whose payloads match more than the minimal amount of conditions. We included the `min_should` match feature for a condition to be `true` ([PR#3331](https://github.com/qdrant/qdrant/pull/3466/)). +5. **Modify nested fields:** We have improved the `set_payload` API, adding the ability to update nested fields ([PR#3548](https://github.com/qdrant/qdrant/pull/3548)). - +## Experience the Power of Qdrant 1.8.0 -https://qdrant.tech/blog/qdrant-x-dust-vector-search/ +Ready to experience the enhanced performance of Qdrant 1.8.0? Upgrade now and explore the major improvements, from faster sparse vectors to optimized CPU resource management and better indexing for text data. Take your search capabilities to the next level with Qdrant's latest version. [Try a demo today](https://qdrant.tech/demo/) and see the difference firsthand! -2024-07-07T19:40:44-07:00 +## Release notes -... +For more information, see [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.8.0). +Qdrant is an open-source project. We welcome your contributions; raise [issues](https://github.com/qdrant/qdrant/issues), or contribute via [pull requests](https://github.com/qdrant/qdrant/pulls)! - +<|page-34-lllmstxt|> +In today's fast-paced, information-rich world, AI is revolutionizing knowledge management. The systematic process of capturing, distributing, and effectively using knowledge within an organization is one of the fields in which AI provides exceptional value today. - +> The potential for AI-powered knowledge management increases when leveraging [Retrieval Augmented Generation (RAG)](https://qdrant.tech/rag/rag-evaluation-guide/), a methodology that enables LLMs to access a vast, diverse repository of factual information from knowledge stores, such as vector databases. -https://qdrant.tech/blog/series-a-funding-round/ +This process enhances the accuracy, relevance, and reliability of generated text, thereby mitigating the risk of faulty, incorrect, or nonsensical results sometimes associated with traditional LLMs. 
This method not only ensures that the answers are contextually relevant but also up-to-date, reflecting the latest insights and data available. -2024-10-08T12:41:46-07:00 +While RAG enhances the accuracy, relevance, and reliability of traditional LLM solutions, **an evaluation strategy can further help teams ensure their AI products meet these benchmarks of success.** -... +## Relevant tools for this experiment - +In this article, we’ll break down a RAG Optimization workflow experiment that demonstrates that evaluation is essential to build a successful RAG strategy. We will use Qdrant and Quotient for this experiment. - +[Qdrant](https://qdrant.tech/) is a vector database and vector similarity search engine designed for efficient storage and retrieval of high-dimensional vectors. Because Qdrant offers efficient indexing and searching capabilities, it is ideal for implementing RAG solutions, where quickly and accurately retrieving relevant information from extremely large datasets is crucial. Qdrant also offers a wealth of additional features, such as quantization, multivector support and multi-tenancy. -https://qdrant.tech/blog/qdrant-cloud-on-microsoft-azure/ +Alongside Qdrant we will use Quotient, which provides a seamless way to evaluate your RAG implementation, accelerating and improving the experimentation process. -2024-03-07T20:31:05+01:00 +[Quotient](https://www.quotientai.co/) is a platform that provides tooling for AI developers to build [evaluation frameworks](https://qdrant.tech/rag/rag-evaluation-guide/) and conduct experiments on their products. Evaluation is how teams surface the shortcomings of their applications and improve performance in key benchmarks such as faithfulness, and semantic similarity. Iteration is key to building innovative AI products that will deliver value to end users. -... +> 💡 The [accompanying notebook](https://github.com/qdrant/qdrant-rag-eval/tree/master/workshop-rag-eval-qdrant-quotient) for this exercise can be found on GitHub for future reference. - +## Summary of key findings - +1. **Irrelevance and Hallucinations**: When the documents retrieved are irrelevant, evidenced by low scores in both Chunk Relevance and Context Relevance, the model is prone to generating inaccurate or fabricated information. +2. **Optimizing Document Retrieval**: By retrieving a greater number of documents and reducing the chunk size, we observed improved outcomes in the model's performance. +3. **Adaptive Retrieval Needs**: Certain queries may benefit from accessing more documents. Implementing a dynamic retrieval strategy that adjusts based on the query could enhance accuracy. +4. **Influence of Model and Prompt Variations**: Alterations in language models or the prompts used can significantly impact the quality of the generated responses, suggesting that fine-tuning these elements could optimize performance. -https://qdrant.tech/blog/qdrant-benchmarks-2024/ +Let us walk you through how we arrived at these findings! -2024-03-07T20:31:05+01:00 +## Building a RAG pipeline -... +To evaluate a RAG pipeline, we will have to build a RAG Pipeline first. In the interest of simplicity, we are building a Naive RAG in this article. There are certainly other versions of RAG : - +![shades_of_rag.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/shades_of_rag.png) - +The illustration below depicts how we can leverage a [RAG Evaluation framework](https://qdrant.tech/rag/rag-evaluation-guide/) to assess the quality of RAG Application. 
-https://qdrant.tech/blog/navigating-challenges-innovations/ +![qdrant_and_quotient.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/qdrant_and_quotient.png) -2024-05-21T09:57:56+02:00 +We are going to build a RAG application using Qdrant’s Documentation and the premeditated [hugging face dataset](https://huggingface.co/datasets/atitaarora/qdrant_doc). +We will then assess our RAG application’s ability to answer questions about Qdrant. -... +To prepare our knowledge store we will use Qdrant, which can be leveraged in 3 different ways as below : - +```python +client = qdrant_client.QdrantClient( + os.environ.get("QDRANT_URL"), + api_key=os.environ.get("QDRANT_API_KEY"), +) +``` - +We will be using [Qdrant Cloud](https://cloud.qdrant.io/login) so it is a good idea to provide the `QDRANT_URL` and `QDRANT_API_KEY` as environment variables for easier access. -https://qdrant.tech/blog/open-source-vector-search-engine-vector-database/ +Moving on, we will need to define the collection name as : -2024-07-07T19:36:05-07:00 +```python +COLLECTION_NAME = "qdrant-docs-quotient" +``` -... +In this case , we may need to create different collections based on the experiments we conduct. - +To help us provide seamless embedding creations throughout the experiment, we will use Qdrant’s own embeddings library [Fastembed](https://qdrant.github.io/fastembed/) which supports [many different models](https://qdrant.github.io/fastembed/examples/Supported_Models/) including dense as well as sparse vector models. - +Before implementing RAG, we need to prepare and index our data in Qdrant. -https://qdrant.tech/blog/vector-image-search-rag/ +This involves converting textual data into vectors using a suitable encoder (e.g., sentence transformers), and storing these vectors in Qdrant for retrieval. -2024-01-25T17:51:08+01:00 +```python +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.docstore.document import Document as LangchainDocument -... +## Load the dataset with qdrant documentation +dataset = load_dataset("atitaarora/qdrant_doc", split="train") - +## Dataset to langchain document +langchain_docs = [ + LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) + for doc in dataset +] - +len(langchain_docs) -https://qdrant.tech/blog/semantic-search-vector-database/ +#Outputs +#240 +``` -2024-07-07T19:46:08-07:00 +You can preview documents in the dataset as below : -... +```python +## Here's an example of what a document in our dataset looks like +print(dataset[100]['text']) - +``` - +## Evaluation dataset -https://qdrant.tech/blog/llm-complex-search-copilot/ +To measure the quality of our RAG setup, we will need a representative evaluation dataset. This dataset should contain realistic questions and the expected answers. -2024-01-10T11:42:02+00:00 +Additionally, including the expected contexts for which your RAG pipeline is designed to retrieve information would be beneficial. -... +We will be using a [prebuilt evaluation dataset](https://huggingface.co/datasets/atitaarora/qdrant_doc_qna). 
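+As a quick, illustrative sketch (not part of the original workshop code; we assume the questions live in an `input_text` column, which is how `eval_df` is used later in `run_eval`), the prebuilt dataset can be pulled into a pandas DataFrame like this:
+
+```python
+from datasets import load_dataset
+
+## Load the prebuilt evaluation dataset from Hugging Face
+eval_dataset = load_dataset("atitaarora/qdrant_doc_qna", split="train")
+
+## Convert it to a pandas DataFrame so retrieved context can be attached later;
+## the `run_eval` helper below expects the question text in an `input_text` column.
+eval_df = eval_dataset.to_pandas()
+print(eval_df.shape)
+```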
- +If you are struggling to make an evaluation dataset for your use case , you can use your documents and some techniques described in this [notebook](https://github.com/qdrant/qdrant-rag-eval/blob/master/synthetic_qna/notebook/Synthetic_question_generation.ipynb) - +### Building the RAG pipeline -https://qdrant.tech/blog/entity-matching-qdrant/ +We establish the data preprocessing parameters essential for the RAG pipeline and configure the Qdrant vector database according to the specified criteria. -2024-01-10T11:37:51+00:00 +Key parameters under consideration are: -... +- **Chunk size** +- **Chunk overlap** +- **Embedding model** +- **Number of documents retrieved (retrieval window)** - +Following the ingestion of data in Qdrant, we proceed to retrieve pertinent documents corresponding to each query. These documents are then seamlessly integrated into our evaluation dataset, enriching the contextual information within the designated **`context`** column to fulfil the evaluation aspect. - +Next we define methods to take care of logistics with respect to adding documents to Qdrant -https://qdrant.tech/blog/fast-embed-models/ +```python +import uuid -2024-01-22T10:15:56-08:00 +from qdrant_client import models -... +def add_documents(client, collection_name, chunk_size, chunk_overlap, embedding_model_name): + """ + This function adds documents to the desired Qdrant collection given the specified RAG parameters. + """ - + ## Processing each document with desired TEXT_SPLITTER_ALGO, CHUNK_SIZE, CHUNK_OVERLAP + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + add_start_index=True, + separators=["\n\n", "\n", ".", " ", ""], + ) - + docs_processed = [] + for doc in langchain_docs: + docs_processed += text_splitter.split_documents([doc]) -https://qdrant.tech/blog/human-language-ai-models/ + ## Processing documents to be encoded by Fastembed + docs_contents = [] + docs_metadatas = [] -2024-01-10T10:31:15+00:00 + for doc in docs_processed: + if hasattr(doc, 'page_content') and hasattr(doc, 'metadata'): + docs_contents.append(doc.page_content) + docs_metadatas.append(doc.metadata) + else: + # Handle the case where attributes are missing + print("Warning: Some documents do not have 'page_content' or 'metadata' attributes.") -... + print("processed: ", len(docs_processed)) + print("content: ", len(docs_contents)) + print("metadata: ", len(docs_metadatas)) - + if not client.collection_exists(collection_name): + client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE), + ) - + client.upsert( + collection_name=collection_name, + points=[ + models.PointStruct( + id=uuid.uuid4().hex, + vector=models.Document(text=content, model=embedding_model_name), + payload={"metadata": metadata, "document": content}, + ) + for metadata, content in zip(docs_metadatas, docs_contents) + ], + ) +``` -https://qdrant.tech/blog/binary-quantization/ +and retrieving documents from Qdrant during our RAG Pipeline assessment. -2024-01-10T10:26:06+00:00 +```python +def get_documents(collection_name, query, num_documents=3): + """ + This function retrieves the desired number of documents from the Qdrant collection given a query. + It returns a list of the retrieved documents. + """ + search_results = client.query_points( + collection_name=collection_name, + query=models.Document(text=query, model=embedding_model_name), + limit=num_documents, + ).points -... 
+ results = [r.payload["document"] for r in search_results] + return results +``` - +### Setting up Quotient - +You will need an account log in, which you can get by requesting access on [Quotient's website](https://www.quotientai.co/). Once you have an account, you can create an API key by running the `quotient authenticate` CLI command. -https://qdrant.tech/blog/qdrant-unstructured/ + -... +**Once you have your API key, make sure to set it as an environment variable called `QUOTIENT_API_KEY`** - +```python +# Import QuotientAI client and connect to QuotientAI +from quotientai.client import QuotientClient +from quotientai.utils import show_job_progress - +# IMPORTANT: be sure to set your API key as an environment variable called QUOTIENT_API_KEY +# You will need this set before running the code below. You may also uncomment the following line and insert your API key: +# os.environ['QUOTIENT_API_KEY'] = "YOUR_API_KEY" -https://qdrant.tech/blog/qdrant-n8n/ +quotient = QuotientClient() +``` -2024-03-07T20:31:05+01:00 +**QuotientAI** provides a seamless way to integrate *RAG evaluation* into your applications. Here, we'll see how to use it to evaluate text generated from an LLM, based on retrieved knowledge from the Qdrant vector database. -... +After retrieving the top similar documents and populating the `context` column, we can submit the evaluation dataset to Quotient and execute an evaluation job. To run a job, all you need is your evaluation dataset and a `recipe`. - +***A recipe is a combination of a prompt template and a specified LLM.*** - +**Quotient** orchestrates the evaluation run and handles version control and asset management throughout the experimentation process. -https://qdrant.tech/blog/vector-search-and-applications-record/ +***Prior to assessing our RAG solution, it's crucial to outline our optimization goals.*** -2024-09-06T13:14:12+02:00 +In the context of *question-answering on Qdrant documentation*, our focus extends beyond merely providing helpful responses. Ensuring the absence of any *inaccurate or misleading information* is paramount. -... +In other words, **we want to minimize hallucinations** in the LLM outputs. - +For our evaluation, we will be considering the following metrics, with a focus on **Faithfulness**: - +- **Context Relevance** +- **Chunk Relevance** +- **Faithfulness** +- **ROUGE-L** +- **BERT Sentence Similarity** +- **BERTScore** -https://qdrant.tech/blog/cohere-embedding-v3/ +### Evaluation in action -2024-09-06T13:14:12+02:00 +The function below takes an evaluation dataset as input, which in this case contains questions and their corresponding answers. It retrieves relevant documents based on the questions in the dataset and populates the context field with this information from Qdrant. The prepared dataset is then submitted to QuotientAI for evaluation for the chosen metrics. After the evaluation is complete, the function displays aggregated statistics on the evaluation metrics followed by the summarized evaluation results. -... +```python +def run_eval(eval_df, collection_name, recipe_id, num_docs=3, path="eval_dataset_qdrant_questions.csv"): + """ + This function evaluates the performance of a complete RAG pipeline on a given evaluation dataset. - + Given an evaluation dataset (containing questions and ground truth answers), + this function retrieves relevant documents, populates the context field, and submits the dataset to QuotientAI for evaluation. 
+ Once the evaluation is complete, aggregated statistics on the evaluation metrics are displayed. - + The evaluation results are returned as a pandas dataframe. + """ -https://qdrant.tech/blog/case-study-pienso/ + # Add context to each question by retrieving relevant documents + eval_df['documents'] = eval_df.apply(lambda x: get_documents(collection_name=collection_name, + query=x['input_text'], + num_documents=num_docs), axis=1) + eval_df['context'] = eval_df.apply(lambda x: "\n".join(x['documents']), axis=1) -2024-04-10T17:59:48-07:00 + # Now we'll save the eval_df to a CSV + eval_df.to_csv(path, index=False) -... + # Upload the eval dataset to QuotientAI + dataset = quotient.create_dataset( + file_path=path, + name="qdrant-questions-eval-v1", + ) - + # Create a new task for the dataset + task = quotient.create_task( + dataset_id=dataset['id'], + name='qdrant-questions-qa-v1', + task_type='question_answering' + ) - + # Run a job to evaluate the model + job = quotient.create_job( + task_id=task['id'], + recipe_id=recipe_id, + num_fewshot_examples=0, + limit=500, + metric_ids=[5, 7, 8, 11, 12, 13, 50], + ) -https://qdrant.tech/blog/case-study-bloop/ + # Show the progress of the job + show_job_progress(quotient, job['id']) -2024-07-18T19:11:22-07:00 + # Once the job is complete, we can get our results + data = quotient.get_eval_results(job_id=job['id']) -... + # Add the results to a pandas dataframe to get statistics on performance + df = pd.json_normalize(data, "results") + df_stats = df[df.columns[df.columns.str.contains("metric|completion_time")]] - + df.columns = df.columns.str.replace("metric.", "") + df_stats.columns = df_stats.columns.str.replace("metric.", "") - + metrics = { + 'completion_time_ms':'Completion Time (ms)', + 'chunk_relevance': 'Chunk Relevance', + 'selfcheckgpt_nli_relevance':"Context Relevance", + 'selfcheckgpt_nli':"Faithfulness", + 'rougeL_fmeasure':"ROUGE-L", + 'bert_score_f1':"BERTScore", + 'bert_sentence_similarity': "BERT Sentence Similarity", + 'completion_verbosity':"Completion Verbosity", + 'verbosity_ratio':"Verbosity Ratio",} -https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/ + df = df.rename(columns=metrics) + df_stats = df_stats.rename(columns=metrics) -2024-09-18T15:57:29-07:00 + display(df_stats[metrics.values()].describe()) -... + return df - +main_metrics = [ + 'Context Relevance', + 'Chunk Relevance', + 'Faithfulness', + 'ROUGE-L', + 'BERT Sentence Similarity', + 'BERTScore', + ] +``` - +## Experimentation -https://qdrant.tech/articles/storing-multiple-vectors-per-object-in-qdrant/ +Our approach is rooted in the belief that improvement thrives in an environment of exploration and discovery. By systematically testing and tweaking various components of the RAG pipeline, we aim to incrementally enhance its capabilities and performance. -2024-12-20T13:10:51+01:00 +In the following section, we dive into the details of our experimentation process, outlining the specific experiments conducted and the insights gained. -... 
+### Experiment 1 - Baseline
-
+Parameters:
-
+- **Embedding Model: `bge-small-en`**
+- **Chunk size: `512`**
+- **Chunk overlap: `64`**
+- **Number of docs retrieved (Retrieval Window): `3`**
+- **LLM: `Mistral-7B-Instruct`**
-https://qdrant.tech/articles/batch-vector-search-with-qdrant/
+We’ll process our documents based on the configuration above and ingest them into Qdrant using the `add_documents` method introduced earlier:
-2024-12-20T13:10:51+01:00
+```python
+#experiment1 - base config
+chunk_size = 512
+chunk_overlap = 64
+embedding_model_name = "BAAI/bge-small-en"
+num_docs = 3
-...
+COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}"
-
+add_documents(client,
+              collection_name=COLLECTION_NAME,
+              chunk_size=chunk_size,
+              chunk_overlap=chunk_overlap,
+              embedding_model_name=embedding_model_name)
+
+#Outputs
+#processed: 4504
+#content: 4504
+#metadata: 4504
+```
-
+Notice the `COLLECTION_NAME`, which helps us segregate and identify our collections based on the experiments conducted.
-https://qdrant.tech/blog/qdrant-supports-arm-architecture/
+To proceed with the evaluation, let’s create the `evaluation recipe` next:
-2024-01-16T22:02:52+05:30
+```python
+# Create a recipe for the generator model and prompt template
+recipe_mistral = quotient.create_recipe(
+    model_id=10,
+    prompt_template_id=1,
+    name='mistral-7b-instruct-qa-with-rag',
+    description='Mistral-7b-instruct using a prompt template that includes context.'
+)
+recipe_mistral
-...
+#Outputs recipe JSON with the used prompt template
+#'prompt_template': {'id': 1,
+#  'name': 'Default Question Answering Template',
+#  'variables': '["input_text","context"]',
+#  'created_at': '2023-12-21T22:01:54.632367',
+#  'template_string': 'Question: {input_text}\\n\\nContext: {context}\\n\\nAnswer:',
+#  'owner_profile_id': None}
+```
-
+To get a list of your existing recipes, you can simply run:
-
+```python
+quotient.list_recipes()
+```
-https://qdrant.tech/about-us/
+Notice that the recipe template is the simplest possible prompt, using the `Question` from the evaluation dataset, the `Context` from the document chunks retrieved from Qdrant, and the `Answer` generated by the pipeline.
-2024-05-21T09:57:56+02:00
+To kick off the evaluation:
-...
+```python
+# Kick off an evaluation job
+experiment_1 = run_eval(eval_df,
+                        collection_name=COLLECTION_NAME,
+                        recipe_id=recipe_mistral['id'],
+                        num_docs=num_docs,
+                        path=f"{COLLECTION_NAME}_{num_docs}_mistral.csv")
+```
-
+This may take a few minutes (depending on the size of the evaluation dataset).
-
+We can look at the results from our first (baseline) experiment below:
-https://qdrant.tech/data-analysis-anomaly-detection/
+![experiment1_eval.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment1_eval.png)
-2024-08-29T10:01:03-04:00
+Notice that we have a pretty **low average Chunk Relevance** and **very large standard deviations for both Chunk Relevance and Context Relevance**.
-...
+Let's take a look at some of the lower performing datapoints with **poor Faithfulness**:
-
+```python
+with pd.option_context('display.max_colwidth', 0):
+    display(experiment_1[['content.input_text', 'content.answer','content.documents','Chunk Relevance','Context Relevance','Faithfulness']
+            ].sort_values(by='Faithfulness').head(2))
+```
-
+![experiment1_bad_examples.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment1_bad_examples.png)
-https://qdrant.tech/advanced-search/
+In instances where the retrieved documents are **irrelevant (where both Chunk Relevance and Context Relevance are low)**, the model also shows **tendencies to hallucinate** and **produce poor quality responses**.
-2024-08-21T16:31:41-07:00
+The quality of the retrieved text directly impacts the quality of the LLM-generated answer. Therefore, our focus will be on enhancing the RAG setup by **adjusting the chunking parameters**.
-...
+### Experiment 2 - Adjusting the chunk parameter
-
+Keeping all other parameters constant, we changed the `chunk size` and `chunk overlap` to see if we could improve our results.
-
+Parameters:
-https://qdrant.tech/ai-agents/
+- **Embedding Model: `bge-small-en`**
+- **Chunk size: `1024`**
+- **Chunk overlap: `128`**
+- **Number of docs retrieved (Retrieval Window): `3`**
+- **LLM: `Mistral-7B-Instruct`**
-2025-02-12T08:47:39-06:00
+We will reprocess the data with the updated parameters above:
-...
+```python
+## for iteration 2 - let's modify the chunk configuration
+## We will start by creating a separate collection to store the vectors
-
+chunk_size = 1024
+chunk_overlap = 128
+embedding_model_name = "BAAI/bge-small-en"
+num_docs = 3
-
+COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}"
-https://qdrant.tech/e-commerce/
+add_documents(client,
+              collection_name=COLLECTION_NAME,
+              chunk_size=chunk_size,
+              chunk_overlap=chunk_overlap,
+              embedding_model_name=embedding_model_name)
+
+#Outputs
+#processed: 2152
+#content: 2152
+#metadata: 2152
+```
-2025-05-22T20:23:57+02:00
+Followed by running the evaluation:
-...
+![experiment2_eval.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment2_eval.png)
-
+and **comparing it with the results from Experiment 1:**
-
+![graph_exp1_vs_exp2.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_vs_exp2.png)
-https://qdrant.tech/documentation/data-management/airbyte/
+We observed slight enhancements in our LLM completion metrics (including BERT Sentence Similarity, BERTScore, ROUGE-L, and Knowledge F1) with the increase in *chunk size*. However, it's noteworthy that there was a significant decrease in *Faithfulness*, which is the primary metric we are aiming to optimize.
-2024-08-15T08:50:37+05:30
+Moreover, *Context Relevance* demonstrated an increase, indicating that the RAG pipeline retrieved more relevant information required to address the query. Nonetheless, there was a considerable drop in *Chunk Relevance*, implying that a smaller portion of the retrieved documents contained pertinent information for answering the question.
-...
+**The correlation between the rise in Context Relevance and the decline in Chunk Relevance suggests that retrieving more documents using the smaller chunk size might yield improved results.**
-
+### Experiment 3 - Increasing the number of documents retrieved (retrieval window)
-
+This time, we are using the same RAG setup as `Experiment 1`, but increasing the number of retrieved documents from **3** to **5**.
-https://qdrant.tech/documentation/embeddings/aleph-alpha/
+Parameters:
-2024-11-28T08:54:13+05:30
+- **Embedding Model: `bge-small-en`**
+- **Chunk size: `512`**
+- **Chunk overlap: `64`**
+- **Number of docs retrieved (Retrieval Window): `5`**
+- **LLM: `Mistral-7B-Instruct`**
-...
+We can use the collection from Experiment 1 and run the evaluation with the modified `num_docs` parameter:
-
+```python
+#collection name from Experiment 1
+COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}"
-
+#running eval for experiment 3
+experiment_3 = run_eval(eval_df,
+                        collection_name=COLLECTION_NAME,
+                        recipe_id=recipe_mistral['id'],
+                        num_docs=num_docs,
+                        path=f"{COLLECTION_NAME}_{num_docs}_mistral.csv")
+```
-https://qdrant.tech/get\_anonymous\_id/
+Observe the results below:
-2025-03-05T11:26:52+00:00
+![experiment_3_eval.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment_3_eval.png)
-...
+Comparing the results with Experiments 1 and 2:
-
+![graph_exp1_exp2_exp3.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3.png)
-
+As anticipated, employing the smaller chunk size while retrieving a larger number of documents resulted in achieving the highest levels of both *Context Relevance* and *Chunk Relevance.* Additionally, it yielded the **best** (albeit marginal) *Faithfulness* score, indicating a *reduced occurrence of inaccuracies or hallucinations*.
-https://qdrant.tech/documentation/data-management/airflow/
+It looks like we have a good handle on our chunking parameters, but it is worth testing another embedding model to see if we can get better results.
-2025-02-18T21:01:07+05:30
+### Experiment 4 - Changing the embedding model
-...
+Let us try using **MiniLM** for this experiment.
+Parameters:
-
+- **Embedding Model: `MiniLM-L6-v2`**
+- **Chunk size: `512`**
+- **Chunk overlap: `64`**
+- **Number of docs retrieved (Retrieval Window): `5`**
+- **LLM: `Mistral-7B-Instruct`**
-
+We will have to create another collection for this experiment:
-https://qdrant.tech/documentation/data-management/nifi/
+```python
+#experiment-4
+chunk_size=512
+chunk_overlap=64
+embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"
+num_docs=5
-2024-08-15T08:50:37+05:30
+COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}"
-...
+add_documents(client,
+              collection_name=COLLECTION_NAME,
+              chunk_size=chunk_size,
+              chunk_overlap=chunk_overlap,
+              embedding_model_name=embedding_model_name)
-
+#Outputs
+#processed: 4504
+#content: 4504
+#metadata: 4504
+```
-
+We observe our evaluation results below:
-https://qdrant.tech/documentation/data-management/spark/
+![experiment4_eval.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment4_eval.png)
-2025-03-06T10:23:24+05:30
+Comparing these with our previous experiments:
-...
+![graph_exp1_exp2_exp3_exp4.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3_exp4.png)
-
+It appears that `bge-small` was more proficient in capturing the semantic nuances of the Qdrant Documentation.
-
+Up to this point, our experimentation has focused solely on the *retrieval aspect* of our RAG pipeline. Now, let's explore altering the *generation aspect* or LLM while retaining the optimal parameters identified in Experiment 3.
-https://qdrant.tech/documentation/platforms/apify/
+### Experiment 5 - Changing the LLM
-2024-08-15T08:50:37+05:30
+Parameters:
-...
+- **Embedding Model: `bge-small-en`**
+- **Chunk size: `512`**
+- **Chunk overlap: `64`**
+- **Number of docs retrieved (Retrieval Window): `5`**
+- **LLM: `GPT-3.5-turbo`**
-
+For this, we can repurpose our collection from Experiment 3, while the evaluation uses a new recipe with the **GPT-3.5-turbo** model.
-
+```python
+#collection name from Experiment 3
+COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}"
-https://qdrant.tech/documentation/frameworks/autogen/
+# We have to create a recipe using the same prompt template and GPT-3.5-turbo
+recipe_gpt = quotient.create_recipe(
+    model_id=5,
+    prompt_template_id=1,
+    name='gpt3.5-qa-with-rag-recipe-v1',
+    description='GPT-3.5 using a prompt template that includes context.'
+)
-2024-11-20T11:50:06+05:30
+recipe_gpt
-...
+#Outputs
+#{'id': 495,
+# 'name': 'gpt3.5-qa-with-rag-recipe-v1',
+# 'description': 'GPT-3.5 using a prompt template that includes context.',
+# 'model_id': 5,
+# 'prompt_template_id': 1,
+# 'created_at': '2024-05-03T12:14:58.779585',
+# 'owner_profile_id': 34,
+# 'system_prompt_id': None,
+# 'prompt_template': {'id': 1,
+#  'name': 'Default Question Answering Template',
+#  'variables': '["input_text","context"]',
+#  'created_at': '2023-12-21T22:01:54.632367',
+#  'template_string': 'Question: {input_text}\\n\\nContext: {context}\\n\\nAnswer:',
+#  'owner_profile_id': None},
+# 'model': {'id': 5,
+#  'name': 'gpt-3.5-turbo',
+#  'endpoint': 'https://api.openai.com/v1/chat/completions',
+#  'revision': 'placeholder',
+#  'created_at': '2024-02-06T17:01:21.408454',
+#  'model_type': 'OpenAI',
+#  'description': 'Returns a maximum of 4K output tokens.',
+#  'owner_profile_id': None,
+#  'external_model_config_id': None,
+#  'instruction_template_cls': 'NoneType'}}
+```
-
+Running the evaluation:
-
+```python
+experiment_5 = run_eval(eval_df,
+                        collection_name=COLLECTION_NAME,
+                        recipe_id=recipe_gpt['id'],
+                        num_docs=num_docs,
+                        path=f"{COLLECTION_NAME}_{num_docs}_gpt.csv")
+```
-https://qdrant.tech/documentation/embeddings/bedrock/
+We observe:
-2024-11-28T08:54:13+05:30
+![experiment5_eval.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment5_eval.png)
-...
+and compare all 5 experiments below:
-
+![graph_exp1_exp2_exp3_exp4_exp5.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3_exp4_exp5.png)
-
+**GPT-3.5 surpassed Mistral-7B in all metrics**! Notably, Experiment 5 exhibited the **lowest occurrence of hallucination**.
-https://qdrant.tech/documentation/frameworks/lakechain/
+## Conclusions
-2024-10-17T11:42:14+05:30
+Let’s take a look at our results from all 5 experiments above:
-...
+![overall_eval_results.png](/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/overall_eval_results.png)
-
+We still have a long way to go in improving the retrieval performance of RAG, as indicated by our generally poor results thus far. It might be beneficial to **explore alternative embedding models** or **different retrieval strategies** to address this issue.
-
+The significant variations in *Context Relevance* suggest that **certain questions may necessitate retrieving more documents than others**. Therefore, investigating a **dynamic retrieval strategy** could be worthwhile.
-https://qdrant.tech/about-us/about-us-resources/
+Furthermore, there's ongoing **exploration required on the generative aspect** of RAG.
+Modifying LLMs or prompts can substantially impact the overall quality of responses.
-2025-05-30T14:14:31+03:00 +This iterative process demonstrates how, starting from scratch, continual evaluation and adjustments throughout experimentation can lead to the development of an enhanced RAG system. -... +## Watch this workshop on YouTube - +> A workshop version of this article is [available on YouTube](https://www.youtube.com/watch?v=3MEMPZR1aZA). Follow along using our [GitHub notebook](https://github.com/qdrant/qdrant-rag-eval/tree/master/workshop-rag-eval-qdrant-quotient). - + -https://qdrant.tech/brand-resources/ +<|page-35-lllmstxt|> +# Is RAG Dead? The Role of Vector Databases in AI Efficiency and Vector Search -2024-06-17T16:56:32+03:00 +When Anthropic came out with a context window of 100K tokens, they said: “*[Vector search](https://qdrant.tech/solutions/) is dead. LLMs are getting more accurate and won’t need RAG anymore.*” -... +Google’s Gemini 1.5 now offers a context window of 10 million tokens. [Their supporting paper](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf) claims victory over accuracy issues, even when applying Greg Kamradt’s [NIAH methodology](https://twitter.com/GregKamradt/status/1722386725635580292). - +*It’s over. [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/) (Retrieval Augmented Generation) must be completely obsolete now. Right?* - +No. -https://qdrant.tech/documentation/platforms/bubble/ +Larger context windows are never the solution. Let me repeat. Never. They require more computational resources and lead to slower processing times. -2024-08-15T08:50:37+05:30 +The community is already stress testing Gemini 1.5: -... +![RAG and Gemini 1.5](/articles_data/rag-is-dead/rag-is-dead-1.png) - +This is not surprising. LLMs require massive amounts of compute and memory to run. To cite Grant, running such a model by itself “would deplete a small coal mine to generate each completion”. Also, who is waiting 30 seconds for a response? - +## Context stuffing is not the solution -https://qdrant.tech/security/bug-bounty-program/ +> Relying on context is expensive, and it doesn’t improve response quality in real-world applications. Retrieval based on [vector search](https://qdrant.tech/solutions/) offers much higher precision. -2025-03-28T09:40:53+01:00 +If you solely rely on an [LLM](https://qdrant.tech/articles/what-is-rag-in-ai/) to perfect retrieval and precision, you are doing it wrong. -... +A large context window makes it harder to focus on relevant information. This increases the risk of errors or hallucinations in its responses. - +Google found Gemini 1.5 significantly more accurate than GPT-4 at shorter context lengths and “a very small decrease in recall towards 1M tokens”. The recall is still below 0.8. - +![Gemini 1.5 Data](/articles_data/rag-is-dead/rag-is-dead-2.png) -https://qdrant.tech/documentation/build/ +We don’t think 60-80% is good enough. The LLM might retrieve enough relevant facts in its context window, but it still loses up to 40% of the available information. -2024-11-18T14:53:02-08:00 +> The whole point of vector search is to circumvent this process by efficiently picking the information your app needs to generate the best response. A [vector database](https://qdrant.tech/) keeps the compute load low and the query response fast. You don’t need to wait for the LLM at all. -... +Qdrant’s benchmark results are strongly in favor of accuracy and efficiency. We recommend that you consider them before deciding that an LLM is enough. 
Take a look at our [open-source benchmark reports](/benchmarks/) and [try out the tests](https://github.com/qdrant/vector-db-benchmark) yourself. - +## Vector search in compound systems - +The future of AI lies in careful system engineering. As per [Zaharia et al.](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/), results from Databricks find that “60% of LLM applications use some form of RAG, while 30% use multi-step chains.” -https://qdrant.tech/documentation/platforms/buildship/ +Even Gemini 1.5 demonstrates the need for a complex strategy. When looking at [Google’s MMLU Benchmark](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf), the model was called 32 times to reach a score of 90.0% accuracy. This shows us that even a basic compound arrangement is superior to monolithic models. -2024-08-15T08:50:37+05:30 +As a retrieval system, a [vector database](https://qdrant.tech/) perfectly fits the need for compound systems. Introducing them into your design opens the possibilities for superior applications of LLMs. It is superior because it’s faster, more accurate, and much cheaper to run. -... +> The key advantage of RAG is that it allows an LLM to pull in real-time information from up-to-date internal and external knowledge sources, making it more dynamic and adaptable to new information. - Oliver Molander, CEO of IMAGINAI +> - +## Qdrant scales to enterprise RAG scenarios - +People still don’t understand the economic benefit of vector databases. Why would a large corporate AI system need a standalone vector database like [Qdrant](https://qdrant.tech/)? In our minds, this is the most important question. Let’s pretend that LLMs cease struggling with context thresholds altogether. -https://qdrant.tech/documentation/frameworks/camel/ +**How much would all of this cost?** -2024-12-20T13:31:09+05:30 +If you are running a RAG solution in an enterprise environment with petabytes of private data, your compute bill will be unimaginable. Let's assume 1 cent per 1K input tokens (which is the current GPT-4 Turbo pricing). Whatever you are doing, every time you go 100 thousand tokens deep, it will cost you $1. -... +That’s a buck a question. - +> According to our estimations, vector search queries are **at least** 100 million times cheaper than queries made by LLMs. - +Conversely, the only up-front investment with vector databases is the indexing (which requires more compute). After this step, everything else is a breeze. Once setup, Qdrant easily scales via [features like Multitenancy and Sharding](/articles/multitenancy/). This lets you scale up your reliance on the vector retrieval process and minimize your use of the compute-heavy LLMs. As an optimization measure, Qdrant is irreplaceable. -https://qdrant.tech/documentation/frameworks/cheshire-cat/ +Julien Simon from HuggingFace says it best: -2025-01-24T11:47:11+01:00 +> RAG is not a workaround for limited context size. For mission-critical enterprise use cases, RAG is a way to leverage high-value, proprietary company knowledge that will never be found in public datasets used for LLM training. At the moment, the best place to index and query this knowledge is some sort of vector index. In addition, RAG downgrades the LLM to a writing assistant. Since built-in knowledge becomes much less important, a nice small 7B open-source model usually does the trick at a fraction of the cost of a huge generic model. -... 
- +## Get superior accuracy with Qdrant's vector database - +As LLMs continue to require enormous computing power, users will need to leverage vector search and [RAG](https://qdrant.tech/rag/rag-evaluation-guide/). -https://qdrant.tech/documentation/data-management/cocoindex/ +Our customers remind us of this fact every day. As a product, [our vector database](https://qdrant.tech/) is highly scalable and business-friendly. We develop our features strategically to follow our company’s Unix philosophy. -2025-04-20T23:11:21-07:00 +We want to keep Qdrant compact, efficient and with a focused purpose. This purpose is to empower our customers to use it however they see fit. -... +When large enterprises release their generative AI into production, they need to keep costs under control, while retaining the best possible quality of responses. Qdrant has the [vector search solutions](https://qdrant.tech/solutions/) to do just that. Revolutionize your vector search capabilities and get started with [a Qdrant demo](https://qdrant.tech/contact-us/). - +<|page-36-lllmstxt|> +OpenAI Ada-003 embeddings are a powerful tool for natural language processing (NLP). However, the size of the embeddings are a challenge, especially with real-time search and retrieval. In this article, we explore how you can use Qdrant's Binary Quantization to enhance the performance and efficiency of OpenAI embeddings. - +In this post, we discuss: -https://qdrant.tech/documentation/data-management/cognee/ +- The significance of OpenAI embeddings and real-world challenges. +- Qdrant's Binary Quantization, and how it can improve the performance of OpenAI embeddings +- Results of an experiment that highlights improvements in search efficiency and accuracy +- Implications of these findings for real-world applications +- Best practices for leveraging Binary Quantization to enhance OpenAI embeddings -2025-05-31T22:06:39+02:00 +If you're new to Binary Quantization, consider reading our article which walks you through the concept and [how to use it with Qdrant](/articles/binary-quantization/) -... +You can also try out these techniques as described in [Binary Quantization OpenAI](https://github.com/qdrant/examples/blob/openai-3/binary-quantization-openai/README.md), which includes Jupyter notebooks. - +## New OpenAI embeddings: performance and changes - +As the technology of embedding models has advanced, demand has grown. Users are looking more for powerful and efficient text-embedding models. OpenAI's Ada-003 embeddings offer state-of-the-art performance on a wide range of NLP tasks, including those noted in [MTEB](https://huggingface.co/spaces/mteb/leaderboard) and [MIRACL](https://openai.com/blog/new-embedding-models-and-api-updates). -https://qdrant.tech/documentation/embeddings/cohere/ +These models include multilingual support in over 100 languages. The transition from text-embedding-ada-002 to text-embedding-3-large has led to a significant jump in performance scores (from 31.4% to 54.9% on MIRACL). -2025-02-19T10:27:39+03:00 +#### Matryoshka representation learning -... +The new OpenAI models have been trained with a novel approach called "[Matryoshka Representation Learning](https://aniketrege.github.io/blog/2024/mrl/)". Developers can set up embeddings of different sizes (number of dimensions). In this post, we use small and large variants. Developers can select embeddings which balances accuracy and size. - +Here, we show how the accuracy of binary quantization is quite good across different dimensions -- for both the models. 
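+For illustration, here is a minimal sketch (not part of the benchmark code) of requesting a truncated embedding by passing the `dimensions` parameter to the OpenAI embeddings endpoint:
+
+```python
+from openai import OpenAI
+
+client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
+
+# text-embedding-3-* models support Matryoshka-style truncation via `dimensions`
+response = client.embeddings.create(
+    model="text-embedding-3-large",
+    input="Binary quantization keeps vector search fast and affordable.",
+    dimensions=1024,  # request a 1024-dimensional embedding instead of the full 3072
+)
+
+embedding = response.data[0].embedding
+print(len(embedding))  # 1024
+```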
- +## Enhanced performance and efficiency with binary quantization -https://qdrant.tech/community/ +By reducing storage needs, you can scale applications with lower costs. This addresses a critical challenge posed by the original embedding sizes. Binary Quantization also speeds the search process. It simplifies the complex distance calculations between vectors into more manageable bitwise operations, which supports potentially real-time searches across vast datasets. -2025-01-07T11:56:39-06:00 +The accompanying graph illustrates the promising accuracy levels achievable with binary quantization across different model sizes, showcasing its practicality without severely compromising on performance. This dual advantage of storage reduction and accelerated search capabilities underscores the transformative potential of Binary Quantization in deploying OpenAI embeddings more effectively across various real-world applications. -... +![](/blog/openai/Accuracy_Models.png) - +The efficiency gains from Binary Quantization are as follows: - +- Reduced storage footprint: It helps with large-scale datasets. It also saves on memory, and scales up to 30x at the same cost. +- Enhanced speed of data retrieval: Smaller data sizes generally leads to faster searches. +- Accelerated search process: It is based on simplified distance calculations between vectors to bitwise operations. This enables real-time querying even in extensive databases. -https://qdrant.tech/documentation/data-management/confluent/ +### Experiment setup: OpenAI embeddings in focus -2024-08-15T08:50:37+05:30 +To identify Binary Quantization's impact on search efficiency and accuracy, we designed our experiment on OpenAI text-embedding models. These models, which capture nuanced linguistic features and semantic relationships, are the backbone of our analysis. We then delve deep into the potential enhancements offered by Qdrant's Binary Quantization feature. -... +This approach not only leverages the high-caliber OpenAI embeddings but also provides a broad basis for evaluating the search mechanism under scrutiny. - +#### Dataset - + The research employs 100K random samples from the [OpenAI 1M](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) 1M dataset, focusing on 100 randomly selected records. These records serve as queries in the experiment, aiming to assess how Binary Quantization influences search efficiency and precision within the dataset. We then use the embeddings of the queries to search for the nearest neighbors in the dataset. -https://qdrant.tech/contact-us/ +#### Parameters: oversampling, rescoring, and search limits -2025-03-13T17:47:05+01:00 +For each record, we run a parameter sweep over the number of oversampling, rescoring, and search limits. We can then understand the impact of these parameters on search accuracy and efficiency. Our experiment was designed to assess the impact of Binary Quantization under various conditions, based on the following parameters: -... +- **Oversampling**: By oversampling, we can limit the loss of information inherent in quantization. This also helps to preserve the semantic richness of your OpenAI embeddings. We experimented with different oversampling factors, and identified the impact on the accuracy and efficiency of search. Spoiler: higher oversampling factors tend to improve the accuracy of searches. However, they usually require more computational resources. - +- **Rescoring**: Rescoring refines the first results of an initial binary search. 
This process leverages the original high-dimensional vectors to refine the search results, **always** improving accuracy. We toggled rescoring on and off to measure its effectiveness when combined with Binary Quantization. We also measured the impact on search performance. - +- **Search Limits**: We specify the number of results returned from the search process. We experimented with various search limits to measure their impact on accuracy and efficiency. We explored the trade-offs between search depth and performance. The results provide insight for applications with different precision and speed requirements. -https://qdrant.tech/legal/credits/ +Through this detailed setup, our experiment sought to shed light on the nuanced interplay between Binary Quantization and the high-quality embeddings produced by OpenAI's models. By meticulously adjusting and observing the outcomes under different conditions, we aimed to uncover actionable insights that could empower users to harness the full potential of Qdrant in combination with OpenAI's embeddings, regardless of their specific application needs. -2022-04-25T15:19:19+02:00 +### Results: binary quantization's impact on OpenAI embeddings -... +To analyze the impact of rescoring (`True` or `False`), we compared results across different model configurations and search limits. Rescoring sets up a more precise search, based on results from an initial query. - +#### Rescoring - +![Graph that measures the impact of rescoring](/blog/openai/Rescoring_Impact.png) -https://qdrant.tech/documentation/frameworks/crewai/ +Here are some key observations on the impact of rescoring (`True` or `False`): -2025-02-27T09:21:41+01:00 +1. **Significantly Improved Accuracy**: + - Across all models and dimension configurations, enabling rescoring (`True`) consistently results in higher accuracy scores compared to when rescoring is disabled (`False`). + - The improvement in accuracy holds across various search limits (10, 20, 50, 100). -... +2. **Model and Dimension Specific Observations**: + - For the `text-embedding-3-large` model with 3072 dimensions, rescoring boosts the accuracy from an average of about 76-77% without rescoring to 97-99% with rescoring, depending on the search limit and oversampling rate. + - The accuracy improvement with increased oversampling is more pronounced when rescoring is enabled, indicating a better utilization of the additional binary codes in refining search results. + - With the `text-embedding-3-small` model at 512 dimensions, accuracy increases from around 53-55% without rescoring to 71-91% with rescoring, highlighting the significant impact of rescoring, especially at lower dimensions. - +In contrast, for lower dimension models (such as text-embedding-3-small with 512 dimensions), the incremental accuracy gains from increased oversampling levels are less significant, even with rescoring enabled. This suggests a diminishing return on accuracy improvement with higher oversampling in lower dimension spaces. - +3. **Influence of Search Limit**: + - The performance gain from rescoring seems to be relatively stable across different search limits, suggesting that rescoring consistently enhances accuracy regardless of the number of top results considered. -https://qdrant.tech/customers/ +In summary, enabling rescoring dramatically improves search accuracy across all tested configurations. It is a crucial feature for applications where precision is paramount.
The consistent performance boost provided by rescoring underscores its value in refining search results, particularly when working with complex, high-dimensional data like OpenAI embeddings. This enhancement is critical for applications that demand high accuracy, such as semantic search, content discovery, and recommendation systems, where the quality of search results directly impacts user experience and satisfaction. -2024-06-17T16:56:32+03:00 +### Dataset combinations -... +For those exploring the integration of text embedding models with Qdrant, it's crucial to consider various model configurations for optimal performance. The dataset combinations defined above illustrate different configurations to test against Qdrant. These combinations vary by two primary attributes: - +1. **Model Name**: Signifying the specific text embedding model variant, such as "text-embedding-3-large" or "text-embedding-3-small". This distinction correlates with the model's capacity, with "large" models offering more detailed embeddings at the cost of increased computational resources. - +2. **Dimensions**: This refers to the size of the vector embeddings produced by the model. Options range from 512 to 3072 dimensions. Higher dimensions could lead to more precise embeddings but might also increase the search time and memory usage in Qdrant. -https://qdrant.tech/documentation/frameworks/dagster/ +Optimizing these parameters is a balancing act between search accuracy and resource efficiency. Testing across these combinations allows users to identify the configuration that best meets their specific needs, considering the trade-offs between computational resources and the quality of search results. -2025-04-15T18:20:05+05:30 -... +```python +dataset_combinations = [ + { + "model_name": "text-embedding-3-large", + "dimensions": 3072, + }, + { + "model_name": "text-embedding-3-large", + "dimensions": 1024, + }, + { + "model_name": "text-embedding-3-large", + "dimensions": 1536, + }, + { + "model_name": "text-embedding-3-small", + "dimensions": 512, + }, + { + "model_name": "text-embedding-3-small", + "dimensions": 1024, + }, + { + "model_name": "text-embedding-3-small", + "dimensions": 1536, + }, +] +``` +#### Exploring dataset combinations and their impacts on model performance - +The code snippet iterates through predefined dataset and model combinations. For each combination, characterized by the model name and its dimensions, the corresponding experiment's results are loaded. These results, which are stored in JSON format, include performance metrics like accuracy under different configurations: with and without oversampling, and with and without a rescore step. - +Following the extraction of these metrics, the code computes the average accuracy across different settings, excluding extreme cases of very low limits (specifically, limits of 1 and 5). This computation groups the results by oversampling, rescore presence, and limit, before calculating the mean accuracy for each subgroup. -https://qdrant.tech/documentation/observability/datadog/ +After gathering and processing this data, the average accuracies are organized into a pivot table. This table is indexed by the limit (the number of top results considered), and columns are formed based on combinations of oversampling and rescoring. -2024-10-31T05:56:39+05:30 +```python +import pandas as pd -... 
+for combination in dataset_combinations: + model_name = combination["model_name"] + dimensions = combination["dimensions"] + print(f"Model: {model_name}, dimensions: {dimensions}") + results = pd.read_json(f"../results/results-{model_name}-{dimensions}.json", lines=True) + average_accuracy = results[results["limit"] != 1] + average_accuracy = average_accuracy[average_accuracy["limit"] != 5] + average_accuracy = average_accuracy.groupby(["oversampling", "rescore", "limit"])[ + "accuracy" + ].mean() + average_accuracy = average_accuracy.reset_index() + acc = average_accuracy.pivot( + index="limit", columns=["oversampling", "rescore"], values="accuracy" + ) + print(acc) +``` - +Here is a selected slice of these results, with `rescore=True`: - +|Method|Dimensionality|Test Dataset|Recall|Oversampling| +|-|-|-|-|-| +|OpenAI text-embedding-3-large (highest MTEB score from the table) |3072|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 0.9966|3x| +|OpenAI text-embedding-3-small|1536|[DBpedia 100K](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K)| 0.9847|3x| +|OpenAI text-embedding-3-large|1536|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M)| 0.9826|3x| -https://qdrant.tech/documentation/frameworks/deepeval/ +#### Impact of oversampling -2025-04-24T16:09:40+08:00 +You can use oversampling in machine learning to counteract imbalances in datasets. +It works well when one class significantly outnumbers others. This imbalance +can skew the performance of models, which favors the majority class at the +expense of others. By creating additional samples from the minority classes, +oversampling helps equalize the representation of classes in the training dataset, thus enabling more fair and accurate modeling of real-world scenarios. -... +The screenshot showcases the effect of oversampling on model performance metrics. While the actual metrics aren't shown, we expect to see improvements in measures such as precision, recall, or F1-score. These improvements illustrate the effectiveness of oversampling in creating a more balanced dataset. It allows the model to learn a better representation of all classes, not just the dominant one. - +Without an explicit code snippet or output, we focus on the role of oversampling in model fairness and performance. Through graphical representation, you can set up before-and-after comparisons. These comparisons illustrate the contribution to machine learning projects. - +![Measuring the impact of oversampling](/blog/openai/Oversampling_Impact.png) -https://qdrant.tech/documentation/data-management/dlt/ +### Leveraging binary quantization: best practices -2024-08-15T08:50:37+05:30 +We recommend the following best practices for leveraging Binary Quantization to enhance OpenAI embeddings: -... +1. Embedding Model: Use the text-embedding-3-large from MTEB. It is most accurate among those tested. +2. Dimensions: Use the highest dimension available for the model, to maximize accuracy. The results are true for English and other languages. +3. Oversampling: Use an oversampling factor of 3 for the best balance between accuracy and efficiency. This factor is suitable for a wide range of applications. +4. Rescoring: Enable rescoring to improve the accuracy of search results. +5. RAM: Store the full vectors and payload on disk. Limit what you load from memory to the binary quantization index. 
This helps reduce the memory footprint and improve the overall efficiency of the system. The incremental latency from the disk read is negligible compared to the latency savings from the binary scoring in Qdrant, which uses SIMD instructions where possible. - +## What's next? - +Binary quantization is exceptional if you need to work with large volumes of data under high recall expectations. You can try this feature either by spinning up a [Qdrant container image](https://hub.docker.com/r/qdrant/qdrant) locally or, having us create one for you through a [free account](https://cloud.qdrant.io/login) in our cloud hosted service. -https://qdrant.tech/documentation/frameworks/docarray/ +The article gives examples of data sets and configuration you can use to get going. Our documentation covers [adding large datasets to Qdrant](/documentation/tutorials/bulk-upload/) to your Qdrant instance as well as [more quantization methods](/documentation/guides/quantization/). -2024-08-15T08:50:37+05:30 +Want to discuss these findings and learn more about Binary Quantization? [Join our Discord community.](https://discord.gg/qdrant) -... +<|page-37-lllmstxt|> +# Scaling Your Machine Learning Setup: The Power of Multitenancy and Custom Sharding in Qdrant - +We are seeing the topics of [multitenancy](/documentation/guides/multiple-partitions/) and [distributed deployment](/documentation/guides/distributed_deployment/#sharding) pop-up daily on our [Discord support channel](https://qdrant.to/discord). This tells us that many of you are looking to scale Qdrant along with the rest of your machine learning setup. - +Whether you are building a bank fraud-detection system, [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/) for e-commerce, or services for the federal government - you will need to leverage a multitenant architecture to scale your product. +In the world of SaaS and enterprise apps, this setup is the norm. It will considerably increase your application's performance and lower your hosting costs. -https://qdrant.tech/documentation/platforms/docsgpt/ +## Multitenancy & custom sharding with Qdrant -2025-02-18T21:01:07+05:30 +We have developed two major features just for this. __You can now scale a single Qdrant cluster and support all of your customers worldwide.__ Under [multitenancy](/documentation/guides/multiple-partitions/), each customer's data is completely isolated and only accessible by them. At times, if this data is location-sensitive, Qdrant also gives you the option to divide your cluster by region or other criteria that further secure your customer's access. This is called [custom sharding](/documentation/guides/distributed_deployment/#user-defined-sharding). -... +Combining these two will result in an efficiently-partitioned architecture that further leverages the convenience of a single Qdrant cluster. This article will briefly explain the benefits and show how you can get started using both features. - +## One collection, many tenants - +When working with Qdrant, you can upsert all your data to a single collection, and then partition each vector via its payload. This means that all your users are leveraging the power of a single Qdrant cluster, but their data is still isolated within the collection. Let's take a look at a two-tenant collection: -https://qdrant.tech/documentation/frameworks/dsrag/ +**Figure 1:** Each individual vector is assigned a specific payload that denotes which tenant it belongs to. 
This is how a large number of different tenants can share a single Qdrant collection. +![Qdrant Multitenancy](/articles_data/multitenancy/multitenancy-single.png) -2024-11-27T17:59:33+05:30 +Qdrant is built to excel in a single collection with a vast number of tenants. You should only create multiple collections when your data is not homogenous or if users' vectors are created by different embedding models. Creating too many collections may result in resource overhead and cause dependencies. This can increase costs and affect overall performance. -... +## Sharding your database - +With Qdrant, you can also specify a shard for each vector individually. This feature is useful if you want to [control where your data is kept in the cluster](/documentation/guides/distributed_deployment/#sharding). For example, one set of vectors can be assigned to one shard on its own node, while another set can be on a completely different node. - +During vector search, your operations will be able to hit only the subset of shards they actually need. In massive-scale deployments, __this can significantly improve the performance of operations that do not require the whole collection to be scanned__. -https://qdrant.tech/documentation/frameworks/dynamiq/ +This works in the other direction as well. Whenever you search for something, you can specify a shard or several shards and Qdrant will know where to find them. It will avoid asking all machines in your cluster for results. This will minimize overhead and maximize performance. -2025-03-24T10:22:45+02:00 +### Common use cases -... +A clear use-case for this feature is managing a multitenant collection, where each tenant (let it be a user or organization) is assumed to be segregated, so they can have their data stored in separate shards. Sharding solves the problem of region-based data placement, whereby certain data needs to be kept within specific locations. To do this, however, you will need to [move your shards between nodes](/documentation/guides/distributed_deployment/#moving-shards). - +**Figure 2:** Users can both upsert and query shards that are relevant to them, all within the same collection. Regional sharding can help avoid cross-continental traffic. +![Qdrant Multitenancy](/articles_data/multitenancy/shards.png) - +Custom sharding also gives you precise control over other use cases. A time-based data placement means that data streams can index shards that represent latest updates. If you organize your shards by date, you can have great control over the recency of retrieved data. This is relevant for social media platforms, which greatly rely on time-sensitive data. -https://qdrant.tech/articles/ecosystem/ +## Before I go any further.....how secure is my user data? -2024-12-20T13:10:51+01:00 +By design, Qdrant offers three levels of isolation. We initially introduced collection-based isolation, but your scaled setup has to move beyond this level. In this scenario, you will leverage payload-based isolation (from multitenancy) and resource-based isolation (from sharding). The ultimate goal is to have a single collection, where you can manipulate and customize placement of shards inside your cluster more precisely and avoid any kind of overhead. The diagram below shows the arrangement of your data within a two-tier isolation arrangement. -... +**Figure 3:** Users can query the collection based on two filters: the `group_id` and the individual `shard_key_selector`. This gives your data two additional levels of isolation. 
+![Qdrant Multitenancy](/articles_data/multitenancy/multitenancy.png) - +## Create custom shards for a single collection - +When creating a collection, you will need to configure user-defined sharding. This lets you control the shard placement of your data, so that operations can hit only the subset of shards they actually need. In big clusters, this can significantly improve the performance of operations, since you won't need to go through the entire collection to retrieve data. -https://qdrant.tech/enterprise-solutions/ +```python +client.create_collection( + collection_name="{tenant_data}", + shard_number=2, + sharding_method=models.ShardingMethod.CUSTOM, + # ... other collection parameters +) +client.create_shard_key("{tenant_data}", "canada") +client.create_shard_key("{tenant_data}", "germany") +``` +In this example, your cluster is divided between Germany and Canada. Canadian and German law differ when it comes to international data transfer. Let's say you are creating a RAG application that supports the healthcare industry. Your Canadian customer data will have to be clearly separated for compliance purposes from your German customer. -2024-08-20T14:08:09-04:00 +Even though it is part of the same collection, data from each shard is isolated from other shards and can be retrieved as such. For additional examples on shards and retrieval, consult [Distributed Deployments](/documentation/guides/distributed_deployment/) documentation and [Qdrant Client specification](https://python-client.qdrant.tech). -... +## Configure a multitenant setup for users - +Let's continue and start adding data. As you upsert your vectors to your new collection, you can add a `group_id` field to each vector. If you do this, Qdrant will assign each vector to its respective group. - +Additionally, each vector can now be allocated to a shard. You can specify the `shard_key_selector` for each individual vector. In this example, you are upserting data belonging to `tenant_1` to the Canadian region. -https://qdrant.tech/documentation/frameworks/feast/ +```python +client.upsert( + collection_name="{tenant_data}", + points=[ + models.PointStruct( + id=1, + payload={"group_id": "tenant_1"}, + vector=[0.9, 0.1, 0.1], + ), + models.PointStruct( + id=2, + payload={"group_id": "tenant_1"}, + vector=[0.1, 0.9, 0.1], + ), + ], + shard_key_selector="canada", +) +``` +Keep in mind that the data for each `group_id` is isolated. In the example below, `tenant_1` vectors are kept separate from `tenant_2`. The first tenant will be able to access their data in the Canadian portion of the cluster. However, as shown below `tenant_2 `might only be able to retrieve information hosted in Germany. -2025-02-18T21:01:07+05:30 +```python +client.upsert( + collection_name="{tenant_data}", + points=[ + models.PointStruct( + id=3, + payload={"group_id": "tenant_2"}, + vector=[0.1, 0.1, 0.9], + ), + ], + shard_key_selector="germany", +) +``` -... +## Retrieve data via filters - +The access control setup is completed as you specify the criteria for data retrieval. When searching for vectors, you need to use a `query_filter` along with `group_id` to filter vectors for each user. 
- +```python +client.search( + collection_name="{tenant_data}", + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="group_id", + match=models.MatchValue( + value="tenant_1", + ), + ), + ] + ), + query_vector=[0.1, 0.1, 0.9], + limit=10, +) +``` -https://qdrant.tech/documentation/frameworks/fifty-one/ +## Performance considerations -2024-08-15T08:50:37+05:30 +The speed of indexation may become a bottleneck if you are adding large amounts of data in this way, as each user's vector will be indexed into the same collection. To avoid this bottleneck, consider _bypassing the construction of a global vector index_ for the entire collection and building it only for individual groups instead. -... +By adopting this strategy, Qdrant will index vectors for each user independently, significantly accelerating the process. - +To implement this approach, you should: - +1. Set `payload_m` in the HNSW configuration to a non-zero value, such as 16. +2. Set `m` in hnsw config to 0. This will disable building global index for the whole collection. -https://qdrant.tech/documentation/frameworks/genkit/ +```python +from qdrant_client import QdrantClient, models -2024-10-05T03:39:41+05:30 +client = QdrantClient("localhost", port=6333) -... +client.create_collection( + collection_name="{tenant_data}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + hnsw_config=models.HnswConfigDiff( + payload_m=16, + m=0, + ), +) +``` - +3. Create keyword payload index for `group_id` field. - +```python +client.create_payload_index( + collection_name="{tenant_data}", + field_name="group_id", + field_schema=models.PayloadSchemaType.KEYWORD, +) +``` +> Note: Keep in mind that global requests (without the `group_id` filter) will be slower since they will necessitate scanning all groups to identify the nearest neighbors. -https://qdrant.tech/documentation/data-management/fondant/ +## Explore multitenancy and custom sharding in Qdrant for scalable solutions -2024-08-15T08:50:37+05:30 +Qdrant is ready to support a massive-scale architecture for your machine learning project. If you want to see whether our [vector database](https://qdrant.tech/) is right for you, try the [quickstart tutorial](/documentation/quick-start/) or read our [docs and tutorials](/documentation/). -... +To spin up a free instance of Qdrant, sign up for [Qdrant Cloud](https://qdrant.to/cloud) - no strings attached. - +Get support or share ideas in our [Discord](https://qdrant.to/discord) community. This is where we talk about vector search theory, publish examples and demos and discuss vector database setups. - +<|page-38-lllmstxt|> +Data stored in vector databases is often proprietary to the enterprise and may include sensitive information like customer records, legal contracts, electronic health records (EHR), financial data, and intellectual property. Moreover, strong security measures become critical to safeguarding this data. If the data stored in a vector database is not secured, it may open a vulnerability known as "[embedding inversion attack](https://arxiv.org/abs/2004.00053)," where malicious actors could potentially [reconstruct the original data from the embeddings](https://arxiv.org/pdf/2305.03010) themselves. -https://qdrant.tech/documentation/embeddings/gemini/ +Strict compliance regulations govern data stored in vector databases across various industries. For instance, healthcare must comply with HIPAA, which dictates how protected health information (PHI) is stored, transmitted, and secured. 
Similarly, the financial services industry follows PCI DSS to safeguard sensitive financial data. These regulations require developers to ensure data storage and transmission comply with industry-specific legal frameworks across different regions. **As a result, features that enable data privacy, security and sovereignty are deciding factors when choosing the right vector database.** -2024-11-28T08:54:13+05:30 +This article explores various strategies to ensure the security of your critical data while leveraging the benefits of vector search. Implementing some of these security approaches can help you build privacy-enhanced similarity search algorithms and integrate them into your AI applications. +Additionally, you will learn how to build a fully data-sovereign architecture, allowing you to retain control over your data and comply with relevant data laws and regulations. -... +> To skip right to the code implementation, [click here](/articles/data-privacy/#jwt-on-qdrant). - +## Vector Database Security: An Overview - +Vector databases are often unsecured by default to facilitate rapid prototyping and experimentation. This approach allows developers to quickly ingest data, build vector representations, and test similarity search algorithms without initial security concerns. However, in production environments, unsecured databases pose significant data breach risks. -https://qdrant.tech/documentation/frameworks/haystack/ +For production use, robust security systems are essential. Authentication, particularly using static API keys, is a common approach to control access and prevent unauthorized modifications. Yet, simple API authentication is insufficient for enterprise data, which requires granular control. -2024-08-15T08:50:37+05:30 +The primary challenge with static API keys is their all-or-nothing access, inadequate for role-based data segregation in enterprise applications. Additionally, a compromised key could grant attackers full access to manipulate or steal data. To strengthen the security of the vector database, developers typically need the following: -... +1. **Encryption**: This ensures that sensitive data is scrambled as it travels between the application and the vector database. This safeguards against Man-in-the-Middle ([MitM](https://en.wikipedia.org/wiki/Man-in-the-middle_attack)) attacks, where malicious actors can attempt to intercept and steal data during transmission. +2. **Role-Based Access Control**: As mentioned before, traditional static API keys grant all-or-nothing access, which is a significant security risk in enterprise environments. RBAC offers a more granular approach by defining user roles and assigning specific data access permissions based on those roles. For example, an analyst might have read-only access to specific datasets, while an administrator might have full CRUD (Create, Read, Update, Delete) permissions across the database. +3. **Deployment Flexibility**: Data residency regulations like GDPR (General Data Protection Regulation) and industry-specific compliance requirements dictate where data can be stored, processed, and accessed. Developers would need to choose a database solution which offers deployment options that comply with these regulations. This might include on-premise deployments within a company's private cloud or geographically distributed cloud deployments that adhere to data residency laws. 
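+As a simple illustration of the first two layers working together, here is a minimal client-side sketch, assuming a Qdrant deployment that already has TLS and a static API key configured; the URL and key below are placeholders, not values from this article.
+
+```python
+from qdrant_client import QdrantClient
+
+# Encrypted transport (https) plus static API-key authentication.
+client = QdrantClient(
+    url="https://qdrant.example.internal:6333",  # hypothetical TLS endpoint
+    api_key="example-static-api-key",            # placeholder key
+)
+
+print(client.get_collections())
+```
+
+The sections below describe how Qdrant itself provides these controls, and how JWT extends them with the granular, role-based access that static keys cannot offer.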
- +## How Qdrant Handles Data Privacy and Security - +One of the cornerstones of our design choices at Qdrant has been the focus on security features. We have built in a range of features keeping the enterprise user in mind, which allow building of granular access control on a fully data sovereign architecture. -https://qdrant.tech/documentation/frameworks/honeyhive/ +A Qdrant instance is unsecured by default. However, when you are ready to deploy in production, Qdrant offers a range of security features that allow you to control access to your data, protect it from breaches, and adhere to regulatory requirements. Using Qdrant, you can build granular access control, segregate roles and privileges, and create a fully data sovereign architecture. -2025-05-09T04:07:10-03:00 +### API Keys and TLS Encryption -... +For simpler use cases, Qdrant offers API key-based authentication. This includes both regular API keys and read-only API keys. Regular API keys grant full access to read, write, and delete operations, while read-only keys restrict access to data retrieval operations only, preventing write actions. - +On Qdrant Cloud, you can create API keys using the [Cloud Dashboard](https://qdrant.to/cloud). This allows you to generate API keys that give you access to a single node or cluster, or multiple clusters. You can read the steps to do so [here](/documentation/cloud/authentication/). - +![web-ui](/articles_data/data-privacy/web-ui.png) -https://qdrant.tech/hospitality-and-travel/ +For on-premise or local deployments, you'll need to configure API key authentication. This involves specifying a key in either the Qdrant configuration file or as an environment variable. This ensures that all requests to the server must include a valid API key sent in the header. -2025-05-21T18:13:48+02:00 +When using the simple API key-based authentication, you should also turn on TLS encryption. Otherwise, you are exposing the connection to sniffing and MitM attacks. To secure your connection using TLS, you would need to create a certificate and private key, and then [enable TLS](/documentation/guides/security/#tls) in the configuration. -... +API authentication, coupled with TLS encryption, offers a first layer of security for your Qdrant instance. However, to enable more granular access control, the recommended approach is to leverage JSON Web Tokens (JWTs). - +### JWT on Qdrant - +JSON Web Tokens (JWTs) are a compact, URL-safe, and stateless means of representing _claims_ to be transferred between two parties. These claims are encoded as a JSON object and are cryptographically signed. -https://qdrant.tech/legal/impressum/ +JWT is composed of three parts: a header, a payload, and a signature, which are concatenated with dots (.) to form a single string. The header contains the type of token and algorithm being used. The payload contains the claims (explained in detail later). The signature is a cryptographic hash and ensures the token’s integrity. -2024-02-28T17:57:34+01:00 +In Qdrant, JWT forms the foundation through which powerful access controls can be built. Let’s understand how. -... +JWT is enabled on the Qdrant instance by specifying the API key and turning on the **jwt_rbac** feature in the configuration (alternatively, they can be set as environment variables). For any subsequent request, the API key is used to encode or decode the token. - +The way JWT works is that just the API key is enough to generate the token, and doesn’t require any communication with the Qdrant instance or server. 
There are several libraries that help generate tokens by encoding a payload, such as [PyJWT](https://pyjwt.readthedocs.io/en/stable/) (for Python), [jsonwebtoken](https://www.npmjs.com/package/jsonwebtoken) (for JavaScript), and [jsonwebtoken](https://crates.io/crates/jsonwebtoken) (for Rust). Qdrant uses the HS256 algorithm to encode or decode the tokens. - +We will look at the payload structure shortly, but here’s how you can generate a token using PyJWT. -https://qdrant.tech/documentation/data-management/fluvio/ +```python +import jwt +import datetime -2024-09-15T21:31:35+05:30 +# Define your API key and other payload data +api_key = "your_api_key" +payload = { ... +} -... +token = jwt.encode(payload, api_key, algorithm="HS256") +print(token) +``` - +Once you have generated the token, you should include it in the subsequent requests. You can do so by providing it as a bearer token in the Authorization header, or in the API Key header of your requests. - +Below is an example of how to do so using QdrantClient in Python: -https://qdrant.tech/documentation/platforms/rivet/ +```python +from qdrant_client import QdrantClient -2024-08-15T08:50:37+05:30 +qdrant_client = QdrantClient( + "http://localhost:6333", + api_key="", # the token goes here +) +# Example search vector +search_vector = [0.1, 0.2, 0.3, 0.4] -... +# Example similarity search request +response = qdrant_client.search( + collection_name="demo_collection", + query_vector=search_vector, + limit=5 # Number of results to retrieve +) +``` - +For convenience, we have added a JWT generation tool in the Qdrant Web UI, which is present under the 🔑 tab. For your local deployments, you will find it at [http://localhost:6333/dashboard#/jwt](http://localhost:6333/dashboard#/jwt). - +### Payload Configuration -https://qdrant.tech/documentation/embeddings/jina-embeddings/ +There are several different options (claims) you can use in the JWT payload that help control access and functionality. Let’s look at them one by one. -2024-11-28T08:54:13+05:30 +**exp**: This claim is the expiration time of the token, and is a unix timestamp in seconds. After the expiration time, the token will be invalid. -... +**value_exists**: This claim validates the token against a specific key-value stored in a collection. By using this claim, you can revoke access by simply changing a value without having to invalidate the API key. - +**access**: This claim defines the access level of the token. The access level can be global read (r) or manage (m). It can also be specific to a collection, or even a subset of a collection, using read (r) and read-write (rw). - +Let’s look at a few example JWT payload configurations. -https://qdrant.tech/about-us/about-us-get-started/ +**Scenario 1: 1-hour expiry time, and read-only access to a collection** +```json +{ + "exp": 1690995200, // Set to 1 hour from the current time (Unix timestamp) + "access": [ + { + "collection": "demo_collection", + "access": "r" // Read-only access + } + ] +} -2025-05-30T14:14:31+03:00 +``` -... +**Scenario 2: 1-hour expiry time, and access to user with a specific role** - +Suppose you have a ‘users’ collection and have defined specific roles for each user, such as ‘developer’, ‘manager’, ‘admin’, ‘analyst’, and ‘revoked’. In such a scenario, you can use a combination of **exp** and **value_exists**. 
+```json +{ + "exp": 1690995200, + "value_exists": { + "collection": "users", + "matches": [ + { "key": "username", "value": "john" }, + { "key": "role", "value": "developer" } + ], + }, +} - +``` -https://qdrant.tech/documentation/platforms/keboola/ -2025-05-14T07:24:10-04:00 -... +Now, if you ever want to revoke access for a user, simply change the value of their role. All future requests will be invalid using a token payload of the above type. - +**Scenario 3: 1-hour expiry time, and read-write access to a subset of a collection** - +You can even specify access levels specific to subsets of a collection. This can be especially useful when you are leveraging [multitenancy](/documentation/guides/multiple-partitions/), and want to segregate access. +```json +{ + "exp": 1690995200, + "access": [ + { + "collection": "demo_collection", + "access": "r", + "payload": { + "user_id": "user_123456" + } + } + ] +} +``` -https://qdrant.tech/documentation/platforms/kotaemon/ -2024-11-07T03:37:15+05:30 +By combining the claims, you can fully customize the access level that a user or a role has within the vector store. -... +### Creating Role-Based Access Control (RBAC) Using JWT - +As we saw above, JWT claims create powerful levers through which you can create granular access control on Qdrant. Let’s bring it all together and understand how it helps you create Role-Based Access Control (RBAC). - +In a typical enterprise application, you will have a segregation of users based on their roles and permissions. These could be: -https://qdrant.tech/documentation/frameworks/langchain/ +1. **Admin or Owner:** with full access, and can generate API keys. +2. **Editor:** with read-write access levels to specific collections. +3. **Viewer:** with read-only access to specific collections. +4. **Data Scientist or Analyst:** with read-only access to specific collections. +5. **Developer:** with read-write access to development- or testing-specific collections, but limited access to production data. +6. **Guest:** with limited read-only access to publicly available collections. -2024-08-29T19:19:43+05:30 +In addition, you can create access levels within sections of a collection. In a multi-tenant application, where you have used payload-based partitioning, you can create read-only access for specific user roles for a subset of the collection that belongs to that user. -... +Your application requirements will eventually help you decide the roles and access levels you should create. For example, in an application managing customer data, you could create additional roles such as: - +**Customer Support Representative**: read-write access to customer service-related data but no access to billing information. - +**Billing Department**: read-only access to billing data and read-write access to payment records. -https://qdrant.tech/documentation/frameworks/langchain-go/ +**Marketing Analyst**: read-only access to anonymized customer data for analytics. -2024-11-04T16:55:24+01:00 +Each role can be assigned a JWT with claims that specify expiration times, read/write permissions for collections, and validating conditions. -... 
+In such an application, an example JWT payload for a customer support representative role could be: - +```json +{ + "exp": 1690995200, + "access": [ + { + "collection": "customer_data", + "access": "rw", + "payload": { + "department": "support" + } + } + ], + "value_exists": { + "collection": "departments", + "matches": [ + { "key": "department", "value": "support" } + ] + } +} +``` - -https://qdrant.tech/documentation/frameworks/langchain4j/ +As you can see, by implementing RBAC, you can ensure proper segregation of roles and their privileges, and avoid privacy loopholes in your application. -2024-08-15T08:50:37+05:30 +## Qdrant Hybrid Cloud and Data Sovereignty -... +Data governance varies by country, especially for global organizations dealing with different regulations on data privacy, security, and access. This often necessitates deploying infrastructure within specific geographical boundaries. - +To address these needs, the vector database you choose should support deployment and scaling within your controlled infrastructure. [Qdrant Hybrid Cloud](/documentation/hybrid-cloud/) offers this flexibility, along with features like sharding, replicas, JWT authentication, and monitoring. - +Qdrant Hybrid Cloud integrates Kubernetes clusters from various environments—cloud, on-premises, or edge—into a unified managed service. This allows organizations to manage Qdrant databases through the Qdrant Cloud UI while keeping the databases within their infrastructure. -https://qdrant.tech/documentation/frameworks/langgraph/ +With JWT and RBAC, Qdrant Hybrid Cloud provides a secure, private, and sovereign vector store. Enterprises can scale their AI applications geographically, comply with local laws, and maintain strict data control. -2024-11-20T19:27:09+05:30 +## Conclusion -... +Vector similarity is increasingly becoming the backbone of AI applications that leverage unstructured data. By transforming data into vectors – their numerical representations – organizations can build powerful applications that harness semantic search, ranging from better recommendation systems to algorithms that help with personalization, or powerful customer support chatbots. - +However, to fully leverage the power of AI in production, organizations need to choose a vector database that offers strong privacy and security features, while also helping them adhere to local laws and regulations. - +Qdrant provides exceptional efficiency and performance, along with the capability to implement granular access control to data, Role-Based Access Control (RBAC), and the ability to build a fully data-sovereign architecture. -https://qdrant.tech/legal-tech/ +Interested in mastering vector search security and deployment strategies? [Join our Discord community](https://discord.gg/qdrant) to explore more advanced search strategies, connect with other developers and researchers in the industry, and stay updated on the latest innovations! -2025-04-24T18:13:38+02:00 +<|page-39-lllmstxt|> +# Discovery needs context -... +When Christopher Columbus and his crew sailed to cross the Atlantic Ocean, they were not looking for the Americas. They were looking for a new route to India because they were convinced that the Earth was round. They didn't know anything about a new continent, but since they were going west, they stumbled upon it. - +They couldn't reach their _target_, because the geography didn't let them, but once they realized it wasn't India, they claimed it a new "discovery" for their crown. 
If we consider that sailors need water to sail, then we can establish a _context_ which is positive in the water, and negative on land. Once the sailors' search was stopped by the land, they could not go any further, and a new route was found. Let's keep these concepts of _target_ and _context_ in mind as we explore the new functionality of Qdrant: __Discovery search__. - +## What is discovery search? -https://qdrant.tech/documentation/frameworks/llama-index/ +In version 1.7, Qdrant [released](/articles/qdrant-1.7.x/) this novel API that lets you constrain the space in which a search is performed, relying only on pure vectors. This is a powerful tool that lets you explore the vector space in a more controlled way. It can be used to find points that are not necessarily closest to the target, but are still relevant to the search. -2024-08-15T08:50:37+05:30 +You can already select which points are available to the search by using payload filters. This by itself is very versatile because it allows us to craft complex filters that show only the points that satisfy their criteria deterministically. However, the payload associated with each point is arbitrary and cannot tell us anything about their position in the vector space. In other words, filtering out irrelevant points can be seen as creating a _mask_ rather than a hyperplane –cutting in between the positive and negative vectors– in the space. -... +## Understanding context - +This is where a __vector _context___ can help. We define _context_ as a list of pairs. Each pair is made up of a positive and a negative vector. With a context, we can define hyperplanes within the vector space, which always prefer the positive over the negative vectors. This effectively partitions the space where the search is performed. After the space is partitioned, we then need a _target_ to return the points that are more similar to it. - +![Discovery search visualization](/articles_data/discovery-search/discovery-search.png) -https://qdrant.tech/documentation/platforms/make/ +While positive and negative vectors might suggest the use of the recommendation interface, in the case of _context_ they need to be paired up in a positive-negative fashion. This is inspired by the machine-learning concept of _triplet loss_, where you have three vectors: an anchor, a positive, and a negative. Triplet loss evaluates how much closer the anchor is to the positive than to the negative vector, so that learning happens by "moving" the positive and negative points to try to get a better evaluation. However, during discovery, we consider the positive and negative vectors as static points, and we search through the whole dataset for the "anchors", or result candidates, which fit this characteristic better. -2024-08-15T08:50:37+05:30 +![Triplet loss](/articles_data/discovery-search/triplet-loss.png) -... +[__Discovery search__](#discovery-search), then, is made up of two main inputs: - +- __target__: the main point of interest. +- __context__: the pairs of positive and negative points we just defined. - +However, it is not the only way to use it. Alternatively, you can __only__ provide a context, which invokes a [__Context Search__](#context-search). This is useful when you want to explore the space defined by the context, but don't have a specific target in mind. But hold your horses, we'll get to that [later](#context-search).
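+To make these two inputs concrete, here is a minimal sketch of a discovery query written with the Python client, in the spirit of the API introduced in Qdrant 1.7; the collection name and the raw vectors are placeholders rather than values taken from the demo below.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient("localhost", port=6333)
+
+# Discovery search: a target plus context pairs that constrain the space.
+results = client.discover(
+    collection_name="demo_collection",      # placeholder collection
+    target=[0.2, 0.1, 0.9, 0.7],            # the main point of interest
+    context=[
+        models.ContextExamplePair(
+            positive=[0.3, 0.2, 0.8, 0.6],  # prefer points on this side...
+            negative=[0.9, 0.8, 0.1, 0.1],  # ...over this one
+        ),
+    ],
+    limit=10,
+)
+```
+
+Dropping the `target` and passing only `context` turns the request into the context search described later in this article.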
-https://qdrant.tech/documentation/frameworks/mastra/ +## Real-world discovery search applications -2024-12-20T13:30:42+05:30 +Let's talk about the first case: context with a target. -... +To understand why this is useful, let's take a look at a real-world example: using a multimodal encoder like [CLIP](https://openai.com/blog/clip/) to search for images, from text __and__ images. +CLIP is a neural network that can embed both images and text into the same vector space. This means that you can search for images using either a text query or an image query. For this example, we'll reuse our [food recommendations demo](https://food-discovery.qdrant.tech/) by typing "burger" in the text input: - +![Burger text input in food demo](/articles_data/discovery-search/search-for-burger.png) - +This is basically nearest neighbor search, and while technically we have only images of burgers, one of them is a logo representation of a burger. We're looking for actual burgers, though. Let's try to exclude images like that by adding it as a negative example: -https://qdrant.tech/documentation/frameworks/mem0/ +![Try to exclude burger drawing](/articles_data/discovery-search/try-to-exclude-non-burger.png) -2024-10-05T13:55:10+05:30 +Wait a second, what has just happened? These pictures have __nothing__ to do with burgers, and still, they appear on the first results. Is the demo broken? -... +Turns out, multimodal encoders might not work how you expect them to. Images and text are embedded in the same space, but they are not necessarily close to each other. This means that we can create a mental model of the distribution as two separate planes, one for images and one for text. - +![Mental model of CLIP embeddings](/articles_data/discovery-search/clip-mental-model.png) - +This is where discovery excels because it allows us to constrain the space considering the same mode (images) while using a target from the other mode (text). -https://qdrant.tech/documentation/frameworks/nlweb/ +![Cross-modal search with discovery](/articles_data/discovery-search/clip-discovery.png) -2025-05-19T21:26:59+05:30 +Discovery search also lets us keep giving feedback to the search engine in the shape of more context pairs, so we can keep refining our search until we find what we are looking for. -... +Another intuitive example: imagine you're looking for a fish pizza, but pizza names can be confusing, so you can just type "pizza", and prefer a fish over meat. Discovery search will let you use these inputs to suggest a fish pizza... even if it's not called fish pizza! - +![Simple discovery example](/articles_data/discovery-search/discovery-example-with-images.png) - +## Context search -https://qdrant.tech/documentation/data-management/mindsdb/ +Now, the second case: only providing context. -2024-08-15T08:50:37+05:30 +Ever been caught in the same recommendations on your favorite music streaming service? This may be caused by getting stuck in a similarity bubble. As user input gets more complex, diversity becomes scarce, and it becomes harder to force the system to recommend something different. -... +![Context vs recommendation search](/articles_data/discovery-search/context-vs-recommendation.png) - +__Context search__ solves this by de-focusing the search around a single point. Instead, it selects points randomly from within a zone in the vector space. This search is the most influenced by _triplet loss_, as the score can be thought of as _"how much a point is closer to a negative than a positive vector?"_. 
If it is closer to the positive one, then its score will be zero, same as any other point within the same zone. But if it is on the negative side, it will be assigned a more and more negative score the further it gets. - +![Context search visualization](/articles_data/discovery-search/context-search.png) -https://qdrant.tech/documentation/embeddings/mistral/ +Creating complex tastes in a high-dimensional space becomes easier since you can just add more context pairs to the search. This way, you should be able to constrain the space enough so you select points from a per-search "category" created just from the context in the input. -2024-11-28T08:54:13+05:30 +![A more complex context search](/articles_data/discovery-search/complex-context-search.png) -... +This way you can give refreshing recommendations, while still being in control by providing positive and negative feedback, or even by trying out different permutations of pairs. - +## Key takeaways: +- Discovery search is a powerful tool for controlled exploration in vector spaces. +- Context, consisting of positive and negative vector pairs, constrains the search space, while a target guides the search. +- Real-world applications include multimodal search, diverse recommendations, and context-driven exploration. +- Ready to learn more about the math behind it and how to use it? Check out the [documentation](/documentation/concepts/explore/#discovery-api). - +<|page-40-lllmstxt|> +> **Embeddings** are numerical machine learning representations of the semantics of the input data. They capture the meaning of complex, high-dimensional data, like text, images, or audio, into vectors, enabling algorithms to process and analyze the data more efficiently. -https://qdrant.tech/documentation/embeddings/mixedbread/ +You know when you’re scrolling through your social media feeds and the content just feels incredibly tailored to you? There's the news you care about, followed by a perfect tutorial with your favorite tech stack, and then a meme that makes you laugh so hard you snort. -2024-11-28T08:54:13+05:30 +Or what about how YouTube recommends videos you end up loving? They are by creators you've never even heard of, and you didn’t even send YouTube a note about your ideal content lineup. -... +This is the magic of embeddings. - +These are the result of **deep learning models** analyzing the data of your interactions online: your likes, shares, comments, searches, the kind of content you linger on, and even the content you decide to skip. This also allows the algorithm to predict future content that you are likely to appreciate. - +The same embeddings can be repurposed for search, ads, and other features, creating a highly personalized user experience. -https://qdrant.tech/documentation/embeddings/mixpeek/ -2024-11-28T08:54:13+05:30 +![How embeddings are applied to perform recommendations and other use cases](/articles_data/what-are-embeddings/Embeddings-Use-Case.jpg) -... - +They make [high-dimensional](https://www.sciencedirect.com/topics/computer-science/high-dimensional-data) data more manageable. This reduces storage requirements, improves computational efficiency, and makes sense of a ton of **unstructured** data. - -https://qdrant.tech/documentation/platforms/n8n/ +## Why use vector embeddings? -2025-06-06T22:10:24+05:30 +The **nuances** of natural language or the hidden **meaning** in large datasets of images, sounds, or user interactions are hard to fit into a table.
Traditional relational databases can't efficiently query most types of data being currently used and produced, making the **retrieval** of this information very limited. -... +In the embeddings space, synonyms tend to appear in similar contexts and end up having similar embeddings. The space is a system smart enough to understand that "pretty" and "attractive" are playing for the same team. Without being explicitly told so. - +That’s the magic. - +At their core, vector embeddings are about semantics. They take the idea that "a word is known by the company it keeps" and apply it on a grand scale. -https://qdrant.tech/documentation/frameworks/neo4j-graphrag/ -2024-11-07T02:58:58+05:30 +![Example of how synonyms are placed closer together in the embeddings space](/articles_data/what-are-embeddings/Similar-Embeddings.jpg) -... - +This capability is crucial for creating search systems, recommendation engines, retrieval augmented generation (RAG) and any application that benefits from a deep understanding of content. - +## How do embeddings work? -https://qdrant.tech/documentation/embeddings/nomic/ +Embeddings are created through neural networks. They capture complex relationships and semantics into [dense vectors](https://www1.se.cuhk.edu.hk/~seem5680/lecture/semantics-with-dense-vectors-2018.pdf) which are more suitable for machine learning and data processing applications. They can then project these vectors into a proper **high-dimensional** space, specifically, a [Vector Database](/articles/what-is-a-vector-database/). -2024-11-28T08:54:13+05:30 -... - +![The process for turning raw data into embeddings and placing them into the vector space](/articles_data/what-are-embeddings/How-Embeddings-Work.jpg) - -https://qdrant.tech/documentation/embeddings/nvidia/ +The meaning of a data point is implicitly defined by its **position** on the vector space. After the vectors are stored, we can use their spatial properties to perform [nearest neighbor searches](https://en.wikipedia.org/wiki/Nearest_neighbor_search#:~:text=Nearest%20neighbor%20search%20(NNS)%2C,the%20larger%20the%20function%20values.). These searches retrieve semantically similar items based on how close they are in this space. -2024-11-28T08:54:13+05:30 +> The quality of the vector representations drives the performance. The embedding model that works best for you depends on your use case. -... - +### Creating vector embeddings - +Embeddings translate the complexities of human language to a format that computers can understand. It uses neural networks to assign **numerical values** to the input data, in a way that similar data has similar values. -https://qdrant.tech/documentation/embeddings/ollama/ -2024-11-28T08:54:13+05:30 +![The process of using Neural Networks to create vector embeddings](/articles_data/what-are-embeddings/How-Do-Embeddings-Work_.jpg) -... - +For example, if I want to make my computer understand the word 'right', I can assign a number like 1.3. So when my computer sees 1.3, it sees the word 'right’. - +Now I want to make my computer understand the context of the word ‘right’. I can use a two-dimensional vector, such as [1.3, 0.8], to represent 'right'. The first number 1.3 still identifies the word 'right', but the second number 0.8 specifies the context. -https://qdrant.tech/documentation/embeddings/openai/ +We can introduce more dimensions to capture more nuances. 
For example, a third dimension could represent formality of the word, a fourth could indicate its emotional connotation (positive, neutral, negative), and so on. -2024-11-28T08:54:13+05:30 +The evolution of this concept led to the development of embedding models like [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) and [GloVe](https://en.wikipedia.org/wiki/GloVe). They learn to understand the context in which words appear to generate high-dimensional vectors for each word, capturing far more complex properties. -... - - +![How Word2Vec model creates the embeddings for a word](/articles_data/what-are-embeddings/Word2Vec-model.jpg) -https://qdrant.tech/documentation/frameworks/openai-agents/ -2025-04-30T14:10:48+05:30 +However, these models still have limitations. They generate a single vector per word, based on its usage across texts. This means all the nuances of the word "right" are blended into one vector representation. That is not enough information for computers to fully understand the context. -... +So, how do we help computers grasp the nuances of language in different contexts? In other words, how do we differentiate between: - - -https://qdrant.tech/about-us/about-us-engineering-culture/ +* "your answer is right" +* "turn right at the corner" +* "everyone has the right to freedom of speech" -2025-05-30T14:14:31+03:00 +Each of these sentences use the word 'right', with different meanings. -... +More advanced models like [BERT](https://en.wikipedia.org/wiki/BERT_(language_model)) and [GPT](https://en.wikipedia.org/wiki/Generative_pre-trained_transformer) use deep learning models based on the [transformer architecture](https://arxiv.org/abs/1706.03762), which helps computers consider the full context of a word. These models pay attention to the entire context. The model understands the specific use of a word in its **surroundings**, and then creates different embeddings for each. - - -https://qdrant.tech/documentation/frameworks/pandas-ai/ +![How the BERT model creates the embeddings for a word](/articles_data/what-are-embeddings/BERT-model.jpg) -2025-02-18T21:01:07+05:30 -... +But how does this process of understanding and interpreting work in practice? Think of the term: "biophilic design", for example. To generate its embedding, the transformer architecture can use the following contexts: - - -https://qdrant.tech/partners/ +* "Biophilic design incorporates natural elements into architectural planning." +* "Offices with biophilic design elements report higher employee well-being." +* "...plant life, natural light, and water features are key aspects of biophilic design." -2024-06-17T16:56:32+03:00 +And then it compares contexts to known architectural and design principles: -... - - +* "Sustainable designs prioritize environmental harmony." +* "Ergonomic spaces enhance user comfort and health." -https://qdrant.tech/documentation/frameworks/canopy/ +The model creates a vector embedding for "biophilic design" that encapsulates the concept of integrating natural elements into man-made environments. Augmented with attributes that highlight the correlation between this integration and its positive impact on health, well-being, and environmental sustainability. -2024-08-15T08:50:37+05:30 -... +### Integration with embedding APIs - +Selecting the right embedding model for your use case is crucial to your application performance. 
Qdrant makes it easier by offering seamless integration with the best selection of embedding APIs, including [Cohere](/documentation/embeddings/cohere/), [Gemini](/documentation/embeddings/gemini/), [Jina Embeddings](/documentation/embeddings/jina-embeddings/), [OpenAI](/documentation/embeddings/openai/), [Aleph Alpha](/documentation/embeddings/aleph-alpha/), [Fastembed](https://github.com/qdrant/fastembed), and [AWS Bedrock](/documentation/embeddings/bedrock/).
-
+If you’re looking for NLP and rapid prototyping, including language translation, question-answering, and text generation, OpenAI is a great choice. Gemini is ideal for image search, duplicate detection, and clustering tasks.
-https://qdrant.tech/documentation/platforms/pipedream/
+Fastembed, which we’ll use in the example below, is designed for efficiency and speed, great for applications needing low-latency responses, such as autocomplete and instant content recommendations.
-2024-08-15T08:50:37+05:30
+We plan to go deeper into selecting the best model based on performance, cost, integration ease, and scalability in a future post.
-...
+## Create a neural search service with Fastembed
-
+Now that you’re familiar with the core concepts around vector embeddings, how about starting to build your own [Neural Search Service](/documentation/tutorials/neural-search/)?
-
+The tutorial guides you through a practical application of using Qdrant for document management, based on descriptions of companies from [startups-list.com](https://www.startups-list.com/): from embedding the data and loading it into Qdrant's vector database, to constructing a search API, and finally deploying your solution with FastAPI.
-https://qdrant.tech/documentation/platforms/portable/
+Check out what the final version of this project looks like on the [live online demo](https://qdrant.to/semantic-search-demo).
-2024-08-15T08:50:37+05:30
+Let us know what you’re building with embeddings! Join our [Discord](https://discord.gg/qdrant-907569970500743200) community and share your projects!
-...
+<|page-41-lllmstxt|>
+Think of a library with a vast index card system. Each index card only has a few keywords marked out (sparse vector) of a large possible set for each book (document). This is what sparse vectors enable for text.
-
+## What are sparse and dense vectors?
-
+Sparse vectors are like the Marie Kondo of data—keeping only what sparks joy (or relevance, in this case).
-https://qdrant.tech/documentation/platforms/powerapps/
+Consider a simplified example of 2 documents, each with 200 words. A dense vector would have several hundred non-zero values, whereas a sparse vector could have far fewer, say only 20, non-zero values.
-2025-01-10T21:05:50+05:30
+In this example, we assume only 2 words or tokens are selected from each document.
-...
+```python
+dense = [0.2, 0.3, 0.5, 0.7, ...] # several hundred floats
+sparse = {331: 0.5, 14136: 0.7} # just 2 non-zero key-value pairs
+```
-
+The numbers 331 and 14136 map to specific tokens in the vocabulary, e.g. `['chocolate', 'icecream']`. The rest of the values are zero. This is why it's called a sparse vector.
-
+The tokens aren't always words though, sometimes they can be sub-words: `['ch', 'ocolate']` too.
-https://qdrant.tech/documentation/embeddings/premai/
+They're pivotal in information retrieval, especially in ranking and search systems.
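+To make the idea concrete, here is a tiny illustrative sketch (plain Python, made-up token ids and weights) of how two such sparse representations are compared: only the dimensions present in both vectors contribute to the score.
+```python
+# Toy sketch: score a sparse query against a sparse document with a dot product.
+# Token ids and weights are made up for illustration.
+query = {331: 0.4, 5678: 0.9}       # {token_id: weight}
+document = {331: 0.5, 14136: 0.7}
+
+# Only overlapping token ids contribute; every other dimension is zero.
+score = sum(w * document.get(token_id, 0.0) for token_id, w in query.items())
+print(score)  # 0.2, since only token 331 appears in both
+```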
BM25, a standard ranking function used by search engines like [Elasticsearch](https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors), exemplifies this role. BM25 calculates the relevance of documents to a given search query.
-2024-11-28T08:54:13+05:30
+BM25's capabilities are well-established, yet it has its limitations.
-...
+BM25 relies solely on the frequency of words in a document and does not attempt to comprehend the meaning or the contextual importance of the words. Additionally, it requires the computation of the entire corpus's statistics in advance, posing a challenge for large datasets.
-
+Sparse vectors harness the power of neural networks to surmount these limitations while retaining the ability to query exact words and phrases.
+They excel in handling large text data, making them crucial in modern data processing and marking an advancement over traditional methods such as BM25.
-
+## Understanding sparse vectors
-https://qdrant.tech/pricing/
+Sparse vectors are a representation where each dimension corresponds to a word or subword, greatly aiding in interpreting document rankings. This clarity is why sparse vectors are essential in modern search and recommendation systems, complementing the meaning-rich embeddings, or dense vectors.
-2024-08-20T12:47:35-07:00
+Dense vectors from models like OpenAI Ada-002 or Sentence Transformers contain non-zero values for every element. In contrast, sparse vectors focus on relative word weights per document, with most values being zero. This results in a more efficient and interpretable system, especially in text-heavy applications like search.
-...
+Sparse vectors shine in domains and scenarios where many rare keywords or specialized terms are present.
+For example, in the medical domain, many rare terms are not present in the general vocabulary, so general-purpose dense vectors cannot capture the nuances of the domain.
-
-
+| Feature | Sparse Vectors | Dense Vectors |
+|---------------------------|---------------------------------------------|----------------------------------------------|
+| **Data Representation** | Majority of elements are zero | All elements are non-zero |
+| **Computational Efficiency** | Generally higher, especially in operations involving zero elements | Lower, as operations are performed on all elements |
+| **Information Density** | Less dense, focuses on key features | Highly dense, capturing nuanced relationships |
+| **Example Applications** | Text search, Hybrid search | [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/), many general machine learning tasks |
-https://qdrant.tech/legal/privacy-policy/
+Where do sparse vectors fail though? They're not great at capturing nuanced relationships between words. For example, they can't capture the relationship between "king" and "queen" as well as dense vectors.
-2025-06-19T13:22:43+02:00
+## SPLADE
-...
+Let's check out [SPLADE](https://europe.naverlabs.com/research/computer-science/splade-a-sparse-bi-encoder-bert-based-model-achieves-effective-and-efficient-full-text-document-ranking/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors), an excellent way to make sparse vectors. Let's look at some numbers first.
Higher is better: - +| Model | MRR@10 (MS MARCO Dev) | Type | +|--------------------|---------|----------------| +| BM25 | 0.184 | Sparse | +| TCT-ColBERT | 0.359 | Dense | +| doc2query-T5 [link](https://github.com/castorini/docTTTTTquery) | 0.277 | Sparse | +| SPLADE | 0.322 | Sparse | +| SPLADE-max | 0.340 | Sparse | +| SPLADE-doc | 0.322 | Sparse | +| DistilSPLADE-max | 0.368 | Sparse | - +All numbers are from [SPLADEv2](https://arxiv.org/abs/2109.10086). MRR is [Mean Reciprocal Rank](https://www.wikiwand.com/en/Mean_reciprocal_rank#References), a standard metric for ranking. [MS MARCO](https://microsoft.github.io/MSMARCO-Passage-Ranking/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) is a dataset for evaluating ranking and retrieval for passages. -https://qdrant.tech/private-cloud/ +SPLADE is quite flexible as a method, with regularization knobs that can be tuned to obtain [different models](https://github.com/naver/splade) as well: -2024-05-21T09:57:56+02:00 +> SPLADE is more a class of models rather than a model per se: depending on the regularization magnitude, we can obtain different models (from very sparse to models doing intense query/doc expansion) with different properties and performance. -... +First, let's look at how to create a sparse vector. Then, we'll look at the concepts behind SPLADE. - +## Creating a sparse vector - +We'll explore two different ways to create a sparse vector. The higher performance way to create a sparse vector from dedicated document and query encoders. We'll look at a simpler approach -- here we will use the same model for both document and query. We will get a dictionary of token ids and their corresponding weights for a sample text - representing a document. -https://qdrant.tech/documentation/platforms/privategpt/ +If you'd like to follow along, here's a [Colab Notebook](https://colab.research.google.com/gist/NirantK/ad658be3abefc09b17ce29f45255e14e/splade-single-encoder.ipynb), [alternate link](https://gist.github.com/NirantK/ad658be3abefc09b17ce29f45255e14e) with all the code. -2024-08-15T08:50:37+05:30 +### Setting Up +```python +from transformers import AutoModelForMaskedLM, AutoTokenizer -... +model_id = "naver/splade-cocondenser-ensembledistil" - +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForMaskedLM.from_pretrained(model_id) - +text = """Arthur Robert Ashe Jr. (July 10, 1943 – February 6, 1993) was an American professional tennis player. He won three Grand Slam titles in singles and two in doubles.""" +``` -https://qdrant.tech/documentation/cloud-tools/pulumi/ +### Computing the sparse vector +```python +import torch -2024-11-19T18:01:59-08:00 -... +def compute_vector(text): + """ + Computes a vector from logits and attention mask using ReLU, log, and max operations. + """ + tokens = tokenizer(text, return_tensors="pt") + output = model(**tokens) + logits, attention_mask = output.logits, tokens.attention_mask + relu_log = torch.log(1 + torch.relu(logits)) + weighted_log = relu_log * attention_mask.unsqueeze(-1) + max_val, _ = torch.max(weighted_log, dim=1) + vec = max_val.squeeze() - + return vec, tokens - -https://qdrant.tech/articles/ +vec, tokens = compute_vector(text) +print(vec.shape) +``` -2024-12-20T13:10:51+01:00 +You'll notice that there are 38 tokens in the text based on this tokenizer. This will be different from the number of tokens in the vector. In a TF-IDF, we'd assign weights only to these tokens or words. 
In SPLADE, we assign weights to all the tokens in the vocabulary using this vector using our learned model. -... +## Term expansion and weights +```python +def extract_and_map_sparse_vector(vector, tokenizer): + """ + Extracts non-zero elements from a given vector and maps these elements to their human-readable tokens using a tokenizer. The function creates and returns a sorted dictionary where keys are the tokens corresponding to non-zero elements in the vector, and values are the weights of these elements, sorted in descending order of weights. - + This function is useful in NLP tasks where you need to understand the significance of different tokens based on a model's output vector. It first identifies non-zero values in the vector, maps them to tokens, and sorts them by weight for better interpretability. - + Args: + vector (torch.Tensor): A PyTorch tensor from which to extract non-zero elements. + tokenizer: The tokenizer used for tokenization in the model, providing the mapping from tokens to indices. -https://qdrant.tech/blog/ + Returns: + dict: A sorted dictionary mapping human-readable tokens to their corresponding non-zero weights. + """ -2024-05-21T09:57:56+02:00 + # Extract indices and values of non-zero elements in the vector + cols = vector.nonzero().squeeze().cpu().tolist() + weights = vector[cols].cpu().tolist() -... + # Map indices to tokens and create a dictionary + idx2token = {idx: token for token, idx in tokenizer.get_vocab().items()} + token_weight_dict = { + idx2token[idx]: round(weight, 2) for idx, weight in zip(cols, weights) + } - + # Sort the dictionary by weights in descending order + sorted_token_weight_dict = { + k: v + for k, v in sorted( + token_weight_dict.items(), key=lambda item: item[1], reverse=True + ) + } - + return sorted_token_weight_dict -https://qdrant.tech/cloud/ -2024-08-20T11:44:59-07:00 +# Usage example +sorted_tokens = extract_and_map_sparse_vector(vec, tokenizer) +sorted_tokens +``` -... +There will be 102 sorted tokens in total. This has expanded to include tokens that weren't in the original text. This is the term expansion we will talk about next. - +Here are some terms that are added: "Berlin", and "founder" - despite having no mention of Arthur's race (which leads to Owen's Berlin win) and his work as the founder of Arthur Ashe Institute for Urban Health. Here are the top few `sorted_tokens` with a weight of more than 1: - +```python +{ + "ashe": 2.95, + "arthur": 2.61, + "tennis": 2.22, + "robert": 1.74, + "jr": 1.55, + "he": 1.39, + "founder": 1.36, + "doubles": 1.24, + "won": 1.22, + "slam": 1.22, + "died": 1.19, + "singles": 1.1, + "was": 1.07, + "player": 1.06, + "titles": 0.99, + ... +} +``` -https://qdrant.tech/demo/ +If you're interested in using the higher-performance approach, check out the following models: -2024-09-06T13:14:12+02:00 +1. [naver/efficient-splade-VI-BT-large-doc](https://huggingface.co/naver/efficient-splade-vi-bt-large-doc) +2. [naver/efficient-splade-VI-BT-large-query](https://huggingface.co/naver/efficient-splade-vi-bt-large-doc) -... +## Why SPLADE works: term expansion - +Consider a query "solar energy advantages". SPLADE might expand this to include terms like "renewable," "sustainable," and "photovoltaic," which are contextually relevant but not explicitly mentioned. This process is called term expansion, and it's a key component of SPLADE. - +SPLADE learns the query/document expansion to include other relevant terms. 
This is a crucial advantage over other sparse methods which include the exact word, but completely miss the contextually relevant ones. -https://qdrant.tech/qdrant-for-startups/ +This expansion has a direct relationship with what we can control when making a SPLADE model: Sparsity via Regularisation. The number of tokens (BERT wordpieces) we use to represent each document. If we use more tokens, we can represent more terms, but the vectors become denser. This number is typically between 20 to 200 per document. As a reference point, the dense BERT vector is 768 dimensions, OpenAI Embedding is 1536 dimensions, and the sparse vector is 30 dimensions. -2024-09-30T18:44:08+02:00 +For example, assume a 1M document corpus. Say, we use 100 sparse token ids + weights per document. Correspondingly, dense BERT vector would be 768M floats, the OpenAI Embedding would be 1.536B floats, and the sparse vector would be a maximum of 100M integers + 100M floats. This could mean a **10x reduction in memory usage**, which is a huge win for large-scale systems: -... +| Vector Type | Memory (GB) | +|-------------------|-------------------------| +| Dense BERT Vector | 6.144 | +| OpenAI Embedding | 12.288 | +| Sparse Vector | 1.12 | - +### How SPLADE works: leveraging BERT - +SPLADE leverages a transformer architecture to generate sparse representations of documents and queries, enabling efficient retrieval. Let's dive into the process. -https://qdrant.tech/hybrid-cloud/ +The output logits from the transformer backbone are inputs upon which SPLADE builds. The transformer architecture can be something familiar like BERT. Rather than producing dense probability distributions, SPLADE utilizes these logits to construct sparse vectors—think of them as a distilled essence of tokens, where each dimension corresponds to a term from the vocabulary and its associated weight in the context of the given document or query. -2024-05-21T10:11:09+02:00 +This sparsity is critical; it mirrors the probability distributions from a typical [Masked Language Modeling](http://jalammar.github.io/illustrated-bert/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) task but is tuned for retrieval effectiveness, emphasizing terms that are both: -... +1. Contextually relevant: Terms that represent a document well should be given more weight. +2. Discriminative across documents: Terms that a document has, and other documents don't, should be given more weight. - +The token-level distributions that you'd expect in a standard transformer model are now transformed into token-level importance scores in SPLADE. These scores reflect the significance of each term in the context of the document or query, guiding the model to allocate more weight to terms that are likely to be more meaningful for retrieval purposes. - +The resulting sparse vectors are not only memory-efficient but also tailored for precise matching in the high-dimensional space of a search engine like Qdrant. -https://qdrant.tech/stars/ +### Interpreting SPLADE -2024-06-17T16:56:32+03:00 +A downside of dense vectors is that they are not interpretable, making it difficult to understand why a document is relevant to a query. -... +SPLADE importance estimation can provide insights into the 'why' behind a document's relevance to a query. By shedding light on which tokens contribute most to the retrieval score, SPLADE offers some degree of interpretability alongside performance, a rare feat in the realm of neural IR systems. 
For engineers working on search, this transparency is invaluable.
-
+## Known limitations of SPLADE
-
+### Pooling strategy
+The switch to max pooling in SPLADE improved its performance on the MS MARCO and TREC datasets. However, this indicates a potential limitation of the baseline SPLADE pooling method, suggesting that SPLADE's performance is sensitive to the choice of pooling strategy.
-https://qdrant.tech/qdrant-vector-database/
+### Document and query encoder
+The SPLADE model variant that uses a document encoder with max pooling but no query encoder reaches the same performance level as the prior SPLADE model. This suggests a limitation in the necessity of a query encoder, potentially affecting the efficiency of the model.
-2024-08-29T08:43:52-04:00
+### Other sparse vector methods
-...
+SPLADE is not the only method to create sparse vectors.
-
+Essentially, sparse vectors are a superset of TF-IDF and BM25, which are the most popular text retrieval methods.
+In other words, you can create a sparse vector using the term frequency and inverse document frequency (TF-IDF) to reproduce the BM25 score exactly.
-
+Additionally, attention weights from Sentence Transformers can be used to create sparse vectors.
+This method preserves the ability to query exact words and phrases but avoids the computational overhead of query expansion used in SPLADE.
-https://qdrant.tech/rag/rag-evaluation-guide/
+We will cover these methods in detail in a future article.
-2024-09-16T18:43:11+02:00
+## Leveraging sparse vectors in Qdrant for hybrid search
-...
+Qdrant supports a separate index for Sparse Vectors.
+This enables you to use the same collection for both dense and sparse vectors.
+Each "Point" in Qdrant can have both dense and sparse vectors.
-
+But let's first take a look at how you can work with sparse vectors in Qdrant.
-
+## Practical implementation in Python
-https://qdrant.tech/rag/
+Let's dive into how Qdrant handles sparse vectors with an example. Here is what we will cover:
-2024-08-20T11:45:42-07:00
+1. Setting Up Qdrant Client: Initially, we establish a connection with Qdrant using the QdrantClient. This setup is crucial for subsequent operations.
-...
+2. Creating a Collection with Sparse Vector Support: In Qdrant, a collection is a container for your vectors. Here, we create a collection specifically designed to support sparse vectors. This is done using the create_collection method where we define the parameters for sparse vectors, such as setting the index configuration.
-
+3. Inserting Sparse Vectors: Once the collection is set up, we can insert sparse vectors into it. This involves defining the sparse vector with its indices and values, and then upserting this point into the collection.
-
+4. Querying with Sparse Vectors: To perform a search, we first prepare a query vector. This involves computing the vector from a query text and extracting its indices and values. We then use these details to construct a query against our collection.
-https://qdrant.tech/documentation/frameworks/ragbits/
+5. Retrieving and Interpreting Results: The search operation returns results that include the id of the matching document, its score, and other relevant details. The score is a crucial aspect, reflecting the similarity between the query and the documents in the collection.
-2024-11-07T08:29:10+05:30
+### 1. Set up
-...
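+The snippets below use the Qdrant Python client. They assume it is installed (`pip install qdrant-client`) and that the client class and its `models` module are imported first:
+```python
+from qdrant_client import QdrantClient, models
+```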
+```python +# Qdrant client setup +client = QdrantClient(":memory:") - +# Define collection name +COLLECTION_NAME = "example_collection" - +# Insert sparse vector into Qdrant collection +point_id = 1 # Assign a unique ID for the point +``` -https://qdrant.tech/recommendations/ +### 2. Create a collection with sparse vector support -2024-08-20T12:49:28-07:00 +```python +client.create_collection( + collection_name=COLLECTION_NAME, + vectors_config={}, + sparse_vectors_config={ + "text": models.SparseVectorParams( + index=models.SparseIndexParams( + on_disk=False, + ) + ) + }, +) +``` -... - +### 3. Insert sparse vectors - +Here, we see the process of inserting a sparse vector into the Qdrant collection. This step is key to building a dataset that can be quickly retrieved in the first stage of the retrieval process, utilizing the efficiency of sparse vectors. Since this is for demonstration purposes, we insert only one point with Sparse Vector and no dense vector. -https://qdrant.tech/documentation/data-management/redpanda/ +```python +client.upsert( + collection_name=COLLECTION_NAME, + points=[ + models.PointStruct( + id=point_id, + payload={}, # Add any additional payload if necessary + vector={ + "text": models.SparseVector( + indices=indices.tolist(), values=values.tolist() + ) + }, + ) + ], +) +``` +By upserting points with sparse vectors, we prepare our dataset for rapid first-stage retrieval, laying the groundwork for subsequent detailed analysis using dense vectors. Notice that we use "text" to denote the name of the sparse vector. -2024-08-15T22:23:17+05:30 +Those familiar with the Qdrant API will notice that the extra care taken to be consistent with the existing named vectors API -- this is to make it easier to use sparse vectors in existing codebases. As always, you're able to **apply payload filters**, shard keys, and other advanced features you've come to expect from Qdrant. To make things easier for you, the indices and values don't have to be sorted before upsert. Qdrant will sort them when the index is persisted e.g. on disk. -... +### 4. Query with sparse vectors - +We use the same process to prepare a query vector as well. This involves computing the vector from a query text and extracting its indices and values. We then use these details to construct a query against our collection. - +```python +# Preparing a query vector -https://qdrant.tech/documentation/frameworks/rig-rs/ +query_text = "Who was Arthur Ashe?" +query_vec, query_tokens = compute_vector(query_text) +query_vec.shape -2024-11-07T08:04:53+05:30 +query_indices = query_vec.nonzero().numpy().flatten() +query_values = query_vec.detach().numpy()[query_indices] +``` -... +In this example, we use the same model for both document and query. This is not a requirement, but it's a simpler approach. - +### 5. Retrieve and interpret results - +After setting up the collection and inserting sparse vectors, the next critical step is retrieving and interpreting the results. This process involves executing a search query and then analyzing the returned results. -https://qdrant.tech/documentation/platforms/mulesoft/ +```python +# Searching for similar documents +result = client.search( + collection_name=COLLECTION_NAME, + query_vector=models.NamedSparseVector( + name="text", + vector=models.SparseVector( + indices=query_indices, + values=query_values, + ), + ), + with_vectors=True, +) -2025-01-10T21:16:11+05:30 +result +``` -... 
+In the above code, we execute a search against our collection using the prepared sparse vector query. The `client.search` method takes the collection name and the query vector as inputs. The query vector is constructed using the `models.NamedSparseVector`, which includes the indices and values derived from the query text. This is a crucial step in efficiently retrieving relevant documents. - +```python +ScoredPoint( + id=1, + version=0, + score=3.4292831420898438, + payload={}, + vector={ + "text": SparseVector( + indices=[2001, 2002, 2010, 2018, 2032, ...], + values=[ + 1.0660614967346191, + 1.391068458557129, + 0.8903818726539612, + 0.2502821087837219, + ..., + ], + ) + }, +) +``` - +The result, as shown above, is a `ScoredPoint` object containing the ID of the retrieved document, its version, a similarity score, and the sparse vector. The score is a key element as it quantifies the similarity between the query and the document, based on their respective vectors. -https://qdrant.tech/documentation/frameworks/semantic-router/ +To understand how this scoring works, we use the familiar dot product method: -2024-08-15T08:50:37+05:30 +$$\text{Similarity}(\text{Query}, \text{Document}) = \sum_{i \in I} \text{Query}_i \times \text{Document}_i$$ -... +This formula calculates the similarity score by multiplying corresponding elements of the query and document vectors and summing these products. This method is particularly effective with sparse vectors, where many elements are zero, leading to a computationally efficient process. The higher the score, the greater the similarity between the query and the document, making it a valuable metric for assessing the relevance of the retrieved documents. - - +## Hybrid search: combining sparse and dense vectors -https://qdrant.tech/documentation/frameworks/smolagents/ +By combining search results from both dense and sparse vectors, you can achieve a hybrid search that is both efficient and accurate. +Results from sparse vectors will guarantee, that all results with the required keywords are returned, +while dense vectors will cover the semantically similar results. -2025-01-04T22:43:37+05:30 +The mixture of dense and sparse results can be presented directly to the user, or used as a first stage of a two-stage retrieval process. -... +Let's see how you can make a hybrid search query in Qdrant. - +First, you need to create a collection with both dense and sparse vectors: - +```python +client.create_collection( + collection_name=COLLECTION_NAME, + vectors_config={ + "text-dense": models.VectorParams( + size=1536, # OpenAI Embeddings + distance=models.Distance.COSINE, + ) + }, + sparse_vectors_config={ + "text-sparse": models.SparseVectorParams( + index=models.SparseIndexParams( + on_disk=False, + ) + ) + }, +) +``` -https://qdrant.tech/documentation/embeddings/snowflake/ -2024-11-28T08:54:13+05:30 +Then, assuming you have upserted both dense and sparse vectors, you can query them together: -... +```python +query_text = "Who was Arthur Ashe?" 
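+# Note: compute_sparse_vector() and compute_dense_vector() are assumed helper
+# functions rather than part of the Qdrant client: for example, the SPLADE-based
+# compute_vector() shown earlier can produce the sparse part, while any dense
+# embedding model that matches the 1536-dimensional "text-dense" config can
+# produce the dense part.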
- +# Compute sparse and dense vectors +query_indices, query_values = compute_sparse_vector(query_text) +query_dense_vector = compute_dense_vector(query_text) - -https://qdrant.tech/documentation/frameworks/solon/ +client.search_batch( + collection_name=COLLECTION_NAME, + requests=[ + models.SearchRequest( + vector=models.NamedVector( + name="text-dense", + vector=query_dense_vector, + ), + limit=10, + ), + models.SearchRequest( + vector=models.NamedSparseVector( + name="text-sparse", + vector=models.SparseVector( + indices=query_indices, + values=query_values, + ), + ), + limit=10, + ), + ], +) +``` -2025-04-15T18:20:05+05:30 +The result will be a pair of result lists, one for dense and one for sparse vectors. -... +Having those results, there are several ways to combine them: - +### Mixing or fusion - +You can mix the results from both dense and sparse vectors, based purely on their relative scores. This is a simple and effective approach, but it doesn't take into account the semantic similarity between the results. Among the [popular mixing methods](https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18) are: -https://qdrant.tech/documentation/frameworks/spring-ai/ + - Reciprocal Ranked Fusion (RRF) + - Relative Score Fusion (RSF) + - Distribution-Based Score Fusion (DBSF) -2024-08-29T19:19:43+05:30 +{{< figure src=/articles_data/sparse-vectors/mixture.png caption="Relative Score Fusion" width=80% >}} -... +[Ranx](https://github.com/AmenRa/ranx) is a great library for mixing results from different sources. - - +### Re-ranking -https://qdrant.tech/documentation/frameworks/dspy/ +You can use obtained results as a first stage of a two-stage retrieval process. In the second stage, you can re-rank the results from the first stage using a more complex model, such as [Cross-Encoders](https://www.sbert.net/examples/applications/cross-encoder/README.html) or services like [Cohere Rerank](https://txt.cohere.com/rerank/). -2025-06-16T17:32:35+03:00 +And that's it! You've successfully achieved hybrid search with Qdrant! -... +## Additional resources +For those who want to dive deeper, here are the top papers on the topic most of which have code available: - +1. Problem Motivation: [Sparse Overcomplete Word Vector Representations](https://ar5iv.org/abs/1506.02004?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) +1. [SPLADE v2: Sparse Lexical and Expansion Model for Information Retrieval](https://ar5iv.org/abs/2109.10086?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) +1. [SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking](https://ar5iv.org/abs/2107.05720?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) +1. Late Interaction - [ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction](https://ar5iv.org/abs/2112.01488?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) +1. 
[SparseEmbed: Learning Sparse Lexical Representations with Contextual Embeddings for Retrieval](https://research.google/pubs/pub52289/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) - +**Why just read when you can try it out?** -https://qdrant.tech/subscribe-confirmation/ +We've packed an easy-to-use Colab for you on how to make a Sparse Vector: [Sparse Vectors Single Encoder Demo](https://colab.research.google.com/drive/1wa2Yr5BCOgV0MTOFFTude99BOXCLHXky?usp=sharing). Run it, tinker with it, and start seeing the magic unfold in your projects. We can't wait to hear how you use it! -2023-12-26T11:53:00+00:00 +## Conclusion -... +Alright, folks, let's wrap it up. Better search isn't a 'nice-to-have,' it's a game-changer, and Qdrant can get you there. - +Got questions? Our [Discord community](https://qdrant.to/discord?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) is teeming with answers. - +If you enjoyed reading this, why not sign up for our [newsletter](/subscribe/?utm_source=qdrant&utm_medium=website&utm_campaign=sparse-vectors&utm_content=article&utm_term=sparse-vectors) to stay ahead of the curve. -https://qdrant.tech/subscribe/ +And, of course, a big thanks to you, our readers, for pushing us to make ranking better for everyone. -2025-02-04T13:55:26+01:00 +<|page-42-lllmstxt|> +Please welcome the long-awaited [Qdrant 1.7.0 release](https://github.com/qdrant/qdrant/releases/tag/v1.7.0). Except for a handful of minor fixes and improvements, this release brings some cool brand-new features that we are excited to share! +The latest version of your favorite vector search engine finally supports **sparse vectors**. That's the feature many of you requested, so why should we ignore it? +We also decided to continue our journey with [vector similarity beyond search](/articles/vector-similarity-beyond-search/). The new Discovery API covers some utterly new use cases. We're more than excited to see what you will build with it! +But there is more to it! Check out what's new in **Qdrant 1.7.0**! -... +1. Sparse vectors: do you want to use keyword-based search? Support for sparse vectors is finally here! +2. Discovery API: an entirely new way of using vectors for restricted search and exploration. +3. User-defined sharding: you can now decide which points should be stored on which shard. +4. Snapshot-based shard transfer: a new option for moving shards between nodes. - +Do you see something missing? Your feedback drives the development of Qdrant, so do not hesitate to [join our Discord community](https://qdrant.to/discord) and help us build the best vector search engine out there! - +## New features -https://qdrant.tech/documentation/frameworks/superduper/ +Qdrant 1.7.0 brings a bunch of new features. Let's take a closer look at them! -2024-11-27T17:46:12+05:30 +### Sparse vectors -... +Traditional keyword-based search mechanisms often rely on algorithms like TF-IDF, BM25, or comparable methods. While these techniques internally utilize vectors, they typically involve sparse vector representations. In these methods, the **vectors are predominantly filled with zeros, containing a relatively small number of non-zero values**. +Those sparse vectors are theoretically high dimensional, definitely way higher than the dense vectors used in semantic search. However, since the majority of dimensions are usually zeros, we store them differently and just keep the non-zero dimensions. 
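+As a rough illustration (plain Python, toy numbers), keeping only the non-zero dimensions means storing two short parallel lists instead of one huge, mostly-zero array:
+```python
+# A "theoretically" high-dimensional vector where almost every entry is zero...
+dense_form = [0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.4, 0.0]
+
+# ...is stored as just the positions and the values of its non-zero entries.
+indices = [i for i, v in enumerate(dense_form) if v != 0.0]  # [2, 6]
+values = [v for v in dense_form if v != 0.0]                 # [1.2, 0.4]
+```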
- +Until now, Qdrant has not been able to handle sparse vectors natively. Some were trying to convert them to dense vectors, but that was not the best solution or a suggested way. We even wrote a piece with [our thoughts on building a hybrid search](/articles/hybrid-search/), and we encouraged you to use a different tool for keyword lookup. - +Things have changed since then, as so many of you wanted a single tool for sparse and dense vectors. And responding to this [popular](https://github.com/qdrant/qdrant/issues/1678) [demand](https://github.com/qdrant/qdrant/issues/1135), we've now introduced sparse vectors! -https://qdrant.tech/documentation/frameworks/sycamore/ +If you're coming across the topic of sparse vectors for the first time, our [Brief History of Search](/documentation/overview/vector-search/) explains the difference between sparse and dense vectors. -2024-10-17T11:40:28+05:30 +Check out the [sparse vectors article](/articles/sparse-vectors/) and [sparse vectors index docs](/documentation/concepts/indexing/#sparse-vector-index) for more details on what this new index means for Qdrant users. -... +### Discovery API - +The recently launched [Discovery API](/documentation/concepts/explore/#discovery-api) extends the range of scenarios for leveraging vectors. While its interface mirrors the [Recommendation API](/documentation/concepts/explore/#recommendation-api), it focuses on refining the search parameters for greater precision. +The concept of 'context' refers to a collection of positive-negative pairs that define zones within a space. Each pair effectively divides the space into positive or negative segments. This concept guides the search operation to prioritize points based on their inclusion within positive zones or their avoidance of negative zones. Essentially, the search algorithm favors points that fall within multiple positive zones or steer clear of negative ones. - +The Discovery API can be used in two ways - either with or without the target point. The first case is called a **discovery search**, while the second is called a **context search**. -https://qdrant.tech/legal/terms\_and\_conditions/ +#### Discovery search -2021-12-10T10:29:52+01:00 +*Discovery search* is an operation that uses a target point to find the most relevant points in the collection, while performing the search in the preferred areas only. That is basically a search operation with more control over the search space. -... +![Discovery search visualization](/articles_data/qdrant-1.7.x/discovery-search.png) - +Please refer to the [Discovery API documentation on discovery search](/documentation/concepts/explore/#discovery-search) for more details and the internal mechanics of the operation. - +#### Context search -https://qdrant.tech/documentation/cloud-tools/terraform/ +The mode of *context search* is similar to the discovery search, but it does not use a target point. Instead, the `context` is used to navigate the [HNSW graph](https://arxiv.org/abs/1603.09320) towards preferred zones. It is expected that the results in that mode will be diverse, and not centered around one point. +*Context Search* could serve as a solution for individuals seeking a more exploratory approach to navigate the vector space. -2024-11-19T18:01:59-08:00 +![Context search visualization](/articles_data/qdrant-1.7.x/context-search.png) -... +### User-defined sharding - +Qdrant's collections are divided into shards. A single **shard** is a self-contained store of points, which can be moved between nodes. 
Up till now, the points were distributed among shards by using a consistent hashing algorithm, so that shards were managing non-intersecting subsets of points.
+The latter one remains true, but now you can define your own sharding and decide which points should be stored on which shard. Sounds cool, right? But why would you need that? Well, there are multiple scenarios in which you may want to use custom sharding. For example, you may want to store some points on a dedicated node, or you may want to store points from the same user on the same shard.
-
+While the existing behavior is still the default one, you can now define the shards when you create a collection. Then, you can assign each point to a shard by providing a `shard_key` in the `upsert` operation. What's more, you can also search over the selected shards only, by providing the `shard_key` parameter in the search operation.
-https://qdrant.tech/documentation/frameworks/testcontainers/
+```http
+POST /collections/my_collection/points/search
+{
+    "vector": [0.29, 0.81, 0.75, 0.11],
+    "shard_key": ["cats", "dogs"],
+    "limit": 10,
+    "with_payload": true
+}
+```
-2025-04-24T18:47:10+10:00
+If you want to know more about the user-defined sharding, please refer to the [sharding documentation](/documentation/guides/distributed_deployment/#sharding).
-...
+### Snapshot-based shard transfer
-
+This is a more in-depth technical improvement for users of the distributed mode: we implemented a new option for the shard transfer mechanism. The new approach is based on a snapshot of the shard, which is transferred to the target node.
-
+Moving shards is required for dynamic scaling of the cluster. Your data can migrate between nodes, and the way you move it is crucial for the performance of the whole system. The good old `stream_records` method (still the default one) transmits all the records between the machines and indexes them on the target node.
+In the case of moving the shard, it's necessary to recreate the HNSW index each time. However, with the introduction of the new `snapshot` approach, the snapshot itself, inclusive of all data and potentially quantized content, is transferred to the target node. This comprehensive snapshot includes the entire index, enabling the target node to seamlessly load it and promptly begin handling requests without the need for index recreation.
-https://qdrant.tech/documentation/platforms/tooljet/
+There are multiple scenarios in which you may prefer one over the other. Please check out the docs of the [shard transfer method](/documentation/guides/distributed_deployment/#shard-transfer-method) for more details and a head-to-head comparison. As of now, the old `stream_records` method is still the default one, but we may decide to change it in the future.
-2025-03-06T14:58:05+05:30
+## Minor improvements
-...
+Beyond introducing new features, Qdrant 1.7.0 enhances performance and addresses various minor issues. Here's a rundown of the key improvements:
-
+1. Improvement of HNSW Index Building on High CPU Systems ([PR#2869](https://github.com/qdrant/qdrant/pull/2869)).
-
+2. Improving [Search Tail Latencies](https://github.com/qdrant/qdrant/pull/2931): improvement for high CPU systems with many parallel searches, directly impacting the user experience by reducing latency.
-...
+3. 
[Adding Index for Geo Map Payloads](https://github.com/qdrant/qdrant/pull/2768): index for geo map payloads can significantly improve search performance, especially for applications involving geographical data. -2025-01-07T21:51:22+05:30 +4. Stability of Consensus on Big High Load Clusters: enhancing the stability of consensus in large, high-load environments is critical for ensuring the reliability and scalability of the system ([PR#3013](https://github.com/qdrant/qdrant/pull/3013), [PR#3026](https://github.com/qdrant/qdrant/pull/3026), [PR#2942](https://github.com/qdrant/qdrant/pull/2942), [PR#3103](https://github.com/qdrant/qdrant/pull/3103), [PR#3054](https://github.com/qdrant/qdrant/pull/3054)). -... +5. Configurable Timeout for Searches: allowing users to configure the timeout for searches provides greater flexibility and can help optimize system performance under different operational conditions ([PR#2748](https://github.com/qdrant/qdrant/pull/2748), [PR#2771](https://github.com/qdrant/qdrant/pull/2771)). - +## Release notes - +[Our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.7.0) are a place to go if you are interested in more details. Please remember that Qdrant is an open source project, so feel free to [contribute](https://github.com/qdrant/qdrant/issues)! -https://qdrant.tech/documentation/frameworks/txtai/ +<|page-43-lllmstxt|> +The most popular use case for vector search engines, such as Qdrant, is Semantic search with a single query vector. Given the +query, we can vectorize (embed) it and find the closest points in the index. But [Vector Similarity beyond Search](/articles/vector-similarity-beyond-search/) +does exist, and recommendation systems are a great example. Recommendations might be seen as a multi-aim search, where we want +to find items close to positive and far from negative examples. This use of vector databases has many applications, including +recommendation systems for e-commerce, content, or even dating apps. -2024-08-15T08:50:37+05:30 +Qdrant has provided the [Recommendation API](/documentation/concepts/search/#recommendation-api) for a while, and with the latest release, [Qdrant 1.6](https://github.com/qdrant/qdrant/releases/tag/v1.6.0), +we're glad to give you more flexibility and control over the Recommendation API. +Here, we'll discuss some internals and show how they may be used in practice. -... +### Recap of the old recommendations API - +The previous [Recommendation API](/documentation/concepts/search/#recommendation-api) in Qdrant came with some limitations. First of all, it was required to pass vector IDs for +both positive and negative example points. If you wanted to use vector embeddings directly, you had to either create a new point +in a collection or mimic the behaviour of the Recommendation API by using the [Search API](/documentation/concepts/search/#search-api). +Moreover, in the previous releases of Qdrant, you were always asked to provide at least one positive example. This requirement +was based on the algorithm used to combine multiple samples into a single query vector. It was a simple, yet effective approach. +However, if the only information you had was that your user dislikes some items, you couldn't use it directly. - +Qdrant 1.6 brings a more flexible API. You can now provide both IDs and vectors of positive and negative examples. You can even +combine them within a single request. 
That makes the new implementation backward compatible, so you can easily upgrade an existing +Qdrant instance without any changes in your code. And the default behaviour of the API is still the same as before. However, we +extended the API, so **you can now choose the strategy of how to find the recommended points**. -https://qdrant.tech/documentation/data-management/unstructured/ +```http +POST /collections/{collection_name}/points/recommend +{ + "positive": [100, 231], + "negative": [718, [0.2, 0.3, 0.4, 0.5]], + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "strategy": "average_vector", + "limit": 3 +} +``` -2025-02-18T21:01:07+05:30 +There are two key changes in the request. First of all, we can adjust the strategy of search and set it to `average_vector` (the +default) or `best_score`. Moreover, we can pass both IDs (`718`) and embeddings (`[0.2, 0.3, 0.4, 0.5]`) as both positive and +negative examples. -... +## HNSW ANN example and strategy - +Let’s start with an example to help you understand the [HNSW graph](/articles/filtrable-hnsw/). Assume you want +to travel to a small city on another continent: - +1. You start from your hometown and take a bus to the local airport. +2. Then, take a flight to one of the closest hubs. +3. From there, you have to take another flight to a hub on your destination continent. +4. Hopefully, one last flight to your destination city. +5. You still have one more leg on local transport to get to your final address. -https://qdrant.tech/documentation/embeddings/upstage/ +This journey is similar to the HNSW graph’s use in Qdrant's approximate nearest neighbours search. -2024-11-28T08:54:13+05:30 +![Transport network](/articles_data/new-recommendation-api/example-transport-network.png) -... +HNSW is a multilayer graph of vectors (embeddings), with connections based on vector proximity. The top layer has the least +points, and the distances between those points are the biggest. The deeper we go, the more points we have, and the distances +get closer. The graph is built in a way that the points are connected to their closest neighbours at every layer. - +All the points from a particular layer are also in the layer below, so switching the search layer while staying in the same +location is possible. In the case of transport networks, the top layer would be the airline hubs, well-connected but with big +distances between the airports. Local airports, along with railways and buses, with higher density and smaller distances, make +up the middle layers. Lastly, our bottom layer consists of local means of transport, which is the densest and has the smallest +distances between the points. - +You don’t have to check all the possible connections when you travel. You select an intercontinental flight, then a local one, +and finally a bus or a taxi. All the decisions are made based on the distance between the points. -https://qdrant.tech/documentation/frameworks/vanna-ai/ +The search process in HNSW is also based on similarly traversing the graph. Start from the entry point in the top layer, find +its closest point and then use that point as the entry point into the next densest layer. This process repeats until we reach +the bottom layer. Visited points and distances to the original query vector are kept in memory. If none of the neighbours of +the current point is better than the best match, we can stop the traversal, as this is a local minimum. We start at the biggest +scale, and then gradually zoom in. 
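+As a rough sketch of that procedure (illustrative pseudocode only, not Qdrant's actual implementation, and ignoring details such as the beam width `ef`), the traversal can be pictured like this:
+```python
+# Greedy descent over HNSW layers: start on the sparsest layer, repeatedly move
+# to a closer neighbour, then use the result as the entry point one layer down.
+def greedy_search(layers, entry_point, query, distance):
+    current = entry_point
+    for layer in layers:  # ordered from the top (sparsest) to the bottom (densest)
+        improved = True
+        while improved:
+            improved = False
+            for neighbor in layer.neighbors(current):
+                if distance(neighbor, query) < distance(current, query):
+                    current = neighbor  # move towards the query
+                    improved = True
+        # local minimum reached on this layer; descend to the next one
+    return current  # best match found on the bottom layer
+```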
-2024-08-15T08:50:37+05:30 +In this oversimplified example, we assumed that the distance between the points is the only factor that matters. In reality, we +might want to consider other criteria, such as the ticket price, or avoid some specific locations due to certain restrictions. +That means, there are various strategies for choosing the best match, which is also true in the case of vector recommendations. +We can use different approaches to determine the path of traversing the HNSW graph by changing how we calculate the score of a +candidate point during traversal. The default behaviour is based on pure distance, but Qdrant 1.6 exposes two strategies for the +recommendation API. -... +### Average vector - +The default strategy, called `average_vector` is the previous one, based on the average of positive and negative examples. It +simplifies the recommendations process and converts it into a single vector search. It supports both point IDs and vectors as +parameters. For example, you can get recommendations based on past interactions with existing points combined with query vector +embedding. Internally, that mechanism is based on the averages of positive and negative examples and was calculated with the +following formula: - +$$ +\text{average vector} = \text{avg}(\text{positive vectors}) + \left( \text{avg}(\text{positive vectors}) - \text{avg}(\text{negative vectors}) \right) +$$ -https://qdrant.tech/documentation/frameworks/mirror-security/ +The `average_vector` converts the problem of recommendations into a single vector search. -2025-02-21T09:20:59+05:30 +### The new hotness - Best score -... +The new strategy is called `best_score`. It does not rely on averages and is more flexible. It allows you to pass just negative +samples and uses a slightly more sophisticated algorithm under the hood. - +The best score is chosen at every step of HNSW graph traversal. We separately calculate the distance between a traversed point +and every positive and negative example. In the case of the best score strategy, **there is no single query vector anymore, but a +bunch of positive and negative queries**. As a result, for each sample in the query, we have a set of distances, one for each +sample. In the next step, we simply take the best scores for positives and negatives, creating two separate values. Best scores +are just the closest distances of a query to positives and negatives. The idea is: **if a point is closer to any negative than to +any positive example, we do not want it**. We penalize being close to the negatives, so instead of using the similarity value +directly, we check if it’s closer to positives or negatives. The following formula is used to calculate the score of a traversed +potential point: - +```rust +// Sigmoid function to normalize the score between 0 and 1 +let sigmoid = |x| 0.5 * (1.0 + (x / (1.0 + x.abs()))); -https://qdrant.tech/benchmarks/ +let score = if best_positive_score > best_negative_score { + sigmoid(best_positive_score) +} else { + -sigmoid(best_negative_score) +}; +``` -2023-02-16T18:40:22+04:00 +If the point is closer to the negatives, we penalize it by taking the negative squared value of the best negative score. For a +closer negative, the score of the candidate point will always be lower or equal to zero, making the chances of choosing that point +significantly lower. However, if the best negative score is higher than the best positive score, we still prefer those that are +further away from the negatives. 
That procedure effectively **pulls the traversal procedure away from the negative examples**. -... +If you want to know more about the internals of HNSW, you can check out the article about the +[Filtrable HNSW](/articles/filtrable-hnsw/) that covers the topic thoroughly. - +## Food Discovery demo - +Our [Food Discovery demo](/articles/food-discovery-demo/) is an application built on top of the new [Recommendation API](/documentation/concepts/search/#recommendation-api). +It allows you to find a meal based on liked and disliked photos. There are some updates, enabled by the new Qdrant release: -https://qdrant.tech/use-cases/ +* **Ability to include multiple textual queries in the recommendation request.** Previously, we only allowed passing a single + query to solve the cold start problem. Right now, you can pass multiple queries and mix them with the liked/disliked photos. + This became possible because of the new flexibility in parameters. We can pass both point IDs and embedding vectors in the same + request, and user queries are obviously not a part of the collection. +* **Switch between the recommendation strategies.** You can now choose between the `average_vector` and the `best_score` scoring + algorithm. -2024-09-04T08:01:21-07:00 +### Differences between the strategies -... +The UI of the Food Discovery demo allows you to switch between the strategies. The `best_vector` is the default one, but with just +a single switch, you can see how the results differ when using the previous `average_vector` strategy. - +If you select just a single positive example, both algorithms work identically. - +##### One positive example -https://qdrant.tech/documentation/platforms/vectorize/ + -2025-02-05T06:14:34-05:00 +The difference only becomes apparent when you start adding more examples, especially if you choose some negatives. -... +##### One positive and one negative example - + - +The more likes and dislikes we add, the more diverse the results of the `best_score` strategy will be. In the old strategy, there +is just a single vector, so all the examples are similar to it. The new one takes into account all the examples separately, making +the variety richer. -https://qdrant.tech/documentation/embeddings/voyage/ +##### Multiple positive and negative examples -2024-11-28T08:54:13+05:30 + -... +Choosing the right strategy is dataset-dependent, and the embeddings play a significant role here. Thus, it’s always worth trying +both of them and comparing the results in a particular case. - +#### Handling the negatives only - +In the case of our Food Discovery demo, passing just the negative images can work as an outlier detection mechanism. While the dataset +was supposed to contain only food photos, this is not actually true. A simple way to find these outliers is to pass in food item photos +as negatives, leading to the results being the most "unlike" food images. In our case you will see pill bottles and books. -https://qdrant.tech/documentation/cloud-intro/ +**The `average_vector` strategy still requires providing at least one positive example!** However, since cosine distance is set up +for the collection used in the demo, we faked it using [a trick described in the previous article](/articles/food-discovery-demo/#negative-feedback-only). +In a nutshell, if you only pass negative examples, their vectors will be averaged, and the negated resulting vector will be used as +a query to the search endpoint. -2025-05-02T16:53:21+02:00 +##### Negatives only -... 
+ - +Still, both methods return different results, so they each have their place depending on the questions being asked and the datasets +being used. -... +#### Challenges with multimodality - +Food Discovery uses the [CLIP embeddings model](https://huggingface.co/sentence-transformers/clip-ViT-B-32), which is multimodal, +allowing both images and texts encoded into the same vector space. Using this model allows for image queries, text queries, or both of +them combined. We utilized that mechanism in the updated demo, allowing you to pass the textual queries to filter the results further. -<|page-32-lllmstxt|> -## cloud-tools -- [Documentation](https://qdrant.tech/documentation/) -- Infrastructure Tools +##### A single text query -## [Anchor](https://qdrant.tech/documentation/cloud-tools/\#cloud-tools) Cloud Tools + -| Integration | Description | -| --- | --- | -| [Pulumi](https://qdrant.tech/documentation/cloud-tools/pulumi/) | Infrastructure as code tool for creating, deploying, and managing cloud infrastructure | -| [Terraform](https://qdrant.tech/documentation/cloud-tools/terraform/) | infrastructure as code tool to define resources in human-readable configuration files. | +Text queries might be mixed with the liked and disliked photos, so you can combine them in a single request. However, you might be +surprised by the results achieved with the new strategy, if you start adding the negative examples. -##### Was this page useful? +##### A single text query with negative example -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + -Thank you for your feedback! 🙏 +This is an issue related to the embeddings themselves. Our dataset contains a bunch of image embeddings that are pretty close to each +other. On the other hand, our text queries are quite far from most of the image embeddings, but relatively close to some of them, so the +text-to-image search seems to work well. When all query items come from the same domain, such as only text, everything works fine. +However, if we mix positive text and negative image embeddings, the results of the `best_score` are overwhelmed by the negative samples, +which are simply closer to the dataset embeddings. If you experience such a problem, the `average_vector` strategy might be a better +choice. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-tools/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Check out the demo -On this page: +The [Food Discovery Demo](https://food-discovery.qdrant.tech/) is available online, so you can test and see the difference. +This is an open source project, so you can easily deploy it on your own. The source code is available in the [GitHub repository +](https://github.com/qdrant/demo-food-discovery/) and the [README](https://github.com/qdrant/demo-food-discovery/blob/main/README.md) describes the process of setting it up. +Since calculating the embeddings takes a while, we precomputed them and exported them as a [snapshot](https://storage.googleapis.com/common-datasets-snapshots/wolt-clip-ViT-B-32.snapshot), +which might be easily imported into any Qdrant instance. [Qdrant Cloud is the easiest way to start](https://cloud.qdrant.io/), though! 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-tools/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +<|page-44-lllmstxt|> +Ever since the data science community discovered that vector search significantly improves LLM answers, +various vendors and enthusiasts have been arguing over the proper solutions to store embeddings. -× +Some say storing them in a specialized engine (aka vector database) is better. Others say that it's enough to use plugins for existing databases. -[Powered by](https://qdrant.tech/) +Here are [just](https://nextword.substack.com/p/vector-database-is-not-a-separate) a [few](https://stackoverflow.blog/2023/09/20/do-you-need-a-specialized-vector-database-to-implement-vector-search-well/) of [them](https://www.singlestore.com/blog/why-your-vector-database-should-not-be-a-vector-database/). -<|page-33-lllmstxt|> -## common-errors -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Troubleshooting -# [Anchor](https://qdrant.tech/documentation/guides/common-errors/\#solving-common-errors) Solving common errors +This article presents our vision and arguments on the topic . +We will: -## [Anchor](https://qdrant.tech/documentation/guides/common-errors/\#too-many-files-open-os-error-24) Too many files open (OS error 24) +1. Explain why and when you actually need a dedicated vector solution +2. Debunk some ungrounded claims and anti-patterns to be avoided when building a vector search system. -Each collection segment needs some files to be open. At some point you may encounter the following errors in your server log: +A table of contents: -```text -Error: Too many files open (OS error 24) +* *Each database vendor will sooner or later introduce vector capabilities...* [[click](#each-database-vendor-will-sooner-or-later-introduce-vector-capabilities-that-will-make-every-database-a-vector-database)] +* *Having a dedicated vector database requires duplication of data.* [[click](#having-a-dedicated-vector-database-requires-duplication-of-data)] +* *Having a dedicated vector database requires complex data synchronization.* [[click](#having-a-dedicated-vector-database-requires-complex-data-synchronization)] +* *You have to pay for a vector service uptime and data transfer.* [[click](#you-have-to-pay-for-a-vector-service-uptime-and-data-transfer-of-both-solutions)] +* *What is more seamless than your current database adding vector search capability?* [[click](#what-is-more-seamless-than-your-current-database-adding-vector-search-capability)] +* *Databases can support RAG use-case end-to-end.* [[click](#databases-can-support-rag-use-case-end-to-end)] -``` -In such a case you may need to increase the limit of the open files. It might be done, for example, while you launch the Docker container: +## Responding to claims -```bash -docker run --ulimit nofile=10000:10000 qdrant/qdrant:latest +###### Each database vendor will sooner or later introduce vector capabilities. That will make every database a Vector Database. -``` +The origins of this misconception lie in the careless use of the term Vector *Database*. +When we think of a *database*, we subconsciously envision a relational database like Postgres or MySQL. +Or, more scientifically, a service built on ACID principles that provides transactions, strong consistency guarantees, and atomicity. -The command above will set both soft and hard limits to `10000`. 
+The majority of Vector Databases are not *databases* in this sense.
+It is more accurate to call them *search engines*, but unfortunately, the marketing term *vector database* has already stuck, and it is unlikely to change.
-If you are not using Docker, the following command will change the limit for the current user session:
-```bash
-ulimit -n 10000
+*What makes search engines different, and why are vector DBs built as search engines?*
-```
+First of all, search engines assume different patterns of workloads and prioritize different properties of the system. The core architecture of such solutions is built around those priorities.
-Please note, the command should be executed before you run Qdrant server.
+What types of properties do search engines prioritize?
-## [Anchor](https://qdrant.tech/documentation/guides/common-errors/\#cant-open-collections-meta-wal) Can’t open Collections meta Wal
+* **Scalability**. Search engines are built to handle large amounts of data and queries. They are designed to be horizontally scalable and operate with more data than can fit into a single machine.
+* **Search speed**. Search engines should guarantee low latency for queries, while the atomicity of updates is less important.
+* **Availability**. Search engines must stay available if the majority of the nodes in a cluster are down. At the same time, they can tolerate the eventual consistency of updates.
-When starting a Qdrant instance as part of a distributed deployment, you may
-come across an error message similar to this:
+{{< figure src=/articles_data/dedicated-service/compass.png caption="Database guarantees compass" width=80% >}}
-```bash
-Can't open Collections meta Wal: Os { code: 11, kind: WouldBlock, message: "Resource temporarily unavailable" }
-```
+Those priorities lead to different architectural decisions that are not reproducible in a general-purpose database, even if it has vector index support.
-It means that Qdrant cannot start because a collection cannot be loaded. Its
-associated [WAL](https://qdrant.tech/documentation/concepts/storage/#versioning) files are currently
-unavailable, likely because the same files are already being used by another
-Qdrant instance.
-Each node must have their own separate storage directory, volume or mount.
+###### Having a dedicated vector database requires duplication of data.
-The formed cluster will take care of sharing all data with each node, putting it
-all in the correct places for you. If using Kubernetes, each node must have
-their own volume. If using Docker, each node must have their own storage mount
-or volume. If using Qdrant directly, each node must have their own storage
-directory.
+By their very nature, vector embeddings are derivatives of the primary source data.
-## [Anchor](https://qdrant.tech/documentation/guides/common-errors/\#using-python-grpc-client-with-multiprocessing) Using python gRPC client with `multiprocessing`
+In the vast majority of cases, embeddings are derived from some other data, such as text, images, or additional information stored in your system. So, in fact, all embeddings you have in your system can be considered transformations of some original source.
-When using the Python gRPC client with `multiprocessing`, you may encounter an error like this:
+And the distinguishing feature of derivative data is that it will change when the transformation pipeline changes.
+In the case of vector embeddings, the scenario of those changes is quite simple: every time you update the encoder model, all the embeddings will change. -```text -<_InactiveRpcError of RPC that terminated with: - status = StatusCode.UNAVAILABLE - details = "sendmsg: Socket operation on non-socket (88)" - debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"sendmsg: Socket operation on non-socket (88)", grpc_status:14, created_time:"....."}" +In systems where vector embeddings are fused with the primary data source, it is impossible to perform such migrations without significantly affecting the production system. -``` +As a result, even if you want to use a single database for storing all kinds of data, you would still need to duplicate data internally. -This error happens, because `multiprocessing` creates copies of gRPC channels, which share the same socket. When the parent process closes the channel, it closes the socket, and the child processes try to use a closed socket. +###### Having a dedicated vector database requires complex data synchronization. -To prevent this error, you can use the `forkserver` or `spawn` start methods for `multiprocessing`. +Most production systems prefer to isolate different types of workloads into separate services. +In many cases, those isolated services are not even related to search use cases. -```python -import multiprocessing +For example, databases for analytics and one for serving can be updated from the same source. +Yet they can store and organize the data in a way that is optimal for their typical workloads. -multiprocessing.set_start_method("forkserver") # or "spawn" +Search engines are usually isolated for the same reason: you want to avoid creating a noisy neighbor problem and compromise the performance of your main database. -``` +*To give you some intuition, let's consider a practical example:* -Alternatively, you can switch to `REST` API, async client, or use built-in parallelization in the Python client - functions like `qdrant.upload_points(...)` +Assume we have a database with 1 million records. +This is a small database by modern standards of any relational database. +You can probably use the smallest free tier of any cloud provider to host it. -##### Was this page useful? +But if we want to use this database for vector search, 1 million OpenAI `text-embedding-ada-002` embeddings will take **~6GB of RAM** (sic!). +As you can see, the vector search use case completely overwhelmed the main database resource requirements. +In practice, this means that your main database becomes burdened with high memory requirements and can not scale efficiently, limited by the size of a single machine. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Fortunately, the data synchronization problem is not new and definitely not unique to vector search. +There are many well-known solutions, starting with message queues and ending with specialized ETL tools. -Thank you for your feedback! 🙏 +For example, we recently released our [integration with Airbyte](/documentation/integrations/airbyte/), allowing you to synchronize data from various sources into Qdrant incrementally. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/common-errors.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
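+Before moving on, it may help to spell out the arithmetic behind the ~6GB estimate above. A rough sketch, assuming plain `float32` vectors and ignoring index and metadata overhead:
+
+```python
+vectors = 1_000_000   # records in the example database
+dims = 1536           # OpenAI text-embedding-ada-002 dimensionality
+bytes_per_dim = 4     # float32
+
+raw_bytes = vectors * dims * bytes_per_dim
+print(f"{raw_bytes / 10**9:.2f} GB")  # ~6.14 GB of raw vector data alone
+```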
+###### You have to pay for a vector service uptime and data transfer of both solutions. -On this page: +In the open-source world, you pay for the resources you use, not the number of different databases you run. +Resources depend more on the optimal solution for each use case. +As a result, running a dedicated vector search engine can be even cheaper, as it allows optimization specifically for vector search use cases. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/common-errors.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +For instance, Qdrant implements a number of [quantization techniques](/documentation/guides/quantization/) that can significantly reduce the memory footprint of embeddings. -× +In terms of data transfer costs, on most cloud providers, network use within a region is usually free. As long as you put the original source data and the vector store in the same region, there are no added data transfer costs. -[Powered by](https://qdrant.tech/) +###### What is more seamless than your current database adding vector search capability? -<|page-34-lllmstxt|> -## qdrant-1.8.x -- [Articles](https://qdrant.tech/articles/) -- Qdrant 1.8.0: Enhanced Search Capabilities for Better Results +In contrast to the short-term attractiveness of integrated solutions, dedicated search engines propose flexibility and a modular approach. +You don't need to update the whole production database each time some of the vector plugins are updated. +Maintenance of a dedicated search engine is as isolated from the main database as the data itself. -[Back to Qdrant Articles](https://qdrant.tech/articles/) +In fact, integration of more complex scenarios, such as read/write segregation, is much easier with a dedicated vector solution. +You can easily build cross-region replication to ensure low latency for your users. -# Qdrant 1.8.0: Enhanced Search Capabilities for Better Results +{{< figure src=/articles_data/dedicated-service/region-based-deploy.png caption="Read/Write segregation + cross-regional deployment" width=80% >}} -David Myriel, Mike Jang +It is especially important in large enterprise organizations, where the responsibility for different parts of the system is distributed among different teams. +In those situations, it is much easier to maintain a dedicated search engine for the AI team than to convince the core team to update the whole primary database. -· +Finally, the vector capabilities of the all-in-one database are tied to the development and release cycle of the entire stack. +Their long history of use also means that they need to pay a high price for backward compatibility. -March 06, 2024 +###### Databases can support RAG use-case end-to-end. -![Qdrant 1.8.0: Enhanced Search Capabilities for Better Results](https://qdrant.tech/articles_data/qdrant-1.8.x/preview/title.jpg) +Putting aside performance and scalability questions, the whole discussion about implementing RAG in the DBs assumes that the only detail missing in traditional databases is the vector index and the ability to make fast ANN queries. -# [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#unlocking-next-level-search-exploring-qdrant-180s-advanced-search-capabilities) Unlocking Next-Level Search: Exploring Qdrant 1.8.0’s Advanced Search Capabilities +In fact, the current capabilities of vector search have only scratched the surface of what is possible. 
+For example, in our recent article, we discuss the possibility of building an [exploration API](/articles/vector-similarity-beyond-search/) to fuel the discovery process - an alternative to kNN search, where you don’t even know what exactly you are looking for. -[Qdrant 1.8.0 is out!](https://github.com/qdrant/qdrant/releases/tag/v1.8.0). -This time around, we have focused on Qdrant’s internals. Our goal was to optimize performance so that your existing setup can run faster and save on compute. Here is what we’ve been up to: +## Summary +Ultimately, you do not need a vector database if you are looking for a simple vector search functionality with a small amount of data. We genuinely recommend starting with whatever you already have in your stack to prototype. But you need one if you are looking to do more out of it, and it is the central functionality of your application. It is just like using a multi-tool to make something quick or using a dedicated instrument highly optimized for the use case. -- **Faster [sparse vectors](https://qdrant.tech/articles/sparse-vectors/):** [Hybrid search](https://qdrant.tech/articles/hybrid-search/) is up to 16x faster now! -- **CPU resource management:** You can allocate CPU threads for faster indexing. -- **Better indexing performance:** We optimized text [indexing](https://qdrant.tech/documentation/concepts/indexing/) on the backend. +Large-scale production systems usually consist of different specialized services and storage types for good reasons since it is one of the best practices of modern software architecture. Comparable to the orchestration of independent building blocks in a microservice architecture. -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#faster-search-with-sparse-vectors) Faster search with sparse vectors +When you stuff the database with a vector index, you compromise both the performance and scalability of the main database and the vector search capabilities. +There is no one-size-fits-all approach that would not compromise on performance or flexibility. +So if your use case utilizes vector search in any significant way, it is worth investing in a dedicated vector search engine, aka vector database. -Search throughput is now up to 16 times faster for sparse vectors. If you are [using Qdrant for hybrid search](https://qdrant.tech/articles/sparse-vectors/), this means that you can now handle up to sixteen times as many queries. This improvement comes from extensive backend optimizations aimed at increasing efficiency and capacity. +<|page-45-lllmstxt|> +Data Science and Machine Learning practitioners often find themselves navigating through a labyrinth of models, libraries, and frameworks. Which model to choose, what embedding size, and how to approach tokenizing, are just some questions you are faced with when starting your work. We understood how many data scientists wanted an easier and more intuitive means to do their embedding work. This is why we built FastEmbed, a Python library engineered for speed, efficiency, and usability. We have created easy to use default workflows, handling the 80% use cases in NLP embedding. -What this means for your setup: +## Current State of Affairs for Generating Embeddings -- **Query speed:** The time it takes to run a search query has been significantly reduced. -- **Search capacity:** Qdrant can now handle a much larger volume of search requests. -- **User experience:** Results will appear faster, leading to a smoother experience for the user. 
-- **Scalability:** You can easily accommodate rapidly growing users or an expanding dataset. +Usually you make embedding by utilizing PyTorch or TensorFlow models under the hood. However, using these libraries comes at a cost in terms of ease of use and computational speed. This is at least in part because these are built for both: model inference and improvement e.g. via fine-tuning. -### [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#sparse-vectors-benchmark) Sparse vectors benchmark +To tackle these problems we built a small library focused on the task of quickly and efficiently creating text embeddings. We also decided to start with only a small sample of best in class transformer models. By keeping it small and focused on a particular use case, we could make our library focused without all the extraneous dependencies. We ship with limited models, quantize the model weights and seamlessly integrate them with the ONNX Runtime. FastEmbed strikes a balance between inference time, resource utilization and performance (recall/accuracy). -Performance results are publicly available for you to test. Qdrant’s R&D developed a dedicated [open-source benchmarking tool](https://github.com/qdrant/sparse-vectors-benchmark) just to test sparse vector performance. +## Quick Embedding Text Document Example -A real-life simulation of sparse vector queries was run against the [NeurIPS 2023 dataset](https://big-ann-benchmarks.com/neurips23.html). All tests were done on an 8 CPU machine on Azure. +Here is an example of how simple we have made embedding text documents: -Latency (y-axis) has dropped significantly for queries. You can see the before/after here: +```python +documents: List[str] = [ + "Hello, World!", + "fastembed is supported by and maintained by Qdrant." +]  +embedding_model = DefaultEmbedding()  +embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) +``` -![dropping latency](https://qdrant.tech/articles_data/qdrant-1.8.x/benchmark.png)**Figure 1:** Dropping latency in sparse vector search queries across versions 1.7-1.8. +These 3 lines of code do a lot of heavy lifting for you: They download the quantized model, load it using ONNXRuntime, and then run a batched embedding creation of your documents. -The colors within both scatter plots show the frequency of results. The red dots show that the highest concentration is around 2200ms (before) and 135ms (after). This tells us that latency for sparse vector queries dropped by about a factor of 16. Therefore, the time it takes to retrieve an answer with Qdrant is that much shorter. +### Code Walkthrough -This performance increase can have a dramatic effect on hybrid search implementations. [Read more about how to set this up.](https://qdrant.tech/articles/sparse-vectors/) +Let’s delve into a more advanced example code snippet line-by-line: -FYI, sparse vectors were released in [Qdrant v.1.7.0](https://qdrant.tech/articles/qdrant-1.7.x/#sparse-vectors). They are stored using a different index, so first [check out the documentation](https://qdrant.tech/documentation/concepts/indexing/#sparse-vector-index) if you want to try an implementation. +```python +from fastembed.embedding import DefaultEmbedding +``` -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#cpu-resource-management) CPU resource management +Here, we import the FlagEmbedding class from FastEmbed and alias it as Embedding. This is the core class responsible for generating embeddings based on your chosen text model. 
This is also the class which you can import directly as DefaultEmbedding which is [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5) -Indexing is Qdrant’s most resource-intensive process. Now you can account for this by allocating compute use specifically to indexing. You can assign a number CPU resources towards indexing and leave the rest for search. As a result, indexes will build faster, and search quality will remain unaffected. +```python +documents: List[str] = [ + "passage: Hello, World!", + "query: How is the World?", + "passage: This is an example passage.", + "fastembed is supported by and maintained by Qdrant." +] +``` -This isn’t mandatory, as Qdrant is by default tuned to strike the right balance between indexing and search. However, if you wish to define specific CPU usage, you will need to do so from `config.yaml`. +In this list called documents, we define four text strings that we want to convert into embeddings. -This version introduces a `optimizer_cpu_budget` parameter to control the maximum number of CPUs used for indexing. +Note the use of prefixes “passage” and “query” to differentiate the types of embeddings to be generated. This is inherited from the cross-encoder implementation of the BAAI/bge series of models themselves. This is particularly useful for retrieval and we strongly recommend using this as well. -> Read more about `config.yaml` in the [configuration file](https://qdrant.tech/documentation/guides/configuration/). +The use of text prefixes like “query” and “passage” isn’t merely syntactic sugar; it informs the algorithm on how to treat the text for embedding generation. A “query” prefix often triggers the model to generate embeddings that are optimized for similarity comparisons, while “passage” embeddings are fine-tuned for contextual understanding. If you omit the prefix, the default behavior is applied, although specifying it is recommended for more nuanced results. -```yaml -# CPU budget, how many CPUs (threads) to allocate for an optimization job. -optimizer_cpu_budget: 0 +Next, we initialize the Embedding model with the default model: [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5). +```python +embedding_model = DefaultEmbedding() ``` -- If left at 0, Qdrant will keep 1 or more CPUs unallocated - depending on CPU size. -- If the setting is positive, Qdrant will use this exact number of CPUs for indexing. -- If the setting is negative, Qdrant will subtract this number of CPUs from the available CPUs for indexing. - -For most users, the default `optimizer_cpu_budget` setting will work well. We only recommend you use this if your indexing load is significant. +The default model and several other models have a context window of a maximum of 512 tokens. This maximum limit comes from the embedding model training and design itself. If you'd like to embed sequences larger than that, we'd recommend using some pooling strategy to get a single vector out of the sequence. For example, you can use the mean of the embeddings of different chunks of a document. This is also what the [SBERT Paper recommends](https://lilianweng.github.io/posts/2021-05-31-contrastive/#sentence-bert) -Our backend leverages dynamic CPU saturation to increase indexing speed. For that reason, the impact on search query performance ends up being minimal. Ultimately, you will be able to strike the best possible balance between indexing times and search performance. 
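+As a sketch of the pooling strategy just described: split a long text into chunks that fit the 512-token window, embed each chunk, and average the resulting vectors. The character-based chunking below is deliberately naive and only for illustration; a real pipeline would split by tokens or sentences.
+
+```python
+from typing import List
+
+import numpy as np
+from fastembed.embedding import DefaultEmbedding
+
+def embed_long_text(text: str, chunk_chars: int = 1000) -> np.ndarray:
+    # Split the document into chunks small enough for the 512-token window.
+    chunks: List[str] = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
+    model = DefaultEmbedding()
+    chunk_vectors = np.stack(list(model.embed(chunks)))
+    # Mean pooling: average the chunk embeddings into a single document vector.
+    return chunk_vectors.mean(axis=0)
+```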
+This model strikes a balance between speed and accuracy, ideal for real-world applications. -This configuration can be done at any time, but it requires a restart of Qdrant. Changing it affects both existing and new collections. +```python +embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) +``` -> **Note:** This feature is not configurable on [Qdrant Cloud](https://qdrant.to/cloud). +Finally, we call the `embed()` method on our embedding_model object, passing in the documents list. The method returns a Python generator, so we convert it to a list to get all the embeddings. These embeddings are NumPy arrays, optimized for fast mathematical operations. -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#better-indexing-for-text-data) Better indexing for text data +The `embed()` method returns a list of NumPy arrays, each corresponding to the embedding of a document in your original documents list. The dimensions of these arrays are determined by the model you chose e.g. for “BAAI/bge-small-en-v1.5” it’s a 384-dimensional vector. -In order to [minimize your RAM expenditure](https://qdrant.tech/articles/memory-consumption/), we have developed a new way to index specific types of data. Please keep in mind that this is a backend improvement, and you won’t need to configure anything. +You can easily parse these NumPy arrays for any downstream application—be it clustering, similarity comparison, or feeding them into a machine learning model for further analysis. -> Going forward, if you are indexing immutable text fields, we estimate a 10% reduction in RAM loads. Our benchmark result is based on a system that uses 64GB of RAM. If you are using less RAM, this reduction might be higher than 10%. +## 3 Key Features of FastEmbed -Immutable text fields are static and do not change once they are added to Qdrant. These entries usually represent some type of attribute, description or tag. Vectors associated with them can be indexed more efficiently, since you don’t need to re-index them anymore. Conversely, mutable fields are dynamic and can be modified after their initial creation. Please keep in mind that they will continue to require additional RAM. +FastEmbed is built for inference speed, without sacrificing (too much) performance: -This approach ensures stability in the [vector search](https://qdrant.tech/documentation/overview/vector-search/) index, with faster and more consistent operations. We achieved this by setting up a field index which helps minimize what is stored. To improve search performance we have also optimized the way we load documents for searches with a text field index. Now our backend loads documents mostly sequentially and in increasing order. +1. 50% faster than PyTorch Transformers +2. Better performance than Sentence Transformers and OpenAI Ada-002 +3. Cosine similarity of quantized and original model vectors is 0.92 -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#minor-improvements-and-new-features) Minor improvements and new features +We use `BAAI/bge-small-en-v1.5` as our DefaultEmbedding, hence we've chosen that for comparison: -Beyond these enhancements, [Qdrant v1.8.0](https://github.com/qdrant/qdrant/releases/tag/v1.8.0) adds and improves on several smaller features: +![](/articles_data/fastembed/throughput.png) -1. **Order points by payload:** In addition to searching for semantic results, you might want to retrieve results by specific metadata (such as price). 
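+As a tiny illustration of the downstream use mentioned above, cosine similarity between two of the returned NumPy arrays takes only a few lines. This sketch reuses the `embeddings` list produced in the walkthrough above:
+
+```python
+import numpy as np
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    # Cosine similarity = dot product of the L2-normalized vectors.
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+# Compare the embeddings of the first two documents from the example above.
+print(f"similarity: {cosine_similarity(embeddings[0], embeddings[1]):.3f}")
+```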
You can now use Scroll API to [order points by payload key](https://qdrant.tech/documentation/concepts/points/#order-points-by-payload-key). -2. **Datetime support:** We have implemented [datetime support for the payload index](https://qdrant.tech/documentation/concepts/filtering/#datetime-range). Prior to this, if you wanted to search for a specific datetime range, you would have had to convert dates to UNIX timestamps. ( [PR#3320](https://github.com/qdrant/qdrant/issues/3320)) -3. **Check collection existence:** You can check whether a collection exists via the `/exists` endpoint to the `/collections/{collection_name}`. You will get a true/false response. ( [PR#3472](https://github.com/qdrant/qdrant/pull/3472)). -4. **Find points** whose payloads match more than the minimal amount of conditions. We included the `min_should` match feature for a condition to be `true` ( [PR#3331](https://github.com/qdrant/qdrant/pull/3466/)). -5. **Modify nested fields:** We have improved the `set_payload` API, adding the ability to update nested fields ( [PR#3548](https://github.com/qdrant/qdrant/pull/3548)). +## Under the Hood of FastEmbed -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#experience-the-power-of-qdrant-180) Experience the Power of Qdrant 1.8.0 +**Quantized Models**: We quantize the models for CPU (and Mac Metal) – giving you the best buck for your compute model. Our default model is so small, you can run this in AWS Lambda if you’d like! -Ready to experience the enhanced performance of Qdrant 1.8.0? Upgrade now and explore the major improvements, from faster sparse vectors to optimized CPU resource management and better indexing for text data. Take your search capabilities to the next level with Qdrant’s latest version. [Try a demo today](https://qdrant.tech/demo/) and see the difference firsthand! +Shout out to Huggingface's [Optimum](https://github.com/huggingface/optimum) – which made it easier to quantize models. -## [Anchor](https://qdrant.tech/articles/qdrant-1.8.x/\#release-notes) Release notes +**Reduced Installation Time**: -For more information, see [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.8.0). -Qdrant is an open-source project. We welcome your contributions; raise [issues](https://github.com/qdrant/qdrant/issues), or contribute via [pull requests](https://github.com/qdrant/qdrant/pulls)! +FastEmbed sets itself apart by maintaining a low minimum RAM/Disk usage. -##### Was this page useful? +It’s designed to be agile and fast, useful for businesses looking to integrate text embedding for production usage. For FastEmbed, the list of dependencies is refreshingly brief: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +> - onnx: Version ^1.11 – We’ll try to drop this also in the future if we can! +> - onnxruntime: Version ^1.15 +> - tqdm: Version ^4.65 – used only at Download +> - requests: Version ^2.31 – used only at Download +> - tokenizers: Version ^0.13 -Thank you for your feedback! 🙏 +This minimized list serves two purposes. First, it significantly reduces the installation time, allowing for quicker deployments. Second, it limits the amount of disk space required, making it a viable option even for environments with storage limitations. -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.8.x.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Notably absent from the dependency list are bulky libraries like PyTorch, and there’s no requirement for CUDA drivers. This is intentional. FastEmbed is engineered to deliver optimal performance right on your CPU, eliminating the need for specialized hardware or complex setups. -On this page: +**ONNXRuntime**: The ONNXRuntime gives us the ability to support multiple providers. The quantization we do is limited for CPU (Intel), but we intend to support GPU versions of the same in the future as well.  This allows for greater customization and optimization, further aligning with your specific performance and computational requirements. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.8.x.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Current Models -× +We’ve started with a small set of supported models: -[Powered by](https://qdrant.tech/) +All the models we support are [quantized](https://pytorch.org/docs/stable/quantization.html) to enable even faster computation! -<|page-35-lllmstxt|> -## what-is-a-vector-database -- [Articles](https://qdrant.tech/articles/) -- An Introduction to Vector Databases +If you're using FastEmbed and you've got ideas or need certain features, feel free to let us know. Just drop an issue on our GitHub page. That's where we look first when we're deciding what to work on next. Here's where you can do it: [FastEmbed GitHub Issues](https://github.com/qdrant/fastembed/issues). -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +When it comes to FastEmbed's DefaultEmbedding model, we're committed to supporting the best Open Source models. -# An Introduction to Vector Databases +If anything changes, you'll see a new version number pop up, like going from 0.0.6 to 0.1. So, it's a good idea to lock in the FastEmbed version you're using to avoid surprises. -Sabrina Aquino +## Using FastEmbed with Qdrant -· +Qdrant is a Vector Store, offering comprehensive, efficient, and scalable [enterprise solutions](https://qdrant.tech/enterprise-solutions/) for modern machine learning and AI applications. Whether you are dealing with billions of data points, require a low latency performant [vector database solution](https://qdrant.tech/qdrant-vector-database/), or specialized quantization methods – [Qdrant is engineered](/documentation/overview/) to meet those demands head-on. -October 09, 2024 +The fusion of FastEmbed with Qdrant’s vector store capabilities enables a transparent workflow for seamless embedding generation, storage, and retrieval. This simplifies the API design — while still giving you the flexibility to make significant changes e.g. you can use FastEmbed to make your own embedding other than the DefaultEmbedding and use that with Qdrant. -![An Introduction to Vector Databases](https://qdrant.tech/articles_data/what-is-a-vector-database/preview/title.jpg) +Below is a detailed guide on how to get started with FastEmbed in conjunction with Qdrant. -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#what-is-a-vector-database) What Is a Vector Database? 
+### Step 1: Installation -![vector-database-architecture](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-1.jpeg) +Before diving into the code, the initial step involves installing the Qdrant Client along with the FastEmbed library. This can be done using pip: -Most of the millions of terabytes of data we generate each day is **unstructured**. Think of the meal photos you snap, the PDFs shared at work, or the podcasts you save but may never listen to. None of it fits neatly into rows and columns. +``` +pip install qdrant-client[fastembed] +``` -Unstructured data lacks a strict format or schema, making it challenging for conventional databases to manage. Yet, this unstructured data holds immense potential for **AI**, **machine learning**, and **modern search engines**. +For those using zsh as their shell, you might encounter syntax issues. In such cases, wrap the package name in quotes: -> A [Vector Database](https://qdrant.tech/qdrant-vector-database/) is a specialized system designed to efficiently handle high-dimensional vector data. It excels at indexing, querying, and retrieving this data, enabling advanced analysis and similarity searches that traditional databases cannot easily perform. +``` +pip install 'qdrant-client[fastembed]' +``` -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#the-challenge-with-traditional-databases) The Challenge with Traditional Databases +### Step 2: Initializing the Qdrant Client -Traditional [OLTP](https://www.ibm.com/topics/oltp) and [OLAP](https://www.ibm.com/topics/olap) databases have been the backbone of data storage for decades. They are great at managing structured data with well-defined schemas, like `name`, `address`, `phone number`, and `purchase history`. +After successful installation, the next step involves initializing the Qdrant Client. This can be done either in-memory or by specifying a database path: -![Structure of OLTP and OLAP databases](https://qdrant.tech/articles_data/what-is-a-vector-database/oltp-and-olap.png) +```python +from qdrant_client import QdrantClient +# Initialize the client +client = QdrantClient(":memory:")  # or QdrantClient(path="path/to/db") +``` -But when data can’t be easily categorized, like the content inside a PDF file, things start to get complicated. +### Step 3: Preparing Documents, Metadata, and IDs -You can always store the PDF file as raw data, perhaps with some metadata attached. However, the database still wouldn’t be able to understand what’s inside the document, categorize it, or even search for the information that it contains. +Once the client is initialized, prepare the text documents you wish to embed, along with any associated metadata and unique IDs: -Also, this applies to more than just PDF documents. Think about the vast amounts of text, audio, and image data you generate every day. If a database can’t grasp the **meaning** of this data, how can you search for or find relationships within the data? +```python +docs = [ + "Qdrant has Langchain integrations", + "Qdrant also has Llama Index integrations" +] +metadata = [ + {"source": "Langchain-docs"}, + {"source": "LlamaIndex-docs"}, +] +ids = [42, 2] +``` -![Structure of a Vector Database](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-db-structure.png) +Note that the add method we’ll use is overloaded: If you skip the ids, we’ll generate those for you. metadata is obviously optional. 
So, you can simply use this too: -Vector databases allow you to understand the **context** or **conceptual similarity** of unstructured data by representing them as vectors, enabling advanced analysis and retrieval based on data similarity. +```python +docs = [ + "Qdrant has Langchain integrations", + "Qdrant also has Llama Index integrations" +] +``` -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#when-to-use-a-vector-database) When to Use a Vector Database +### Step 4: Adding Documents to a Collection -Not sure if you should use a vector database or a traditional database? This chart may help. +With your documents, metadata, and IDs ready, you can proceed to add these to a specified collection within Qdrant using the add method: -| **Feature** | **OLTP Database** | **OLAP Database** | **Vector Database** | -| --- | --- | --- | --- | -| **Data Structure** | Rows and columns | Rows and columns | Vectors | -| **Type of Data** | Structured | Structured/Partially Unstructured | Unstructured | -| **Query Method** | SQL-based (Transactional Queries) | SQL-based (Aggregations, Analytical Queries) | Vector Search (Similarity-Based) | -| **Storage Focus** | Schema-based, optimized for updates | Schema-based, optimized for reads | Context and Semantics | -| **Performance** | Optimized for high-volume transactions | Optimized for complex analytical queries | Optimized for unstructured data retrieval | -| **Use Cases** | Inventory, order processing, CRM | Business intelligence, data warehousing | Similarity search, recommendations, RAG, anomaly detection, etc. | +```python +client.add( + collection_name="demo_collection", + documents=docs, + metadata=metadata, + ids=ids +) +``` -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#what-is-a-vector) What Is a Vector? +Inside this function, Qdrant Client uses FastEmbed to make the text embedding, generate ids if they’re missing, and then add them to the index with metadata. This uses the DefaultEmbedding model: [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5) -![vector-database-vector](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-7.jpeg) +![INDEX TIME: Sequence Diagram for Qdrant and FastEmbed](/articles_data/fastembed/generate-embeddings-from-docs.png) -When a machine needs to process unstructured data - an image, a piece of text, or an audio file, it first has to translate that data into a format it can work with: **vectors**. +### Step 5: Performing Queries -> A **vector** is a numerical representation of data that can capture the **context** and **semantics** of data. +Finally, you can perform queries on your stored documents. Qdrant offers a robust querying capability, and the query results can be easily retrieved as follows: -When you deal with unstructured data, traditional databases struggle to understand its meaning. However, a vector can translate that data into something a machine can process. For example, a vector generated from text can represent relationships and meaning between words, making it possible for a machine to compare and understand their context. +```python +search_result = client.query( + collection_name="demo_collection", + query_text="This is a query document" +) +print(search_result) +``` -There are three key elements that define a vector in a vector database: the **ID**, the **dimensions**, and the **payload**. These components work together to represent a vector effectively within the system. 
Together, they form a **point**, which is the core unit of data stored and retrieved in a vector database. +Behind the scenes, we first convert the query_text to the embedding and use that to query the vector index. -![Representation of a Point in Qdrant](https://qdrant.tech/articles_data/what-is-a-vector-database/point.png) +![QUERY TIME: Sequence Diagram for Qdrant and FastEmbed integration](/articles_data/fastembed/generate-embeddings-query.png) -Each one of these parts plays an important role in how vectors are stored, retrieved, and interpreted. Let’s see how. +By following these steps, you effectively utilize the combined capabilities of FastEmbed and Qdrant, thereby streamlining your embedding generation and retrieval tasks. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#1-the-id-your-vectors-unique-identifier) 1\. The ID: Your Vector’s Unique Identifier +Qdrant is designed to handle large-scale datasets with billions of data points. Its architecture employs techniques like [binary quantization](https://qdrant.tech/articles/binary-quantization/) and [scalar quantization](https://qdrant.tech/articles/scalar-quantization/) for efficient storage and retrieval. When you inject FastEmbed’s CPU-first design and lightweight nature into this equation, you end up with a system that can scale seamlessly while maintaining low latency. -Just like in a relational database, each vector in a vector database gets a unique ID. Think of it as your vector’s name tag, a **primary key** that ensures the vector can be easily found later. When a vector is added to the database, the ID is created automatically. +## Summary -While the ID itself doesn’t play a part in the similarity search (which operates on the vector’s numerical data), it is essential for associating the vector with its corresponding “real-world” data, whether that’s a document, an image, or a sound file. +If you're curious about how FastEmbed and Qdrant can make your search tasks a breeze, why not take it for a spin? You get a real feel for what it can do. Here are two easy ways to get started: -After a search is performed and similar vectors are found, their IDs are returned. These can then be used to **fetch additional details or metadata** tied to the result. +1. **Cloud**: Get started with a free plan on the [Qdrant Cloud](https://qdrant.to/cloud?utm_source=qdrant&utm_medium=website&utm_campaign=fastembed&utm_content=article). -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#2-the-dimensions-the-core-representation-of-the-data) 2\. The Dimensions: The Core Representation of the Data +2. **Docker Container**: If you're the DIY type, you can set everything up on your own machine. Here's a quick guide to help you out: [Quick Start with Docker](/documentation/quick-start/?utm_source=qdrant&utm_medium=website&utm_campaign=fastembed&utm_content=article). -At the core of every vector is a set of numbers, which together form a representation of the data in a **multi-dimensional** space. +So, go ahead, take it for a test drive. We're excited to hear what you think! -#### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#from-text-to-vectors-how-does-it-work) From Text to Vectors: How Does It Work? +Lastly, If you find FastEmbed useful and want to keep up with what we're doing, giving our GitHub repo a star would mean a lot to us. Here's the link to [star the repository](https://github.com/qdrant/fastembed). 
-These numbers are generated by **embedding models**, such as deep learning algorithms, and capture the essential patterns or relationships within the data. That’s why the term **embedding** is often used interchangeably with vector when referring to the output of these models. +If you ever have questions about FastEmbed, please ask them on the Qdrant Discord: [https://discord.gg/Qy6HCJK9Dc](https://discord.gg/Qy6HCJK9Dc) -To represent textual data, for example, an embedding will encapsulate the nuances of language, such as semantics and context within its dimensions. +<|page-46-lllmstxt|> +## Introduction -![Creation of a vector based on a sentence with an embedding model](https://qdrant.tech/articles_data/what-is-a-vector-database/embedding-model.png) +Greetings, I'm Zein Wen, and I was a Google Summer of Code 2023 participant at Qdrant. I got to work with an amazing mentor, Arnaud Gourlay, on enhancing the Qdrant Geo Polygon Filter. This new feature allows users to refine their query results using polygons. As the latest addition to the Geo Filter family of radius and rectangle filters, this enhancement promises greater flexibility in querying geo data, unlocking interesting new use cases. -For that reason, when comparing two similar sentences, their embeddings will turn out to be very similar, because they have similar **linguistic elements**. +## Project Overview -![Comparison of the embeddings of 2 similar sentences](https://qdrant.tech/articles_data/what-is-a-vector-database/two-similar-vectors.png) +{{< figure src="/articles_data/geo-polygon-filter-gsoc/geo-filter-example.png" caption="A Use Case of Geo Filter (https://traveltime.com/blog/map-postcode-data-catchment-area)" alt="A Use Case of Geo Filter" >}} -That’s the beauty of embeddings. Tthe complexity of the data is distilled into something that can be compared across a multi-dimensional space. +Because Qdrant is a powerful query vector database it presents immense potential for machine learning-driven applications, such as recommendation. However, the scope of vector queries alone may not always meet user requirements. Consider a scenario where you're seeking restaurant recommendations; it's not just about a list of restaurants, but those within your neighborhood. This is where the Geo Filter comes into play, enhancing query by incorporating additional filtering criteria. Up until now, Qdrant's geographic filter options were confined to circular and rectangular shapes, which may not align with the diverse boundaries found in the real world. This scenario was exactly what led to a user feature request and we decided it would be a good feature to tackle since it introduces greater capability for geo-related queries. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#3-the-payload-adding-context-with-metadata) 3\. The Payload: Adding Context with Metadata +## Technical Challenges -Sometimes you’re going to need more than just numbers to fully understand or refine a search. While the dimensions capture the essence of the data, the payload holds **metadata** for structured information. +**1. Geo Geometry Computation** -It could be textual data like descriptions, tags, categories, or it could be numerical values like dates or prices. This extra information is vital when you want to filter or rank search results based on criteria that aren’t directly encoded in the vector. 
+{{< figure src="/articles_data/geo-polygon-filter-gsoc/basic-concept.png" caption="Geo Space Basic Concept" alt="Geo Space Basic Concept" >}} -> This metadata is invaluable when you need to apply additional **filters** or **sorting** criteria. +Internally, the Geo Filter doesn't start by testing each individual geo location as this would be computationally expensive. Instead, we create a geo hash layer that [divides the world](https://en.wikipedia.org/wiki/Grid_(spatial_index)#Grid-based_spatial_indexing) into rectangles. When a spatial index is created for Qdrant entries it assigns the entry to the geohash for its location. -For example, if you’re searching for a picture of a dog, the vector helps the database find images that are visually similar. But let’s say you want results showing only images taken within the last year, or those tagged with “vacation.” +During a query we first identify all potential geo hashes that satisfy the filters and subsequently check for location candidates within those hashes. Accomplishing this search involves two critical geometry computations: +1. determining if a polygon intersects with a rectangle +2. ascertaining if a point lies within a polygon. -![Filtering Example](https://qdrant.tech/articles_data/what-is-a-vector-database/filtering-example.png) +{{< figure src=/articles_data/geo-polygon-filter-gsoc/geo-computation-testing.png caption="Geometry Computation Testing" alt="Geometry Computation Testing" >}} -The payload can help you narrow down those results by ignoring vectors that doesn’t match your query vector filtering criteria. If you want the full picture of how filtering works in Qdrant, check out our [Complete Guide to Filtering.](https://qdrant.tech/articles/vector-search-filtering/) +While we have a geo crate (a Rust library) that provides APIs for these computations, we dug in deeper to understand the underlying algorithms and verify their accuracy. This lead us to conduct extensive testing and visualization to determine correctness. In addition to assessing the current crate, we also discovered that there are multiple algorithms available for these computations. We invested time in exploring different approaches, such as [winding windows](https://en.wikipedia.org/wiki/Point_in_polygon#Winding%20number%20algorithm:~:text=of%20the%20algorithm.-,Winding%20number%20algorithm,-%5Bedit%5D) and [ray casting](https://en.wikipedia.org/wiki/Point_in_polygon#Winding%20number%20algorithm:~:text=.%5B2%5D-,Ray%20casting%20algorithm,-%5Bedit%5D), to grasp their distinctions, and pave the way for future improvements. -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#the-architecture-of-a-vector-database) The Architecture of a Vector Database +Through this process, I enjoyed honing my ability to swiftly grasp unfamiliar concepts. In addition, I needed to develop analytical strategies to dissect and draw meaningful conclusions from them. This experience has been invaluable in expanding my problem-solving toolkit. -A vector database is made of multiple different entities and relations. Let’s understand a bit of what’s happening here: -![Architecture Diagram of a Vector Database](https://qdrant.tech/articles_data/what-is-a-vector-database/architecture-vector-db.png) +**2. Proto and JSON format design** -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#collections) Collections +Considerable effort was devoted to designing the ProtoBuf and JSON interfaces for this new feature. 
This component is directly exposed to users, requiring a consistent and user-friendly interface, which in turn helps drive a positive user experience and fewer code modifications in the future.
-A [collection](https://qdrant.tech/documentation/concepts/collections/) is essentially a group of **vectors** (or “ [points](https://qdrant.tech/documentation/concepts/points/)”) that are logically grouped together **based on similarity or a specific task**. Every vector within a collection shares the same dimensionality and can be compared using a single metric. Avoid creating multiple collections unless necessary; instead, consider techniques like **sharding** for scaling across nodes or **multitenancy** for handling different use cases within the same infrastructure.
+Initially, we contemplated aligning our interface with the [GeoJSON](https://geojson.org/) specification, given its prominence as a standard for many geo-related APIs. However, we soon realized that the way GeoJSON defines geometries significantly differs from our current JSON and ProtoBuf coordinate definitions for our point radius and rectangular filter. As a result, we prioritized API-level consistency and user experience, opting to align the new polygon definition with all our existing definitions.
-### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#distance-metrics) Distance Metrics
+In addition, we planned to develop a separate multi-polygon filter alongside the polygon filter. However, after careful consideration, we recognized that, for our use case, polygon filters can achieve the same result as a multi-polygon filter. This relationship mirrors how we currently handle multiple circles or rectangles. Consequently, we deemed the multi-polygon filter redundant, as it would only introduce unnecessary complexity to the API.
-These metrics defines how similarity between vectors is calculated. The choice of distance metric is made when creating a collection and the right choice depends on the type of data you’re working with and how the vectors were created. Here are the three most common distance metrics:
+Doing this work illustrated to me the challenge of navigating real-world solutions that require striking a balance between adhering to established standards and prioritizing user experience. It was also key to understanding the wisdom of focusing on developing what's truly necessary for users, without overextending our efforts.
-- **Euclidean Distance:** The straight-line path. It’s like measuring the physical distance between two points in space. Pick this one when the actual distance (like spatial data) matters.
+## Outcomes
-- **Cosine Similarity:** This one is about the angle, not the length. It measures how two vectors point in the same direction, so it works well for text or documents when you care more about meaning than magnitude. For example, if two things are _similar_, _opposite_, or _unrelated_:
+**1. Capability of Deep Dive**
+Navigating unfamiliar code bases, concepts, APIs, and techniques is a common challenge for developers. Participating in GSoC was akin to me going from the safety of a swimming pool and right into the expanse of the ocean. Having my mentor’s support during this transition was invaluable. He provided me with numerous opportunities to independently delve into areas I had never explored before. I have grown to no longer fear unknown technical areas, whether it's unfamiliar code, techniques, or concepts in specific domains.
I've gained confidence in my ability to learn them step by step and use them to create the things I envision. +**2. Always Put User in Minds** +Another crucial lesson I learned is the importance of considering the user's experience and their specific use cases. While development may sometimes entail iterative processes, every aspect that directly impacts the user must be approached and executed with empathy. Neglecting this consideration can lead not only to functional errors but also erode the trust of users due to inconsistency and confusion, which then leads to them no longer using my work. -![Cosine Similarity Example](https://qdrant.tech/articles_data/what-is-a-vector-database/cosine-similarity.png) +**3. Speak Up and Effectively Communicate** +Finally, In the course of development, encountering differing opinions is commonplace. It's essential to remain open to others' ideas, while also possessing the resolve to communicate one's own perspective clearly. This fosters productive discussions and ultimately elevates the quality of the development process. -- **Dot Product:** This looks at how much two vectors align. It’s popular in recommendation systems where you’re interested in how much two things “agree” with each other. +### Wrap up -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#ram-based-and-memmap-storage) RAM-Based and Memmap Storage +Being selected for Google Summer of Code 2023 and collaborating with Arnaud and the other Qdrant engineers, along with all the other community members, has been a true privilege. I'm deeply grateful to those who invested their time and effort in reviewing my code, engaging in discussions about alternatives and design choices, and offering assistance when needed. Through these interactions, I've experienced firsthand the essence of open source and the culture that encourages collaboration. This experience not only allowed me to write Rust code for a real-world product for the first time, but it also opened the door to the amazing world of open source. -By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative. +Without a doubt, I'm eager to continue growing alongside this community and contribute to new features and enhancements that elevate the product. I've also become an advocate for Qdrant, introducing this project to numerous coworkers and friends in the tech industry. I'm excited to witness new users and contributors emerge from within my own network! -Memmap allows you to store vectors **on disk**, yet still access them efficiently by mapping the data directly into memory if you have enough RAM. To enable it, you only need to set `"on_disk": true` when you are **creating a collection:** +If you want to try out my work, read the [documentation](/documentation/concepts/filtering/#geo-polygon) and then, either sign up for a free [cloud account](https://cloud.qdrant.io) or download the [Docker image](https://hub.docker.com/r/qdrant/qdrant). I look forward to seeing how people are using my work in their own applications! -```python -from qdrant_client import QdrantClient, models +<|page-47-lllmstxt|> +# Optimizing High-Dimensional Vectors with Binary Quantization -client = QdrantClient(url='http://localhost:6333') +Qdrant is built to handle typical scaling challenges: high throughput, low latency and efficient indexing. 
**Binary quantization (BQ)** is our latest attempt to give our customers the edge they need to scale efficiently. This feature is particularly excellent for collections with large vector lengths and a large number of points. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=768, distance=models.Distance.COSINE, on_disk=True - ), -) +Our results are dramatic: Using BQ will reduce your memory consumption and improve retrieval speeds by up to 40x. -``` +As is the case with other quantization methods, these benefits come at the cost of recall degradation. However, our implementation lets you balance the tradeoff between speed and recall accuracy at time of search, rather than time of index creation. -For other configurations like `hnsw_config.on_disk` or `memmap_threshold`, see the Qdrant documentation for [Storage.](https://qdrant.tech/documentation/concepts/storage/) +The rest of this article will cover: +1. The importance of binary quantization +2. Basic implementation using our Python client +3. Benchmark analysis and usage recommendations -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#sdks) SDKs +## What is Binary Quantization? +Binary quantization (BQ) converts any vector embedding of floating point numbers into a vector of binary or boolean values. This feature is an extension of our past work on [scalar quantization](/articles/scalar-quantization/) where we convert `float32` to `uint8` and then leverage a specific SIMD CPU instruction to perform fast vector comparison. -Qdrant offers a range of SDKs. You can use the programming language you’re most comfortable with, whether you’re coding in [Python](https://github.com/qdrant/qdrant-client), [Go](https://github.com/qdrant/go-client), [Rust](https://github.com/qdrant/rust-client), [Javascript/Typescript](https://github.com/qdrant/qdrant-js), [C#](https://github.com/qdrant/qdrant-dotnet) or [Java](https://github.com/qdrant/java-client). +![What is binary quantization](/articles_data/binary-quantization/bq-2.png) -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#the-core-functionalities-of-vector-databases) The Core Functionalities of Vector Databases +**This binarization function is how we convert a range to binary values. All numbers greater than zero are marked as 1. If it's zero or less, they become 0.** -![vector-database-functions](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-3.jpeg) +The benefit of reducing the vector embeddings to binary values is that boolean operations are very fast and need significantly less CPU instructions. In exchange for reducing our 32 bit embeddings to 1 bit embeddings we can see up to a 40x retrieval speed up gain! -When you think of a traditional database, the operations are familiar: you **create**, **read**, **update**, and **delete** records. These are the fundamentals. And guess what? In many ways, vector databases work the same way, but the operations are translated for the complexity of vectors. +One of the reasons vector search still works with such a high compression rate is that these large vectors are over-parameterized for retrieval. This is because they are designed for ranking, clustering, and similar use cases, which typically need more information encoded in the vector. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#1-indexing-hnsw-index-and-sending-data-to-qdrant) 1\. 
Indexing: HNSW Index and Sending Data to Qdrant +For example, The 1536 dimension OpenAI embedding is worse than Open Source counterparts of 384 dimension at retrieval and ranking. Specifically, it scores 49.25 on the same [Embedding Retrieval Benchmark](https://huggingface.co/spaces/mteb/leaderboard) where the Open Source `bge-small` scores 51.82. This 2.57 points difference adds up quite soon. -Indexing your vectors is like creating an entry in a traditional database. But for vector databases, this step is very important. Vectors need to be indexed in a way that makes them easy to search later on. +Our implementation of quantization achieves a good balance between full, large vectors at ranking time and binary vectors at search and retrieval time. It also has the ability for you to adjust this balance depending on your use case. -**HNSW** (Hierarchical Navigable Small World) is a powerful indexing algorithm that most vector databases rely on to organize vectors for fast and efficient search. +## Faster search and retrieval -It builds a multi-layered graph, where each vector is a node and connections represent similarity. The higher layers connect broadly similar vectors, while lower layers link vectors that are closely related, making searches progressively more refined as they go deeper. +Unlike product quantization, binary quantization does not rely on reducing the search space for each probe. Instead, we build a binary index that helps us achieve large increases in search speed. -![Indexing Data with the HNSW algorithm](https://qdrant.tech/articles_data/what-is-a-vector-database/hnsw.png) +![Speed by quantization method](/articles_data/binary-quantization/bq-3.png) -When you run a search, HNSW starts at the top, quickly narrowing down the search by hopping between layers. It focuses only on relevant vectors as it goes deeper, refining the search with each step. +HNSW is the approximate nearest neighbor search. This means our accuracy improves up to a point of diminishing returns, as we check the index for more similar candidates. In the context of binary quantization, this is referred to as the **oversampling rate**. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#11-payload-indexing) 1.1 Payload Indexing +For example, if `oversampling=2.0` and the `limit=100`, then 200 vectors will first be selected using a quantized index. For those 200 vectors, the full 32 bit vector will be used with their HNSW index to a much more accurate 100 item result set. As opposed to doing a full HNSW search, we oversample a preliminary search and then only do the full search on this much smaller set of vectors. -In Qdrant, indexing is modular. You can configure indexes for **both vectors and payloads independently**. The payload index is responsible for optimizing filtering based on metadata. Each payload index is built for a specific field and allows you to quickly filter vectors based on specific conditions. +## Improved storage efficiency -![Searching Data with the HNSW algorithm](https://qdrant.tech/articles_data/what-is-a-vector-database/hnsw-search.png) +The following diagram shows the binarization function, whereby we reduce 32 bits storage to 1 bit information. -You need to build the payload index for **each field** you’d like to search. The magic here is in the combination: HNSW finds similar vectors, and the payload index makes sure only the ones that fit your criteria come through. 
Learn more about Qdrant’s [Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/) and why it was built like this. +Text embeddings can be over 1024 elements of floating point 32 bit numbers. For example, remember that OpenAI embeddings are 1536 element vectors. This means each vector is 6kB for just storing the vector. -> Combining [full-text search](https://qdrant.tech/documentation/concepts/indexing/#full-text-index) with vector-based search gives you even more versatility. You can simultaneously search for conceptually similar documents while ensuring specific keywords are present, all within the same query. +![Improved storage efficiency](/articles_data/binary-quantization/bq-4.png) -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#2-searching-approximate-nearest-neighbors-ann-search) 2\. Searching: Approximate Nearest Neighbors (ANN) Search +In addition to storing the vector, we also need to maintain an index for faster search and retrieval. Qdrant’s formula to estimate overall memory consumption is: -Similarity search allows you to search by **meaning**. This way you can do searches such as similar songs that evoke the same mood, finding images that match your artistic vision, or even exploring emotional patterns in text. +`memory_size = 1.5 * number_of_vectors * vector_dimension * 4 bytes` -![Similar words grouped together](https://qdrant.tech/articles_data/what-is-a-vector-database/similarity.png) +For 100K OpenAI Embedding (`ada-002`) vectors we would need 900 Megabytes of RAM and disk space. This consumption can start to add up rapidly as you create multiple collections or add more items to the database. -The way it works is, when the user queries the database, this query is also converted into a vector. The algorithm quickly identifies the area of the graph likely to contain vectors closest to the **query vector**. +**With binary quantization, those same 100K OpenAI vectors only require 128 MB of RAM.** We benchmarked this result using methods similar to those covered in our [Scalar Quantization memory estimation](/articles/scalar-quantization/#benchmarks). -![Approximate Nearest Neighbors (ANN) Search Graph](https://qdrant.tech/articles_data/what-is-a-vector-database/ann-search.png) +This reduction in RAM usage is achieved through the compression that happens in the binary conversion. HNSW and quantized vectors will live in RAM for quick access, while original vectors can be offloaded to disk only. For searching, quantized HNSW will provide oversampled candidates, then they will be re-evaluated using their disk-stored original vectors to refine the final results. All of this happens under the hood without any additional intervention on your part. -The search then moves down progressively narrowing down to more closely related and relevant vectors. Once the closest vectors are identified at the bottom layer, these points translate back to actual data, representing your **top-scored documents**. +### When should you not use BQ? -Here’s a high-level overview of this process: +Since this method exploits the over-parameterization of embedding, you can expect poorer results for small embeddings i.e. less than 1024 dimensions. With the smaller number of elements, there is not enough information maintained in the binary vector to achieve good results. 
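To put the memory formula above into perspective, here is a rough back-of-the-envelope calculation for the 100K `ada-002` example. This is a sketch only: the constants come straight from the formula and the one-bit-per-dimension size of the quantized vectors, and real consumption also depends on payloads, segments, and index settings.

```python
# Rough memory estimate using the formula quoted above:
# memory_size = 1.5 * number_of_vectors * vector_dimension * 4 bytes
number_of_vectors = 100_000
vector_dimension = 1536

full_size = 1.5 * number_of_vectors * vector_dimension * 4
print(f"full float32 vectors + index: ~{full_size / 1e6:.0f} MB")  # ~922 MB, in line with the ~900 MB figure above

# With binary quantization each dimension takes 1 bit instead of 32 bits,
# so the quantized vectors themselves shrink by a factor of 32:
binary_size = number_of_vectors * vector_dimension / 8
print(f"binary vectors alone: ~{binary_size / 1e6:.0f} MB")  # ~19 MB, before index and bookkeeping overhead
```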
-![Vector Database Searching Functionality](https://qdrant.tech/articles_data/what-is-a-vector-database/simple-arquitecture.png) +You will still get faster boolean operations and reduced RAM usage, but the accuracy degradation might be too high. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#3-updating-vectors-real-time-and-bulk-adjustments) 3\. Updating Vectors: Real-Time and Bulk Adjustments +## Sample implementation -Data isn’t static, and neither are vectors. Keeping your vectors up to date is crucial for maintaining relevance in your searches. +Now that we have introduced you to binary quantization, let’s try out a basic implementation. In this example, we will be using OpenAI and Cohere with Qdrant. -Vector updates don’t always need to happen instantly, but when they do, Qdrant handles real-time modifications efficiently with a simple API call: +#### Create a collection with Binary Quantization enabled -```python -client.upsert( - collection_name='product_collection', - points=[PointStruct(id=product_id, vector=new_vector, payload=new_payload)] -) +Here is what you should do at indexing time when you create the collection: -``` +1. We store all the "full" vectors on disk. +2. Then we set the binary embeddings to be in RAM. -For large-scale changes, like re-indexing vectors after a model update, batch updating allows you to update multiple vectors in one operation without impacting search performance: +By default, both the full vectors and BQ get stored in RAM. We move the full vectors to disk because this saves us memory and allows us to store more vectors in RAM. By doing this, we explicitly move the binary vectors to memory by setting `always_ram=True`.

```python
-batch_of_updates = [\
- PointStruct(id=product_id_1, vector=updated_vector_1, payload=new_payload_1),\
- PointStruct(id=product_id_2, vector=updated_vector_2, payload=new_payload_2),\
- # Add more points...\
-]
+from qdrant_client import QdrantClient, models

-client.upsert(
- collection_name='product_collection',
- points=batch_of_updates
+# Connect to our Qdrant server
+client = QdrantClient(
+    url="http://localhost:6333",
+    prefer_grpc=True,
)
+# Create the collection to hold our embeddings
+# on_disk=True and the quantization_config are the areas to focus on
+collection_name = "binary-quantization"
+if not client.collection_exists(collection_name):
+    client.create_collection(
+        collection_name=f"{collection_name}",
+        vectors_config=models.VectorParams(
+            size=1536,
+            distance=models.Distance.DOT,
+            on_disk=True,
+        ),
+        optimizers_config=models.OptimizersConfigDiff(
+            default_segment_number=5,
+        ),
+        hnsw_config=models.HnswConfigDiff(
+            m=0,
+        ),
+        quantization_config=models.BinaryQuantization(
+            binary=models.BinaryQuantizationConfig(always_ram=True),
+        ),
+    )
```
-### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#4-deleting-vectors-managing-outdated-and-duplicate-data) 4\. Deleting Vectors: Managing Outdated and Duplicate Data

+#### What is happening in the HnswConfig?

-Efficient vector management is key to keeping your searches accurate and your database lean. Deleting vectors that represent outdated or irrelevant data, such as expired products, old news articles, or archived profiles, helps maintain both performance and relevance.

+We're setting `m` to 0 i.e. disabling the HNSW graph construction. This allows faster uploads of vectors and payloads. We will turn it back on down below, once all the data is loaded. 
-In Qdrant, removing vectors is straightforward, requiring only the vector IDs to be specified: +#### Next, we upload our vectors to this collection and then enable the graph construction:

```python
-client.delete(
- collection_name='data_collection',
- points_selector=[point_id_1, point_id_2]
+batch_size = 10000
+client.upload_collection(
+    collection_name=collection_name,
+    ids=range(len(dataset)),
+    vectors=dataset["openai"],
+    payload=[
+        {"text": x} for x in dataset["text"]
+    ],
+    parallel=10, # based on the machine
)
- 
```
-You can use deletion to remove outdated data, clean up duplicates, and manage the lifecycle of vectors by automatically deleting them after a set period to keep your dataset relevant and focused. - -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#dense-vs-sparse-vectors) Dense vs. Sparse Vectors - -![vector-database-dense-sparse](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-4.jpeg) - -Now that you understand what vectors are and how they are created, let’s learn more about the two possible types of vectors you can use: **dense** or **sparse**. The main difference between the two are: - -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#1-dense-vectors) 1\. Dense Vectors - -Dense vectors are, quite literally, dense with information. Every element in the vector contributes to the **semantic meaning**, **relationships** and **nuances** of the data. A dense vector representation of this sentence might look like this: - -![Representation of a Dense Vector](https://qdrant.tech/articles_data/what-is-a-vector-database/dense-1.png) - -Each number holds weight. Together, they convey the overall meaning of the sentence, and are better for identifying contextually similar items, even if the words don’t match exactly. - -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#2-sparse-vectors) 2\. Sparse Vectors - -Sparse vectors operate differently. They focus only on the essentials. In most sparse vectors, a large number of elements are zeros. When a feature or token is present, it’s marked—otherwise, zero. - -In the image, you can see a sentence, _“I love Vector Similarity,”_ broken down into tokens like _“i,” “love,” “vector”_ through tokenization. Each token is assigned a unique `ID` from a large vocabulary. For example, _“i”_ becomes `193`, and _“vector”_ becomes `15012`. - -![How Sparse Vectors are Created](https://qdrant.tech/articles_data/what-is-a-vector-database/sparse.png) - -Sparse vectors, are used for **exact matching** and specific token-based identification. The values on the right, such as `193: 0.04` and `9182: 0.12`, are the scores or weights for each token, showing how relevant or important each token is in the context. The final result is a sparse vector: +Enable HNSW graph construction again:
+```python
+client.update_collection(
+    collection_name=f"{collection_name}",
+    hnsw_config=models.HnswConfigDiff(
+        m=16,
+    ),
+)
```
+#### Configure the search parameters:

-```json
-{
- 193: 0.04,
- 9182: 0.12,
- 15012: 0.73,
- 6731: 0.69,
- 454: 0.21
-}
+When setting search parameters, we specify that we want to use `oversampling` and `rescore`. Here is an example snippet:

```

-Everything else in the vector space is assumed to be zero. - -Sparse vectors are ideal for tasks like **keyword search** or **metadata filtering**, where you need to check for the presence of specific tokens without needing to capture the full meaning or context. They suited for exact matches within the **data itself**, rather than relying on external metadata, which is handled by payload filtering. 
- -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#benefits-of-hybrid-search) Benefits of Hybrid Search - -![vector-database-get-started](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-5.jpeg) - -Sometimes context alone isn’t enough. Sometimes you need precision, too. Dense vectors are fantastic when you need to retrieve results based on the context or meaning behind the data. Sparse vectors are useful when you also need **keyword or specific attribute matching**. +When setting search parameters, we specify that we want to use `oversampling` and `rescore`. Here is an example snippet: -> With hybrid search you don’t have to choose one over the othe and use both to get searches that are more **relevant** and **filtered**. +```python +client.search( + collection_name="{collection_name}", + query_vector=[0.2, 0.1, 0.9, 0.7, ...], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + ignore=False, + rescore=True, + oversampling=2.0, + ) + ) +) +``` -To achieve this balance, Qdrant uses **normalization** and **fusion** techniques to blend results from multiple search methods. One common approach is **Reciprocal Rank Fusion (RRF)**, where results from different methods are merged, giving higher importance to items ranked highly by both methods. This ensures that the best candidates, whether identified through dense or sparse vectors, appear at the top of the results. +After Qdrant pulls the oversampled vectors set, the full vectors which will be, say 1536 dimensions for OpenAI will then be pulled up from disk. Qdrant computes the nearest neighbor with the query vector and returns the accurate, rescored order. This method produces much more accurate results. We enabled this by setting `rescore=True`. -Qdrant combines dense and sparse vector results through a process of **normalization** and **fusion**. +These two parameters are how you are going to balance speed versus accuracy. The larger the size of your oversample, the more items you need to read from disk and the more elements you have to search with the relatively slower full vector index. On the other hand, doing this will produce more accurate results. -![Hybrid Search API - How it works](https://qdrant.tech/articles_data/what-is-a-vector-database/hybrid-search-2.png) +If you have lower accuracy requirements you can even try doing a small oversample without rescoring. Or maybe, for your data set combined with your accuracy versus speed requirements you can just search the binary index and no rescoring, i.e. leaving those two parameters out of the search query. -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#how-to-use-hybrid-search-in-qdrant) How to Use Hybrid Search in Qdrant +## Benchmark results -Qdrant makes it easy to implement hybrid search through its Query API. Here’s how you can make it happen in your own project: +We retrieved some early results on the relationship between limit and oversampling using the the DBPedia OpenAI 1M vector dataset. We ran all these experiments on a Qdrant instance where 100K vectors were indexed and used 100 random queries. -![Hybrid Query Example](https://qdrant.tech/articles_data/what-is-a-vector-database/hybrid-query-1.png) +We varied the 3 parameters that will affect query time and accuracy: limit, rescore and oversampling. We offer these as an initial exploration of this new feature. You are highly encouraged to reproduce these experiments with your data sets. 
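As a quick illustration of how `limit` and `oversampling` combine in these experiments (a sketch that simply restates the arithmetic, not additional benchmark data):

```python
# Number of candidates fetched from the binary index before rescoring
limit = 50
oversampling = 2.0

candidates = int(limit * oversampling)  # 100 candidates pre-selected with binary vectors
# Those 100 candidates are then rescored with the original full vectors,
# and only the top `limit` (50) results are returned to the caller.
print(candidates)
```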
-**Example Hybrid Query:** Let’s say a researcher is looking for papers on NLP, but the paper must specifically mention “transformers” in the content: +> Aside: Since this is a new innovation in vector databases, we are keen to hear feedback and results. [Join our Discord server](https://discord.gg/Qy6HCJK9Dc) for further discussion! -```json -search_query = { - "vector": query_vector, # Dense vector for semantic search - "filter": { # Filtering for specific terms - "must": [\ - {"key": "text", "match": "transformers"} # Exact keyword match in the paper\ - ] - } -} +**Oversampling:** +In the figure below, we illustrate the relationship between recall and number of candidates: -``` +![Correct vs candidates](/articles_data/binary-quantization/bq-5.png) -In this query the dense vector search finds papers related to the broad topic of NLP and the sparse vector filtering ensures that the papers specifically mention “transformers”. +We see that "correct" results i.e. recall increases as the number of potential "candidates" increase (limit x oversampling). To highlight the impact of changing the `limit`, different limit values are broken apart into different curves. For example, we see that the lowest recall for limit 50 is around 94 correct, with 100 candidates. This also implies we used an oversampling of 2.0 -This is just a simple example and there’s so much more you can do with it. See our complete [article on Hybrid Search](https://qdrant.tech/articles/hybrid-search/) guide to see what’s happening behind the scenes and all the possibilities when building a hybrid search system. +As oversampling increases, we see a general improvement in results – but that does not hold in every case. -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#quantization-get-40x-faster-results) Quantization: Get 40x Faster Results +**Rescore:** +As expected, rescoring increases the time it takes to return a query. +We also repeated the experiment with oversampling except this time we looked at how rescore impacted result accuracy. -![vector-database-architecture](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-2.jpeg) +![Relationship between limit and rescore on correct](/articles_data/binary-quantization/bq-7.png) -As your vector dataset grow larger, so do the computational demands of searching through it. +**Limit:** +We experiment with limits from Top 1 to Top 50 and we are able to get to 100% recall at limit 50, with rescore=True, in an index with 100K vectors. -Quantized vectors are much smaller and easier to compare. With methods like [**Binary Quantization**](https://qdrant.tech/articles/binary-quantization/), you can see **search speeds improve by up to 40x while memory usage decreases by 32x**. Improvements that can be decicive when dealing with large datasets or needing low-latency results. +## Recommendations -It works by converting high-dimensional vectors, which typically use `4 bytes` per dimension, into binary representations, using just `1 bit` per dimension. Values above zero become “1”, and everything else becomes “0”. 
+Quantization gives you the option to make tradeoffs against other parameters: +Dimension count/embedding size +Throughput and Latency requirements +Recall requirements -![ Binary Quantization example](https://qdrant.tech/articles_data/what-is-a-vector-database/binary-quantization.png) +If you're working with OpenAI or Cohere embeddings, we recommend the following oversampling settings: -Quantization reduces data precision, and yes, this does lead to some loss of accuracy. However, for binary quantization, **OpenAI embeddings** achieves this performance improvement at a cost of only 5% of accuracy. If you apply techniques like **oversampling** and **rescoring**, this loss can be brought down even further. +|Method|Dimensionality|Test Dataset|Recall|Oversampling| +|-|-|-|-|-| +|OpenAI text-embedding-3-large|3072|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 0.9966|3x| +|OpenAI text-embedding-3-small|1536|[DBpedia 100K](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K)| 0.9847|3x| +|OpenAI text-embedding-3-large|1536|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M)| 0.9826|3x| +|OpenAI text-embedding-ada-002|1536|[DbPedia 1M](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) |0.98|4x| +|Gemini|768|No Open Data| 0.9563|3x| +|Mistral Embed|768|No Open Data| 0.9445 |3x| -However, binary quantization isn’t the only available option. Techniques like [**Scalar Quantization**](https://qdrant.tech/documentation/guides/quantization/#scalar-quantization) and [**Product Quantization**](https://qdrant.tech/documentation/guides/quantization/#product-quantization) are also popular alternatives when optimizing vector compression. +If you determine that binary quantization is appropriate for your datasets and queries then we suggest the following: +- Binary Quantization with always_ram=True +- Vectors stored on disk +- Oversampling=2.0 (or more) +- Rescore=True -You can set up your chosen quantization method using the `quantization_config` parameter when creating a new collection: +## What's next? -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=1536, - distance=models.Distance.COSINE - ), +Binary quantization is exceptional if you need to work with large volumes of data under high recall expectations. You can try this feature either by spinning up a [Qdrant container image](https://hub.docker.com/r/qdrant/qdrant) locally or, having us create one for you through a [free account](https://cloud.qdrant.io/signup) in our cloud hosted service. - # Choose your preferred quantization method - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, # Store the quantized vectors in RAM for faster access - ), - ), -) +The article gives examples of data sets and configuration you can use to get going. Our documentation covers [adding large datasets to Qdrant](/documentation/tutorials/bulk-upload/) to your Qdrant instance as well as [more quantization methods](/documentation/guides/quantization/). -``` +If you have any feedback, drop us a note on Twitter or LinkedIn to tell us about your results. [Join our lively Discord Server](https://discord.gg/Qy6HCJK9Dc) if you want to discuss BQ with like-minded people! 
-You can store original vectors on disk within the `vectors_config` by setting `on_disk=True` to save RAM space, while keeping quantized vectors in RAM for faster access +<|page-48-lllmstxt|> +Not every search journey begins with a specific destination in mind. Sometimes, you just want to explore and see what’s out there and what you might like. +This is especially true when it comes to food. You might be craving something sweet, but you don’t know what. You might be also looking for a new dish to try, +and you just want to see the options available. In these cases, it's impossible to express your needs in a textual query, as the thing you are looking for is not +yet defined. Qdrant's semantic search for images is useful when you have a hard time expressing your tastes in words. -We recommend checking out our [Vector Quantization guide](https://qdrant.tech/articles/what-is-vector-quantization/) for a full breakdown of methods and tips on **optimizing performance** for your specific use case. +## General architecture -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#distributed-deployment) Distributed Deployment +We are happy to announce a refreshed version of our [Food Discovery Demo](https://food-discovery.qdrant.tech/). This time available as an open source project, +so you can easily deploy it on your own and play with it. If you prefer to dive into the source code directly, then feel free to check out the [GitHub repository +](https://github.com/qdrant/demo-food-discovery/). +Otherwise, read on to learn more about the demo and how it works! -When thinking about scaling, the key factors to consider are **fault tolerance**, **load balancing**, and **availability**. One node, no matter how powerful, can only take you so far. Eventually, you’ll need to spread the workload across multiple machines to ensure the system remains fast and stable. +In general, our application consists of three parts: a [FastAPI](https://fastapi.tiangolo.com/) backend, a [React](https://react.dev/) frontend, and +a [Qdrant](/) instance. The architecture diagram below shows how these components interact with each other: -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#sharding-distributing-data-across-nodes) Sharding: Distributing Data Across Nodes +![Archtecture diagram](/articles_data/food-discovery-demo/architecture-diagram.png) -In a distributed Qdrant cluster, data is split into smaller units called **shards**, which are distributed across different nodes. which helps balance the load and ensures that queries can be processed in parallel. +## Why did we use a CLIP model? -Each collection—a group of related data points—can be split into non-overlapping subsets, which are then managed by different nodes. +CLIP is a neural network that can be used to encode both images and texts into vectors. And more importantly, both images and texts are vectorized into the same +latent space, so we can compare them directly. This lets you perform semantic search on images using text queries and the other way around. For example, if +you search for “flat bread with toppings”, you will get images of pizza. Or if you search for “pizza”, you will get images of some flat bread with toppings, even +if they were not labeled as “pizza”. This is because CLIP embeddings capture the semantics of the images and texts and can find the similarities between them +no matter the wording. 
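As a minimal sketch of that idea (illustrative only, not the demo's actual code, and assuming the `sentence-transformers` package plus a local image file), the same CLIP checkpoint can encode a text query and an image and compare them directly:

```python
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# The pretrained CLIP checkpoint discussed below
model = SentenceTransformer("clip-ViT-B-32")

# Texts and images are embedded into the same latent space...
text_embedding = model.encode("flat bread with toppings")
image_embedding = model.encode(Image.open("pizza.jpg"))  # hypothetical local file

# ...so they can be compared directly, for example with cosine similarity
print(util.cos_sim(text_embedding, image_embedding))
```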
-![ Distributed vector database with sharding and Raft consensus](https://qdrant.tech/articles_data/what-is-a-vector-database/sharding-raft.png) +![CLIP model](/articles_data/food-discovery-demo/clip-model.png) -**Raft Consensus** ensures that all the nodes stay in sync and have a consistent view of the data. Each node knows where every shard is, and Raft ensures that all nodes are in sync. If one node fails, the others know where the missing data is located and can take over. +CLIP is available in many different ways. We used the pretrained `clip-ViT-B-32` model available in the [Sentence-Transformers](https://www.sbert.net/examples/applications/image-search/README.html) +library, as this is the easiest way to get started. -By default, the number of shards in your Qdrant system matches the number of nodes in your cluster. But if you need more control, you can choose the `shard_number` manually when creating a collection. +## The dataset -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), - shard_number=4, # Custom number of shards -) +The demo is based on the [Wolt](https://wolt.com/) dataset. It contains over 2M images of dishes from different restaurants along with some additional metadata. +This is how a payload for a single dish looks like: +```json +{ + "cafe": { + "address": "VGX7+6R2 Vecchia Napoli, Valletta", + "categories": ["italian", "pasta", "pizza", "burgers", "mediterranean"], + "location": {"lat": 35.8980154, "lon": 14.5145106}, + "menu_id": "610936a4ee8ea7a56f4a372a", + "name": "Vecchia Napoli Is-Suq Tal-Belt", + "rating": 9, + "slug": "vecchia-napoli-skyparks-suq-tal-belt" + }, + "description": "Tomato sauce, mozzarella fior di latte, crispy guanciale, Pecorino Romano cheese and a hint of chilli", + "image": "https://wolt-menu-images-cdn.wolt.com/menu-images/610936a4ee8ea7a56f4a372a/005dfeb2-e734-11ec-b667-ced7a78a5abd_l_amatriciana_pizza_joel_gueller1.jpeg", + "name": "L'Amatriciana" +} ``` -There are two main types of sharding: +Processing this amount of records takes some time, so we precomputed the CLIP embeddings, stored them in a Qdrant collection and exported the collection as +a snapshot. You may [download it here](https://storage.googleapis.com/common-datasets-snapshots/wolt-clip-ViT-B-32.snapshot). -1. **Automatic Sharding:** Points (vectors) are automatically distributed across shards using consistent hashing. Each shard contains non-overlapping subsets of the data. -2. **User-defined Sharding:** Specify how points are distributed, enabling more control over your data organization, especially for use cases like **multitenancy**, where each tenant (a user, client, or organization) has their own isolated data. +## Different search modes -Each shard is divided into **segments**. They are a smaller storage unit within a shard, storing a subset of vectors and their associated payloads (metadata). When a query is executed, it targets the only relevant segments, processing them in parallel. +The FastAPI backend [exposes just a single endpoint](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/main.py#L37), +however it handles multiple scenarios. Let's dive into them one by one and understand why they are needed. 
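Before diving in: if you would like to pull the snapshot mentioned above into your own Qdrant instance, the documented snapshot-recovery endpoint makes it a single request. This is a hedged sketch; the collection name and instance URL are placeholders, and your instance must be able to download the snapshot from the given location.

```python
import requests

QDRANT_URL = "http://localhost:6333"  # placeholder: your Qdrant instance
SNAPSHOT_URL = "https://storage.googleapis.com/common-datasets-snapshots/wolt-clip-ViT-B-32.snapshot"

# Recover a collection from a remote snapshot via the REST API
response = requests.put(
    f"{QDRANT_URL}/collections/wolt/snapshots/recover",  # "wolt" is a placeholder collection name
    json={"location": SNAPSHOT_URL},
)
response.raise_for_status()
print(response.json())
```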
-![Segments act as smaller storage units within a shard](https://qdrant.tech/articles_data/what-is-a-vector-database/segments.png) +### Cold start -### [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#replication-high-availability-and-data-integrity) Replication: High Availability and Data Integrity +Recommendation systems struggle with a cold start problem. When a new user joins the system, there is no data about their preferences, so it’s hard to recommend +anything. The same applies to our demo. When you open it, you will see a random selection of dishes, and it changes every time you refresh the page. Internally, +the demo [chooses some random points](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L70) in the +vector space. -You don’t want a single failure to take down your system, right? Replication keeps multiple copies of the same data across different nodes to ensure **high availability**. +![Random points selection](/articles_data/food-discovery-demo/random-results.png) -In Qdrant, **Replica Sets** manage these copies of shards across different nodes. If one replica becomes unavailable, others are there to take over and keep the system running. Whether the data is local or remote is mainly influenced by how you’ve configured the cluster. +That procedure should result in returning diverse results, so we have a higher chance of showing something interesting to the user. -![ Replica Set and Replication diagram](https://qdrant.tech/articles_data/what-is-a-vector-database/replication.png) +### Textual search -When a query is made, if the relevant data is stored locally, the local shard handles the operation. If the data is on a remote shard, it’s retrieved via gRPC. +Since the demo suffers from the cold start problem, we implemented a textual search mode that is useful to start exploring the data. You can type in any text query +by clicking a search icon in the top right corner. The demo will use the CLIP model to encode the query into a vector and then search for the nearest neighbors +in the vector space. -You can control how many copies you want with the `replication_factor`. For example, creating a collection with 4 shards and a replication factor of 2 will result in 8 physical shards distributed across the cluster: +![Random points selection](/articles_data/food-discovery-demo/textual-search.png) -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), - shard_number=4, - replication_factor=2, -) +This is implemented as [a group search query to Qdrant](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L44). +We didn't use a simple search, but performed grouping by the restaurant to get more diverse results. [Search groups](/documentation/concepts/search/#search-groups) +is a mechanism similar to `GROUP BY` clause in SQL, and it's useful when you want to get a specific number of result per group (in our case just one). -``` +```python +import settings -We recommend using sharding and replication together so that your data is both split across nodes and replicated for availability. 
+# Encode query into a vector, model is an instance of +# sentence_transformers.SentenceTransformer that loaded CLIP model +query_vector = model.encode(query).tolist() -For more details on features like **user-defined sharding, node failure recovery**, and **consistency guarantees**, see our guide on [Distributed Deployment.](https://qdrant.tech/documentation/guides/distributed_deployment/) +# Search for nearest neighbors, client is an instance of +# qdrant_client.QdrantClient that has to be initialized before +response = client.search_groups( + settings.QDRANT_COLLECTION, + query_vector=query_vector, + group_by=settings.GROUP_BY_FIELD, + limit=search_query.limit, +) +``` -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#multitenancy-data-isolation-for-multi-tenant-architectures) Multitenancy: Data Isolation for Multi-Tenant Architectures +### Exploring the results -![vector-database-get-started](https://qdrant.tech/articles_data/what-is-a-vector-database/vector-database-6.png) +The main feature of the demo is the ability to explore the space of the dishes. You can click on any of them to see more details, but first of all you can like or dislike it, +and the demo will update the search results accordingly. -Sharding efficiently distributes data across nodes, while replication guarantees redundancy and fault tolerance. But what happens when you’ve got multiple clients or user groups, and you need to keep their data isolated within the same infrastructure? +![Recommendation results](/articles_data/food-discovery-demo/recommendation-results.png) -**Multitenancy** allows you to keep data for different tenants (users, clients, or organizations) isolated within a single cluster. Instead of creating separate collections for `Tenant 1` and `Tenant 2`, you store their data in the same collection but tag each vector with a `group_id` to identify which tenant it belongs to. +#### Negative feedback only -![Multitenancy dividing data between 2 tenants](https://qdrant.tech/articles_data/what-is-a-vector-database/multitenancy-1.png) +Qdrant [Recommendation API](/documentation/concepts/search/#recommendation-api) needs at least one positive example to work. However, in our demo +we want to be able to provide only negative examples. This is because we want to be able to say “I don’t like this dish” without having to like anything first. +To achieve this, we use a trick. We negate the vectors of the disliked dishes and use their mean as a query. This way, the disliked dishes will be pushed away +from the search results. **This works because the cosine distance is based on the angle between two vectors, and the angle between a vector and its negation is 180 degrees.** -In the backend, Qdrant can store `Tenant 1`’s data in Shard 1 located in Canada (perhaps for compliance reasons like GDPR), while `Tenant 2`’s data is stored in Shard 2 located in Germany. The data will be physically separated but still within the same infrastructure. +![CLIP model](/articles_data/food-discovery-demo/negated-vector.png) -To implement this, you tag each vector with a tenant-specific `group_id` during the upsert operation: +Food Discovery Demo [implements that trick](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L122) +by calling Qdrant twice. Initially, we use the [Scroll API](/documentation/concepts/points/#scroll-points) to find disliked items, +and then calculate a negated mean of all their vectors. 
That allows using the [Search Groups API](/documentation/concepts/search/#search-groups) +to find the nearest neighbors of the negated mean vector. ```python -client.upsert( - collection_name="tenant_data", - points=[models.PointStruct(\ - id=2,\ - payload={"group_id": "tenant_1"},\ - vector=[0.1, 0.9, 0.1]\ - )], - shard_key_selector="canada" +import numpy as np + +# Retrieve the disliked points based on their ids +disliked_points, _ = client.scroll( + settings.QDRANT_COLLECTION, + scroll_filter=models.Filter( + must=[ + models.HasIdCondition(has_id=search_query.negative), + ] + ), + with_vectors=True, ) +# Calculate a mean vector of disliked points +disliked_vectors = np.array([point.vector for point in disliked_points]) +mean_vector = np.mean(disliked_vectors, axis=0) +negated_vector = -mean_vector + +# Search for nearest neighbors of the negated mean vector +response = client.search_groups( + settings.QDRANT_COLLECTION, + query_vector=negated_vector.tolist(), + group_by=settings.GROUP_BY_FIELD, + limit=search_query.limit, +) ``` -Each tenant’s data remains isolated while still benefiting from the shared infrastructure. Optimizing for data privacy, compliance with local regulations, and scalability, without the need to create excessive collections or maintain separate clusters for each tenant. +#### Positive and negative feedback -If you want to learn more about working with a multitenant setup in Qdrant, you can check out our [Multitenancy and Custom Sharding dedicated guide.](https://qdrant.tech/articles/multitenancy/) +Since the [Recommendation API](/documentation/concepts/search/#recommendation-api) requires at least one positive example, we can use it only when +the user has liked at least one dish. We could theoretically use the same trick as above and negate the disliked dishes, but it would be a bit weird, as Qdrant has +that feature already built-in, and we can call it just once to do the job. It's always better to perform the search server-side. Thus, in this case [we just call +the Qdrant server with a list of positive and negative examples](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L166), +so it can find some points which are close to the positive examples and far from the negative ones. + +```python +response = client.recommend_groups( + settings.QDRANT_COLLECTION, + positive=search_query.positive, + negative=search_query.negative, + group_by=settings.GROUP_BY_FIELD, + limit=search_query.limit, +) +``` -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#data-security-and-access-control) Data Security and Access Control +From the user perspective nothing changes comparing to the previous case. -A common security risk in vector databases is the possibility of **embedding inversion attacks**, where attackers could reconstruct the original data from embeddings. There are many layers of protection you can use to secure your instance that are very important before getting your vector database into production. +### Location-based search -For quick security in simpler use cases, you can use the **API key authentication**. To enable it, set up the API key in the configuration or environment variable. +Last but not least, location plays an important role in the food discovery process. You are definitely looking for something you can find nearby, not on the other +side of the globe. Therefore, your current location can be toggled as a filtering condition. 
You can enable it by clicking on “Find near me” icon +in the top right. This way you can find the best pizza in your neighborhood, not in the whole world. Qdrant [geo radius filter](/documentation/concepts/filtering/#geo-radius) is a perfect choice for this. It lets you +filter the results by distance from a given point. -```yaml -service: - api_key: your_secret_api_key_here - enable_tls: true # Make sure to enable TLS to protect the API key from being exposed +```python +from qdrant_client import models +# Create a geo radius filter +query_filter = models.Filter( + must=[ + models.FieldCondition( + key="cafe.location", + geo_radius=models.GeoRadius( + center=models.GeoPoint( + lon=location.longitude, + lat=location.latitude, + ), + radius=location.radius_km * 1000, + ), + ) + ] +) ``` -Once this is set up, remember to include the API key in all your requests: +Such a filter needs [a payload index](/documentation/concepts/indexing/#payload-index) to work efficiently, and it was created on a collection +we used to create the snapshot. When you import it into your instance, the index will be already there. -```python -from qdrant_client import QdrantClient +## Using the demo -client = QdrantClient( - url="https://localhost:6333", - api_key="your_secret_api_key_here" -) +The Food Discovery Demo [is available online](https://food-discovery.qdrant.tech/), but if you prefer to run it locally, you can do it with Docker. The +[README](https://github.com/qdrant/demo-food-discovery/blob/main/README.md) describes all the steps more in detail, but here is a quick start: +```bash +git clone git@github.com:qdrant/demo-food-discovery.git +cd demo-food-discovery +# Create .env file based on .env.example +docker-compose up -d ``` -In more advanced setups, Qdrant uses **JWT (JSON Web Tokens)** to enforce **Role-Based Access Control (RBAC)**. +The demo will be available at `http://localhost:8001`, but you won't be able to search anything until you [import the snapshot into your Qdrant +instance](/documentation/concepts/snapshots/#recover-via-api). If you don't want to bother with hosting a local one, you can use the [Qdrant +Cloud](https://cloud.qdrant.io/) cluster. 4 GB RAM is enough to load all the 2 million entries. -RBAC defines roles and assigns permissions, while JWT securely encodes these roles into tokens. Each request is validated against the user’s JWT, ensuring they can only access or modify data based on their assigned permissions. +## Fork and reuse -You can easily setup you access tokens and secure access to sensitive data through the **Qdrant Web UI:** +Our demo is completely open-source. Feel free to fork it, update with your own dataset or adapt the application to your use case. Whether you’re looking to understand the mechanics +of semantic search or to have a foundation to build a larger project, this demo can serve as a starting point. Check out the [Food Discovery Demo repository +](https://github.com/qdrant/demo-food-discovery/) to get started. If you have any questions, feel free to reach out [through Discord](https://qdrant.to/discord). -![Qdrant Web UI for generating a new access token.](https://qdrant.tech/articles_data/what-is-a-vector-database/jwt-web-ui.png) +<|page-49-lllmstxt|> +## Introduction -By default, Qdrant instances are **unsecured**, so it’s important to configure security measures before moving to production. 
To learn more about how to configure security for your Qdrant instance and other advanced options, please check out the [official Qdrant documentation on security.](https://qdrant.tech/documentation/guides/security/) +Hello everyone! My name is Kartik Gupta, and I am thrilled to share my coding journey as part of the Google Summer of Code 2023 program. This summer, I had the incredible opportunity to work on an exciting project titled "Web UI for Visualization and Exploration" for Qdrant, a vector search engine. In this article, I will take you through my experience, challenges, and achievements during this enriching coding journey. -## [Anchor](https://qdrant.tech/articles/what-is-a-vector-database/\#time-to-experiment) Time to Experiment +## Project Overview -As we’ve seen in this article, a vector database is definitely not **just** a database as we traditionally know it. It opens up a world of possibilities, from advanced similarity search to hybrid search that allows content retrieval with both context and precision. +Qdrant is a powerful vector search engine widely used for similarity search and clustering. However, it lacked a user-friendly web-based UI for data visualization and exploration. My project aimed to bridge this gap by developing a web-based user interface that allows users to easily interact with and explore their vector data. -But there’s no better way to learn than by doing. Try building a [semantic search engine](https://qdrant.tech/documentation/tutorials/search-beginners/) or experiment deploying a [hybrid search service](https://qdrant.tech/documentation/tutorials/hybrid-search-fastembed/) from zero. You’ll realize there are endless ways you can take advantage of vectors. +## Milestones and Achievements -| **Use Case** | **How It Works** | **Examples** | -| --- | --- | --- | -| **Similarity Search** | Finds similar data points using vector distances | Find similar product images, retrieve documents based on themes, discover related topics | -| **Anomaly Detection** | Identifies outliers based on deviations in vector space | Detect unusual user behavior in banking, spot irregular patterns | -| **Recommendation Systems** | Uses vector embeddings to learn and model user preferences | Personalized movie or music recommendations, e-commerce product suggestions | -| **RAG (Retrieval-Augmented Generation)** | Combines vector search with large language models (LLMs) for contextually relevant answers | Customer support, auto-generate summaries of documents, research reports | -| **Multimodal Search** | Search across different types of data like text, images, and audio in a single query. | Search for products with a description and image, retrieve images based on audio or text | -| **Voice & Audio Recognition** | Uses vector representations to recognize and retrieve audio content | Speech-to-text transcription, voice-controlled smart devices, identify and categorize sounds | -| **Knowledge Graph Augmentation** | Links unstructured data to concepts in knowledge graphs using vectors | Link research papers to related studies, connect customer reviews to product features, organize patents by innovation trends | +The project was divided into six milestones, each focusing on a specific aspect of the web UI development. Let's go through each of them and my achievements during the coding period. -You can also watch our video tutorial and get started with Qdrant to generate semantic search results and recommendations from a sample dataset. +**1. 
Designing a friendly UI on Figma** -Getting Started with Qdrant - YouTube +I started by designing the user interface on Figma, ensuring it was easy to use, visually appealing, and responsive on different devices. I focused on usability and accessibility to create a seamless user experience. ( [Figma Design](https://www.figma.com/file/z54cAcOErNjlVBsZ1DrXyD/Qdant?type=design&node-id=0-1&mode=design&t=Pu22zO2AMFuGhklG-0)) + +**2. Building the layout** -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +The layout route served as a landing page with an overview of the application's features and navigation links to other routes. -Qdrant - Vector Database & Search Engine +**3. Creating a view collection route** -8.12K subscribers +This route enabled users to view a list of collections available in the application. Users could click on a collection to see more details, including the data and vectors associated with it. -[Getting Started with Qdrant](https://www.youtube.com/watch?v=LRcZ9pbGnno) +{{< figure src=/articles_data/web-ui-gsoc/collections-page.png caption="Collection Page" alt="Collection Page" >}} -Qdrant - Vector Database & Search Engine +**4. Developing a data page with "find similar" functionality** -Search +I implemented a data page where users could search for data and find similar data using a recommendation API. The recommendation API suggested similar data based on the Data's selected ID, providing valuable insights. -Watch later +{{< figure src=/articles_data/web-ui-gsoc/points-page.png caption="Points Page" alt="Points Page" >}} + +**5. Developing query editor page libraries** -Share +This milestone involved creating a query editor page that allowed users to write queries in a custom language. The editor provided syntax highlighting, autocomplete, and error-checking features for a seamless query writing experience. -Copy link +{{< figure src=/articles_data/web-ui-gsoc/console-page.png caption="Query Editor Page" alt="Query Editor Page" >}} + +**6. Developing a route for visualizing vector data points** -Info +This is done by the reduction of n-dimensional vector in 2-D points and they are displayed with their respective payloads. -Shopping +{{< figure src=/articles_data/web-ui-gsoc/visualization-page.png caption="Vector Visuliztion Page" alt="visualization-page" >}} -Tap to unmute +## Challenges and Learning -If playback doesn't begin shortly, try restarting your device. +Throughout the project, I encountered a series of challenges that stretched my engineering capabilities and provided unique growth opportunities. From mastering new libraries and technologies to ensuring the user interface (UI) was both visually appealing and user-friendly, every obstacle became a stepping stone toward enhancing my skills as a developer. However, each challenge provided an opportunity to learn and grow as a developer. I acquired valuable experience in vector search and dimension reduction techniques. -More videos +The most significant learning for me was the importance of effective project management. Setting realistic timelines, collaborating with mentors, and staying proactive with feedback allowed me to complete the milestones efficiently. -## More videos +### Technical Learning and Skill Development -You're signed out +One of the most significant aspects of this journey was diving into the intricate world of vector search and dimension reduction techniques. 
These areas, previously unfamiliar to me, required rigorous study and exploration. Learning how to process vast amounts of data efficiently and extract meaningful insights through these techniques was both challenging and rewarding. -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +### Effective Project Management -CancelConfirm +Undoubtedly, the most impactful lesson was the art of effective project management. I quickly grasped the importance of setting realistic timelines and goals. Collaborating closely with mentors and maintaining proactive communication proved indispensable. This approach enabled me to navigate the complex development process and successfully achieve the project's milestones. -Share +### Overcoming Technical Challenges -Include playlist +#### Autocomplete Feature in Console -An error occurred while retrieving sharing information. Please try again later. +One particularly intriguing challenge emerged while working on the autocomplete feature within the console. Finding a solution was proving elusive until a breakthrough came from an unexpected direction. My mentor, Andrey, proposed creating a separate module that could support autocomplete based on OpenAPI for our custom language. This ingenious approach not only resolved the issue but also showcased the power of collaborative problem-solving. -[Watch on](https://www.youtube.com/watch?v=LRcZ9pbGnno&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +#### Optimization with Web Workers -0:00 +The high-processing demands of vector reduction posed another significant challenge. Initially, this task was straining browsers and causing performance issues. The solution materialized in the form of web workers—an independent processing instance that alleviated the strain on browsers. However, a new question arose: how to terminate these workers effectively? With invaluable insights from my mentor, I gained a deeper understanding of web worker dynamics and successfully tackled this challenge. -0:00 / 24:22 -‱Live +#### Console Integration Complexity -‱ +Integrating the console interaction into the application presented multifaceted challenges. Crafting a custom language in Monaco, parsing text to make API requests, and synchronizing the entire process demanded meticulous attention to detail. Overcoming these hurdles was a testament to the complexity of real-world engineering endeavours. -[Watch on YouTube](https://www.youtube.com/watch?v=LRcZ9pbGnno "Watch on YouTube") +#### Codelens Multiplicity Issue -Phew! I hope you found some of the concepts here useful. If you have any questions feel free to send them in our [Discord Community](https://discord.com/invite/qdrant) where our team will be more than happy to help you out! +An unexpected issue cropped up during the development process: the codelen (run button) registered multiple times, leading to undesired behaviour. This hiccup underscored the importance of thorough testing and debugging, even in seemingly straightforward features. -> Remember, don’t get lost in vector space! 🚀 +### Key Learning Points -##### Was this page useful? 
Amidst these challenges, I garnered valuable insights that have significantly enriched my engineering prowess:

**Vector Reduction Techniques**: Navigating the realm of vector reduction techniques provided a deep understanding of how to process and interpret data efficiently. This knowledge opens up new avenues for developing data-driven applications in the future.

**Web Workers Efficiency**: Mastering the intricacies of web workers not only resolved performance concerns but also expanded my repertoire of optimization strategies. This newfound proficiency will undoubtedly find relevance in various future projects.

**Monaco Editor and UI Frameworks**: Working extensively with the Monaco Editor, Material-UI (MUI), and Vite enriched my familiarity with these essential tools. I honed my skills in integrating complex UI components seamlessly into applications.

## Areas for Improvement and Future Enhancements

While reflecting on this transformative journey, I recognize several areas that offer room for improvement and future enhancements:

1. Enhanced Autocomplete: Further refining the autocomplete feature to support key-value suggestions in JSON structures could greatly enhance the user experience.

2. Error Detection in Console: Integrating the console's error checker with OpenAPI could enhance its accuracy in identifying errors and offering precise suggestions for improvement.

3. Expanded Vector Visualization: Exploring additional visualization methods and optimizing their performance could elevate the utility of the vector visualization route.

## Conclusion

Participating in the Google Summer of Code 2023 and working on the "Web UI for Visualization and Exploration" project has been an immensely rewarding experience. I am grateful for the opportunity to contribute to Qdrant and develop a user-friendly interface for vector data exploration.

I want to express my gratitude to my mentors and the entire Qdrant community for their support and guidance throughout this journey. This experience has not only improved my coding skills but also instilled a deeper passion for web development and data analysis.

<|page-36-lllmstxt|>
## data-management

## Data Management Integrations

| Integration | Description |
| --- | --- |
| [Airbyte](https://qdrant.tech/documentation/data-management/airbyte/) | Data integration platform specialising in ELT pipelines. |
| [Airflow](https://qdrant.tech/documentation/data-management/airflow/) | Platform designed for developing, scheduling, and monitoring batch-oriented workflows.
| -| [CocoIndex](https://qdrant.tech/documentation/data-management/cocoindex/) | High performance ETL framework to transform data for AI, with real-time incremental processing | -| [Cognee](https://qdrant.tech/documentation/data-management/cognee/) | AI memory frameworks that allows loading from 30+ data sources to graph and vector stores | -| [Connect](https://qdrant.tech/documentation/data-management/redpanda/) | Declarative data-agnostic streaming service for efficient, stateless processing. | -| [Confluent](https://qdrant.tech/documentation/data-management/confluent/) | Fully-managed data streaming platform with a cloud-native Apache Kafka engine. | -| [DLT](https://qdrant.tech/documentation/data-management/dlt/) | Python library to simplify data loading processes between several sources and destinations. | -| [Fluvio](https://qdrant.tech/documentation/data-management/fluvio/) | Rust-based platform for high speed, real-time data processing. | -| [Fondant](https://qdrant.tech/documentation/data-management/fondant/) | Framework for developing datasets, sharing reusable operations and data processing trees. | -| [MindsDB](https://qdrant.tech/documentation/data-management/mindsdb/) | Platform to deploy, serve, and fine-tune models with numerous data source integrations. | -| [NiFi](https://qdrant.tech/documentation/data-management/nifi/) | Data ingestion platform to manage data transfer between different sources and destination systems. | -| [Spark](https://qdrant.tech/documentation/data-management/spark/) | A unified analytics engine for large-scale data processing. | -| [Unstructured](https://qdrant.tech/documentation/data-management/unstructured/) | Python library with components for ingesting and pre-processing data from numerous sources. | +As my coding journey continues beyond this project, I look forward to applying the knowledge and experience gained here to future endeavours. I am excited to see how Qdrant evolves with the newly developed web UI and how it positively impacts users worldwide. -##### Was this page useful? +Thank you for joining me on this coding adventure, and I hope to share more exciting projects in the future! Happy coding! -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +<|page-50-lllmstxt|> +## Introduction -Thank you for your feedback! 🙏 +Hello, everyone! I'm Jishan Bhattacharya, and I had the incredible opportunity to intern at Qdrant this summer as part of the Qdrant Summer of Code 2024. Under the mentorship of [Andrey Vasnetsov](https://www.linkedin.com/in/andrey-vasnetsov-75268897/), I dived into the world of performance optimization, focusing on enhancing vector visualization using WebAssembly (WASM). In this article, I'll share the insights, challenges, and accomplishments from my journey — one filled with learning, experimentation, and plenty of coding adventures. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/data-management/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
-On this page: +## Project Overview -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/data-management/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Qdrant is a robust vector database and search engine designed to store vector data and perform tasks like similarity search and clustering. One of its standout features is the ability to visualize high-dimensional vectors in a 2D space. However, the existing implementation faced performance bottlenecks, especially when scaling to large datasets. My mission was to tackle this challenge by leveraging a WASM-based solution for dimensionality reduction in the visualization process. -× -[Powered by](https://qdrant.tech/) +## Learnings & Challenges -<|page-37-lllmstxt|> -## configuration -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Configuration +Our weapon of choice was Rust, paired with WASM, and we employed the t-SNE algorithm for dimensionality reduction. For those unfamiliar, t-SNE (t-Distributed Stochastic Neighbor Embedding) is a technique that helps visualize high-dimensional data by projecting it into two or three dimensions. It operates in two main steps: -# [Anchor](https://qdrant.tech/documentation/guides/configuration/\#configuration) Configuration +1. **Computing Pairwise Similarity:** This step involves calculating the similarity between each pair of data points in the original high-dimensional space. -Qdrant ships with sensible defaults for collection and network settings that are suitable for most use cases. You can view these defaults in the [Qdrant source](https://github.com/qdrant/qdrant/blob/master/config/config.yaml). If you need to customize the settings, you can do so using configuration files and environment variables. +2. **Iterative Optimization:** The second step is iterative, where the embedding is refined using gradient descent. Here, the similarity matrix from the first step plays a crucial role. -## [Anchor](https://qdrant.tech/documentation/guides/configuration/\#configuration-files) Configuration Files +At the outset, Andrey tasked me with rewriting the existing JavaScript implementation of t-SNE in Rust, introducing multi-threading along the way. Setting up WASM with Vite for multi-threaded execution was no small feat, but the effort paid off. The resulting Rust implementation outperformed the single-threaded JavaScript version, although it still struggled with large datasets. -To customize Qdrant, you can mount your configuration file in any of the following locations. This guide uses `.yaml` files, but Qdrant also supports other formats such as `.toml`, `.json`, and `.ini`. +Next came the challenge of optimizing the algorithm further. A key aspect of t-SNE's first step is finding the nearest neighbors for each data point, which requires an efficient data structure. I opted for a [Vantage Point Tree](https://en.wikipedia.org/wiki/Vantage-point_tree) (also known as a Ball Tree) to speed up this process. As for the second step, while it is inherently sequential, there was still room for improvement. I incorporated Barnes-Hut approximation to accelerate the gradient calculation. This method approximates the forces between points in low dimensional space, making the process more efficient. -1. **Main Configuration: `qdrant/config/config.yaml`** +To illustrate, imagine dividing a 2D space into quadrants, each containing multiple points. 
Every quadrant is again subdivided into four quadrants. This is done until every point belongs to a single cell. -Mount your custom `config.yaml` file to override default settings: +{{< figure + src="/articles_data/dimension-reduction-qsoc/barnes_hut.png" + caption="Barnes-Hut Approximation" + alt="Calculating the resultant force on red point using Barnes-Hut approximation" +>}} +We then calculate the center of mass for each cell represented by a blue circle as shown in the figure. Now let’s say we want to find all the forces, represented by dotted lines, on the red point. Barnes Hut’s approximation states that for points that are sufficiently distant, instead of computing the force for each individual point, we use the center of mass as a proxy, significantly reducing the computational load. This is represented by the blue dotted line in the figure. +These optimizations made a remarkable difference — Barnes-Hut t-SNE was eight times faster than the exact t-SNE for 10,000 vectors. -```bash -docker run -p 6333:6333 \ - -v $(pwd)/config.yaml:/qdrant/config/config.yaml \ - qdrant/qdrant +{{< figure + src="/articles_data/dimension-reduction-qsoc/rust_rewrite.jpg" + caption="Exact t-SNE - Total time: 884.728s" + alt="Image of visualizing 10,000 vectors using exact t-SNE which took 884.728s" +>}} -``` +{{< figure + src="/articles_data/dimension-reduction-qsoc/rust_bhtsne.jpg" + caption="Barnes-Hut t-SNE - Total time: 104.191s" + alt="Image of visualizing 10,000 vectors using Barnes-Hut t-SNE which took 110.728s" +>}} -2. **Environment-Specific Configuration: `config/{RUN_MODE}.yaml`** +Despite these improvements, the first step of the algorithm was still a bottleneck, leading to noticeable delays and blank screens. I experimented with approximate nearest neighbor algorithms, but the performance gains were minimal. After consulting with my mentor, we decided to compute the nearest neighbors on the server side, passing the distance matrix directly to the visualization process instead of the raw vectors. -Qdrant looks for an environment-specific configuration file based on the `RUN_MODE` variable. By default, the [official Docker image](https://hub.docker.com/r/qdrant/qdrant) uses `RUN_MODE=production`, meaning it will look for `config/production.yaml`. +While waiting for the distance-matrix API to be ready, I explored further optimizations. I observed that the worker thread sent results to the main thread for rendering at specific intervals, causing unnecessary delays due to serialization and deserialization. -You can override this by setting `RUN_MODE` to another value (e.g., `dev`), and providing the corresponding file: +{{< figure + src="/articles_data/dimension-reduction-qsoc/channels.png" + caption="Serialization and Deserialization Overhead" + alt="Image showing serialization and deserialization overhead due to message passing between threads" +>}} +To address this, I implemented a `SharedArrayBuffer`, allowing the main thread to access changes made by the worker thread instantly. This change led to noticeable improvements. +Additionally, the previous architecture resulted in choppy animations due to the fixed intervals at which the worker thread sent results. 
-```bash -docker run -p 6333:6333 \ - -v $(pwd)/dev.yaml:/qdrant/config/dev.yaml \ - -e RUN_MODE=dev \ - qdrant/qdrant +{{< figure + src="/articles_data/dimension-reduction-qsoc/prev_arch.png" + caption="Previous architecture with fixed intervals" + alt="Image showing the previous architecture of the frontend with fixed intervals for sending results" +>}} -``` +I introduced a "rendering-on-demand" approach, where the main thread would signal the worker thread when it was ready to render the next result. This created smoother, more responsive animations. -3. **Local Configuration: `config/local.yaml`** +{{< figure + src="/articles_data/dimension-reduction-qsoc/curr_arch.png" + caption="Current architecture with rendering-on-demand" + alt="Image showing the current architecture of the frontend with rendering-on-demand approach" +>}} -The `local.yaml` file is typically used for machine-specific settings that are not tracked in version control: +With these optimizations in place, the final step was wrapping up the project by creating a Node.js [package](https://www.npmjs.com/package/wasm-dist-bhtsne). This package exposed the necessary interfaces to accept the distance matrix, perform calculations, and return the results, making the solution easy to integrate into various projects. +## Areas for Improvement -```bash -docker run -p 6333:6333 \ - -v $(pwd)/local.yaml:/qdrant/config/local.yaml \ - qdrant/qdrant +While reflecting on this transformative journey, there are still areas that offer room for improvement and future enhancements: -``` +1. **Payload Parsing:** When requesting a large number of vectors, parsing the payload on the main thread can make the user interface unresponsive. Implementing a faster parser could mitigate this issue. -4. **Custom Configuration via `--config-path`** +2. **Direct Data Requests:** Allowing the worker thread to request data directly could eliminate the initial transfer of data from the main thread, speeding up the overall process. -You can specify a custom configuration file path using the `--config-path` argument. This will override other configuration files: +3. **Chart Library Optimization:** Profiling revealed that nearly 80% of the time was spent on the Chart.js update function. Switching to a WebGL-accelerated chart library could dramatically improve performance, especially for large datasets. +{{< figure + src="/articles_data/dimension-reduction-qsoc/profiling.png" + caption="Profiling Result" + alt="Image showing profiling results with 80% time spent on Chart.js update function" +>}} +## Conclusion -```bash -docker run -p 6333:6333 \ - -v $(pwd)/config.yaml:/path/to/config.yaml \ - qdrant/qdrant \ - ./qdrant --config-path /path/to/config.yaml +Participating in the Qdrant Summer of Code 2024 was a deeply rewarding experience. I had the chance to push the boundaries of my coding skills while exploring new technologies like Rust and WebAssembly. I'm incredibly grateful for the guidance and support from my mentor and the entire Qdrant team, who made this journey both educational and enjoyable. -``` +This experience has not only honed my technical skills but also ignited a deeper passion for optimizing performance in real-world applications. I’m excited to apply the knowledge and skills I've gained to future projects and to see how Qdrant's enhanced vector visualization feature will benefit users worldwide. +This experience has not only honed my technical skills but also ignited a deeper passion for optimizing performance in real-world applications. 
I’m excited to apply the knowledge and skills I've gained to future projects and to see how Qdrant's enhanced vector visualization feature will benefit users worldwide. -For details on how these configurations are loaded and merged, see the [loading order and priority](https://qdrant.tech/documentation/guides/configuration/#loading-order-and-priority). The full list of available configuration options can be found [below](https://qdrant.tech/documentation/guides/configuration/#configuration-options). +Thank you for joining me on this coding adventure. I hope you found something valuable in my journey, and I look forward to sharing more exciting projects with you in the future. Happy coding! -## [Anchor](https://qdrant.tech/documentation/guides/configuration/\#environment-variables) Environment Variables +<|page-51-lllmstxt|> +Qdrant is one of the fastest vector search engines out there, so while looking for a demo to show off, we came upon the idea to do a search-as-you-type box with a fully semantic search backend. Now we already have a semantic/keyword hybrid search on our website. But that one is written in Python, which incurs some overhead for the interpreter. Naturally, I wanted to see how fast I could go using Rust. -You can also configure Qdrant using environment variables, which always take the highest priority and override any file-based settings. +Since Qdrant doesn't embed by itself, I had to decide on an embedding model. The prior version used the [SentenceTransformers](https://www.sbert.net/) package, which in turn employs Bert-based [All-MiniLM-L6-V2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/tree/main) model. This model is battle-tested and delivers fair results at speed, so not experimenting on this front I took an [ONNX version](https://huggingface.co/optimum/all-MiniLM-L6-v2/tree/main) and ran that within the service. -Environment variables follow this format: they should be prefixed with `QDRANT__`, and nested properties should be separated by double underscores ( `__`). For example: +The workflow looks like this: -```bash -docker run -p 6333:6333 \ - -e QDRANT__LOG_LEVEL=INFO \ - -e QDRANT__SERVICE__API_KEY= \ - -e QDRANT__SERVICE__ENABLE_TLS=1 \ - -e QDRANT__TLS__CERT=./tls/cert.pem \ - qdrant/qdrant +![Search Qdrant by Embedding](/articles_data/search-as-you-type/Qdrant_Search_by_Embedding.png) -``` +This will, after tokenizing and embedding send a `/collections/site/points/search` POST request to Qdrant, sending the following JSON: -This results in the following configuration: +```json +POST collections/site/points/search +{ + "vector": [-0.06716014,-0.056464013, ...(382 values omitted)], + "limit": 5, + "with_payload": true, +} +``` -```yaml -log_level: INFO -service: - enable_tls: true - api_key: -tls: - cert: ./tls/cert.pem +Even with avoiding a network round-trip, the embedding still takes some time. As always in optimization, if you cannot do the work faster, a good solution is to avoid work altogether (please don't tell my employer). This can be done by pre-computing common prefixes and calculating embeddings for them, then storing them in a `prefix_cache` collection. Now the [`recommend`](https://api.qdrant.tech/api-reference/search/recommend-points) API method can find the best matches without doing any embedding. For now, I use short (up to and including 5 letters) prefixes, but I can also parse the logs to get the most common search terms and add them to the cache later. 
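As a rough illustration of that caching step, here is a minimal Python sketch (the actual service is written in Rust). It assumes a reachable Qdrant instance, an already created `prefix_cache` collection with the same vector size and distance as the `site` collection, and the same MiniLM model used for full queries; the `prefix_to_id` helper is one possible way to turn a short prefix into a numeric point ID:

```python
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
client = QdrantClient(url="http://localhost:6333")


def prefix_to_id(prefix: str) -> int:
    """Pack a short prefix (at most 8 bytes) into an unsigned 64-bit point ID."""
    data = prefix.encode("utf-8")
    assert len(data) <= 8, "longer prefixes would need hashing or UUIDs"
    return int.from_bytes(data, "little")


# In practice this would cover every prefix up to 5 letters and/or the most
# common search terms mined from the logs; a handful is enough for the sketch.
prefixes = ["q", "qd", "qdr", "vect", "searc"]

client.upsert(
    collection_name="prefix_cache",
    points=[
        models.PointStruct(id=prefix_to_id(p), vector=model.encode(p).tolist())
        for p in prefixes
    ],
)
```

Because the cache is keyed by the prefix itself, serving a prefix hit later is a plain ID lookup followed by a vector search, with no embedding work at query time.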
-``` +![Qdrant Recommendation](/articles_data/search-as-you-type/Qdrant_Recommendation.png) -## [Anchor](https://qdrant.tech/documentation/guides/configuration/\#loading-order-and-priority) Loading Order and Priority +Making that work requires setting up the `prefix_cache` collection with points that have the prefix as their `point_id` and the embedding as their `vector`, which lets us do the lookup with no search or index. The `prefix_to_id` function currently uses the `u64` variant of `PointId`, which can hold eight bytes, enough for this use. If the need arises, one could instead encode the names as UUID, hashing the input. Since I know all our prefixes are within 8 bytes, I decided against this for now. -During startup, Qdrant merges multiple configuration sources into a single effective configuration. The loading order is as follows (from least to most significant): +The `recommend` endpoint works roughly the same as `search_points`, but instead of searching for a vector, Qdrant searches for one or more points (you can also give negative example points the search engine will try to avoid in the results). It was built to help drive recommendation engines, saving the round-trip of sending the current point's vector back to Qdrant to find more similar ones. However Qdrant goes a bit further by allowing us to select a different collection to lookup the points, which allows us to keep our `prefix_cache` collection separate from the site data. So in our case, Qdrant first looks up the point from the `prefix_cache`, takes its vector and searches for that in the `site` collection, using the precomputed embeddings from the cache. The API endpoint expects a POST of the following JSON to `/collections/site/points/recommend`: -1. Embedded default configuration -2. `config/config.yaml` -3. `config/{RUN_MODE}.yaml` -4. `config/local.yaml` -5. Custom configuration file -6. Environment variables +```json +POST collections/site/points/recommend +{ + "positive": [1936024932], + "limit": 5, + "with_payload": true, + "lookup_from": { + "collection": "prefix_cache" + } +} +``` -### [Anchor](https://qdrant.tech/documentation/guides/configuration/\#overriding-behavior) Overriding Behavior +Now I have, in the best Rust tradition, a blazingly fast semantic search. -Settings from later sources in the list override those from earlier sources: +To demo it, I used our [Qdrant documentation website](/documentation/)'s page search, replacing our previous Python implementation. So in order to not just spew empty words, here is a benchmark, showing different queries that exercise different code paths. -- Settings in `config/{RUN_MODE}.yaml` (3) will override those in `config/config.yaml` (2). -- A custom configuration file provided via `--config-path` (5) will override all other file-based settings. -- Environment variables (6) have the highest priority and will override any settings from files. +Since the operations themselves are far faster than the network whose fickle nature would have swamped most measurable differences, I benchmarked both the Python and Rust services locally. I'm measuring both versions on the same AMD Ryzen 9 5900HX with 16GB RAM running Linux. The table shows the average time and error bound in milliseconds. I only measured up to a thousand concurrent requests. None of the services showed any slowdown with more requests in that range. I do not expect our service to become DDOS'd, so I didn't benchmark with more load. 
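To recap the cache-hit path before looking at the numbers: the snippet below is a minimal Python sketch of the lookup the Rust service performs, using the same `recommend` endpoint and `lookup_from` clause shown above. The URL, the collection names, and the exact byte order used to pack the prefix into a `u64` point ID are assumptions that mirror the description in this article:

```python
import requests

QDRANT_URL = "http://localhost:6333"


def prefix_id(prefix: str) -> int:
    # Pack up to 8 bytes of the prefix into a u64 point ID (byte order is an
    # assumption; any scheme works as long as it matches the prefix cache).
    return int.from_bytes(prefix.encode("utf-8"), "little")


# Resolve the typed prefix in "prefix_cache" and search "site" with its
# precomputed embedding - no embedding model is invoked at query time.
response = requests.post(
    f"{QDRANT_URL}/collections/site/points/recommend",
    json={
        "positive": [prefix_id("qdran")],
        "limit": 5,
        "with_payload": True,
        "lookup_from": {"collection": "prefix_cache"},
    },
)
hits = response.json()["result"]
```

The real service issues the same request from Rust, so the only cost on a cache hit is a single round-trip to Qdrant.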
-## [Anchor](https://qdrant.tech/documentation/guides/configuration/\#configuration-validation) Configuration Validation +Without further ado, here are the results: -Qdrant validates the configuration during startup. If any issues are found, the server will terminate immediately, providing information about the error. For example: -```console -Error: invalid type: 64-bit integer `-1`, expected an unsigned 64-bit or smaller integer for key `storage.hnsw_index.max_indexing_threads` in config/production.yaml +| query length | Short | Long | +|---------------|-----------|------------| +| Python 🐍 | 16 ± 4 ms | 16 ± 4 ms | +| Rust 🩀 | 1œ ± œ ms | 5 ± 1 ms | -``` +The Rust version consistently outperforms the Python version and offers a semantic search even on few-character queries. If the prefix cache is hit (as in the short query length), the semantic search can even get more than ten times faster than the Python version. The general speed-up is due to both the relatively lower overhead of Rust + Actix Web compared to Python + FastAPI (even if that already performs admirably), as well as using ONNX Runtime instead of SentenceTransformers for the embedding. The prefix cache gives the Rust version a real boost by doing a semantic search without doing any embedding work. -This ensures that misconfigurations are caught early, preventing Qdrant from running with invalid settings. +As an aside, while the millisecond differences shown here may mean relatively little for our users, whose latency will be dominated by the network in between, when typing, every millisecond more or less can make a difference in user perception. Also search-as-you-type generates between three and five times as much load as a plain search, so the service will experience more traffic. Less time per request means being able to handle more of them. -## [Anchor](https://qdrant.tech/documentation/guides/configuration/\#configuration-options) Configuration Options +Mission accomplished! But wait, there's more! -The following YAML example describes the available configuration options. +### Prioritizing Exact Matches and Headings -```yaml -log_level: INFO +To improve on the quality of the results, Qdrant can do multiple searches in parallel, and then the service puts the results in sequence, taking the first best matches. The extended code searches: -# Logging configuration -# Qdrant logs to stdout. You may configure to also write logs to a file on disk. -# Be aware that this file may grow indefinitely. -# logger: -# # Logging format, supports `text` and `json` -# format: text -# on_disk: -# enabled: true -# log_file: path/to/log/file.log -# log_level: INFO -# # Logging format, supports `text` and `json` -# format: text +1. Text matches in titles +2. Text matches in body (paragraphs or lists) +3. Semantic matches in titles +4. Any Semantic matches -storage: - # Where to store all the data - storage_path: ./storage +Those are put together by taking them in the above order, deduplicating as necessary. - # Where to store snapshots - snapshots_path: ./snapshots +![merge workflow](/articles_data/search-as-you-type/sayt_merge.png) - snapshots_config: - # "local" or "s3" - where to store snapshots - snapshots_storage: local - # s3_config: - # bucket: "" - # region: "" - # access_key: "" - # secret_key: "" +Instead of sending a `search` or `recommend` request, one can also send a `search/batch` or `recommend/batch` request, respectively. 
Each of those contain a `"searches"` property with any number of search/recommend JSON requests: - # Where to store temporary files - # If null, temporary snapshots are stored in: storage/snapshots_temp/ - temp_path: null +```json +POST collections/site/points/search/batch +{ + "searches": [ + { + "vector": [-0.06716014,-0.056464013, ...], + "filter": { + "must": [ + { "key": "text", "match": { "text": }}, + { "key": "tag", "match": { "any": ["h1", "h2", "h3"] }}, + ] + } + ..., + }, + { + "vector": [-0.06716014,-0.056464013, ...], + "filter": { + "must": [ { "key": "body", "match": { "text": }} ] + } + ..., + }, + { + "vector": [-0.06716014,-0.056464013, ...], + "filter": { + "must": [ { "key": "tag", "match": { "any": ["h1", "h2", "h3"] }} ] + } + ..., + }, + { + "vector": [-0.06716014,-0.056464013, ...], + ..., + }, + ] +} +``` - # If true - point payloads will not be stored in memory. - # It will be read from the disk every time it is requested. - # This setting saves RAM by (slightly) increasing the response time. - # Note: those payload values that are involved in filtering and are indexed - remain in RAM. - # - # Default: true - on_disk_payload: true +As the queries are done in a batch request, there isn't any additional network overhead and only very modest computation overhead, yet the results will be better in many cases. - # Maximum number of concurrent updates to shard replicas - # If `null` - maximum concurrency is used. - update_concurrency: null +The only additional complexity is to flatten the result lists and take the first 5 results, deduplicating by point ID. Now there is one final problem: The query may be short enough to take the recommend code path, but still not be in the prefix cache. In that case, doing the search *sequentially* would mean two round-trips between the service and the Qdrant instance. The solution is to *concurrently* start both requests and take the first successful non-empty result. - # Write-ahead-log related configuration - wal: - # Size of a single WAL segment - wal_capacity_mb: 32 +![sequential vs. concurrent flow](/articles_data/search-as-you-type/sayt_concurrency.png) - # Number of WAL segments to create ahead of actual data requirement - wal_segments_ahead: 0 +While this means more load for the Qdrant vector search engine, this is not the limiting factor. The relevant data is already in cache in many cases, so the overhead stays within acceptable bounds, and the maximum latency in case of prefix cache misses is measurably reduced. - # Normal node - receives all updates and answers all queries - node_type: "Normal" +The code is available on the [Qdrant github](https://github.com/qdrant/page-search) - # Listener node - receives all updates, but does not answer search/read queries - # Useful for setting up a dedicated backup node - # node_type: "Listener" +To sum up: Rust is fast, recommend lets us use precomputed embeddings, batch requests are awesome and one can do a semantic search in mere milliseconds. - performance: - # Number of parallel threads used for search operations. If 0 - auto selection. - max_search_threads: 0 +<|page-52-lllmstxt|> +# Vector Similarity: Unleashing Data Insights Beyond Traditional Search - # Max number of threads (jobs) for running optimizations across all collections, each thread runs one job. - # If 0 - have no limit and choose dynamically to saturate CPU. - # Note: each optimization job will also use `max_indexing_threads` threads by itself for index building. 
- max_optimization_threads: 0 +When making use of unstructured data, there are traditional go-to solutions that are well-known for developers: - # CPU budget, how many CPUs (threads) to allocate for an optimization job. - # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size - # If negative - subtract this number of CPUs from the available CPUs. - # If positive - use this exact number of CPUs. - optimizer_cpu_budget: 0 +- **Full-text search** when you need to find documents that contain a particular word or phrase. +- **[Vector search](https://qdrant.tech/documentation/overview/vector-search/)** when you need to find documents that are semantically similar to a given query. - # Prevent DDoS of too many concurrent updates in distributed mode. - # One external update usually triggers multiple internal updates, which breaks internal - # timings. For example, the health check timing and consensus timing. - # If null - auto selection. - update_rate_limit: null +Sometimes people mix those two approaches, so it might look like the vector similarity is just an extension of full-text search. However, in this article, we will explore some promising new techniques that can be used to expand the use-case of unstructured data and demonstrate that vector similarity creates its own stack of data exploration tools. - # Limit for number of incoming automatic shard transfers per collection on this node, does not affect user-requested transfers. - # The same value should be used on all nodes in a cluster. - # Default is to allow 1 transfer. - # If null - allow unlimited transfers. - #incoming_shard_transfers_limit: 1 +## What is vector similarity search? - # Limit for number of outgoing automatic shard transfers per collection on this node, does not affect user-requested transfers. - # The same value should be used on all nodes in a cluster. - # Default is to allow 1 transfer. - # If null - allow unlimited transfers. - #outgoing_shard_transfers_limit: 1 +Vector similarity offers a range of powerful functions that go far beyond those available in traditional full-text search engines. From dissimilarity search to diversity and recommendation, these methods can expand the cases in which vectors are useful. - # Enable async scorer which uses io_uring when rescoring. - # Only supported on Linux, must be enabled in your kernel. - # See: - #async_scorer: false +Vector Databases, which are designed to store and process immense amounts of vectors, are the first candidates to implement these new techniques and allow users to exploit their data to its fullest. - optimizers: - # The minimal fraction of deleted vectors in a segment, required to perform segment optimization - deleted_threshold: 0.2 - # The minimal number of vectors in a segment, required to perform segment optimization - vacuum_min_vector_number: 1000 +## Vector similarity search vs. full-text search - # Target amount of segments optimizer will try to keep. - # Real amount of segments may vary depending on multiple parameters: - # - Amount of stored points - # - Current write RPS - # - # It is recommended to select default number of segments as a factor of the number of search threads, - # so that each segment would be handled evenly by one of the threads. - # If `default_segment_number = 0`, will be automatically selected by the number of available CPUs - default_segment_number: 0 +While there is an intersection in the functionality of these two approaches, there is also a vast area of functions that is unique to each of them. 
+For example, the exact phrase matching and counting of results are native to full-text search, while vector similarity support for this type of operation is limited. +On the other hand, vector similarity easily allows cross-modal retrieval of images by text or vice-versa, which is impossible with full-text search. - # Do not create segments larger this size (in KiloBytes). - # Large segments might require disproportionately long indexation times, - # therefore it makes sense to limit the size of segments. - # - # If indexation speed have more priority for your - make this parameter lower. - # If search speed is more important - make this parameter higher. - # Note: 1Kb = 1 vector of size 256 - # If not set, will be automatically selected considering the number of available CPUs. - max_segment_size_kb: null +This mismatch in expectations might sometimes lead to confusion. +Attempting to use a vector similarity as a full-text search can result in a range of frustrations, from slow response times to poor search results, to limited functionality. +As an outcome, they are getting only a fraction of the benefits of vector similarity. - # Maximum size (in KiloBytes) of vectors to store in-memory per segment. - # Segments larger than this threshold will be stored as read-only memmapped file. - # To enable memmap storage, lower the threshold - # Note: 1Kb = 1 vector of size 256 - # To explicitly disable mmap optimization, set to `0`. - # If not set, will be disabled by default. - memmap_threshold_kb: null +{{< figure width=70% src=/articles_data/vector-similarity-beyond-search/venn-diagram.png caption="Full-text search and Vector Similarity Functionality overlap" >}} - # Maximum size (in KiloBytes) of vectors allowed for plain index. - # Default value based on https://github.com/google-research/google-research/blob/master/scann/docs/algorithms.md - # Note: 1Kb = 1 vector of size 256 - # To explicitly disable vector indexing, set to `0`. - # If not set, the default value will be used. - indexing_threshold_kb: 20000 +Below we will explore why the vector similarity stack deserves new interfaces and design patterns that will unlock the full potential of this technology, which can still be used in conjunction with full-text search. - # Interval between forced flushes. - flush_interval_sec: 5 - # Max number of threads (jobs) for running optimizations per shard. - # Note: each optimization job will also use `max_indexing_threads` threads by itself for index building. - # If null - have no limit and choose dynamically to saturate CPU. - # If 0 - no optimization threads, optimizations will be disabled. - max_optimization_threads: null +## New ways to interact with similarities - # This section has the same options as 'optimizers' above. All values specified here will overwrite the collections - # optimizers configs regardless of the config above and the options specified at collection creation. - #optimizers_overwrite: - # deleted_threshold: 0.2 - # vacuum_min_vector_number: 1000 - # default_segment_number: 0 - # max_segment_size_kb: null - # memmap_threshold_kb: null - # indexing_threshold_kb: 20000 - # flush_interval_sec: 5 - # max_optimization_threads: null +Having a vector representation of unstructured data unlocks new ways of interacting with it. +For example, it can be used to measure semantic similarity between words, to cluster words or documents based on their meaning, to find related images, or even to generate new text. 
+However, these interactions can go beyond finding their nearest neighbors (kNN). - # Default parameters of HNSW Index. Could be overridden for each collection or named vector individually - hnsw_index: - # Number of edges per node in the index graph. Larger the value - more accurate the search, more space required. - m: 16 +There are several other techniques that can be leveraged by vector representations beyond the traditional kNN search. These include dissimilarity search, diversity search, recommendations, and discovery functions. - # Number of neighbours to consider during the index building. Larger the value - more accurate the search, more time required to build index. - ef_construct: 100 - # Minimal size threshold (in KiloBytes) below which full-scan is preferred over HNSW search. - # This measures the total size of vectors being queried against. - # When the maximum estimated amount of points that a condition satisfies is smaller than - # `full_scan_threshold_kb`, the query planner will use full-scan search instead of HNSW index - # traversal for better performance. - # Note: 1Kb = 1 vector of size 256 - full_scan_threshold_kb: 10000 +## Dissimilarity ssearch - # Number of parallel threads used for background index building. - # If 0 - automatically select. - # Best to keep between 8 and 16 to prevent likelihood of building broken/inefficient HNSW graphs. - # On small CPUs, less threads are used. - max_indexing_threads: 0 +The Dissimilarity —or farthest— search is the most straightforward concept after the nearest search, which can’t be reproduced in a traditional full-text search. +It aims to find the most un-similar or distant documents across the collection. - # Store HNSW index on disk. If set to false, index will be stored in RAM. Default: false - on_disk: false - # Custom M param for hnsw graph built for payload index. If not set, default M will be used. - payload_m: null +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/dissimilarity.png caption="Dissimilarity Search" >}} - # Default shard transfer method to use if none is defined. - # If null - don't have a shard transfer preference, choose automatically. - # If stream_records, snapshot or wal_delta - prefer this specific method. - # More info: https://qdrant.tech/documentation/guides/distributed_deployment/#shard-transfer-method - shard_transfer_method: null +Unlike full-text match, Vector similarity can compare any pair of documents (or points) and assign a similarity score. +It doesn’t rely on keywords or other metadata. +With vector similarity, we can easily achieve a dissimilarity search by inverting the search objective from maximizing similarity to minimizing it. - # Default parameters for collections - collection: - # Number of replicas of each shard that network tries to maintain - replication_factor: 1 +The dissimilarity search can find items in areas where previously no other search could be used. +Let’s look at a few examples. - # How many replicas should apply the operation for us to consider it successful - write_consistency_factor: 1 +### Case: mislabeling detection - # Default parameters for vectors. - vectors: - # Whether vectors should be stored in memory or on disk. - on_disk: null +For example, we have a dataset of furniture in which we have classified our items into what kind of furniture they are: tables, chairs, lamps, etc. +To ensure our catalog is accurate, we can use a dissimilarity search to highlight items that are most likely mislabeled. 
- # shard_number_per_node: 1 +To do this, we only need to search for the most dissimilar items using the +embedding of the category title itself as a query. +This can be too broad, so, by combining it with filters —a [Qdrant superpower](/articles/filtrable-hnsw/)—, we can narrow down the search to a specific category. - # Default quantization configuration. - # More info: https://qdrant.tech/documentation/guides/quantization - quantization: null - # Default strict mode parameters for newly created collections. - strict_mode: - # Whether strict mode is enabled for a collection or not. - enabled: false +{{< figure src=/articles_data/vector-similarity-beyond-search/mislabelling.png caption="Mislabeling Detection" >}} - # Max allowed `limit` parameter for all APIs that don't have their own max limit. - max_query_limit: null +The output of this search can be further processed with heavier models or human supervision to detect actual mislabeling. - # Max allowed `timeout` parameter. - max_timeout: null +### Case: outlier detection - # Allow usage of unindexed fields in retrieval based (eg. search) filters. - unindexed_filtering_retrieve: null +In some cases, we might not even have labels, but it is still possible to try to detect anomalies in our dataset. +Dissimilarity search can be used for this purpose as well. - # Allow usage of unindexed fields in filtered updates (eg. delete by payload). - unindexed_filtering_update: null +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/anomaly-detection.png caption="Anomaly Detection" >}} - # Max HNSW value allowed in search parameters. - search_max_hnsw_ef: null +The only thing we need is a bunch of reference points that we consider "normal". +Then we can search for the most dissimilar points to this reference set and use them as candidates for further analysis. - # Whether exact search is allowed or not. - search_allow_exact: null - # Max oversampling value allowed in search. - search_max_oversampling: null +## Diversity search -service: - # Maximum size of POST data in a single request in megabytes - max_request_size_mb: 32 +Even with no input provided vector, (dis-)similarity can improve an overall selection of items from the dataset. - # Number of parallel workers used for serving the api. If 0 - equal to the number of available cores. - # If missing - Same as storage.max_search_threads - max_workers: 0 +The naive approach is to do random sampling. +However, unless our dataset has a uniform distribution, the results of such sampling might be biased toward more frequent types of items. - # Host to bind the service on - host: 0.0.0.0 +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/diversity-random.png caption="Example of random sampling" >}} - # HTTP(S) port to bind the service on - http_port: 6333 - # gRPC port to bind the service on. - # If `null` - gRPC is disabled. Default: null - # Comment to disable gRPC: - grpc_port: 6334 +The similarity information can increase the diversity of those results and make the first overview more interesting. +That is especially useful when users do not yet know what they are looking for and want to explore the dataset. - # Enable CORS headers in REST API. - # If enabled, browsers would be allowed to query REST endpoints regardless of query origin. 
- # More info: https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS - # Default: true - enable_cors: true +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/diversity-force.png caption="Example of similarity-based sampling" >}} - # Enable HTTPS for the REST and gRPC API - enable_tls: false - # Check user HTTPS client certificate against CA file specified in tls config - verify_https_client_certificate: false +The power of vector similarity, in the context of being able to compare any two points, allows making a diverse selection of the collection possible without any labeling efforts. +By maximizing the distance between all points in the response, we can have an algorithm that will sequentially output dissimilar results. - # Set an api-key. - # If set, all requests must include a header with the api-key. - # example header: `api-key: ` - # - # If you enable this you should also enable TLS. - # (Either above or via an external service like nginx.) - # Sending an api-key over an unencrypted channel is insecure. - # - # Uncomment to enable. - # api_key: your_secret_api_key_here +{{< figure src=/articles_data/vector-similarity-beyond-search/diversity.png caption="Diversity Search" >}} - # Set an api-key for read-only operations. - # If set, all requests must include a header with the api-key. - # example header: `api-key: ` - # - # If you enable this you should also enable TLS. - # (Either above or via an external service like nginx.) - # Sending an api-key over an unencrypted channel is insecure. - # - # Uncomment to enable. - # read_only_api_key: your_secret_read_only_api_key_here - # Uncomment to enable JWT Role Based Access Control (RBAC). - # If enabled, you can generate JWT tokens with fine-grained rules for access control. - # Use generated token instead of API key. - # - # jwt_rbac: true +Some forms of diversity sampling are already used in the industry and are known as [Maximum Margin Relevance](https://python.langchain.com/docs/integrations/vectorstores/qdrant#maximum-marginal-relevance-search-mmr) (MMR). Techniques like this were developed to enhance similarity on a universal search API. +However, there is still room for new ideas, particularly regarding diversity retrieval. +By utilizing more advanced vector-native engines, it could be possible to take use cases to the next level and achieve even better results. - # Hardware reporting adds information to the API responses with a - # hint on how many resources were used to execute the request. - # - # Uncomment to enable. - # hardware_reporting: true -cluster: - # Use `enabled: true` to run Qdrant in distributed deployment mode - enabled: false +## Vector similarity recommendations - # Configuration of the inter-cluster communication - p2p: - # Port for internal communication between peers - port: 6335 +Vector similarity can go above a single query vector. +It can combine multiple positive and negative examples for a more accurate retrieval. +Building a recommendation API in a vector database can take advantage of using already stored vectors as part of the queries, by specifying the point id. +Doing this, we can skip query-time neural network inference, and make the recommendation search faster. - # Use TLS for communication between peers - enable_tls: false +There are multiple ways to implement recommendations with vectors. - # Configuration related to distributed consensus algorithm - consensus: - # How frequently peers should ping each other. 
- # Setting this parameter to lower value will allow consensus - # to detect disconnected nodes earlier, but too frequent - # tick period may create significant network and CPU overhead. - # We encourage you NOT to change this parameter unless you know what you are doing. - tick_period_ms: 100 +### Vector-features recommendations -# Set to true to prevent service from sending usage statistics to the developers. -# Read more: https://qdrant.tech/documentation/guides/telemetry -telemetry_disabled: false +The first approach is to take all positive and negative examples and average them to create a single query vector. +In this technique, the more significant components of positive vectors are canceled out by the negative ones, and the resulting vector is a combination of all the features present in the positive examples, but not in the negative ones. -# TLS configuration. -# Required if either service.enable_tls or cluster.p2p.enable_tls is true. -tls: - # Server certificate chain file - cert: ./tls/cert.pem +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/feature-based-recommendations.png caption="Vector-Features Based Recommendations" >}} - # Server private key file - key: ./tls/key.pem +This approach is already implemented in Qdrant, and while it works great when the vectors are assumed to have each of their dimensions represent some kind of feature of the data, sometimes distances are a better tool to judge negative and positive examples. - # Certificate authority certificate file. - # This certificate will be used to validate the certificates - # presented by other nodes during inter-cluster communication. - # - # If verify_https_client_certificate is true, it will verify - # HTTPS client certificate - # - # Required if cluster.p2p.enable_tls is true. - ca_cert: ./tls/cacert.pem +### Relative distance recommendations - # TTL in seconds to reload certificate from disk, useful for certificate rotations. - # Only works for HTTPS endpoints. Does not support gRPC (and intra-cluster communication). - # If `null` - TTL is disabled. - cert_ttl: 3600 +Another approach is to use the distance between negative examples to the candidates to help them create exclusion areas. +In this technique, we perform searches near the positive examples while excluding the points that are closer to a negative example than to a positive one. -``` +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/relative-distance-recommendations.png caption="Relative Distance Recommendations" >}} + +The main use-case of both approaches —of course— is to take some history of user interactions and recommend new items based on it. -##### Was this page useful? +## Discovery -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +In many exploration scenarios, the desired destination is not known in advance. +The search process in this case can consist of multiple steps, where each step would provide a little more information to guide the search in the right direction. -Thank you for your feedback! 🙏 +To get more intuition about the possible ways to implement this approach, let’s take a look at how similarity modes are trained in the first place: -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/configuration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +The most well-known loss function used to train similarity models is a [triplet-loss](https://en.wikipedia.org/wiki/Triplet_loss). +In this loss, the model is trained by fitting the information of relative similarity of 3 objects: the Anchor, Positive, and Negative examples. -On this page: +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/triplet-loss.png caption="Triplet Loss" >}} -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/configuration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Using the same mechanics, we can look at the training process from the other side. +Given a trained model, the user can provide positive and negative examples, and the goal of the discovery process is then to find suitable anchors across the stored collection of vectors. -× + +{{< figure width=60% src=/articles_data/vector-similarity-beyond-search/discovery.png caption="Reversed triplet loss" >}} -[Powered by](https://qdrant.tech/) +Multiple positive-negative pairs can be provided to make the discovery process more accurate. +Worth mentioning, that as well as in NN training, the dataset may contain noise and some portion of contradictory information, so a discovery process should be tolerant of this kind of data imperfections. -<|page-38-lllmstxt|> -## collections -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Collections -# [Anchor](https://qdrant.tech/documentation/concepts/collections/\#collections) Collections + +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/discovery-noise.png caption="Sample pairs" >}} -A collection is a named set of points (vectors with a payload) among which you can search. The vector of each point within the same collection must have the same dimensionality and be compared by a single metric. [Named vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors) can be used to have multiple vectors in a single point, each of which can have their own dimensionality and metric requirements. +The important difference between this and the recommendation method is that the positive-negative pairs in the discovery method don’t assume that the final result should be close to positive, it only assumes that it should be closer than the negative one. -Distance metrics are used to measure similarities among vectors. -The choice of metric depends on the way vectors obtaining and, in particular, on the method of neural network encoder training. +{{< figure width=80% src=/articles_data/vector-similarity-beyond-search/discovery-vs-recommendations.png caption="Discovery vs Recommendation" >}} -Qdrant supports these most popular types of metrics: +In combination with filtering or similarity search, the additional context information provided by the discovery pairs can be used as a re-ranking factor. 
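To make the pair semantics concrete, here is a small, purely illustrative Python sketch (not Qdrant's internal implementation) that scores candidates by how many positive/negative pairs they satisfy, i.e. for how many pairs the candidate lies closer to the positive than to the negative example:

```python
import numpy as np


def context_score(candidate: np.ndarray, pairs: list[tuple[np.ndarray, np.ndarray]]) -> int:
    """Count the pairs for which the candidate is closer to the positive example."""
    score = 0
    for positive, negative in pairs:
        if np.linalg.norm(candidate - positive) < np.linalg.norm(candidate - negative):
            score += 1
    return score


# Toy 2-D "embeddings" so the behaviour is easy to inspect.
candidates = np.random.rand(100, 2)
pairs = [
    (np.array([0.9, 0.1]), np.array([0.1, 0.9])),
    (np.array([0.8, 0.2]), np.array([0.2, 0.8])),
]

# Rank candidates by how many pairs they satisfy; a high score does not mean
# the candidate is close to any positive example, only that it falls on the
# "positive side" of each pair, which is exactly the difference from
# recommendations described above.
ranked = sorted(candidates, key=lambda c: context_score(c, pairs), reverse=True)
```

In practice the candidate set would come from a filtered or similarity search over the collection, with this pair-based score acting as the re-ranking factor mentioned above.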
-- Dot product: `Dot` \- [\[wiki\]](https://en.wikipedia.org/wiki/Dot_product) -- Cosine similarity: `Cosine` \- [\[wiki\]](https://en.wikipedia.org/wiki/Cosine_similarity) -- Euclidean distance: `Euclid` \- [\[wiki\]](https://en.wikipedia.org/wiki/Euclidean_distance) -- Manhattan distance: `Manhattan` \- [\[wiki\]](https://en.wikipedia.org/wiki/Taxicab_geometry) +## A new API stack for vector databases -In addition to metrics and vector size, each collection uses its own set of parameters that controls collection optimization, index construction, and vacuum. -These settings can be changed at any time by a corresponding request. +When you introduce vector similarity capabilities into your text search engine, you extend its functionality. +However, it doesn't work the other way around, as the vector similarity as a concept is much broader than some task-specific implementations of full-text search. -## [Anchor](https://qdrant.tech/documentation/concepts/collections/\#setting-up-multitenancy) Setting up multitenancy +[Vector databases](https://qdrant.tech/), which introduce built-in full-text functionality, must make several compromises: -**How many collections should you create?** In most cases, you should only use a single collection with payload-based partitioning. This approach is called [multitenancy](https://en.wikipedia.org/wiki/Multitenancy). It is efficient for most of users, but it requires additional configuration. [Learn how to set it up](https://qdrant.tech/documentation/tutorials/multiple-partitions/) +- Choose a specific full-text search variant. +- Either sacrifice API consistency or limit vector similarity functionality to only basic kNN search. +- Introduce additional complexity to the system. -**When should you create multiple collections?** When you have a limited number of users and you need isolation. This approach is flexible, but it may be more costly, since creating numerous collections may result in resource overhead. Also, you need to ensure that they do not affect each other in any way, including performance-wise. +Qdrant, on the contrary, puts vector similarity in the center of its API and architecture, such that it allows us to move towards a new stack of vector-native operations. +We believe that this is the future of vector databases, and we are excited to see what new use-cases will be unlocked by these techniques. -## [Anchor](https://qdrant.tech/documentation/concepts/collections/\#create-a-collection) Create a collection +## Key takeaways: -httpbashpythontypescriptrustjavacsharpgo +- Vector similarity offers advanced data exploration tools beyond traditional full-text search, including dissimilarity search, diversity sampling, and recommendation systems. +- Practical applications of vector similarity include improving data quality through mislabeling detection and anomaly identification. +- Enhanced user experiences are achieved by leveraging advanced search techniques, providing users with intuitive data exploration, and improving decision-making processes. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 300, - "distance": "Cosine" - } -} +Ready to unlock the full potential of your data? [Try a free demo](https://qdrant.tech/contact-us/) to explore how vector similarity can revolutionize your data insights and drive smarter decision-making. -``` +<|page-53-lllmstxt|> +Do you want to insert a semantic search function into your website or online app? Now you can do so - without spending any money! 
In this example, you will learn how to create a free prototype search engine for your own non-commercial purposes. -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "size": 300, - "distance": "Cosine" - } - }' +## Ingredients -``` +* A [Rust](https://rust-lang.org) toolchain +* [cargo lambda](https://cargo-lambda.info) (install via package manager, [download](https://github.com/cargo-lambda/cargo-lambda/releases) binary or `cargo install cargo-lambda`) +* The [AWS CLI](https://aws.amazon.com/cli) +* Qdrant instance ([free tier](https://cloud.qdrant.io) available) +* An embedding provider service of your choice (see our [Embeddings docs](/documentation/embeddings/). You may be able to get credits from [AI Grant](https://aigrant.org), also Cohere has a [rate-limited non-commercial free tier](https://cohere.com/pricing)) +* AWS Lambda account (12-month free tier available) -```python -from qdrant_client import QdrantClient, models +## What you're going to build -client = QdrantClient(url="http://localhost:6333") +You'll combine the embedding provider and the Qdrant instance to a neat semantic search, calling both services from a small Lambda function. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=100, distance=models.Distance.COSINE), -) +![lambda integration diagram](/articles_data/serverless/lambda_integration.png) -``` +Now lets look at how to work with each ingredient before connecting them. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +## Rust and cargo-lambda -const client = new QdrantClient({ host: "localhost", port: 6333 }); +You want your function to be quick, lean and safe, so using Rust is a no-brainer. To compile Rust code for use within Lambda functions, the `cargo-lambda` subcommand has been built. `cargo-lambda` can put your Rust code in a zip file that AWS Lambda can then deploy on a no-frills `provided.al2` runtime. -client.createCollection("{collection_name}", { - vectors: { size: 100, distance: "Cosine" }, -}); +To interface with AWS Lambda, you will need a Rust project with the following dependencies in your `Cargo.toml`: +```toml +[dependencies] +tokio = { version = "1", features = ["macros"] } +lambda_http = { version = "0.8", default-features = false, features = ["apigw_http"] } +lambda_runtime = "0.8" ``` -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{CreateCollectionBuilder, VectorParamsBuilder}; +This gives you an interface consisting of an entry point to start the Lambda runtime and a way to register your handler for HTTP calls. 
Put the following snippet into `src/helloworld.rs`: -let client = Qdrant::from_url("http://localhost:6334").build()?; +```rust +use lambda_http::{run, service_fn, Body, Error, Request, RequestExt, Response}; -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(100, Distance::Cosine)), - ) - .await?; +/// This is your callback function for responding to requests at your URL +async fn function_handler(_req: Request) -> Result, Error> { + Response::from_text("Hello, Lambda!") +} +#[tokio::main] +async fn main() { + run(service_fn(function_handler)).await +} ``` -```java -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; - -QdrantClient client = new QdrantClient( - QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +You can also use a closure to bind other arguments to your function handler (the `service_fn` call then becomes `service_fn(|req| function_handler(req, ...))`). Also if you want to extract parameters from the request, you can do so using the [Request](https://docs.rs/lambda_http/latest/lambda_http/type.Request.html) methods (e.g. `query_string_parameters` or `query_string_parameters_ref`). -client.createCollectionAsync("{collection_name}", - VectorParams.newBuilder().setDistance(Distance.Cosine).setSize(100).build()).get(); +Add the following to your `Cargo.toml` to define the binary: +```toml +[[bin]] +name = "helloworld" +path = "src/helloworld.rs" ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +On the AWS side, you need to setup a Lambda and IAM role to use with your function. -var client = new QdrantClient("localhost", 6334); +![create lambda web page](/articles_data/serverless/create_lambda.png) -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 100, Distance = Distance.Cosine } -); +Choose your function name, select "Provide your own bootstrap on Amazon Linux 2". As architecture, use `arm64`. You will also activate a function URL. Here it is up to you if you want to protect it via IAM or leave it open, but be aware that open end points can be accessed by anyone, potentially costing money if there is too much traffic. -``` +By default, this will also create a basic role. To look up the role, you can go into the Function overview: -```go -import ( - "context" +![function overview](/articles_data/serverless/lambda_overview.png) - "github.com/qdrant/go-client/qdrant" -) +Click on the "Info" link near the "▾ Function overview" heading, and select the "Permissions" tab on the left. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +You will find the "Role name" directly under *Execution role*. Note it down for later. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 100, - Distance: qdrant.Distance_Cosine, - }), -}) +![function overview](/articles_data/serverless/lambda_role.png) + +To test that your "Hello, Lambda" service works, you can compile and upload the function: +```bash +$ export LAMBDA_FUNCTION_NAME=hello +$ export LAMBDA_ROLE= +$ export LAMBDA_REGION=us-east-1 +$ cargo lambda build --release --arm --bin helloworld --output-format zip + Downloaded libc v0.2.137 +# [..] 
output omitted for brevity + Finished release [optimized] target(s) in 1m 27s +$ # Delete the old empty definition +$ aws lambda delete-function-url-config --region $LAMBDA_REGION --function-name $LAMBDA_FUNCTION_NAME +$ aws lambda delete-function --region $LAMBDA_REGION --function-name $LAMBDA_FUNCTION_NAME +$ # Upload the function +$ aws lambda create-function --function-name $LAMBDA_FUNCTION_NAME \ + --handler bootstrap \ + --architectures arm64 \ + --zip-file fileb://./target/lambda/helloworld/bootstrap.zip \ + --runtime provided.al2 \ + --region $LAMBDA_REGION \ + --role $LAMBDA_ROLE \ + --tracing-config Mode=Active +$ # Add the function URL +$ aws lambda add-permission \ + --function-name $LAMBDA_FUNCTION_NAME \ + --action lambda:InvokeFunctionUrl \ + --principal "*" \ + --function-url-auth-type "NONE" \ + --region $LAMBDA_REGION \ + --statement-id url +$ # Here for simplicity unauthenticated URL access. Beware! +$ aws lambda create-function-url-config \ + --function-name $LAMBDA_FUNCTION_NAME \ + --region $LAMBDA_REGION \ + --cors "AllowOrigins=*,AllowMethods=*,AllowHeaders=*" \ + --auth-type NONE ``` -In addition to the required options, you can also specify custom values for the following collection options: +Now you can go to your *Function Overview* and click on the Function URL. You should see something like this: -- `hnsw_config` \- see [indexing](https://qdrant.tech/documentation/concepts/indexing/#vector-index) for details. -- `wal_config` \- Write-Ahead-Log related configuration. See more details about [WAL](https://qdrant.tech/documentation/concepts/storage/#versioning) -- `optimizers_config` \- see [optimizer](https://qdrant.tech/documentation/concepts/optimizer/) for details. -- `shard_number` \- which defines how many shards the collection should have. See [distributed deployment](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding) section for details. -- `on_disk_payload` \- defines where to store payload data. If `true` \- payload will be stored on disk only. Might be useful for limiting the RAM usage in case of large payload. -- `quantization_config` \- see [quantization](https://qdrant.tech/documentation/guides/quantization/#setting-up-quantization-in-qdrant) for details. -- `strict_mode_config` \- see [strict mode](https://qdrant.tech/documentation/guides/administration/#strict-mode) for details. +```text +Hello, Lambda! +``` -Default parameters for the optional collection parameters are defined in [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml). +Bearer ! You have set up a Lambda function in Rust. On to the next ingredient: -See [schema definitions](https://api.qdrant.tech/api-reference/collections/create-collection) and a [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml) for more information about collection and vector parameters. +## Embedding -_Available as of v1.2.0_ +Most providers supply a simple https GET or POST interface you can use with an API key, which you have to supply in an authentication header. If you are using this for non-commercial purposes, the rate limited trial key from Cohere is just a few clicks away. Go to [their welcome page](https://dashboard.cohere.ai/welcome/register), register and you'll be able to get to the dashboard, which has an "API keys" menu entry which will bring you to the following page: + [cohere dashboard](/articles_data/serverless/cohere-dashboard.png) -Vectors all live in RAM for very quick access. 
The `on_disk` parameter can be -set in the vector configuration. If true, all vectors will live on disk. This -will enable the use of -[memmaps](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage), -which is suitable for ingesting a large amount of data. +From there you can click on the ⎘ symbol next to your API key to copy it to the clipboard. *Don't put your API key in the code!* Instead read it from an env variable you can set in the lambda environment. This avoids accidentally putting your key into a public repo. Now all you need to get embeddings is a bit of code. First you need to extend your dependencies with `reqwest` and also add `anyhow` for easier error handling: -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#create-collection-from-another-collection) Create collection from another collection +```toml +anyhow = "1.0" +reqwest = { version = "0.11.18", default-features = false, features = ["json", "rustls-tls"] } +serde = "1.0" +``` -_Available as of v1.0.0_ +Now given the API key from above, you can make a call to get the embedding vectors: -It is possible to initialize a collection from another existing collection. +```rust +use anyhow::Result; +use serde::Deserialize; +use reqwest::Client; -This might be useful for experimenting quickly with different configurations for the same data set. +#[derive(Deserialize)] +struct CohereResponse { outputs: Vec> } -Make sure the vectors have the same `size` and `distance` function when setting up the vectors configuration in the new collection. If you used the previous sample -code, `"size": 300` and `"distance": "Cosine"`. +pub async fn embed(client: &Client, text: &str, api_key: &str) -> Result>> { + let CohereResponse { outputs } = client + .post("https://api.cohere.ai/embed") + .header("Authorization", &format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .header("Cohere-Version", "2021-11-08") + .body(format!("{{\"text\":[\"{text}\"],\"model\":\"small\"}}")) + .send() + .await? + .json() + .await?; + Ok(outputs) +} +``` -httpbashpythontypescriptrustjavacsharpgo +Note that this may return multiple vectors if the text overflows the input dimensions. +Cohere's `small` model has 1024 output dimensions. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 100, - "distance": "Cosine" - }, - "init_from": { - "collection": "{from_collection_name}" - } -} +Other providers have similar interfaces. Consult our [Embeddings docs](/documentation/embeddings/) for further information. See how little code it took to get the embedding? +While you're at it, it's a good idea to write a small test to check if embedding works and the vectors are of the expected size: + +```rust +#[tokio::test] +async fn check_embedding() { + // ignore this test if API_KEY isn't set + let Ok(api_key) = &std::env::var("API_KEY") else { return; } + let embedding = crate::embed("What is semantic search?", api_key).unwrap()[0]; + // Cohere's `small` model has 1024 output dimensions. + assert_eq!(1024, embedding.len()); +} ``` -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "size": 300, - "distance": "Cosine" - }, - "init_from": { - "collection": {from_collection_name} - } - }' +Run this while setting the `API_KEY` environment variable to check if the embedding works. + +## Qdrant search + +Now that you have embeddings, it's time to put them into your Qdrant. 
You could of course use `curl` or `python` to set up your collection and upload the points, but as you already have Rust including some code to obtain the embeddings, you can stay in Rust, adding `qdrant-client` to the mix. + +```rust +use anyhow::Result; +use qdrant_client::prelude::*; +use qdrant_client::qdrant::{VectorsConfig, VectorParams}; +use qdrant_client::qdrant::vectors_config::Config; +use std::collections::HashMap; + +fn setup<'i>( + embed_client: &reqwest::Client, + embed_api_key: &str, + qdrant_url: &str, + api_key: Option<&str>, + collection_name: &str, + data: impl Iterator)>, +) -> Result<()> { + let mut config = QdrantClientConfig::from_url(qdrant_url); + config.api_key = api_key; + let client = QdrantClient::new(Some(config))?; + // create the collections + if !client.has_collection(collection_name).await? { + client + .create_collection(&CreateCollection { + collection_name: collection_name.into(), + vectors_config: Some(VectorsConfig { + config: Some(Config::Params(VectorParams { + size: 1024, // output dimensions from above + distance: Distance::Cosine as i32, + ..Default::default() + })), + }), + ..Default::default() + }) + .await?; + } + let mut id_counter = 0_u64; + let points = data.map(|(text, payload)| { + let id = std::mem::replace(&mut id_counter, *id_counter + 1); + let vectors = Some(embed(embed_client, text, embed_api_key).unwrap()); + PointStruct { id, vectors, payload } + }).collect(); + client.upsert_points(collection_name, points, None).await?; + Ok(()) +} ``` -```python -from qdrant_client import QdrantClient, models +Depending on whether you want to efficiently filter the data, you can also add some indexes. I'm leaving this out for brevity. Also this does not implement chunking (splitting the data to upsert in multiple requests, which avoids timeout errors). -client = QdrantClient(url="http://localhost:6333") +Add a suitable `main` method and you can run this code to insert the points (or just use the binary from the example). Be sure to include the port in the `qdrant_url`. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=100, distance=models.Distance.COSINE), - init_from=models.InitFrom(collection="{from_collection_name}"), -) +Now that you have the points inserted, you can search them by embedding: +```rust +use anyhow::Result; +use qdrant_client::prelude::*; +pub async fn search( + text: &str, + collection_name: String, + client: &Client, + api_key: &str, + qdrant: &QdrantClient, +) -> Result> { + Ok(qdrant.search_points(&SearchPoints { + collection_name, + limit: 5, // use what fits your use case here + with_payload: Some(true.into()), + vector: embed(client, text, api_key)?, + ..Default::default() + }).await?.result) +} ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +You can also filter by adding a `filter: ...` field to the `SearchPoints`, and you will likely want to process the result further, but the example code already does that, so feel free to start from there in case you need this functionality. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +## Putting it all together -client.createCollection("{collection_name}", { - vectors: { size: 100, distance: "Cosine" }, - init_from: { collection: "{from_collection_name}" }, -}); +Now that you have all the parts, it's time to join them up. Now copying and wiring up the snippets above is left as an exercise to the reader. 
+ +You'll want to extend the `main` method a bit to connect with the Client once at the start, also get API keys from the environment so you don't need to compile them into the code. To do that, you can get them with `std::env::var(_)` from the rust code and set the environment from the AWS console. +```bash +$ export QDRANT_URI= +$ export QDRANT_API_KEY= +$ export COHERE_API_KEY= +$ export COLLECTION_NAME=site-cohere +$ aws lambda update-function-configuration \ + --function-name $LAMBDA_FUNCTION_NAME \ + --environment "Variables={QDRANT_URI=$QDRANT_URI,\ + QDRANT_API_KEY=$QDRANT_API_KEY,COHERE_API_KEY=${COHERE_API_KEY},\ + COLLECTION_NAME=${COLLECTION_NAME}"` ``` -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +In any event, you will arrive at one command line program to insert your data and one Lambda function. The former can just be `cargo run` to set up the collection. For the latter, you can again call `cargo lambda` and the AWS console: -let client = Qdrant::from_url("http://localhost:6334").build()?; +```bash +$ export LAMBDA_FUNCTION_NAME=search +$ export LAMBDA_REGION=us-east-1 +$ cargo lambda build --release --arm --output-format zip + Downloaded libc v0.2.137 +# [..] output omitted for brevity + Finished release [optimized] target(s) in 1m 27s +$ # Update the function +$ aws lambda update-function-code --function-name $LAMBDA_FUNCTION_NAME \ + --zip-file fileb://./target/lambda/page-search/bootstrap.zip \ + --region $LAMBDA_REGION +``` -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(100, Distance::Cosine)) - .init_from_collection("{from_collection_name}"), - ) - .await?; +## Discussion -``` +Lambda works by spinning up your function once the URL is called, so they don't need to keep the compute on hand unless it is actually used. This means that the first call will be burdened by some 1-2 seconds of latency for loading the function, later calls will resolve faster. Of course, there is also the latency for calling the embeddings provider and Qdrant. On the other hand, the free tier doesn't cost a thing, so you certainly get what you pay for. And for many use cases, a result within one or two seconds is acceptable. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +Rust minimizes the overhead for the function, both in terms of file size and runtime. Using an embedding service means you don't need to care about the details. Knowing the URL, API key and embedding size is sufficient. Finally, with free tiers for both Lambda and Qdrant as well as free credits for the embedding provider, the only cost is your time to set everything up. Who could argue with free? 
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +<|page-54-lllmstxt|> +A brand-new [Qdrant 1.3.0 release](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) comes packed with a plethora of new features, performance improvements and bux fixes: -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(100) - .setDistance(Distance.Cosine) - .build())) - .setInitFromCollection("{from_collection_name}") - .build()) - .get(); +1. Asynchronous I/O interface: Reduce overhead by managing I/O operations asynchronously, thus minimizing context switches. +2. Oversampling for Quantization: Improve the accuracy and performance of your queries while using Scalar or Product Quantization. +3. Grouping API lookup: Storage optimization method that lets you look for points in another collection using group ids. +4. Qdrant Web UI: A convenient dashboard to help you manage data stored in Qdrant. +5. Temp directory for Snapshots: Set a separate storage directory for temporary snapshots on a faster disk. +6. Other important changes -``` +Your feedback is valuable to us, and are always tying to include some of your feature requests into our roadmap. Join [our Discord community](https://qdrant.to/discord) and help us build Qdrant!. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +## New features -var client = new QdrantClient("localhost", 6334); +### Asychronous I/O interface -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 100, Distance = Distance.Cosine }, - initFromCollection: "{from_collection_name}" -); +Going forward, we will support the `io_uring` asychnronous interface for storage devices on Linux-based systems. Since its introduction, `io_uring` has been proven to speed up slow-disk deployments as it decouples kernel work from the IO process. -``` + -```go -import ( - "context" +This interface uses two ring buffers to queue and manage I/O operations asynchronously, avoiding costly context switches and reducing overhead. Unlike mmap, it frees the user threads to do computations instead of waiting for the kernel to complete. - "github.com/qdrant/go-client/qdrant" -) +![io_uring](/articles_data/qdrant-1.3.x/io-uring.png) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 100, - Distance: qdrant.Distance_Cosine, - }), - InitFromCollection: qdrant.PtrOf("{from_collection_name}"), -}) +#### Enable the interface from your config file: + +```yaml +storage: + # enable the async scorer which uses io_uring + async_scorer: true ``` +You can return to the mmap based backend by either deleting the `async_scorer` entry or setting the value to `false`. -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#collection-with-multiple-vectors) Collection with multiple vectors +This optimization will mainly benefit workloads with lots of disk IO (e.g. querying on-disk collections with rescoring). +Please keep in mind that this feature is experimental and that the interface may change in further versions. 
-_Available as of v0.10.0_ +### Oversampling for quantization -It is possible to have multiple vectors per record. -This feature allows for multiple vector storages per collection. -To distinguish vectors in one record, they should have a unique name defined when creating the collection. -Each named vector in this mode has its distance and size: +We are introducing [oversampling](/documentation/guides/quantization/#oversampling) as a new way to help you improve the accuracy and performance of similarity search algorithms. With this method, you are able to significantly compress high-dimensional vectors in memory and then compensate the accuracy loss by re-scoring additional points with the original vectors. + +You will experience much faster performance with quantization due to parallel disk usage when reading vectors. Much better IO means that you can keep quantized vectors in RAM, so the pre-selection will be even faster. Finally, once pre-selection is done, you can use parallel IO to retrieve original vectors, which is significantly faster than traversing HNSW on slow disks. + +#### Set the oversampling factor via query: -httpbashpythontypescriptrustjavacsharpgo +Here is how you can configure the oversampling factor - define how many extra vectors should be pre-selected using the quantized index, and then re-scored using original vectors. ```http -PUT /collections/{collection_name} +POST /collections/{collection_name}/points/search { - "vectors": { - "image": { - "size": 4, - "distance": "Dot" - }, - "text": { - "size": 8, - "distance": "Cosine" - } + "params": { + "quantization": { + "ignore": false, + "rescore": true, + "oversampling": 2.4 } + }, + "vector": [0.2, 0.1, 0.9, 0.7], + "limit": 100 } - -``` - -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "image": { - "size": 4, - "distance": "Dot" - }, - "text": { - "size": 8, - "distance": "Cosine" - } - } - }' - ``` ```python -from qdrant_client import QdrantClient, models +from qdrant_client import QdrantClient +from qdrant_client.http import models -client = QdrantClient(url="http://localhost:6333") +client = QdrantClient("localhost", port=6333) -client.create_collection( +client.search( collection_name="{collection_name}", - vectors_config={ - "image": models.VectorParams(size=4, distance=models.Distance.DOT), - "text": models.VectorParams(size=8, distance=models.Distance.COSINE), - }, + query_vector=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + ignore=False, + rescore=True, + oversampling=2.4 + ) + ) ) - ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +In this case, if `oversampling` is 2.4 and `limit` is 100, then 240 vectors will be pre-selected using quantized index, and then the top 100 points will be returned after re-scoring with the unquantized vectors. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +As you can see from the example above, this parameter is set during the query. This is a flexible method that will let you tune query accuracy. While the index is not changed, you can decide how many points you want to retrieve using quantized vectors. -client.createCollection("{collection_name}", { - vectors: { - image: { size: 4, distance: "Dot" }, - text: { size: 8, distance: "Cosine" }, - }, -}); +### Grouping API lookup -``` +In version 1.2.0, we introduced a mechanism for requesting groups of points. 
Our new feature extends this functionality by giving you the option to look for points in another collection using the group ids. We wanted to add this feature, since having a single point for the shared data of the same item optimizes storage use, particularly if the payload is large. -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, VectorParamsBuilder, VectorsConfigBuilder, -}; +This has the extra benefit of having a single point to update when the information shared by the points in a group changes. -let client = Qdrant::from_url("http://localhost:6334").build()?; +![Group Lookup](/articles_data/qdrant-1.3.x/group-lookup.png) -let mut vectors_config = VectorsConfigBuilder::default(); -vectors_config - .add_named_vector_params("image", VectorParamsBuilder::new(4, Distance::Dot).build()); -vectors_config.add_named_vector_params( - "text", - VectorParamsBuilder::new(8, Distance::Cosine).build(), -); +For example, if you have a collection of documents, you may want to chunk them and store the points for the chunks in a separate collection, making sure that you store the point id from the document it belongs in the payload of the chunk point. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}").vectors_config(vectors_config), - ) - .await?; +#### Adding the parameter to grouping API request: -``` +When using the grouping API, add the `with_lookup` parameter to bring the information from those points into each group: -```java -import java.util.Map; +```http +POST /collections/chunks/points/search/groups +{ + // Same as in the regular search API + "vector": [1.1], + ..., -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; + // Grouping parameters + "group_by": "document_id", + "limit": 2, + "group_size": 2, -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + // Lookup parameters + "with_lookup": { + // Name of the collection to look up points in + "collection_name": "documents", -client - .createCollectionAsync( - "{collection_name}", - Map.of( - "image", VectorParams.newBuilder().setSize(4).setDistance(Distance.Dot).build(), - "text", - VectorParams.newBuilder().setSize(8).setDistance(Distance.Cosine).build())) - .get(); + // Options for specifying what to bring from the payload + // of the looked up point, true by default + "with_payload": ["title", "text"], + // Options for specifying what to bring from the vector(s) + // of the looked up point, true by default + "with_vectors: false, + } +} ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +```python +client.search_groups( + collection_name="chunks", -var client = new QdrantClient("localhost", 6334); + # Same as in the regular search() API + query_vector=[1.1], + ..., + + # Grouping parameters + group_by="document_id", # Path of the field to group by + limit=2, # Max amount of groups + group_size=2, # Max amount of points per group -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParamsMap - { - Map = - { - ["image"] = new VectorParams { Size = 4, Distance = Distance.Dot }, - ["text"] = new VectorParams { Size = 8, Distance = Distance.Cosine }, - } - } -); + # Lookup parameters + with_lookup=models.WithLookup( + # Name of the collection to look up points in + collection_name="documents", + # Options 
for specifying what to bring from the payload + # of the looked up point, True by default + with_payload=["title", "text"] + + # Options for specifying what to bring from the vector(s) + # of the looked up point, True by default + with_vectors=False, + ) +) ``` -```go -import ( - "context" +### Qdrant web user interface - "github.com/qdrant/go-client/qdrant" -) +We are excited to announce a more user-friendly way to organize and work with your collections inside of Qdrant. Our dashboard's design is simple, but very intuitive and easy to access. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Try it out now! If you have Docker running, you can [quickstart Qdrant](/documentation/quick-start/) and access the Dashboard locally from [http://localhost:6333/dashboard](http://localhost:6333/dashboard). You should see this simple access point to Qdrant: -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfigMap( - map[string]*qdrant.VectorParams{ - "image": { - Size: 4, - Distance: qdrant.Distance_Dot, - }, - "text": { - Size: 8, - Distance: qdrant.Distance_Cosine, - }, - }), -}) +![Qdrant Web UI](/articles_data/qdrant-1.3.x/web-ui.png) -``` +### Temporary directory for Snapshots -For rare use cases, it is possible to create a collection without any vector storage. +Currently, temporary snapshot files are created inside the `/storage` directory. Oftentimes `/storage` is a network-mounted disk. Therefore, we found this method suboptimal because `/storage` is limited in disk size and also because writing data to it may affect disk performance as it consumes bandwidth. This new feature allows you to specify a different directory on another disk that is faster. We expect this feature to significantly optimize cloud performance. -_Available as of v1.1.1_ +To change it, access `config.yaml` and set `storage.temp_path` to another directory location. -For each named vector you can optionally specify -[`hnsw_config`](https://qdrant.tech/documentation/concepts/indexing/#vector-index) or -[`quantization_config`](https://qdrant.tech/documentation/guides/quantization/#setting-up-quantization-in-qdrant) to -deviate from the collection configuration. This can be useful to fine-tune -search performance on a vector level. +## Important changes -_Available as of v1.2.0_ +The latest release focuses not only on the new features but also introduces some changes making +Qdrant even more reliable. -Vectors all live in RAM for very quick access. On a per-vector basis you can set -`on_disk` to true to store all vectors on disk at all times. This will enable -the use of -[memmaps](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage), -which is suitable for ingesting a large amount of data. +### Optimizing group requests -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#vector-datatypes) Vector datatypes +Internally, `is_empty` was not using the index when it was called, so it had to deserialize the whole payload to see if the key had values or not. Our new update makes sure to check the index first, before confirming with the payload if it is actually `empty`/`null`, so these changes improve performance only when the negated condition is true (e.g. it improves when the field is not empty). Going forward, this will improve the way grouping API requests are handled. 
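To make the affected case concrete, here is a small sketch of a query that benefits from this change: a filter with a negated `is_empty` condition (i.e. "the field is not empty"), written with the Python client. The collection name and payload key are hypothetical, and model names may differ slightly between client versions:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Only return points whose "tags" payload field is present and non-empty.
points, _next_offset = client.scroll(
    collection_name="documents",  # hypothetical collection
    scroll_filter=models.Filter(
        must_not=[
            models.IsEmptyCondition(is_empty=models.PayloadField(key="tags")),
        ]
    ),
    limit=100,
)
```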
-_Available as of v1.9.0_ +### Faster read access with mmap -Some embedding providers may provide embeddings in a pre-quantized format. -One of the most notable examples is the [Cohere int8 & binary embeddings](https://cohere.com/blog/int8-binary-embeddings). -Qdrant has direct support for uint8 embeddings, which you can also use in combination with binary quantization. +If you used mmap, you most likely found that segments were always created with cold caches. The first request to the database needed to request the disk, which made startup slower despite plenty of RAM being available. We have implemeneted a way to ask the kernel to "heat up" the disk cache and make initialization much faster. -To create a collection with uint8 embeddings, you can use the following configuration: +The function is expected to be used on startup and after segment optimization and reloading of newly indexed segment. So far this is only implemented for "immutable" memmaps. -httpbashpythontypescriptrustjavacsharpgo +## Release notes -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1024, - "distance": "Cosine", - "datatype": "uint8" - } -} +As usual, [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) describe all the changes +introduced in the latest version. -``` +<|page-55-lllmstxt|> +With Qdrant [version 1.3.0](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) we +introduce the alternative io\_uring based *async uring* storage backend on +Linux-based systems. Since its introduction, io\_uring has been known to improve +async throughput wherever the OS syscall overhead gets too high, which tends to +occur in situations where software becomes *IO bound* (that is, mostly waiting +on disk). -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "size": 1024, - "distance": "Cosine", - "datatype": "uint8" - } - }' +## Input+Output -``` +Around the mid-90s, the internet took off. The first servers used a process- +per-request setup, which was good for serving hundreds if not thousands of +concurrent request. The POSIX Input + Output (IO) was modeled in a strictly +synchronous way. The overhead of starting a new process for each request made +this model unsustainable. So servers started forgoing process separation, opting +for the thread-per-request model. But even that ran into limitations. -```python -from qdrant_client import QdrantClient, models +I distinctly remember when someone asked the question whether a server could +serve 10k concurrent connections, which at the time exhausted the memory of +most systems (because every thread had to have its own stack and some other +metadata, which quickly filled up available memory). As a result, the +synchronous IO was replaced by asynchronous IO during the 2.5 kernel update, +either via `select` or `epoll` (the latter being Linux-only, but a small bit +more efficient, so most servers of the time used it). -client = QdrantClient(url="http://localhost:6333") +However, even this crude form of asynchronous IO carries the overhead of at +least one system call per operation. Each system call incurs a context switch, +and while this operation is itself not that slow, the switch disturbs the +caches. Today's CPUs are much faster than memory, but if their caches start to +miss data, the memory accesses required led to longer and longer wait times for +the CPU. 
-client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=1024, - distance=models.Distance.COSINE, - datatype=models.Datatype.UINT8, - ), -) +### Memory-mapped IO -``` +Another way of dealing with file IO (which unlike network IO doesn't have a hard +time requirement) is to map parts of files into memory - the system fakes having +that chunk of the file in memory, so when you read from a location there, the +kernel interrupts your process to load the needed data from disk, and resumes +your process once done, whereas writing to the memory will also notify the +kernel. Also the kernel can prefetch data while the program is running, thus +reducing the likelyhood of interrupts. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Thus there is still some overhead, but (especially in asynchronous +applications) it's far less than with `epoll`. The reason this API is rarely +used in web servers is that these usually have a large variety of files to +access, unlike a database, which can map its own backing store into memory +once. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### Combating the Poll-ution -client.createCollection("{collection_name}", { - vectors: { - image: { size: 1024, distance: "Cosine", datatype: "uint8" }, - }, -}); +There were multiple experiments to improve matters, some even going so far as +moving a HTTP server into the kernel, which of course brought its own share of +problems. Others like Intel added their own APIs that ignored the kernel and +worked directly on the hardware. -``` +Finally, Jens Axboe took matters into his own hands and proposed a ring buffer +based interface called *io\_uring*. The buffers are not directly for data, but +for operations. User processes can setup a Submission Queue (SQ) and a +Completion Queue (CQ), both of which are shared between the process and the +kernel, so there's no copying overhead. -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Datatype, Distance, VectorParamsBuilder, -}; +![io_uring diagram](/articles_data/io_uring/io-uring.png) -let client = Qdrant::from_url("http://localhost:6334").build()?; +Apart from avoiding copying overhead, the queue-based architecture lends +itself to multithreading as item insertion/extraction can be made lockless, +and once the queues are set up, there is no further syscall that would stop +any user thread. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}").vectors_config( - VectorParamsBuilder::new(1024, Distance::Cosine).datatype(Datatype::Uint8), - ), - ) - .await?; +Servers that use this can easily get to over 100k concurrent requests. Today +Linux allows asynchronous IO via io\_uring for network, disk and accessing +other ports, e.g. for printing or recording video. -``` +## And what about Qdrant? -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.grpc.Collections.Datatype; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; +Qdrant can store everything in memory, but not all data sets may fit, which can +require storing on disk. Before io\_uring, Qdrant used mmap to do its IO. This +led to some modest overhead in case of disk latency. The kernel may +stop a user thread trying to access a mapped region, which incurs some context +switching overhead plus the wait time until the disk IO is finished. 
Ultimately, +this works very well with the asynchronous nature of Qdrant's core. -QdrantClient client = new QdrantClient( - QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +One of the great optimizations Qdrant offers is quantization (either +[scalar](/articles/scalar-quantization/) or +[product](/articles/product-quantization/)-based). +However unless the collection resides fully in memory, this optimization +method generates significant disk IO, so it is a prime candidate for possible +improvements. -client - .createCollectionAsync("{collection_name}", - VectorParams.newBuilder() - .setSize(1024) - .setDistance(Distance.Cosine) - .setDatatype(Datatype.Uint8) - .build()) - .get(); +If you run Qdrant on Linux, you can enable io\_uring with the following in your +configuration: +```yaml +# within the storage config +storage: + # enable the async scorer which uses io_uring + async_scorer: true ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +You can return to the mmap based backend by either deleting the `async_scorer` +entry or setting the value to `false`. -var client = new QdrantClient("localhost", 6334); +## Benchmarks -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { - Size = 1024, Distance = Distance.Cosine, Datatype = Datatype.Uint8 - } -); +To run the benchmark, use a test instance of Qdrant. If necessary spin up a +docker container and load a snapshot of the collection you want to benchmark +with. You can copy and edit our [benchmark script](/articles_data/io_uring/rescore-benchmark.sh) +to run the benchmark. Run the script with and without enabling +`storage.async_scorer` and once. You can measure IO usage with `iostat` from +another console. -``` +For our benchmark, we chose the laion dataset picking 5 million 768d entries. +We enabled scalar quantization + HNSW with m=16 and ef_construct=512. +We do the quantization in RAM, HNSW in RAM but keep the original vectors on +disk (which was a network drive rented from Hetzner for the benchmark). -```go -import ( - "context" +If you want to reproduce the benchmarks, you can get snapshots containing the +datasets: - "github.com/qdrant/go-client/qdrant" -) +* [mmap only](https://storage.googleapis.com/common-datasets-snapshots/laion-768-6m-mmap.snapshot) +* [with scalar quantization](https://storage.googleapis.com/common-datasets-snapshots/laion-768-6m-sq-m16-mmap.shapshot) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Running the benchmark, we get the following IOPS, CPU loads and wall clock times: -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 1024, - Distance: qdrant.Distance_Cosine, - Datatype: qdrant.Datatype_Uint8.Enum(), - }), -}) +| | oversampling | parallel | ~max IOPS | CPU% (of 4 cores) | time (s) (avg of 3) | +|----------|--------------|----------|-----------|-------------------|---------------------| +| io_uring | 1 | 4 | 4000 | 200 | 12 | +| mmap | 1 | 4 | 2000 | 93 | 43 | +| io_uring | 1 | 8 | 4000 | 200 | 12 | +| mmap | 1 | 8 | 2000 | 90 | 43 | +| io_uring | 4 | 8 | 7000 | 100 | 30 | +| mmap | 4 | 8 | 2300 | 50 | 145 | -``` -Vectors with `uint8` datatype are stored in a more compact format, which can save memory and improve search speed at the cost of some precision. 
-If you choose to use the `uint8` datatype, elements of the vector will be stored as unsigned 8-bit integers, which can take values **from 0 to 255**. +Note that in this case, the IO operations have relatively high latency due to +using a network disk. Thus, the kernel takes more time to fulfil the mmap +requests, and application threads need to wait, which is reflected in the CPU +percentage. On the other hand, with the io\_uring backend, the application +threads can better use available cores for the rescore operation without any +IO-induced delays. + +Oversampling is a new feature to improve accuracy at the cost of some +performance. It allows setting a factor, which is multiplied with the `limit` +while doing the search. The results are then re-scored using the original vector +and only then the top results up to the limit are selected. -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#collection-with-sparse-vectors) Collection with sparse vectors +## Discussion -_Available as of v1.7.0_ +Looking back, disk IO used to be very serialized; re-positioning read-write +heads on moving platter was a slow and messy business. So the system overhead +didn't matter as much, but nowadays with SSDs that can often even parallelize +operations while offering near-perfect random access, the overhead starts to +become quite visible. While memory-mapped IO gives us a fair deal in terms of +ease of use and performance, we can improve on the latter in exchange for +some modest complexity increase. -Qdrant supports sparse vectors as a first-class citizen. +io\_uring is still quite young, having only been introduced in 2019 with kernel +5.1, so some administrators will be wary of introducing it. Of course, as with +performance, the right answer is usually "it depends", so please review your +personal risk profile and act accordingly. -Sparse vectors are useful for text search, where each word is represented as a separate dimension. +## Best Practices -Collections can contain sparse vectors as additional [named vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors) along side regular dense vectors in a single point. +If your on-disk collection's query performance is of sufficiently high +priority to you, enable the io\_uring-based async\_scorer to greatly reduce +operating system overhead from disk IO. On the other hand, if your +collections are in memory only, activating it will be ineffective. Also note +that many queries are not IO bound, so the overhead may or may not become +measurable in your workload. Finally, on-device disks typically carry lower +latency than network drives, which may also affect mmap overhead. -Unlike dense vectors, sparse vectors must be named. -And additionally, sparse vectors and dense vectors must have different names within a collection. +Therefore before you roll out io\_uring, perform the above or a similar +benchmark with both mmap and io\_uring and measure both wall time and IOps). +Benchmarks are always highly use-case dependent, so your mileage may vary. +Still, doing that benchmark once is a small price for the possible performance +wins. Also please +[tell us](https://discord.com/channels/907569970500743200/907569971079569410) +about your benchmark results! 
-httpbashpythontypescriptrustjavacsharpgo +<|page-56-lllmstxt|> +# Product Quantization Demystified: Streamlining Efficiency in Data Management -```http -PUT /collections/{collection_name} -{ - "sparse_vectors": { - "text": { } - } -} +Qdrant 1.1.0 brought the support of [Scalar Quantization](/articles/scalar-quantization/), +a technique of reducing the memory footprint by even four times, by using `int8` to represent +the values that would be normally represented by `float32`. -``` +The memory usage in [vector search](https://qdrant.tech/solutions/) might be reduced even further! Please welcome **Product +Quantization**, a brand-new feature of Qdrant 1.2.0! -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "sparse_vectors": { - "text": { } - } - }' +## What is Product Quantization? -``` +Product Quantization converts floating-point numbers into integers like every other quantization +method. However, the process is slightly more complicated than [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/) and is more customizable, so you can find the sweet spot between memory usage and search precision. This article +covers all the steps required to perform Product Quantization and the way it's implemented in Qdrant. -```python -from qdrant_client import QdrantClient, models +## How Does Product Quantization Work? -client = QdrantClient(url="http://localhost:6333") +Let’s assume we have a few vectors being added to the collection and that our optimizer decided +to start creating a new segment. -client.create_collection( - collection_name="{collection_name}", - vectors_config={}, - sparse_vectors_config={ - "text": models.SparseVectorParams(), - }, -) +![A list of raw vectors](/articles_data/product-quantization/raw-vectors.png) -``` +### Cutting the vector into pieces -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +First of all, our vectors are going to be divided into **chunks** aka **subvectors**. The number +of chunks is configurable, but as a rule of thumb - the lower it is, the higher the compression rate. +That also comes with reduced search precision, but in some cases, you may prefer to keep the memory +usage as low as possible. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +![A list of chunked vectors](/articles_data/product-quantization/chunked-vectors.png) -client.createCollection("{collection_name}", { - sparse_vectors: { - text: { }, - }, -}); +Qdrant API allows choosing the compression ratio from 4x up to 64x. In our example, we selected 16x, +so each subvector will consist of 4 floats (16 bytes), and it will eventually be represented by +a single byte. -``` +### Clustering -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{ - CreateCollectionBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, -}; +The chunks of our vectors are then used as input for clustering. Qdrant uses the K-means algorithm, +with $ K = 256 $. It was selected a priori, as this is the maximum number of values a single byte +represents. As a result, we receive a list of 256 centroids for each chunk and assign each of them +a unique id. 
**The clustering is done separately for each group of chunks.** -let client = Qdrant::from_url("http://localhost:6334").build()?; +![Clustered chunks of vectors](/articles_data/product-quantization/chunks-clustering.png) -let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); +Each chunk of a vector might now be mapped to the closest centroid. That’s where we lose the precision, +as a single point will only represent a whole subspace. Instead of using a subvector, we can store +the id of the closest centroid. If we repeat that for each chunk, we can approximate the original +embedding as a vector of subsequent ids of the centroids. The dimensionality of the created vector +is equal to the number of chunks, in our case 2. -sparse_vector_config.add_named_vector_params("text", SparseVectorParamsBuilder::default()); +![A new vector built from the ids of the centroids](/articles_data/product-quantization/vector-of-ids.png) -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vector_config), - ) - .await?; +### Full process -``` +All those steps build the following pipeline of Product Quantization: -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.SparseVectorConfig; -import io.qdrant.client.grpc.Collections.SparseVectorParams; +![Full process of Product Quantization](/articles_data/product-quantization/full-process.png) -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +## Measuring the distance -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setSparseVectorsConfig( - SparseVectorConfig.newBuilder() - .putMap("text", SparseVectorParams.getDefaultInstance())) - .build()) - .get(); +Vector search relies on the distances between the points. Enabling Product Quantization slightly changes +the way it has to be calculated. The query vector is divided into chunks, and then we figure the overall +distance as a sum of distances between the subvectors and the centroids assigned to the specific id of +the vector we compare to. We know the coordinates of the centroids, so that's easy. -``` +![Calculating the distance of between the query and the stored vector](/articles_data/product-quantization/distance-calculation.png) -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +#### Qdrant implementation -var client = new QdrantClient("localhost", 6334); +Search operation requires calculating the distance to multiple points. Since we calculate the +distance to a finite set of centroids, those might be precomputed and reused. Qdrant creates +a lookup table for each query, so it can then simply sum up several terms to measure the +distance between a query and all the centroids. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - sparseVectorsConfig: ("text", new SparseVectorParams()) -); +| | Centroid 0 | Centroid 1 | ... | +|-------------|------------|------------|-----| +| **Chunk 0** | 0.14213 | 0.51242 | | +| **Chunk 1** | 0.08421 | 0.00142 | | +| **...** | ... | ... | ... | -``` +## Product Quantization Benchmarks -```go -import ( - "context" +Product Quantization comes with a cost - there are some additional operations to perform so +that the performance might be reduced. However, memory usage might be reduced drastically as +well. 
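As a reference point before the numbers, enabling Product Quantization is a single extra parameter at collection creation time. A minimal sketch with the Python client, assuming a recent `qdrant-client`; the collection name is hypothetical and exact model names may vary by version:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="pq_demo",  # hypothetical collection
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    quantization_config=models.ProductQuantization(
        product=models.ProductQuantizationConfig(
            compression_ratio=models.CompressionRatio.X16,  # ratios from x4 up to x64
            always_ram=True,  # keep the compressed vectors in RAM
        )
    ),
)
```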
As usual, we did some benchmarks to give you a brief understanding of what you may expect. - "github.com/qdrant/go-client/qdrant" -) +Again, we reused the same pipeline as in [the other benchmarks we published](/benchmarks/). We +selected [Arxiv-titles-384-angular-no-filters](https://github.com/qdrant/ann-filtering-benchmark-datasets) +and [Glove-100](https://github.com/erikbern/ann-benchmarks/) datasets to measure the impact +of Product Quantization on precision and time. Both experiments were launched with $ EF = 128 $. +The results are summarized in the tables: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +#### Glove-100 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
|                         | Original | 1D clusters | 2D clusters | 3D clusters |
|-------------------------|----------|-------------|-------------|-------------|
| Mean precision          | 0.7158   | 0.7143      | 0.6731      | 0.5854      |
| Mean search time        | 2336 µs  | 2750 µs     | 2597 µs     | 2534 µs     |
| Compression             | x1       | x4          | x8          | x12         |
| Upload & indexing time  | 147 s    | 339 s       | 217 s       | 178 s       |
+ +Product Quantization increases both indexing and searching time. The higher the compression ratio, +the lower the search precision. The main benefit is undoubtedly the reduced usage of memory. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "text": {}, - }), -}) +#### Arxiv-titles-384-angular-no-filters + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
|                         | Original | 1D clusters | 2D clusters | 4D clusters | 8D clusters |
|-------------------------|----------|-------------|-------------|-------------|-------------|
| Mean precision          | 0.9837   | 0.9677      | 0.9143      | 0.8068      | 0.6618      |
| Mean search time        | 2719 µs  | 4134 µs     | 2947 µs     | 2175 µs     | 2053 µs     |
| Compression             | x1       | x4          | x8          | x16         | x32         |
| Upload & indexing time  | 332 s    | 921 s       | 597 s       | 481 s       | 474 s       |
+ +It turns out that in some cases, Product Quantization may not only reduce the memory usage, +but also the search time. -``` +## Product Quantization vs Scalar Quantization -Outside of a unique name, there are no required configuration parameters for sparse vectors. +Compared to [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/), Product Quantization offers a higher compression rate. However, this comes with considerable trade-offs in accuracy, and at times, in-RAM search speed. -The distance function for sparse vectors is always `Dot` and does not need to be specified. +Product Quantization tends to be favored in certain specific scenarios: -However, there are optional parameters to tune the underlying [sparse vector index](https://qdrant.tech/documentation/concepts/indexing/#sparse-vector-index). +- Deployment in a low-RAM environment where the limiting factor is the number of disk reads rather than the vector comparison itself +- Situations where the dimensionality of the original vectors is sufficiently high +- Cases where indexing speed is not a critical factor -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#check-collection-existence) Check collection existence +In circumstances that do not align with the above, Scalar Quantization should be the preferred choice. -_Available as of v1.8.0_ +## Using Qdrant for Product Quantization -httpbashpythontypescriptrustjavacsharpgo -```http -GET http://localhost:6333/collections/{collection_name}/exists +If you’re already a Qdrant user, we have, documentation on [Product Quantization](/documentation/guides/quantization/#setting-up-product-quantization) that will help you to set and configure the new quantization for your data and achieve even +up to 64x memory reduction. -``` +Ready to experience the power of Product Quantization? [Sign up now](https://cloud.qdrant.io/signup) for a free Qdrant demo and optimize your data management today! -```bash -curl -X GET http://localhost:6333/collections/{collection_name}/exists +<|page-57-lllmstxt|> +# Efficiency Unleashed: The Power of Scalar Quantization -``` +High-dimensional vector embeddings can be memory-intensive, especially when working with +large datasets consisting of millions of vectors. Memory footprint really starts being +a concern when we scale things up. A simple choice of the data type used to store a single +number impacts even billions of numbers and can drive the memory requirements crazy. The +higher the precision of your type, the more accurately you can represent the numbers. +The more accurate your vectors, the more precise is the distance calculation. But the +advantages stop paying off when you need to order more and more memory. -```python -client.collection_exists(collection_name="{collection_name}") +Qdrant chose `float32` as a default type used to store the numbers of your embeddings. +So a single number needs 4 bytes of the memory and a 512-dimensional vector occupies +2 kB. That's only the memory used to store the vector. There is also an overhead of the +HNSW graph, so as a rule of thumb we estimate the memory size with the following formula: +```text +memory_size = 1.5 * number_of_vectors * vector_dimension * 4 bytes ``` -```typescript -client.collectionExists("{collection_name}"); +While Qdrant offers various options to store some parts of the data on disk, starting +from version 1.1.0, you can also optimize your memory by compressing the embeddings. +We've implemented the mechanism of **Scalar Quantization**! 
It turns out to have not +only a positive impact on memory but also on the performance. -``` +## Scalar quantization -```rust -client.collection_exists("{collection_name}").await?; +Scalar quantization is a data compression technique that converts floating point values +into integers. In case of Qdrant `float32` gets converted into `int8`, so a single number +needs 75% less memory. It's not a simple rounding though! It's a process that makes that +transformation partially reversible, so we can also revert integers back to floats with +a small loss of precision. -``` +### Theoretical background -```java -client.collectionExistsAsync("{collection_name}").get(); +Assume we have a collection of `float32` vectors and denote a single value as `f32`. +In reality neural embeddings do not cover a whole range represented by the floating +point numbers, but rather a small subrange. Since we know all the other vectors, we can +establish some statistics of all the numbers. For example, the distribution of the values +will be typically normal: -``` +![A distribution of the vector values](/articles_data/scalar-quantization/float32-distribution.png) -```csharp -await client.CollectionExistsAsync("{collection_name}"); +Our example shows that 99% of the values come from a `[-2.0, 5.0]` range. And the +conversion to `int8` will surely lose some precision, so we rather prefer keeping the +representation accuracy within the range of 99% of the most probable values and ignoring +the precision of the outliers. There might be a different choice of the range width, +actually, any value from a range `[0, 1]`, where `0` means empty range, and `1` would +keep all the values. That's a hyperparameter of the procedure called `quantile`. A value +of `0.95` or `0.99` is typically a reasonable choice, but in general `quantile ∈ [0, 1]`. -``` +#### Conversion to integers -```go -import "context" +Let's talk about the conversion to `int8`. Integers also have a finite set of values that +might be represented. Within a single byte they may represent up to 256 different values, +either from `[-128, 127]` or `[0, 255]`. -client.CollectionExists(context.Background(), "my_collection") +![Value ranges represented by int8](/articles_data/scalar-quantization/int8-value-range.png) -``` +Since we put some boundaries on the numbers that might be represented by the `f32`, and +`i8` has some natural boundaries, the process of converting the values between those +two ranges is quite natural: -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#delete-collection) Delete collection +$$ f32 = \alpha \times i8 + offset $$ -httpbashpythontypescriptrustjavacsharpgo +$$ i8 = \frac{f32 - offset}{\alpha} $$ -```http -DELETE http://localhost:6333/collections/{collection_name} +The parameters $ \alpha $ and $ offset $ has to be calculated for a given set of vectors, +but that comes easily by putting the minimum and maximum of the represented range for +both `f32` and `i8`. 
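To make that concrete, here is a minimal sketch (not Qdrant's internal implementation) that derives $ \alpha $ and $ offset $ for the unsigned `int8` case from the example `[-2.0, 5.0]` range above and round-trips a few values; the exact boundary equations are spelled out right below:

```python
import numpy as np

# Assumed 0.99-quantile range of the float32 values from the example above.
f32_min, f32_max = -2.0, 5.0
i8_min, i8_max = 0, 255  # unsigned int8

# Solve f32 = alpha * i8 + offset at the two boundary points.
alpha = (f32_max - f32_min) / (i8_max - i8_min)
offset = f32_min - alpha * i8_min

def quantize(values: np.ndarray) -> np.ndarray:
    # i8 = (f32 - offset) / alpha, clipped to the representable range
    return np.clip(np.round((values - offset) / alpha), i8_min, i8_max).astype(np.uint8)

def dequantize(codes: np.ndarray) -> np.ndarray:
    # f32 ≈ alpha * i8 + offset
    return alpha * codes.astype(np.float32) + offset

original = np.array([-1.5, 0.0, 2.3, 4.9], dtype=np.float32)
restored = dequantize(quantize(original))
print(np.abs(original - restored).max())  # small, bounded loss of precision
```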
-```
+![Float32 to int8 conversion](/articles_data/scalar-quantization/float32-to-int8-conversion.png)
-```bash
-curl -X DELETE http://localhost:6333/collections/{collection_name}
+For the unsigned `int8` it will go as follows:
-```
+$$ \begin{equation}
+\begin{cases} -2 = \alpha \times 0 + offset \\\\ 5 = \alpha \times 255 + offset \end{cases}
+\end{equation} $$
-```python
-client.delete_collection(collection_name="{collection_name}")
+In case of signed `int8`, we'll just change the represented range boundaries:
-```
+$$ \begin{equation}
+\begin{cases} -2 = \alpha \times (-128) + offset \\\\ 5 = \alpha \times 127 + offset \end{cases}
+\end{equation} $$
-```typescript
-client.deleteCollection("{collection_name}");
+For any set of vector values we can simply calculate the $ \alpha $ and $ offset $, and
+those values have to be stored along with the collection to enable the conversion between
+the types.
-```
+#### Distance calculation
-```rust
-client.delete_collection("{collection_name}").await?;
+We do not store the vectors in the collections represented by `int8` instead of `float32`
+just for the sake of compressing the memory. But the coordinates are being used while we
+calculate the distance between the vectors. Both dot product and cosine distance require
+multiplying the corresponding coordinates of two vectors, so that's the operation we
+perform quite often on `float32`. Here is how it would look if we perform the
+conversion to `int8`:
-```
+$$ f32 \times f32' = $$
+$$ = (\alpha \times i8 + offset) \times (\alpha \times i8' + offset) = $$
+$$ = \alpha^{2} \times i8 \times i8' + \underbrace{offset \times \alpha \times i8' + offset \times \alpha \times i8 + offset^{2}}_\text{pre-compute} $$
-```java
-client.deleteCollectionAsync("{collection_name}").get();
+The first term, $ \alpha^{2} \times i8 \times i8' $, has to be calculated when we measure the
+distance as it depends on both vectors. However, both the second and the third term
+($ offset \times \alpha \times i8' $ and $ offset \times \alpha \times i8 $ respectively)
+depend only on a single vector and those might be precomputed and kept for each vector.
+The last term, $ offset^{2} $, does not depend on any of the values, so it might even be
+computed once and reused.
-```
+If we had to calculate all the terms to measure the distance, the performance could have
+been even worse than without the conversion. But thanks to the fact that we can precompute
+the majority of the terms, things get simpler. And it turns out that scalar
+quantization has a positive impact not only on the memory usage, but also on the
+performance. As usual, we performed some benchmarks to support this statement!
-```csharp
-await client.DeleteCollectionAsync("{collection_name}");
+## Benchmarks
-```
+We simply used the same approach as we use in all [the other benchmarks we publish](/benchmarks/).
+Both [Arxiv-titles-384-angular-no-filters](https://github.com/qdrant/ann-filtering-benchmark-datasets)
+and [Gist-960](https://github.com/erikbern/ann-benchmarks/) datasets were chosen to make
+the comparison between non-quantized and quantized vectors. The results are summarized
+in the tables:
-```go
-import "context"
+#### Arxiv-titles-384-angular-no-filters
|                       | Upload and indexing time | Mean search precision (ef = 128) | Mean search time (ef = 128) | Mean search precision (ef = 256) | Mean search time (ef = 256) | Mean search precision (ef = 512) | Mean search time (ef = 512) |
|-----------------------|--------------------------|----------------------------------|-----------------------------|----------------------------------|-----------------------------|----------------------------------|-----------------------------|
| Non-quantized vectors | 649 s                    | 0.989                            | 0.0094                      | 0.994                            | 0.0932                      | 0.996                            | 0.161                       |
| Scalar Quantization   | 496 s                    | 0.986                            | 0.0037                      | 0.993                            | 0.060                       | 0.996                            | 0.115                       |
| Difference            | -23.57%                  | -0.3%                            | -60.64%                     | -0.1%                            | -35.62%                     | 0%                               | -28.57%                     |
+
+A slight decrease in search precision results in a considerable improvement in the
+latency. Unless you aim for the highest precision possible, you should not notice the
+difference in your search quality.
-client.DeleteCollection(context.Background(), "{collection_name}")
+#### Gist-960
|                       | Upload and indexing time | Mean search precision (ef = 128) | Mean search time (ef = 128) | Mean search precision (ef = 256) | Mean search time (ef = 256) | Mean search precision (ef = 512) | Mean search time (ef = 512) |
|-----------------------|--------------------------|----------------------------------|-----------------------------|----------------------------------|-----------------------------|----------------------------------|-----------------------------|
| Non-quantized vectors | 452                      | 0.802                            | 0.077                       | 0.887                            | 0.135                       | 0.941                            | 0.231                       |
| Scalar Quantization   | 312                      | 0.802                            | 0.043                       | 0.888                            | 0.077                       | 0.941                            | 0.135                       |
| Difference            | -30.79%                  | 0%                               | -44.16%                     | +0.11%                           | -42.96%                     | 0%                               | -41.56%                     |
+ +In all the cases, the decrease in search precision is negligible, but we keep a latency +reduction of at least 28.57%, even up to 60,64%, while searching. As a rule of thumb, +the higher the dimensionality of the vectors, the lower the precision loss. -``` +### Oversampling and rescoring -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#update-collection-parameters) Update collection parameters +A distinctive feature of the Qdrant architecture is the ability to combine the search for quantized and original vectors in a single query. +This enables the best combination of speed, accuracy, and RAM usage. -Dynamic parameter updates may be helpful, for example, for more efficient initial loading of vectors. -For example, you can disable indexing during the upload process, and enable it immediately after the upload is finished. -As a result, you will not waste extra computation resources on rebuilding the index. +Qdrant stores the original vectors, so it is possible to rescore the top-k results with +the original vectors after doing the neighbours search in quantized space. That obviously +has some impact on the performance, but in order to measure how big it is, we made the +comparison in different search scenarios. +We used a machine with a very slow network-mounted disk and tested the following scenarios with different amounts of allowed RAM: -The following command enables indexing for segments that have more than 10000 kB of vectors stored: +| Setup | RPS | Precision | +|-----------------------------|------|-----------| +| 4.5GB memory | 600 | 0.99 | +| 4.5GB memory + SQ + rescore | 1000 | 0.989 | -httpbashpythontypescriptrustjavacsharpgo +And another group with more strict memory limits: -```http -PATCH /collections/{collection_name} -{ - "optimizers_config": { - "indexing_threshold": 10000 - } -} +| Setup | RPS | Precision | +|------------------------------|------|-----------| +| 2GB memory | 2 | 0.99 | +| 2GB memory + SQ + rescore | 30 | 0.989 | +| 2GB memory + SQ + no rescore | 1200 | 0.974 | -``` +In those experiments, throughput was mainly defined by the number of disk reads, and quantization efficiently reduces it by allowing more vectors in RAM. +Read more about on-disk storage in Qdrant and how we measure its performance in our article: [Minimal RAM you need to serve a million vectors +](/articles/memory-consumption/). -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "optimizers_config": { - "indexing_threshold": 10000 - } - }' +The mechanism of Scalar Quantization with rescoring disabled pushes the limits of low-end +machines even further. It seems like handling lots of requests does not require an +expensive setup if you can agree to a small decrease in the search precision. -``` +### Accessing best practices -```python -client.update_collection( - collection_name="{collection_name}", - optimizers_config=models.OptimizersConfigDiff(indexing_threshold=10000), -) +Qdrant documentation on [Scalar Quantization](/documentation/quantization/#setting-up-quantization-in-qdrant) +is a great resource describing different scenarios and strategies to achieve up to 4x +lower memory footprint and even up to 2x performance increase. -``` +<|page-58-lllmstxt|> +> Vector databases are here to stay. The New Age of AI is powered by vector embeddings, and vector databases are a foundational part of the stack. 
At Qdrant, we are working on cutting-edge open-source vector similarity search solutions to power fantastic AI applications with the best possible performance and excellent developer experience. +> +> Our 7.5M seed funding – led by [Unusual Ventures](https://www.unusual.vc/), awesome angels, and existing investors – will help us bring these innovations to engineers and empower them to make the most of their unstructured data and the awesome power of LLMs at any scale. -```typescript -client.updateCollection("{collection_name}", { - optimizers_config: { - indexing_threshold: 10000, - }, -}); +We are thrilled to announce that we just raised our seed round from the best possible investor we could imagine for this stage. Let’s talk about fundraising later – it is a story itself that I could probably write a bestselling book about. First, let's dive into a bit of background about our project, our progress, and future plans. -``` +## A need for vector databases. -```rust -use qdrant_client::qdrant::{OptimizersConfigDiffBuilder, UpdateCollectionBuilder}; +Unstructured data is growing exponentially, and we are all part of a huge unstructured data workforce. This blog post is unstructured data; your visit here produces unstructured and semi-structured data with every web interaction, as does every photo you take or email you send. The global datasphere will grow to [165 zettabytes by 2025](https://github.com/qdrant/qdrant/pull/1639), and about 80% of that will be unstructured. At the same time, the rising demand for AI is vastly outpacing existing infrastructure. Around 90% of machine learning research results fail to reach production because of a lack of tools. -client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}").optimizers_config( - OptimizersConfigDiffBuilder::default().indexing_threshold(10000), - ), - ) - .await?; -``` +{{< figure src=/articles_data/seed-round/demand.png caption="Demand for AI tools" alt="Vector Databases Demand" >}} -```java -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.UpdateCollection; +Thankfully there’s a new generation of tools that let developers work with unstructured data in the form of vector embeddings, which are deep representations of objects obtained from a neural network model. A vector database, also known as a vector similarity search engine or approximate nearest neighbour (ANN) search database, is a database designed to store, manage, and search high-dimensional data with an additional payload. Vector Databases turn research prototypes into commercial AI products. Vector search solutions are industry agnostic and bring solutions for a number of use cases, including classic ones like semantic search, matching engines, and recommender systems to more novel applications like anomaly detection, working with time series, or biomedical data. The biggest limitation is to have a neural network encoder in place for the data type you are working with. -client.updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder().setIndexingThreshold(10000).build()) - .build()); -``` +{{< figure src=/articles_data/seed-round/use-cases.png caption="Vector Search Use Cases" alt="Vector Search Use Cases" >}} -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +With the rise of large language models (LLMs), Vector Databases have become the fundamental building block of the new AI Stack. 
They let developers build even more advanced applications by extending the “knowledge base” of LLMs-based applications like ChatGPT with real-time and real-world data. -var client = new QdrantClient("localhost", 6334); +A new AI product category, “Co-Pilot for X,” was born and is already affecting how we work. Starting from producing content to developing software. And this is just the beginning, there are even more types of novel applications being developed on top of this stack. -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 10000 } -); +{{< figure src=/articles_data/seed-round/ai-stack.png caption="New AI Stack" alt="New AI Stack" >}} -``` +## Enter Qdrant. ## -```go -import ( - "context" +At the same time, adoption has only begun. Vector Search Databases are replacing VSS libraries like FAISS, etc., which, despite their disadvantages, are still used by ~90% of projects out there They’re hard-coupled to the application code, lack of production-ready features like basic CRUD operations or advanced filtering, are a nightmare to maintain and scale and have many other difficulties that make life hard for developers. - "github.com/qdrant/go-client/qdrant" -) +The current Qdrant ecosystem consists of excellent products to work with vector embeddings. We launched our managed vector database solution, Qdrant Cloud, early this year, and it is already serving more than 1,000 Qdrant clusters. We are extending our offering now with managed on-premise solutions for enterprise customers. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +{{< figure src=/articles_data/seed-round/ecosystem.png caption="Qdrant Ecosystem" alt="Qdrant Vector Database Ecosystem" >}} -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - OptimizersConfig: &qdrant.OptimizersConfigDiff{ - IndexingThreshold: qdrant.PtrOf(uint64(10000)), - }, -}) -``` +Our plan for the current [open-source roadmap](https://github.com/qdrant/qdrant/blob/master/docs/roadmap/README.md) is to make billion-scale vector search affordable. Our recent release of the [Scalar Quantization](/articles/scalar-quantization/) improves both memory usage (x4) as well as speed (x2). Upcoming [Product Quantization](https://www.irisa.fr/texmex/people/jegou/papers/jegou_searching_with_quantization.pdf) will introduce even another option with more memory saving. Stay tuned. -The following parameters can be updated: +Qdrant started more than two years ago with the mission of building a vector database powered by a well-thought-out tech stack. Using Rust as the system programming language and technical architecture decision during the development of the engine made Qdrant the leading and one of the most popular vector database solutions. -- `optimizers_config` \- see [optimizer](https://qdrant.tech/documentation/concepts/optimizer/) for details. -- `hnsw_config` \- see [indexing](https://qdrant.tech/documentation/concepts/indexing/#vector-index) for details. -- `quantization_config` \- see [quantization](https://qdrant.tech/documentation/guides/quantization/#setting-up-quantization-in-qdrant) for details. -- `vectors_config` \- vector-specific configuration, including individual `hnsw_config`, `quantization_config` and `on_disk` settings. -- `params` \- other collection parameters, including `write_consistency_factor` and `on_disk_payload`. 
-- `strict_mode_config` \- see [strict mode](https://qdrant.tech/documentation/guides/administration/#strict-mode) for details. +Our unique custom modification of the [HNSW algorithm](/articles/filtrable-hnsw/) for Approximate Nearest Neighbor Search (ANN) allows querying the result with a state-of-the-art speed and applying filters without compromising on results. Cloud-native support for distributed deployment and replications makes the engine suitable for high-throughput applications with real-time latency requirements. Rust brings stability, efficiency, and the possibility to make optimization on a very low level. In general, we always aim for the best possible results in [performance](/benchmarks/), code quality, and feature set. -Full API specification is available in [schema definitions](https://api.qdrant.tech/api-reference/collections/update-collection). +Most importantly, we want to say a big thank you to our [open-source community](https://qdrant.to/discord), our adopters, our contributors, and our customers. Your active participation in the development of our products has helped make Qdrant the best vector database on the market. I cannot imagine how we could do what we’re doing without the community or without being open-source and having the TRUST of the engineers. Thanks to all of you! -Calls to this endpoint may be blocking as it waits for existing optimizers to -finish. We recommended against using this in a production database as it may -introduce huge overhead due to the rebuilding of the index. +I also want to thank our team. Thank you for your patience and trust. Together we are strong. Let’s continue doing great things together. -#### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#update-vector-parameters) Update vector parameters +## Fundraising ## +The whole process took only a couple of days, we got several offers, and most probably, we would get more with different conditions. We decided to go with Unusual Ventures because they truly understand how things work in the open-source space. They just did it right. -_Available as of v1.4.0_ +Here is a big piece of advice for all investors interested in open-source: Dive into the community, and see and feel the traction and product feedback instead of looking at glossy pitch decks. With Unusual on our side, we have an active operational partner instead of one who simply writes a check. That help is much more important than overpriced valuations and big shiny names. -Qdrant 1.4 adds support for updating more collection parameters at runtime. HNSW -index, quantization and disk configurations can now be changed without -recreating a collection. Segments (with index and quantized data) will -automatically be rebuilt in the background to match updated parameters. +Ultimately, the community and adopters will decide what products win and lose, not VCs. Companies don’t need crazy valuations to create products that customers love. You do not need Ph.D. to innovate. You do not need to over-engineer to build a scalable solution. You do not need ex-FANG people to have a great team. You need clear focus, a passion for what you’re building, and the know-how to do it well. -To put vector data on disk for a collection that **does not have** named vectors, -use `""` as name: +We know how. -httpbash +PS: This text is written by me in an old-school way without any ChatGPT help. 
Sometimes you just need inspiration instead of AI ;-) -```http -PATCH /collections/{collection_name} -{ - "vectors": { - "": { - "on_disk": true - } - } -} +<|page-59-lllmstxt|> +# Streamlining Question Answering: Simplifying Integration with LangChain and Qdrant -``` +Building applications with Large Language Models doesn't have to be complicated. A lot has been going on recently to simplify the development, +so you can utilize already pre-trained models and support even complex pipelines with a few lines of code. [LangChain](https://langchain.readthedocs.io) +provides unified interfaces to different libraries, so you can avoid writing boilerplate code and focus on the value you want to bring. -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "": { - "on_disk": true - } - } - }' +## Why Use Qdrant for Question Answering with LangChain? -``` +It has been reported millions of times recently, but let's say that again. ChatGPT-like models struggle with generating factual statements if no context +is provided. They have some general knowledge but cannot guarantee to produce a valid answer consistently. Thus, it is better to provide some facts we +know are actual, so it can just choose the valid parts and extract them from all the provided contextual data to give a comprehensive answer. [Vector database, +such as Qdrant](https://qdrant.tech/), is of great help here, as their ability to perform a [semantic search](https://qdrant.tech/documentation/tutorials/search-beginners/) over a huge knowledge base is crucial to preselect some possibly valid +documents, so they can be provided into the LLM. That's also one of the **chains** implemented in [LangChain](https://qdrant.tech/documentation/frameworks/langchain/), which is called `VectorDBQA`. And Qdrant got +integrated with the library, so it might be used to build it effortlessly. -To put vector data on disk for a collection that **does have** named vectors: +### The Two-Model Approach -Note: To create a vector name, follow the procedure from our [Points](https://qdrant.tech/documentation/concepts/points/#create-vector-name). +Surprisingly enough, there will be two models required to set things up. First of all, we need an embedding model that will convert the set of facts into +vectors, and store those into Qdrant. That's an identical process to any other semantic search application. We're going to use one of the +`SentenceTransformers` models, so it can be hosted locally. The embeddings created by that model will be put into Qdrant and used to retrieve the most +similar documents, given the query. -httpbash +However, when we receive a query, there are two steps involved. First of all, we ask Qdrant to provide the most relevant documents and simply combine all +of them into a single text. Then, we build a prompt to the LLM (in our case [OpenAI](https://openai.com/)), including those documents as a context, of course together with the +question asked. So the input to the LLM looks like the following: -```http -PATCH /collections/{collection_name} -{ - "vectors": { - "my_vector": { - "on_disk": true - } - } -} +```text +Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. +It's as certain as 2 + 2 = 4 +... +Question: How much is 2 + 2? 
+Helpful Answer: ``` -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "my_vector": { - "on_disk": true - } - } - }' +There might be several context documents combined, and it is solely up to LLM to choose the right piece of content. But our expectation is, the model should +respond with just `4`. -``` +## Why do we need two different models? +Both solve some different tasks. The first model performs feature extraction, by converting the text into vectors, while +the second one helps in text generation or summarization. Disclaimer: This is not the only way to solve that task with LangChain. Such a chain is called `stuff` +in the library nomenclature. -In the following example the HNSW index and quantization parameters are updated, -both for the whole collection, and for `my_vector` specifically: +![](/articles_data/langchain-integration/flow-diagram.png) -httpbashpythontypescriptrustjavacsharpgo +Enough theory! This sounds like a pretty complex application, as it involves several systems. But with LangChain, it might be implemented in just a few lines +of code, thanks to the recent integration with [Qdrant](https://qdrant.tech/). We're not even going to work directly with `QdrantClient`, as everything is already done in the background +by LangChain. If you want to get into the source code right away, all the processing is available as a +[Google Colab notebook](https://colab.research.google.com/drive/19RxxkZdnq_YqBH5kBV10Rt0Rax-kminD?usp=sharing). -```http -PATCH /collections/{collection_name} -{ - "vectors": { - "my_vector": { - "hnsw_config": { - "m": 32, - "ef_construct": 123 - }, - "quantization_config": { - "product": { - "compression": "x32", - "always_ram": true - } - }, - "on_disk": true - } - }, - "hnsw_config": { - "ef_construct": 123 - }, - "quantization_config": { - "scalar": { - "type": "int8", - "quantile": 0.8, - "always_ram": false - } - } -} +## How to Implement Question Answering with LangChain and Qdrant -``` +### Step 1: Configuration -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "my_vector": { - "hnsw_config": { - "m": 32, - "ef_construct": 123 - }, - "quantization_config": { - "product": { - "compression": "x32", - "always_ram": true - } - }, - "on_disk": true - } - }, - "hnsw_config": { - "ef_construct": 123 - }, - "quantization_config": { - "scalar": { - "type": "int8", - "quantile": 0.8, - "always_ram": false - } - } -}' +A journey of a thousand miles begins with a single step, in our case with the configuration of all the services. We'll be using [Qdrant Cloud](https://cloud.qdrant.io), +so we need an API key. The same is for OpenAI - the API key has to be obtained from their website. 
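The screenshot below shows that configuration step; in essence it is just a handful of constants (the values here are placeholders, not credentials from this article):

```python
import os

# Hypothetical placeholders - substitute your own endpoints and keys.
QDRANT_HOST = "xyz-example.eu-central.aws.cloud.qdrant.io"   # your Qdrant Cloud endpoint
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "<your-qdrant-api-key>")
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"        # read later by LangChain's OpenAI wrapper
```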
-``` +![](/articles_data/langchain-integration/code-configuration.png) -```python -client.update_collection( - collection_name="{collection_name}", - vectors_config={ - "my_vector": models.VectorParamsDiff( - hnsw_config=models.HnswConfigDiff( - m=32, - ef_construct=123, - ), - quantization_config=models.ProductQuantization( - product=models.ProductQuantizationConfig( - compression=models.CompressionRatio.X32, - always_ram=True, - ), - ), - on_disk=True, - ), - }, - hnsw_config=models.HnswConfigDiff( - ef_construct=123, - ), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - quantile=0.8, - always_ram=False, - ), - ), -) +### Step 2: Building the knowledge base -``` +We also need some facts from which the answers will be generated. There is plenty of public datasets available, and +[Natural Questions](https://ai.google.com/research/NaturalQuestions/visualization) is one of them. It consists of the whole HTML content of the websites they were +scraped from. That means we need some preprocessing to extract plain text content. As a result, we’re going to have two lists of strings - one for questions and +the other one for the answers. -```typescript -client.updateCollection("{collection_name}", { - vectors: { - my_vector: { - hnsw_config: { - m: 32, - ef_construct: 123, - }, - quantization_config: { - product: { - compression: "x32", - always_ram: true, - }, - }, - on_disk: true, - }, - }, - hnsw_config: { - ef_construct: 123, - }, - quantization_config: { - scalar: { - type: "int8", - quantile: 0.8, - always_ram: true, - }, - }, -}); +The answers have to be vectorized with the first of our models. The `sentence-transformers/all-mpnet-base-v2` is one of the possibilities, but there are some +other options available. LangChain will handle that part of the process in a single function call. -``` +![](/articles_data/langchain-integration/code-qdrant.png) -```rust -use std::collections::HashMap; +### Step 3: Setting up QA with Qdrant in a loop -use qdrant_client::qdrant::{ - quantization_config_diff::Quantization, vectors_config_diff::Config, HnswConfigDiffBuilder, - QuantizationType, ScalarQuantizationBuilder, UpdateCollectionBuilder, VectorParamsDiffBuilder, - VectorParamsDiffMap, -}; +`VectorDBQA` is a chain that performs the process described above. So it, first of all, loads some facts from Qdrant and then feeds them into OpenAI LLM which +should analyze them to find the answer to a given question. The only last thing to do before using it is to put things together, also with a single function call. 
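The screenshot below shows the original call; a rough equivalent, assuming an older LangChain release that still ships `VectorDBQA` and the `doc_store` Qdrant wrapper created in the previous step, looks like this:

```python
from langchain.chains import VectorDBQA
from langchain.llms import OpenAI

# "stuff" means the retrieved documents are pasted directly into the prompt,
# exactly as in the prompt template shown earlier.
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    vectorstore=doc_store,  # the LangChain Qdrant wrapper built in the previous step
)

print(qa.run("How much is 2 + 2?"))
```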
-client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}") - .hnsw_config(HnswConfigDiffBuilder::default().ef_construct(123)) - .vectors_config(Config::ParamsMap(VectorParamsDiffMap { - map: HashMap::from([(\ - ("my_vector".into()),\ - VectorParamsDiffBuilder::default()\ - .hnsw_config(HnswConfigDiffBuilder::default().m(32).ef_construct(123))\ - .build(),\ - )]), - })) - .quantization_config(Quantization::Scalar( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .quantile(0.8) - .always_ram(true) - .build(), - )), - ) - .await?; +![](/articles_data/langchain-integration/code-vectordbqa.png) -``` +## Step 4: Testing out the chain -```java -import io.qdrant.client.grpc.Collections.HnswConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.UpdateCollection; -import io.qdrant.client.grpc.Collections.VectorParamsDiff; -import io.qdrant.client.grpc.Collections.VectorParamsDiffMap; -import io.qdrant.client.grpc.Collections.VectorsConfigDiff; +And that's it! We can put some queries, and LangChain will perform all the required processing to find the answer in the provided context. -client - .updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setHnswConfig(HnswConfigDiff.newBuilder().setEfConstruct(123).build()) - .setVectorsConfig( - VectorsConfigDiff.newBuilder() - .setParamsMap( - VectorParamsDiffMap.newBuilder() - .putMap( - "my_vector", - VectorParamsDiff.newBuilder() - .setHnswConfig( - HnswConfigDiff.newBuilder() - .setM(3) - .setEfConstruct(123) - .build()) - .build()))) - .setQuantizationConfig( - QuantizationConfigDiff.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setQuantile(0.8f) - .setAlwaysRam(true) - .build())) - .build()) - .get(); +![](/articles_data/langchain-integration/code-answering.png) -``` +```text +> what kind of music is scott joplin most famous for + Scott Joplin is most famous for composing ragtime music. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +> who died from the band faith no more + Chuck Mosley -var client = new QdrantClient("localhost", 6334); +> when does maggie come on grey's anatomy + Maggie first appears in season 10, episode 1, which aired on September 26, 2013. -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - hnswConfig: new HnswConfigDiff { EfConstruct = 123 }, - vectorsConfig: new VectorParamsDiffMap - { - Map = - { - { - "my_vector", - new VectorParamsDiff - { - HnswConfig = new HnswConfigDiff { M = 3, EfConstruct = 123 } - } - } - } - }, - quantizationConfig: new QuantizationConfigDiff - { - Scalar = new ScalarQuantization - { - Type = QuantizationType.Int8, - Quantile = 0.8f, - AlwaysRam = true - } - } -); +> can't take my eyes off you lyrics meaning + I don't know. +> who lasted the longest on alone season 2 + David McIntyre lasted the longest on Alone season 2, with a total of 66 days. ``` -```go -import ( - "context" +The great thing about such a setup is that the knowledge base might be easily extended with some new facts and those will be included in the prompts +sent to LLM later on. Of course, assuming their similarity to the given question will be in the top results returned by Qdrant. 
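For instance, appending new facts later on might look like the following sketch, again using the hypothetical `doc_store` wrapper from the earlier steps:

```python
# Newly added facts are embedded and upserted into the same Qdrant collection,
# so they immediately become candidates for retrieval in future prompts.
doc_store.add_texts([
    "Qdrant is an open-source vector database written in Rust.",
])
```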
- "github.com/qdrant/go-client/qdrant" -) +If you want to run the chain on your own, the simplest way to reproduce it is to open the +[Google Colab notebook](https://colab.research.google.com/drive/19RxxkZdnq_YqBH5kBV10Rt0Rax-kminD?usp=sharing). -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +<|page-60-lllmstxt|> + + + + +When it comes to measuring the memory consumption of our processes, we often rely on tools such as `htop` to give us an indication of how much RAM is being used. However, this method can be misleading and doesn't always accurately reflect the true memory usage of a process. + +There are many different ways in which `htop` may not be a reliable indicator of memory usage. +For instance, a process may allocate memory in advance but not use it, or it may not free deallocated memory, leading to overstated memory consumption. +A process may be forked, which means that it will have a separate memory space, but it will share the same code and data with the parent process. +This means that the memory consumption of the child process will be counted twice. +Additionally, a process may utilize disk cache, which is also accounted as resident memory in the `htop` measurements. -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfigDiffMap( - map[string]*qdrant.VectorParamsDiff{ - "my_vector": { - HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(3)), - EfConstruct: qdrant.PtrOf(uint64(123)), - }, - }, - }), - QuantizationConfig: qdrant.NewQuantizationDiffScalar( - &qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - Quantile: qdrant.PtrOf(float32(0.8)), - AlwaysRam: qdrant.PtrOf(true), - }), -}) +As a result, even if `htop` shows that a process is using 10GB of memory, it doesn't necessarily mean that the process actually requires 10GB of RAM to operate efficiently. +In this article, we will explore how to properly measure RAM usage and optimize [Qdrant](https://qdrant.tech/) for optimal memory consumption. -``` +## How to measure actual RAM requirements -## [Anchor](https://qdrant.tech/documentation/concepts/collections/\#collection-info) Collection info + -``` +We need to know memory consumption in order to estimate how much RAM is required to run the program. +So in order to determine that, we can conduct a simple experiment. +Let's limit the allowed memory of the process and observe at which point it stops functioning. +In this way we can determine the minimum amount of RAM the program needs to operate. -```bash -curl -X GET http://localhost:6333/collections/{collection_name} +One way to do this is by conducting a grid search, but a more efficient method is to use binary search to quickly find the minimum required amount of RAM. +We can use docker to limit the memory usage of the process. +Before running each benchmark, it is important to clear the page cache with the following command: + +```bash +sudo bash -c 'sync; echo 1 > /proc/sys/vm/drop_caches' ``` -```python -client.get_collection(collection_name="{collection_name}") +This ensures that the process doesn't utilize any data from previous runs, providing more accurate and consistent results. 
+ +We can use the following command to run Qdrant with a memory limit of 1GB: +```bash +docker run -it --rm \ + --memory 1024mb \ + --network=host \ + -v "$(pwd)/data/storage:/qdrant/storage" \ + qdrant/qdrant:latest ``` -```typescript -client.getCollection("{collection_name}"); +## Let's run some benchmarks -``` +Let's run some benchmarks to see how much RAM Qdrant needs to serve 1 million vectors. -```rust -client.collection_info("{collection_name}").await?; +We can use the `glove-100-angular` and scripts from the [vector-db-benchmark](https://github.com/qdrant/vector-db-benchmark) project to upload and query the vectors. +With the first run we will use the default configuration of Qdrant with all data stored in RAM. +```bash +# Upload vectors +python run.py --engines qdrant-all-in-ram --datasets glove-100-angular ``` -```java -client.getCollectionInfoAsync("{collection_name}").get(); +After uploading vectors, we will repeat the same experiment with different RAM limits to see how they affect the memory consumption and search speed. +```bash +# Search vectors +python run.py --engines qdrant-all-in-ram --datasets glove-100-angular --skip-upload ``` -```csharp -await client.GetCollectionInfoAsync("{collection_name}"); + -_Available as of v1.9.0_ +### All in Memory -A collection may have the grey ⚫ status or show “optimizations pending, -awaiting update operation” as optimization status. This state is normally caused -by restarting a Qdrant instance while optimizations were ongoing. +In the first experiment, we tested how well our system performs when all vectors are stored in memory. +We tried using different amounts of memory, ranging from 1512mb to 1024mb, and measured the number of requests per second (rps) that our system was able to handle. -It means the collection has optimizations pending, but they are paused. You must -send any update operation to trigger and start the optimizations again. +| Memory | Requests/s | +|--------|---------------| +| 1512mb | 774.38 | +| 1256mb | 760.63 | +| 1200mb | 794.72 | +| 1152mb | out of memory | +| 1024mb | out of memory | -For example: -httpbashpythontypescriptrustjavacsharpgo +We found that 1152MB memory limit resulted in our system running out of memory, but using 1512mb, 1256mb, and 1200mb of memory resulted in our system being able to handle around 780 RPS. +This suggests that about 1.2GB of memory is needed to serve around 1 million vectors, and there is no speed degradation when limiting memory usage above 1.2GB. + +### Vectors stored using MMAP + +Let's go a bit further! +In the second experiment, we tested how well our system performs when **vectors are stored using the memory-mapped file** (mmap). +Create collection with: ```http -PATCH /collections/{collection_name} +PUT /collections/benchmark { - "optimizers_config": {} + "vectors": { + ... + "on_disk": true + } } ``` +This configuration tells Qdrant to use mmap for vectors if the segment size is greater than 20000Kb (which is approximately 40K 128d-vectors). -```bash -curl -X PATCH http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "optimizers_config": {} - }' +Now the out-of-memory happens when we allow using **600mb** RAM only -``` +
+ Experiments details -```python -client.update_collection( - collection_name="{collection_name}", - optimizer_config=models.OptimizersConfigDiff(), -) +| Memory | Requests/s | +|--------|---------------| +| 1200mb | 759.94 | +| 1100mb | 687.00 | +| 1000mb | 10 | -``` +--- use a bit faster disk --- -```typescript -client.updateCollection("{collection_name}", { - optimizers_config: {}, -}); +| Memory | Requests/s | +|--------|---------------| +| 1000mb | 25 rps | +| 750mb | 5 rps | +| 625mb | 2.5 rps | +| 600mb | out of memory | -``` -```rust -use qdrant_client::qdrant::{OptimizersConfigDiffBuilder, UpdateCollectionBuilder}; +
-client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}") - .optimizers_config(OptimizersConfigDiffBuilder::default()), - ) - .await?; +At this point we have to switch from network-mounted storage to a faster disk, as the network-based storage is too slow to handle the amount of sequential reads that our system needs to serve the queries. -``` +But let's first see how much RAM we need to serve 1 million vectors and then we will discuss the speed optimization as well. -```java -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.UpdateCollection; -client.updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setOptimizersConfig( - OptimizersConfigDiff.getDefaultInstance()) - .build()); +### Vectors and HNSW graph stored using MMAP + +In the third experiment, we tested how well our system performs when vectors and [HNSW](https://qdrant.tech/articles/filtrable-hnsw/) graph are stored using the memory-mapped files. +Create collection with: +```http +PUT /collections/benchmark +{ + "vectors": { + ... + "on_disk": true + }, + "hnsw_config": { + "on_disk": true + }, + ... +} ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +With this configuration we are able to serve 1 million vectors with **only 135mb of RAM**! -var client = new QdrantClient("localhost", 6334); -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - optimizersConfig: new OptimizersConfigDiff { } -); +
+ Experiments details -``` -```go -import ( - "context" +| Memory | Requests/s | +|--------|---------------| +| 600mb | 5 rps | +| 300mb | 0.9 rps / 1.1 sec per query | +| 150mb | 0.4 rps / 2.5 sec per query | +| 135mb | 0.33 rps / 3 sec per query | +| 125mb | out of memory | - "github.com/qdrant/go-client/qdrant" -) +
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +At this point the importance of the disk speed becomes critical. +We can serve the search requests with 135mb of RAM, but the speed of the requests makes it impossible to use the system in production. -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - OptimizersConfig: &qdrant.OptimizersConfigDiff{}, -}) +Let's see how we can improve the speed. -``` -Alternatively you may use the `Trigger Optimizers` button in the [Qdrant Web UI](https://qdrant.tech/documentation/web-ui/). -It is shown next to the grey collection status on the collection info page. +## How to speed up the search -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#approximate-point-and-vector-counts) Approximate point and vector counts -You may be interested in the count attributes: + -There is no way to stop the service and rebuild the collection with new vectors in these situations. -Aliases are additional names for existing collections. -All queries to the collection can also be done identically, using an alias instead of the collection name. +To measure the impact of disk parameters on search speed, we used the `fio` tool to test the speed of different types of disks. -Thus, it is possible to build a second collection in the background and then switch alias from the old to the new collection. -Since all changes of aliases happen atomically, no concurrent requests will be affected during the switch. +```bash +# Install fio +sudo apt-get install fio -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#create-alias) Create alias +# Run fio to check the random reads speed +fio --randrepeat=1 \ + --ioengine=libaio \ + --direct=1 \ + --gtod_reduce=1 \ + --name=fiotest \ + --filename=testfio \ + --bs=4k \ + --iodepth=64 \ + --size=8G \ + --readwrite=randread +``` -httpbashpythontypescriptrustjavacsharpgo -```http -POST /collections/aliases -{ - "actions": [\ - {\ - "create_alias": {\ - "collection_name": "example_collection",\ - "alias_name": "production_collection"\ - }\ - }\ - ] -} +Initially, we tested on a network-mounted disk, but its performance was too slow, with a read IOPS of 6366 and a bandwidth of 24.9 MiB/s: +```text +read: IOPS=6366, BW=24.9MiB/s (26.1MB/s)(8192MiB/329424msec) ``` -```bash -curl -X POST http://localhost:6333/collections/aliases \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "actions": [\ - {\ - "create_alias": {\ - "collection_name": "example_collection",\ - "alias_name": "production_collection"\ - }\ - }\ - ] -}' +To improve performance, we switched to a local disk, which showed much faster results, with a read IOPS of 63.2k and a bandwidth of 247 MiB/s: +```text +read: IOPS=63.2k, BW=247MiB/s (259MB/s)(8192MiB/33207msec) ``` -```python -client.update_collection_aliases( - change_aliases_operations=[\ - models.CreateAliasOperation(\ - create_alias=models.CreateAlias(\ - collection_name="example_collection", alias_name="production_collection"\ - )\ - )\ - ] -) +That gave us a significant speed boost, but we wanted to see if we could improve performance even further. 
+To do that, we switched to a machine with a local SSD, which showed even better results, with a read IOPS of 183k and a bandwidth of 716 MiB/s: +```text +read: IOPS=183k, BW=716MiB/s (751MB/s)(8192MiB/11438msec) ``` -```typescript -client.updateCollectionAliases({ - actions: [\ - {\ - create_alias: {\ - collection_name: "example_collection",\ - alias_name: "production_collection",\ - },\ - },\ - ], -}); +Let's see how these results translate into search speed: -``` +| Memory | RPS with IOPS=63.2k | RPS with IOPS=183k | +|--------|---------------------|--------------------| +| 600mb | 5 | 50 | +| 300mb | 0.9 | 13 | +| 200mb | 0.5 | 8 | +| 150mb | 0.4 | 7 | -```rust -use qdrant_client::qdrant::CreateAliasBuilder; -client - .create_alias(CreateAliasBuilder::new( - "example_collection", - "production_collection", - )) - .await?; +As you can see, the speed of the disk has a significant impact on the search speed. +With a local SSD, we were able to increase the search speed by 10x! -``` +With the production-grade disk, the search speed could be even higher. +Some configurations of the SSDs can reach 1M IOPS and more. -```java -client.createAliasAsync("production_collection", "example_collection").get(); +Which might be an interesting option to serve large datasets with low search latency in Qdrant. -``` -```csharp -await client.CreateAliasAsync(aliasName: "production_collection", collectionName: "example_collection"); +## Conclusion -``` +In this article, we showed that Qdrant has flexibility in terms of RAM usage and can be used to serve large datasets. It provides configurable trade-offs between RAM usage and search speed. If you’re interested to learn more about Qdrant, [book a demo today](https://qdrant.tech/contact-us/)! -```go -import "context" +We are eager to learn more about how you use Qdrant in your projects, what challenges you face, and how we can help you solve them. +Please feel free to join our [Discord](https://qdrant.to/discord) and share your experience with us! -client.CreateAlias(context.Background(), "production_collection", "example_collection") +<|page-61-lllmstxt|> +Bi-encoders are probably the most efficient way of setting up a semantic Question Answering system. +This architecture relies on the same neural model that creates vector embeddings for both questions and answers. +The assumption is, both question and answer should have representations close to each other in the latent space. +It should be like that because they should both describe the same semantic concept. That doesn't apply +to answers like "Yes" or "No" though, but standard FAQ-like problems are a bit easier as there is typically +an overlap between both texts. Not necessarily in terms of wording, but in their semantics. -``` +![Bi-encoder structure. Both queries (questions) and documents (answers) are vectorized by the same neural encoder. +Output embeddings are then compared by a chosen distance function, typically cosine similarity.](/articles_data/qa-with-cohere-and-qdrant/biencoder-diagram.png) -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#remove-alias) Remove alias +And yeah, you need to **bring your own embeddings**, in order to even start. There are various ways how +to obtain them, but using Cohere [co.embed API](https://docs.cohere.ai/reference/embed) is probably +the easiest and most convenient method. -httpbashpythontypescriptrustjavacsharpgo +## Why co.embed API and Qdrant go well together? 
-```http -POST /collections/aliases -{ - "actions": [\ - {\ - "delete_alias": {\ - "alias_name": "production_collection"\ - }\ - }\ - ] -} +Maintaining a **Large Language Model** might be hard and expensive. Scaling it up and down, when the traffic +changes, require even more effort and becomes unpredictable. That might be definitely a blocker for any semantic +search system. But if you want to start right away, you may consider using a SaaS model, Cohere’s +[co.embed API](https://docs.cohere.ai/reference/embed) in particular. It gives you state-of-the-art language +models available as a Highly Available HTTP service with no need to train or maintain your own service. As all +the communication is done with JSONs, you can simply provide the co.embed output as Qdrant input. +```python +# Putting the co.embed API response directly as Qdrant method input +qdrant_client.upsert( + collection_name="collection", + points=rest.Batch( + ids=[...], + vectors=cohere_client.embed(...).embeddings, + payloads=[...], + ), +) ``` -```bash -curl -X POST http://localhost:6333/collections/aliases \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "actions": [\ - {\ - "delete_alias": {\ - "alias_name": "production_collection"\ - }\ - }\ - ] -}' +Both tools are easy to combine, so you can start working with semantic search in a few minutes, not days. -``` +And what if your needs are so specific that you need to fine-tune a general usage model? Co.embed API goes beyond +pre-trained encoders and allows providing some custom datasets to +[customize the embedding model with your own data](https://docs.cohere.com/docs/finetuning). +As a result, you get the quality of domain-specific models, but without worrying about infrastructure. + +## System architecture overview + +In real systems, answers get vectorized and stored in an efficient vector search database. We typically don’t +even need to provide specific answers, but just use sentences or paragraphs of text and vectorize them instead. +Still, if a bit longer piece of text contains the answer to a particular question, its distance to the question +embedding should not be that far away. And for sure closer than all the other, non-matching answers. Storing the +answer embeddings in a vector database makes the search process way easier. + +![Building the database of possible answers. All the texts are converted into their vector embeddings and those +embeddings are stored in a vector database, i.e. Qdrant.](/articles_data/qa-with-cohere-and-qdrant/vector-database.png) + +## Looking for the correct answer + +Once our database is working and all the answer embeddings are already in place, we can start querying it. +We basically perform the same vectorization on a given question and ask the database to provide some near neighbours. +We rely on the embeddings to be close to each other, so we expect the points with the smallest distance in the latent +space to contain the proper answer. + +![While searching, a question gets vectorized by the same neural encoder. Vector database is a component that looks +for the closest answer vectors using i.e. cosine similarity. A proper system, like Qdrant, will make the lookup +process more efficient, as it won’t calculate the distance to all the answer embeddings. 
Thanks to HNSW, it will +be able to find the nearest neighbours with sublinear complexity.](/articles_data/qa-with-cohere-and-qdrant/search-with-vector-database.png) + +## Implementing the QA search system with SaaS tools + +We don’t want to maintain our own service for the neural encoder, nor even set up a Qdrant instance. There are SaaS +solutions for both — Cohere’s [co.embed API](https://docs.cohere.ai/reference/embed) +and [Qdrant Cloud](https://qdrant.to/cloud), so we’ll use them instead of on-premise tools. + +### Question Answering on biomedical data + +We’re going to implement the Question Answering system for the biomedical data. There is a +*[pubmed_qa](https://huggingface.co/datasets/pubmed_qa)* dataset, with it *pqa_labeled* subset containing 1,000 examples +of questions and answers labelled by domain experts. Our system is going to be fed with the embeddings generated by +co.embed API and we’ll load them to Qdrant. Using Qdrant Cloud vs your own instance does not matter much here. +There is a subtle difference in how to connect to the cloud instance, but all the other operations are executed +in the same way. ```python -client.update_collection_aliases( - change_aliases_operations=[\ - models.DeleteAliasOperation(\ - delete_alias=models.DeleteAlias(alias_name="production_collection")\ - ),\ - ] -) +from datasets import load_dataset +# Loading the dataset from HuggingFace hub. It consists of several columns: pubid, +# question, context, long_answer and final_decision. For the purposes of our system, +# we’ll use question and long_answer. +dataset = load_dataset("pubmed_qa", "pqa_labeled") ``` -```typescript -client.updateCollectionAliases({ - actions: [\ - {\ - delete_alias: {\ - alias_name: "production_collection",\ - },\ - },\ - ], -}); +| **pubid** | **question** | **context** | **long_answer** | **final_decision** | +|-----------|---------------------------------------------------|-------------|---------------------------------------------------|--------------------| +| 18802997 | Can calprotectin predict relapse risk in infla... | ... | Measuring calprotectin may help to identify UC... | maybe | +| 20538207 | Should temperature be monitorized during kidne... | ... | The new storage can affords more stable temper... | no | +| 25521278 | Is plate clearing a risk factor for obesity? | ... | The tendency to clear one's plate when eating ... | yes | +| 17595200 | Is there an intrauterine influence on obesity? | ... | Comparison of mother-offspring and father-offs.. | no | +| 15280782 | Is unsafe sexual behaviour increasing among HI... | ... | There was no evidence of a trend in unsafe sex... | no | -``` +### Using Cohere and Qdrant to build the answers database -```rust -client.delete_alias("production_collection").await?; +In order to start generating the embeddings, you need to [create a Cohere account](https://dashboard.cohere.ai/welcome/register). +That will start your trial period, so you’ll be able to vectorize the texts for free. Once logged in, your default API key will +be available in [Settings](https://dashboard.cohere.ai/api-keys). We’ll need it to call the co.embed API. with the official python package. 
-``` +```python +import cohere -```java -client.deleteAliasAsync("production_collection").get(); +cohere_client = cohere.Client(COHERE_API_KEY) +# Generating the embeddings with Cohere client library +embeddings = cohere_client.embed( + texts=["A test sentence"], + model="large", +) +vector_size = len(embeddings.embeddings[0]) +print(vector_size) # output: 4096 ``` -```csharp -await client.DeleteAliasAsync("production_collection"); +Let’s connect to the Qdrant instance first and create a collection with the proper configuration, so we can put some embeddings into it later on. +```python +# Connecting to Qdrant Cloud with qdrant-client requires providing the api_key. +# If you use an on-premise instance, it has to be skipped. +qdrant_client = QdrantClient( + host="xyz-example.eu-central.aws.cloud.qdrant.io", + prefer_grpc=True, + api_key=QDRANT_API_KEY, +) ``` -```go -import "context" +Now we’re able to vectorize all the answers. They are going to form our collection, so we can also put them already into Qdrant, along with the +payloads and identifiers. That will make our dataset easily searchable. -client.DeleteAlias(context.Background(), "production_collection") +```python +answer_response = cohere_client.embed( + texts=dataset["train"]["long_answer"], + model="large", +) +vectors = [ + # Conversion to float is required for Qdrant + list(map(float, vector)) + for vector in answer_response.embeddings +] +ids = [entry["pubid"] for entry in dataset["train"]] +# Filling up Qdrant collection with the embeddings generated by Cohere co.embed API +qdrant_client.upsert( + collection_name="pubmed_qa", + points=rest.Batch( + ids=ids, + vectors=vectors, + payloads=list(dataset["train"]), + ) +) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#switch-collection) Switch collection +And that’s it. Without even setting up a single server on our own, we created a system that might be easily asked a question. I don’t want to call +it serverless, as this term is already taken, but co.embed API with Qdrant Cloud makes everything way easier to maintain. -Multiple alias actions are performed atomically. -For example, you can switch underlying collection with the following command: +### Answering the questions with semantic search — the quality -httpbashpythontypescriptrustjavacsharpgo +It’s high time to query our database with some questions. It might be interesting to somehow measure the quality of the system in general. +In those kinds of problems we typically use *top-k accuracy*. We assume the prediction of the system was correct if the correct answer +was present in the first *k* results. -```http -POST /collections/aliases -{ - "actions": [\ - {\ - "delete_alias": {\ - "alias_name": "production_collection"\ - }\ - },\ - {\ - "create_alias": {\ - "collection_name": "example_collection",\ - "alias_name": "production_collection"\ - }\ - }\ - ] -} +```python +# Finding the position at which Qdrant provided the expected answer for each question. +# That allows to calculate accuracy@k for different values of k. 
+k_max = 10 +answer_positions = [] +for embedding, pubid in tqdm(zip(question_response.embeddings, ids)): + response = qdrant_client.search( + collection_name="pubmed_qa", + query_vector=embedding, + limit=k_max, + ) + answer_ids = [record.id for record in response] + if pubid in answer_ids: + answer_positions.append(answer_ids.index(pubid)) + else: + answer_positions.append(-1) ``` -```bash -curl -X POST http://localhost:6333/collections/aliases \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "actions": [\ - {\ - "delete_alias": {\ - "alias_name": "production_collection"\ - }\ - },\ - {\ - "create_alias": {\ - "collection_name": "example_collection",\ - "alias_name": "production_collection"\ - }\ - }\ - ] -}' - -``` +Saved answer positions allow us to calculate the metric for different *k* values. ```python -client.update_collection_aliases( - change_aliases_operations=[\ - models.DeleteAliasOperation(\ - delete_alias=models.DeleteAlias(alias_name="production_collection")\ - ),\ - models.CreateAliasOperation(\ - create_alias=models.CreateAlias(\ - collection_name="example_collection", alias_name="production_collection"\ - )\ - ),\ - ] -) - +# Prepared answer positions are being used to calculate different values of accuracy@k +for k in range(1, k_max + 1): + correct_answers = len( + list( + filter(lambda x: 0 <= x < k, answer_positions) + ) + ) + print(f"accuracy@{k} =", correct_answers / len(dataset["train"])) ``` -```typescript -client.updateCollectionAliases({ - actions: [\ - {\ - delete_alias: {\ - alias_name: "production_collection",\ - },\ - },\ - {\ - create_alias: {\ - collection_name: "example_collection",\ - alias_name: "production_collection",\ - },\ - },\ - ], -}); +Here are the values of the top-k accuracy for different values of k: -``` +| **metric** | **value** | +|-------------|-----------| +| accuracy@1 | 0.877 | +| accuracy@2 | 0.921 | +| accuracy@3 | 0.942 | +| accuracy@4 | 0.950 | +| accuracy@5 | 0.956 | +| accuracy@6 | 0.960 | +| accuracy@7 | 0.964 | +| accuracy@8 | 0.971 | +| accuracy@9 | 0.976 | +| accuracy@10 | 0.977 | + +It seems like our system worked pretty well even if we consider just the first result, with the lowest distance. +We failed with around 12% of questions. But numbers become better with the higher values of k. It might be also +valuable to check out what questions our system failed to answer, their perfect match and our guesses. -```rust -use qdrant_client::qdrant::CreateAliasBuilder; +We managed to implement a working Question Answering system within just a few lines of code. If you are fine +with the results achieved, then you can start using it right away. Still, if you feel you need a slight improvement, +then fine-tuning the model is a way to go. If you want to check out the full source code, +it is available on [Google Colab](https://colab.research.google.com/drive/1YOYq5PbRhQ_cjhi6k4t1FnWgQm8jZ6hm?usp=sharing). -client.delete_alias("production_collection").await?; -client - .create_alias(CreateAliasBuilder::new( - "example_collection", - "production_collection", - )) - .await?; +<|page-62-lllmstxt|> +A brand-new Qdrant 1.2 release comes packed with a plethora of new features, some of which +were highly requested by our users. If you want to shape the development of the Qdrant vector +database, please [join our Discord community](https://qdrant.to/discord) and let us know +how you use it! 
-``` +## New features -```java -client.deleteAliasAsync("production_collection").get(); -client.createAliasAsync("production_collection", "example_collection").get(); +As usual, a minor version update of Qdrant brings some interesting new features. We love to see your +feedback, and we tried to include the features most requested by our community. -``` +### Product Quantization -```csharp -await client.DeleteAliasAsync("production_collection"); -await client.CreateAliasAsync(aliasName: "production_collection", collectionName: "example_collection"); +The primary focus of Qdrant was always performance. That's why we built it in Rust, but we were +always concerned about making vector search affordable. From the very beginning, Qdrant offered +support for disk-stored collections, as storage space is way cheaper than memory. That's also +why we have introduced the [Scalar Quantization](/articles/scalar-quantization/) mechanism recently, +which makes it possible to reduce the memory requirements by up to four times. -``` +Today, we are bringing a new quantization mechanism to life. A separate article on [Product +Quantization](/documentation/quantization/#product-quantization) will describe that feature in more +detail. In a nutshell, you can **reduce the memory requirements by up to 64 times**! -```go -import "context" +### Optional named vectors -client.DeleteAlias(context.Background(), "production_collection") -client.CreateAlias(context.Background(), "production_collection", "example_collection") +Qdrant has been supporting multiple named vectors per point for quite a long time. Those may have +utterly different dimensionality and distance functions used to calculate similarity. Having multiple +embeddings per item is an essential real-world scenario. For example, you might be encoding textual +and visual data using different models. Or you might be experimenting with different models but +don't want to make your payloads redundant by keeping them in separate collections. -``` +![Optional vectors](/articles_data/qdrant-1.2.x/optional-vectors.png) -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#list-collection-aliases) List collection aliases +However, up to the previous version, we requested that you provide all the vectors for each point. There +have been many requests to allow nullable vectors, as sometimes you cannot generate an embedding or +simply don't want to for reasons we don't need to know. -httpbashpythontypescriptrustjavacsharpgo +### Grouping requests -```http -GET /collections/{collection_name}/aliases +Embeddings are great for capturing the semantics of the documents, but we rarely encode larger pieces +of data into a single vector. Having a summary of a book may sound attractive, but in reality, we +divide it into paragraphs or some different parts to have higher granularity. That pays off when we +perform the semantic search, as we can return the relevant pieces only. That's also how modern tools +like Langchain process the data. The typical way is to encode some smaller parts of the document and +keep the document id as a payload attribute. -``` +![Query without grouping request](/articles_data/qdrant-1.2.x/without-grouping-request.png) -```bash -curl -X GET http://localhost:6333/collections/{collection_name}/aliases +There are cases where we want to find relevant parts, but only up to a specific number of results +per document (for example, only a single one). 
Up till now, we had to implement such a mechanism +on the client side and send several calls to the Qdrant engine. But that's no longer the case. +Qdrant 1.2 provides a mechanism for [grouping requests](/documentation/search/#grouping-api), which +can handle that server-side, within a single call to the database. This mechanism is similar to the +SQL `GROUP BY` clause. -``` +![Query with grouping request](/articles_data/qdrant-1.2.x/with-grouping-request.png) -```python -from qdrant_client import QdrantClient +You are not limited to a single result per document, and you can select how many entries will be +returned. -client = QdrantClient(url="http://localhost:6333") +### Nested filters -client.get_collection_aliases(collection_name="{collection_name}") +Unlike some other vector databases, Qdrant accepts any arbitrary JSON payload, including +arrays, objects, and arrays of objects. You can also [filter the search results using nested +keys](/documentation/filtering/#nested-key), even though arrays (using the `[]` syntax). +Before Qdrant 1.2 it was impossible to express some more complex conditions for the +nested structures. For example, let's assume we have the following payload: + +```json +{ + "country": "Japan", + "cities": [ + { + "name": "Tokyo", + "population": 9.3, + "area": 2194 + }, + { + "name": "Osaka", + "population": 2.7, + "area": 223 + }, + { + "name": "Kyoto", + "population": 1.5, + "area": 827.8 + } + ] +} ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +We want to filter out the results to include the countries with a city with over 2 million citizens +and an area bigger than 500 square kilometers but no more than 1000. There is no such a city in +Japan, looking at our data, but if we wrote the following filter, it would be returned: -const client = new QdrantClient({ host: "localhost", port: 6333 }); +```json +{ + "filter": { + "must": [ + { + "key": "country.cities[].population", + "range": { + "gte": 2 + } + }, + { + "key": "country.cities[].area", + "range": { + "gt": 500, + "lte": 1000 + } + } + ] + }, + "limit": 3 +} +``` -client.getCollectionAliases("{collection_name}"); +Japan would be returned because Tokyo and Osaka match the first criteria, while Kyoto fulfills +the second. But that's not what we wanted to achieve. That's the motivation behind introducing +a new type of nested filter. +```json +{ + "filter": { + "must": [ + { + "nested": { + "key": "country.cities", + "filter": { + "must": [ + { + "key": "population", + "range": { + "gte": 2 + } + }, + { + "key": "area", + "range": { + "gt": 500, + "lte": 1000 + } + } + ] + } + } + } + ] + }, + "limit": 3 +} ``` -```rust -use qdrant_client::Qdrant; +The syntax is consistent with all the other supported filters and enables new possibilities. In +our case, it allows us to express the joined condition on a nested structure and make the results +list empty but correct. -let client = Qdrant::from_url("http://localhost:6334").build()?; +## Important changes -client.list_collection_aliases("{collection_name}").await?; +The latest release focuses not only on the new features but also introduces some changes making +Qdrant even more reliable. -``` +### Recovery mode -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +There has been an issue in memory-constrained environments, such as cloud, happening when users were +pushing massive amounts of data into the service using `wait=false`. 
This data influx resulted in an +overreaching of disk or RAM limits before the Write-Ahead Logging (WAL) was fully applied. This +situation was causing Qdrant to attempt a restart and reapplication of WAL, failing recurrently due +to the same memory constraints and pushing the service into a frustrating crash loop with many +Out-of-Memory errors. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Qdrant 1.2 enters recovery mode, if enabled, when it detects a failure on startup. +That makes the service halt the loading of collection data and commence operations in a partial state. +This state allows for removing collections but doesn't support search or update functions. +**Recovery mode [has to be enabled by user](/documentation/administration/#recovery-mode).** -client.listCollectionAliasesAsync("{collection_name}").get(); +### Appendable mmap -``` +For a long time, segments using mmap storage were `non-appendable` and could only be constructed by +the optimizer. Dynamically adding vectors to the mmap file is fairly complicated and thus not +implemented in Qdrant, but we did our best to implement it in the recent release. If you want +to read more about segments, check out our docs on [vector storage](/documentation/storage/#vector-storage). -```csharp -using Qdrant.Client; +## Security -var client = new QdrantClient("localhost", 6334); +There are two major changes in terms of [security](/documentation/security/): -await client.ListCollectionAliasesAsync("{collection_name}"); +1. **API-key support** - basic authentication with a static API key to prevent unwanted access. Previously + API keys were only supported in [Qdrant Cloud](https://cloud.qdrant.io/). +2. **TLS support** - to use encrypted connections and prevent sniffing/MitM attacks. -``` +## Release notes -```go -import ( - "context" +As usual, [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.2.0) describe all the changes +introduced in the latest version. - "github.com/qdrant/go-client/qdrant" -) +<|page-63-lllmstxt|> +Nowadays, people create a huge number of applications of various types and solve problems in different areas. +Despite such diversity, they have something in common - they need to process data. +Real-world data is a living structure, it grows day by day, changes a lot and becomes harder to work with. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +In some cases, you need to categorize or label your data, which can be a tough problem given its scale. +The process of splitting or labelling is error-prone and these errors can be very costly. +Imagine that you failed to achieve the desired quality of the model due to inaccurate labels. +Worse, your users are faced with a lot of irrelevant items, unable to find what they need and getting annoyed by it. +Thus, you get poor retention, and it directly impacts company revenue. +It is really important to avoid such errors in your data. -client.ListCollectionAliases(context.Background(), "{collection_name}") +## Furniture web-marketplace -``` +Let’s say you work on an online furniture marketplace. 
-### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#list-all-aliases) List all aliases +{{< figure src=https://storage.googleapis.com/demo-dataset-quality-public/article/furniture_marketplace.png caption="Furniture marketplace" >}} -httpbashpythontypescriptrustjavacsharpgo +In this case, to ensure a good user experience, you need to split items into different categories: tables, chairs, beds, etc. +One can arrange all the items manually and spend a lot of money and time on this. +There is also another way: train a classification or similarity model and rely on it. +With both approaches it is difficult to avoid mistakes. +Manual labelling is a tedious task, but it requires concentration. +Once you got distracted or your eyes became blurred mistakes won't keep you waiting. +The model also can be wrong. +You can analyse the most uncertain predictions and fix them, but the other errors will still leak to the site. +There is no silver bullet. You should validate your dataset thoroughly, and you need tools for this. -```http -GET /aliases +When you are sure that there are not many objects placed in the wrong category, they can be considered outliers or anomalies. +Thus, you can train a model or a bunch of models capable of looking for anomalies, e.g. autoencoder and a classifier on it. +However, this is again a resource-intensive task, both in terms of time and manual labour, since labels have to be provided for classification. +On the contrary, if the proportion of out-of-place elements is high enough, outlier search methods are likely to be useless. -``` +### Similarity search -```bash -curl -X GET http://localhost:6333/aliases +The idea behind similarity search is to measure semantic similarity between related parts of the data. +E.g. between category title and item images. +The hypothesis is, that unsuitable items will be less similar. -``` +We can't directly compare text and image data. +For this we need an intermediate representation - embeddings. +Embeddings are just numeric vectors containing semantic information. +We can apply a pre-trained model to our data to produce these vectors. +After embeddings are created, we can measure the distances between them. -```python -from qdrant_client import QdrantClient +Assume we want to search for something other than a single bed in «Single beds» category. -client = QdrantClient(url="http://localhost:6333") +{{< figure src=https://storage.googleapis.com/demo-dataset-quality-public/article/similarity_search.png caption="Similarity search" >}} -client.get_aliases() +One of the possible pipelines would look like this: +- Take the name of the category as an anchor and calculate the anchor embedding. +- Calculate embeddings for images of each object placed into this category. +- Compare obtained anchor and object embeddings. +- Find the furthest. -``` +For instance, we can do it with the [CLIP](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) model. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +{{< figure src=https://storage.googleapis.com/demo-dataset-quality-public/article/category_vs_image_transparent.png caption="Category vs. Image" >}} -const client = new QdrantClient({ host: "localhost", port: 6333 }); +We can also calculate embeddings for titles instead of images, or even for both of them to find more errors. 
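A minimal sketch of the category-vs-image comparison described above, assuming the CLIP checkpoints available in `sentence-transformers` (the multilingual text model linked above plus its paired image encoder); the item photo file names are hypothetical:

```python
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# Text encoder aligned with the CLIP image space (the multilingual model linked above)
text_model = SentenceTransformer("sentence-transformers/clip-ViT-B-32-multilingual-v1")
image_model = SentenceTransformer("clip-ViT-B-32")

# The category title serves as the anchor
anchor = text_model.encode("Single beds", convert_to_tensor=True)

item_images = ["item_001.jpg", "item_002.jpg", "item_003.jpg"]  # hypothetical photos
image_embeddings = image_model.encode(
    [Image.open(path) for path in item_images], convert_to_tensor=True
)

# The lower the similarity to the anchor, the more suspicious the item
scores = util.cos_sim(anchor, image_embeddings)[0]
for path, score in sorted(zip(item_images, scores.tolist()), key=lambda pair: pair[1]):
    print(path, round(score, 3))
```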
-client.getAliases(); +{{< figure src=https://storage.googleapis.com/demo-dataset-quality-public/article/category_vs_name_and_image_transparent.png caption="Category vs. Title and Image" >}} -``` +As you can see, different approaches can find new errors or the same ones. +Stacking several techniques or even the same techniques with different models may provide better coverage. +Hint: Caching embeddings for the same models and reusing them among different methods can significantly speed up your lookup. -```rust -use qdrant_client::Qdrant; +### Diversity search -let client = Qdrant::from_url("http://localhost:6334").build()?; +Since pre-trained models have only general knowledge about the data, they can still leave some misplaced items undetected. +You might find yourself in a situation when the model focuses on non-important features, selects a lot of irrelevant elements, and fails to find genuine errors. +To mitigate this issue, you can perform a diversity search. -client.list_aliases().await?; +Diversity search is a method for finding the most distinctive examples in the data. +As similarity search, it also operates on embeddings and measures the distances between them. +The difference lies in deciding which point should be extracted next. -``` +Let's imagine how to get 3 points with similarity search and then with diversity search. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +Similarity: +1. Calculate distance matrix +2. Choose your anchor +3. Get a vector corresponding to the distances from the selected anchor from the distance matrix +4. Sort fetched vector +5. Get top-3 embeddings + +Diversity: +1. Calculate distance matrix +2. Initialize starting point (randomly or according to the certain conditions) +3. Get a distance vector for the selected starting point from the distance matrix +4. Find the furthest point +5. Get a distance vector for the new point +6. Find the furthest point from all of already fetched points -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +{{< figure src=https://storage.googleapis.com/demo-dataset-quality-public/article/diversity_transparent.png caption="Diversity search" >}} -client.listAliasesAsync().get(); +Diversity search utilizes the very same embeddings, and you can reuse them. +If your data is huge and does not fit into memory, vector search engines like [Qdrant](https://github.com/qdrant/qdrant) might be helpful. -``` +Although the described methods can be used independently. But they are simple to combine and improve detection capabilities. +If the quality remains insufficient, you can fine-tune the models using a similarity learning approach (e.g. with [Quaterion](https://quaterion.qdrant.tech) both to provide a better representation of your data and pull apart dissimilar objects in space. -```csharp -using Qdrant.Client; +## Conclusion -var client = new QdrantClient("localhost", 6334); +In this article, we enlightened distance-based methods to find errors in categorized datasets. +Showed how to find incorrectly placed items in the furniture web store. +I hope these methods will help you catch sneaky samples leaked into the wrong categories in your data, and make your users` experience more enjoyable. -await client.ListAliasesAsync(); +Poke the [demo](https://dataset-quality.qdrant.tech). 
-``` +Stay tuned :) -```go -import ( - "context" +<|page-64-lllmstxt|> +# Question-answering system with Similarity Learning and Quaterion - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Many problems in modern machine learning are approached as classification tasks. +Some are the classification tasks by design, but others are artificially transformed into such. +And when you try to apply an approach, which does not naturally fit your problem, you risk coming up with over-complicated or bulky solutions. +In some cases, you would even get worse performance. -client.ListAliases(context.Background()) +Imagine that you got a new task and decided to solve it with a good old classification approach. +Firstly, you will need labeled data. +If it came on a plate with the task, you're lucky, but if it didn't, you might need to label it manually. +And I guess you are already familiar with how painful it might be. -``` +Assuming you somehow labeled all required data and trained a model. +It shows good performance - well done! +But a day later, your manager told you about a bunch of new data with new classes, which your model has to handle. +You repeat your pipeline. +Then, two days later, you've been reached out one more time. +You need to update the model again, and again, and again. +Sounds tedious and expensive for me, does not it for you? + +## Automating customer support -### [Anchor](https://qdrant.tech/documentation/concepts/collections/\#list-all-collections) List all collections +Let's now take a look at the concrete example. There is a pressing problem with automating customer support. +The service should be capable of answering user questions and retrieving relevant articles from the documentation without any human involvement. -httpbashpythontypescriptrustjavacsharpgo +With the classification approach, you need to build a hierarchy of classification models to determine the question's topic. +You have to collect and label a whole custom dataset of your private documentation topics to train that. +And then, each time you have a new topic in your documentation, you have to re-train the whole pile of classifiers with additionally labeled data. +Can we make it easier? + +## Similarity option -```http -GET /collections +One of the possible alternatives is Similarity Learning, which we are going to discuss in this article. +It suggests getting rid of the classes and making decisions based on the similarity between objects instead. +To do it quickly, we would need some intermediate representation - embeddings. +Embeddings are high-dimensional vectors with semantic information accumulated in them. -``` +As embeddings are vectors, one can apply a simple function to calculate the similarity score between them, for example, cosine or euclidean distance. +So with similarity learning, all we need to do is provide pairs of correct questions and answers. +And then, the model will learn to distinguish proper answers by the similarity of embeddings. -```bash -curl -X GET http://localhost:6333/collections +>If you want to learn more about similarity learning and applications, check out this [article](/documentation/tutorials/neural-search/) which might be an asset. -``` +## Let's build -```python -from qdrant_client import QdrantClient +Similarity learning approach seems a lot simpler than classification in this case, and if you have some +doubts on your mind, let me dispel them. 
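Before we get to the data, here is a minimal illustration of the idea above, assuming a pre-trained `sentence-transformers` model (the same `all-MiniLM-L6-v2` we pick below) and a couple of made-up strings:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

question = "how do i reset my account password?"
answers = [
    "you can reset your password on the account settings page",   # relevant
    "our data centers are located in three geographic regions",   # unrelated
]

# Cosine similarity between the question embedding and each answer embedding;
# the relevant answer is expected to receive the higher score
embeddings = model.encode([question] + answers, convert_to_tensor=True)
print(util.cos_sim(embeddings[0], embeddings[1:]))
```

With that intuition in place, let's look at the data.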
-client = QdrantClient(url="http://localhost:6333")
+As I had no resource with an exhaustive F.A.Q. that might serve as a dataset, I scraped one from the sites of popular cloud providers.
+The dataset consists of just 8.5k question-answer pairs; you can take a closer look at it [here](https://github.com/qdrant/demo-cloud-faq).

-client.get_collections()
+Once we have the data, we need to obtain embeddings for it.
+Representing texts as embeddings is not a novel technique in NLP.
+There are plenty of algorithms and models to calculate them.
+You may have heard of Word2Vec, GloVe, ELMo or BERT; all these models can provide text embeddings.

-```
+However, it is better to produce embeddings with a model trained for semantic similarity tasks.
+For instance, we can find such models at [sentence-transformers](https://www.sbert.net/docs/pretrained_models.html).
+The authors claim that `all-mpnet-base-v2` provides the best quality, but let's pick `all-MiniLM-L6-v2` for our tutorial,
+as it is 5x faster and still offers good results.

-```typescript
-import { QdrantClient } from "@qdrant/js-client-rest";
+Having all this, we can test our approach. We won't take the whole dataset at the moment, but only
+a part of it. To measure the model's performance we will use two metrics -
+[mean reciprocal rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) and
+[precision@1](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_k).
+We have a [ready script](https://github.com/qdrant/demo-cloud-faq/blob/experiments/faq/baseline.py)
+for this experiment, so let's just launch it now.

-const client = new QdrantClient({ host: "localhost", port: 6333 });
-client.getCollections(); +| precision@1 | reciprocal_rank | +|-------------|-----------------| +| 0.564 | 0.663 | -``` +
-```rust -use qdrant_client::Qdrant; +That's already quite decent quality, but maybe we can do better? -let client = Qdrant::from_url("http://localhost:6334").build()?; +## Improving results with fine-tuning -client.list_collections().await?; +Actually, we can! Model we used has a good natural language understanding, but it has never seen +our data. An approach called `fine-tuning` might be helpful to overcome this issue. With +fine-tuning you don't need to design a task-specific architecture, but take a model pre-trained on +another task, apply a couple of layers on top and train its parameters. -``` +Sounds good, but as similarity learning is not as common as classification, it might be a bit inconvenient to fine-tune a model with traditional tools. +For this reason we will use [Quaterion](https://github.com/qdrant/quaterion) - a framework for fine-tuning similarity learning models. +Let's see how we can train models with it -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +First, create our project and call it `faq`. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +> All project dependencies, utils scripts not covered in the tutorial can be found in the +> [repository](https://github.com/qdrant/demo-cloud-faq/tree/tutorial). -client.listCollectionsAsync().get(); +### Configure training -``` +The main entity in Quaterion is [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html). +This class makes model's building process fast and convenient. -```csharp -using Qdrant.Client; +`TrainableModel` is a wrapper around [pytorch_lightning.LightningModule](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html). -var client = new QdrantClient("localhost", 6334); +[Lightning](https://www.pytorchlightning.ai/) handles all the training process complexities, like training loop, device managing, etc. and saves user from a necessity to implement all this routine manually. +Also Lightning's modularity is worth to be mentioned. +It improves separation of responsibilities, makes code more readable, robust and easy to write. +All these features make Pytorch Lightning a perfect training backend for Quaterion. -await client.ListCollectionsAsync(); +To use `TrainableModel` you need to inherit your model class from it. +The same way you would use `LightningModule` in pure `pytorch_lightning`. +Mandatory methods are `configure_loss`, `configure_encoders`, `configure_head`, +`configure_optimizers`. -``` +The majority of mentioned methods are quite easy to implement, you'll probably just need a couple of +imports to do that. But `configure_encoders` requires some code:) -```go -import ( - "context" +Let's create a `model.py` with model's template and a placeholder for `configure_encoders` +for the moment. 
- "github.com/qdrant/go-client/qdrant" -) +```python +from typing import Union, Dict, Optional -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +from torch.optim import Adam + +from quaterion import TrainableModel +from quaterion.loss import MultipleNegativesRankingLoss, SimilarityLoss +from quaterion_models.encoders import Encoder +from quaterion_models.heads import EncoderHead +from quaterion_models.heads.skip_connection_head import SkipConnectionHead -client.ListCollections(context.Background()) +class FAQModel(TrainableModel): + def __init__(self, lr=10e-5, *args, **kwargs): + self.lr = lr + super().__init__(*args, **kwargs) + + def configure_optimizers(self): + return Adam(self.model.parameters(), lr=self.lr) + + def configure_loss(self) -> SimilarityLoss: + return MultipleNegativesRankingLoss(symmetric=True) + + def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: + ... # ToDo + + def configure_head(self, input_embedding_size: int) -> EncoderHead: + return SkipConnectionHead(input_embedding_size) ``` -##### Was this page useful? +- `configure_optimizers` is a method provided by Lightning. An eagle-eye of you could notice +mysterious `self.model`, it is actually a [SimilarityModel](https://quaterion-models.qdrant.tech/quaterion_models.model.html) instance. We will cover it later. +- `configure_loss` is a loss function to be used during training. You can choose a ready-made implementation from Quaterion. +However, since Quaterion's purpose is not to cover all possible losses, or other entities and +features of similarity learning, but to provide a convenient framework to build and use such models, +there might not be a desired loss. In this case it is possible to use [PytorchMetricLearningWrapper](https://quaterion.qdrant.tech/quaterion.loss.extras.pytorch_metric_learning_wrapper.html) +to bring required loss from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/) library, which has a rich collection of losses. +You can also implement a custom loss yourself. +- `configure_head` - model built via Quaterion is a combination of encoders and a top layer - head. +As with losses, some head implementations are provided. They can be found at [quaterion_models.heads](https://quaterion-models.qdrant.tech/quaterion_models.heads.html). + +At our example we use [MultipleNegativesRankingLoss](https://quaterion.qdrant.tech/quaterion.loss.multiple_negatives_ranking_loss.html). +This loss is especially good for training retrieval tasks. +It assumes that we pass only positive pairs (similar objects) and considers all other objects as negative examples. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +`MultipleNegativesRankingLoss` use cosine to measure distance under the hood, but it is a configurable parameter. +Quaterion provides implementation for other distances as well. You can find available ones at [quaterion.distances](https://quaterion.qdrant.tech/quaterion.distances.html). -Thank you for your feedback! 🙏 +Now we can come back to `configure_encoders`:) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/collections.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+### Configure Encoder -On this page: +The encoder task is to convert objects into embeddings. +They usually take advantage of some pre-trained models, in our case `all-MiniLM-L6-v2` from `sentence-transformers`. +In order to use it in Quaterion, we need to create a wrapper inherited from the [Encoder](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html) class. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/collections.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Let's create our encoder in `encoder.py` -× +```python +import os -[Powered by](https://qdrant.tech/) +from torch import Tensor, nn +from sentence_transformers.models import Transformer, Pooling -<|page-39-lllmstxt|> -## hybrid-search-fastembed -- [Documentation](https://qdrant.tech/documentation/) -- [Beginner tutorials](https://qdrant.tech/documentation/beginner-tutorials/) -- Setup Hybrid Search with FastEmbed +from quaterion_models.encoders import Encoder +from quaterion_models.types import TensorInterchange, CollateFnType -# [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#build-a-hybrid-search-service-with-fastembed-and-qdrant) Build a Hybrid Search Service with FastEmbed and Qdrant -| Time: 20 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/qdrant_demo/) | | -| --- | --- | --- | --- | +class FAQEncoder(Encoder): + def __init__(self, transformer, pooling): + super().__init__() + self.transformer = transformer + self.pooling = pooling + self.encoder = nn.Sequential(self.transformer, self.pooling) + + @property + def trainable(self) -> bool: + # Defines if we want to train encoder itself, or head layer only + return False + + @property + def embedding_size(self) -> int: + return self.transformer.get_word_embedding_dimension() + + def forward(self, batch: TensorInterchange) -> Tensor: + return self.encoder(batch)["sentence_embedding"] + + def get_collate_fn(self) -> CollateFnType: + return self.transformer.tokenize + + @staticmethod + def _transformer_path(path: str): + return os.path.join(path, "transformer") + + @staticmethod + def _pooling_path(path: str): + return os.path.join(path, "pooling") + + def save(self, output_path: str): + transformer_path = self._transformer_path(output_path) + os.makedirs(transformer_path, exist_ok=True) + pooling_path = self._pooling_path(output_path) + os.makedirs(pooling_path, exist_ok=True) + self.transformer.save(transformer_path) + self.pooling.save(pooling_path) + + @classmethod + def load(cls, input_path: str) -> Encoder: + transformer = Transformer.load(cls._transformer_path(input_path)) + pooling = Pooling.load(cls._pooling_path(input_path)) + return cls(transformer=transformer, pooling=pooling) +``` -This tutorial shows you how to build and deploy your own hybrid search service to look through descriptions of companies from [startups-list.com](https://www.startups-list.com/) and pick the most similar ones to your query. -The website contains the company names, descriptions, locations, and a picture for each entry. +As you can notice, there are more methods implemented, then we've already discussed. Let's go +through them now! +- In `__init__` we register our pre-trained layers, similar as you do in [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) descendant. 
-As we have already written on our [blog](https://qdrant.tech/articles/hybrid-search/), there is no single definition of hybrid search. -In this tutorial we are covering the case with a combination of dense and [sparse embeddings](https://qdrant.tech/articles/sparse-vectors/). -The former ones refer to the embeddings generated by such well-known neural networks as BERT, while the latter ones are more related to a traditional full-text search approach. +- `trainable` defines whether current `Encoder` layers should be updated during training or not. If `trainable=False`, then all layers will be frozen. -Our hybrid search service will use [Fastembed](https://github.com/qdrant/fastembed) package to generate embeddings of text descriptions and [FastAPI](https://fastapi.tiangolo.com/) to serve the search API. -Fastembed natively integrates with Qdrant client, so you can easily upload the data into Qdrant and perform search queries. +- `embedding_size` is a size of encoder's output, it is required for proper `head` configuration. + +- `get_collate_fn` is a tricky one. Here you should return a method which prepares a batch of raw +data into the input, suitable for the encoder. If `get_collate_fn` is not overridden, then the [default_collate](https://pytorch.org/docs/stable/data.html#torch.utils.data.default_collate) will be used. -![Hybrid Search Schema](https://qdrant.tech/documentation/tutorials/hybrid-search-with-fastembed/hybrid-search-schema.png) -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#workflow) Workflow +The remaining methods are considered self-describing. -To create a hybrid search service, you will need to transform your raw data and then create a search function to manipulate it. -First, you will 1) download and prepare a sample dataset using a modified version of the BERT ML model. Then, you will 2) load the data into Qdrant, 3) create a hybrid search API and 4) serve it using FastAPI. +As our encoder is ready, we now are able to fill `configure_encoders`. +Just insert the following code into `model.py`: -![Hybrid Search Workflow](https://qdrant.tech/docs/workflow-neural-search.png) +```python +... +from sentence_transformers import SentenceTransformer +from sentence_transformers.models import Transformer, Pooling +from faq.encoder import FAQEncoder -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#prerequisites) Prerequisites +class FAQModel(TrainableModel): + ... + def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: + pre_trained_model = SentenceTransformer("all-MiniLM-L6-v2") + transformer: Transformer = pre_trained_model[0] + pooling: Pooling = pre_trained_model[1] + encoder = FAQEncoder(transformer, pooling) + return encoder +``` -To complete this tutorial, you will need: +### Data preparation -- Docker - The easiest way to use Qdrant is to run a pre-built Docker image. -- [Raw parsed data](https://storage.googleapis.com/generall-shared-data/startups_demo.json) from startups-list.com. -- Python version >=3.9 +Okay, we have raw data and a trainable model. But we don't know yet how to feed this data to our model. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#prepare-sample-dataset) Prepare sample dataset -To conduct a hybrid search on startup descriptions, you must first encode the description data into vectors. -Fastembed integration into qdrant client combines encoding and uploading into a single step. 
+Currently, Quaterion takes two types of similarity representation - pairs and groups. -It also takes care of batching and parallelization, so you don’t have to worry about it. +The groups format assumes that all objects split into groups of similar objects. All objects inside +one group are similar, and all other objects outside this group considered dissimilar to them. -Let’s start by downloading the data and installing the necessary packages. +But in the case of pairs, we can only assume similarity between explicitly specified pairs of objects. -1. First you need to download the dataset. +We can apply any of the approaches with our data, but pairs one seems more intuitive. -```bash -wget https://storage.googleapis.com/generall-shared-data/startups_demo.json +The format in which Similarity is represented determines which loss can be used. +For example, _ContrastiveLoss_ and _MultipleNegativesRankingLoss_ works with pairs format. +[SimilarityPairSample](https://quaterion.qdrant.tech/quaterion.dataset.similarity_samples.html#quaterion.dataset.similarity_samples.SimilarityPairSample) could be used to represent pairs. +Let's take a look at it: + +```python +@dataclass +class SimilarityPairSample: + obj_a: Any + obj_b: Any + score: float = 1.0 + subgroup: int = 0 ``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#run-qdrant-in-docker) Run Qdrant in Docker +Here might be some questions: what `score` and `subgroup` are? -Next, you need to manage all of your data using a vector engine. Qdrant lets you store, update or delete created vectors. Most importantly, it lets you search for the nearest vectors via a convenient API. +Well, `score` is a measure of expected samples similarity. +If you only need to specify if two samples are similar or not, you can use `1.0` and `0.0` respectively. -> **Note:** Before you begin, create a project directory and a virtual python environment in it. +`subgroups` parameter is required for more granular description of what negative examples could be. +By default, all pairs belong the subgroup zero. +That means that we would need to specify all negative examples manually. +But in most cases, we can avoid this by enabling different subgroups. +All objects from different subgroups will be considered as negative examples in loss, and thus it +provides a way to set negative examples implicitly. -1. Download the Qdrant image from DockerHub. -```bash -docker pull qdrant/qdrant +With this knowledge, we now can create our `Dataset` class in `dataset.py` to feed our model: -``` +```python +import json +from typing import List, Dict -2. Start Qdrant inside of Docker. 
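# Each line of the .jsonl file is assumed to be a single JSON object of the form
# {"question": "...", "answer": "..."}, i.e. exactly the keys read in __getitem__ below.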
+from torch.utils.data import Dataset +from quaterion.dataset.similarity_samples import SimilarityPairSample -```bash -docker run -p 6333:6333 \ - -v $(pwd)/qdrant_storage:/qdrant/storage \ - qdrant/qdrant +class FAQDataset(Dataset): + """Dataset class to process .jsonl files with FAQ from popular cloud providers.""" + + def __init__(self, dataset_path): + self.dataset: List[Dict[str, str]] = self.read_dataset(dataset_path) + + def __getitem__(self, index) -> SimilarityPairSample: + line = self.dataset[index] + question = line["question"] + # All questions have a unique subgroup + # Meaning that all other answers are considered negative pairs + subgroup = hash(question) + return SimilarityPairSample( + obj_a=question, + obj_b=line["answer"], + score=1, + subgroup=subgroup + ) + + def __len__(self): + return len(self.dataset) + + @staticmethod + def read_dataset(dataset_path) -> List[Dict[str, str]]: + """Read jsonl-file into a memory.""" + with open(dataset_path, "r") as fd: + return [json.loads(json_line) for json_line in fd] ``` -You should see output like this +We assigned a unique subgroup for each question, so all other objects which have different question will be considered as negative examples. -```text -... -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 +### Evaluation Metric -``` +We still haven't added any metrics to the model. For this purpose Quaterion provides `configure_metrics`. +We just need to override it and attach interested metrics. -Test the service by going to [http://localhost:6333/](http://localhost:6333/). You should see the Qdrant version info in your browser. +Quaterion has some popular retrieval metrics implemented - such as _precision @ k_ or _mean reciprocal rank_. +They can be found in [quaterion.eval](https://quaterion.qdrant.tech/quaterion.eval.html) package. +But there are just a few metrics, it is assumed that desirable ones will be made by user or taken from another libraries. +You will probably need to inherit from `PairMetric` or `GroupMetric` to implement a new one. -All data uploaded to Qdrant is saved inside the `./qdrant_storage` directory and will be persisted even if you recreate the container. +In `configure_metrics` we need to return a list of `AttachedMetric`. +They are just wrappers around metric instances and helps to log metrics more easily. +Under the hood `logging` is handled by `pytorch-lightning`. +You can configure it as you want - pass required parameters as keyword arguments to `AttachedMetric`. +For additional info visit [logging documentation page](https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html) -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#upload-data-to-qdrant) Upload data to Qdrant +Let's add mentioned metrics for our `FAQModel`. +Add this code to `model.py`: -1. Install the official Python client to best interact with Qdrant. +```python +... +from quaterion.eval.pair import RetrievalPrecision, RetrievalReciprocalRank +from quaterion.eval.attached_metric import AttachedMetric -```bash -pip install "qdrant-client[fastembed]>=1.14.2" +class FAQModel(TrainableModel): + def __init__(self, lr=10e-5, *args, **kwargs): + self.lr = lr + super().__init__(*args, **kwargs) + + ... 
+ def configure_metrics(self): + return [ + AttachedMetric( + "RetrievalPrecision", + RetrievalPrecision(k=1), + prog_bar=True, + on_epoch=True, + ), + AttachedMetric( + "RetrievalReciprocalRank", + RetrievalReciprocalRank(), + prog_bar=True, + on_epoch=True + ), + ] ``` -> **Note:** This tutorial requires fastembed of version >=0.6.1. - -At this point, you should have startup records in the `startups_demo.json` file and Qdrant running on a local machine. +### Fast training with Cache -Now you need to write a script to upload all startup data and vectors into the search engine. +Quaterion has one more cherry on top of the cake when it comes to non-trainable encoders. +If encoders are frozen, they are deterministic and emit the exact embeddings for the same input data on each epoch. +It provides a way to avoid repeated calculations and reduce training time. +For this purpose Quaterion has a cache functionality. -2. Create a client object for Qdrant. -```python -# Import client library -from qdrant_client import QdrantClient, models +Before training starts, the cache runs one epoch to pre-calculate all embeddings with frozen encoders and then store them on a device you chose (currently CPU or GPU). +Everything you need is to define which encoders are trainable or not and set cache settings. +And that's it: everything else Quaterion will handle for you. -client = QdrantClient(url="http://localhost:6333") +To configure cache you need to override `configure_cache` method in `TrainableModel`. +This method should return an instance of [CacheConfig](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheConfig). +Let's add cache to our model: +```python +... +from quaterion.train.cache import CacheConfig, CacheType +... +class FAQModel(TrainableModel): + ... + def configure_caches(self) -> Optional[CacheConfig]: + return CacheConfig(CacheType.AUTO) + ... ``` -3. Choose models to encode your data and prepare collections. - -In this tutorial, we will be using two pre-trained models to compute dense and sparse vectors correspondingly -The models are: `sentence-transformers/all-MiniLM-L6-v2` and `prithivida/Splade_PP_en_v1`. -As soon as the choice is made, we need to configure a collection in Qdrant. - -```python -dense_vector_name = "dense" -sparse_vector_name = "sparse" -dense_model_name = "sentence-transformers/all-MiniLM-L6-v2" -sparse_model_name = "prithivida/Splade_PP_en_v1" -if not client.collection_exists("startups"): - client.create_collection( - collection_name="startups", - vectors_config={ - dense_vector_name: models.VectorParams( - size=client.get_embedding_size(dense_model_name), - distance=models.Distance.COSINE - ) - }, # size and distance are model dependent - sparse_vectors_config={sparse_vector_name: models.SparseVectorParams()}, - ) +[CacheType](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheType) determines how the cache will be stored in memory. -``` -Qdrant requires vectors to have their own names and configurations. -Parameters `size` and `distance` are mandatory, however, you can additionaly specify extended configuration for your vectors, like `quantization_config` or `hnsw_config`. +### Training -4. Read data from the file. +Now we need to combine all our code together in `train.py` and launch a training process. 
```python -import json +import torch +import pytorch_lightning as pl -payload_path = "startups_demo.json" -documents = [] -metadata = [] +from quaterion import Quaterion +from quaterion.dataset import PairsSimilarityDataLoader -with open(payload_path) as fd: - for line in fd: - obj = json.loads(line) - description = obj["description"] - dense_document = models.Document(text=description, model=dense_model_name) - sparse_document = models.Document(text=description, model=sparse_model_name) - documents.append( - { - dense_vector_name: dense_document, - sparse_vector_name: sparse_document, - } - ) - metadata.append(obj) +from faq.dataset import FAQDataset + +def train(model, train_dataset_path, val_dataset_path, params): + use_gpu = params.get("cuda", torch.cuda.is_available()) + + trainer = pl.Trainer( + min_epochs=params.get("min_epochs", 1), + max_epochs=params.get("max_epochs", 500), + auto_select_gpus=use_gpu, + log_every_n_steps=params.get("log_every_n_steps", 1), + gpus=int(use_gpu), + ) + train_dataset = FAQDataset(train_dataset_path) + val_dataset = FAQDataset(val_dataset_path) + train_dataloader = PairsSimilarityDataLoader( + train_dataset, batch_size=1024 + ) + val_dataloader = PairsSimilarityDataLoader( + val_dataset, batch_size=1024 + ) + + Quaterion.fit(model, trainer, train_dataloader, val_dataloader) + +if __name__ == "__main__": + import os + from pytorch_lightning import seed_everything + from faq.model import FAQModel + from faq.config import DATA_DIR, ROOT_DIR + seed_everything(42, workers=True) + faq_model = FAQModel() + train_path = os.path.join( + DATA_DIR, + "train_cloud_faq_dataset.jsonl" + ) + val_path = os.path.join( + DATA_DIR, + "val_cloud_faq_dataset.jsonl" + ) + train(faq_model, train_path, val_path, {}) + faq_model.save_servable(os.path.join(ROOT_DIR, "servable")) ``` -In this block of code, we read data from `startups_demo.json` file and split it into two list: `documents` and `metadata`. -Documents are models with descriptions of startups and model names to embed data. Metadata is payload associated with each startup, such as the name, location, and picture. -We will use `documents` to encode the data into vectors. +Here are a couple of unseen classes, `PairsSimilarityDataLoader`, which is a native dataloader for +`SimilarityPairSample` objects, and `Quaterion` is an entry point to the training process. -6. Encode and upload data. +### Dataset-wise evaluation -```python - client.upload_collection( - collection_name="startups", - vectors=tqdm.tqdm(documents), - payload=metadata, - parallel=4, # Use 4 CPU cores to encode data. - # This will spawn a model per process, which might be memory expensive - # Make sure that your system does not use swap, and reduce the amount - # # of processes if it does. - # Otherwise, it might significantly slow down the process. - # Requires wrapping code into if __name__ == '__main__' block - ) +Up to this moment we've calculated only batch-wise metrics. +Such metrics can fluctuate a lot depending on a batch size and can be misleading. +It might be helpful if we can calculate a metric on a whole dataset or some large part of it. +Raw data may consume a huge amount of memory, and usually we can't fit it into one batch. +Embeddings, on the contrary, most probably will consume less. -``` +That's where `Evaluator` enters the scene. +At first, having dataset of `SimilaritySample`, `Evaluator` encodes it via `SimilarityModel` and compute corresponding labels. 
+After that, it calculates a metric value, which could be more representative than batch-wise ones. -Upload processed data +However, you still can find yourself in a situation where evaluation becomes too slow, or there is no enough space left in the memory. +A bottleneck might be a squared distance matrix, which one needs to calculate to compute a retrieval metric. +You can mitigate this bottleneck by calculating a rectangle matrix with reduced size. +`Evaluator` accepts `sampler` with a sample size to select only specified amount of embeddings. +If sample size is not specified, evaluation is performed on all embeddings. -Download and unpack the processed data from [here](https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz) or use the following script: +Fewer words! Let's add evaluator to our code and finish `train.py`. + +```python +... +from quaterion.eval.evaluator import Evaluator +from quaterion.eval.pair import RetrievalReciprocalRank, RetrievalPrecision +from quaterion.eval.samplers.pair_sampler import PairSampler +... -```bash -wget https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz -tar -xvf startups_hybrid_search_processed_40k.tar.gz +def train(model, train_dataset_path, val_dataset_path, params): + ... + metrics = { + "rrk": RetrievalReciprocalRank(), + "rp@1": RetrievalPrecision(k=1) + } + sampler = PairSampler() + evaluator = Evaluator(metrics, sampler) + results = Quaterion.evaluate(evaluator, val_dataset, model.model) + print(f"results: {results}") ``` -Then you can upload the data to Qdrant. - -```python -import json -import numpy as np +### Train Results -def named_vectors( - vectors: list[float], - sparse_vectors: list[models.SparseVector] -) -> dict: - for vector, sparse_vector in zip(vectors, sparse_vectors): - yield { - dense_vector_name: vector, - sparse_vector_name: models.SparseVector(**sparse_vector), - } +At this point we can train our model, I do it via `python3 -m faq.train`. -with open("dense_vectors.npy", "rb") as f: - vectors = np.load(f) -with open("sparse_vectors.json", "r") as f: - sparse_vectors = json.load(f) +
-with open("payload.json", "r") as f: - payload = json.load(f) +|epoch|train_precision@1|train_reciprocal_rank|val_precision@1|val_reciprocal_rank| +|-----|-----------------|---------------------|---------------|-------------------| +|0 |0.650 |0.732 |0.659 |0.741 | +|100 |0.665 |0.746 |0.673 |0.754 | +|200 |0.677 |0.757 |0.682 |0.763 | +|300 |0.686 |0.765 |0.688 |0.768 | +|400 |0.695 |0.772 |0.694 |0.773 | +|500 |0.701 |0.778 |0.700 |0.777 | -client.upload_collection( - "startups", - vectors=named_vectors(vectors, sparse_vectors), - payload=payload -) +
-``` +Results obtained with `Evaluator`: -The `upload_collection` method will encode all documents and upload them to Qdrant. +
-The `parallel` parameter enables data-parallelism instead of built-in ONNX parallelism. +| precision@1 | reciprocal_rank | +|-------------|-----------------| +| 0.577 | 0.675 | -Additionally, you can specify ids for each document, if you want to use them later to update or delete documents. -If you don’t specify ids, they will be generated automatically. +
-You can monitor the progress of the encoding by passing tqdm progress bar to the `upload_collection` method. +After training all the metrics have been increased. +And this training was done in just 3 minutes on a single gpu! +There is no overfitting and the results are steadily growing, although I think there is still room for improvement and experimentation. -```python -from tqdm import tqdm +## Model serving -client.upload_collection( - collection_name="startups", - vectors=documents, - payload=metadata, - ids=tqdm(range(len(documents))), -) +As you could already notice, Quaterion framework is split into two separate libraries: `quaterion` +and [quaterion-models](https://quaterion-models.qdrant.tech/). +The former one contains training related stuff like losses, cache, `pytorch-lightning` dependency, etc. +While the latter one contains only modules necessary for serving: encoders, heads and `SimilarityModel` itself. -``` +The reasons for this separation are: -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#build-the-search-api) Build the search API +- less amount of entities you need to operate in a production environment +- reduced memory footprint -Now that all the preparations are complete, let’s start building a neural search class. +It is essential to isolate training dependencies from the serving environment cause the training step is usually more complicated. +Training dependencies are quickly going out of control, significantly slowing down the deployment and serving timings and increasing unnecessary resource usage. -In order to process incoming requests, the hybrid search class will need 3 things: 1) models to convert the query into a vector, 2) the Qdrant client to perform search queries, 3) fusion function to re-rank dense and sparse search results. -Qdrant supports 2 fusion functions for combining the results: [reciprocal rank fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) and [distribution based score fusion](https://qdrant.tech/documentation/concepts/hybrid-queries/?q=distribution+based+sc#:~:text=Distribution%2DBased%20Score%20Fusion) +The very last row of `train.py` - `faq_model.save_servable(...)` saves encoders and the model in a fashion that eliminates all Quaterion dependencies and stores only the most necessary data to run a model in production. -1. Create a file named `hybrid_searcher.py` and specify the following. 
+In `serve.py` we load and encode all the answers and then look for the closest vectors to the questions we are interested in: ```python -from qdrant_client import QdrantClient, models +import os +import json -class HybridSearcher: - DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2" - SPARSE_MODEL = "prithivida/Splade_PP_en_v1" +import torch +from quaterion_models.model import SimilarityModel +from quaterion.distances import Distance - def __init__(self, collection_name): - self.collection_name = collection_name - self.qdrant_client = QdrantClient() +from faq.config import DATA_DIR, ROOT_DIR + +if __name__ == "__main__": + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model = SimilarityModel.load(os.path.join(ROOT_DIR, "servable")) + model.to(device) + dataset_path = os.path.join(DATA_DIR, "val_cloud_faq_dataset.jsonl") + + with open(dataset_path) as fd: + answers = [json.loads(json_line)["answer"] for json_line in fd] + + # everything is ready, let's encode our answers + answer_embeddings = model.encode(answers, to_numpy=False) + + # Some prepared questions and answers to ensure that our model works as intended + questions = [ + "what is the pricing of aws lambda functions powered by aws graviton2 processors?", + "can i run a cluster or job for a long time?", + "what is the dell open manage system administrator suite (omsa)?", + "what are the differences between the event streams standard and event streams enterprise plans?", + ] + ground_truth_answers = [ + "aws lambda functions powered by aws graviton2 processors are 20% cheaper compared to x86-based lambda functions", + "yes, you can run a cluster for as long as is required", + "omsa enables you to perform certain hardware configuration tasks and to monitor the hardware directly via the operating system", + "to find out more information about the different event streams plans, see choosing your plan", + ] + + # encode our questions and find the closest to them answer embeddings + question_embeddings = model.encode(questions, to_numpy=False) + distance = Distance.get_by_name(Distance.COSINE) + question_answers_distances = distance.distance_matrix( + question_embeddings, answer_embeddings + ) + answers_indices = question_answers_distances.min(dim=1)[1] + for q_ind, a_ind in enumerate(answers_indices): + print("Q:", questions[q_ind]) + print("A:", answers[a_ind], end="\n\n") + assert ( + answers[a_ind] == ground_truth_answers[q_ind] + ), f"<{answers[a_ind]}> != <{ground_truth_answers[q_ind]}>" ``` -2. Write the search function. +We stored our collection of answer embeddings in memory and perform search directly in Python. +For production purposes, it's better to use some sort of vector search engine like [Qdrant](https://github.com/qdrant/qdrant). +It provides durability, speed boost, and a bunch of other features. 
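+As a rough illustration of that last point, the in-memory search above could be swapped for Qdrant along the following lines. This is a sketch, not part of the original demo: it assumes a locally running Qdrant instance, reuses `model`, `answers` and `answer_embeddings` from the snippet above, and the collection name `faq_answers` is made up.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # assumes Qdrant is running locally
+
+# Create a collection sized to the embeddings produced by our SimilarityModel.
+client.create_collection(
+    collection_name="faq_answers",
+    vectors_config=models.VectorParams(
+        size=answer_embeddings.shape[1],
+        distance=models.Distance.COSINE,
+    ),
+)
+
+# Store the answer embeddings together with the answer text as payload.
+client.upload_collection(
+    collection_name="faq_answers",
+    vectors=answer_embeddings.detach().cpu().numpy(),
+    payload=[{"answer": answer} for answer in answers],
+)
+
+# At query time, encode the question and let Qdrant return the closest answer.
+question_embedding = model.encode(["can i run a cluster or job for a long time?"])[0]
+hits = client.query_points(
+    collection_name="faq_answers",
+    query=question_embedding.tolist(),
+    limit=1,
+).points
+print(hits[0].payload["answer"])
+```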
-```python -def search(self, text: str): - search_result = self.qdrant_client.query_points( - collection_name=self.collection_name, - query=models.FusionQuery( - fusion=models.Fusion.RRF # we are using reciprocal rank fusion here - ), - prefetch=[\ - models.Prefetch(\ - query=models.Document(text=text, model=self.DENSE_MODEL)\ - ),\ - models.Prefetch(\ - query=models.Document(text=text, model=self.SPARSE_MODEL)\ - ),\ - ], - query_filter=None, # If you don't want any filters for now - limit=5, # 5 the closest results - ).points - # `search_result` contains models.QueryResponse structure - # We can access list of scored points with the corresponding similarity scores, - # vectors (if `with_vectors` was set to `True`), and payload via `points` attribute. +So far, we've implemented a whole training process, prepared model for serving and even applied a +trained model today with `Quaterion`. - # Select and return metadata - metadata = [point.payload for point in search_result] - return metadata +Thank you for your time and attention! +I hope you enjoyed this huge tutorial and will use `Quaterion` for your similarity learning projects. -``` +All ready to use code can be found [here](https://github.com/qdrant/demo-cloud-faq/tree/tutorial). -3. Add search filters. +Stay tuned!:) -With Qdrant it is also feasible to add some conditions to the search. -For example, if you wanted to search for startups in a certain city, the search query could look like this: +<|page-65-lllmstxt|> +# Building Qdrant in Rust -```python - ... +Looking at the [github repository](https://github.com/qdrant/qdrant), you can see that Qdrant is built in [Rust](https://rust-lang.org). Other offerings may be written in C++, Go, Java or even Python. So why does Qdrant chose Rust? Our founder Andrey had built the first prototype in C++, but didn’t trust his command of the language to scale to a production system (to be frank, he likened it to cutting his leg off). He was well versed in Java and Scala and also knew some Python. However, he considered neither a good fit: - city_of_interest = "Berlin" +**Java** is also more than 30 years old now. With a throughput-optimized VM it can often at least play in the same ball park as native services, and the tooling is phenomenal. Also portability is surprisingly good, although the GC is not suited for low-memory applications and will generally take good amount of RAM to deliver good performance. That said, the focus on throughput led to the dreaded GC pauses that cause latency spikes. Also the fat runtime incurs high start-up delays, which need to be worked around. - # Define a filter for cities - city_filter = models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(value=city_of_interest)\ - )\ - ] - ) +**Scala** also builds on the JVM, although there is a native compiler, there was the question of compatibility. So Scala shared the limitations of Java, and although it has some nice high-level amenities (of which Java only recently copied a subset), it still doesn’t offer the same level of control over memory layout as, say, C++, so it is similarly disqualified. - # NOTE: it is not a hybrid search! It's just a dense query for simplicity - search_result = self.qdrant_client.query_points( - collection_name=self.collection_name, - query=models.Document(text=text, model=self.DENSE_MODEL), - query_filter=city_filter, - limit=5 - ).points - ... 
+**Python**, being just a bit younger than Java, is ubiquitous in ML projects, mostly owing to its tooling (notably jupyter notebooks), being easy to learn and integration in most ML stacks. It doesn’t have a traditional garbage collector, opting for ubiquitous reference counting instead, which somewhat helps memory consumption. With that said, unless you only use it as glue code over high-perf modules, you may find yourself waiting for results. Also getting complex python services to perform stably under load is a serious technical challenge. -``` +## Into the Unknown -You have now created a class for neural search queries. Now wrap it up into a service. +So Andrey looked around at what younger languages would fit the challenge. After some searching, two contenders emerged: Go and Rust. Knowing neither, Andrey consulted the docs, and found hinself intrigued by Rust with its promise of Systems Programming without pervasive memory unsafety. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/\#deploy-the-search-with-fastapi) Deploy the search with FastAPI +This early decision has been validated time and again. When first learning Rust, the compiler’s error messages are very helpful (and have only improved in the meantime). It’s easy to keep memory profile low when one doesn’t have to wrestle a garbage collector and has complete control over stack and heap. Apart from the much advertised memory safety, many footguns one can run into when writing C++ have been meticulously designed out. And it’s much easier to parallelize a task if one doesn’t have to fear data races. -To build the service you will use the FastAPI framework. +With Qdrant written in Rust, we can offer cloud services that don’t keep us awake at night, thanks to Rust’s famed robustness. A current qdrant docker container comes in at just a bit over 50MB — try that for size. As for performance
 have some [benchmarks](/benchmarks/). -1. Install FastAPI. +And we don’t have to compromise on ergonomics either, not for us nor for our users. Of course, there are downsides: Rust compile times are usually similar to C++’s, and though the learning curve has been considerably softened in the last years, it’s still no match for easy-entry languages like Python or Go. But learning it is a one-time cost. Contrast this with Go, where you may find [the apparent simplicity is only skin-deep](https://fasterthanli.me/articles/i-want-off-mr-golangs-wild-ride). -To install it, use the command +## Smooth is Fast -```bash -pip install fastapi uvicorn +The complexity of the type system pays large dividends in bugs that didn’t even make it to a commit. The ecosystem for web services is also already quite advanced, perhaps not at the same point as Java, but certainly matching or outcompeting Go. -``` +Some people may think that the strict nature of Rust will slow down development, which is true only insofar as it won’t let you cut any corners. However, experience has conclusively shown that this is a net win. In fact, Rust lets us [ride the wall](https://the-race.com/nascar/bizarre-wall-riding-move-puts-chastain-into-nascar-folklore/), which makes us faster, not slower. -2. Implement the service. +The job market for Rust programmers is certainly not as big as that for Java or Python programmers, but the language has finally reached the mainstream, and we don’t have any problems getting and retaining top talent. And being an open source project, when we get contributions, we don’t have to check for a wide variety of errors that Rust already rules out. -Create a file named `service.py` and specify the following. +## In Rust We Trust -The service will have only one API endpoint and will look like this: +Finally, the Rust community is a very friendly bunch, and we are delighted to be part of that. And we don’t seem to be alone. Most large IT companies (notably Amazon, Google, Huawei, Meta and Microsoft) have already started investing in Rust. It’s in the Windows font system already and in the process of coming to the Linux kernel (build support has already been included). In machine learning applications, Rust has been tried and proven by the likes of Aleph Alpha and Huggingface, among many others. -```python -from fastapi import FastAPI +To sum up, choosing Rust was a lucky guess that has brought huge benefits to Qdrant. Rust continues to be our not-so-secret weapon. -# The file where HybridSearcher is stored -from hybrid_searcher import HybridSearcher +### Key Takeaways: -app = FastAPI() +- **Rust's Advantages for Qdrant:** Rust provides memory safety and control without a garbage collector, which is crucial for Qdrant's high-performance cloud services. -# Create a neural searcher instance -hybrid_searcher = HybridSearcher(collection_name="startups") +- **Low Overhead:** Qdrant's Rust-based system offers efficiency, with small Docker container sizes and robust performance benchmarks. -@app.get("/api/search") -def search_startup(q: str): - return {"result": hybrid_searcher.search(text=q)} +- **Complexity vs. Simplicity:** Rust's strict type system reduces bugs early in development, making it faster in the long run despite initial learning curves. -if __name__ == "__main__": - import uvicorn +- **Adoption by Major Players:** Large tech companies like Amazon, Google, and Microsoft are embracing Rust, further validating Qdrant's choice. 
- uvicorn.run(app, host="0.0.0.0", port=8000) +- **Community and Talent:** The supportive Rust community and increasing availability of Rust developers make it easier for Qdrant to grow and innovate. -``` +<|page-66-lllmstxt|> +A recent [paper](https://arxiv.org/abs/2207.04993) +by Allen AI has attracted attention in the NLP community as they cache the output of a certain intermediate layer +in the training and inference phases to achieve a speedup of ~83% +with a negligible loss in model performance. +This technique is quite similar to [the caching mechanism in Quaterion](https://quaterion.qdrant.tech/tutorials/cache_tutorial.html), +but the latter is intended for any data modalities while the former focuses only on language models +despite presenting important insights from their experiments. +In this post, I will share our findings combined with those, +hoping to provide the community with a wider perspective on layer recycling. -3. Run the service. +## How layer recycling works +The main idea of layer recycling is to accelerate the training (and inference) +by avoiding repeated passes of the same data object through the frozen layers. +Instead, it is possible to pass objects through those layers only once, +cache the output +and use them as inputs to the unfrozen layers in future epochs. -```bash -python service.py +In the paper, they usually cache 50% of the layers, e.g., the output of the 6th multi-head self-attention block in a 12-block encoder. +However, they find out that it does not work equally for all the tasks. +For example, the question answering task suffers from a more significant degradation in performance with 50% of the layers recycled, +and they choose to lower it down to 25% for this task, +so they suggest determining the level of caching based on the task at hand. +they also note that caching provides a more considerable speedup for larger models and on lower-end machines. -``` +In layer recycling, the cache is hit for exactly the same object. +It is easy to achieve this in textual data as it is easily hashable, +but you may need more advanced tricks to generate keys for the cache +when you want to generalize this technique to diverse data types. +For instance, hashing PyTorch tensors [does not work as you may expect](https://github.com/joblib/joblib/issues/1282). +Quaterion comes with an intelligent key extractor that may be applied to any data type, +but it is also allowed to customize it with a callable passed as an argument. +Thanks to this flexibility, we were able to run a variety of experiments in different setups, +and I believe that these findings will be helpful for your future projects. -4. Open your browser at [http://localhost:8000/docs](http://localhost:8000/docs). +## Experiments +We conducted different experiments to test the performance with: +1. Different numbers of layers recycled in [the similar cars search example](https://quaterion.qdrant.tech/tutorials/cars-tutorial.html). +2. Different numbers of samples in the dataset for training and fine-tuning for similar cars search. +3. Different numbers of layers recycled in [the question answerring example](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html). -You should be able to see a debug interface for your service. 
+## Easy layer recycling with Quaterion +The easiest way of caching layers in Quaterion is to compose a [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel) +with a frozen [Encoder](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html#quaterion_models.encoders.encoder.Encoder) +and an unfrozen [EncoderHead](https://quaterion-models.qdrant.tech/quaterion_models.heads.encoder_head.html#quaterion_models.heads.encoder_head.EncoderHead). +Therefore, we modified the `TrainableModel` in the [example](https://github.com/qdrant/quaterion/blob/master/examples/cars/models.py) +as in the following: -![FastAPI Swagger interface](https://qdrant.tech/docs/fastapi_neural_search.png) +```python +class Model(TrainableModel): + # ... -Feel free to play around with it, make queries regarding the companies in our corpus, and check out the results. -Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, publish other examples of neural networks and neural search applications. + def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: + pre_trained_encoder = torchvision.models.resnet34(pretrained=True) + self.avgpool = copy.deepcopy(pre_trained_encoder.avgpool) + self.finetuned_block = copy.deepcopy(pre_trained_encoder.layer4) + modules = [] + + for name, child in pre_trained_encoder.named_children(): + modules.append(child) + if name == "layer3": + break -##### Was this page useful? + pre_trained_encoder = nn.Sequential(*modules) + + return CarsEncoder(pre_trained_encoder) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + def configure_head(self, input_embedding_size) -> EncoderHead: + return SequentialHead(self.finetuned_block, + self.avgpool, + nn.Flatten(), + SkipConnectionHead(512, dropout=0.3, skip_dropout=0.2), + output_size=512) -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/hybrid-search-fastembed.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + # ... +``` -On this page: +This trick lets us finetune one more layer from the base model as a part of the `EncoderHead` +while still benefiting from the speedup in the frozen `Encoder` provided by the cache. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/hybrid-search-fastembed.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× +## Experiment 1: Percentage of layers recycled +The paper states that recycling 50% of the layers yields little to no loss in performance when compared to full fine-tuning. +In this setup, we compared performances of four methods: +1. Freeze the whole base model and train only `EncoderHead`. +2. Move one of the four residual blocks `EncoderHead` and train it together with the head layer while freezing the rest (75% layer recycling). +3. Move two of the four residual blocks to `EncoderHead` while freezing the rest (50% layer recycling). +4. Train the whole base model together with `EncoderHead`. 
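+For concreteness, method 3 above (50% recycling) could follow the same pattern as the snippet we just saw: move both `layer3` and `layer4` of the ResNet into the head and freeze everything up to `layer2`. The sketch below shows only the two relevant methods and is an assumption based on that pattern, not code from the repository; the channel sizes are those of ResNet34.
+
+```python
+    def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]:
+        pre_trained_encoder = torchvision.models.resnet34(pretrained=True)
+        # Keep the blocks we want to fine-tune so they can be reused in the head.
+        self.avgpool = copy.deepcopy(pre_trained_encoder.avgpool)
+        self.finetuned_blocks = nn.Sequential(
+            copy.deepcopy(pre_trained_encoder.layer3),
+            copy.deepcopy(pre_trained_encoder.layer4),
+        )
+
+        # Freeze (and cache) only the stem and the first two residual stages.
+        modules = []
+        for name, child in pre_trained_encoder.named_children():
+            modules.append(child)
+            if name == "layer2":
+                break
+
+        return CarsEncoder(nn.Sequential(*modules))
+
+    def configure_head(self, input_embedding_size) -> EncoderHead:
+        return SequentialHead(self.finetuned_blocks,
+                              self.avgpool,
+                              nn.Flatten(),
+                              SkipConnectionHead(512, dropout=0.3, skip_dropout=0.2),
+                              output_size=512)
+```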
-[Powered by](https://qdrant.tech/) +**Note**: During these experiments, we used ResNet34 instead of ResNet152 as the pretrained model +in order to be able to use a reasonable batch size in full training. +The baseline score with ResNet34 is 0.106. -<|page-40-lllmstxt|> -## hybrid-cloud-setup -- [Documentation](https://qdrant.tech/documentation/) -- [Hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/) -- Setup Hybrid Cloud +| Model | RRP | +| ------------- | ---- | +| Full training | 0.32 | +| 50% recycling | 0.31 | +| 75% recycling | 0.28 | +| Head only | 0.22 | +| Baseline | 0.11 | -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#creating-a-hybrid-cloud-environment) Creating a Hybrid Cloud Environment +As is seen in the table, the performance in 50% layer recycling is very close to that in full training. +Additionally, we can still have a considerable speedup in 50% layer recycling with only a small drop in performance. +Although 75% layer recycling is better than training only `EncoderHead`, +its performance drops quickly when compared to 50% layer recycling and full training. -The following instruction set will show you how to properly set up a **Qdrant cluster** in your **Hybrid Cloud Environment**. +## Experiment 2: Amount of available data +In the second experiment setup, we compared performances of fine-tuning strategies with different dataset sizes. +We sampled 50% of the training set randomly while still evaluating models on the whole validation set. -You can also watch a video demo on how to set up a Hybrid Cloud Environment: +| Model | RRP | +| ------------- | ---- | +| Full training | 0.27 | +| 50% recycling | 0.26 | +| 75% recycling | 0.25 | +| Head only | 0.21 | +| Baseline | 0.11 | -Deploy a Production-Ready Vector Database in 5 Minutes With Qdrant Hybrid Cloud - YouTube +This experiment shows that, the smaller the available dataset is, +the bigger drop in performance we observe in full training, 50% and 75% layer recycling. +On the other hand, the level of degradation in training only `EncoderHead` is really small when compared to others. +When we further reduce the dataset size, full training becomes untrainable at some point, +while we can still improve over the baseline by training only `EncoderHead`. -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) -Qdrant - Vector Database & Search Engine +## Experiment 3: Layer recycling in question answering +We also wanted to test layer recycling in a different domain +as one of the most important takeaways of the paper is that +the performance of layer recycling is task-dependent. +To this end, we set up an experiment with the code from the [Question Answering with Similarity Learning tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html). -8.12K subscribers +| Model | RP@1 | RRK | +| ------------- | ---- | ---- | +| Full training | 0.76 | 0.65 | +| 50% recycling | 0.75 | 0.63 | +| 75% recycling | 0.69 | 0.59 | +| Head only | 0.67 | 0.58 | +| Baseline | 0.64 | 0.55 | -[Deploy a Production-Ready Vector Database in 5 Minutes With Qdrant Hybrid Cloud](https://www.youtube.com/watch?v=BF02jULGCfo) -Qdrant - Vector Database & Search Engine +In this task, 50% layer recycling can still do a good job with only a small drop in performance when compared to full training. +However, the level of degradation is smaller than that in the similar cars search example. 
+This can be attributed to several factors such as the pretrained model quality, dataset size and task definition, +and it can be the subject of a more elaborate and comprehensive research project. +Another observation is that the performance of 75% layer recycling is closer to that of training only `EncoderHead` +than 50% layer recycling. -Search +## Conclusion +We set up several experiments to test layer recycling under different constraints +and confirmed that layer recycling yields varying performances with different tasks and domains. +One of the most important observations is the fact that the level of degradation in layer recycling +is sublinear with a comparison to full training, i.e., we lose a smaller percentage of performance than +the percentage we recycle. Additionally, training only `EncoderHead` +is more resistant to small dataset sizes. +There is even a critical size under which full training does not work at all. +The issue of performance differences shows that there is still room for further research on layer recycling, +and luckily Quaterion is flexible enough to run such experiments quickly. +We will continue to report our findings on fine-tuning efficiency. -Watch later +**Fun fact**: The preview image for this article was created with Dall.e with the following prompt: "Photo-realistic robot using a tuning fork to adjust a piano." +[Click here](/articles_data/embedding-recycling/full.png) +to see it in full size! -Share +<|page-67-lllmstxt|> +Supervised classification is one of the most widely used training objectives in machine learning, +but not every task can be defined as such. For example, -Copy link +1. Your classes may change quickly —e.g., new classes may be added over time, +2. You may not have samples from every possible category, +3. It may be impossible to enumerate all the possible classes during the training time, +4. You may have an essentially different task, e.g., search or retrieval. -Info +All such problems may be efficiently solved with similarity learning. -Shopping +N.B.: If you are new to the similarity learning concept, checkout the [awesome-metric-learning](https://github.com/qdrant/awesome-metric-learning) repo for great resources and use case examples. -Tap to unmute +However, similarity learning comes with its own difficulties such as: -If playback doesn't begin shortly, try restarting your device. +1. Need for larger batch sizes usually, +2. More sophisticated loss functions, +3. Changing architectures between training and inference. -More videos +Quaterion is a fine tuning framework built to tackle such problems in similarity learning. +It uses [PyTorch Lightning](https://www.pytorchlightning.ai/) +as a backend, which is advertized with the motto, "spend more time on research, less on engineering." +This is also true for Quaterion, and it includes: -## More videos +1. Trainable and servable model classes, +2. Annotated built-in loss functions, and a wrapper over [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/) when you need even more, +3. Sample, dataset and data loader classes to make it easier to work with similarity learning data, +4. A caching mechanism for faster iterations and less memory footprint. -You're signed out +## A closer look at Quaterion -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. 
+Let's break down some important modules: -CancelConfirm +- `TrainableModel`: A subclass of `pl.LightNingModule` that has additional hook methods such as `configure_encoders`, `configure_head`, `configure_metrics` and others +to define objects needed for training and evaluation —see below to learn more on these. +- `SimilarityModel`: An inference-only export method to boost code transfer and lower dependencies during the inference time. +In fact, Quaterion is composed of two packages: + 1. `quaterion_models`: package that you need for inference. + 2. `quaterion`: package that defines objects needed for training and also depends on `quaterion_models`. +- `Encoder` and `EncoderHead`: Two objects that form a `SimilarityModel`. +In most of the cases, you may use a frozen pretrained encoder, e.g., ResNets from `torchvision`, or language modelling +models from `transformers`, with a trainable `EncoderHead` stacked on top of it. +`quaterion_models` offers several ready-to-use `EncoderHead` implementations, +but you may also create your own by subclassing a parent class or easily listing PyTorch modules in a `SequentialHead`. -Share +Quaterion has other objects such as distance functions, evaluation metrics, evaluators, convenient dataset and data loader classes, but these are mostly self-explanatory. +Thus, they will not be explained in detail in this article for brevity. +However, you can always go check out the [documentation](https://quaterion.qdrant.tech) to learn more about them. -Include playlist +The focus of this tutorial is a step-by-step solution to a similarity learning problem with Quaterion. +This will also help us better understand how the abovementioned objects fit together in a real project. +Let's start walking through some of the important parts of the code. -An error occurred while retrieving sharing information. Please try again later. +If you are looking for the complete source code instead, you can find it under the [examples](https://github.com/qdrant/quaterion/tree/master/examples/cars) +directory in the Quaterion repo. -[Watch on](https://www.youtube.com/watch?v=BF02jULGCfo&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +## Dataset +In this tutorial, we will use the [Stanford Cars](https://pytorch.org/vision/main/generated/torchvision.datasets.StanfordCars.html) +dataset. -0:00 +{{< figure src=https://storage.googleapis.com/quaterion/docs/class_montage.jpg caption="Stanford Cars Dataset" >}} -0:00 / 6:44 -‱Live -‱ +It has 16185 images of cars from 196 classes, +and it is split into training and testing subsets with almost a 50-50% split. +To make things even more interesting, however, we will first merge training and testing subsets, +then we will split it into two again in such a way that the half of the 196 classes will be put into the training set and the other half will be in the testing set. +This will let us test our model with samples from novel classes that it has never seen in the training phase, +which is what supervised classification cannot achieve but similarity learning can. -[Watch on YouTube](https://www.youtube.com/watch?v=BF02jULGCfo "Watch on YouTube") +In the following code borrowed from [`data.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/data.py): +- `get_datasets()` function performs the splitting task described above. +- `get_dataloaders()` function creates `GroupSimilarityDataLoader` instances from training and testing datasets. +- Datasets are regular PyTorch datasets that emit `SimilarityGroupSample` instances. 
-To learn how Hybrid Cloud works, [read the overview document](https://qdrant.tech/documentation/hybrid-cloud/). +N.B.: Currently, Quaterion has two data types to represent samples in a dataset. To learn more about `SimilarityPairSample`, check out the [NLP tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#prerequisites) Prerequisites +```python +import numpy as np +import os +import tqdm +from torch.utils.data import Dataset, Subset +from torchvision import datasets, transforms +from typing import Callable +from pytorch_lightning import seed_everything -- **Kubernetes cluster:** To create a Hybrid Cloud Environment, you need a [standard compliant](https://www.cncf.io/training/certification/software-conformance/) Kubernetes cluster. You can run this cluster in any cloud, on-premise or edge environment, with distributions that range from AWS EKS to VMWare vSphere. See [Deployment Platforms](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/) for more information. -- **Storage:** For storage, you need to set up the Kubernetes cluster with a Container Storage Interface (CSI) driver that provides block storage. For vertical scaling, the CSI driver needs to support volume expansion. The `StorageClass` needs to be created beforehand. For backups and restores, the driver needs to support CSI snapshots and restores. The `VolumeSnapshotClass` needs to be created beforehand. See [Deployment Platforms](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/) for more information. +from quaterion.dataset import ( + GroupSimilarityDataLoader, + SimilarityGroupSample, +) -- **Kubernetes nodes:** You need enough CPU and memory capacity for the Qdrant database clusters that you create. A small amount of resources is also needed for the Hybrid Cloud control plane components. Qdrant Hybrid Cloud supports x86\_64 and ARM64 architectures. -- **Permissions:** To install the Qdrant Kubernetes Operator you need to have `cluster-admin` access in your Kubernetes cluster. -- **Connection:** The Qdrant Kubernetes Operator in your cluster needs to be able to connect to Qdrant Cloud. It will create an outgoing connection to `cloud.qdrant.io` on port `443`. -- **Locations:** By default, the Qdrant Cloud Agent and Operator pulls Helm charts and container images from `registry.cloud.qdrant.io`. The Qdrant database container image is pulled from `docker.io`. +# set seed to deterministically sample train and test categories later on +seed_everything(seed=42) -> **Note:** You can also mirror these images and charts into your own registry and pull them from there. +# dataset will be downloaded to this directory under local directory +dataset_path = os.path.join(".", "torchvision", "datasets") -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#cli-tools) CLI tools -During the onboarding, you will need to deploy the Qdrant Kubernetes Operator and Agent using Helm. Make sure you have the following tools installed: +def get_datasets(input_size: int): + # Use Mean and std values for the ImageNet dataset as the base model was pretrained on it. 
+ # taken from https://www.geeksforgeeks.org/how-to-normalize-images-in-pytorch/ + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] -- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) -- [helm](https://helm.sh/docs/intro/install/) + # create train and test transforms + transform = transforms.Compose( + [ + transforms.Resize((input_size, input_size)), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ] + ) -You will need to have access to the Kubernetes cluster with `kubectl` and `helm` configured to connect to it. Please refer the documentation of your Kubernetes distribution for more information. + # we need to merge train and test splits into a full dataset first, + # and then we will split it to two subsets again with each one composed of distinct labels. + full_dataset = datasets.StanfordCars( + root=dataset_path, split="train", download=True + ) + datasets.StanfordCars(root=dataset_path, split="test", download=True) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#installation) Installation + # full_dataset contains examples from 196 categories labeled with an integer from 0 to 195 + # randomly sample half of it to be used for training + train_categories = np.random.choice(a=196, size=196 // 2, replace=False) -1. To set up Hybrid Cloud, open the Qdrant Cloud Console at [cloud.qdrant.io](https://cloud.qdrant.io/). On the dashboard, select **Hybrid Cloud**. + # get a list of labels for all samples in the dataset + labels_list = np.array([label for _, label in tqdm.tqdm(full_dataset)]) -2. Before creating your first Hybrid Cloud Environment, you have to provide billing information and accept the Hybrid Cloud license agreement. The installation wizard will guide you through the process. + # get a mask for indices where label is included in train_categories + labels_mask = np.isin(labels_list, train_categories) + # get a list of indices to be used as train samples + train_indices = np.argwhere(labels_mask).squeeze() -> **Note:** You will only be charged for the Qdrant cluster you create in a Hybrid Cloud Environment, but not for the environment itself. + # others will be used as test samples + test_indices = np.argwhere(np.logical_not(labels_mask)).squeeze() -3. Now you can specify the following: + # now that we have distinct indices for train and test sets, we can use `Subset` to create new datasets + # from `full_dataset`, which contain only the samples at given indices. + # finally, we apply transformations created above. + train_dataset = CarsDataset( + Subset(full_dataset, train_indices), transform=transform + ) -- **Name:** A name for the Hybrid Cloud Environment -- **Kubernetes Namespace:** The Kubernetes namespace for the operator and agent. Once you select a namespace, you can’t change it. + test_dataset = CarsDataset( + Subset(full_dataset, test_indices), transform=transform + ) -You can also configure the StorageClass and VolumeSnapshotClass to use for the Qdrant databases, if you want to deviate from the default settings of your cluster. + return train_dataset, test_dataset -![Create Hybrid Cloud Environment](https://qdrant.tech/documentation/cloud/hybrid_cloud_env_create.png) -4. You can then enter the YAML configuration for your Kubernetes operator. Qdrant supports a specific list of configuration options, as described in the [Qdrant Operator configuration](https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/) section. 
+def get_dataloaders( + batch_size: int, + input_size: int, + shuffle: bool = False, +): + train_dataset, test_dataset = get_datasets(input_size) -5. (Optional) If you have special requirements for any of the following, activate the **Show advanced configuration** option: + train_dataloader = GroupSimilarityDataLoader( + train_dataset, batch_size=batch_size, shuffle=shuffle + ) + test_dataloader = GroupSimilarityDataLoader( + test_dataset, batch_size=batch_size, shuffle=False + ) -- If you use a proxy to connect from your infrastructure to the Qdrant Cloud API, you can specify the proxy URL, credentials and cetificates. -- Container registry URL for Qdrant Operator and Agent images. The default is [https://registry.cloud.qdrant.io/qdrant/](https://registry.cloud.qdrant.io/qdrant/). -- Helm chart repository URL for the Qdrant Operator and Agent. The default is [oci://registry.cloud.qdrant.io/qdrant-charts](oci://registry.cloud.qdrant.io/qdrant-charts). -- An optional secret with credentials to access your own container registry. -- Log level for the operator and agent -- Node selectors and tolerations for the operater, agent and monitoring stack + return train_dataloader, test_dataloader -![Create Hybrid Cloud Environment - Advanced Configuration](https://qdrant.tech/documentation/cloud/hybrid_cloud_advanced_configuration.png) -6. Once complete, click **Create**. +class CarsDataset(Dataset): + def __init__(self, dataset: Dataset, transform: Callable): + self._dataset = dataset + self._transform = transform -> **Note:** All settings but the Kubernetes namespace can be changed later. + def __len__(self) -> int: + return len(self._dataset) -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#generate-installation-command) Generate Installation Command + def __getitem__(self, index) -> SimilarityGroupSample: + image, label = self._dataset[index] + image = self._transform(image) -After creating your Hybrid Cloud, select **Generate Installation Command** to generate a script that you can run in your Kubernetes cluster which will perform the initial installation of the Kubernetes operator and agent. + return SimilarityGroupSample(obj=image, group=label) +``` -![Rotate Hybrid Cloud Secrets](https://qdrant.tech/documentation/cloud/hybrid_cloud_create_command.png) +## Trainable Model -It will: +Now it's time to review one of the most exciting building blocks of Quaterion: [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#module-quaterion.train.trainable_model). +It is the base class for models you would like to configure for training, +and it provides several hook methods starting with `configure_` to set up every aspect of the training phase +just like [`pl.LightningModule`](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html), its own base class. +It is central to fine tuning with Quaterion, so we will break down this essential code in [`models.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/models.py) +and review each method separately. Let's begin with the imports: -- Create the Kubernetes namespace, if not present. -- Set up the necessary secrets with credentials to access the Qdrant container registry and the Qdrant Cloud API. -- Sign in to the Helm registry at `registry.cloud.qdrant.io`. -- Install the Qdrant cloud agent and Kubernetes operator chart. 
+```python +import torch +import torchvision +from quaterion_models.encoders import Encoder +from quaterion_models.heads import EncoderHead, SkipConnectionHead +from torch import nn +from typing import Dict, Union, Optional, List -You need this command only for the initial installation. After that, you can update the agent and operator using the Qdrant Cloud Console. +from quaterion import TrainableModel +from quaterion.eval.attached_metric import AttachedMetric +from quaterion.eval.group import RetrievalRPrecision +from quaterion.loss import SimilarityLoss, TripletLoss +from quaterion.train.cache import CacheConfig, CacheType -> **Note:** If you generate the installation command a second time, it will re-generate the included secrets, and you will have to apply the command again to update them. +from .encoders import CarsEncoder +``` -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#advanced-configuration) Advanced configuration +In the following code snippet, we subclass `TrainableModel`. +You may use `__init__()` to store some attributes to be used in various `configure_*` methods later on. +The more interesting part is, however, in the [`configure_encoders()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_encoders) method. +We need to return an instance of [`Encoder`](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html#quaterion_models.encoders.encoder.Encoder) (or a dictionary with `Encoder` instances as values) from this method. +In our case, it is an instance of `CarsEncoders`, which we will review soon. +Notice now how it is created with a pretrained ResNet152 model whose classification layer is replaced by an identity function. -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#mirroring-images-and-charts) Mirroring images and charts -#### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#required-artifacts) Required artifacts +```python +class Model(TrainableModel): + def __init__(self, lr: float, mining: str): + self._lr = lr + self._mining = mining + super().__init__() -Container images: + def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: + pre_trained_encoder = torchvision.models.resnet152(pretrained=True) + pre_trained_encoder.fc = nn.Identity() + return CarsEncoder(pre_trained_encoder) +``` -- `registry.cloud.qdrant.io/qdrant/qdrant` -- `registry.cloud.qdrant.io/qdrant/qdrant-cloud-agent` -- `registry.cloud.qdrant.io/qdrant/operator` -- `registry.cloud.qdrant.io/qdrant/cluster-manager` -- `registry.cloud.qdrant.io/qdrant/prometheus` -- `registry.cloud.qdrant.io/qdrant/prometheus-config-reloader` -- `registry.cloud.qdrant.io/qdrant/kube-state-metrics` -- `registry.cloud.qdrant.io/qdrant/kubernetes-event-exporter` -- `registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter` +In Quaterion, a [`SimilarityModel`](https://quaterion-models.qdrant.tech/quaterion_models.model.html#quaterion_models.model.SimilarityModel) is composed of one or more `Encoder`s +and an [`EncoderHead`](https://quaterion-models.qdrant.tech/quaterion_models.heads.encoder_head.html#quaterion_models.heads.encoder_head.EncoderHead). +`quaterion_models` has [several `EncoderHead` implementations](https://quaterion-models.qdrant.tech/quaterion_models.heads.html#module-quaterion_models.heads) +with a unified API such as a configurable dropout value. 
+You may use one of them or create your own subclass of `EncoderHead`. +In either case, you need to return an instance of it from [`configure_head`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_head) +In this example, we will use a `SkipConnectionHead`, which is lightweight and more resistant to overfitting. -Open Containers Initiative (OCI) Helm charts: +```python + def configure_head(self, input_embedding_size) -> EncoderHead: + return SkipConnectionHead(input_embedding_size, dropout=0.1) +``` -- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cloud-agent` -- `registry.cloud.qdrant.io/qdrant-charts/operator` -- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-manager` -- `registry.cloud.qdrant.io/qdrant-charts/prometheus` -- `registry.cloud.qdrant.io/qdrant-charts/kubernetes-event-exporter` -- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-exporter` +Quaterion has implementations of [some popular loss functions](https://quaterion.qdrant.tech/quaterion.loss.html) for similarity learning, all of which subclass either [`GroupLoss`](https://quaterion.qdrant.tech/quaterion.loss.group_loss.html#quaterion.loss.group_loss.GroupLoss) +or [`PairwiseLoss`](https://quaterion.qdrant.tech/quaterion.loss.pairwise_loss.html#quaterion.loss.pairwise_loss.PairwiseLoss). +In this example, we will use [`TripletLoss`](https://quaterion.qdrant.tech/quaterion.loss.triplet_loss.html#quaterion.loss.triplet_loss.TripletLoss), +which is a subclass of `GroupLoss`. In general, subclasses of `GroupLoss` are used with +datasets in which samples are assigned with some group (or label). In our example label is a make of the car. +Those datasets should emit `SimilarityGroupSample`. +Other alternatives are implementations of `PairwiseLoss`, which consume `SimilarityPairSample` - pair of objects for which similarity is specified individually. +To see an example of the latter, you may need to check out the [NLP Tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html) -To mirror all necessary container images and Helm charts into your own registry, you should use an automatic replication feature that your registry provides, so that you have new image versions available automatically. Alternatively you can manually sync the images with tools like [Skopeo](https://github.com/containers/skopeo). When syncing images manually, make sure that you sync then with all, or with the right CPU architecture. +```python + def configure_loss(self) -> SimilarityLoss: + return TripletLoss(mining=self._mining, margin=0.5) +``` -##### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#automatic-replication) Automatic replication -Ensure that you have both the container images in the `/qdrant/` repository, and the helm charts in the `/qdrant-charts/` repository synced. Then go to the advanced section of your Hybrid Cloud Environment and configure your registry locations: +`configure_optimizers()` may be familiar to PyTorch Lightning users, +but there is a novel `self.model` used inside that method. +It is an instance of `SimilarityModel` and is automatically created by Quaterion from the return values of `configure_encoders()` and `configure_head()`. 
-- Container registry URL: `your-registry.example.com/qdrant` (this will for example result in `your-registry.example.com/qdrant/qdrant-cloud-agent`) -- Chart repository URL: `oci://your-registry.example.com/qdrant-charts` (this will for example result in `oci://your-registry.example.com/qdrant-charts/qdrant-cloud-agent`) +```python + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), self._lr) + return optimizer +``` -If you registry requires authentication, you have to create your own secrets with authentication information into your `the-qdrant-namespace` namespace. +Caching in Quaterion is used for avoiding calculation of outputs of a frozen pretrained `Encoder` in every epoch. +When it is configured, outputs will be computed once and cached in the preferred device for direct usage later on. +It provides both a considerable speedup and less memory footprint. +However, it is quite a bit versatile and has several knobs to tune. +To get the most out of its potential, it's recommended that you check out the [cache tutorial](https://quaterion.qdrant.tech/tutorials/cache_tutorial.html). +For the sake of making this article self-contained, you need to return a [`CacheConfig`](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheConfig) +instance from [`configure_caches()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_caches) +to specify cache-related preferences such as: +- [`CacheType`](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheType), i.e., whether to store caches on CPU or GPU, +- `save_dir`, i.e., where to persist caches for subsequent runs, +- `batch_size`, i.e., batch size to be used only when creating caches - the batch size to be used during the actual training might be different. -Example: +```python + def configure_caches(self) -> Optional[CacheConfig]: + return CacheConfig( + cache_type=CacheType.AUTO, save_dir="./cache_dir", batch_size=32 + ) +``` -```shell -kubectl --namespace the-qdrant-namespace create secret docker-registry my-creds --docker-server='your-registry.example.com' --docker-username='your-username' --docker-password='your-password' +We have just configured the training-related settings of a `TrainableModel`. +However, evaluation is an integral part of experimentation in machine learning, +and you may configure evaluation metrics by returning one or more [`AttachedMetric`](https://quaterion.qdrant.tech/quaterion.eval.attached_metric.html#quaterion.eval.attached_metric.AttachedMetric) +instances from `configure_metrics()`. Quaterion has several built-in [group](https://quaterion.qdrant.tech/quaterion.eval.group.html) +and [pairwise](https://quaterion.qdrant.tech/quaterion.eval.pair.html) +evaluation metrics. +```python + def configure_metrics(self) -> Union[AttachedMetric, List[AttachedMetric]]: + return AttachedMetric( + "rrp", + metric=RetrievalRPrecision(), + prog_bar=True, + on_epoch=True, + on_step=False, + ) ``` -You can then reference they secret in the advanced section of your Hybrid Cloud Environment. - -##### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#manual-replication) Manual replication +## Encoder -This example uses Skopeo. +As previously stated, a `SimilarityModel` is composed of one or more `Encoder`s and an `EncoderHead`. 
+Even if we freeze pretrained `Encoder` instances, +`EncoderHead` is still trainable and has enough parameters to adapt to the new task at hand. +It is recommended that you set the `trainable` property to `False` whenever possible, +as it lets you benefit from the caching mechanism described above. +Another important property is `embedding_size`, which will be passed to `TrainableModel.configure_head()` as `input_embedding_size` +to let you properly initialize the head layer. +Let's see how an `Encoder` is implemented in the following code borrowed from [`encoders.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/encoders.py): -You can find your personal credentials for the Qdrant Cloud registry in the onboarding command, or you can fetch them with `kubectl`: +```python +import os -```shell -kubectl get secrets qdrant-registry-creds --namespace the-qdrant-namespace -o jsonpath='{.data.\.dockerconfigjson}' | base64 --decode | jq -r '.' +import torch +import torch.nn as nn +from quaterion_models.encoders import Encoder -``` -First login to the source registry: +class CarsEncoder(Encoder): + def __init__(self, encoder_model: nn.Module): + super().__init__() + self._encoder = encoder_model + self._embedding_size = 2048 # last dimension from the ResNet model -```shell -skopeo login registry.cloud.qdrant.io + @property + def trainable(self) -> bool: + return False + @property + def embedding_size(self) -> int: + return self._embedding_size ``` -Then login to your own registry: - -```shell -skopeo login your-registry.example.com +An `Encoder` is a regular `torch.nn.Module` subclass, +and we need to implement the forward pass logic in the `forward` method. +Depending on how you create your submodules, this method may be more complex; +however, we simply pass the input through a pretrained ResNet152 backbone in this example: +```python + def forward(self, images): + embeddings = self._encoder.forward(images) + return embeddings ``` -To sync all container images: +An important step of machine learning development is proper saving and loading of models. +Quaterion lets you save your `SimilarityModel` with [`TrainableModel.save_servable()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.save_servable) +and restore it with [`SimilarityModel.load()`](https://quaterion-models.qdrant.tech/quaterion_models.model.html#quaterion_models.model.SimilarityModel.load). +To be able to use these two methods, you need to implement `save()` and `load()` methods in your `Encoder`. +Additionally, it is also important that you define your subclass of `Encoder` outside the `__main__` namespace, +i.e., in a separate file from your main entry point. +It may not be restored properly otherwise. 
-```shell -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/operator your-registry.example.com/qdrant/operator -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant-cloud-agent your-registry.example.com/qdrant/qdrant-cloud-agent -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/prometheus your-registry.example.com/qdrant/prometheus -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/prometheus-config-reloader your-registry.example.com/qdrant/prometheus-config-reloader -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/kube-state-metrics your-registry.example.com/qdrant/kube-state-metrics -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant your-registry.example.com/qdrant/qdrant -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/cluster-manager your-registry.example.com/qdrant/cluster-manager -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter your-registry.example.com/qdrant/qdrant-cluster-exporter -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/kubernetes-event-exporter your-registry.example.com/qdrant/kubernetes-event-exporter +```python + def save(self, output_path: str): + os.makedirs(output_path, exist_ok=True) + torch.save(self._encoder, os.path.join(output_path, "encoder.pth")) + @classmethod + def load(cls, input_path): + encoder_model = torch.load(os.path.join(input_path, "encoder.pth")) + return CarsEncoder(encoder_model) ``` -To sync all helm charts: +## Training -```shell -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/prometheus your-registry.example.com/qdrant-charts/prometheus -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/operator your-registry.example.com/qdrant-charts/operator -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api your-registry.example.com/qdrant-charts/qdrant-kubernetes-api -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-cloud-agent your-registry.example.com/qdrant-charts/qdrant-cloud-agent -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-exporter your-registry.example.com/qdrant-charts/qdrant-cluster-exporter -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/kubernetes-event-exporter your-registry.example.com/qdrant-charts/kubernetes-event-exporter +With all essential objects implemented, it is easy to bring them all together and run a training loop with the [`Quaterion.fit()`](https://quaterion.qdrant.tech/quaterion.main.html#quaterion.main.Quaterion.fit) +method. It expects: +- A `TrainableModel`, +- A [`pl.Trainer`](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html), +- A [`SimilarityDataLoader`](https://quaterion.qdrant.tech/quaterion.dataset.similarity_data_loader.html#quaterion.dataset.similarity_data_loader.SimilarityDataLoader) for training data, +- And optionally, another `SimilarityDataLoader` for evaluation data. 
-``` +We need to import a few objects to prepare all of these: -With the above configuration, you can add the following values to the advanced section of your Hybrid Cloud Environment: +```python +import os +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks import EarlyStopping, ModelSummary -- Container registry URL: `your-registry.example.com/qdrant` -- Chart repository URL: `oci://your-registry.example.com/qdrant-charts` +from quaterion import Quaterion +from .data import get_dataloaders +from .models import Model +``` -If your registry requires authentication, you can create and reference the secret the same way as described above. +The `train()` function in the following code snippet expects several hyperparameter values as arguments. +They can be defined in a `config.py` or passed from the command line. +However, that part of the code is omitted for brevity. +Instead let's focus on how all the building blocks are initialized and passed to `Quaterion.fit()`, +which is responsible for running the whole loop. +When the training loop is complete, you can simply call `TrainableModel.save_servable()` +to save the current state of the `SimilarityModel` instance: -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#rate-limits-at-dockerio) Rate limits at `docker.io` +```python +def train( + lr: float, + mining: str, + batch_size: int, + epochs: int, + input_size: int, + shuffle: bool, + save_dir: str, +): + model = Model( + lr=lr, + mining=mining, + ) + + train_dataloader, val_dataloader = get_dataloaders( + batch_size=batch_size, input_size=input_size, shuffle=shuffle + ) -By default, the Qdrant database image will be fetched from Docker Hub, which is the main source of truth. Docker Hub has rate limits for anonymous users. If you have larger setups and also fetch other images from their, you may run into these limits. To solve this, you can provide authentication information for Docker Hub. + early_stopping = EarlyStopping( + monitor="validation_loss", + patience=50, + ) -First, create a secret with your Docker Hub credentials into your `the-qdrant-namespace` namespace: + trainer = pl.Trainer( + gpus=1 if torch.cuda.is_available() else 0, + max_epochs=epochs, + callbacks=[early_stopping, ModelSummary(max_depth=3)], + enable_checkpointing=False, + log_every_n_steps=1, + ) -```shell -kubectl create secret docker-registry dockerhub-registry-secret --namespace the-qdrant-namespace --docker-server=https://index.docker.io/v1/ --docker-username= --docker-password= --docker-email= + Quaterion.fit( + trainable_model=model, + trainer=trainer, + train_dataloader=train_dataloader, + val_dataloader=val_dataloader, + ) + model.save_servable(save_dir) ``` -Then, you can reference this secret by adding the following configuration in the operator configuration YAML editor in the advanced section of the Hybrid Cloud Environment: - -```yaml -qdrant: - image: - pull_secret: "dockerhub-registry-secret" - -``` +## Evaluation -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#rotating-secrets) Rotating Secrets +Let's see what we have achieved with these simple steps. +[`evaluate.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/evaluate.py) has two functions to evaluate both the baseline model and the tuned similarity model. +We will review only the latter for brevity. 
+In addition to the ease of restoring a `SimilarityModel`, this code snippet also shows +how to use [`Evaluator`](https://quaterion.qdrant.tech/quaterion.eval.evaluator.html#quaterion.eval.evaluator.Evaluator) +to evaluate the performance of a `SimilarityModel` on a given dataset +by given evaluation metrics. -If you need to rotate the secrets to pull container images and charts from the Qdrant registry and to authenticate at the Qdrant Cloud API, you can do so by following these steps: -- Go to the Hybrid Cloud environment list or the detail page of the environment. -- In the actions menu, choose “Rotate Secrets” -- Confirm the action -- You will receive a new installation command that you can run in your Kubernetes cluster to update the secrets. +{{< figure src=https://storage.googleapis.com/quaterion/docs/original_vs_tuned_cars.png caption="Comparison of original and tuned models for retrieval" >}} -If you don’t run the installation command, the secrets will not be updated and the communication between your Hybrid Cloud Environment and the Qdrant Cloud API will not work. -![Rotate Hybrid Cloud Secrets](https://qdrant.tech/documentation/cloud/hybrid_cloud_rotate_secrets.png) +Full evaluation of a dataset usually grows exponentially, +and thus you may want to perform a partial evaluation on a sampled subset. +In this case, you may use [samplers](https://quaterion.qdrant.tech/quaterion.eval.samplers.html) +to limit the evaluation. +Similar to `Quaterion.fit()` used for training, [`Quaterion.evaluate()`](https://quaterion.qdrant.tech/quaterion.main.html#quaterion.main.Quaterion.evaluate) +runs a complete evaluation loop. It takes the following as arguments: +- An `Evaluator` instance created with given evaluation metrics and a `Sampler`, +- The `SimilarityModel` to be evaluated, +- And the evaluation dataset. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/\#deleting-a-hybrid-cloud-environment) Deleting a Hybrid Cloud Environment +```python +def eval_tuned_encoder(dataset, device): + print("Evaluating tuned encoder...") + tuned_cars_model = SimilarityModel.load( + os.path.join(os.path.dirname(__file__), "cars_encoders") + ).to(device) + tuned_cars_model.eval() -To delete a Hybrid Cloud Environment, first delete all Qdrant database clusters in it. Then you can delete the environment itself. + result = Quaterion.evaluate( + evaluator=Evaluator( + metrics=RetrievalRPrecision(), + sampler=GroupSampler(sample_size=1000, device=device, log_progress=True), + ), + model=tuned_cars_model, + dataset=dataset, + ) -To clean up your Kubernetes cluster, after deleting the Hybrid Cloud Environment, you can download the script from [https://github.com/qdrant/qdrant-cloud-support-tools/tree/main/hybrid-cloud-cleanup](https://github.com/qdrant/qdrant-cloud-support-tools/tree/main/hybrid-cloud-cleanup) to remove all Qdrant related resources. + print(result) +``` -Run the following command while being connected to your Kubernetes cluster. The script requires `kubectl` and `helm` to be installed. +## Conclusion -```shell -./hybrid-cloud-cleanup.sh your-qdrant-namespace +In this tutorial, we trained a similarity model to search for similar cars from novel categories unseen in the training phase. +Then, we evaluated it on a test dataset by the Retrieval R-Precision metric. +The base model scored 0.1207, +and our tuned model hit 0.2540, a twice higher score. 
+These scores can be seen in the following figure: -``` +{{< figure src=/articles_data/cars-recognition/cars_metrics.png caption="Metrics for the base and tuned models" >}} -##### Was this page useful? +<|page-68-lllmstxt|> +## How to train object matching model with no labeled data and use it in production -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No -Thank you for your feedback! 🙏 +Currently, most machine-learning-related business cases are solved as a classification problems. +Classification algorithms are so well studied in practice that even if the original problem is not directly a classification task, it is usually decomposed or approximately converted into one. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/hybrid-cloud-setup.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +However, despite its simplicity, the classification task has requirements that could complicate its production integration and scaling. +E.g. it requires a fixed number of classes, where each class should have a sufficient number of training samples. -On this page: +In this article, I will describe how we overcome these limitations by switching to metric learning. +By the example of matching job positions and candidates, I will show how to train metric learning model with no manually labeled data, how to estimate prediction confidence, and how to serve metric learning in production. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/hybrid-cloud-setup.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× +## What is metric learning and why using it? -[Powered by](https://qdrant.tech/) +According to Wikipedia, metric learning is the task of learning a distance function over objects. +In practice, it means that we can train a model that tells a number for any pair of given objects. +And this number should represent a degree or score of similarity between those given objects. +For example, objects with a score of 0.9 could be more similar than objects with a score of 0.5 +Actual scores and their direction could vary among different implementations. -<|page-41-lllmstxt|> -## points -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Points +In practice, there are two main approaches to metric learning and two corresponding types of NN architectures. +The first is the interaction-based approach, which first builds local interactions (i.e., local matching signals) between two objects. Deep neural networks learn hierarchical interaction patterns for matching. +Examples of neural network architectures include MV-LSTM, ARC-II, and MatchPyramid. -# [Anchor](https://qdrant.tech/documentation/concepts/points/\#points) Points +![MV-LSTM, example of interaction-based model](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/mv_lstm.png) +> MV-LSTM, example of interaction-based model, [Shengxian Wan et al. +](https://www.researchgate.net/figure/Illustration-of-MV-LSTM-S-X-and-S-Y-are-the-in_fig1_285271115) via Researchgate -The points are the central entity that Qdrant operates with. 
-A point is a record consisting of a [vector](https://qdrant.tech/documentation/concepts/vectors/) and an optional [payload](https://qdrant.tech/documentation/concepts/payload/). +The second is the representation-based approach. +In this case distance function is composed of 2 components: +the Encoder transforms an object into embedded representation - usually a large float point vector, and the Comparator takes embeddings of a pair of objects from the Encoder and calculates their similarity. +The most well-known example of this embedding representation is Word2Vec. -It looks like this: +Examples of neural network architectures also include DSSM, C-DSSM, and ARC-I. -```json -// This is a simple point -{ - "id": 129, - "vector": [0.1, 0.2, 0.3, 0.4], - "payload": {"color": "red"}, -} +The Comparator is usually a very simple function that could be calculated very quickly. +It might be cosine similarity or even a dot production. +Two-stage schema allows performing complex calculations only once per object. +Once transformed, the Comparator can calculate object similarity independent of the Encoder much more quickly. +For more convenience, embeddings can be placed into specialized storages or vector search engines. +These search engines allow to manage embeddings using API, perform searches and other operations with vectors. -``` +![C-DSSM, example of representation-based model](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/cdssm.png) +> C-DSSM, example of representation-based model, [Xue Li et al.](https://arxiv.org/abs/1901.10710v2) via arXiv -You can search among the points grouped in one [collection](https://qdrant.tech/documentation/concepts/collections/) based on vector similarity. -This procedure is described in more detail in the [search](https://qdrant.tech/documentation/concepts/search/) and [filtering](https://qdrant.tech/documentation/concepts/filtering/) sections. +Pre-trained NNs can also be used. The output of the second-to-last layer could work as an embedded representation. +Further in this article, I would focus on the representation-based approach, as it proved to be more flexible and fast. -This section explains how to create and manage vectors. +So what are the advantages of using metric learning comparing to classification? +Object Encoder does not assume the number of classes. +So if you can't split your object into classes, +if the number of classes is too high, or you suspect that it could grow in the future - consider using metric learning. -Any point modification operation is asynchronous and takes place in 2 steps. -At the first stage, the operation is written to the Write-ahead-log. +In our case, business goal was to find suitable vacancies for candidates who specify the title of the desired position. +To solve this, we used to apply a classifier to determine the job category of the vacancy and the candidate. +But this solution was limited to only a few hundred categories. +Candidates were complaining that they couldn't find the right category for them. +Training the classifier for new categories would be too long and require new training data for each new category. +Switching to metric learning allowed us to overcome these limitations, the resulting solution could compare any pair position descriptions, even if we don't have this category reference yet. -After this moment, the service will not lose the data, even if the machine loses power supply. 
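+To make the Encoder/Comparator split described earlier a bit more concrete, here is a minimal, illustrative sketch. The hashed bag-of-words "encoder" below is only a stand-in for a real neural Encoder, and the texts are made up; the Comparator is a plain cosine similarity over the precomputed vectors:
+
+```python
+import numpy as np
+
+def encode(text: str, dim: int = 256) -> np.ndarray:
+    # Stand-in Encoder: a hashed bag-of-words vector.
+    # In a real system this would be a neural network producing embeddings.
+    vec = np.zeros(dim)
+    for token in text.lower().split():
+        vec[hash(token) % dim] += 1.0
+    norm = np.linalg.norm(vec)
+    return vec / norm if norm > 0 else vec
+
+def compare(a: np.ndarray, b: np.ndarray) -> float:
+    # Comparator: cosine similarity, here a dot product of normalized vectors.
+    return float(np.dot(a, b))
+
+# Encode once, compare many times.
+vacancy = encode("Senior Python backend developer")
+candidate = encode("Backend developer experienced in Python")
+print(compare(vacancy, candidate))
+```
+
+The point of the split stays the same regardless of the encoder used: the expensive encoding step runs once per object, while comparisons against already-encoded objects remain cheap.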
+![T-SNE with job samples](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/embeddings.png) +> T-SNE with job samples, Image by Author. Play with [Embedding Projector](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/generall/7e712425e3b340c2c4dbc1a29f515d91/raw/b45b2b6f6c1d5ab3d3363c50805f3834a85c8879/config.json) yourself. -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#point-ids) Point IDs +With metric learning, we learn not a concrete job type but how to match job descriptions from a candidate's CV and a vacancy. +Secondly, with metric learning, it is easy to add more reference occupations without model retraining. +We can then add the reference to a vector search engine. +Next time we will match occupations - this new reference vector will be searchable. -Qdrant supports using both `64-bit unsigned integers` and `UUID` as identifiers for points. -Examples of UUID string representations: +## Data for metric learning -- simple: `936DA01F9ABD4d9d80C702AF85C822A8` -- hyphenated: `550e8400-e29b-41d4-a716-446655440000` -- urn: `urn:uuid:F9168C5E-CEB2-4faa-B6BF-329BF39FA1E4` +Unlike classifiers, a metric learning training does not require specific class labels. +All that is required are examples of similar and dissimilar objects. +We would call them positive and negative samples. -That means that in every request UUID string could be used instead of numerical id. -Example: +At the same time, it could be a relative similarity between a pair of objects. +For example, twins look more alike to each other than a pair of random people. +And random people are more similar to each other than a man and a cat. +A model can use such relative examples for learning. -httppythontypescriptrustjavacsharpgo +The good news is that the division into classes is only a special case of determining similarity. +To use such datasets, it is enough to declare samples from one class as positive and samples from another class as negative. +In this way, it is possible to combine several datasets with mismatched classes into one generalized dataset for metric learning. -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": "5c56c793-69f3-4fbf-87e6-c4bf54c28c26",\ - "payload": {"color": "red"},\ - "vector": [0.9, 0.1, 0.1]\ - }\ - ] -} +But not only datasets with division into classes are suitable for extracting positive and negative examples. +If, for example, there are additional features in the description of the object, the value of these features can also be used as a similarity factor. +It may not be as explicit as class membership, but the relative similarity is also suitable for learning. -``` +In the case of job descriptions, there are many ontologies of occupations, which were able to be combined into a single dataset thanks to this approach. +We even went a step further and used identical job titles to find similar descriptions. -```python -from qdrant_client import QdrantClient, models +As a result, we got a self-supervised universal dataset that did not require any manual labeling. -client = QdrantClient(url="http://localhost:6333") +Unfortunately, universality does not allow some techniques to be applied in training. +Next, I will describe how to overcome this disadvantage. 
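+As a rough illustration of how such a self-supervised dataset can be assembled (the records and titles below are made up), positive pairs can be taken from descriptions that share a job title and negative pairs from descriptions with different titles:
+
+```python
+import itertools
+
+# Toy records: (job_title, description). Titles act as the similarity signal,
+# so no manual labeling is needed.
+records = [
+    ("python developer", "Backend services in Django"),
+    ("python developer", "Data pipelines and APIs in Python"),
+    ("truck driver", "Long-haul freight transportation"),
+    ("truck driver", "Regional delivery routes"),
+]
+
+positive_pairs, negative_pairs = [], []
+for (title_a, desc_a), (title_b, desc_b) in itertools.combinations(records, 2):
+    pair = (desc_a, desc_b)
+    if title_a == title_b:
+        positive_pairs.append(pair)
+    else:
+        negative_pairs.append(pair)
+
+print(len(positive_pairs), "positive pairs,", len(negative_pairs), "negative pairs")
+```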
-client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id="5c56c793-69f3-4fbf-87e6-c4bf54c28c26",\ - payload={\ - "color": "red",\ - },\ - vector=[0.9, 0.1, 0.1],\ - ),\ - ], -) +## Training the model -``` +There are several ways to train a metric learning model. +Among the most popular is the use of Triplet or Contrastive loss functions, but I will not go deep into them in this article. +However, I will tell you about one interesting trick that helped us work with unified training examples. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +One of the most important practices to efficiently train the metric learning model is hard negative mining. +This technique aims to include negative samples on which model gave worse predictions during the last training epoch. +Most articles that describe this technique assume that training data consists of many small classes (in most cases it is people's faces). +With data like this, it is easy to find bad samples - if two samples from different classes have a high similarity score, we can use it as a negative sample. +But we had no such classes in our data, the only thing we have is occupation pairs assumed to be similar in some way. +We cannot guarantee that there is no better match for each job occupation among this pair. +That is why we can't use hard negative mining for our model. -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.upsert("{collection_name}", { - points: [\ - {\ - id: "5c56c793-69f3-4fbf-87e6-c4bf54c28c26",\ - payload: {\ - color: "red",\ - },\ - vector: [0.9, 0.1, 0.1],\ - },\ - ], -}); +![Loss variations](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/losses.png) +> [Alfonso Medela et al.](https://arxiv.org/abs/1905.10675) via arXiv -``` -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::Qdrant; +To compensate for this limitation we can try to increase the number of random (weak) negative samples. +One way to achieve this is to train the model longer, so it will see more samples by the end of the training. +But we found a better solution in adjusting our loss function. +In a regular implementation of Triplet or Contractive loss, each positive pair is compared with some or a few negative samples. +What we did is we allow pair comparison amongst the whole batch. +That means that loss-function penalizes all pairs of random objects if its score exceeds any of the positive scores in a batch. +This extension gives `~ N * B^2` comparisons where `B` is a size of batch and `N` is a number of batches. +Much bigger than `~ N * B` in regular triplet loss. +This means that increasing the size of the batch significantly increases the number of negative comparisons, and therefore should improve the model performance. +We were able to observe this dependence in our experiments. +Similar idea we also found in the article [Supervised Contrastive Learning](https://arxiv.org/abs/2004.11362). -let client = Qdrant::from_url("http://localhost:6334").build()?; -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![PointStruct::new(\ - "5c56c793-69f3-4fbf-87e6-c4bf54c28c26",\ - vec![0.9, 0.1, 0.1],\ - [("color", "Red".into())],\ - )], - ) - .wait(true), - ) - .await?; +## Model confidence -``` +In real life it is often needed to know how confident the model was in the prediction. 
+Whether manual adjustment or validation of the result is required. -```java -import java.util.List; -import java.util.Map; -import java.util.UUID; +With conventional classification, it is easy to understand by scores how confident the model is in the result. +If the probability values of different classes are close to each other, the model is not confident. +If, on the contrary, the most probable class differs greatly, then the model is confident. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +At first glance, this cannot be applied to metric learning. +Even if the predicted object similarity score is small it might only mean that the reference set has no proper objects to compare with. +Conversely, the model can group garbage objects with a large score. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +Fortunately, we found a small modification to the embedding generator, which allows us to define confidence in the same way as it is done in conventional classifiers with a Softmax activation function. +The modification consists in building an embedding as a combination of feature groups. +Each feature group is presented as a one-hot encoded sub-vector in the embedding. +If the model can confidently predict the feature value - the corresponding sub-vector will have a high absolute value in some of its elements. +For a more intuitive understanding, I recommend thinking about embeddings not as points in space, but as a set of binary features. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +To implement this modification and form proper feature groups we would need to change a regular linear output layer to a concatenation of several Softmax layers. +Each softmax component would represent an independent feature and force the neural network to learn them. -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(UUID.fromString("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"))) - .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) - .putAllPayload(Map.of("color", value("Red"))) - .build())) - .get(); +Let's take for example that we have 4 softmax components with 128 elements each. +Every such component could be roughly imagined as a one-hot-encoded number in the range of 0 to 127. +Thus, the resulting vector will represent one of `128^4` possible combinations. +If the trained model is good enough, you can even try to interpret the values of singular features individually. -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +![Softmax feature embeddings](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/feature_embedding.png) +> Softmax feature embeddings, Image by Author. -var client = new QdrantClient("localhost", 6334); -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = Guid.Parse("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"), - Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, - Payload = { ["color"] = "Red" } - } - } -); +## Neural rules -``` +Machine learning models rarely train to 100% accuracy. +In a conventional classifier, errors can only be eliminated by modifying and repeating the training process. 
+Metric training, however, is more flexible in this matter and allows you to introduce additional steps that allow you to correct the errors of an already trained model. -```go -import ( - "context" +A common error of the metric learning model is erroneously declaring objects close although in reality they are not. +To correct this kind of error, we introduce exclusion rules. - "github.com/qdrant/go-client/qdrant" -) +Rules consist of 2 object anchors encoded into vector space. +If the target object falls into one of the anchors' effects area - it triggers the rule. It will exclude all objects in the second anchor area from the prediction result. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +![Exclusion rules](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/exclusion_rule.png) +> Neural exclusion rules, Image by Author. -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewID("5c56c793-69f3-4fbf-87e6-c4bf54c28c26"), - Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), - Payload: qdrant.NewValueMap(map[string]any{"color": "Red"}), - }, - }, -}) +The convenience of working with embeddings is that regardless of the number of rules, +you only need to perform the encoding once per object. +Then to find a suitable rule, it is enough to compare the target object's embedding and the pre-calculated embeddings of the rule's anchors. +Which, when implemented, translates into just one additional query to the vector search engine. + -``` +## Vector search in production -and +When implementing a metric learning model in production, the question arises about the storage and management of vectors. +It should be easy to add new vectors if new job descriptions appear in the service. -httppythontypescriptrustjavacsharpgo +In our case, we also needed to apply additional conditions to the search. +We needed to filter, for example, the location of candidates and the level of language proficiency. -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "payload": {"color": "red"},\ - "vector": [0.9, 0.1, 0.1]\ - }\ - ] -} +We did not find a ready-made tool for such vector management, so we created [Qdrant](https://github.com/qdrant/qdrant) - open-source vector search engine. -``` +It allows you to add and delete vectors with a simple API, independent of a programming language you are using. +You can also assign the payload to vectors. +This payload allows additional filtering during the search request. -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={\ - "color": "red",\ - },\ - vector=[0.9, 0.1, 0.1],\ - ),\ - ], -) +Qdrant has a pre-built docker image and start working with it is just as simple as running +```bash +docker run -p 6333:6333 qdrant/qdrant ``` -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - payload: {\ - color: "red",\ - },\ - vector: [0.9, 0.1, 0.1],\ - },\ - ], -}); +Documentation with examples could be found [here](https://api.qdrant.tech/api-reference). 
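+A minimal sketch of this workflow with the Python client might look as follows. The collection name, payload fields, and vector size are made up for illustration, and exact method names can differ between client versions:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.create_collection(
+    collection_name="jobs",
+    vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE),
+)
+
+# Store an embedding together with a payload used later for filtering.
+client.upsert(
+    collection_name="jobs",
+    points=[
+        models.PointStruct(
+            id=1,
+            vector=[0.1, 0.9, 0.2, 0.4],
+            payload={"city": "Berlin", "language": "de"},
+        )
+    ],
+)
+
+# Search by vector similarity, restricted by payload conditions.
+hits = client.search(
+    collection_name="jobs",
+    query_vector=[0.2, 0.8, 0.1, 0.3],
+    query_filter=models.Filter(
+        must=[models.FieldCondition(key="city", match=models.MatchValue(value="Berlin"))]
+    ),
+    limit=3,
+)
+print(hits)
+```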
-``` -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::Qdrant; +## Conclusion -let client = Qdrant::from_url("http://localhost:6334").build()?; +In this article, I have shown how metric learning can be more scalable and flexible than the classification models. +I suggest trying similar approaches in your tasks - it might be matching similar texts, images, or audio data. +With the existing variety of pre-trained neural networks and a vector search engine, it is easy to build your metric learning-based application. -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![PointStruct::new(\ - 1,\ - vec![0.9, 0.1, 0.1],\ - [("color", "Red".into())],\ - )], - ) - .wait(true), - ) - .await?; +<|page-69-lllmstxt|> +Anomaly detection is a thirsting yet challenging task that has numerous use cases across various industries. +The complexity results mainly from the fact that the task is data-scarce by definition. -``` +Similarly, anomalies are, again by definition, subject to frequent change, and they may take unexpected forms. +For that reason, supervised classification-based approaches are: -```java -import java.util.List; -import java.util.Map; +* Data-hungry - requiring quite a number of labeled data; +* Expensive - data labeling is an expensive task itself; +* Time-consuming - you would try to obtain what is necessarily scarce; +* Hard to maintain - you would need to re-train the model repeatedly in response to changes in the data distribution. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +These are not desirable features if you want to put your model into production in a rapidly-changing environment. +And, despite all the mentioned difficulties, they do not necessarily offer superior performance compared to the alternatives. +In this post, we will detail the lessons learned from such a use case. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +## Coffee Beans -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +[Agrivero.ai](https://agrivero.ai/) - is a company making AI-enabled solution for quality control & traceability of green coffee for producers, traders, and roasters. +They have collected and labeled more than **30 thousand** images of coffee beans with various defects - wet, broken, chipped, or bug-infested samples. +This data is used to train a classifier that evaluates crop quality and highlights possible problems. -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) - .putAllPayload(Map.of("color", value("Red"))) - .build())) - .get(); +{{< figure src=/articles_data/detecting-coffee-anomalies/detection.gif caption="Anomalies in coffee" width="400px" >}} -``` +We should note that anomalies are very diverse, so the enumeration of all possible anomalies is a challenging task on it's own. +In the course of work, new types of defects appear, and shooting conditions change. Thus, a one-time labeled dataset becomes insufficient. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Let's find out how metric learning might help to address this challenge. 
-var client = new QdrantClient("localhost", 6334); +## Metric Learning Approach -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, - Payload = { ["color"] = "Red" } - } - } -); +In this approach, we aimed to encode images in an n-dimensional vector space and then use learned similarities to label images during the inference. -``` +The simplest way to do this is KNN classification. +The algorithm retrieves K-nearest neighbors to a given query vector and assigns a label based on the majority vote. -```go -import ( - "context" +In production environment kNN classifier could be easily replaced with [Qdrant](https://github.com/qdrant/qdrant) vector search engine. - "github.com/qdrant/go-client/qdrant" -) +{{< figure src=/articles_data/detecting-coffee-anomalies/anomalies_detection.png caption="Production deployment" >}} -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +This approach has the following advantages: -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), - Payload: qdrant.NewValueMap(map[string]any{"color": "Red"}), - }, - }, -}) +* We can benefit from unlabeled data, considering labeling is time-consuming and expensive. +* The relevant metric, e.g., precision or recall, can be tuned according to changing requirements during the inference without re-training. +* Queries labeled with a high score can be added to the KNN classifier on the fly as new data points. -``` +To apply metric learning, we need to have a neural encoder, a model capable of transforming an image into a vector. -are both possible. +Training such an encoder from scratch may require a significant amount of data we might not have. Therefore, we will divide the training into two steps: -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#vectors) Vectors +* The first step is to train the autoencoder, with which we will prepare a model capable of representing the target domain. -Each point in qdrant may have one or more vectors. -Vectors are the central component of the Qdrant architecture, -qdrant relies on different types of vectors to provide different types of data exploration and search. +* The second step is finetuning. Its purpose is to train the model to distinguish the required types of anomalies. -Here is a list of supported vector types: +{{< figure src=/articles_data/detecting-coffee-anomalies/anomaly_detection_training.png caption="Model training architecture" >}} -| | | -| --- | --- | -| Dense Vectors | A regular vectors, generated by majority of the embedding models. | -| Sparse Vectors | Vectors with no fixed length, but only a few non-zero elements.
Useful for exact token match and collaborative filtering recommendations. | -| MultiVectors | Matrices of numbers with fixed length but variable height.
Usually obtained from late interaction models like ColBERT. | -It is possible to attach more than one type of vector to a single point. -In Qdrant we call these Named Vectors. +### Step 1 - Autoencoder for Unlabeled Data -Read more about vector types, how they are stored and optimized in the [vectors](https://qdrant.tech/documentation/concepts/vectors/) section. +First, we pretrained a Resnet18-like model in a vanilla autoencoder architecture by leaving the labels aside. +Autoencoder is a model architecture composed of an encoder and a decoder, with the latter trying to recreate the original input from the low-dimensional bottleneck output of the former. -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#upload-points) Upload points +There is no intuitive evaluation metric to indicate the performance in this setup, but we can evaluate the success by examining the recreated samples visually. -To optimize performance, Qdrant supports batch loading of points. I.e., you can load several points into the service in one API call. -Batching allows you to minimize the overhead of creating a network connection. +{{< figure src=/articles_data/detecting-coffee-anomalies/image_reconstruction.png caption="Example of image reconstruction with Autoencoder" >}} -The Qdrant API supports two ways of creating batches - record-oriented and column-oriented. -Internally, these options do not differ and are made only for the convenience of interaction. +Then we encoded a subset of the data into 128-dimensional vectors by using the encoder, +and created a KNN classifier on top of these embeddings and associated labels. -Create points with batch: +Although the results are promising, we can do even better by finetuning with metric learning. -httppythontypescript +### Step 2 - Finetuning with Metric Learning -```http -PUT /collections/{collection_name}/points -{ - "batch": { - "ids": [1, 2, 3], - "payloads": [\ - {"color": "red"},\ - {"color": "green"},\ - {"color": "blue"}\ - ], - "vectors": [\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9]\ - ] - } -} +We started by selecting 200 labeled samples randomly without replacement. -``` +In this step, The model was composed of the encoder part of the autoencoder with a randomly initialized projection layer stacked on top of it. +We applied transfer learning from the frozen encoder and trained only the projection layer with Triplet Loss and an online batch-all triplet mining strategy. -```python -client.upsert( - collection_name="{collection_name}", - points=models.Batch( - ids=[1, 2, 3], - payloads=[\ - {"color": "red"},\ - {"color": "green"},\ - {"color": "blue"},\ - ], - vectors=[\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9],\ - ], - ), -) +Unfortunately, the model overfitted quickly in this attempt. +In the next experiment, we used an online batch-hard strategy with a trick to prevent vector space from collapsing. +We will describe our approach in the further articles. -``` +This time it converged smoothly, and our evaluation metrics also improved considerably to match the supervised classification approach. 
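+For reference, the KNN classification used to evaluate the embeddings in both steps can be sketched roughly as follows; the random arrays below are placeholders for the encoder outputs and the labels of the small labeled subset:
+
+```python
+import numpy as np
+from sklearn.neighbors import KNeighborsClassifier
+
+# Placeholders: in the real setup these are 128-dimensional embeddings
+# produced by the (finetuned) encoder and the associated defect labels.
+train_embeddings = np.random.rand(200, 128)
+train_labels = np.random.randint(0, 2, size=200)  # e.g., 0 = good bean, 1 = defect
+
+knn = KNeighborsClassifier(n_neighbors=5)
+knn.fit(train_embeddings, train_labels)
+
+# A query image is encoded and labeled by the majority vote of its nearest neighbors.
+query_embedding = np.random.rand(1, 128)
+print(knn.predict(query_embedding))
+```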
-```typescript -client.upsert("{collection_name}", { - batch: { - ids: [1, 2, 3], - payloads: [{ color: "red" }, { color: "green" }, { color: "blue" }], - vectors: [\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9],\ - ], - }, -}); +{{< figure src=/articles_data/detecting-coffee-anomalies/ae_report_knn.png caption="Metrics for the autoencoder model with KNN classifier" >}} -``` +{{< figure src=/articles_data/detecting-coffee-anomalies/ft_report_knn.png caption="Metrics for the finetuned model with KNN classifier" >}} -or record-oriented equivalent: +We repeated this experiment with 500 and 2000 samples, but it showed only a slight improvement. +Thus we decided to stick to 200 samples - see below for why. -httppythontypescriptrustjavacsharpgo +## Supervised Classification Approach +We also wanted to compare our results with the metrics of a traditional supervised classification model. +For this purpose, a Resnet50 model was finetuned with ~30k labeled images, made available for training. +Surprisingly, the F1 score was around ~0.86. -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "payload": {"color": "red"},\ - "vector": [0.9, 0.1, 0.1]\ - },\ - {\ - "id": 2,\ - "payload": {"color": "green"},\ - "vector": [0.1, 0.9, 0.1]\ - },\ - {\ - "id": 3,\ - "payload": {"color": "blue"},\ - "vector": [0.1, 0.1, 0.9]\ - }\ - ] -} +Please note that we used only 200 labeled samples in the metric learning approach instead of ~30k in the supervised classification approach. +These numbers indicate a huge saving with no considerable compromise in the performance. -``` +## Conclusion +We obtained results comparable to those of the supervised classification method by using **only 0.66%** of the labeled data with metric learning. +This approach is time-saving and resource-efficient, and that may be improved further. Possible next steps might be: -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={\ - "color": "red",\ - },\ - vector=[0.9, 0.1, 0.1],\ - ),\ - models.PointStruct(\ - id=2,\ - payload={\ - "color": "green",\ - },\ - vector=[0.1, 0.9, 0.1],\ - ),\ - models.PointStruct(\ - id=3,\ - payload={\ - "color": "blue",\ - },\ - vector=[0.1, 0.1, 0.9],\ - ),\ - ], -) +- Collect more unlabeled data and pretrain a larger autoencoder. +- Obtain high-quality labels for a small number of images instead of tens of thousands for finetuning. +- Use hyperparameter optimization and possibly gradual unfreezing in the finetuning step. +- Use [vector search engine](https://github.com/qdrant/qdrant) to serve Metric Learning in production. -``` +We are actively looking into these, and we will continue to publish our findings in this challenge and other use cases of metric learning. -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - payload: { color: "red" },\ - vector: [0.9, 0.1, 0.1],\ - },\ - {\ - id: 2,\ - payload: { color: "green" },\ - vector: [0.1, 0.9, 0.1],\ - },\ - {\ - id: 3,\ - payload: { color: "blue" },\ - vector: [0.1, 0.1, 0.9],\ - },\ - ], -}); +<|page-70-lllmstxt|> +## What is Triplet Loss? -``` +Triplet Loss was first introduced in [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/abs/1503.03832) in 2015, +and it has been one of the most popular loss functions for supervised similarity or metric learning ever since. 
+In its simplest explanation, Triplet Loss encourages that dissimilar pairs be distant from any similar pairs by at least a certain margin value. +Mathematically, the loss value can be calculated as +$L=max(d(a,p) - d(a,n) + m, 0)$, where: -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +- $p$, i.e., positive, is a sample that has the same label as $a$, i.e., anchor, +- $n$, i.e., negative, is another sample that has a label different from $a$, +- $d$ is a function to measure the distance between these three samples, +- and $m$ is a margin value to keep negative samples far apart. -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![\ - PointStruct::new(1, vec![0.9, 0.1, 0.1], [("city", "red".into())]),\ - PointStruct::new(2, vec![0.1, 0.9, 0.1], [("city", "green".into())]),\ - PointStruct::new(3, vec![0.1, 0.1, 0.9], [("city", "blue".into())]),\ - ], - ) - .wait(true), - ) - .await?; +The paper uses Euclidean distance, but it is equally valid to use any other distance metric, e.g., cosine distance. -``` +The function has a learning objective that can be visualized as in the following: -```java -import java.util.List; -import java.util.Map; +{{< figure src=/articles_data/triplet-loss/loss_objective.png caption="Triplet Loss learning objective" >}} -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +Notice that Triplet Loss does not have a side effect of urging to encode anchor and positive samples into the same point +in the vector space as in Contrastive Loss. +This lets Triplet Loss tolerate some intra-class variance, unlike Contrastive Loss, +as the latter forces the distance between an anchor and any positive essentially to $0$. +In other terms, Triplet Loss allows to stretch clusters in such a way as to include outliers +while still ensuring a margin between samples from different clusters, e.g., negative pairs. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +Additionally, Triplet Loss is less greedy. Unlike Contrastive Loss, +it is already satisfied when different samples are easily distinguishable from similar ones. It does not change the distances in a positive cluster if +there is no interference from negative examples. +This is due to the fact that Triplet Loss tries to ensure a margin between distances of negative pairs and distances of positive pairs. +However, Contrastive Loss takes into account the margin value only when comparing dissimilar pairs, +and it does not care at all where similar pairs are at that moment. +This means that Contrastive Loss may reach a local minimum earlier, +while Triplet Loss may continue to organize the vector space in a better state. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Let's demonstrate how two loss functions organize the vector space by animations. +For simpler visualization, the vectors are represented by points in a 2-dimensional space, +and they are selected randomly from a normal distribution. 
-client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.9f, 0.1f, 0.1f)) - .putAllPayload(Map.of("color", value("red"))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors(vectors(0.1f, 0.9f, 0.1f)) - .putAllPayload(Map.of("color", value("green"))) - .build(), - PointStruct.newBuilder() - .setId(id(3)) - .setVectors(vectors(0.1f, 0.1f, 0.9f)) - .putAllPayload(Map.of("color", value("blue"))) - .build())) - .get(); +{{< figure src=/articles_data/triplet-loss/contrastive.gif caption="Animation that shows how Contrastive Loss moves points in the course of training." >}} -``` +{{< figure src=/articles_data/triplet-loss/triplet.gif caption="Animation that shows how Triplet Loss moves points in the course of training." >}} -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +From mathematical interpretations of the two-loss functions, it is clear that Triplet Loss is theoretically stronger, +but Triplet Loss has additional tricks that help it work better. +Most importantly, Triplet Loss introduce online triplet mining strategies, e.g., automatically forming the most useful triplets. -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new[] { 0.9f, 0.1f, 0.1f }, - Payload = { ["color"] = "red" } - }, - new() - { - Id = 2, - Vectors = new[] { 0.1f, 0.9f, 0.1f }, - Payload = { ["color"] = "green" } - }, - new() - { - Id = 3, - Vectors = new[] { 0.1f, 0.1f, 0.9f }, - Payload = { ["color"] = "blue" } - } - } -); +## Why triplet mining matters? -``` +The formulation of Triplet Loss demonstrates that it works on three objects at a time: -```go -import ( - "context" +- `anchor`, +- `positive` - a sample that has the same label as the anchor, +- and `negative` - a sample with a different label from the anchor and the positive. - "github.com/qdrant/go-client/qdrant" -) +In a naive implementation, we could form such triplets of samples at the beginning of each epoch +and then feed batches of such triplets to the model throughout that epoch. This is called "offline strategy." +However, this would not be so efficient for several reasons: +- It needs to pass $3n$ samples to get a loss value of $n$ triplets. +- Not all these triplets will be useful for the model to learn anything, e.g., yielding a positive loss value. +- Even if we form "useful" triplets at the beginning of each epoch with one of the methods that I will be implementing in this series, +they may become "useless" at some point in the epoch as the model weights will be constantly updated. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Instead, we can get a batch of $n$ samples and their associated labels, +and form triplets on the fly. That is called "online strategy." Normally, this gives +$n^3$ possible triplets, but only a subset of such possible triplets will be actually valid. Even in this case, +we will have a loss value calculated from much more triplets than the offline strategy. 
-client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"color": "red"}), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"color": "green"}), - }, - { - Id: qdrant.NewIDNum(3), - Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), - Payload: qdrant.NewValueMap(map[string]any{"color": "blue"}), - }, - }, -}) +Given a triplet of `(a, p, n)`, it is valid only if: -``` +- `a` and `p` has the same label, +- `a` and `p` are distinct samples, +- and `n` has a different label from `a` and `p`. -The Python client has additional features for loading points, which include: +These constraints may seem to be requiring expensive computation with nested loops, +but it can be efficiently implemented with tricks such as distance matrix, masking, and broadcasting. +The rest of this series will focus on the implementation of these tricks. -- Parallelization -- A retry mechanism -- Lazy batching support -For example, you can read your data directly from hard drives, to avoid storing all data in RAM. You can use these -features with the `upload_collection` and `upload_points` methods. -Similar to the basic upsert API, these methods support both record-oriented and column-oriented formats. +## Distance matrix -Column-oriented format: +A distance matrix is a matrix of shape $(n, n)$ to hold distance values between all possible +pairs made from items in two $n$-sized collections. +This matrix can be used to vectorize calculations that would need inefficient loops otherwise. +Its calculation can be optimized as well, and we will implement [Euclidean Distance Matrix Trick (PDF)](https://www.robots.ox.ac.uk/~albanie/notes/Euclidean_distance_trick.pdf) +explained by Samuel Albanie. You may want to read this three-page document for +the full intuition of the trick, but a brief explanation is as follows: -```python -client.upload_collection( - collection_name="{collection_name}", - ids=[1, 2], - payload=[\ - {"color": "red"},\ - {"color": "green"},\ - ], - vectors=[\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - ], - parallel=4, - max_retries=3, -) +- Calculate the dot product of two collections of vectors, e.g., embeddings in our case. +- Extract the diagonal from this matrix that holds the squared Euclidean norm of each embedding. +- Calculate the squared Euclidean distance matrix based on the following equation: $||a - b||^2 = ||a||^2 - 2 ⟹a, b⟩ + ||b||^2$ +- Get the square root of this matrix for non-squared distances. -``` +We will implement it in PyTorch, so let's start with imports. -Record-oriented format: ```python -client.upload_points( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={\ - "color": "red",\ - },\ - vector=[0.9, 0.1, 0.1],\ - ),\ - models.PointStruct(\ - id=2,\ - payload={\ - "color": "green",\ - },\ - vector=[0.1, 0.9, 0.1],\ - ),\ - ], - parallel=4, - max_retries=3, -) +import torch +import torch.nn as nn +import torch.nn.functional as F +eps = 1e-8 # an arbitrary small value to be used for numerical stability tricks ``` -All APIs in Qdrant, including point loading, are idempotent. -It means that executing the same method several times in a row is equivalent to a single execution. +--- -In this case, it means that points with the same id will be overwritten when re-uploaded. 
+```python +def euclidean_distance_matrix(x): + """Efficient computation of Euclidean distance matrix -Idempotence property is useful if you use, for example, a message queue that doesn’t provide an exactly-ones guarantee. -Even with such a system, Qdrant ensures data consistency. + Args: + x: Input tensor of shape (batch_size, embedding_dim) + + Returns: + Distance matrix of shape (batch_size, batch_size) + """ + # step 1 - compute the dot product -[_Available as of v0.10.0_](https://qdrant.tech/documentation/concepts/points/#create-vector-name) + # shape: (batch_size, batch_size) + dot_product = torch.mm(x, x.t()) -If the collection was created with multiple vectors, each vector data can be provided using the vector’s name: + # step 2 - extract the squared Euclidean norm from the diagonal -httppythontypescriptrustjavacsharpgo + # shape: (batch_size,) + squared_norm = torch.diag(dot_product) -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "vector": {\ - "image": [0.9, 0.1, 0.1, 0.2],\ - "text": [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2]\ - }\ - },\ - {\ - "id": 2,\ - "vector": {\ - "image": [0.2, 0.1, 0.3, 0.9],\ - "text": [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9]\ - }\ - }\ - ] -} + # step 3 - compute squared Euclidean distances -``` + # shape: (batch_size, batch_size) + distance_matrix = squared_norm.unsqueeze(0) - 2 * dot_product + squared_norm.unsqueeze(1) -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - vector={\ - "image": [0.9, 0.1, 0.1, 0.2],\ - "text": [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2],\ - },\ - ),\ - models.PointStruct(\ - id=2,\ - vector={\ - "image": [0.2, 0.1, 0.3, 0.9],\ - "text": [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9],\ - },\ - ),\ - ], -) + # get rid of negative distances due to numerical instabilities + distance_matrix = F.relu(distance_matrix) -``` + # step 4 - compute the non-squared distances + + # handle numerical stability + # derivative of the square root operation applied to 0 is infinite + # we need to handle by setting any 0 to eps + mask = (distance_matrix == 0.0).float() -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: {\ - image: [0.9, 0.1, 0.1, 0.2],\ - text: [0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2],\ - },\ - },\ - {\ - id: 2,\ - vector: {\ - image: [0.2, 0.1, 0.3, 0.9],\ - text: [0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9],\ - },\ - },\ - ], -}); + # use this mask to set indices with a value of 0 to eps + distance_matrix += mask * eps -``` + # now it is safe to get the square root + distance_matrix = torch.sqrt(distance_matrix) -```rust -use std::collections::HashMap; + # undo the trick for numerical stability + distance_matrix *= (1.0 - mask) -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::Payload; + return distance_matrix +``` -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![\ - PointStruct::new(\ - 1,\ - HashMap::from([\ - ("image".to_string(), vec![0.9, 0.1, 0.1, 0.2]),\ - (\ - "text".to_string(),\ - vec![0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2],\ - ),\ - ]),\ - Payload::default(),\ - ),\ - PointStruct::new(\ - 2,\ - HashMap::from([\ - ("image".to_string(), vec![0.2, 0.1, 0.3, 0.9]),\ - (\ - "text".to_string(),\ - vec![0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9],\ - ),\ - ]),\ - Payload::default(),\ - ),\ - ], - ) - .wait(true), - ) - .await?; +## Invalid triplet masking -``` +Now that we can compute a distance matrix for all possible pairs of 
embeddings in a batch, +we can apply broadcasting to enumerate distance differences for all possible triplets and represent them in a tensor of shape `(batch_size, batch_size, batch_size)`. +However, only a subset of these $n^3$ triplets are actually valid as I mentioned earlier, +and we need a corresponding mask to compute the loss value correctly. +We will implement such a helper function in three steps: -```java -import java.util.List; -import java.util.Map; +- Compute a mask for distinct indices, e.g., `(i != j and j != k)`. +- Compute a mask for valid anchor-positive-negative triplets, e.g., `labels[i] == labels[j] and labels[j] != labels[k]`. +- Combine two masks. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorFactory.vector; -import static io.qdrant.client.VectorsFactory.namedVectors; -import io.qdrant.client.grpc.Points.PointStruct; +```python +def get_triplet_mask(labels): + """compute a mask for valid triplets -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors( - namedVectors( - Map.of( - "image", - vector(List.of(0.9f, 0.1f, 0.1f, 0.2f)), - "text", - vector(List.of(0.4f, 0.7f, 0.1f, 0.8f, 0.1f, 0.1f, 0.9f, 0.2f))))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors( - namedVectors( - Map.of( - "image", - List.of(0.2f, 0.1f, 0.3f, 0.9f), - "text", - List.of(0.5f, 0.2f, 0.7f, 0.4f, 0.7f, 0.2f, 0.3f, 0.9f)))) - .build())) - .get(); + Args: + labels: Batch of integer labels. shape: (batch_size,) -``` + Returns: + Mask tensor to indicate which triplets are actually valid. Shape: (batch_size, batch_size, batch_size) + A triplet is valid if: + `labels[i] == labels[j] and labels[i] != labels[k]` + and `i`, `j`, `k` are different. 
+ """ + # step 1 - get a mask for distinct indices -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; + # shape: (batch_size, batch_size) + indices_equal = torch.eye(labels.size()[0], dtype=torch.bool, device=labels.device) + indices_not_equal = torch.logical_not(indices_equal) + # shape: (batch_size, batch_size, 1) + i_not_equal_j = indices_not_equal.unsqueeze(2) + # shape: (batch_size, 1, batch_size) + i_not_equal_k = indices_not_equal.unsqueeze(1) + # shape: (1, batch_size, batch_size) + j_not_equal_k = indices_not_equal.unsqueeze(0) + # Shape: (batch_size, batch_size, batch_size) + distinct_indices = torch.logical_and(torch.logical_and(i_not_equal_j, i_not_equal_k), j_not_equal_k) -var client = new QdrantClient("localhost", 6334); + # step 2 - get a mask for valid anchor-positive-negative triplets -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new Dictionary - { - ["image"] = [0.9f, 0.1f, 0.1f, 0.2f], - ["text"] = [0.4f, 0.7f, 0.1f, 0.8f, 0.1f, 0.1f, 0.9f, 0.2f] - } - }, - new() - { - Id = 2, - Vectors = new Dictionary - { - ["image"] = [0.2f, 0.1f, 0.3f, 0.9f], - ["text"] = [0.5f, 0.2f, 0.7f, 0.4f, 0.7f, 0.2f, 0.3f, 0.9f] - } - } - } -); + # shape: (batch_size, batch_size) + labels_equal = labels.unsqueeze(0) == labels.unsqueeze(1) + # shape: (batch_size, batch_size, 1) + i_equal_j = labels_equal.unsqueeze(2) + # shape: (batch_size, 1, batch_size) + i_equal_k = labels_equal.unsqueeze(1) + # shape: (batch_size, batch_size, batch_size) + valid_indices = torch.logical_and(i_equal_j, torch.logical_not(i_equal_k)) + # step 3 - combine two masks + mask = torch.logical_and(distinct_indices, valid_indices) + + return mask ``` -```go -import ( - "context" +## Batch-all strategy for online triplet mining - "github.com/qdrant/go-client/qdrant" -) +Now we are ready for actually implementing Triplet Loss itself. +Triplet Loss involves several strategies to form or select triplets, and the simplest one is +to use all valid triplets that can be formed from samples in a batch. +This can be achieved in four easy steps thanks to utility functions we've already implemented: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +- Get a distance matrix of all possible pairs that can be formed from embeddings in a batch. +- Apply broadcasting to this matrix to compute loss values for all possible triplets. +- Set loss values of invalid or easy triplets to $0$. +- Average the remaining positive values to return a scalar loss. -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "image": qdrant.NewVector(0.9, 0.1, 0.1, 0.2), - "text": qdrant.NewVector(0.4, 0.7, 0.1, 0.8, 0.1, 0.1, 0.9, 0.2), - }), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "image": qdrant.NewVector(0.2, 0.1, 0.3, 0.9), - "text": qdrant.NewVector(0.5, 0.2, 0.7, 0.4, 0.7, 0.2, 0.3, 0.9), - }), - }, - }, -}) +I will start by implementing this strategy, and more complex ones will follow as separate posts. -``` -_Available as of v1.2.0_ +```python +class BatchAllTtripletLoss(nn.Module): + """Uses all valid triplets to compute Triplet loss -Named vectors are optional. When uploading points, some vectors may be omitted. 
-For example, you can upload one point with only the `image` vector and a second -one with only the `text` vector. + Args: + margin: Margin value in the Triplet Loss equation + """ + def __init__(self, margin=1.): + super().__init__() + self.margin = margin + + def forward(self, embeddings, labels): + """computes loss value. -When uploading a point with an existing ID, the existing point is deleted first, -then it is inserted with just the specified vectors. In other words, the entire -point is replaced, and any unspecified vectors are set to null. To keep existing -vectors unchanged and only update specified vectors, see [update vectors](https://qdrant.tech/documentation/concepts/points/#update-vectors). + Args: + embeddings: Batch of embeddings, e.g., output of the encoder. shape: (batch_size, embedding_dim) + labels: Batch of integer labels associated with embeddings. shape: (batch_size,) -_Available as of v1.7.0_ + Returns: + Scalar loss value. + """ + # step 1 - get distance matrix + # shape: (batch_size, batch_size) + distance_matrix = euclidean_distance_matrix(embeddings) -Points can contain dense and sparse vectors. + # step 2 - compute loss values for all triplets by applying broadcasting to distance matrix -A sparse vector is an array in which most of the elements have a value of zero. + # shape: (batch_size, batch_size, 1) + anchor_positive_dists = distance_matrix.unsqueeze(2) + # shape: (batch_size, 1, batch_size) + anchor_negative_dists = distance_matrix.unsqueeze(1) + # get loss values for all possible n^3 triplets + # shape: (batch_size, batch_size, batch_size) + triplet_loss = anchor_positive_dists - anchor_negative_dists + self.margin -It is possible to take advantage of this property to have an optimized representation, for this reason they have a different shape than dense vectors. + # step 3 - filter out invalid or easy triplets by setting their loss values to 0 -They are represented as a list of `(index, value)` pairs, where `index` is an integer and `value` is a floating point number. The `index` is the position of the non-zero value in the vector. The `values` is the value of the non-zero element. + # shape: (batch_size, batch_size, batch_size) + mask = get_triplet_mask(labels) + triplet_loss *= mask + # easy triplets have negative loss values + triplet_loss = F.relu(triplet_loss) -For example, the following vector: + # step 4 - compute scalar loss value by averaging positive losses + num_positive_losses = (triplet_loss > eps).float().sum() + triplet_loss = triplet_loss.sum() / (num_positive_losses + eps) + return triplet_loss ``` -[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0] -``` +## Conclusion -can be represented as a sparse vector: +I mentioned that Triplet Loss is different from Contrastive Loss not only mathematically but also in its sample selection strategies, and I implemented the batch-all strategy for online triplet mining in this post +efficiently by using several tricks. -``` -[(6, 1.0), (7, 2.0)] +There are other more complicated strategies such as batch-hard and batch-semihard mining, +but their implementations, and discussions of the tricks I used for efficiency in this post, +are worth separate posts of their own. -``` +The future posts will cover such topics and additional discussions on some tricks +to avoid vector collapsing and control intra-class and inter-class variance. -Qdrant uses the following JSON representation throughout its APIs. 
+<|page-71-lllmstxt|> +# Neural Search 101: A Comprehensive Guide and Step-by-Step Tutorial -```json -{ - "indices": [6, 7], - "values": [1.0, 2.0] -} +Information retrieval technology is one of the main technologies that enabled the modern Internet to exist. +These days, search technology is the heart of a variety of applications. +From web-pages search to product recommendations. +For many years, this technology didn't get much change until neural networks came into play. -``` +In this guide we are going to find answers to these questions: -The `indices` and `values` arrays must have the same length. -And the `indices` must be unique. +* What is the difference between regular and neural search? +* What neural networks could be used for search? +* In what tasks is neural network search useful? +* How to build and deploy own neural search service step-by-step? -If the `indices` are not sorted, Qdrant will sort them internally so you may not rely on the order of the elements. +## What is neural search? -Sparse vectors must be named and can be uploaded in the same way as dense vectors. +A regular full-text search, such as Google's, consists of searching for keywords inside a document. +For this reason, the algorithm can not take into account the real meaning of the query and documents. +Many documents that might be of interest to the user are not found because they use different wording. -httppythontypescriptrustjavacsharpgo +Neural search tries to solve exactly this problem - it attempts to enable searches not by keywords but by meaning. +To achieve this, the search works in 2 steps. +In the first step, a specially trained neural network encoder converts the query and the searched objects into a vector representation called embeddings. +The encoder must be trained so that similar objects, such as texts with the same meaning or alike pictures get a close vector representation. -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "vector": {\ - "text": {\ - "indices": [6, 7],\ - "values": [1.0, 2.0]\ - }\ - }\ - },\ - {\ - "id": 2,\ - "vector": {\ - "text": {\ - "indices": [1, 2, 4, 15, 33, 34],\ - "values": [0.1, 0.2, 0.3, 0.4, 0.5]\ - }\ - }\ - }\ - ] -} +![Encoders and embedding space](https://gist.githubusercontent.com/generall/c229cc94be8c15095286b0c55a3f19d7/raw/e52e3f1a320cd985ebc96f48955d7f355de8876c/encoders.png) -``` +Having this vector representation, it is easy to understand what the second step should be. +To find documents similar to the query you now just need to find the nearest vectors. +The most convenient way to determine the distance between two vectors is to calculate the cosine distance. +The usual Euclidean distance can also be used, but it is not so efficient due to [the curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality). -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - vector={\ - "text": models.SparseVector(\ - indices=[6, 7],\ - values=[1.0, 2.0],\ - )\ - },\ - ),\ - models.PointStruct(\ - id=2,\ - vector={\ - "text": models.SparseVector(\ - indices=[1, 2, 3, 4, 5],\ - values=[0.1, 0.2, 0.3, 0.4, 0.5],\ - )\ - },\ - ),\ - ], -) +## Which model could be used? -``` +It is ideal to use a model specially trained to determine the closeness of meanings. +For example, models trained on Semantic Textual Similarity (STS) datasets. 
+Current state-of-the-art models can be found on this [leaderboard](https://paperswithcode.com/sota/semantic-textual-similarity-on-sts-benchmark?p=roberta-a-robustly-optimized-bert-pretraining). -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: {\ - text: {\ - indices: [6, 7],\ - values: [1.0, 2.0],\ - },\ - },\ - },\ - {\ - id: 2,\ - vector: {\ - text: {\ - indices: [1, 2, 3, 4, 5],\ - values: [0.1, 0.2, 0.3, 0.4, 0.5],\ - },\ - },\ - },\ - ], -}); +However, not only specially trained models can be used. +If the model is trained on a large enough dataset, its internal features can work as embeddings too. +So, for instance, you can take any pre-trained on ImageNet model and cut off the last layer from it. +In the penultimate layer of the neural network, as a rule, the highest-level features are formed, which, however, do not correspond to specific classes. +The output of this layer can be used as an embedding. -``` +## What tasks is neural search good for? -```rust -use std::collections::HashMap; +Neural search has the greatest advantage in areas where the query cannot be formulated precisely. +Querying a table in an SQL database is not the best place for neural search. -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder, Vector}; -use qdrant_client::Payload; +On the contrary, if the query itself is fuzzy, or it cannot be formulated as a set of conditions - neural search can help you. +If the search query is a picture, sound file or long text, neural network search is almost the only option. -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![\ - PointStruct::new(\ - 1,\ - HashMap::from([("text".to_string(), vec![(6, 1.0), (7, 2.0)])]),\ - Payload::default(),\ - ),\ - PointStruct::new(\ - 2,\ - HashMap::from([(\ - "text".to_string(),\ - vec![(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4), (5, 0.5)],\ - )]),\ - Payload::default(),\ - ),\ - ], - ) - .wait(true), - ) - .await?; +If you want to build a recommendation system, the neural approach can also be useful. +The user's actions can be encoded in vector space in the same way as a picture or text. +And having those vectors, it is possible to find semantically similar users and determine the next probable user actions. -``` +## Step-by-step neural search tutorial using Qdrant -```java -import java.util.List; -import java.util.Map; +With all that said, let's make our neural network search. +As an example, I decided to make a search for startups by their description. +In this demo, we will see the cases when text search works better and the cases when neural network search works better. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorFactory.vector; -import io.qdrant.client.grpc.Points.NamedVectors; -import io.qdrant.client.grpc.Points.PointStruct; -import io.qdrant.client.grpc.Points.Vectors; +I will use data from [startups-list.com](https://www.startups-list.com/). +Each record contains the name, a paragraph describing the company, the location and a picture. +Raw parsed data can be found at [this link](https://storage.googleapis.com/generall-shared-data/startups_demo.json). 
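If you would like to peek at the raw data before doing anything else, a few lines of Python are enough. This is only an exploratory sketch: it assumes the dump is served as one JSON object per line (the same layout the upload script later in this tutorial expects), and the exact field names are whatever the file actually contains.

```python
import json

import requests

url = "https://storage.googleapis.com/generall-shared-data/startups_demo.json"
raw = requests.get(url).text

# One JSON object per line; inspect the first record to see its fields.
first_record = json.loads(raw.splitlines()[0])
print(sorted(first_record.keys()))
print(first_record)
```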
-client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors( - Vectors.newBuilder() - .setVectors( - NamedVectors.newBuilder() - .putAllVectors( - Map.of( - "text", vector(List.of(1.0f, 2.0f), List.of(6, 7)))) - .build()) - .build()) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors( - Vectors.newBuilder() - .setVectors( - NamedVectors.newBuilder() - .putAllVectors( - Map.of( - "text", - vector( - List.of(0.1f, 0.2f, 0.3f, 0.4f, 0.5f), - List.of(1, 2, 3, 4, 5)))) - .build()) - .build()) - .build())) - .get(); +### Step 1: Prepare data for neural search -``` +To be able to search for our descriptions in vector space, we must get vectors first. +We need to encode the descriptions into a vector representation. +As the descriptions are textual data, we can use a pre-trained language model. +As mentioned above, for the task of text search there is a whole set of pre-trained models specifically tuned for semantic similarity. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +One of the easiest libraries to work with pre-trained language models, in my opinion, is the [sentence-transformers](https://github.com/UKPLab/sentence-transformers) by UKPLab. +It provides a way to conveniently download and use many pre-trained models, mostly based on transformer architecture. +Transformers is not the only architecture suitable for neural search, but for our task, it is quite enough. -var client = new QdrantClient("localhost", 6334); +We will use a model called `all-MiniLM-L6-v2`. +This model is an all-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. +It is optimized for low memory consumption and fast inference. -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new Dictionary { ["text"] = ([1.0f, 2.0f], [6, 7]) } - }, - new() - { - Id = 2, - Vectors = new Dictionary - { - ["text"] = ([0.1f, 0.2f, 0.3f, 0.4f, 0.5f], [1, 2, 3, 4, 5]) - } - } - } -); +The complete code for data preparation with detailed comments can be found and run in [Colab Notebook](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing). -``` +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) -```go -import ( - "context" +### Step 2: Incorporate a Vector search engine - "github.com/qdrant/go-client/qdrant" -) +Now as we have a vector representation for all our records, we need to store them somewhere. +In addition to storing, we may also need to add or delete a vector, save additional information with the vector. +And most importantly, we need a way to search for the nearest vectors. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +The vector search engine can take care of all these tasks. +It provides a convenient API for searching and managing vectors. +In our tutorial, we will use [Qdrant vector search engine](https://github.com/qdrant/qdrant) vector search engine. +It not only supports all necessary operations with vectors but also allows you to store additional payload along with vectors and use it to perform filtering of the search result. +Qdrant has a client for Python and also defines the API schema if you need to use it from other languages. 
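Before spinning up the search engine, it is worth making sure the vectors from Step 1 actually exist on disk. If you skipped the Colab notebook, the encoding step boils down to roughly the sketch below; the notebook remains the authoritative version, and the `description` field name is an assumption about the raw data.

```python
import json

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# One JSON object per line, as in the upload script below.
with open("./startups.json") as fd:
    startups = [json.loads(line) for line in fd]

# Encode every description into a 384-dimensional vector and save the matrix.
vectors = model.encode(
    [startup["description"] for startup in startups],
    show_progress_bar=True,
)
np.save("./startup_vectors.npy", vectors, allow_pickle=False)
```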
-client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "text": qdrant.NewVectorSparse( - []uint32{6, 7}, - []float32{1.0, 2.0}), - }), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "text": qdrant.NewVectorSparse( - []uint32{1, 2, 3, 4, 5}, - []float32{0.1, 0.2, 0.3, 0.4, 0.5}), - }), - }, - }, -}) +The easiest way to use Qdrant is to run a pre-built image. +So make sure you have Docker installed on your system. +To start Qdrant, use the instructions on its [homepage](https://github.com/qdrant/qdrant). + +Download image from [DockerHub](https://hub.docker.com/r/qdrant/qdrant): + +```bash +docker pull qdrant/qdrant ``` -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#modify-points) Modify points +And run the service inside the docker: -To change a point, you can modify its vectors or its payload. There are several -ways to do this. +```bash +docker run -p 6333:6333 \ + -v $(pwd)/qdrant_storage:/qdrant/storage \ + qdrant/qdrant +``` +You should see output like this -### [Anchor](https://qdrant.tech/documentation/concepts/points/\#update-vectors) Update vectors +```text +... +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 +``` -_Available as of v1.2.0_ +This means that the service is successfully launched and listening port 6333. +To make sure you can test [http://localhost:6333/](http://localhost:6333/) in your browser and get qdrant version info. -This method updates the specified vectors on the given points. Unspecified -vectors are kept unchanged. All given points must exist. +All uploaded to Qdrant data is saved into the `./qdrant_storage` directory and will be persisted even if you recreate the container. -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/update-vectors)): +### Step 3: Upload data to Qdrant -httppythontypescriptrustjavacsharpgo +Now once we have the vectors prepared and the search engine running, we can start uploading the data. +To interact with Qdrant from python, I recommend using an out-of-the-box client library. -```http -PUT /collections/{collection_name}/points/vectors -{ - "points": [\ - {\ - "id": 1,\ - "vector": {\ - "image": [0.1, 0.2, 0.3, 0.4]\ - }\ - },\ - {\ - "id": 2,\ - "vector": {\ - "text": [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]\ - }\ - }\ - ] -} +To install it, use the following command +```bash +pip install qdrant-client ``` -```python -client.update_vectors( - collection_name="{collection_name}", - points=[\ - models.PointVectors(\ - id=1,\ - vector={\ - "image": [0.1, 0.2, 0.3, 0.4],\ - },\ - ),\ - models.PointVectors(\ - id=2,\ - vector={\ - "text": [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2],\ - },\ - ),\ - ], -) +At this point, we should have startup records in file `startups.json`, encoded vectors in file `startup_vectors.npy`, and running Qdrant on a local machine. +Let's write a script to upload all startup data and vectors into the search engine. -``` +First, let's create a client object for Qdrant. 
-```typescript -client.updateVectors("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: {\ - image: [0.1, 0.2, 0.3, 0.4],\ - },\ - },\ - {\ - id: 2,\ - vector: {\ - text: [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2],\ - },\ - },\ - ], -}); +```python +# Import client library +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance +qdrant_client = QdrantClient(host='localhost', port=6333) ``` -```rust -use std::collections::HashMap; +Qdrant allows you to combine vectors of the same purpose into collections. +Many independent vector collections can exist on one service at the same time. -use qdrant_client::qdrant::{ - PointVectors, UpdatePointVectorsBuilder, -}; +Let's create a new collection for our startup vectors. -client - .update_vectors( - UpdatePointVectorsBuilder::new( - "{collection_name}", - vec![\ - PointVectors {\ - id: Some(1.into()),\ - vectors: Some(\ - HashMap::from([("image".to_string(), vec![0.1, 0.2, 0.3, 0.4])]).into(),\ - ),\ - },\ - PointVectors {\ - id: Some(2.into()),\ - vectors: Some(\ - HashMap::from([(\ - "text".to_string(),\ - vec![0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2],\ - )])\ - .into(),\ - ),\ - },\ - ], - ) - .wait(true), +```python +if not qdrant_client.collection_exists('startups'): + qdrant_client.create_collection( + collection_name='startups', + vectors_config=VectorParams(size=384, distance=Distance.COSINE), ) - .await?; - ``` -```java -import java.util.List; -import java.util.Map; +The `vector_size` parameter is very important. +It tells the service the size of the vectors in that collection. +All vectors in a collection must have the same size, otherwise, it is impossible to calculate the distance between them. +`384` is the output dimensionality of the encoder we are using. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorFactory.vector; -import static io.qdrant.client.VectorsFactory.namedVectors; +The `distance` parameter allows specifying the function used to measure the distance between two points. -client - .updateVectorsAsync( - "{collection_name}", - List.of( - PointVectors.newBuilder() - .setId(id(1)) - .setVectors(namedVectors(Map.of("image", vector(List.of(0.1f, 0.2f, 0.3f, 0.4f))))) - .build(), - PointVectors.newBuilder() - .setId(id(2)) - .setVectors( - namedVectors( - Map.of( - "text", vector(List.of(0.9f, 0.8f, 0.7f, 0.6f, 0.5f, 0.4f, 0.3f, 0.2f))))) - .build())) - .get(); +The Qdrant client library defines a special function that allows you to load datasets into the service. +However, since there may be too much data to fit a single computer memory, the function takes an iterator over the data as input. -``` +Let's create an iterator over the startup data and vectors. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +```python +import numpy as np +import json -var client = new QdrantClient("localhost", 6334); +fd = open('./startups.json') -await client.UpdateVectorsAsync( - collectionName: "{collection_name}", - points: new List - { - new() { Id = 1, Vectors = ("image", new float[] { 0.1f, 0.2f, 0.3f, 0.4f }) }, - new() - { - Id = 2, - Vectors = ("text", new float[] { 0.9f, 0.8f, 0.7f, 0.6f, 0.5f, 0.4f, 0.3f, 0.2f }) - } - } -); +# payload is now an iterator over startup data +payload = map(json.loads, fd) +# Here we load all vectors into memory, numpy array works as iterable for itself. 
+# Other option would be to use Mmap, if we don't want to load all data into RAM +vectors = np.load('./startup_vectors.npy') ``` -```go -import ( - "context" +And the final step - data uploading - "github.com/qdrant/go-client/qdrant" +```python +qdrant_client.upload_collection( + collection_name='startups', + vectors=vectors, + payload=payload, + ids=None, # Vector ids will be assigned automatically + batch_size=256 # How many vectors will be uploaded in a single request? ) +``` -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Now we have vectors uploaded to the vector search engine. +In the next step, we will learn how to actually search for the closest vectors. -client.UpdateVectors(context.Background(), &qdrant.UpdatePointVectors{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointVectors{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "image": qdrant.NewVector(0.1, 0.2, 0.3, 0.4), - }), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "text": qdrant.NewVector(0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2), - }), - }, - }, -}) +The full code for this step can be found [here](https://github.com/qdrant/qdrant_demo/blob/master/qdrant_demo/init_collection_startups.py). -``` +### Step 4: Make a search API -To update points and replace all of its vectors, see [uploading\\ -points](https://qdrant.tech/documentation/concepts/points/#upload-points). +Now that all the preparations are complete, let's start building a neural search class. -### [Anchor](https://qdrant.tech/documentation/concepts/points/\#delete-vectors) Delete vectors +First, install all the requirements: +```bash +pip install sentence-transformers numpy +``` -_Available as of v1.2.0_ +In order to process incoming requests neural search will need 2 things. +A model to convert the query into a vector and Qdrant client, to perform a search queries. -This method deletes just the specified vectors from the given points. Other -vectors are kept unchanged. Points are never deleted. 
+```python +# File: neural_searcher.py -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/delete-vectors)): +from qdrant_client import QdrantClient +from sentence_transformers import SentenceTransformer -httppythontypescriptrustjavacsharpgo -```http -POST /collections/{collection_name}/points/vectors/delete -{ - "points": [0, 3, 100], - "vectors": ["text", "image"] -} +class NeuralSearcher: + def __init__(self, collection_name): + self.collection_name = collection_name + # Initialize encoder model + self.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') + # initialize Qdrant client + self.qdrant_client = QdrantClient(host='localhost', port=6333) ``` +The search function looks as simple as possible: + ```python -client.delete_vectors( - collection_name="{collection_name}", - points=[0, 3, 100], - vectors=["text", "image"], -) + def search(self, text: str): + # Convert text query into vector + vector = self.model.encode(text).tolist() + # Use `vector` for search for closest vectors in the collection + search_result = self.qdrant_client.search( + collection_name=self.collection_name, + query_vector=vector, + query_filter=None, # We don't want any filters for now + top=5 # 5 the most closest results is enough + ) + # `search_result` contains found vector ids with similarity scores along with the stored payload + # In this function we are interested in payload only + payloads = [hit.payload for hit in search_result] + return payloads ``` -```typescript -client.deleteVectors("{collection_name}", { - points: [0, 3, 10], - vector: ["text", "image"], -}); +With Qdrant it is also feasible to add some conditions to the search. +For example, if we wanted to search for startups in a certain city, the search query could look like this: -``` +```python +from qdrant_client.models import Filter -```rust -use qdrant_client::qdrant::{ - DeletePointVectorsBuilder, PointsIdsList, -}; + ... -client - .delete_vectors( - DeletePointVectorsBuilder::new("{collection_name}") - .points_selector(PointsIdsList { - ids: vec![0.into(), 3.into(), 10.into()], - }) - .vectors(vec!["text".into(), "image".into()]) - .wait(true), + city_of_interest = "Berlin" + + # Define a filter for cities + city_filter = Filter(**{ + "must": [{ + "key": "city", # We store city information in a field of the same name + "match": { # This condition checks if payload field have requested value + "keyword": city_of_interest + } + }] + }) + + search_result = self.qdrant_client.search( + collection_name=self.collection_name, + query_vector=vector, + query_filter=city_filter, + top=5 ) - .await?; + ... ``` -```java -import java.util.List; +We now have a class for making neural search queries. Let's wrap it up into a service. -import static io.qdrant.client.PointIdFactory.id; -client - .deleteVectorsAsync( - "{collection_name}", List.of("text", "image"), List.of(id(0), id(3), id(10))) - .get(); +### Step 5: Deploy as a service -``` +To build the service we will use the FastAPI framework. +It is super easy to use and requires minimal code writing. 
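Before wrapping the searcher in a web framework, you can sanity-check it straight from a Python shell. This is a quick sketch that assumes the class above is saved as `neural_searcher.py`, the `startups` collection has been populated, and Qdrant is still running locally; the query text is arbitrary.

```python
from neural_searcher import NeuralSearcher

searcher = NeuralSearcher(collection_name="startups")

# Each returned item is the payload stored next to a matching vector.
for payload in searcher.search("machine learning platform for developers"):
    print(payload)
```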
-```csharp -await client.DeleteVectorsAsync("{collection_name}", ["text", "image"], [0, 3, 10]); +To install it, use the command +```bash +pip install fastapi uvicorn ``` -```go -import ( - "context" - - "github.com/qdrant/go-client/qdrant" -) - -client.DeleteVectors(context.Background(), &qdrant.DeletePointVectors{ - CollectionName: "{collection_name}", - PointsSelector: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(10)), - Vectors: &qdrant.VectorsSelector{ - Names: []string{"text", "image"}, - }, -}) +Our service will have only one API endpoint and will look like this: -``` +```python +# File: service.py -To delete entire points, see [deleting points](https://qdrant.tech/documentation/concepts/points/#delete-points). +from fastapi import FastAPI -### [Anchor](https://qdrant.tech/documentation/concepts/points/\#update-payload) Update payload +# That is the file where NeuralSearcher is stored +from neural_searcher import NeuralSearcher -Learn how to modify the payload of a point in the [Payload](https://qdrant.tech/documentation/concepts/payload/#update-payload) section. +app = FastAPI() -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#delete-points) Delete points +# Create an instance of the neural searcher +neural_searcher = NeuralSearcher(collection_name='startups') -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/delete-points)): +@app.get("/api/search") +def search_startup(q: str): + return { + "result": neural_searcher.search(text=q) + } -httppythontypescriptrustjavacsharpgo -```http -POST /collections/{collection_name}/points/delete -{ - "points": [0, 3, 100] -} +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) ``` -```python -client.delete( - collection_name="{collection_name}", - points_selector=models.PointIdsList( - points=[0, 3, 100], - ), -) +Now, if you run the service with +```bash +python service.py ``` -```typescript -client.delete("{collection_name}", { - points: [0, 3, 100], -}); +and open your browser at [http://localhost:8000/docs](http://localhost:8000/docs) , you should be able to see a debug interface for your service. -``` +![FastAPI Swagger interface](https://gist.githubusercontent.com/generall/c229cc94be8c15095286b0c55a3f19d7/raw/d866e37a60036ebe65508bd736faff817a5d27e9/fastapi_neural_search.png) -```rust -use qdrant_client::qdrant::{DeletePointsBuilder, PointsIdsList}; +Feel free to play around with it, make queries and check out the results. +This concludes the tutorial. -client - .delete_points( - DeletePointsBuilder::new("{collection_name}") - .points(PointsIdsList { - ids: vec![0.into(), 3.into(), 100.into()], - }) - .wait(true), - ) - .await?; -``` +### Experience Neural Search With Qdrant’s Free Demo +Excited to see neural search in action? Take the next step and book a [free demo](https://qdrant.to/semantic-search-demo) with Qdrant! Experience firsthand how this cutting-edge technology can transform your search capabilities. -```java -import java.util.List; +Our demo will help you grow intuition for cases when the neural search is useful. The demo contains a switch that selects between neural and full-text searches. You can turn neural search on and off to compare the result with regular full-text search. +Try to use a startup description to find similar ones. 
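If you followed the steps above, you can also run the same kind of queries against your own local deployment. Here is a small sketch, assuming the FastAPI app from Step 5 is running on port 8000; the query string is arbitrary.

```python
import requests

response = requests.get(
    "http://localhost:8000/api/search",
    params={"q": "platform for sharing and discovering startups"},
)
print(response.json()["result"])
```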
-import static io.qdrant.client.PointIdFactory.id; +Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, and publish other examples of neural networks and neural search applications. -client.deleteAsync("{collection_name}", List.of(id(0), id(3), id(100))); +<|page-72-lllmstxt|> +If you need to find some similar objects in vector space, provided e.g. by embeddings or matching NN, you can choose among a variety of libraries: Annoy, FAISS or NMSLib. +All of them will give you a fast approximate neighbors search within almost any space. -``` +But what if you need to introduce some constraints in your search? +For example, you want search only for products in some category or select the most similar customer of a particular brand. +I did not find any simple solutions for this. +There are several discussions like [this](https://github.com/spotify/annoy/issues/263), but they only suggest to iterate over top search results and apply conditions consequently after the search. -```csharp -using Qdrant.Client; +Let's see if we could somehow modify any of ANN algorithms to be able to apply constrains during the search itself. -var client = new QdrantClient("localhost", 6334); +Annoy builds tree index over random projections. +Tree index implies that we will meet same problem that appears in relational databases: +if field indexes were built independently, then it is possible to use only one of them at a time. +Since nobody solved this problem before, it seems that there is no easy approach. -await client.DeleteAsync(collectionName: "{collection_name}", ids: [0, 3, 100]); +There is another algorithm which shows top results on the [benchmark](https://github.com/erikbern/ann-benchmarks). +It is called HNSW which stands for Hierarchical Navigable Small World. -``` +The [original paper](https://arxiv.org/abs/1603.09320) is well written and very easy to read, so I will only give the main idea here. +We need to build a navigation graph among all indexed points so that the greedy search on this graph will lead us to the nearest point. +This graph is constructed by sequentially adding points that are connected by a fixed number of edges to previously added points. +In the resulting graph, the number of edges at each point does not exceed a given threshold $m$ and always contains the nearest considered points. -```go -import ( - "context" +![NSW](/articles_data/filtrable-hnsw/NSW.png) - "github.com/qdrant/go-client/qdrant" -) +### How can we modify it? -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +What if we simply apply the filter criteria to the nodes of this graph and use in the greedy search only those that meet these criteria? +It turns out that even with this naive modification algorithm can cover some use cases. -client.Delete(context.Background(), &qdrant.DeletePoints{ - CollectionName: "{collection_name}", - Points: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(100), - ), -}) +One such case is if your criteria do not correlate with vector semantics. +For example, you use a vector search for clothing names and want to filter out some sizes. +In this case, the nodes will be uniformly filtered out from the entire cluster structure. +Therefore, the theoretical conclusions obtained in the [Percolation theory](https://en.wikipedia.org/wiki/Percolation_theory) become applicable: -``` -Alternative way to specify which points to remove is to use filter. 
+> Percolation is related to the robustness of the graph (called also network). Given a random graph of $n$ nodes and an average degree $\langle k\rangle$ . Next we remove randomly a fraction $1-p$ of nodes and leave only a fraction $p$. There exists a critical percolation threshold $ pc = \frac{1}{\langle k\rangle} $ below which the network becomes fragmented while above $pc$ a giant connected component exists. -httppythontypescriptrustjavacsharpgo -```http -POST /collections/{collection_name}/points/delete -{ - "filter": { - "must": [\ - {\ - "key": "color",\ - "match": {\ - "value": "red"\ - }\ - }\ - ] - } -} +This statement also confirmed by experiments: -``` +{{< figure src=/articles_data/filtrable-hnsw/exp_connectivity_glove_m0.png caption="Dependency of connectivity to the number of edges" >}} -```python -client.delete( - collection_name="{collection_name}", - points_selector=models.FilterSelector( - filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="color",\ - match=models.MatchValue(value="red"),\ - ),\ - ], - ) - ), -) +{{< figure src=/articles_data/filtrable-hnsw/exp_connectivity_glove_num_elements.png caption="Dependency of connectivity to the number of point (no dependency)." >}} -``` -```typescript -client.delete("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, -}); +There is a clear threshold when the search begins to fail. +This threshold is due to the decomposition of the graph into small connected components. +The graphs also show that this threshold can be shifted by increasing the $m$ parameter of the algorithm, which is responsible for the degree of nodes. -``` +Let's consider some other filtering conditions we might want to apply in the search: -```rust -use qdrant_client::qdrant::{Condition, DeletePointsBuilder, Filter}; +* Categorical filtering + * Select only points in a specific category + * Select points which belong to a specific subset of categories + * Select points with a specific set of labels +* Numerical range +* Selection within some geographical region -client - .delete_points( - DeletePointsBuilder::new("{collection_name}") - .points(Filter::must([Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .wait(true), - ) - .await?; +In the first case, we can guarantee that the HNSW graph will be connected simply by creating additional edges +inside each category separately, using the same graph construction algorithm, and then combining them into the original graph. +In this case, the total number of edges will increase by no more than 2 times, regardless of the number of categories. -``` +Second case is a little harder. A connection may be lost between two categories if they lie in different clusters. -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +![category clusters](/articles_data/filtrable-hnsw/hnsw_graph_category.png) -import io.qdrant.client.grpc.Points.Filter; +The idea here is to build same navigation graph but not between nodes, but between categories. +Distance between two categories might be defined as distance between category entry points (or, for precision, as the average distance between a random sample). Now we can estimate expected graph connectivity by number of excluded categories, not nodes. +It still does not guarantee that two random categories will be connected, but allows us to switch to multiple searches in each category if connectivity threshold passed. 
In some cases, multiple searches can be even faster if you take advantage of parallel processing. -client - .deleteAsync( - "{collection_name}", - Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) - .get(); +{{< figure src=/articles_data/filtrable-hnsw/exp_random_groups.png caption="Dependency of connectivity to the random categories included in search" >}} -``` +Third case might be resolved in a same way it is resolved in classical databases. +Depending on labeled subsets size ration we can go for one of the following scenarios: -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +* if at least one subset is small: perform search over the label containing smallest subset and then filter points consequently. +* if large subsets give large intersection: perform regular search with constraints expecting that intersection size fits connectivity threshold. +* if large subsets give small intersection: perform linear search over intersection expecting that it is small enough to fit a time frame. -var client = new QdrantClient("localhost", 6334); +Numerical range case can be reduces to the previous one if we split numerical range into a buckets containing equal amount of points. +Next we also connect neighboring buckets to achieve graph connectivity. We still need to filter some results which presence in border buckets but do not fulfill actual constraints, but their amount might be regulated by the size of buckets. -await client.DeleteAsync(collectionName: "{collection_name}", filter: MatchKeyword("color", "red")); +Geographical case is a lot like a numerical one. +Usual geographical search involves [geohash](https://en.wikipedia.org/wiki/Geohash), which matches any geo-point to a fixes length identifier. -``` +![Geohash example](/articles_data/filtrable-hnsw/geohash.png) -```go -import ( - "context" +We can use this identifiers as categories and additionally make connections between neighboring geohashes. +It will ensure that any selected geographical region will also contain connected HNSW graph. - "github.com/qdrant/go-client/qdrant" -) +## Conclusion -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +It is possible to enchant HNSW algorithm so that it will support filtering points in a first search phase. +Filtering can be carried out on the basis of belonging to categories, +which in turn is generalized to such popular cases as numerical ranges and geo. -client.Delete(context.Background(), &qdrant.DeletePoints{ - CollectionName: "{collection_name}", - Points: qdrant.NewPointsSelectorFilter( - &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, - ), -}) +Experiments were carried by modification [python implementation](https://github.com/generall/hnsw-python) of the algorithm, +but real production systems require much faster version, like [NMSLib](https://github.com/nmslib/nmslib). -``` +<|page-73-lllmstxt|> +We are excited to [announce the release of Qdrant v0.11](https://github.com/qdrant/qdrant/releases/tag/v0.11.0), +which introduces a number of new features and improvements. -This example removes all points with `{ "color": "red" }` from the collection. +## Replication -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#retrieve-points) Retrieve points +One of the key features in this release is replication support, which allows Qdrant to provide a high availability +setup with distributed deployment out of the box. 
This, combined with sharding, enables you to horizontally scale +both the size of your collections and the throughput of your cluster. This means that you can use Qdrant to handle +large amounts of data without sacrificing performance or reliability. -There is a method for retrieving points by their ids. +## Administration API -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/get-points)): +Another new feature is the administration API, which allows you to disable write operations to the service. This is +useful in situations where search availability is more critical than updates, and can help prevent issues like memory +usage watermarks from affecting your searches. -httppythontypescriptrustjavacsharpgo +## Exact search -```http -POST /collections/{collection_name}/points -{ - "ids": [0, 3, 100] -} +We have also added the ability to report indexed payload points in the info API, which allows you to verify that +payload values were properly formatted for indexing. In addition, we have introduced a new `exact` search parameter +that allows you to force exact searches of vectors, even if an ANN index is built. This can be useful for validating +the accuracy of your HNSW configuration. -``` +## Backward compatibility -```python -client.retrieve( - collection_name="{collection_name}", - ids=[0, 3, 100], -) +This release is backward compatible with v0.10.5 storage in single node deployment, but unfortunately, distributed +deployment is not compatible with previous versions due to the large number of changes required for the replica set +implementation. However, clients are tested for backward compatibility with the v0.10.x service. -``` +<|page-74-lllmstxt|> +[Qdrant 0.10 is a new version](https://github.com/qdrant/qdrant/releases/tag/v0.10.0) that brings a lot of performance +improvements, but also some new features which were heavily requested by our users. Here is an overview of what has changed. -```typescript -client.retrieve("{collection_name}", { - ids: [0, 3, 100], -}); +## Storing multiple vectors per object -``` +Previously, if you wanted to use semantic search with multiple vectors per object, you had to create separate collections +for each vector type. This was even if the vectors shared some other attributes in the payload. With Qdrant 0.10, you can +now store all of these vectors together in the same collection, which allows you to share a single copy of the payload. +This makes it easier to use semantic search with multiple vector types, and reduces the amount of work you need to do to +set up your collections. -```rust -use qdrant_client::qdrant::GetPointsBuilder; +## Batch vector search -client - .get_points(GetPointsBuilder::new( - "{collection_name}", - vec![0.into(), 30.into(), 100.into()], - )) - .await?; +Previously, you had to send multiple requests to the Qdrant API to perform multiple non-related tasks. However, this +can cause significant network overhead and slow down the process, especially if you have a poor connection speed. +Fortunately, the [new batch search feature](/documentation/concepts/search/#batch-search-api) allows +you to avoid this issue. With just one API call, Qdrant will handle multiple search requests in the most efficient way +possible. This means that you can perform multiple tasks simultaneously without having to worry about network overhead +or slow performance. -``` +## Built-in ARM support -```java -import java.util.List; +To make our application accessible to ARM users, we have compiled it specifically for that platform. 
If it is not +compiled for ARM, the device will have to emulate it, which can slow down performance. To ensure the best possible +experience for ARM users, we have created Docker images specifically for that platform. Keep in mind that using +a limited set of processor instructions may affect the performance of your vector search. Therefore, we have tested +both ARM and non-ARM architectures using similar setups to understand the potential impact on performance. -import static io.qdrant.client.PointIdFactory.id; +## Full-text filtering -client - .retrieveAsync("{collection_name}", List.of(id(0), id(30), id(100)), false, false, null) - .get(); +Qdrant is a vector database that allows you to quickly search for the nearest neighbors. However, you may need to apply +additional filters on top of the semantic search. Up until version 0.10, Qdrant only supported keyword filters. With the +release of Qdrant 0.10, [you can now use full-text filters](/documentation/concepts/filtering/#full-text-match) +as well. This new filter type can be used on its own or in combination with other filter types to provide even more +flexibility in your searches. -``` +<|page-75-lllmstxt|> +The advent of quantum computing has revolutionized many areas of science and technology, and one of the most intriguing developments has been its potential application to artificial neural networks (ANNs). One area where quantum computing can significantly improve performance is in vector search, a critical component of many machine learning tasks. In this article, we will discuss the concept of quantum quantization for ANN vector search, focusing on the conversion of float32 to qbit vectors and the ability to perform vector search on arbitrary-sized databases in constant time. -```csharp -using Qdrant.Client; -var client = new QdrantClient("localhost", 6334); +## Quantum Quantization and Entanglement -await client.RetrieveAsync( - collectionName: "{collection_name}", - ids: [0, 30, 100], - withPayload: false, - withVectors: false -); +Quantum quantization is a novel approach that leverages the power of quantum computing to speed up the search process in ANNs. By converting traditional float32 vectors into qbit vectors, we can create quantum entanglement between the qbits. Quantum entanglement is a unique phenomenon in which the states of two or more particles become interdependent, regardless of the distance between them. This property of quantum systems can be harnessed to create highly efficient vector search algorithms. -``` -```go -import ( - "context" +The conversion of float32 vectors to qbit vectors can be represented by the following formula: - "github.com/qdrant/go-client/qdrant" -) +```text +qbit_vector = Q( float32_vector ) +``` -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +where Q is the quantum quantization function that transforms the float32_vector into a quantum entangled qbit_vector. -client.Get(context.Background(), &qdrant.GetPoints{ - CollectionName: "{collection_name}", - Ids: []*qdrant.PointId{ - qdrant.NewIDNum(0), qdrant.NewIDNum(3), qdrant.NewIDNum(100), - }, -}) -``` +## Vector Search in Constant Time -This method has additional parameters `with_vectors` and `with_payload`. -Using these parameters, you can select parts of the point you want as a result. -Excluding helps you not to waste traffic transmitting useless data. 
+The primary advantage of using quantum quantization for ANN vector search is the ability to search through an arbitrary-sized database in constant time. -The single point can also be retrieved via the API: +The key to performing vector search in constant time with quantum quantization is to use a quantum algorithm called Grover's algorithm. +Grover's algorithm is a quantum search algorithm that finds the location of a marked item in an unsorted database in O(√N) time, where N is the size of the database. +This is a significant improvement over classical algorithms, which require O(N) time to solve the same problem. -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/get-point)): +However, the is one another trick, which allows to improve Grover's algorithm performanse dramatically. +This trick is called transposition and it allows to reduce the number of Grover's iterations from O(√N) to O(√D), where D - is a dimension of the vector space. -```http -GET /collections/{collection_name}/points/{point_id} +And since the dimension of the vector space is much smaller than the number of vectors, and usually is a constant, this trick allows to reduce the number of Grover's iterations from O(√N) to O(√D) = O(1). -``` -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#scroll-points) Scroll points +Check out our [Quantum Quantization PR](https://github.com/qdrant/qdrant/pull/1639) on GitHub. -Sometimes it might be necessary to get all stored points without knowing ids, or iterate over points that correspond to a filter. +<|page-76-lllmstxt|> +No matter if you are just beginning your journey in the world of vector search, or you are a seasoned practitioner, you +have probably wondered how to choose the right embedding model to achieve the best search quality. There are some +public benchmarks, such as [MTEB](https://huggingface.co/spaces/mteb/leaderboard), that can help you narrow down the +options, but datasets used in those benchmarks will rarely be representative of your domain-specific data. Moreover, +search quality is not the only requirement you could have. For example, some of the best models might be amazingly +accurate for retrieval, but you can't afford to run them, e.g., due to high resource usage or your budget constraints. -REST API ( [Schema](https://api.qdrant.tech/master/api-reference/points/scroll-points)): + -httppythontypescriptrustjavacsharpgo +Selecting the best embedding model is a multi-objective optimization problem and there is no one-size-fits-all solution, +and there probably never will be. In this article, we will try to provide some guidance on how to approach this problem +in a practical way, and how to move from model selection to running it in production. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must": [\ - {\ - "key": "color",\ - "match": {\ - "value": "red"\ - }\ - }\ - ] - }, - "limit": 1, - "with_payload": true, - "with_vector": false -} +## Evaluation: the holy grail of vector search -``` +You can't improve what you don't measure. It's clichĂ©, but it's true also for retrieval. Search quality might and should +be measured not only in a running system, but also before you make the most important decision - which embedding model +to use. 
-```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.FieldCondition(key="color", match=models.MatchValue(value="red")),\ - ] - ), - limit=1, - with_payload=True, - with_vectors=False, -) +### Know the language your model speaks -``` +Embedding models are trained with specific languages in mind. When evaluating one, consider whether it supports all the +languages you have or predict to have in your data. If your data is not homogeneous, you might require a multilingual +model that can properly embed text across different languages. If you use Open Source models, then your model is likely +documented on Hugging Face Hub. For example, the popular in demos `all-MiniLM-L6-v2` was trained on English data only, +so it's not a good choice if you have data in other languages. -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, - limit: 1, - with_payload: true, - with_vector: false, -}); +[![all-MiniLM-L6-v2 on Hugging Face Hub](/articles_data/how-to-choose-an-embedding-model/hf-model-card.png)](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) -``` +However, it's not only about the language, but also about how the model treats the input data. Surprisingly, this is +often overlooked. Text embedding models use a specific tokenizer to chunk the input data into pieces, and then [starts +all the Transformer magic with assigning each token a specific input vector +representation](/articles/late-interaction-models/#understanding-embedding-models). -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +![An example of tokenization with WordPiece tokenizer](/articles_data/how-to-choose-an-embedding-model/tokenization-example.png) -client - .scroll( - ScrollPointsBuilder::new("{collection_name}") - .filter(Filter::must([Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .limit(1) - .with_payload(true) - .with_vectors(false), - ) - .await?; +One of the effects of such inner workings is that the model can only understand what its tokenizer was trained on ([yes, +tokenizers are also trainable components](https://huggingface.co/learn/llm-course/chapter2/4#tokenizers)). As a result, +any characters it hasn't seen during the training will be replaced with a special `UNK` token. If you analyze social +media data, then you might be surprised that two contradicting sentences are actually perfect matches in your search, +as presented in the following example: -``` +![Tokenization: The weather today is so đŸŒ§ïž vs The weather today is so 🌞](/articles_data/how-to-choose-an-embedding-model/tokenization-contradictions.png) -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.WithPayloadSelectorFactory.enable; +The same may go for accented letters, different alphabets, etc., that dominate in your target language. However, in that +case, you shouldn't be using such a model in the first place, as it does not support your language either way. +Tokenization has a bigger impact on the quality of the embeddings than many people think. If you want to understand what +the effects of tokenization are, we recommend you take the course on [Retrieval Optimization: From Tokenization to Vector +Quantization](https://www.deeplearning.ai/short-courses/retrieval-optimization-from-tokenization-to-vector-quantization/) +we recorded together with DeepLearning.AI. 
You may find the course especially interesting if you still wonder why your +semantic search engine can't handle numerical data, such as prices or dates, and what you can do about it. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +How do you know if the tokenizer supports the target language? That's pretty easy for the Open Source models, as you +can just run the tokenizer without the model and see how the yielded tokens look like. For the commercial models that +might be slightly harder, but companies like [OpenAI](https://github.com/openai/tiktoken) and +[Cohere](https://huggingface.co/Cohere/multilingual-22-12) are transparent about it and open source their tokenizers. +In the worst case, you can just modify some of the suspected tokens and see how the model reacts in terms of the +similarity between the original and modified text. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) - .setLimit(1) - .setWithPayload(enable(true)) - .build()) - .get(); +![Creating vectors for accented and non-accented letters](/articles_data/how-to-choose-an-embedding-model/accented-letters.png) -``` +If the created representations are really far from each other in the vector space, it may indicate that some +non-supported characters are replaced with `UNK` tokens and thus the model can't properly embed the input data. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +### Checklist of things to consider -var client = new QdrantClient("localhost", 6334); +Nevertheless, the evaluation does not focus on the input tokens only. First and foremost, we should measure how well +a particular model can handle the task we want to use it for. Vector embeddings are multipurpose tools, and some models +might be more suitable for **semantic similarity**, while others for **retrieval** or **question answering**. Nobody, +except you, can tell what's the nature of the problem you are trying to solve. Type of the task is not the only thing +to consider when choosing the right embedding model: -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("color", "red"), - limit: 1, - payloadSelector: true -); +- **Sequence length** - embedding models have a limited input size they can process at a time. Check how long your + documents are and how many tokens they contain. If you use Open Source models, you can check the maximum sequence + length in the model card on Hugging Face Hub. For commercial models, it's better to ask the provider directly. +- **Model size** - larger models have more parameters and require more memory. Inference time also depends on model + architecture and your hardware. Some models run effectively only on GPUs, while others can run on CPUs as well. +- **Optimization support** - not all models are compatible with every optimization technique. For example, Binary + Quantization and Matryoshka embeddings require specific model characteristics. -``` +The list is not exhaustive, as there might be plenty of other things to consider, but you get the idea. -```go -import ( - "context" +That's why you need to precisely define the task you really want to solve, get your hands dirty with the data the system +is supposed to process and build a ground truth dataset for it, so you can make an informed decision. 
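Getting your hands dirty can start with the tokenizer check described above. Below is a small sketch of what it might look like for an open-source model, assuming the Hugging Face `transformers` package and the `all-MiniLM-L6-v2` checkpoint mentioned earlier; any character the tokenizer has never seen ends up as the special `[UNK]` token.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

for text in ["The weather today is so đŸŒ§ïž", "The weather today is so 🌞"]:
    # Tokenize without running the model to see what it will actually "read".
    print(tokenizer.tokenize(text))

# If both sentences end with [UNK], the model cannot tell them apart.
```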
- "github.com/qdrant/go-client/qdrant" -) +### Building the ground truth dataset -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +The way your dataset will look like depends on the task you want to evaluate. If we speak about semantic similarity, +then you will need pairs of texts with a score indicating how similar they are. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, - Limit: qdrant.PtrOf(uint32(1)), - WithPayload: qdrant.NewWithPayload(true), -}) +For semantic similarity tasks, your dataset might look like this: +```json +[ + { + "text1": "I love this movie, it's fantastic", + "text2": "This film is amazing, I really enjoyed it", + "similarity_score": 0.92 + }, + { + "text1": "The weather is nice today", + "text2": "I need to buy groceries", + "similarity_score": 0.12 + } +] ``` -Returns all point with `color` = `red`. +Most typically, Qdrant users build retrieval systems that they use alone, or combine them with Large Language Models +to build Retrieval Augmented Generation. When we do retrieval, we need a slightly different structure of the golden +dataset than for semantic similarity. The problem of retrieval is to find the `K` most relevant documents for a given +query. Therefore, we need a set of queries and a set of documents that we would expect to receive for each of them. +There are also three different ways of how to define the relevancy at different granularity levels: + +1. **Binary relevancy** - a document is either relevant or not. +2. **Ordinal relevancy** - a document can be more or less relevant (ranking). +3. **Relevancy with a score** - a document can have a score indicating how relevant it is. ```json -{ - "result": { - "next_page_offset": 1, - "points": [\ - {\ - "id": 0,\ - "payload": {\ - "color": "red"\ - }\ - }\ +[ + { + "query": "How do vector databases work?", + "relevant_documents": [ + { + "id": "doc_123", + "text": "Vector databases store and index vector embeddings...", + "relevance": 3 // Highly relevant (scale 0-3) + }, + { + "id": "doc_456", + "text": "The architecture of modern vector search engines...", + "relevance": 2 // Moderately relevant + } ] }, - "status": "ok", - "time": 0.0001 -} + { + "query": "Python code example for Qdrant", + "relevant_documents": [ + { + "id": "doc_789", + "text": "```python\nfrom qdrant_client import QdrantClient\n...", + "relevance": 3 // Highly relevant + } + ] + } +] +``` + +Once you have the dataset, you can start evaluating the models using one of the evaluation metrics, such as +`precision@k`, `MRR`, or `NDCG`. There are existing libraries, such as [ranx](https://amenra.github.io/ranx/) that can +help you with that. [Running the evaluation process](/rag/rag-evaluation-guide/) on various models is a good way to get +a sense of how they perform on your data. You can test even proprietary models that way. However, it's not the only +thing you should consider when choosing the best model. + +Please do not be afraid of building your evaluation dataset. It’s not as complicated as it might seem, and it's a +critical step! You don’t need millions of samples to get a good idea of how the model performs. A few hundred +well-curated examples might be a good starting point. Even dozens are better than nothing! 
+ +## Compute resource constraints + +Even if you found the best performing embedding model for your domain, that doesn't mean you can use it. Software projects +do not live in isolation, and you have to consider the bigger picture. For example, you might have budget constraints +that limit your choices. It's also about being pragmatic. If you have a model that is 1% more precise, but it's 10 times +slower and consumes 10 times more resources, is it really worth it? + +Eventually, enjoying the journey is more important than reaching the destination in some cases, but that doesn't hold +true for search. The simpler and faster the means that took you there, the better. + +## Throughput, latency and cost + +When selecting an embedding model for production, you need to consider three critical operational factors: + +1. **Throughput**: How many embeddings can you generate per second? This directly impacts your system's ability to + handle load. Larger models typically have lower throughput, which might become a bottleneck during data ingestion or + high-traffic periods. +2. **Latency**: How quickly can you get a single embedding? For real-time applications like search-as-you-type or + interactive chatbots, low latency is crucial. Quantized versions of larger models can offer significant latency + improvements. +3. **Cost**: This includes both infrastructure costs (CPU/GPU resources, memory) and, for API-based models, per-token or + per-request charges. For example, running your own model might have higher upfront costs but lower per-request costs + than some SaaS models. + +The right balance depends on your specific use case. A news recommendation system might prioritize throughput for +processing large volumes of articles in real-time, while a website search might prioritize latency for real-time +results. Similarly, a chatbot using a Large Language Model to generate a response might prioritize cost-effectiveness, +as LLMs are often slower and retrieval isn't the most time-consuming part of the process. -``` +## Balancing all aspects -The Scroll API will return all points that match the filter in a page-by-page manner. +After all these considerations, you should have a table that summarizes each of the models you evaluated under all the +different conditions. Now things are getting hard and answers are not obvious anymore. -All resulting points are sorted by ID. To query the next page it is necessary to specify the largest seen ID in the `offset` field. -For convenience, this ID is also returned in the field `next_page_offset`. -If the value of the `next_page_offset` field is `null` \- the last page is reached. 
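One way to fill the throughput and latency columns of such a table is to measure them directly on your own data and hardware. The sketch below uses the sentence-transformers model mentioned earlier purely as an example; the document list, batch size, and hardware are assumptions, and a real benchmark would warm the model up and repeat the measurement several times.

```python
import time

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
documents = ["A short example sentence."] * 1000  # stand-in for your own texts

# Latency: how long a single query takes to embed.
start = time.perf_counter()
model.encode(documents[0])
print(f"single-query latency: {(time.perf_counter() - start) * 1000:.1f} ms")

# Throughput: how many documents per second a batched pass achieves.
start = time.perf_counter()
model.encode(documents, batch_size=64)
elapsed = time.perf_counter() - start
print(f"throughput: {len(documents) / elapsed:.0f} embeddings/s")
```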
+Here's an example of how such a comparison table might look: -### [Anchor](https://qdrant.tech/documentation/concepts/points/\#order-points-by-payload-key) Order points by payload key +| Model | Precision@10 | MRR | Inference Time | Memory Usage | Cost | Multilingual | Max Sequence Length | +|----------------------------------|--------------|------|----------------|--------------|----------------|----------------------------|---------------------| +| expensive-proprietary-saas-only | 0.92 | 0.87 | API-dependent | N/A | $0.25/M tokens | Probably, yet undocumented | 8192 | +| cheaper-proprietary-multilingual | 0.89 | 0.84 | API-dependent | N/A | $0.01/M tokens | Yes (94 languages) | 4096 | +| open-source-gpu-required | 0.88 | 0.83 | 120ms | 15GB | Self-hosted | English | 1024 | +| open-source-on-cpu | 0.85 | 0.79 | 30ms | 120MB | Self-hosted | English | 512 | -_Available as of v1.8.0_ +The decision process should be guided by your specific requirements. Organizations struggling with budget constraints +might lean towards self-hosted options, while those who prefer to avoid dealing with infrastructure management +might prefer API-based solutions. Who knows? Maybe your project does not require the highest precision possible, and a +smaller model will do the job just fine. -When using the [`scroll`](https://qdrant.tech/documentation/concepts/points/#scroll-points) API, you can sort the results by payload key. For example, you can retrieve points in chronological order if your payloads have a `"timestamp"` field, as is shown from the example below: +![Fast, precise, cheap - pick two](/articles_data/how-to-choose-an-embedding-model/pyramid.png) -httppythontypescriptrustjavacsharpgo +Remember that this doesn't have to be a one-time decision. As your application evolves, you might need to revisit your +choice of the embedding model. Qdrant's architecture makes it relatively easy to migrate to a different model if needed. +Named vectors help to create a system with multiple models and switch between them based on the query, or build a +[hybrid search](/articles/hybrid-search/) that takes advantage of different models or more complex search pipelines. -```http -POST /collections/{collection_name}/points/scroll -{ - "limit": 15, - "order_by": "timestamp", // <-- this! -} +An important decision to make is also where to host the embedding model. Maybe you prefer not to deal with the +infrastructure management and send the data you process in its original form? Qdrant now has something for you! -``` +## Locally sourced embeddings -```python -client.scroll( - collection_name="{collection_name}", - limit=15, - order_by="timestamp", # <-- this! -) +Wouldn't it be great to run your selected embedding model as close to your search engine as possible? Network latency +might be one of the biggest enemies, and transferring millions of vectors over the network may take longer if done from +a distant location. Moreover, some of the cloud providers will charge you for the data transfer, so it's not only about +the latency, but also about the cost. Finally, running an embedding model on-premises requires some expertise and +resources, and if you want to focus on your core business, you might prefer to avoid that. -``` +![Architecture diagram with Qdrant Cloud Inference](/articles_data/how-to-choose-an-embedding-model/cloud-inference-diagram.jpg) -```typescript -client.scroll("{collection_name}", { - limit: 15, - order_by: "timestamp", // <-- this! 
-}); +**Qdrant's Cloud Inference** solves these problems by allowing you to run the embedding model next to the cluster where +your vector database is running. It's a perfect solution for those who want not to worry about the model inference and +just use search that works on the data they have. Check out the [Cloud Inference +documentation](/documentation/cloud/inference/) to learn more. -``` +<|page-77-lllmstxt|> +## What Does it Take to Run Search in Production? -```rust -use qdrant_client::qdrant::{OrderByBuilder, ScrollPointsBuilder}; +A mid-sized e-commerce company launched a vector search pilot to improve product discovery. During testing, everything ran smoothly. But in production, their queries began failing intermittently: memory errors, disk I/O spikes, and search delays sprang up unexpectedly. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}") - .limit(15) - .order_by(OrderByBuilder::new("timestamp")), - ) - .await?; +It turned out the team hadn't adjusted the default configuration settings or reserved dedicated paths for write-ahead logs. Their vector index was too large to fit comfortably in RAM, and it frequently spilled to disk, causing slowdowns. -``` +> *Issues like these underscore how default configurations can fail spectacularly under production loads.* -```java -import io.qdrant.client.grpc.Points.OrderBy; -import io.qdrant.client.grpc.Points.ScrollPoints; +Running vector search in production is about ensuring **reliability, performance, and resilience**—no matter your hosting environment. Managing memory constraints, configuring data distribution, indexing choices, and backups are crucial whether you're on bare metal, virtual machines, or orchestrated containers. -client.scrollAsync(ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setLimit(15) - .setOrderBy(OrderBy.newBuilder().setKey("timestamp").build()) - .build()).get(); +### This Guide Addresses Most Common Issues in Production -``` +![vector-search-production](/articles_data/vector-search-production/vector-search-production-0.jpg) -```csharp -await client.ScrollAsync("{collection_name}", limit: 15, orderBy: "timestamp"); +Whether you're planning your first deployment or looking to improve an existing system, this walkthrough will help you build resilient and high-performing vector search infrastructure. -``` +This article will help you successfully deploy and maintain vector search systems in production environments. -```go -import ( - "context" +> Drawing from real-world experiences of our users, you'll discover practical techniques to avoid common pitfalls that have derailed many production deployments. +## Table of Contents - "github.com/qdrant/go-client/qdrant" -) +| Section | +|---------| +| [**1. How Can You Get the Best Search Performance?**](#1-how-can-you-get-the-best-search-performance) | +| [**2. How do I Ingest and Index Large Amounts of Data?**](#2-how-do-i-ingest-and-index-large-amounts-of-data) | +| [**3. What's the Best Way to Scale the Database and Optimize Resources?**](#3-whats-the-best-way-to-scale-the-database-and-optimize-resources) | +| [**4. Ensuring Disaster Recovery With Database Backups and Snapshots**](#4-ensuring-disaster-recovery-with-database-backups-and-snapshots) | +| [**5. Tips for Proper Database Administration**](#5-tips-for-proper-database-administration) | -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## 1. How Can You Get the Best Search Performance? 
+![vector-search-production](/articles_data/vector-search-production/vector-search-production-1.jpg)

-client.Scroll(context.Background(), &qdrant.ScrollPoints{
-	CollectionName: "{collection_name}",
-	Limit:          qdrant.PtrOf(uint32(15)),
-	OrderBy: &qdrant.OrderBy{
-		Key: "timestamp",
-	},
-})
+||
+|:-:|
+|**"Search got super slow after we added more vectors."**|

-```
+❓ **Use Case:** A customer support startup saw intermittent latency spikes. Their index outgrew available memory, so disk fetches became frequent. Upgrading their RAM helped, and quantizing all the data became the real game-changer. All it took was managing their memory load: keeping the hot vectors in RAM and offloading the rest onto the disk.

-You need to use the `order_by` `key` parameter to specify the payload key. Then you can add other fields to control the ordering, such as `direction` and `start_from`:

+> If this is happening to you, then your indexing settings are most likely not optimized, so the system is hitting the disk more often and driving up latency.

-httppythontypescriptrustjavacsharpgo

+### Ensure your hot dataset fits in RAM for low-latency queries

-```http
-"order_by": {
-    "key": "timestamp",
-    "direction": "desc" // default is "asc"
-    "start_from": 123, // start from this value
-}

+If not, then you'll have to [**offload data 'on_disk'**](/documentation/concepts/storage/#configuring-memmap-storage). If this parameter is enabled, Qdrant keeps your most frequently accessed vectors cached in RAM, and the rest is memory-mapped onto the disk.

-```

+This ensures minimal disk access during queries, significantly reducing latency and boosting overall performance. By monitoring query patterns and usage metrics, you can identify which subsets of your data deserve dedicated in-memory storage, reserving disk access only for colder, less frequently queried vectors.

-```python
-order_by=models.OrderBy(
-    key="timestamp",
-    direction="desc",  # default is "asc"
-    start_from=123,  # start from this value
-)

+||
+|-|
+|**Read More:** [**Storage Documentation**](https://qdrant.tech/documentation/concepts/storage/)|

-```

+### Index Your Important Metadata to Avoid Costly Queries

-```typescript
-order_by: {
-    key: "timestamp",
-    direction: "desc", // default is "asc"
-    start_from: 123, // start from this value
-}

+✅ You should always [**create payload indexes**](https://qdrant.tech/documentation/concepts/indexing/#payload-index) for all fields used in filters or sorting.

-```

+Many users configure complex filters but may not be aware of the need to create corresponding payload indexes.

-```rust
-use qdrant_client::qdrant::{start_from::Value, Direction, OrderByBuilder};

+> As a result, every query scans thousands of vectors and their bare payloads before discarding the majority that failed the filter condition. This leads to soaring CPU usage and long response times, especially under higher traffic loads.

-OrderByBuilder::new("timestamp")
-    .direction(Direction::Desc.into())
-    .start_from(Value::Integer(123))
-    .build();

+Filtering after retrieving thousands of vectors can get expensive. If the fields you filter on are not indexed, Qdrant has to evaluate far more vectors than your query actually needs, which makes the entire system slower and more resource-intensive. Because of this, we have developed our own version of HNSW - [**The Filterable Vector Index**](https://qdrant.tech/articles/filtrable-hnsw/).
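For example, with the Python client a keyword index on a filtered field is a single call; this is only a sketch, and the collection and field names are illustrative:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Index the payload field used in filter conditions so queries
# don't fall back to scanning and discarding unindexed payloads
client.create_payload_index(
    collection_name="products",  # illustrative collection name
    field_name="color",          # a field your filters actually use
    field_schema=models.PayloadSchemaType.KEYWORD,
)
```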
-```
+Unlike some other engines, Qdrant lets you make the optimal choice of which fields to index for your use case rather than creating indexes for every field by default.

-```java
-import io.qdrant.client.grpc.Points.Direction;
-import io.qdrant.client.grpc.Points.OrderBy;
-import io.qdrant.client.grpc.Points.StartFrom;
+> **Note:** Don't forget to use the correct [**payload index type**](https://qdrant.tech/documentation/concepts/indexing/#payload-index). If there are numeric values, then you must use a numeric index. If you represent numbers as strings ("123"), a numeric index will not work.

-OrderBy.newBuilder()
-    .setKey("timestamp")
-    .setDirection(Direction.Desc)
-    .setStartFrom(StartFrom.newBuilder()
-        .setInteger(123)
-        .build())
-    .build();
+||
+|-|
+|**Read More:** [**Filtering Documentation**](https://qdrant.tech/documentation/concepts/filtering/)|

-```
+### Don't Forget to Tune HNSW Search Parameters

-```csharp
-using Qdrant.Client.Grpc;
+||
+|:-:|
+|**"Our results aren't relevant enough, or they take too long to compute."**|

-new OrderBy
-{
-    Key = "timestamp",
-    Direction = Direction.Desc,
-    StartFrom = 123
-};
+Sometimes users don't properly balance HNSW search parameters. Setting the HNSW `ef` parameter to a very low value like zero would result in extremely fast responses (around one millisecond), but the results would be of poor quality.

-```
+❓ **Use Case:** A customer ran advanced similarity searches across their vast dataset of nearly 800 million vectors. Initially, they found that queries took anywhere from 10 to 20 seconds, especially when combining multiple filters and metadata fields.

-```go
-import "github.com/qdrant/go-client/qdrant"
+> ✅ How can they retain accuracy and keep things fast? [**The answer is optimization.**](https://qdrant.tech/documentation/guides/optimize/)

-qdrant.OrderBy{
-    Key:       "timestamp",
-    Direction: qdrant.Direction_Desc.Enum(),
-    StartFrom: qdrant.NewStartFromInt(123),
-}
+**Figure 1:** Qdrant is highly configurable. You can configure it for speed, precision, or resource use.
+![qdrant resource tradeoffs](/docs/tradeoff.png)

-```
+By dialing in `ef`, the team discovered a sweet spot where sub-second responses became feasible, yet accuracy remained solid. They also fine-tuned other aspects of the database, such as placing quantized vectors in RAM while offloading original, uncompressed vectors to disk.

-When sorting is based on a non-unique value, it is not possible to rely on an ID offset. Thus, next\_page\_offset is not returned within the response. However, you can still do pagination by combining `"order_by": { "start_from": ... }` with a `{ "must_not": [{ "has_id": [...] }] }` filter.
+This strategy balanced memory usage with performance: only the compact vectors needed to be in memory for fast lookups, while the larger ones stayed on disk until required.

-## [Anchor](https://qdrant.tech/documentation/concepts/points/\#counting-points) Counting points
+||
+|-|
+|**Read More:** [**Optimization Guide**](https://qdrant.tech/documentation/guides/optimize/#optimizing-qdrant-performance-three-scenarios)|
+|**Read More:** [**HNSW Documentation**](https://qdrant.tech/documentation/concepts/indexing/#vector-index)|

-_Available as of v0.8.4_
+### Compress Your Data with Quantization Strategies

-Sometimes it can be useful to know how many points fit the filter conditions without doing a real search.
+|| +|:-:| +|**"We're using too much memory for our massive dataset."**| -Among others, for example, we can highlight the following scenarios: +Many users skip [**quantization**](https://qdrant.tech/documentation/guides/quantization/), causing their index to consume excessive RAM and produce uneven performance. Some users hesitate to compromise precision, but this is not always the case. -- Evaluation of results size for faceted search -- Determining the number of pages for pagination -- Debugging the query execution speed +If your workload can tolerate a moderate drop in embedding precision, data compression offers a powerful way to shrink vector size and slash memory usage. By converting high-dimensional floating-point values into lower-bit formats (such as 8-bit scalar or even a single bit-sized representations), you can keep far more vectors in RAM while reducing disk footprint. -REST API ( [Schema](https://api.qdrant.tech/master/api-reference/points/count-points)): +> ✅ [**You should evaluate and apply quantization**](https://qdrant.tech/documentation/guides/quantization/#how-to-choose-the-right-quantization-method) if your use case allows. Quantization seriously improves performance and reduces storage costs. -httppythontypescriptrustjavacsharpgo +This not only speeds up query throughput for large-scale datasets, but also cuts hardware costs and storage overhead. While Scalar Quantization is a midrange compression alternative, Binary quantization is more drastic, so be sure to test your accuracy requirements for each thoroughly. -```http -POST /collections/{collection_name}/points/count -{ - "filter": { - "must": [\ - {\ - "key": "color",\ - "match": {\ - "value": "red"\ - }\ - }\ - ] - }, - "exact": true -} +When using [**quantization**](https://qdrant.tech/documentation/guides/quantization/), you can store only the compressed vectors in memory while leaving the original floating-point versions on disk for reference. This approach dramatically lowers RAM consumption—since quantized vectors take far less space—yet still allows you to retrieve full-precision vectors if needed for downstream tasks like re-ranking. -``` +>**Sidenote:** You can always enable `async_io` scorer when the linux kernel supports it and if you have `on_disk` vectors. -```python -client.count( - collection_name="{collection_name}", - count_filter=models.Filter( - must=[\ - models.FieldCondition(key="color", match=models.MatchValue(value="red")),\ - ] - ), - exact=True, -) +|| +|-| +|**Read More:** [**Quantization Documentation**](https://qdrant.tech/documentation/guides/quantization/)| -``` +## 2. How do I Ingest and Index Large Amounts of Data? +![vector-search-production](/articles_data/vector-search-production/vector-search-production-2.jpg) -```typescript -client.count("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, - exact: true, -}); +|| +|:-:| +|**"When I try to import a huge dataset, everything grinds to a halt."**| -``` +❓ **Use Case:** A fintech team ingested 500 million transaction records. Performance was fine initially but collapsed within an hour. -```rust -use qdrant_client::qdrant::{Condition, CountPointsBuilder, Filter}; +They had left HNSW indexing enabled, so every insert triggered a full index update. Unfortunately, their CPU usage soared, and other services timed out. 
-client - .count( - CountPointsBuilder::new("{collection_name}") - .filter(Filter::must([Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .exact(true), - ) - .await?; +> ✅ On a case-by-case basis, we recommend **disabling building the HNSW index during large uploads** to improve ingestion and indexing speed. -``` +Once all records are inserted, you can rebuild the index in a single pass. Consider a specialized ingestion pipeline that batches writes and schedules indexing during low-traffic windows. If you don't have such low-traffic windows, you can tune the `indexing_threshold` to find a balance between receiving updates without triggering indexation, and keeping the collection indexed. -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +|| +|-| +|**Read More:** [**Configuring the Vector Index**](https://qdrant.tech/documentation/concepts/indexing/#vector-index)| -import io.qdrant.client.grpc.Points.Filter; +### Other Solutions to Alleviate Indexing Bottleneck -client - .countAsync( - "{collection_name}", - Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), - true) - .get(); +✅ **Increase indexing threads:** If you're using more than 16 cores, consider explicitly increasing the number of indexing threads. -``` +> By default, Qdrant uses up to 16 threads for indexing, but if you notice your CPU isn't being fully utilized during indexing, you can increase this number. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +✅ **Use Batch Processes:** Increase the number of concurrent processes. Running 50-60 processes can significantly improve upload performance. Using just one or two processes won't allow you to see the true performance potential. -var client = new QdrantClient("localhost", 6334); +**Be patient with indexing:** After uploading large datasets, there's a waiting period for indexing to complete. This is normal and can take time depending on your dataset size. -await client.CountAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("color", "red"), - exact: true -); +|| +|-| +|**Read More:** [**Configuration Documentation**](https://qdrant.tech/documentation/guides/configuration/)| -``` +### When Indexing Falls Behind Ingestion +![vector-search-production](/articles_data/vector-search-production/vector-search-production-3.jpg) -```go -import ( - "context" +> It's possible for indexing to temporarily fall behind data ingestion, both during gradual streaming uploads and after large bulk uploads. - "github.com/qdrant/go-client/qdrant" -) +By default, searches include unindexed data. However, a large number of unindexed points can significantly slow down searches due to full scans, potentially causing high search latency, timeouts, and application failures. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +If the maximum number of indexed points remains consistently low, this is likely not an issue. If you anticipate periods with many unindexed points, you should take measures to prevent search disruptions in production. -client.Count(context.Background(), &qdrant.CountPoints{ - CollectionName: "midlib", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, -}) +One option is to [**set `indexed_only=true` in search requests**](https://qdrant.tech/documentation/concepts/search/#search-api). 
This will ensure fast searches by only considering indexed data, at the expense of eventual consistency (new data becomes searchable only after indexing). -``` +Alternatively, you can perform [**bulk vector uploads**](https://qdrant.tech/documentation/database-tutorials/bulk-upload/) during low-traffic periods to allow indexing to complete before increased traffic. -Returns number of counts matching given filtering conditions: +> A persistent increase in the number of indexed points indicates a problem. Potential solutions include: increasing hardware resources, optimizing indexing (e.g., smaller segments, HNSW tuning), or reducing the volume of data changes. -```json -{ - "count": 3811 -} +|| +|-| +|**Read More:** [**Indexing Documentation**](https://qdrant.tech/documentation/concepts/indexing/)| -``` +### How to Arrange Metadata and Schema for Consistency -## [Anchor](https://qdrant.tech/documentation/concepts/points/\#batch-update) Batch update +|| +|:-:| +|**"My filters aren't working the same way every time."**| -_Available as of v1.5.0_ +In some cases, the payload schema is inconsistent across data pipelines, so some fields have mismatched types or are missing altogether. -You can batch multiple point update operations. This includes inserting, -updating and deleting points, vectors and payload. +❓ **Use Case:** A healthcare firm discovered that some pipelines inserted strings where others inserted integers. Filters broke silently or returned inconsistent results, signalling that [**a unified payload schema**](https://qdrant.tech/documentation/concepts/indexing/#payload-index) was not in place. -A batch update request consists of a list of operations. These are executed in -order. These operations can be batched: +> When payload fields are typed inconsistently across your ingestion pipelines, filters can break in unpredictable ways. -- [Upsert points](https://qdrant.tech/documentation/concepts/points/#upload-points): `upsert` or `UpsertOperation` -- [Delete points](https://qdrant.tech/documentation/concepts/points/#delete-points): `delete_points` or `DeleteOperation` -- [Update vectors](https://qdrant.tech/documentation/concepts/points/#update-vectors): `update_vectors` or `UpdateVectorsOperation` -- [Delete vectors](https://qdrant.tech/documentation/concepts/points/#delete-vectors): `delete_vectors` or `DeleteVectorsOperation` -- [Set payload](https://qdrant.tech/documentation/concepts/payload/#set-payload): `set_payload` or `SetPayloadOperation` -- [Overwrite payload](https://qdrant.tech/documentation/concepts/payload/#overwrite-payload): `overwrite_payload` or `OverwritePayload` -- [Delete payload](https://qdrant.tech/documentation/concepts/payload/#delete-payload-keys): `delete_payload` or `DeletePayloadOperation` -- [Clear payload](https://qdrant.tech/documentation/concepts/payload/#clear-payload): `clear_payload` or `ClearPayloadOperation` +For example, **some services might write a "status" field as a string ("active") while others insert it as a numeric code (1)**. As a result, queries that expect uniform data might silently fail, skip important records, or produce incorrect sorting/filtering. -The following example snippet makes use of all operations. +✅ Ensuring your payload schema is consistently enforced, whether through strict type checking or a well-defined data contract, is the best way to prevent this mismatch. It's also important to log any schema violations during ingestion, giving you a chance to fix errors before they degrade query performance and result quality. 
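One lightweight way to enforce such a data contract is to validate every payload before it is upserted. The sketch below uses `pydantic` purely for illustration; the field names and types are assumptions, not a prescribed schema:

```python
import logging
from pydantic import BaseModel, ValidationError

class RecordPayload(BaseModel):
    status: str      # always a string such as "active", never a numeric code
    patient_id: str
    year: int

def validate_payload(raw: dict) -> dict | None:
    """Return a normalized payload, or None after logging a schema violation."""
    try:
        return RecordPayload(**raw).model_dump()
    except ValidationError as err:
        logging.warning("Schema violation, record skipped: %s", err)
        return None
```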
-REST API ( [Schema](https://api.qdrant.tech/master/api-reference/points/batch-update)): +|| +|-| +|**Read More:** [**Payload Documentation**](https://qdrant.tech/documentation/concepts/payload/)| -httppythontypescriptrustjava +### Decide How to Set Up a Multitenant Collection -```http -POST /collections/{collection_name}/points/batch +❓ **Use Case:** When implementing vector databases, healthcare organizations need to ensure isolation between users' data. Our customer needed to make sure that when they filtered queries to only show a particular patient's documents, and no other patient's documents appeared in the query results. + +✅ [**You should almost always consolidate tenants to a single collection**](https://qdrant.tech/documentation/guides/multiple-partitions/) if possible, tagging by tenant. + +```text +PUT /collections/{collection_name}/index { - "operations": [\ - {\ - "upsert": {\ - "points": [\ - {\ - "id": 1,\ - "vector": [1.0, 2.0, 3.0, 4.0],\ - "payload": {}\ - }\ - ]\ - }\ - },\ - {\ - "update_vectors": {\ - "points": [\ - {\ - "id": 1,\ - "vector": [1.0, 2.0, 3.0, 4.0]\ - }\ - ]\ - }\ - },\ - {\ - "delete_vectors": {\ - "points": [1],\ - "vector": [""]\ - }\ - },\ - {\ - "overwrite_payload": {\ - "payload": {\ - "test_payload": "1"\ - },\ - "points": [1]\ - }\ - },\ - {\ - "set_payload": {\ - "payload": {\ - "test_payload_2": "2",\ - "test_payload_3": "3"\ - },\ - "points": [1]\ - }\ - },\ - {\ - "delete_payload": {\ - "keys": ["test_payload_2"],\ - "points": [1]\ - }\ - },\ - {\ - "clear_payload": {\ - "points": [1]\ - }\ - },\ - {"delete": {"points": [1]}}\ - ] + "field_name": "group_id", + "field_schema": { + "type": "keyword", + "is_tenant": true + } } - ``` -```python -client.batch_update_points( - collection_name="{collection_name}", - update_operations=[\ - models.UpsertOperation(\ - upsert=models.PointsList(\ - points=[\ - models.PointStruct(\ - id=1,\ - vector=[1.0, 2.0, 3.0, 4.0],\ - payload={},\ - ),\ - ]\ - )\ - ),\ - models.UpdateVectorsOperation(\ - update_vectors=models.UpdateVectors(\ - points=[\ - models.PointVectors(\ - id=1,\ - vector=[1.0, 2.0, 3.0, 4.0],\ - )\ - ]\ - )\ - ),\ - models.DeleteVectorsOperation(\ - delete_vectors=models.DeleteVectors(points=[1], vector=[""])\ - ),\ - models.OverwritePayloadOperation(\ - overwrite_payload=models.SetPayload(\ - payload={"test_payload": 1},\ - points=[1],\ - )\ - ),\ - models.SetPayloadOperation(\ - set_payload=models.SetPayload(\ - payload={\ - "test_payload_2": 2,\ - "test_payload_3": 3,\ - },\ - points=[1],\ - )\ - ),\ - models.DeletePayloadOperation(\ - delete_payload=models.DeletePayload(keys=["test_payload_2"], points=[1])\ - ),\ - models.ClearPayloadOperation(clear_payload=models.PointIdsList(points=[1])),\ - models.DeleteOperation(delete=models.PointIdsList(points=[1])),\ - ], -) +Figure: For many-tenant setups, spinning up a new collection per tenant can balloon overhead. A multitenant design—using a single collection with a tenant field—uses resources more efficiently. 
-``` +![vector-search-production](/articles_data/vector-search-production/multitenancy.png) -```typescript -client.batchUpdate("{collection_name}", { - operations: [\ - {\ - upsert: {\ - points: [\ - {\ - id: 1,\ - vector: [1.0, 2.0, 3.0, 4.0],\ - payload: {},\ - },\ - ],\ - },\ - },\ - {\ - update_vectors: {\ - points: [\ - {\ - id: 1,\ - vector: [1.0, 2.0, 3.0, 4.0],\ - },\ - ],\ - },\ - },\ - {\ - delete_vectors: {\ - points: [1],\ - vector: [""],\ - },\ - },\ - {\ - overwrite_payload: {\ - payload: {\ - test_payload: 1,\ - },\ - points: [1],\ - },\ - },\ - {\ - set_payload: {\ - payload: {\ - test_payload_2: 2,\ - test_payload_3: 3,\ - },\ - points: [1],\ - },\ - },\ - {\ - delete_payload: {\ - keys: ["test_payload_2"],\ - points: [1],\ - },\ - },\ - {\ - clear_payload: {\ - points: [1],\ - },\ - },\ - {\ - delete: {\ - points: [1],\ - },\ - },\ - ], -}); +> **Don't forget:** you can always create [**API keys in Qdrant Cloud (or JWT in OSS)**](https://qdrant.tech/articles/data-privacy/) to enforce a certain filter via a payload constraint. -``` +|| +|-| +|**Read More:** [**Multitenancy Documentation**](https://qdrant.tech/documentation/concepts/multitenancy/)| -```rust -use std::collections::HashMap; +## 3. What's the Best Way to Scale the Database and Optimize Resources? +![vector-search-production](/articles_data/vector-search-production/vector-search-production-4.jpg) -use qdrant_client::qdrant::{ - points_update_operation::{ - ClearPayload, DeletePayload, DeletePoints, DeleteVectors, Operation, OverwritePayload, - PointStructList, SetPayload, UpdateVectors, - }, - PointStruct, PointVectors, PointsUpdateOperation, UpdateBatchPointsBuilder, VectorsSelector, -}; -use qdrant_client::Payload; +|| +|:-:| +|**"How many nodes, CPUs, RAM and storage do I need for my Qdrant Cluster?"**| -client - .update_points_batch( - UpdateBatchPointsBuilder::new( - "{collection_name}", - vec![\ - PointsUpdateOperation {\ - operation: Some(Operation::Upsert(PointStructList {\ - points: vec![PointStruct::new(\ - 1,\ - vec![1.0, 2.0, 3.0, 4.0],\ - Payload::default(),\ - )],\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::UpdateVectors(UpdateVectors {\ - points: vec![PointVectors {\ - id: Some(1.into()),\ - vectors: Some(vec![1.0, 2.0, 3.0, 4.0].into()),\ - }],\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::DeleteVectors(DeleteVectors {\ - points_selector: Some(vec![1.into()].into()),\ - vectors: Some(VectorsSelector {\ - names: vec!["".into()],\ - }),\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::OverwritePayload(OverwritePayload {\ - points_selector: Some(vec![1.into()].into()),\ - payload: HashMap::from([("test_payload".to_string(), 1.into())]),\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::SetPayload(SetPayload {\ - points_selector: Some(vec![1.into()].into()),\ - payload: HashMap::from([\ - ("test_payload_2".to_string(), 2.into()),\ - ("test_payload_3".to_string(), 3.into()),\ - ]),\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::DeletePayload(DeletePayload {\ - points_selector: Some(vec![1.into()].into()),\ - keys: vec!["test_payload_2".to_string()],\ - ..Default::default()\ - })),\ - },\ - PointsUpdateOperation {\ - operation: Some(Operation::ClearPayload(ClearPayload {\ - points: Some(vec![1.into()].into()),\ - ..Default::default()\ - })),\ - },\ - 
PointsUpdateOperation {\ - operation: Some(Operation::DeletePoints(DeletePoints {\ - points: Some(vec![1.into()].into()),\ - ..Default::default()\ - })),\ - },\ - ], - ) - .wait(true), - ) - .await?; +It depends. If you're just starting out - we have prepared a tool on our website to help you figure this out. For more information, [**check out the Capacity Planning document as well.**](https://qdrant.tech/documentation/guides/capacity-planning/) -``` +✅ [**Use the sizing calculator**](https://cloud.qdrant.io/calculator) or performance testing to ensure node specs (RAM/CPU) match your workload. -```java -import java.util.List; -import java.util.Map; +> Overestimating wastes resources, while underestimating leads to slow queries or out-of-memory errors. By methodically testing realistic workloads, you can confidently match hardware specs to your target ingestion rate, query volume, and dataset size. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +### Preparing for High Availability Scenarios in Production -import io.qdrant.client.grpc.Points.PointStruct; -import io.qdrant.client.grpc.Points.PointVectors; -import io.qdrant.client.grpc.Points.PointsIdsList; -import io.qdrant.client.grpc.Points.PointsSelector; -import io.qdrant.client.grpc.Points.PointsUpdateOperation; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.ClearPayload; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeletePayload; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeletePoints; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.DeleteVectors; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.PointStructList; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.SetPayload; -import io.qdrant.client.grpc.Points.PointsUpdateOperation.UpdateVectors; -import io.qdrant.client.grpc.Points.VectorsSelector; +✅ **Use at least 3 nodes** to ensure failover and reduce downtime risk. 
-client - .batchUpdateAsync( - "{collection_name}", - List.of( - PointsUpdateOperation.newBuilder() - .setUpsert( - PointStructList.newBuilder() - .addPoints( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(1.0f, 2.0f, 3.0f, 4.0f)) - .build()) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setUpdateVectors( - UpdateVectors.newBuilder() - .addPoints( - PointVectors.newBuilder() - .setId(id(1)) - .setVectors(vectors(1.0f, 2.0f, 3.0f, 4.0f)) - .build()) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setDeleteVectors( - DeleteVectors.newBuilder() - .setPointsSelector( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .setVectors(VectorsSelector.newBuilder().addNames("").build()) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setOverwritePayload( - SetPayload.newBuilder() - .setPointsSelector( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .putAllPayload(Map.of("test_payload", value(1))) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setSetPayload( - SetPayload.newBuilder() - .setPointsSelector( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .putAllPayload( - Map.of("test_payload_2", value(2), "test_payload_3", value(3))) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setDeletePayload( - DeletePayload.newBuilder() - .setPointsSelector( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .addKeys("test_payload_2") - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setClearPayload( - ClearPayload.newBuilder() - .setPoints( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .build()) - .build(), - PointsUpdateOperation.newBuilder() - .setDeletePoints( - DeletePoints.newBuilder() - .setPoints( - PointsSelector.newBuilder() - .setPoints(PointsIdsList.newBuilder().addIds(id(1)).build()) - .build()) - .build()) - .build())) - .get(); +A three-node setup provides a baseline for fault tolerance: if one node goes offline, the remaining two can continue serving queries and maintain a quorum for data consistency. This guards against hardware failures, rolling updates, and network disruptions. Fewer than three nodes leaves you vulnerable to single-point failures that can knock your entire cluster offline. + +> [**We follow the Raft Protocol**](https://qdrant.tech/documentation/guides/distributed_deployment/#raft), so check out the docs and learn why this is important. + +✅ **Set a replication factor of at least 2** to tolerate node failure without losing availability. +```text +PUT /collections/{collection_name} +{ + "vectors": { + "size": 300, + "distance": "Cosine" + }, + "shard_number": 6, + "replication_factor": 2 +} ``` -To batch many points with a single operation type, please use batching -functionality in that operation directly. +Replication ensures that each piece of data is stored on multiple nodes. If one node fails, another replica can step in to serve reads and writes. This prevents data loss and maintains uninterrupted service for critical applications. A replication factor of 2 or higher is particularly important for production workloads where uptime and reliability are non-negotiable. 
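If you manage collections from the Python client rather than the REST API, the same settings can be passed at creation time; this is just a minimal sketch mirroring the request above:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="{collection_name}",
    vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE),
    shard_number=6,         # spread shards across the nodes
    replication_factor=2,   # keep a second copy of every shard
)
```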
-## [Anchor](https://qdrant.tech/documentation/concepts/points/\#awaiting-result) Awaiting result +✅ **Isolate production from dev/staging**—use separate clusters to avoid noisy neighbors. -If the API is called with the `&wait=false` parameter, or if it is not explicitly specified, the client will receive an acknowledgment of receiving data: +Development and staging environments often run experimental builds, tests, or simulations that can spike resource usage unpredictably. Running these alongside production can degrade performance and stability, impacting real users. By hosting production on a dedicated cluster, you can safeguard critical workloads from development-induced slowdowns and ensure more consistent, reliable performance. -```json -{ - "result": { - "operation_id": 123, - "status": "acknowledged" - }, - "status": "ok", - "time": 0.000206061 -} +### Dealing With Imbalanced or Overworked Nodes -``` +|| +|:-:| +|**"One of my nodes is doing way more work than the others."**| -This response does not mean that the data is available for retrieval yet. This -uses a form of eventual consistency. It may take a short amount of time before it -is actually processed as updating the collection happens in the background. In -fact, it is possible that such request eventually fails. -If inserting a lot of vectors, we also recommend using asynchronous requests to take advantage of pipelining. +❓ **Use Case:** A SaaS platform added hundreds of new customers. Suddenly, latencies spiked because one node was handling 5 times the load. A specific sharding scheme funneled certain "hot" data to just one shard. -If the logic of your application requires a guarantee that the vector will be available for searching immediately after the API responds, then use the flag `?wait=true`. -In this case, the API will return the result only after the operation is finished: +> It's quite possible that the user has multiple shards on one node, which end up handling most traffic while other nodes remain underutilized. -```json -{ - "result": { - "operation_id": 0, - "status": "completed" - }, - "status": "ok", - "time": 0.000206061 -} +In this case, you should [**choose the right number of shards**](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding) based on your node count and expected RPS. -``` +You need to implement a shard strategy that aligns with real usage patterns. First, distribute your shards across all available nodes. This will help balance the load more effectively. After redistributing the shards, run performance tests to see how it affects your system. Then add replicas and test again to see how that changes performance. -##### Was this page useful? +Your sharding strategy also depends on how many collections you have. A single collection is arguably easier to balance, because there is less to move/balance. Also, a single collection also has the least amount of overhead/orchestration. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +#### How to shard? +Proper sharding considers data distribution and query patterns. By default, shards are randomized for uniform distribution if queries always span the entire dataset. If certain tenants or geographical regions get hammered with traffic, you might partition by cluster, by payload (tenant ID) or a custom shard distribution. -Thank you for your feedback! 
🙏
+||
+|-|
+|**Read More:** [**Sharding Documentation**](https://qdrant.tech/documentation/concepts/sharding/)|

-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/points.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.

+### Manage Your Costs by Scaling Up or Down
+![vector-search-production](/articles_data/vector-search-production/vector-search-production-5.jpg)

-On this page:
+Some teams scale up for daytime surges, then scale down overnight to save resources. If you do this, ensure data is sharded and replicated appropriately, so that scaling up and down won't result in service degradation.

-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/points.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+If you are using Qdrant Cloud, you could also do this with the [**Replication Factor**](https://qdrant.tech/documentation/guides/distributed_deployment/#replication-factor), though it may be considered a bit of a hack.

-×
+> If you have 3 nodes with just 1 shard and a replication factor of 6, Qdrant will create 3 replicas of that shard (one on each node), because it can't host more. If you add 3 more nodes at peak times, it'll automatically replicate that shard 3 more times in an attempt to match the factor of 6.

-[Powered by](https://qdrant.tech/)
+If you scale down again, the extra replicas will be dropped.

-<|page-42-lllmstxt|>
-## cohere-rag-connector
-- [Documentation](https://qdrant.tech/documentation/)
-- [Examples](https://qdrant.tech/documentation/examples/)
-- Implement Cohere RAG connector
+✅ **Reshard collections after scaling up** your cluster to rebalance data and avoid OOMs.

-# [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#implement-custom-connector-for-cohere-rag) Implement custom connector for Cohere RAG
+When you add nodes to your cluster, existing shards are rebalanced across the nodes, but their number stays fixed - which may not fully take advantage of the new hardware. As a result, the original nodes remain overloaded while new nodes sit mostly idle. By **resharding** collections after scaling, you redistribute data evenly across the cluster, preventing hot spots that can lead to out-of-memory (OOM) conditions on overburdened nodes.

-| Time: 45 min | Level: Intermediate | | |
-| --- | --- | --- | --- |
+If new nodes remain empty after joining, you waste resources. If departing nodes leave behind un-migrated data, you risk partial coverage or even data loss.

-The usual approach to implementing Retrieval Augmented Generation requires users to build their prompts with the
-relevant context the LLM may rely on, and manually sending them to the model. Cohere is quite unique here, as their
-models can now speak to the external tools and extract meaningful data on their own. You can virtually connect any data
-source and let the Cohere LLM know how to access it. Obviously, vector search goes well with LLMs, and enabling semantic
-search over your data is a typical case.
+> **Note:** This is only available in Qdrant Cloud as well as Hybrid & Private Clouds, and not when self-hosting.

-Cohere RAG has lots of interesting features, such as inline citations, which help you to refer to the specific parts of
-the documents used to generate the response.
+|| +|-| +|**Read More:** [**Distributed Deployment Documentation**](https://qdrant.tech/documentation/guides/distributed_deployment/)| +|**Read More:** [**Resharding**](https://qdrant.tech/documentation/cloud/cluster-scaling/#resharding)| -![Cohere RAG citations](https://qdrant.tech/documentation/tutorials/cohere-rag-connector/cohere-rag-citations.png) +### How to Predict and Test Cluster Performance -_Source: [https://docs.cohere.com/docs/retrieval-augmented-generation-rag](https://docs.cohere.com/docs/retrieval-augmented-generation-rag)_ +|| +|:-:| +|**"I had no idea something was wrong until it was too late."**| -The connectors have to implement a specific interface and expose the data source as HTTP REST API. Cohere documentation -[describes a general process of creating a connector](https://docs.cohere.com/v1/docs/creating-and-deploying-a-connector). -This tutorial guides you step by step on building such a service around Qdrant. +Such issues tend to occur when users don't monitor resource usage, search performance, or security. Critical failures went undetected until users complained. -## [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#qdrant-connector) Qdrant connector +✅ **Run load tests** under expected traffic conditions to identify bottlenecks before go-live. -You probably already have some collections you would like to bring to the LLM. Maybe your pipeline was set up using some -of the popular libraries such as Langchain, Llama Index, or Haystack. Cohere connectors may implement even more complex -logic, e.g. hybrid search. In our case, we are going to start with a fresh Qdrant collection, index data using Cohere -Embed v3, build the connector, and finally connect it with the [Command-R model](https://txt.cohere.com/command-r/). +You need to set a plan to use realistic data for your load tests. Ideally, you should test with production traffic or historical data that closely resembles your actual workload. This provides more accurate results than randomly generated test data, as it will better represent real-world usage patterns and data distributions. -### [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#building-the-collection) Building the collection +> Design your load test to gradually increase traffic until you reach or exceed your expected production load. For example, if you expect 1000 requests per second (RPS) in production, incrementally scale up your test to reach this threshold while monitoring latency. Responses will separately show server timing for granular monitoring. -First things first, let’s build a collection and configure it for the Cohere `embed-multilingual-v3.0` model. It -produces 1024-dimensional embeddings, and we can choose any of the distance metrics available in Qdrant. Our connector -will act as a personal assistant of a software engineer, and it will expose our notes to suggest the priorities or -actions to perform. +You should test system performance after restarts to understand cold-start behavior. Initial queries after a restart may be significantly slower as caches need to be rebuilt. For example, a query might take 50-60 seconds initially but only 0.5 seconds on subsequent runs. -```python -from qdrant_client import QdrantClient, models +Remember, cold-starts and query behaviour are dataset dependent, which is why you should establish your own baselines and what to expect. 
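A small sketch of that kind of measurement, assuming an existing collection and a set of pre-computed query vectors, could simply time a batch of searches and report tail latency:

```python
import time
import numpy as np
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

def measure_latencies(query_vectors, collection_name="{collection_name}", limit=10):
    """Time each query and report median and tail latency in seconds."""
    latencies = []
    for vector in query_vectors:
        start = time.perf_counter()
        client.query_points(collection_name=collection_name, query=vector, limit=limit)
        latencies.append(time.perf_counter() - start)
    return {
        "p50": float(np.percentile(latencies, 50)),
        "p99": float(np.percentile(latencies, 99)),
    }
```

Repeating the run right after a restart, and again once caches are warm, makes the cold-start effect described above visible in your own numbers.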
-client = QdrantClient( - "https://my-cluster.cloud.qdrant.io:6333", - api_key="my-api-key", -) -client.create_collection( - collection_name="personal-notes", - vectors_config=models.VectorParams( - size=1024, - distance=models.Distance.DOT, - ), -) +|| +|-| +|**Read More:** [Distributed Deployment Documentation](https://qdrant.tech/documentation/guides/distributed_deployment/) -``` +### How to Design Your Systems to Protect Against Failure -Our notes will be represented as simple JSON objects with a `title` and `text` of the specific note. The embeddings will -be created from the `text` field only. +|| +|:-:| +|**"When one node crashed, our entire cluster went down."**| -```python -notes = [\ - {\ - "title": "Project Alpha Review",\ - "text": "Review the current progress of Project Alpha, focusing on the integration of the new API. Check for any compatibility issues with the existing system and document the steps needed to resolve them. Schedule a meeting with the development team to discuss the timeline and any potential roadblocks."\ - },\ - {\ - "title": "Learning Path Update",\ - "text": "Update the learning path document with the latest courses on React and Node.js from Pluralsight. Schedule at least 2 hours weekly to dedicate to these courses. Aim to complete the React course by the end of the month and the Node.js course by mid-next month."\ - },\ - {\ - "title": "Weekly Team Meeting Agenda",\ - "text": "Prepare the agenda for the weekly team meeting. Include the following topics: project updates, review of the sprint backlog, discussion on the new feature requests, and a brainstorming session for improving remote work practices. Send out the agenda and the Zoom link by Thursday afternoon."\ - },\ - {\ - "title": "Code Review Process Improvement",\ - "text": "Analyze the current code review process to identify inefficiencies. Consider adopting a new tool that integrates with our version control system. Explore options such as GitHub Actions for automating parts of the process. Draft a proposal with recommendations and share it with the team for feedback."\ - },\ - {\ - "title": "Cloud Migration Strategy",\ - "text": "Draft a plan for migrating our current on-premise infrastructure to the cloud. The plan should cover the selection of a cloud provider, cost analysis, and a phased migration approach. Identify critical applications for the first phase and any potential risks or challenges. Schedule a meeting with the IT department to discuss the plan."\ - },\ - {\ - "title": "Quarterly Goals Review",\ - "text": "Review the progress towards the quarterly goals. Update the documentation to reflect any completed objectives and outline steps for any remaining goals. Schedule individual meetings with team members to discuss their contributions and any support they might need to achieve their targets."\ - },\ - {\ - "title": "Personal Development Plan",\ - "text": "Reflect on the past quarter's achievements and areas for improvement. Update the personal development plan to include new technical skills to learn, certifications to pursue, and networking events to attend. Set realistic timelines and check-in points to monitor progress."\ - },\ - {\ - "title": "End-of-Year Performance Reviews",\ - "text": "Start preparing for the end-of-year performance reviews. Collect feedback from peers and managers, review project contributions, and document achievements. Consider areas for improvement and set goals for the next year. 
Schedule preliminary discussions with each team member to gather their self-assessments."\ - },\ - {\ - "title": "Technology Stack Evaluation",\ - "text": "Conduct an evaluation of our current technology stack to identify any outdated technologies or tools that could be replaced for better performance and productivity. Research emerging technologies that might benefit our projects. Prepare a report with findings and recommendations to present to the management team."\ - },\ - {\ - "title": "Team Building Event Planning",\ - "text": "Plan a team-building event for the next quarter. Consider activities that can be done remotely, such as virtual escape rooms or online game nights. Survey the team for their preferences and availability. Draft a budget proposal for the event and submit it for approval."\ - }\ -] +Unfortunately, you didn't plan for failover or chaos testing, so a single point of failure took out production. You should plan for hardware or node crashes by storing data redundantly, testing failovers, and running chaos experiments. -``` +✅ **Regularly test failures** to reveal how your system recovers. -Storing the embeddings along with the metadata is fairly simple. +High concurrency and large memory footprints can expose misconfigurations more quickly, so regularly simulating failures reveals how your system recovers. -```python -import cohere -import uuid +- **Graceful Node Shutdowns**: Drain queries, reassign shard ownership via load balancer. +- **Redundant Data Paths**: Store data on multiple volumes or in multiple locations. +- **Load Tests**: Generate high concurrency or large queries to mimic real surge patterns. -cohere_client = cohere.Client(api_key="my-cohere-api-key") +### Set up Telemetry for Early Detection -response = cohere_client.embed( - texts=[\ - note.get("text")\ - for note in notes\ - ], - model="embed-multilingual-v3.0", - input_type="search_document", -) +❓ **Use Case:** A customer in health care uses telemetry data from their Qdrant deployment to identify performance and scaling issues with their open-source implementation. The telemetry helps them monitor metrics such as search performance, RAM utilization efficiency, and indexing speed. By analyzing this data, they can work toward reducing query response times and optimize their system configuration. -client.upload_points( - collection_name="personal-notes", - points=[\ - models.PointStruct(\ - id=uuid.uuid4().hex,\ - vector=embedding,\ - payload=note,\ - )\ - for note, embedding in zip(notes, response.embeddings)\ - ] -) +✅ **Enable telemetry and monitoring** so you can track latency, throughput, and optimization stats. -``` +**Telemetry** is vital. You need to collect metrics such as search latency distribution, CPU usage, disk throughput, and memory consumption. If memory usage is at 90% during index building, that's a clear sign you need more capacity or more nodes. -Our collection is now ready to be searched over. In the real world, the set of notes would be changing over time, so the -ingestion process won’t be as straightforward. This data is not yet exposed to the LLM, but we will build the connector -in the next step. +**Build dashboards** that monitor CPU usage, memory consumption, disk I/O, and indexing speeds help you catch resource bottlenecks. Otherwise, you learn of problems only when latency spikes or logs fill with errors. -### [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#connector-web-service) Connector web service +### What to Monitor? 
-[FastAPI](https://fastapi.tiangolo.com/) is a modern web framework and perfect a choice for a simple HTTP API. We are
-going to use it for the purposes of our connector. There will be just one endpoint, as required by the model. It will
-accept POST requests at the `/search` path. There is a single `query` parameter required. Let’s define a corresponding
-model.
+For retrieval, focus on P99 latency metrics (the response time for the slowest 1% of requests) rather than just average latency. This gives you a better understanding of worst-case performance.

-```python
-from pydantic import BaseModel
+For hardware, monitor resource utilization during tests. If you're not seeing expected CPU utilization (e.g., only 2 out of 8 CPUs being used), there may be configuration issues limiting performance. Test different configurations to find optimal settings for your specific workload.

-class SearchQuery(BaseModel):
-    query: str
+**Figure:** If you are scraping monitoring, networking, and logging metrics into your own monitoring system, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics.

-```
+![Grafana dashboard](/documentation/cloud/cloud-grafana-dashboard.png)

-RAG connector does not have to return the documents in any specific format. There are [some good practices to follow](https://docs.cohere.com/v1/docs/creating-and-deploying-a-connector#configure-the-connection-between-the-connector-and-the-chat-api),
-but Cohere models are quite flexible here. Results just have to be returned as JSON, with a list of objects in a
-`results` property of the output. We will use the same document structure as we did for the Qdrant payloads, so there
-is no conversion required. That requires two additional models to be created.
+> Include tests that combine both read and write operations to simulate real-world usage. For example, you might configure a test with 80% reads and 20% writes to match your expected production workload.

-```python
-from typing import List
+By following these comprehensive load testing practices, you'll be able to identify and address potential bottlenecks before your system goes live, ensuring a smoother launch and better user experience.

-class Document(BaseModel):
-    title: str
-    text: str
+||
+|-|
+|**Read More:** [**Telemetry and Monitoring Documentation**](https://qdrant.tech/documentation/guides/monitoring/)|
+|**Read More:** [**Cloud Monitoring Documentation**](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/)|

-class SearchResults(BaseModel):
-    results: List[Document]
+## 4. Ensuring Disaster Recovery With Database Backups and Snapshots
+![vector-search-production](/articles_data/vector-search-production/vector-search-production-6.jpg)

-```
+||
+|:-:|
+|**"I tried to restore a snapshot, and now everything's broken."**|

-Once our model classes are ready, we can implement the logic that will get the query and provide the notes that are
-relevant to it. Please note the LLM is not going to define the number of documents to be returned. That’s completely
-up to you how many of them you want to bring to the context.
+❓ **Use Case:** A digital publisher endured a catastrophic outage and attempted to restore from backups. They discovered an index format mismatch only after partial data was lost. Another company saw query performance drop when snapshot compression hogged CPU during peak traffic.

-There are two services we need to interact with - Qdrant server and Cohere API.
FastAPI has a concept of a [dependency\\ -injection](https://fastapi.tiangolo.com/tutorial/dependencies/#dependencies), and we will use it to provide both -clients into the implementation. +> Some of our users unfortunately never tested backups, so they only discovered version mismatches or partial/incremental backups missing data at the worst time - during restoration. -In case of queries, we need to set the `input_type` to `search_query` in the calls to Cohere API. +### Full Backups or Snapshot Restores? -```python -from fastapi import FastAPI, Depends -from typing import Annotated +For disaster recovery scenarios, you should use full backups instead of snapshots. Snapshots are primarily designed for moving data between clusters, whereas backups are intended for recovering the entire state of a cluster. -app = FastAPI() +**Figure:** Configuring a cluster backup from the Qdrant Cloud UI -def client() -> QdrantClient: - return QdrantClient(config.QDRANT_URL, api_key=config.QDRANT_API_KEY) +![Configure a cluster backup](/documentation/cloud/backup-schedule.png) -def cohere_client() -> cohere.Client: - return cohere.Client(api_key=config.COHERE_API_KEY) +**With full backups** you copy the entire dataset, including indexes and other configuration. This is great for completeness, but can be expensive and time-consuming. **Full snapshot recovery** is faster, but more complex to coordinate and restore the entire state. -@app.post("/search") -def search( - query: SearchQuery, - client: Annotated[QdrantClient, Depends(client)], - cohere_client: Annotated[cohere.Client, Depends(cohere_client)], -) -> SearchResults: - response = cohere_client.embed( - texts=[query.query], - model="embed-multilingual-v3.0", - input_type="search_query", - ) - results = client.query_points( - collection_name="personal-notes", - query=response.embeddings[0], - limit=2, - ).points - return SearchResults( - results=[\ - Document(**point.payload)\ - for point in results\ - ] - ) +> Snapshots are convenient because they create an archive of a collection or, at a more granular level, an archive of a shard that you can download and upload to another instance. Use this if you don't want to go through the long process of indexing. -``` +✅ **Set up regular snapshots or backups** and verify they can be restored if needed. -Our app might be launched locally for the development purposes, given we have the `uvicorn` server installed: +Whichever you choose, always test the restore process. Some teams only realize backups are incomplete or corrupt when a real disaster hits. -```shell -uvicorn main:app +### Best Ways To Backup Large Deployments -``` +If you host tens of billions of vectors, store backups off-node in a different data center or a remote repository. Also, confirm your restore bandwidth is sufficient. If the restore pipeline is slower than the local disk, it'll take far longer than expected. -FastAPI exposes an interactive documentation at `http://localhost:8000/docs`, where we can test our endpoint. The -`/search` endpoint is available there. +> To avoid mismatched versions after restoration, always preserve index configurations, such as quantization settings or HNSW parameters. 
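To make the "verify they can be restored" step concrete, here is a minimal sketch using the Python client: create a snapshot, confirm it exists, and rehearse recovering it into a separate instance. The collection and host names are placeholders, and the recovery URL simply points at the snapshot download endpoint of the source node.

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

# Create a snapshot of a single collection; Qdrant stores it on the node's disk.
snapshot = client.create_snapshot(collection_name="products")
print(snapshot.name, snapshot.creation_time)

# List what is available, e.g. from a scheduled job that verifies
# snapshots are actually being produced.
for s in client.list_snapshots(collection_name="products"):
    print(s.name, s.size)

# The part most teams forget to rehearse: recover the snapshot into a
# separate (staging) instance and run a few known queries against it.
staging = QdrantClient(url="http://staging-qdrant:6333")
staging.recover_snapshot(
    collection_name="products",
    location="http://qdrant-node-0:6333/collections/products/snapshots/" + snapshot.name,
)
```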
-![FastAPI documentation](https://qdrant.tech/documentation/tutorials/cohere-rag-connector/fastapi-openapi.png) +|| +|-| +|**Read More:** [**Snapshot Documentation**](https://qdrant.tech/documentation/concepts/snapshots/)| +|**Read More:** [**Managed Cloud Backup Documentation**](https://qdrant.tech/documentation/cloud/backups/)| +|**Read More:** [**Private Cloud Backup Documentation**](https://qdrant.tech/documentation/private-cloud/backups/)| -We can interact with it and check the documents that will be returned for a specific query. For example, we want to know -recall what we are supposed to do regarding the infrastructure for your projects. +## 5. Tips for Proper Database Administration +![vector-search-production](/articles_data/vector-search-production/vector-search-production-7.jpg) -```shell -curl -X "POST" \ - -H "Content-type: application/json" \ - -d '{"query": "Is there anything I have to do regarding the project infrastructure?"}' \ - "http://localhost:8000/search" +|| +|:-:| +|**"I keep running out of memory and my service is crashing."**| + +You most likely haven't allocated enough CPU and memory to your database, or you haven't matched your hardware resources to concurrency levels. In turn, the system will thrash or terminate under a heavy load. + +❓ **Use Case:** A mid-sized e-commerce company piloted a vector search solution to improve product discovery. Everything ran smoothly in testing, but once in production, memory errors, disk I/O spikes, and query delays piled up. + +Investigations showed they hadn't adjusted the default configuration or reserved dedicated storage for write-ahead logs. Their index repeatedly spilled to disk due to insufficient RAM, hurting performance. + +**✅ Allocate memory and CPU** that match your data volume and concurrency demands + +> Vector databases require careful resource management. If you don't align memory, CPU, and disk settings with real workloads, you'll face random slowdowns or partial failures under peak load. Sometimes that shows up as unpredictable latency. Other times, you'll see severe resource contention saturating CPU or disk. + +**Note:** Not enough CPU may just slow things down. Being out of memory can crash the system. -``` +|| +|-| +|**Read More:** [**Qdrant Configuration Documentation**](https://qdrant.tech/documentation/guides/configuration/)| -The output should look like following: +### Security & Governance -```json -{ - "results": [\ - {\ - "title": "Cloud Migration Strategy",\ - "text": "Draft a plan for migrating our current on-premise infrastructure to the cloud. The plan should cover the selection of a cloud provider, cost analysis, and a phased migration approach. Identify critical applications for the first phase and any potential risks or challenges. Schedule a meeting with the IT department to discuss the plan."\ - },\ - {\ - "title": "Project Alpha Review",\ - "text": "Review the current progress of Project Alpha, focusing on the integration of the new API. Check for any compatibility issues with the existing system and document the steps needed to resolve them. Schedule a meeting with the development team to discuss the timeline and any potential roadblocks."\ - }\ - ] -} +**Use Case:** A manufacturing company created a "Super Chatbot" that relied on many organizational components. The company needed to ensure secure communication between their application components - and transfer of data was paramount. -``` +✅ **Enable TLS/HTTPS** for encrypted traffic in production. 
-### [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#connecting-to-command-r) Connecting to Command-R +Enabling TLS/HTTPS is essential for meeting compliance requirements in regulated industries. This level of security is critical for companies that prioritize data privacy and security, such as those in finance, government and healthcare, helping to overcome potential security team objections. -Our web service is implemented, yet running only on our local machine. It has to be exposed to the public before -Command-R can interact with it. For a quick experiment, it might be enough to set up tunneling using services such as -[ngrok](https://ngrok.com/). We won’t cover all the details in the tutorial, but their -[Quickstart](https://ngrok.com/docs/guides/getting-started/) is a great resource describing the process step-by-step. -Alternatively, you can also deploy the service with a public URL. +> You need to protect data in transit. To enable TLS/HTTPS for encrypted traffic in production, you need to configure secure communication between clients and your Qdrant database, as well as individual cluster nodes. This involves implementing Transport Layer Security (TLS) certificates to encrypt all traffic, preventing unauthorized access and data interception. -Once it’s done, we can create the connector first, and then tell the model to use it, while interacting through the chat -API. Creating a connector is a single call to Cohere client: +If self-hosting, you can set up encryption yourself by [**incorporating TLS directly from the configuration**](https://qdrant.tech/documentation/guides/security/#tls) -```python -connector_response = cohere_client.connectors.create( - name="personal-notes", - url="https:/this-is-my-domain.app/search", -) +```text +service: + # Enable HTTPS for the REST and gRPC API + enable_tls: true + +# TLS configuration. +# Required if either service.enable_tls or cluster.p2p.enable_tls is true. +tls: + # Server certificate chain file + cert: ./tls/cert.pem + # Server private key file + key: ./tls/key.pem ``` -The `connector_response.connector` will be a descriptor, with `id` being one of the attributes. We’ll use this -identifier for our interactions like this: +|| +|-| +|**Read More:** [**Security Documentation**](https://qdrant.tech/documentation/guides/security/)| -```python -response = cohere_client.chat( - message=( - "Is there anything I have to do regarding the project infrastructure? " - "Please mention the tasks briefly." - ), - connectors=[\ - cohere.ChatConnector(id=connector_response.connector.id)\ - ], - model="command-r", -) +### Setting up Access Controls in Production -``` +**Use Case:** A large enterprise needed to implement access controls where "team A is able to access only collection A, B and C but not collection D" in their Qdrant database. -We changed the `model` to `command-r`, as this is currently the best Cohere model available to public. The -`response.text` is the output of the model: +✅ [**Set up Role-Based Access Control (RBAC)**](https://qdrant.tech/documentation/cloud-rbac/) to restrict actions by user or service. -```text -Here are some of the tasks related to project infrastructure that you might have to perform: -- You need to draft a plan for migrating your on-premise infrastructure to the cloud and come up with a plan for the selection of a cloud provider, cost analysis, and a gradual migration approach. -- It's important to evaluate your current technology stack to identify any outdated technologies. 
You should also research emerging technologies and the benefits they could bring to your projects. +Users can be invited attached to a specific role by inviting them through the **Role Details** page - just click on the Users tab and follow the prompts. Once accepted, they'll be assigned that role's permissions, along with the base role. -``` +Figure: Qdrant Cloud's interface for your database's Role Based Access Control +![image.png](/documentation/cloud/role-based-access-control/invite-user.png) -You only need to create a specific connector once! Please do not call `cohere_client.connectors.create` for every single -message you send to the `chat` method. +✅ **Use scoped API keys or auth tokens** to avoid over-permissioned services. -## [Anchor](https://qdrant.tech/documentation/examples/cohere-rag-connector/\#wrapping-up) Wrapping up +For their private cloud implementation, they set up JWT tokens manually. They incorporated these JWT tokens into their existing Role-Based Access Control system. They created tokens with specific roles, responsibilities, and labels derived from their SSO system. This enabled them to control access at multiple levels, including multi-tenancy and access through metadata fields -We have built a Cohere RAG connector that integrates with your existing knowledge base stored in Qdrant. We covered just -the basic flow, but in real world scenarios, you should also consider e.g. [building the authentication\\ -system](https://docs.cohere.com/docs/connector-authentication) to prevent unauthorized access. +✅ **Follow Principle of Least Privilege** when configuring access—only give permissions that are absolutely necessary. -##### Was this page useful? +For users of Qdrant's managed cloud service, there's an option to configure RBAC directly through the user interface, which automatically creates the role-based access control without requiring manual JWT configuration. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +|| +|-| +|**Read More:** [**Cloud RBAC Documentation**](https://qdrant.tech/documentation/cloud-rbac/)| -Thank you for your feedback! 🙏 +### Remember to Avoid These Common Pitfalls +![vector-search-production](/articles_data/vector-search-production/vector-search-production-8.jpg) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/cohere-rag-connector.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +❌ Don't forget to index payload fields—**not doing this will slow your search.** -On this page: +❌ Don't run without replication—**single-node setups are fragile**. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/cohere-rag-connector.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +❌ Don't create a collection per user/customer—**use multitenancy**. -× +❌ Don't run latency-critical search alongside heavy batch jobs—**separate workloads**. -[Powered by](https://qdrant.tech/) +❌ Don't skip quantization—it can **greatly reduce memory footprint and speed up searches**. 
-<|page-43-lllmstxt|> -## platform-deployment-options -- [Documentation](https://qdrant.tech/documentation/) -- [Hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/) -- Deployment Platforms +❌ Don't keep outdated Qdrant versions running—**update regularly**. -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#qdrant-hybrid-cloud-hosting-platforms--deployment-options) Qdrant Hybrid Cloud: Hosting Platforms & Deployment Options +## Conclusion -This page provides an overview of how to deploy Qdrant Hybrid Cloud on various managed Kubernetes platforms. +In conclusion, **vector search in production** isn't tied to a specific cloud provider or infrastructure. The same core principles of **careful configuration, robust ingestion/indexing, intelligent scaling, thorough backups, strong observability, and security** apply universally. By embracing these fundamentals, you'll deliver fast, reliable, and scalable search for your users, regardless of where your hardware or services run. -For a general list of prerequisites and installation steps, see our [Hybrid Cloud setup guide](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). This platform specific documentation also applies to Qdrant Private Cloud. +![vector-search-production](/articles_data/vector-search-production/vector-search-production-9.jpg) -![Akamai](https://qdrant.tech/documentation/cloud/cloud-providers/akamai.jpg) +<|page-78-lllmstxt|> +## What is Semantic Cache? -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#akamai-linode) Akamai (Linode) +**Semantic cache** is a method of retrieval optimization, where similar queries instantly retrieve the same appropriate response from a knowledge base. -[The Linode Kubernetes Engine (LKE)](https://www.linode.com/products/kubernetes/) is a managed container orchestration engine built on top of Kubernetes. LKE enables you to quickly deploy and manage your containerized applications without needing to build (and maintain) your own Kubernetes cluster. All LKE instances are equipped with a fully managed control plane at no additional cost. +Semantic cache differs from traditional caching methods. In computing, **cache** refers to high-speed memory that efficiently stores frequently accessed data. In the context of [vector databases](/articles/what-is-a-vector-database/), a **semantic cache** improves AI application performance by storing previously retrieved results along with the conditions under which they were computed. This allows the application to reuse those results when the same or similar conditions occur again, rather than finding them from scratch. -First, consult Linode’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on LKE**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +> The term **"semantic"** implies that the cache takes into account the meaning or semantics of the data or computation being cached, rather than just its syntactic representation. This can lead to more efficient caching strategies that exploit the structure or relationships within the data or computation. 
-### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-linode-kubernetes-engine) More on Linode Kubernetes Engine +![semantic-cache-question](/articles_data/semantic-cache-ai-data-retrieval/semantic-cache-question.png) -- [Getting Started with LKE](https://www.linode.com/docs/products/compute/kubernetes/get-started/) -- [LKE Guides](https://www.linode.com/docs/products/compute/kubernetes/guides/) -- [LKE API Reference](https://www.linode.com/docs/api/) +Traditional caches operate on an exact match basis, while semantic caches search for the meaning of the key rather than an exact match. For example, **"What is the capital of Brazil?"** and **"Can you tell me the capital of Brazil?"** are semantically equivalent, but not exact matches. A semantic cache recognizes such semantic equivalence and provides the correct result. -At the time of writing, Linode [does not support CSI Volume Snapshots](https://github.com/linode/linode-blockstorage-csi-driver/issues/107). +In this blog and video, we will walk you through how to use Qdrant to implement a basic semantic cache system. You can also try the [notebook example](https://github.com/infoslack/qdrant-example/blob/main/semantic-cache.ipynb) for this implementation. -![AWS](https://qdrant.tech/documentation/cloud/cloud-providers/aws.jpg) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/infoslack/qdrant-example/blob/main/semantic-cache.ipynb) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#amazon-web-services-aws) Amazon Web Services (AWS) +## Semantic Cache in RAG: the Key-Value Mechanism -[Amazon Elastic Kubernetes Service (Amazon EKS)](https://aws.amazon.com/eks/) is a managed service to run Kubernetes in the AWS cloud and on-premises data centers which can then be paired with Qdrant’s hybrid cloud. With Amazon EKS, you can take advantage of all the performance, scale, reliability, and availability of AWS infrastructure, as well as integrations with AWS networking and security services. +Semantic cache is increasingly used in Retrieval-Augmented Generation (RAG) applications. In RAG, when a user asks a question, we embed it and search our vector database, either by using keyword, semantic, or hybrid search methods. The matched context is then passed to a Language Model (LLM) along with the prompt and user question for response generation. -First, consult AWS’ managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on AWS**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +Qdrant is recommended for setting up semantic cache as semantically [evaluates](https://qdrant.tech/rag/rag-evaluation-guide/) the response. When semantic cache is implemented, we store common questions and their corresponding answers in a key-value cache. This way, when a user asks a question, we can retrieve the response from the cache if it already exists. -For a good balance between peformance and cost, we recommend: +**Diagram:** Semantic cache improves [RAG](https://qdrant.tech/rag/rag-evaluation-guide/) by directly retrieving stored answers to the user. **Follow along with the gif** and see how semantic cache stores and retrieves answers. -- Depending on your cluster resource configuration either general purpose (m6\*, m7\*, or m8\*), memory optimized (r6\*, r7\*, or r8\*) or cpu optimized (c6\*, c7\*, or c8\*) instance types. 
Qdrant Hybrid Cloud also supports AWS Graviton ARM64 instances. -- At least gp3 EBS volumes for storage +![Alt Text](/articles_data/semantic-cache-ai-data-retrieval/semantic-cache.gif) -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-amazon-elastic-kubernetes-service) More on Amazon Elastic Kubernetes Service +When using a key-value cache, it's important to consider that slight variations in question wording can lead to different hash values. The two questions convey the same query but differ in wording. A naive cache search might fail due to distinct hashed versions of the questions. Implementing a more nuanced approach is necessary to accommodate phrasing variations and ensure accurate responses. -- [Getting Started with Amazon EKS](https://docs.aws.amazon.com/eks/) -- [Amazon EKS User Guide](https://docs.aws.amazon.com/eks/latest/userguide/what-is-eks.html) -- [Amazon EKS API Reference](https://docs.aws.amazon.com/eks/latest/APIReference/Welcome.html) +To address this challenge, a semantic cache can be employed instead of relying solely on exact matches. This entails storing questions, answers, and their embeddings in a key-value structure. -Your EKS cluster needs the EKS EBS CSI driver or a similar storage driver: +When a user poses a question, a semantic search by Qdrant is conducted across all cached questions to identify the most similar one. If the similarity score surpasses a predefined threshold, the system assumes equivalence between the user's question and the matched one, providing the corresponding answer accordingly. -- [Amazon EBS CSI Driver](https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html) +## Benefits of Semantic Cache for AI Applications -To allow vertical scaling, you need a StorageClass with volume expansion enabled: +Semantic cache contributes to scalability in AI applications by making it simpler to retrieve common queries from vast datasets. The retrieval process can be computationally intensive and implementing a cache component can reduce the load. -- [Amazon EBS CSI Volume Resizing](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/examples/kubernetes/resizing/README.md) +For instance, if hundreds of users repeat the same question, the system can retrieve the precomputed answer from the cache rather than re-executing the entire process. This cache stores questions as keys and their corresponding answers as values, providing an efficient means to handle repeated queries. -```yaml -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - annotations: - storageclass.kubernetes.io/is-default-class: "true" - name: ebs-sc -provisioner: ebs.csi.aws.com -reclaimPolicy: Delete -volumeBindingMode: WaitForFirstConsumer -allowVolumeExpansion: true +> There are **potential cost savings** associated with utilizing semantic cache. Using a semantic cache eliminates the need for repeated searches and generation processes for similar or duplicate questions, thus saving time and LLM API resources, especially when employing costly language model calls like OpenAI's. -``` +## When to Use Semantic Cache? -To allow backups and restores, your EKS cluster needs the CSI snapshot controller: +For applications like question-answering systems where facts are retrieved from documents, caching is beneficial due to the consistent nature of the queries. 
*However, for text generation tasks requiring varied responses, caching may not be ideal as it returns previous responses, potentially limiting variation.* Thus, the decision to use caching depends on the specific use case. -- [Amazon EBS CSI Snapshot Controller](https://docs.aws.amazon.com/eks/latest/userguide/csi-snapshot-controller.html) +Using a cache might not be ideal for applications where diverse responses are desired across multiple queries. However, in question-answering systems, caching is advantageous since variations are insignificant. It serves as an effective performance optimization tool for chatbots by storing frequently accessed data. -And you need to create a VolumeSnapshotClass: +One strategy involves creating ad-hoc patches for chatbot dialogues, where commonly asked questions are pre-mapped to prepared responses in the cache. This allows the chatbot to swiftly retrieve and deliver responses without relying on a Language Model (LLM) for each query. -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: ebs.csi.aws.com +## Implement Semantic Cache: A Step-by-Step Guide -``` +The first part of this video explains how caching works. In the second part, you can follow along with the code with our [notebook example](https://github.com/infoslack/qdrant-example/blob/main/semantic-cache.ipynb). -![Civo](https://qdrant.tech/documentation/cloud/cloud-providers/civo.jpg) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/infoslack/qdrant-example/blob/main/semantic-cache.ipynb) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#civo) Civo +
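As a rough illustration of the threshold-based lookup described above (not the notebook's exact code), the sketch below assumes a `semantic-cache` collection created with cosine distance, where each point stores a question embedding plus the generated answer in its payload. The collection name, threshold value, and helper names are illustrative.

```python
import uuid
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")
CACHE = "semantic-cache"   # assumed collection of embedded questions
THRESHOLD = 0.85           # with cosine similarity, higher scores mean closer matches


def get_cached_answer(question_vector: list[float]) -> str | None:
    """Return a cached answer if a semantically similar question already exists."""
    hits = client.query_points(
        collection_name=CACHE,
        query=question_vector,
        limit=1,
        with_payload=True,
    ).points
    if hits and hits[0].score >= THRESHOLD:
        return hits[0].payload["answer"]
    return None


def cache_answer(question: str, question_vector: list[float], answer: str) -> None:
    """Store a freshly generated answer so similar questions can reuse it."""
    client.upsert(
        collection_name=CACHE,
        points=[
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector=question_vector,
                payload={"question": question, "answer": answer},
            )
        ],
    )
```

On a cache miss, the application falls back to the full retrieval and generation pipeline and then calls `cache_answer`, so the next semantically similar question is served directly from the cache.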

-[Civo Kubernetes](https://www.civo.com/kubernetes) is a robust, scalable, and managed Kubernetes service. Civo supplies a CNCF-compliant Kubernetes cluster and makes it easy to provide standard Kubernetes applications and containerized workloads. User-defined Kubernetes clusters can be created as self-service without complications using the Civo Portal. +## Embrace the Future of AI Data Retrieval -First, consult Civo’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Civo**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +[Qdrant](https://github.com/qdrant/qdrant) offers the most flexible way to implement vector search for your RAG and AI applications. You can test out semantic cache on your free Qdrant Cloud instance today! Simply sign up for or log into your [Qdrant Cloud account](https://cloud.qdrant.io/login) and follow our [documentation](/documentation/cloud/). -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-civo-kubernetes) More on Civo Kubernetes +You can also deploy Qdrant locally and manage via our UI. To do this, check our [Hybrid Cloud](/blog/hybrid-cloud/)! -- [Getting Started with Civo Kubernetes](https://www.civo.com/docs/kubernetes) -- [Civo Tutorials](https://www.civo.com/learn) -- [Frequently Asked Questions on Civo](https://www.civo.com/docs/faq) +[![hybrid-cloud-get-started](/blog/hybrid-cloud-launch-partners/hybrid-cloud-get-started.png)](https://cloud.qdrant.io/login) -To allow backups and restores, you need to create a VolumeSnapshotClass: +<|page-79-lllmstxt|> +Qdrant is designed as an efficient vector database, allowing for a quick search of the nearest neighbours. But, you may find yourself in need of applying some extra filtering on top of the semantic search. Up to version 0.10, Qdrant was offering support for keywords only. Since 0.10, there is a possibility to apply full-text constraints as well. There is a new type of filter that you can use to do that, also combined with every other filter type. -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: csi.civo.com +## Using full-text filters without the payload index -``` +Full-text filters without the index created on a field will return only those entries which contain all the terms included in the query. That is effectively a substring match on all the individual terms but **not a substring on a whole query**. -![Digital Ocean](https://qdrant.tech/documentation/cloud/cloud-providers/digital-ocean.jpg) +![](/blog/from_cms/1_ek61_uvtyn89duqtmqqztq.webp "An example of how to search for “long_sleeves” in a “detail_desc” payload field.") -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#digital-ocean) Digital Ocean +## Full-text search behaviour on an indexed payload field -[DigitalOcean Kubernetes (DOKS)](https://www.digitalocean.com/products/kubernetes) is a managed Kubernetes service that lets you deploy Kubernetes clusters without the complexities of handling the control plane and containerized infrastructure. Clusters are compatible with standard Kubernetes toolchains and integrate natively with DigitalOcean Load Balancers and volumes. +There are more options if you create a full-text index on a field you will filter by. -First, consult Digital Ocean’s managed Kubernetes instructions below. 
Then, **to set up Qdrant Hybrid Cloud on DigitalOcean**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +![](/blog/from_cms/1_pohx4eznqpgoxak6ppzypq.webp "Full-text search behaviour on an indexed payload field There are more options if you create a full-text index on a field you will filter by.") -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-digitalocean-kubernetes) More on DigitalOcean Kubernetes +First and foremost, you can choose the tokenizer. It defines how Qdrant should split the text into tokens. There are three options available: -- [Getting Started with DOKS](https://docs.digitalocean.com/products/kubernetes/getting-started/quickstart/) -- [DOKS - How To Guides](https://docs.digitalocean.com/products/kubernetes/how-to/) -- [DOKS - Reference Manual](https://docs.digitalocean.com/products/kubernetes/reference/) +* **word** — spaces, punctuation marks and special characters define the token boundaries +* **whitespace** — token boundaries defined by whitespace characters +* **prefix** — token boundaries are the same as for the “word” tokenizer, but in addition to that, there are prefixes created for every single token. As a result, “Qdrant” will be indexed as “Q”, “Qd”, “Qdr”, “Qdra”, “Qdran”, and “Qdrant”. -![Gcore](https://qdrant.tech/documentation/cloud/cloud-providers/gcore.svg) +There are also some additional parameters you can provide, such as -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#gcore) Gcore +* **min_token_len** — minimal length of the token +* **max_token_len** — maximal length of the token +* **lowercase** — if set to *true*, then the index will be case-insensitive, as Qdrant will convert all the texts to lowercase -[Gcore Managed Kubernetes](https://gcore.com/cloud/managed-kubernetes) is a managed container orchestration engine built on top of Kubernetes. Gcore enables you to quickly deploy and manage your containerized applications without needing to build (and maintain) your own Kubernetes cluster. All Gcore instances are equipped with a fully managed control plane at no additional cost. +## Using text filters in practice -First, consult Gcore’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Gcore**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +![](/blog/from_cms/1_pbtd2tzqtjqqlbi61r8czg.webp "There are also some additional parameters you can provide, such as min_token_len — minimal length of the token max_token_len — maximal length of the token lowercase — if set to true, then the index will be case-insensitive, as Qdrant will convert all the texts to lowercase Using text filters in practice") -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-gcore-kubernetes-engine) More on Gcore Kubernetes Engine +The main difference between using full-text filters on the indexed vs non-indexed field is the performance of such query. 
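As a rough sketch of what creating and using such an index can look like with the Python client, the snippet below reuses the `detail_desc` field from the example above; the collection name, tokenizer parameters, and query vector are illustrative.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Build a full-text index on the payload field we filter by most often.
client.create_payload_index(
    collection_name="hm-products",            # assumed collection name
    field_name="detail_desc",
    field_schema=models.TextIndexParams(
        type=models.TextIndexType.TEXT,
        tokenizer=models.TokenizerType.WORD,  # or WHITESPACE / PREFIX
        min_token_len=2,
        max_token_len=20,
        lowercase=True,
    ),
)

# The same MatchText condition works with or without the index,
# but the indexed version avoids scanning every payload.
hits = client.query_points(
    collection_name="hm-products",
    query=[0.1] * 384,                        # placeholder; must match the collection's vector size
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="detail_desc",
                match=models.MatchText(text="long sleeves"),
            )
        ]
    ),
    limit=10,
)
```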
In a simple benchmark, performed on the [H&M dataset](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations) (with over 105k examples), the average query time looks as follows (n=1000): -- [Getting Started with Kubnetes on Gcore](https://gcore.com/docs/cloud/kubernetes/about-gcore-kubernetes) +![](/blog/from_cms/screenshot_31.png) -![Google Cloud Platform](https://qdrant.tech/documentation/cloud/cloud-providers/gcp.jpg) +It is evident that creating a filter on a field that we’ll query often, may lead us to substantial performance gains without much effort. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#google-cloud-platform-gcp) Google Cloud Platform (GCP) +<|page-80-lllmstxt|> +# How to Optimize Vector Storage by Storing Multiple Vectors Per Object -[Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) is a managed Kubernetes service that you can use to deploy and operate containerized applications at scale using Google’s infrastructure. GKE provides the operational power of Kubernetes while managing many of the underlying components, such as the control plane and nodes, for you. +In a real case scenario, a single object might be described in several different ways. If you run an e-commerce business, then your items will typically have a name, longer textual description and also a bunch of photos. While cooking, you may care about the list of ingredients, and description of the taste but also the recipe and the way your meal is going to look. Up till now, if you wanted to enable [semantic search](https://qdrant.tech/documentation/tutorials/search-beginners/) with multiple vectors per object, Qdrant would require you to create separate collections for each vector type, even though they could share some other attributes in a payload. However, since Qdrant 0.10 you are able to store all those vectors together in the same collection and share a single copy of the payload! -First, consult GCP’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on GCP**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +Running the new version of Qdrant is as simple as it always was. By running the following command, you are able to set up a single instance that will also expose the HTTP API: -For a good balance between peformance and cost, we recommend: -- Depending on your cluster resource configuration either general purpose (standard), memory optimized (highmem) or cpu optimized (highcpu) instance types of at least 2nd generation. Qdrant Hybrid Cloud also supports ARM64 instances. -- At least pd-balanced disks for storage +``` +docker run -p 6333:6333 qdrant/qdrant:v0.10.1 +``` -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-the-google-kubernetes-engine) More on the Google Kubernetes Engine +## Creating a collection -- [Getting Started with GKE](https://cloud.google.com/kubernetes-engine/docs/quickstart) -- [GKE Tutorials](https://cloud.google.com/kubernetes-engine/docs/tutorials) -- [GKE Documentation](https://cloud.google.com/kubernetes-engine/docs/) +Adding new functionalities typically requires making some changes to the interfaces, so no surprise we had to do it to enable the multiple vectors support. Currently, if you want to create a collection, you need to define the configuration of all the vectors you want to store for each object. 
Each vector type has its own name and the distance function used to measure how far the points are. -To allow backups and restores, your GKE cluster needs the CSI VolumeSnapshot controller and class: +```python +from qdrant_client import QdrantClient +from qdrant_client.http.models import VectorParams, Distance -- [Google GKE Volume Snapshots](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/volume-snapshots) +client = QdrantClient() +client.create_collection( + collection_name="multiple_vectors", + vectors_config={ + "title": VectorParams( + size=100, + distance=Distance.EUCLID, + ), + "image": VectorParams( + size=786, + distance=Distance.COSINE, + ), + } +) +``` -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: pd.csi.storage.gke.io +In case you want to keep a single vector per collection, you can still do it without putting a name though. +```python +client.create_collection( + collection_name="single_vector", + vectors_config=VectorParams( + size=100, + distance=Distance.COSINE, + ) +) ``` -![Microsoft Azure](https://qdrant.tech/documentation/cloud/cloud-providers/azure.jpg) +All the search-related operations have slightly changed their interfaces as well, so you can choose which vector to use in a specific request. However, it might be easier to see all the changes by following an end-to-end Qdrant usage on a real-world example. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#mircrosoft-azure) Mircrosoft Azure +## Building service with multiple embeddings -With [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-in/products/kubernetes-service), you can start developing and deploying cloud-native apps in Azure, data centres, or at the edge. Get unified management and governance for on-premises, edge, and multi-cloud Kubernetes clusters. Interoperate with Azure security, identity, cost management, and migration services. +Quite a common approach to building search engines is to combine semantic textual capabilities with image search as well. For that purpose, we need a dataset containing both images and their textual descriptions. There are several datasets available with [MS_COCO_2017_URL_TEXT](https://huggingface.co/datasets/ChristophSchuhmann/MS_COCO_2017_URL_TEXT) being probably the simplest available. And because it’s available on HuggingFace, we can easily use it with their [datasets](https://huggingface.co/docs/datasets/index) library. -First, consult Azure’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Azure**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +```python +from datasets import load_dataset -For a good balance between peformance and cost, we recommend: +dataset = load_dataset("ChristophSchuhmann/MS_COCO_2017_URL_TEXT") +``` -- Depending on your cluster resource configuration either general purpose (D-family), memory optimized (E-family) or cpu optimized (F-family) instance types. Qdrant Hybrid Cloud also supports Azure Cobalt ARM64 instances. -- At least Premium SSD v2 disks for storage +Right now, we have a dataset with a structure containing the image URL and its textual description in English. For simplicity, we can convert it to the DataFrame, as this structure might be quite convenient for future processing. 
-### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-azure-kubernetes-service) More on Azure Kubernetes Service +```python +import pandas as pd -- [Getting Started with AKS](https://learn.microsoft.com/en-us/azure/architecture/reference-architectures/containers/aks-start-here) -- [AKS Documentation](https://learn.microsoft.com/en-in/azure/aks/) -- [Best Practices with AKS](https://learn.microsoft.com/en-in/azure/aks/best-practices) +dataset_df = pd.DataFrame(dataset["train"]) +``` -To allow backups and restores, your AKS cluster needs the CSI VolumeSnapshot controller and class: +The dataset consists of two columns: *TEXT* and *URL*. Thus, each data sample is described by two separate pieces of information and each of them has to be encoded with a different model. -- [Azure AKS Volume Snapshots](https://learn.microsoft.com/en-us/azure/aks/azure-disk-csi#create-a-volume-snapshot) +## Processing the data with pretrained models -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: disk.csi.azure.com +Thanks to [embetter](https://github.com/koaning/embetter), we can reuse some existing pretrained models and use a convenient scikit-learn API, including pipelines. This library also provides some utilities to load the images, but only supports the local filesystem, so we need to create our own class that will download the file, given its URL. + +```python +from pathlib import Path +from urllib.request import urlretrieve +from embetter.base import EmbetterBase + +class DownloadFile(EmbetterBase): + def __init__(self, out_dir: Path): + self.out_dir = out_dir + + def transform(self, X, y=None): + output_paths = [] + for x in X: + output_file = self.out_dir / Path(x).name + urlretrieve(x, output_file) + output_paths.append(str(output_file)) + return output_paths ``` -![Oracle Cloud Infrastructure](https://qdrant.tech/documentation/cloud/cloud-providers/oracle.jpg) +Now we’re ready to define the pipelines to process our images and texts using *all-MiniLM-L6-v2* and *vit_base_patch16_224* models respectively. First of all, let’s start with Qdrant configuration. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#oracle-cloud-infrastructure) Oracle Cloud Infrastructure +## Creating Qdrant collection -[Oracle Cloud Infrastructure Container Engine for Kubernetes (OKE)](https://www.oracle.com/in/cloud/cloud-native/container-engine-kubernetes/) is a managed Kubernetes solution that enables you to deploy Kubernetes clusters while ensuring stable operations for both the control plane and the worker nodes through automatic scaling, upgrades, and security patching. Additionally, OKE offers a completely serverless Kubernetes experience with virtual nodes. +We’re going to put two vectors per object (one for image and another one for text), so we need to create a collection with a configuration allowing us to do so. -First, consult OCI’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on OCI**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). 
+```python +from qdrant_client import QdrantClient +from qdrant_client.http.models import VectorParams, Distance -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-oci-container-engine) More on OCI Container Engine +client = QdrantClient(timeout=None) +client.create_collection( + collection_name="ms-coco-2017", + vectors_config={ + "text": VectorParams( + size=384, + distance=Distance.EUCLID, + ), + "image": VectorParams( + size=1000, + distance=Distance.COSINE, + ), + }, +) +``` -- [Getting Started with OCI](https://docs.oracle.com/en-us/iaas/Content/ContEng/home.htm) -- [Frequently Asked Questions on OCI](https://www.oracle.com/in/cloud/cloud-native/container-engine-kubernetes/faq/) -- [OCI Product Updates](https://docs.oracle.com/en-us/iaas/releasenotes/services/conteng/) +## Defining the pipelines -To allow backups and restores, your OCI cluster needs the CSI VolumeSnapshot controller and class: +And since we have all the puzzles already in place, we can start the processing to convert raw data into the embeddings we need. The pretrained models come in handy. -- [Prerequisites for Creating Volume Snapshots](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingpersistentvolumeclaim_topic-Provisioning_PVCs_on_BV.htm#contengcreatingpersistentvolumeclaim_topic-Provisioning_PVCs_on_BV-PV_From_Snapshot_CSI__section_volume-snapshot-prerequisites) +```python +from sklearn.pipeline import make_pipeline +from embetter.grab import ColumnGrabber +from embetter.vision import ImageLoader, TimmEncoder +from embetter.text import SentenceEncoder -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: blockvolume.csi.oraclecloud.com +output_directory = Path("./images") + +image_pipeline = make_pipeline( + ColumnGrabber("URL"), + DownloadFile(output_directory), + ImageLoader(), + TimmEncoder("vit_base_patch16_224"), +) +text_pipeline = make_pipeline( + ColumnGrabber("TEXT"), + SentenceEncoder("all-MiniLM-L6-v2"), +) ``` -![OVHcloud](https://qdrant.tech/documentation/cloud/cloud-providers/ovh.jpg) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#ovhcloud) OVHcloud +Thanks to the scikit-learn API, we can simply call each pipeline on the created DataFrame and put created vectors into Qdrant to enable fast vector search. For convenience, we’re going to put the vectors as other columns in our DataFrame. -[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/), powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility. +```python +sample_df = dataset_df.sample(n=2000, random_state=643) +image_vectors = image_pipeline.transform(sample_df) +text_vectors = text_pipeline.transform(sample_df) +sample_df["image_vector"] = image_vectors.tolist() +sample_df["text_vector"] = text_vectors.tolist() +``` -First, consult OVHcloud’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on OVHcloud**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). 
-### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-service-managed-kubernetes-by-ovhcloud) More on Service Managed Kubernetes by OVHcloud +The created vectors might be easily put into Qdrant. For the sake of simplicity, we’re going to skip it, but if you are interested in details, please check out the [Jupyter notebook](https://gist.github.com/kacperlukawski/961aaa7946f55110abfcd37fbe869b8f) going step by step. -- [Getting Started with OVH Managed Kubernetes](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s-getting-started) -- [OVH Managed Kubernetes Documentation](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s) -- [OVH Managed Kubernetes Tutorials](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s-tutorials) +## Searching with multiple vectors -![Red Hat](https://qdrant.tech/documentation/cloud/cloud-providers/redhat.jpg) +If you decided to describe each object with several [neural embeddings](https://qdrant.tech/articles/neural-search-tutorial/), then at each search operation you need to provide the vector name along with the [vector embedding](https://qdrant.tech/articles/what-are-embeddings/), so the engine knows which one to use. The interface of the search operation is pretty straightforward and requires an instance of NamedVector. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#red-hat-openshift) Red Hat OpenShift +```python +from qdrant_client.http.models import NamedVector -[Red Hat OpenShift Kubernetes Engine](https://www.redhat.com/en/technologies/cloud-computing/openshift/kubernetes-engine) provides you with the basic functionality of Red Hat OpenShift. It offers a subset of the features that Red Hat OpenShift Container Platform offers, like full access to an enterprise-ready Kubernetes environment and an extensive compatibility test matrix with many of the software elements that you might use in your data centre. +text_results = client.search( + collection_name="ms-coco-2017", + query_vector=NamedVector( + name="text", + vector=row["text_vector"], + ), + limit=5, + with_vectors=False, + with_payload=True, +) +``` -First, consult Red Hat’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Red Hat OpenShift**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +If we, on the other hand, decided to search using the image embedding, then we just provide the vector name we have chosen while creating the collection, so instead of “text”, we would provide “image”, as this is how we configured it at the very beginning. 
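For completeness, a rough sketch of the upload step the walkthrough skips might look like the following, reusing the `client` and the `sample_df` columns created earlier; the point ids, payload fields, and batch size are illustrative rather than taken from the notebook.

```python
from qdrant_client.http.models import PointStruct

points = [
    PointStruct(
        id=idx,
        vector={
            "text": row["text_vector"],    # names must match the collection config
            "image": row["image_vector"],
        },
        payload={"url": row["URL"], "caption": row["TEXT"]},
    )
    for idx, row in sample_df.reset_index(drop=True).iterrows()
]

# Upload in moderate batches to keep individual request sizes reasonable.
for start in range(0, len(points), 64):
    client.upsert(
        collection_name="ms-coco-2017",
        points=points[start:start + 64],
    )
```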
-### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-openshift-kubernetes-engine) More on OpenShift Kubernetes Engine +## The results: image vs text search -- [Getting Started with Red Hat OpenShift Kubernetes](https://docs.openshift.com/container-platform/4.15/getting_started/kubernetes-overview.html) -- [Red Hat OpenShift Kubernetes Documentation](https://docs.openshift.com/container-platform/4.15/welcome/index.html) -- [Installing on Container Platforms](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.5/html/installing/index) +Since we have two different vectors describing each object, we can perform the search query using any of those. That shouldn’t be surprising then, that the results are different depending on the chosen embedding method. The images below present the results returned by Qdrant for the image/text on the left-hand side. -Qdrant databases need a persistent storage solution. See [Openshift Storage Overview](https://docs.openshift.com/container-platform/4.15/storage/index.html). +### Image search -To allow vertical scaling, you need a StorageClass with [volume expansion enabled](https://docs.openshift.com/container-platform/4.15/storage/expanding-persistent-volumes.html). +If we query the system using image embedding, then it returns the following results: -To allow backups and restores, your OpenShift cluster needs the [CSI snapshot controller](https://docs.openshift.com/container-platform/4.15/storage/container_storage_interface/persistent-storage-csi-snapshots.html), and you need to create a VolumeSnapshotClass. +![](/blog/from_cms/0_5nqlmjznjkvdrjhj.webp "Image search results") -![Scaleway](https://qdrant.tech/documentation/cloud/cloud-providers/scaleway.jpg) +### Text search -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#scaleway) Scaleway +However, if we use textual description embedding, then the results are slightly different: -[Scaleway Kapsule](https://www.scaleway.com/en/kubernetes-kapsule/) and [Kosmos](https://www.scaleway.com/en/kubernetes-kosmos/) are managed Kubernetes services from [Scaleway](https://www.scaleway.com/en/). They abstract away the complexities of managing and operating a Kubernetes cluster. The primary difference being, Kapsule clusters are composed solely of Scaleway Instances. Whereas, a Kosmos cluster is a managed multi-cloud Kubernetes engine that allows you to connect instances from any cloud provider to a single managed Control-Plane. +![](/blog/from_cms/0_3sdgctswb99xtexl.webp "Text search However, if we use textual description embedding, then the results are slightly different:") -First, consult Scaleway’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Scaleway**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +It is not surprising that a method used for creating neural encoding plays an important role in the search process and its quality. If your data points might be described using several vectors, then the latest release of Qdrant gives you an opportunity to store them together and reuse the payloads, instead of creating several collections and querying them separately. 
-### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-scaleway-kubernetes) More on Scaleway Kubernetes +### Summary: -- [Getting Started with Scaleway Kubernetes](https://www.scaleway.com/en/docs/containers/kubernetes/quickstart/#how-to-add-a-scaleway-pool-to-a-kubernetes-cluster) -- [Scaleway Kubernetes Documentation](https://www.scaleway.com/en/docs/containers/kubernetes/) -- [Frequently Asked Questions on Scaleway Kubernetes](https://www.scaleway.com/en/docs/faq/kubernetes/) +- Qdrant 0.10 introduces efficient vector storage optimization, allowing seamless management of multiple vectors per object within a single collection. +- This update streamlines semantic search capabilities by eliminating the need for separate collections for each vector type, enhancing search accuracy and performance. +- With Qdrant's new features, users can easily configure vector parameters, including size and distance functions, for each vector type, optimizing search results and user experience. -![STACKIT](https://qdrant.tech/documentation/cloud/cloud-providers/stackit.jpg) -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#stackit) STACKIT +If you’d like to check out some other examples, please check out our [full notebook](https://gist.github.com/kacperlukawski/961aaa7946f55110abfcd37fbe869b8f) presenting the search results and the whole pipeline implementation. -[STACKIT Kubernetes Engine (SKE)](https://www.stackit.de/en/product/kubernetes/) is a robust, scalable, and managed Kubernetes service. SKE supplies a CNCF-compliant Kubernetes cluster and makes it easy to provide standard Kubernetes applications and containerized workloads. User-defined Kubernetes clusters can be created as self-service without complications using the STACKIT Portal. +<|page-81-lllmstxt|> +# How to Optimize Vector Search Using Batch Search in Qdrant 0.10.0 -First, consult STACKIT’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on STACKIT**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +The latest release of Qdrant 0.10.0 has introduced a lot of functionalities that simplify some common tasks. Those new possibilities come with some slightly modified interfaces of the client library. One of the recently introduced features is the possibility to query the collection with [multiple vectors](https://qdrant.tech/blog/storing-multiple-vectors-per-object-in-qdrant/) at once — a batch search mechanism. -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-stackit-kubernetes-engine) More on STACKIT Kubernetes Engine +There are a lot of scenarios in which you may need to perform multiple non-related tasks at the same time. Previously, you only could send several requests to Qdrant API on your own. But multiple parallel requests may cause significant network overhead and slow down the process, especially in case of poor connection speed. -- [Getting Started with SKE](https://docs.stackit.cloud/stackit/en/getting-started-ske-10125565.html) -- [SKE Tutorials](https://docs.stackit.cloud/stackit/en/tutorials-ske-66683162.html) -- [Frequently Asked Questions on SKE](https://docs.stackit.cloud/stackit/en/faq-known-issues-of-ske-28476393.html) +Now, thanks to the new batch search, you don’t need to worry about that. 
Qdrant will handle multiple search requests in just one API call and will perform those requests in the most optimal way. -To allow backups and restores, you need to create a VolumeSnapshotClass: +## An example of using batch search to optimize vector search -```yaml -apiVersion: snapshot.storage.k8s.io/v1 -kind: VolumeSnapshotClass -metadata: - name: csi-snapclass -deletionPolicy: Delete -driver: cinder.csi.openstack.org +We’ve used the official Python client to show how the batch search might be integrated with your application. Since there have been some changes in the interfaces of Qdrant 0.10.0, we’ll go step by step. -``` +### Step 1: Creating the collection -![Vultr](https://qdrant.tech/documentation/cloud/cloud-providers/vultr.jpg) +The first step is to create a collection with a specified configuration — at least vector size and the distance function used to measure the similarity between vectors. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#vultr) Vultr +```python +from qdrant_client import QdrantClient +from qdrant_client.conversions.common_types import VectorParams -[Vultr Kubernetes Engine (VKE)](https://www.vultr.com/kubernetes/) is a fully-managed product offering with predictable pricing that makes Kubernetes easy to use. Vultr manages the control plane and worker nodes and provides integration with other managed services such as Load Balancers, Block Storage, and DNS. +client = QdrantClient("localhost", 6333) +if not client.collection_exists('test_collection'): + client.create_collection( + collection_name="test_collection", + vectors_config=VectorParams(size=4, distance=Distance.EUCLID), +) +``` -First, consult Vultr’s managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Vultr**, follow our [step-by-step documentation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/). +## Step 2: Loading the vectors -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#more-on-vultr-kubernetes-engine) More on Vultr Kubernetes Engine +With the collection created, we can put some vectors into it. We’re going to have just a few examples. -- [VKE Guide](https://docs.vultr.com/vultr-kubernetes-engine) -- [VKE Documentation](https://docs.vultr.com/) -- [Frequently Asked Questions on VKE](https://docs.vultr.com/vultr-kubernetes-engine#frequently-asked-questions) +```python +vectors = [ + [.1, .0, .0, .0], + [.0, .1, .0, .0], + [.0, .0, .1, .0], + [.0, .0, .0, .1], + [.1, .0, .1, .0], + [.0, .1, .0, .1], + [.1, .1, .0, .0], + [.0, .0, .1, .1], + [.1, .1, .1, .1], +] -At the time of writing, Vultr does not support CSI Volume Snapshots. +client.upload_collection( + collection_name="test_collection", + vectors=vectors, +) +``` -![Kubernetes](https://qdrant.tech/documentation/cloud/cloud-providers/kubernetes.jpg) +## Step 3: Batch search in a single request -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#generic-kubernetes-support-on-premises-cloud-edge) Generic Kubernetes Support (on-premises, cloud, edge) +Now we’re ready to start looking for similar vectors, as our collection has some entries. Let’s say we want to find the distance between the selected vector and the most similar database entry and at the same time find the two most similar objects for a different vector query. Up till 0.9, we would need to call the API twice. 
Now, we can send both requests together: -Qdrant Hybrid Cloud works with any Kubernetes cluster that meets the [standard compliance](https://www.cncf.io/training/certification/software-conformance/) requirements. +```python +results = client.search_batch( + collection_name="test_collection", + requests=[ + SearchRequest( + vector=[0., 0., 2., 0.], + limit=1, + ), + SearchRequest( + vector=[0., 0., 0., 0.01], + with_vector=True, + limit=2, + ) + ] +) -This includes for example: +# Out: [ +# [ScoredPoint(id=2, version=0, score=1.9, +# payload=None, vector=None)], +# [ScoredPoint(id=3, version=0, score=0.09, +# payload=None, vector=[0.0, 0.0, 0.0, 0.1]), +# ScoredPoint(id=1, version=0, score=0.10049876, +# payload=None, vector=[0.0, 0.1, 0.0, 0.0])] +# ] -- [VMWare Tanzu](https://tanzu.vmware.com/kubernetes-grid) -- [Red Hat OpenShift](https://www.openshift.com/) -- [SUSE Rancher](https://www.rancher.com/) -- [Canonical Kubernetes](https://ubuntu.com/kubernetes) -- [RKE](https://rancher.com/docs/rke/latest/en/) -- [RKE2](https://docs.rke2.io/) -- [K3s](https://k3s.io/) +``` -Qdrant databases need persistent block storage. Most storage solutions provide a CSI driver that can be used with Kubernetes. See [CSI drivers](https://kubernetes-csi.github.io/docs/drivers.html) for more information. +Each instance of the SearchRequest class may provide its own search parameters, including vector query but also some additional filters. The response will be a list of individual results for each request. In case any of the requests is malformed, there will be an exception thrown, so either all of them pass or none of them. -To allow vertical scaling, you need a StorageClass with volume expansion enabled. See [Volume Expansion](https://kubernetes.io/docs/concepts/storage/storage-classes/#allow-volume-expansion) for more information. +And that’s it! You no longer have to handle the multiple requests on your own. Qdrant will do it under the hood. -To allow backups and restores, your CSI driver needs to support volume snapshots cluster needs the CSI VolumeSnapshot controller and class. See [CSI Volume Snapshots](https://kubernetes-csi.github.io/docs/snapshot-controller.html) for more information. +## Batch Search Benchmarks -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/\#next-steps) Next Steps +The batch search is fairly easy to be integrated into your application, but if you prefer to see some numbers before deciding to switch, then it’s worth comparing four different options: -Once you’ve got a Kubernetes cluster deployed on a platform of your choosing, you can begin setting up Qdrant Hybrid Cloud. Head to our Qdrant Hybrid Cloud [setup guide](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/) for instructions. +1. Querying the database sequentially. +2. Using many threads/processes with individual requests. +3. Utilizing the batch search of Qdrant in a single request. +4. Combining parallel processing and batch search. -##### Was this page useful? +In order to do that, we’ll create a richer collection of points, with vectors from the *glove-25-angular* dataset, quite a common choice for ANN comparison. If you’re interested in seeing some more details of how we benchmarked Qdrant, let’s take a [look at the Gist](https://gist.github.com/kacperlukawski/2d12faa49e06a5080f4c35ebcb89a2a3). 
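+If you are curious how option 4 can be wired up, here is a rough sketch: the test queries are split into small batches, and each batch is sent with a single `search_batch` call from a pool of worker processes. The collection name, batch size, and randomly generated query vectors below are illustrative assumptions, not taken from the benchmark Gist.
+
+```python
+import random
+from multiprocessing import Pool
+
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import SearchRequest
+
+client = None  # each worker process creates its own client instance
+
+def init_worker():
+    global client
+    client = QdrantClient("localhost", port=6333)
+
+def search_chunk(chunk):
+    # One API call per chunk of queries, instead of one call per query
+    requests = [SearchRequest(vector=vector, limit=10) for vector in chunk]
+    return client.search_batch(collection_name="benchmark", requests=requests)
+
+if __name__ == "__main__":
+    # Stand-in for the 10000 glove-25-angular test vectors
+    test_vectors = [[random.random() for _ in range(25)] for _ in range(10000)]
+    chunks = [test_vectors[i:i + 10] for i in range(0, len(test_vectors), 10)]
+    with Pool(processes=8, initializer=init_worker) as pool:
+        results = pool.map(search_chunk, chunks)
+```
+
+Process count and batch size are the main knobs here; the 8 processes and batch size of 10 mirror the configuration quoted in the results below.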
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## The results -Thank you for your feedback! 🙏 +We launched the benchmark 5 times on 10000 test vectors and averaged the results. Presented numbers are the mean values of all the attempts: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/platform-deployment-options.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +1. Sequential search: 225.9 seconds +2. Batch search: 208.0 seconds +3. Multiprocessing search (8 processes): 194.2 seconds +4. Multiprocessing batch search (8 processes, batch size 10): 148.9 seconds -On this page: +The results you may achieve on a specific setup may vary depending on the hardware, however, at the first glance, it seems that batch searching may save you quite a lot of time. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/platform-deployment-options.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Additional improvements could be achieved in the case of distributed deployment, as Qdrant won’t need to make extensive inter-cluster requests. Moreover, if your requests share the same filtering condition, the query optimizer would be able to reuse it among batch requests. -× +## Summary -[Powered by](https://qdrant.tech/) +Batch search allows packing different queries into a single API call and retrieving the results in a single response. If you ever struggled with sending several consecutive queries into Qdrant, then you can easily switch to the new batch search method and simplify your application code. As shown in the benchmarks, that may almost effortlessly speed up your interactions with Qdrant even by over 30%, even not considering the spare network overhead and possible reuse of filters! -<|page-44-lllmstxt|> -## bulk-upload -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Bulk Upload Vectors +Ready to unlock the potential of batch search and optimize your vector search with Qdrant 0.10.0? Contact us today to learn how we can revolutionize your search capabilities! -# [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#bulk-upload-vectors-to-a-qdrant-collection) Bulk Upload Vectors to a Qdrant Collection +<|page-82-lllmstxt|> +# Bulk Upload Vectors to a Qdrant Collection Uploading a large-scale dataset fast might be a challenge, but Qdrant has a few tricks to help you with that. @@ -19072,25 +36600,24 @@ We recommend using our [Rust client library](https://github.com/qdrant/rust-clie If you are not using Rust, you might want to consider parallelizing your upload process. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#choose-an-indexing-strategy) Choose an Indexing Strategy +## Choose an Indexing Strategy Qdrant incrementally builds an HNSW index for dense vectors as new data arrives. This ensures fast search, but indexing is memory- and CPU-intensive. During bulk ingestion, frequent index updates can reduce throughput and increase resource usage. 
To control this behavior and optimize for your system’s limits, adjust the following parameters: -| Your Goal | What to Do | Configuration | -| --- | --- | --- | -| Fastest upload, tolerate high RAM usage | Disable indexing completely | `indexing_threshold: 0` | -| Low memory usage during upload | Defer HNSW graph construction (recommended) | `m: 0` | -| Faster index availability after upload | Keep indexing enabled (default behavior) | `m: 16`, `indexing_threshold: 20000` _(default)_ | +| Your Goal | What to Do | Configuration | +|-------------------------------------------|-------------------------------------------------|----------------------------------------------------| +| Fastest upload, tolerate high RAM usage | Disable indexing completely | `indexing_threshold: 0` | +| Low memory usage during upload | Defer HNSW graph construction (recommended) | `m: 0` | +| Faster index availability after upload | Keep indexing enabled (default behavior) | `m: 16`, `indexing_threshold: 20000` *(default)* | Indexing must be re-enabled after upload to activate fast HNSW search if it was disabled during ingestion. -### [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#defer-hnsw-graph-construction-m-0) Defer HNSW graph construction ( `m: 0`) -For dense vectors, setting the HNSW `m` parameter to `0` disables index building entirely. Vectors will still be stored, but not indexed until you enable indexing later. +### Defer HNSW graph construction (`m: 0`) -httppythontypescriptrustjavacsharpgo +For dense vectors, setting the HNSW `m` parameter to `0` disables index building entirely. Vectors will still be stored, but not indexed until you enable indexing later. ```http PUT /collections/{collection_name} @@ -19103,7 +36630,6 @@ PUT /collections/{collection_name} "m": 0 } } - ``` ```python @@ -19118,7 +36644,6 @@ client.create_collection( m=0, ), ) - ``` ```typescript @@ -19135,7 +36660,6 @@ client.createCollection("{collection_name}", { m: 0, }, }); - ``` ```rust @@ -19153,7 +36677,6 @@ client .hnsw_config(HnswConfigDiffBuilder::default().m(0)), ) .await?; - ``` ```java @@ -19183,7 +36706,6 @@ client .setHnswConfig(HnswConfigDiff.newBuilder().setM(0).build()) .build()) .get(); - ``` ```csharp @@ -19197,7 +36719,6 @@ await client.CreateCollectionAsync( vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, hnswConfig: new HnswConfigDiff { M = 0 } ); - ``` ```go @@ -19222,13 +36743,10 @@ client.CreateCollection(context.Background(), &qdrant.CreateCollection{ M: qdrant.PtrOf(uint64(0)), }, }) - ``` Once ingestion is complete, re-enable HNSW by setting `m` to your production value (usually 16 or 32). 
-httppythontypescriptrustjavacsharpgo - ```http PATCH /collections/{collection_name} { @@ -19240,7 +36758,6 @@ PATCH /collections/{collection_name} "m": 16 } } - ``` ```python @@ -19255,7 +36772,6 @@ client.update_collection( m=16, ), ) - ``` ```typescript @@ -19272,7 +36788,6 @@ client.updateCollection("{collection_name}", { m: 16, }, }); - ``` ```rust @@ -19289,7 +36804,6 @@ client .hnsw_config(HnswConfigDiffBuilder::default().m(16)), ) .await?; - ``` ```java @@ -19302,257 +36816,9 @@ QdrantClient client = client.updateCollectionAsync( UpdateCollection.newBuilder() .setCollectionName("{collection_name}") - .setHnswConfig(HnswConfigDiff.newBuilder().setM(16).build()) - .build()) - .get(); - -``` - -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; - -var client = new QdrantClient("localhost", 6334); - -await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - hnswConfig: new HnswConfigDiff { M = 16 } -); - -``` - -```go -import ( - "context" - - "github.com/qdrant/go-client/qdrant" -) - -qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) - -client, err := client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ - CollectionName: "{collection_name}", - HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(16)), - }, -}) - -``` - -### [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#disable-indexing-completely-indexing_threshold-0) Disable indexing completely ( `indexing_threshold: 0`) - -In case you are doing an initial upload of a large dataset, you might want to disable indexing during upload. It will enable to avoid unnecessary indexing of vectors, which will be overwritten by the next batch. - -Setting `indexing_threshold` to `0` disables indexing altogether: - -httppythontypescriptrustjavacsharpgo - -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "optimizers_config": { - "indexing_threshold": 0 - } -} - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - optimizers_config=models.OptimizersConfigDiff( - indexing_threshold=0, - ), -) - -``` - -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); - -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - optimizers_config: { - indexing_threshold: 0, - }, -}); - -``` - -```rust -use qdrant_client::qdrant::{ - OptimizersConfigDiffBuilder, UpdateCollectionBuilder, -}; -use qdrant_client::Qdrant; - -let client = Qdrant::from_url("http://localhost:6334").build()?; - -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .optimizers_config(OptimizersConfigDiffBuilder::default().indexing_threshold(0)), - ) - .await?; - -``` - -```java -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; - -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); - -client.createCollectionAsync( - CreateCollection.newBuilder() - 
.setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder() - .setIndexingThreshold(0) - .build()) - .build() -).get(); - -``` - -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; - -var client = new QdrantClient("localhost", 6334); - -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 0 } -); - -``` - -```go -import ( - "context" - - "github.com/qdrant/go-client/qdrant" -) - -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) - -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - OptimizersConfig: &qdrant.OptimizersConfigDiff{ - IndexingThreshold: qdrant.PtrOf(uint64(0)), - }, -}) - -``` - -After upload is done, you can enable indexing by setting `indexing_threshold` to a desired value (default is 20000): - -httppythontypescriptrustjavacsharpgo - -```http -PATCH /collections/{collection_name} -{ - "optimizers_config": { - "indexing_threshold": 20000 - } -} - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.update_collection( - collection_name="{collection_name}", - optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000), -) - -``` - -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); - -client.updateCollection("{collection_name}", { - optimizers_config: { - indexing_threshold: 20000, - }, -}); - -``` - -```rust -use qdrant_client::qdrant::{ - OptimizersConfigDiffBuilder, UpdateCollectionBuilder, -}; -use qdrant_client::Qdrant; - -let client = Qdrant::from_url("http://localhost:6334").build()?; - -client - .update_collection( - UpdateCollectionBuilder::new("{collection_name}") - .optimizers_config(OptimizersConfigDiffBuilder::default().indexing_threshold(20000)), - ) - .await?; - -``` - -```java -import io.qdrant.client.grpc.Collections.UpdateCollection; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; - -client.updateCollectionAsync( - UpdateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder() - .setIndexingThreshold(20000) - .build() - ) - .build() -).get(); - + .setHnswConfig(HnswConfigDiff.newBuilder().setM(16).build()) + .build()) + .get(); ``` ```csharp @@ -19562,61 +36828,36 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); await client.UpdateCollectionAsync( - collectionName: "{collection_name}", - optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 20000 } + collectionName: "{collection_name}", + hnswConfig: new HnswConfigDiff { M = 16 } ); - ``` ```go import ( "context" + "github.com/qdrant/go-client/qdrant" ) -client, err := qdrant.NewClient(&qdrant.Config{ +qdrant.NewClient(&qdrant.Config{ Host: "localhost", Port: 6334, }) -client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ +client, err := client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ 
 	CollectionName: "{collection_name}",
-	OptimizersConfig: &qdrant.OptimizersConfigDiff{
-		IndexingThreshold: qdrant.PtrOf(uint64(20000)),
+	HnswConfig: &qdrant.HnswConfigDiff{
+		M: qdrant.PtrOf(uint64(16)),
 	},
 })
-
 ```
 
-At this point, Qdrant will begin indexing new and previously unindexed segments in the background.
-
-## [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#upload-directly-to-disk) Upload directly to disk
-
-When the vectors you upload do not all fit in RAM, you likely want to use
-[memmap](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage)
-support.
-
-During collection
-[creation](https://qdrant.tech/documentation/concepts/collections/#create-collection),
-memmaps may be enabled on a per-vector basis using the `on_disk` parameter. This
-will store vector data directly on disk at all times. It is suitable for
-ingesting a large amount of data, essential for the billion scale benchmark.
-
-Using `memmap_threshold` is not recommended in this case. It would require
-the [optimizer](https://qdrant.tech/documentation/concepts/optimizer/) to constantly
-transform in-memory segments into memmap segments on disk. This process is
-slower, and the optimizer can be a bottleneck when ingesting a large amount of
-data.
-
-Read more about this in
-[Configuring Memmap Storage](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage).
-
-## [Anchor](https://qdrant.tech/documentation/database-tutorials/bulk-upload/\#parallel-upload-into-multiple-shards) Parallel upload into multiple shards
+### Disable indexing completely (`indexing_threshold: 0`)
 
-In Qdrant, each collection is split into shards. Each shard has a separate Write-Ahead-Log (WAL), which is responsible for ordering operations.
-By creating multiple shards, you can parallelize upload of a large dataset. From 2 to 4 shards per one machine is a reasonable number.
+If you are doing an initial upload of a large dataset, you may want to disable indexing during upload to avoid unnecessary indexing of vectors that will be overwritten by the next batch.
-httppythontypescriptrustjavacsharpgo +Setting `indexing_threshold` to `0` disables indexing altogether: ```http PUT /collections/{collection_name} @@ -19625,9 +36866,10 @@ PUT /collections/{collection_name} "size": 768, "distance": "Cosine" }, - "shard_number": 2 + "optimizers_config": { + "indexing_threshold": 0 + } } - ``` ```python @@ -19638,9 +36880,10 @@ client = QdrantClient(url="http://localhost:6333") client.create_collection( collection_name="{collection_name}", vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - shard_number=2, + optimizers_config=models.OptimizersConfigDiff( + indexing_threshold=0, + ), ) - ``` ```typescript @@ -19653,13 +36896,16 @@ client.createCollection("{collection_name}", { size: 768, distance: "Cosine", }, - shard_number: 2, + optimizers_config: { + indexing_threshold: 0, + }, }); - ``` ```rust -use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::qdrant::{ + OptimizersConfigDiffBuilder, UpdateCollectionBuilder, +}; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; @@ -19667,40 +36913,38 @@ let client = Qdrant::from_url("http://localhost:6334").build()?; client .create_collection( CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .shard_number(2), + .optimizers_config(OptimizersConfigDiffBuilder::default().indexing_threshold(0)), ) .await?; - ``` ```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; import io.qdrant.client.grpc.Collections.CreateCollection; import io.qdrant.client.grpc.Collections.Distance; import io.qdrant.client.grpc.Collections.VectorParams; import io.qdrant.client.grpc.Collections.VectorsConfig; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setShardNumber(2) - .build()) - .get(); - +client.createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder() + .setIndexingThreshold(0) + .build()) + .build() +).get(); ``` ```csharp @@ -19710,11 +36954,10 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - shardNumber: 2 + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 0 } ); - ``` ```go @@ -19735,13999 +36978,13788 @@ client.CreateCollection(context.Background(), &qdrant.CreateCollection{ Size: 768, Distance: qdrant.Distance_Cosine, }), - ShardNumber: qdrant.PtrOf(uint32(2)), + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + IndexingThreshold: qdrant.PtrOf(uint64(0)), + }, }) - -``` - -##### Was this page useful? 
- -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/bulk-upload.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/bulk-upload.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-45-lllmstxt|> -## dedicated-service -- [Articles](https://qdrant.tech/articles/) -- Vector Search as a dedicated service - -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) - -# Vector Search as a dedicated service - -Andrey Vasnetsov - -· - -November 30, 2023 - -![Vector Search as a dedicated service](https://qdrant.tech/articles_data/dedicated-service/preview/title.jpg) - -Ever since the data science community discovered that vector search significantly improves LLM answers, -various vendors and enthusiasts have been arguing over the proper solutions to store embeddings. - -Some say storing them in a specialized engine (aka vector database) is better. Others say that it’s enough to use plugins for existing databases. - -Here are [just](https://nextword.substack.com/p/vector-database-is-not-a-separate) a [few](https://stackoverflow.blog/2023/09/20/do-you-need-a-specialized-vector-database-to-implement-vector-search-well/) of [them](https://www.singlestore.com/blog/why-your-vector-database-should-not-be-a-vector-database/). - -This article presents our vision and arguments on the topic . -We will: - -1. Explain why and when you actually need a dedicated vector solution -2. Debunk some ungrounded claims and anti-patterns to be avoided when building a vector search system. - -A table of contents: - -- _Each database vendor will sooner or later introduce vector capabilities
_ \[ [click](https://qdrant.tech/articles/dedicated-service/#each-database-vendor-will-sooner-or-later-introduce-vector-capabilities-that-will-make-every-database-a-vector-database)\] -- _Having a dedicated vector database requires duplication of data._ \[ [click](https://qdrant.tech/articles/dedicated-service/#having-a-dedicated-vector-database-requires-duplication-of-data)\] -- _Having a dedicated vector database requires complex data synchronization._ \[ [click](https://qdrant.tech/articles/dedicated-service/#having-a-dedicated-vector-database-requires-complex-data-synchronization)\] -- _You have to pay for a vector service uptime and data transfer._ \[ [click](https://qdrant.tech/articles/dedicated-service/#you-have-to-pay-for-a-vector-service-uptime-and-data-transfer-of-both-solutions)\] -- _What is more seamless than your current database adding vector search capability?_ \[ [click](https://qdrant.tech/articles/dedicated-service/#what-is-more-seamless-than-your-current-database-adding-vector-search-capability)\] -- _Databases can support RAG use-case end-to-end._ \[ [click](https://qdrant.tech/articles/dedicated-service/#databases-can-support-rag-use-case-end-to-end)\] - -## [Anchor](https://qdrant.tech/articles/dedicated-service/\#responding-to-claims) Responding to claims - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#each-database-vendor-will-sooner-or-later-introduce-vector-capabilities-that-will-make-every-database-a-vector-database) Each database vendor will sooner or later introduce vector capabilities. That will make every database a Vector Database. - -The origins of this misconception lie in the careless use of the term Vector _Database_. -When we think of a _database_, we subconsciously envision a relational database like Postgres or MySQL. -Or, more scientifically, a service built on ACID principles that provides transactions, strong consistency guarantees, and atomicity. - -The majority of Vector Database are not _databases_ in this sense. -It is more accurate to call them _search engines_, but unfortunately, the marketing term _vector database_ has already stuck, and it is unlikely to change. - -_What makes search engines different, and why vector DBs are built as search engines?_ - -First of all, search engines assume different patterns of workloads and prioritize different properties of the system. The core architecture of such solutions is built around those priorities. - -What types of properties do search engines prioritize? - -- **Scalability**. Search engines are built to handle large amounts of data and queries. They are designed to be horizontally scalable and operate with more data than can fit into a single machine. -- **Search speed**. Search engines should guarantee low latency for queries, while the atomicity of updates is less important. -- **Availability**. Search engines must stay available if the majority of the nodes in a cluster are down. At the same time, they can tolerate the eventual consistency of updates. - -![Database guarantees compass](https://qdrant.tech/articles_data/dedicated-service/compass.png) - -Database guarantees compass - -Those priorities lead to different architectural decisions that are not reproducible in a general-purpose database, even if it has vector index support. - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#having-a-dedicated-vector-database-requires-duplication-of-data) Having a dedicated vector database requires duplication of data. 
- -By their very nature, vector embeddings are derivatives of the primary source data. - -In the vast majority of cases, embeddings are derived from some other data, such as text, images, or additional information stored in your system. So, in fact, all embeddings you have in your system can be considered transformations of some original source. - -And the distinguishing feature of derivative data is that it will change when the transformation pipeline changes. -In the case of vector embeddings, the scenario of those changes is quite simple: every time you update the encoder model, all the embeddings will change. - -In systems where vector embeddings are fused with the primary data source, it is impossible to perform such migrations without significantly affecting the production system. - -As a result, even if you want to use a single database for storing all kinds of data, you would still need to duplicate data internally. - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#having-a-dedicated-vector-database-requires-complex-data-synchronization) Having a dedicated vector database requires complex data synchronization. - -Most production systems prefer to isolate different types of workloads into separate services. -In many cases, those isolated services are not even related to search use cases. - -For example, databases for analytics and one for serving can be updated from the same source. -Yet they can store and organize the data in a way that is optimal for their typical workloads. - -Search engines are usually isolated for the same reason: you want to avoid creating a noisy neighbor problem and compromise the performance of your main database. - -_To give you some intuition, let’s consider a practical example:_ - -Assume we have a database with 1 million records. -This is a small database by modern standards of any relational database. -You can probably use the smallest free tier of any cloud provider to host it. - -But if we want to use this database for vector search, 1 million OpenAI `text-embedding-ada-002` embeddings will take **~6GB of RAM** (sic!). -As you can see, the vector search use case completely overwhelmed the main database resource requirements. -In practice, this means that your main database becomes burdened with high memory requirements and can not scale efficiently, limited by the size of a single machine. - -Fortunately, the data synchronization problem is not new and definitely not unique to vector search. -There are many well-known solutions, starting with message queues and ending with specialized ETL tools. - -For example, we recently released our [integration with Airbyte](https://qdrant.tech/documentation/integrations/airbyte/), allowing you to synchronize data from various sources into Qdrant incrementally. - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#you-have-to-pay-for-a-vector-service-uptime-and-data-transfer-of-both-solutions) You have to pay for a vector service uptime and data transfer of both solutions. - -In the open-source world, you pay for the resources you use, not the number of different databases you run. -Resources depend more on the optimal solution for each use case. -As a result, running a dedicated vector search engine can be even cheaper, as it allows optimization specifically for vector search use cases. - -For instance, Qdrant implements a number of [quantization techniques](https://qdrant.tech/documentation/guides/quantization/) that can significantly reduce the memory footprint of embeddings. 
- -In terms of data transfer costs, on most cloud providers, network use within a region is usually free. As long as you put the original source data and the vector store in the same region, there are no added data transfer costs. - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#what-is-more-seamless-than-your-current-database-adding-vector-search-capability) What is more seamless than your current database adding vector search capability? - -In contrast to the short-term attractiveness of integrated solutions, dedicated search engines propose flexibility and a modular approach. -You don’t need to update the whole production database each time some of the vector plugins are updated. -Maintenance of a dedicated search engine is as isolated from the main database as the data itself. - -In fact, integration of more complex scenarios, such as read/write segregation, is much easier with a dedicated vector solution. -You can easily build cross-region replication to ensure low latency for your users. - -![Read/Write segregation + cross-regional deployment](https://qdrant.tech/articles_data/dedicated-service/region-based-deploy.png) - -Read/Write segregation + cross-regional deployment - -It is especially important in large enterprise organizations, where the responsibility for different parts of the system is distributed among different teams. -In those situations, it is much easier to maintain a dedicated search engine for the AI team than to convince the core team to update the whole primary database. - -Finally, the vector capabilities of the all-in-one database are tied to the development and release cycle of the entire stack. -Their long history of use also means that they need to pay a high price for backward compatibility. - -###### [Anchor](https://qdrant.tech/articles/dedicated-service/\#databases-can-support-rag-use-case-end-to-end) Databases can support RAG use-case end-to-end. - -Putting aside performance and scalability questions, the whole discussion about implementing RAG in the DBs assumes that the only detail missing in traditional databases is the vector index and the ability to make fast ANN queries. - -In fact, the current capabilities of vector search have only scratched the surface of what is possible. -For example, in our recent article, we discuss the possibility of building an [exploration API](https://qdrant.tech/articles/vector-similarity-beyond-search/) to fuel the discovery process - an alternative to kNN search, where you don’t even know what exactly you are looking for. - -## [Anchor](https://qdrant.tech/articles/dedicated-service/\#summary) Summary - -Ultimately, you do not need a vector database if you are looking for a simple vector search functionality with a small amount of data. We genuinely recommend starting with whatever you already have in your stack to prototype. But you need one if you are looking to do more out of it, and it is the central functionality of your application. It is just like using a multi-tool to make something quick or using a dedicated instrument highly optimized for the use case. - -Large-scale production systems usually consist of different specialized services and storage types for good reasons since it is one of the best practices of modern software architecture. Comparable to the orchestration of independent building blocks in a microservice architecture. - -When you stuff the database with a vector index, you compromise both the performance and scalability of the main database and the vector search capabilities. 
-There is no one-size-fits-all approach that would not compromise on performance or flexibility. -So if your use case utilizes vector search in any significant way, it is worth investing in a dedicated vector search engine, aka vector database. - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dedicated-service.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dedicated-service.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-46-lllmstxt|> -## data-streaming-kafka-qdrant -- [Documentation](https://qdrant.tech/documentation/) -- [Send data](https://qdrant.tech/documentation/send-data/) -- How to Setup Seamless Data Streaming with Kafka and Qdrant - -# [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#setup-data-streaming-with-kafka-via-confluent) Setup Data Streaming with Kafka via Confluent - -**Author:** [M K Pavan Kumar](https://www.linkedin.com/in/kameshwara-pavan-kumar-mantha-91678b21/) , research scholar at [IIITDM, Kurnool](https://iiitk.ac.in/). Specialist in hallucination mitigation techniques and RAG methodologies. -‱ [GitHub](https://github.com/pavanjava) ‱ [Medium](https://medium.com/@manthapavankumar11) - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#introduction) Introduction - -This guide will walk you through the detailed steps of installing and setting up the [Qdrant Sink Connector](https://github.com/qdrant/qdrant-kafka), building the necessary infrastructure, and creating a practical playground application. By the end of this article, you will have a deep understanding of how to leverage this powerful integration to streamline your data workflows, ultimately enhancing the performance and capabilities of your data-driven real-time semantic search and RAG applications. - -In this example, original data will be sourced from Azure Blob Storage and MongoDB. - -![1.webp](https://qdrant.tech/documentation/examples/data-streaming-kafka-qdrant/1.webp) - -Figure 1: [Real time Change Data Capture (CDC)](https://www.confluent.io/learn/change-data-capture/) with Kafka and Qdrant. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#the-architecture) The Architecture: - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#source-systems) Source Systems - -The architecture begins with the **source systems**, represented by MongoDB and Azure Blob Storage. These systems are vital for storing and managing raw data. MongoDB, a popular NoSQL database, is known for its flexibility in handling various data formats and its capability to scale horizontally. It is widely used for applications that require high performance and scalability. Azure Blob Storage, on the other hand, is Microsoft’s object storage solution for the cloud. It is designed for storing massive amounts of unstructured data, such as text or binary data. 
The data from these sources is extracted using **source connectors**, which are responsible for capturing changes in real-time and streaming them into Kafka. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#kafka) Kafka - -At the heart of this architecture lies **Kafka**, a distributed event streaming platform capable of handling trillions of events a day. Kafka acts as a central hub where data from various sources can be ingested, processed, and distributed to various downstream systems. Its fault-tolerant and scalable design ensures that data can be reliably transmitted and processed in real-time. Kafka’s capability to handle high-throughput, low-latency data streams makes it an ideal choice for real-time data processing and analytics. The use of **Confluent** enhances Kafka’s functionalities, providing additional tools and services for managing Kafka clusters and stream processing. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#qdrant) Qdrant - -The processed data is then routed to **Qdrant**, a highly scalable vector search engine designed for similarity searches. Qdrant excels at managing and searching through high-dimensional vector data, which is essential for applications involving machine learning and AI, such as recommendation systems, image recognition, and natural language processing. The **Qdrant Sink Connector** for Kafka plays a pivotal role here, enabling seamless integration between Kafka and Qdrant. This connector allows for the real-time ingestion of vector data into Qdrant, ensuring that the data is always up-to-date and ready for high-performance similarity searches. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#integration-and-pipeline-importance) Integration and Pipeline Importance - -The integration of these components forms a powerful and efficient data streaming pipeline. The **Qdrant Sink Connector** ensures that the data flowing through Kafka is continuously ingested into Qdrant without any manual intervention. This real-time integration is crucial for applications that rely on the most current data for decision-making and analysis. By combining the strengths of MongoDB and Azure Blob Storage for data storage, Kafka for data streaming, and Qdrant for vector search, this pipeline provides a robust solution for managing and processing large volumes of data in real-time. The architecture’s scalability, fault-tolerance, and real-time processing capabilities are key to its effectiveness, making it a versatile solution for modern data-driven applications. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#installation-of-confluent-kafka-platform) Installation of Confluent Kafka Platform - -To install the Confluent Kafka Platform (self-managed locally), follow these 3 simple steps: - -**Download and Extract the Distribution Files:** - -- Visit [Confluent Installation Page](https://www.confluent.io/installation/). -- Download the distribution files (tar, zip, etc.). 
-- Extract the downloaded file using: - -```bash -tar -xvf confluent-.tar.gz - -``` - -or - -```bash -unzip confluent-.zip - -``` - -**Configure Environment Variables:** - -```bash -# Set CONFLUENT_HOME to the installation directory: -export CONFLUENT_HOME=/path/to/confluent- - -# Add Confluent binaries to your PATH -export PATH=$CONFLUENT_HOME/bin:$PATH - -``` - -**Run Confluent Platform Locally:** - -```bash -# Start the Confluent Platform services: -confluent local start -# Stop the Confluent Platform services: -confluent local stop - -``` - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#installation-of-qdrant) Installation of Qdrant: - -To install and run Qdrant (self-managed locally), you can use Docker, which simplifies the process. First, ensure you have Docker installed on your system. Then, you can pull the Qdrant image from Docker Hub and run it with the following commands: - -```bash -docker pull qdrant/qdrant -docker run -p 6334:6334 -p 6333:6333 qdrant/qdrant - -``` - -This will download the Qdrant image and start a Qdrant instance accessible at `http://localhost:6333`. For more detailed instructions and alternative installation methods, refer to the [Qdrant installation documentation](https://qdrant.tech/documentation/quick-start/). - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#installation-of-qdrant-kafka-sink-connector) Installation of Qdrant-Kafka Sink Connector: - -To install the Qdrant Kafka connector using [Confluent Hub](https://www.confluent.io/hub/), you can utilize the straightforward `confluent-hub install` command. This command simplifies the process by eliminating the need for manual configuration file manipulations. To install the Qdrant Kafka connector version 1.1.0, execute the following command in your terminal: - -```bash - confluent-hub install qdrant/qdrant-kafka:1.1.0 - -``` - -This command downloads and installs the specified connector directly from Confluent Hub into your Confluent Platform or Kafka Connect environment. The installation process ensures that all necessary dependencies are handled automatically, allowing for a seamless integration of the Qdrant Kafka connector with your existing setup. Once installed, the connector can be configured and managed using the Confluent Control Center or the Kafka Connect REST API, enabling efficient data streaming between Kafka and Qdrant without the need for intricate manual setup. - -![2.webp](https://qdrant.tech/documentation/examples/data-streaming-kafka-qdrant/2.webp) - -_Figure 2: Local Confluent platform showing the Source and Sink connectors after installation._ - -Ensure the configuration of the connector once it’s installed as below. keep in mind that your `key.converter` and `value.converter` are very important for kafka to safely deliver the messages from topic to qdrant. 
- -```bash -{ - "name": "QdrantSinkConnectorConnector_0", - "config": { - "value.converter.schemas.enable": "false", - "name": "QdrantSinkConnectorConnector_0", - "connector.class": "io.qdrant.kafka.QdrantSinkConnector", - "key.converter": "org.apache.kafka.connect.storage.StringConverter", - "value.converter": "org.apache.kafka.connect.json.JsonConverter", - "topics": "topic_62,qdrant_kafka.docs", - "errors.deadletterqueue.topic.name": "dead_queue", - "errors.deadletterqueue.topic.replication.factor": "1", - "qdrant.grpc.url": "http://localhost:6334", - "qdrant.api.key": "************" - } -} - -``` - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#installation-of-mongodb) Installation of MongoDB - -For the Kafka to connect MongoDB as source, your MongoDB instance should be running in a `replicaSet` mode. below is the `docker compose` file which will spin a single node `replicaSet` instance of MongoDB. - -```bash -version: "3.8" - -services: - mongo1: - image: mongo:7.0 - command: ["--replSet", "rs0", "--bind_ip_all", "--port", "27017"] - ports: - - 27017:27017 - healthcheck: - test: echo "try { rs.status() } catch (err) { rs.initiate({_id:'rs0',members:[{_id:0,host:'host.docker.internal:27017'}]}) }" | mongosh --port 27017 --quiet - interval: 5s - timeout: 30s - start_period: 0s - start_interval: 1s - retries: 30 - volumes: - - "mongo1_data:/data/db" - - "mongo1_config:/data/configdb" - -volumes: - mongo1_data: - mongo1_config: - ``` -Similarly, install and configure source connector as below. - -```bash -confluent-hub install mongodb/kafka-connect-mongodb:latest - -``` + -After installing the `MongoDB` connector, connector configuration should look like this: +After upload is done, you can enable indexing by setting `indexing_threshold` to a desired value (default is 20000): -```bash +```http +PATCH /collections/{collection_name} { - "name": "MongoSourceConnectorConnector_0", - "config": { - "connector.class": "com.mongodb.kafka.connect.MongoSourceConnector", - "key.converter": "org.apache.kafka.connect.storage.StringConverter", - "value.converter": "org.apache.kafka.connect.storage.StringConverter", - "connection.uri": "mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true", - "database": "qdrant_kafka", - "collection": "docs", - "publish.full.document.only": "true", - "topic.namespace.map": "{\"*\":\"qdrant_kafka.docs\"}", - "copy.existing": "true" - } -} - -``` - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#playground-application) Playground Application - -As the infrastructure set is completely done, now it’s time for us to create a simple application and check our setup. the objective of our application is the data is inserted to Mongodb and eventually it will get ingested into Qdrant also using [Change Data Capture (CDC)](https://www.confluent.io/learn/change-data-capture/). - -`requirements.txt` - -```bash -fastembed==0.3.1 -pymongo==4.8.0 -qdrant_client==1.10.1 - -``` - -`project_root_folder/main.py` - -This is just sample code. Nevertheless it can be extended to millions of operations based on your use case. 
- -pythonpython - -```python -from pymongo import MongoClient -from utils.app_utils import create_qdrant_collection -from fastembed import TextEmbedding - -collection_name: str = 'test' -embed_model_name: str = 'snowflake/snowflake-arctic-embed-s' - -``` - -```python -# Step 0: create qdrant_collection -create_qdrant_collection(collection_name=collection_name, embed_model=embed_model_name) - -# Step 1: Connect to MongoDB -client = MongoClient('mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true') - -# Step 2: Select Database -db = client['qdrant_kafka'] - -# Step 3: Select Collection -collection = db['docs'] - -# Step 4: Create a Document to Insert - -description = "qdrant is a high available vector search engine" -embedding_model = TextEmbedding(model_name=embed_model_name) -vector = next(embedding_model.embed(documents=description)).tolist() -document = { - "collection_name": collection_name, - "id": 1, - "vector": vector, - "payload": { - "name": "qdrant", - "description": description, - "url": "https://qdrant.tech/documentation" + "optimizers_config": { + "indexing_threshold": 20000 } } - -# Step 5: Insert the Document into the Collection -result = collection.insert_one(document) - -# Step 6: Print the Inserted Document's ID -print("Inserted document ID:", result.inserted_id) - ``` -`project_root_folder/utils/app_utils.py` - ```python from qdrant_client import QdrantClient, models -client = QdrantClient(url="http://localhost:6333", api_key="") -dimension_dict = {"snowflake/snowflake-arctic-embed-s": 384} - -def create_qdrant_collection(collection_name: str, embed_model: str): - - if not client.collection_exists(collection_name=collection_name): - client.create_collection( - collection_name=collection_name, - vectors_config=models.VectorParams(size=dimension_dict.get(embed_model), distance=models.Distance.COSINE) - ) +client = QdrantClient(url="http://localhost:6333") +client.update_collection( + collection_name="{collection_name}", + optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000), +) ``` -Before we run the application, below is the state of MongoDB and Qdrant databases. - -![3.webp](https://qdrant.tech/documentation/examples/data-streaming-kafka-qdrant/3.webp) - -Figure 3: Initial state: no collection named `test` & `no data` in the `docs` collection of MongodDB. - -Once you run the code the data goes into Mongodb and the CDC gets triggered and eventually Qdrant will receive this data. - -![4.webp](https://qdrant.tech/documentation/examples/data-streaming-kafka-qdrant/4.webp) - -Figure 4: The test Qdrant collection is created automatically. - -![5.webp](https://qdrant.tech/documentation/examples/data-streaming-kafka-qdrant/5.webp) - -Figure 5: Data is inserted into both MongoDB and Qdrant. - -## [Anchor](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/\#conclusion) Conclusion: - -In conclusion, the integration of **Kafka** with **Qdrant** using the **Qdrant Sink Connector** provides a seamless and efficient solution for real-time data streaming and processing. This setup not only enhances the capabilities of your data pipeline but also ensures that high-dimensional vector data is continuously indexed and readily available for similarity searches. By following the installation and setup guide, you can easily establish a robust data flow from your **source systems** like **MongoDB** and **Azure Blob Storage**, through **Kafka**, and into **Qdrant**. 
This architecture empowers modern applications to leverage real-time data insights and advanced search capabilities, paving the way for innovative data-driven solutions. - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/data-streaming-kafka-qdrant.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/data-streaming-kafka-qdrant.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -× +client.updateCollection("{collection_name}", { + optimizers_config: { + indexing_threshold: 20000, + }, +}); +``` -[Powered by](https://qdrant.tech/) +```rust +use qdrant_client::qdrant::{ + OptimizersConfigDiffBuilder, UpdateCollectionBuilder, +}; +use qdrant_client::Qdrant; -<|page-47-lllmstxt|> -## fastembed-splade -- [Documentation](https://qdrant.tech/documentation/) -- [Fastembed](https://qdrant.tech/documentation/fastembed/) -- Working with SPLADE +let client = Qdrant::from_url("http://localhost:6334").build()?; -# [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#how-to-generate-sparse-vectors-with-splade) How to Generate Sparse Vectors with SPLADE +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}") + .optimizers_config(OptimizersConfigDiffBuilder::default().indexing_threshold(20000)), + ) + .await?; +``` -SPLADE is a novel method for learning sparse text representation vectors, outperforming BM25 in tasks like information retrieval and document classification. Its main advantage is generating efficient and interpretable sparse vectors, making it effective for large-scale text data. +```java +import io.qdrant.client.grpc.Collections.UpdateCollection; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#setup) Setup +client.updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder() + .setIndexingThreshold(20000) + .build() + ) + .build() +).get(); +``` -First, install FastEmbed. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```python -pip install -q fastembed +var client = new QdrantClient("localhost", 6334); +await client.UpdateCollectionAsync( + collectionName: "{collection_name}", + optimizersConfig: new OptimizersConfigDiff { IndexingThreshold = 20000 } +); ``` -Next, import the required modules for sparse embeddings and Python’s typing module. 
+```go +import ( + "context" + "github.com/qdrant/go-client/qdrant" +) -```python -from fastembed import SparseTextEmbedding, SparseEmbedding +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + IndexingThreshold: qdrant.PtrOf(uint64(20000)), + }, +}) ``` -You may always check the list of all supported sparse embedding models. -```python -SparseTextEmbedding.list_supported_models() -``` +At this point, Qdrant will begin indexing new and previously unindexed segments in the background. -This will return a list of models, each with its details such as model name, vocabulary size, description, and sources. +## Upload directly to disk -```python -[\ - {\ - 'model': 'prithivida/Splade_PP_en_v1',\ - 'sources': {'hf': 'Qdrant/Splade_PP_en_v1', ...},\ - 'model_file': 'model.onnx',\ - 'description': 'Independent Implementation of SPLADE++ Model for English.',\ - 'license': 'apache-2.0',\ - 'size_in_GB': 0.532,\ - 'vocab_size': 30522,\ - ...\ - },\ - ...\ -] # part of the output was omitted +When the vectors you upload do not all fit in RAM, you likely want to use +[memmap](/documentation/concepts/storage/#configuring-memmap-storage) +support. -``` +During collection +[creation](/documentation/concepts/collections/#create-collection), +memmaps may be enabled on a per-vector basis using the `on_disk` parameter. This +will store vector data directly on disk at all times. It is suitable for +ingesting a large amount of data, essential for the billion scale benchmark. -Now, load the model. +Using `memmap_threshold` is not recommended in this case. It would require +the [optimizer](/documentation/concepts/optimizer/) to constantly +transform in-memory segments into memmap segments on disk. This process is +slower, and the optimizer can be a bottleneck when ingesting a large amount of +data. -```python -model_name = "prithivida/Splade_PP_en_v1" -# This triggers the model download -model = SparseTextEmbedding(model_name=model_name) +Read more about this in +[Configuring Memmap Storage](/documentation/concepts/storage/#configuring-memmap-storage). -``` +## Parallel upload into multiple shards -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#embed-data) Embed data +In Qdrant, each collection is split into shards. Each shard has a separate Write-Ahead-Log (WAL), which is responsible for ordering operations. +By creating multiple shards, you can parallelize upload of a large dataset. From 2 to 4 shards per one machine is a reasonable number. -You need to define a list of documents to be embedded. +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" + }, + "shard_number": 2 +} +``` ```python -documents: list[str] = [\ - "Chandrayaan-3 is India's third lunar mission",\ - "It aimed to land a rover on the Moon's surface - joining the US, China and Russia",\ - "The mission is a follow-up to Chandrayaan-2, which had partial success",\ - "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)",\ - "The estimated cost of the mission is around $35 million",\ - "It will carry instruments to study the lunar surface and atmosphere",\ - "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023",\ - "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. 
Its propulsion module would act like an orbiter.",\ - "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit",\ - "The mission used GSLV Mk III rocket for its launch",\ - "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota",\ - "Chandrayaan-3 was launched earlier in the year 2023",\ -] +from qdrant_client import QdrantClient, models +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + shard_number=2, +) ``` -Then, generate sparse embeddings for each document. -Here, `batch_size` is optional and helps to process documents in batches. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```python -sparse_embeddings_list: list[SparseEmbedding] = list( - model.embed(documents, batch_size=6) -) +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + shard_number: 2, +}); ``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#retrieve-embeddings) Retrieve embeddings - -`sparse_embeddings_list` contains sparse embeddings for the documents provided earlier. Each element in this list is a `SparseEmbedding` object that contains the sparse vector representation of a document. +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::Qdrant; -```python -index = 0 -sparse_embeddings_list[index] +let client = Qdrant::from_url("http://localhost:6334").build()?; +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .shard_number(2), + ) + .await?; ``` -This output is a `SparseEmbedding` object for the first document in our list. It contains two arrays: `values` and `indices`. \- The `values` array represents the weights of the features (tokens) in the document. - The `indices` array represents the indices of these features in the model’s vocabulary. - -Each pair of corresponding `values` and `indices` represents a token and its weight in the document. 
+```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -```python -SparseEmbedding(values=array([0.05297208, 0.01963477, 0.36459631, 1.38508618, 0.71776593,\ - 0.12667948, 0.46230844, 0.446771 , 0.26897505, 1.01519883,\ - 1.5655334 , 0.29412213, 1.53102326, 0.59785569, 1.1001817 ,\ - 0.02079751, 0.09955651, 0.44249091, 0.09747757, 1.53519952,\ - 1.36765671, 0.15740395, 0.49882549, 0.38629025, 0.76612782,\ - 1.25805044, 0.39058095, 0.27236196, 0.45152301, 0.48262018,\ - 0.26085234, 1.35912788, 0.70710695, 1.71639752]), indices=array([ 1010, 1011, 1016, 1017, 2001, 2018, 2034, 2093, 2117,\ - 2319, 2353, 2509, 2634, 2686, 2796, 2817, 2922, 2959,\ - 3003, 3148, 3260, 3390, 3462, 3523, 3822, 4231, 4316,\ - 4774, 5590, 5871, 6416, 11926, 12076, 16469])) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setShardNumber(2) + .build()) + .get(); ``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#examine-weights) Examine weights - -Now, print the first 5 features and their weights for better understanding. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```python -for i in range(5): - print(f"Token at index {sparse_embeddings_list[0].indices[i]} has weight {sparse_embeddings_list[0].values[i]}") +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + shardNumber: 2 +); ``` -The output will display the token indices and their corresponding weights for the first document. +```go +import ( + "context" -```python -Token at index 1010 has weight 0.05297207832336426 -Token at index 1011 has weight 0.01963476650416851 -Token at index 1016 has weight 0.36459630727767944 -Token at index 1017 has weight 1.385086178779602 -Token at index 2001 has weight 0.7177659273147583 + "github.com/qdrant/go-client/qdrant" +) + +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + ShardNumber: qdrant.PtrOf(uint32(2)), +}) ``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#analyze-results) Analyze results +<|page-83-lllmstxt|> +# Frequently Asked Questions: General Topics +|||||| +|-|-|-|-|-| +|[Vectors](/documentation/faq/qdrant-fundamentals/#vectors)|[Search](/documentation/faq/qdrant-fundamentals/#search)|[Collections](/documentation/faq/qdrant-fundamentals/#collections)|[Compatibility](/documentation/faq/qdrant-fundamentals/#compatibility)|[Cloud](/documentation/faq/qdrant-fundamentals/#cloud)| -Let’s use the tokenizer vocab to make sense of these indices. 
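To tie the parallel upload section above together, here is a minimal Python sketch of the same pattern: create the collection with several shards, then push the data in batches. The collection name `bulk-demo`, the 768-dimensional random vectors, and the batch size are illustrative assumptions rather than values from the docs, and only the plain `upsert` call is used.

```python
import random

from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="bulk-demo",  # placeholder name
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
    shard_number=2,  # each shard keeps its own WAL, so writes can be spread out
)

BATCH = 256
vectors = [[random.random() for _ in range(768)] for _ in range(10_000)]

# Upload in batches; with several shards, multiple such loops (threads,
# processes, or separate ingestion jobs) can write to the same collection.
for start in range(0, len(vectors), BATCH):
    client.upsert(
        collection_name="bulk-demo",
        points=[
            models.PointStruct(id=start + i, vector=vec)
            for i, vec in enumerate(vectors[start:start + BATCH])
        ],
    )
```

Because each shard has a separate Write-Ahead-Log, several such ingestion loops can run concurrently against the same collection.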
+## Vectors -```python -import json -from tokenizers import Tokenizer +### What is the maximum vector dimension supported by Qdrant? -tokenizer = Tokenizer.from_pretrained("Qdrant/Splade_PP_en_v1") +In dense vectors, Qdrant supports up to 65,535 dimensions. -``` +### What is the maximum size of vector metadata that can be stored? -The `get_tokens_and_weights` function takes a `SparseEmbedding` object and a `tokenizer` as input. It will construct a dictionary where the keys are the decoded tokens, and the values are their corresponding weights. +There is no inherent limitation on metadata size, but it should be [optimized for performance and resource usage](/documentation/guides/optimize/). Users can set upper limits in the configuration. -```python -def get_tokens_and_weights(sparse_embedding, tokenizer): - token_weight_dict = {} - for i in range(len(sparse_embedding.indices)): - token = tokenizer.decode([sparse_embedding.indices[i]]) - weight = sparse_embedding.values[i] - token_weight_dict[token] = weight +### Can the same similarity search query yield different results on different machines? - # Sort the dictionary by weights - token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)) - return token_weight_dict +Yes, due to differences in hardware configurations and parallel processing, results may vary slightly. -# Test the function with the first SparseEmbedding -print(json.dumps(get_tokens_and_weights(sparse_embeddings_list[index], tokenizer), indent=4)) +### How do I choose the right vector embeddings for my use case? -``` +This depends on the nature of your data and the specific application. Consider factors like dimensionality, domain-specific models, and the performance characteristics of different embeddings. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#dictionary-output) Dictionary output +### How does Qdrant handle different vector embeddings from various providers in the same collection? -The dictionary is then sorted by weights in descending order. +Qdrant natively [supports multiple vectors per data point](/documentation/concepts/vectors/#multivectors), allowing different embeddings from various providers to coexist within the same collection. -```python -{ - "chandra": 1.7163975238800049, - "third": 1.5655333995819092, - "##ya": 1.535199522972107, - "india": 1.5310232639312744, - "3": 1.385086178779602, - "mission": 1.3676567077636719, - "lunar": 1.3591278791427612, - "moon": 1.2580504417419434, - "indian": 1.1001816987991333, - "##an": 1.015198826789856, - "3rd": 0.7661278247833252, - "was": 0.7177659273147583, - "spacecraft": 0.7071069478988647, - "space": 0.5978556871414185, - "flight": 0.4988254904747009, - "satellite": 0.4826201796531677, - "first": 0.46230843663215637, - "expedition": 0.4515230059623718, - "three": 0.4467709958553314, - "fourth": 0.44249090552330017, - "vehicle": 0.390580952167511, - "iii": 0.3862902522087097, - "2": 0.36459630727767944, - "##3": 0.2941221296787262, - "planet": 0.27236196398735046, - "second": 0.26897504925727844, - "missions": 0.2608523368835449, - "launched": 0.15740394592285156, - "had": 0.12667948007583618, - "largest": 0.09955651313066483, - "leader": 0.09747757017612457, - ",": 0.05297207832336426, - "study": 0.02079751156270504, - "-": 0.01963476650416851 -} +### Can I migrate my embeddings from another vector store to Qdrant? 
-```
+Yes, Qdrant supports migration of embeddings from other vector stores, facilitating easy transitions and adoption of Qdrant’s features.

-## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#observations) Observations
+### Why doesn't the number of indexed vectors match the number of vectors in the collection?

-- The relative order of importance is quite useful. The most important tokens in the sentence have the highest weights.
-- **Term Expansion:** The model can expand the terms in the document. This means that the model can generate weights for tokens that are not present in the document but are related to the tokens in the document. This is a powerful feature that allows the model to capture the context of the document. Here, you’ll see that the model has added the tokens ‘3’ from ’third’ and ‘moon’ from ’lunar’ to the sparse vector.
+Qdrant doesn't always need to index all vectors in the collection.
+It stores data in segments, and if a segment is small enough, it is more efficient to perform a full-scan search on it.

-## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-splade/\#design-choices) Design choices
+Make sure to check that the collection status is `green` and that the number of unindexed vectors is smaller than the indexing threshold.

-- The weights are not normalized. This means that the sum of the weights is not 1 or 100. This is a common practice in sparse embeddings, as it allows the model to capture the importance of each token in the document.
-- Tokens are included in the sparse vector only if they are present in the model’s vocabulary. This means that the model will not generate a weight for tokens that it has not seen during training.
-- Tokens do not map to words directly – allowing you to gracefully handle typo errors and out-of-vocabulary tokens.
+### Why does collection info show an inaccurate number of points?

-##### Was this page useful?
+The collection info API in Qdrant returns an approximate number of points in the collection.
+If you need an exact number, you can use the [count](/documentation/concepts/points/#counting-points) API.

-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+### Vectors in the collection don't match what I uploaded.

-Thank you for your feedback! 🙏
+There are two possible reasons for this:

-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-splade.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+- You used the `Cosine` distance metric in the [collection settings](/concepts/collections/#collections). In this case, Qdrant pre-normalizes your vectors for faster distance computation. If you strictly need the original vectors to be preserved, consider using the `Dot` distance metric instead.
+- You used the `uint8` [datatype](/documentation/concepts/vectors/#datatypes) to store vectors. `uint8` requires a special format for input values, which might not be compatible with the typical output of embedding models.

-On this page:
+## Search

-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-splade.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+### How does Qdrant handle real-time data updates and search?

-×
-[Powered by](https://qdrant.tech/) +Qdrant supports live updates for vector data, with newly inserted, updated and deleted vectors available for immediate search. The system uses full-scan search on unindexed segments during background index updates. -<|page-48-lllmstxt|> -## authentication -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud](https://qdrant.tech/documentation/cloud/) -- Authentication +### My search results contain vectors with null values. Why? -# [Anchor](https://qdrant.tech/documentation/cloud/authentication/\#database-authentication-in-qdrant-managed-cloud) Database Authentication in Qdrant Managed Cloud +By default, Qdrant tries to minimize network traffic and doesn't return vectors in search results. +But you can force Qdrant to do so by setting the `with_vector` parameter of the Search/Scroll to `true`. -This page describes what Database API keys are and shows you how to use the Qdrant Cloud Console to create a Database API key for a cluster. You will learn how to connect to your cluster using the new API key. +If you're still seeing `"vector": null` in your results, it might be that the vector you're passing is not in the correct format, or there's an issue with how you're calling the upsert method. -Database API keys can be configured with granular access control. Database API keys with granular access control can be recognized by starting with `eyJhb`. Please refer to the [Table of access](https://qdrant.tech/documentation/guides/security/#table-of-access) to understand what permissions you can configure. +### How can I search without a vector? -Database API keys with granular access control are available for clusters using version **v1.11.0** and above. +You are likely looking for the [scroll](/documentation/concepts/points/#scroll-points) method. It allows you to retrieve the records based on filters or even iterate over all the records in the collection. -## [Anchor](https://qdrant.tech/documentation/cloud/authentication/\#create-database-api-keys) Create Database API Keys +### Does Qdrant support a full-text search or a hybrid search? -![API Key](https://qdrant.tech/documentation/cloud/create-api-key.png) +Qdrant is a vector search engine in the first place, and we only implement full-text support as long as it doesn't compromise the vector search use case. +That includes both the interface and the performance. -1. Go to the [Cloud Dashboard](https://qdrant.to/cloud). -2. Go to the **API Keys** section of the **Cluster Detail Page**. -3. Click **Create**. -4. Choose a name and an optional expiration (in days, the default is 90 days) for your API key. An empty expiration will result in no expiration. -5. By default, tokens are given cluster-wide permissions, with a choice between manage/write permissions (default) or read-only. 
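As a concrete illustration of the `scroll` call mentioned a few answers above, here is a minimal Python sketch; the collection name placeholder and the `city` payload field are hypothetical, not taken from the FAQ itself.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Iterate over records by filter only; no query vector is involved.
points, next_offset = client.scroll(
    collection_name="{collection_name}",
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="city", match=models.MatchValue(value="London")),
        ]
    ),
    limit=100,
    with_payload=True,
    with_vectors=False,
)
```

The returned offset can be passed back as `offset` on the next call to keep iterating through the collection.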
+What Qdrant can do: +- Search with full-text filters +- Apply full-text filters to the vector search (i.e., perform vector search among the records with specific words or phrases) +- Do prefix search and semantic [search-as-you-type](/articles/search-as-you-type/) +- Sparse vectors, as used in [SPLADE](https://github.com/naver/splade) or similar models +- [Multi-vectors](/documentation/concepts/vectors/#multivectors), for example ColBERT and other late-interaction models +- Combination of the [multiple searches](/documentation/concepts/hybrid-queries/) +What Qdrant doesn't plan to support: -To restrict a token to a subset of collections, you can select the Collections tab and choose from the collections available in your cluster. -6. Click **Create** and retrieve your API key. +- Non-vector-based retrieval or ranking functions +- Built-in ontologies or knowledge graphs +- Query analyzers and other NLP tools -![API Key](https://qdrant.tech/documentation/cloud/api-key.png) +Of course, you can always combine Qdrant with any specialized tool you need, including full-text search engines. +Read more about [our approach](/articles/hybrid-search/) to hybrid search. -We recommend configuring an expiration and rotating your API keys regularly as a security best practice. +## Collections -How to Use Qdrant's Database API Keys with Granular Access Control - YouTube +### How many collections can I create? -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +As many as you want, but be aware that each collection requires additional resources. +It is _highly_ recommended not to create many small collections, as it will lead to significant resource consumption overhead. -Qdrant - Vector Database & Search Engine +We consider creating a collection for each user/dialog/document as an antipattern. -8.12K subscribers +Please read more about collections, isolation, and multiple users in our [Multitenancy](/documentation/tutorials/multiple-partitions/) tutorial. -[How to Use Qdrant's Database API Keys with Granular Access Control](https://www.youtube.com/watch?v=3c-8tcBIVdQ) +### How do I upload a large number of vectors into a Qdrant collection? -Qdrant - Vector Database & Search Engine +Read about our recommendations in the [bulk upload](/documentation/tutorials/bulk-upload/) tutorial. -Search +### Can I only store quantized vectors and discard full precision vectors? -Watch later +No, Qdrant requires full precision vectors for operations like reindexing, rescoring, etc. -Share +## Compatibility -Copy link +### Is Qdrant compatible with CPUs or GPUs for vector computation? -Info +Qdrant primarily relies on CPU acceleration for scalability and efficiency. However, we also support GPU-accelerated indexing on all major vendors. -Shopping +### Do you guarantee compatibility across versions? -Tap to unmute +In case your version is older, we only guarantee compatibility between two consecutive minor versions. This also applies to client versions. Ensure your client version is never more than one minor version away from your cluster version. +While we will assist with break/fix troubleshooting of issues and errors specific to our products, Qdrant is not accountable for reviewing, writing (or rewriting), or debugging custom code. -If playback doesn't begin shortly, try restarting your device. +### Do you support downgrades? -Share +We do not support downgrading a cluster on any of our products. 
If you deploy a newer version of Qdrant, your
+data is automatically migrated to the newer storage format. This migration is not reversible.

-Include playlist
+### How do I avoid issues when updating to the latest version?

-An error occurred while retrieving sharing information. Please try again later.
+We only guarantee compatibility if you update between consecutive versions. You would need to upgrade versions one at a time: `1.1 -> 1.2`, then `1.2 -> 1.3`, then `1.3 -> 1.4`.

-0:00
+### Should I create payload indexes before or after uploading?

-0:00 / 3:00
-‱Live
+Create payload indexes before uploading to avoid index rebuilding. However, there are scenarios where defining indexes after uploading is okay. For example, you can configure new filter logic after launch.

-‱
+You should always index first if you know your filters upfront. If you need to index another payload later, you can still do it, but be aware of the performance hit.

-[Watch on YouTube](https://www.youtube.com/watch?v=3c-8tcBIVdQ "Watch on YouTube")
+### Should I create one Qdrant collection per user?
+No. Creating one collection per user is more resource intensive.

-## [Anchor](https://qdrant.tech/documentation/cloud/authentication/\#admin-database-api-keys) Admin Database API Keys
+Instead of creating separate collections for each user, we recommend creating a [single collection](https://qdrant.tech/documentation/guides/multiple-partitions/) and separating access using payloads. Each Qdrant point can have a payload as metadata. For multitenancy, you can include a `user_id` or `tenant_id` for each point. To optimize storage further, you can enable [tenant indexing](https://qdrant.tech/documentation/concepts/indexing/#tenant-index) for payload fields.

-The previous iteration of Database API keys, called Admin Database API keys, do not have granular access control. Clusters created before January 27, 2025 will still see the option to create Admin Database API keys. Older Admin Database API keys will continue to work, but we do recommend switching to Database API keys with granular access control to take advantage of better security controls.
+## Cloud

-To enable Database API keys with granular access control, click **Enable** on the **API Keys** section of the Cluster detail page.
+### Is it possible to scale down a Qdrant Cloud cluster?

-After enabling Database API keys with granular access control for a cluster, existing Admin Database API keys will continue to work, but you will not be able to create new Admin Database API Keys.
+Yes, it is possible to both vertically and horizontally scale down a Qdrant Cloud cluster.
+Note that during vertical scaling down, the disk size cannot be reduced.

-## [Anchor](https://qdrant.tech/documentation/cloud/authentication/\#test-cluster-access) Test Cluster Access
+<|page-84-lllmstxt|>
+# Reranking in RAG with Qdrant Vector Database

-After creation, you will receive a code snippet to access your cluster. Your generated request should look very similar to this one:
+In Retrieval-Augmented Generation (RAG) systems, irrelevant or missing information can throw off your model’s ability to produce accurate, meaningful outputs. One of the best ways to ensure you're feeding your language model the most relevant, context-rich documents is through reranking. It’s a game-changer.
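Here is a minimal sketch of the single-collection multitenancy pattern recommended in the collections answer above. The collection name, the `tenant_id` values, and the tiny 4-dimensional vectors are placeholder assumptions, and the `is_tenant=True` flag assumes a Qdrant version recent enough to support tenant indexes.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Toy 4-dimensional vectors keep the sketch short; a real setup would use the
# dimensionality of its embedding model.
client.create_collection(
    collection_name="shared-collection",
    vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE),
)

# Keyword index on the tenant field; is_tenant=True (tenant index) requires a
# recent Qdrant release and co-locates each tenant's points in storage.
client.create_payload_index(
    collection_name="shared-collection",
    field_name="tenant_id",
    field_schema=models.KeywordIndexParams(type="keyword", is_tenant=True),
)

# All tenants share one collection; the payload identifies the owner of each point.
client.upsert(
    collection_name="shared-collection",
    points=[
        models.PointStruct(id=1, vector=[0.05, 0.61, 0.76, 0.74], payload={"tenant_id": "user-42"}),
        models.PointStruct(id=2, vector=[0.19, 0.81, 0.75, 0.11], payload={"tenant_id": "user-7"}),
    ],
)

# Every query is scoped to a single tenant with a payload filter.
hits = client.query_points(
    collection_name="shared-collection",
    query=[0.05, 0.61, 0.76, 0.74],
    query_filter=models.Filter(
        must=[models.FieldCondition(key="tenant_id", match=models.MatchValue(value="user-42"))]
    ),
    limit=10,
).points
```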
-```bash -curl \ - -X GET 'https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333' \ - --header 'api-key: ' +In this guide, we’ll dive into using reranking to boost the relevance of search results in Qdrant. We’ll start with an easy use case that leverages the Cohere Rerank model. Then, we’ll take it up a notch by exploring ColBERT for a more advanced approach. By the time you’re done, you’ll know how to implement [hybrid search](https://qdrant.tech/articles/hybrid-search/), fine-tune reranking models, and significantly improve your accuracy. -``` +Ready? Let’s jump in. -Open Terminal and run the request. You should get a response that looks like this: +# Understanding Reranking -```bash -{"title":"qdrant - vector search engine","version":"1.13.0","commit":"ffda0b90c8c44fc43c99adab518b9787fe57bde6"} +This section is broken down into key parts to help you easily grasp the background, mechanics, and significance of reranking. -``` +## Background -> **Note:** You need to include the API key in the request header for every -> request over REST or gRPC. +In search systems, two metrics—precision and recall—are the backbone of success. But what do they mean? Precision tells us how many of the retrieved results are actually relevant, while recall measures how well we’ve captured all the relevant results out there. Simply put: -## [Anchor](https://qdrant.tech/documentation/cloud/authentication/\#authenticate-via-sdk) Authenticate via SDK +![image5.png](/documentation/examples/reranking-semantic-search/image5.png) -Now that you have created your first cluster and key, you might want to access your database from within your application. -Our [official Qdrant clients](https://qdrant.tech/documentation/interfaces/) for Python, TypeScript, Go, Rust, .NET and Java all support the API key parameter. +Sparse vector searches usually give you high precision because they’re great at finding exact matches. But, here's the catch—your recall can suffer when relevant documents don’t contain those exact keywords. On the flip side, dense vector searches are fantastic for recall since they grasp the broader, semantic meaning of your query. However, this can lead to lower precision, where you might see results that are only loosely related. -bashpythontypescriptrustjavacsharpgo +This is exactly where reranking comes to the rescue. It takes a wide net of documents (giving you high recall) and then refines them by reordering the top candidates based on their relevance scores—boosting precision without losing that broad understanding. Typically, we retain only the top K candidates after reordering to focus on the most relevant results. -```bash -curl \ - -X GET https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333 \ - --header 'api-key: ' +## Working -# Alternatively, you can use the `Authorization` header with the `Bearer` prefix -curl \ - -X GET https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333 \ - --header 'Authorization: Bearer ' +Picture this: You walk into a massive library and ask for a book on "climate change." The librarian pulls out a dozen books for you—some are scientific papers, others are personal essays, and one’s even a novel. Sure, they’re all relevant, but the first one you get handed is the novel. Not exactly what you were hoping for, right? -``` +Now, imagine a smarter, more intuitive librarian who really gets what you’re after. This one knows exactly which books are most impactful, the most current, and perfectly aligned with what you need. 
That’s what reranking does for your search results—it doesn’t just grab any relevant document; it smartly reorders them so the best ones land at the top of your list. It’s like having a librarian who knows exactly what you’re looking for before you do! -```python -from qdrant_client import QdrantClient +![image6.png](/documentation/examples/reranking-semantic-search/image6.png) -qdrant_client = QdrantClient( - "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", - api_key="", -) +An illustration of the rerank model prioritizing better results -``` +To become that smart, intuitive librarian, your algorithm needs to learn how to understand both your queries and the documents it retrieves. It has to evaluate the relationship between them effectively, so it can give you exactly what you’re looking for. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +The way reranker models operate varies based on their type, which will be discussed later, but in general, they calculate a relevance score for each document-query pair.Unlike embedding models, which squash everything into a single vector upfront, rerankers keep all the important details intact by using the full transformer output to calculate a similarity score. The result? Precision. But, there’s a trade-off—reranking can be slow. Processing millions of documents can take hours, which is why rerankers focus on refining results, not searching through the entire document collection. -const client = new QdrantClient({ - host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", - apiKey: "", -}); +Rerankers come in different types, each with its own strengths. Let’s break them down: -``` +1. **Cross Encoder Models**: These boost reranking by using a classification system to evaluate pairs of data—like sentences or documents. They spit out a similarity score from 0 to 1, showing how closely the document matches your query. The catch? Cross-encoders need both query and document, so they can’t handle standalone documents or queries by themselves. +2. **Multi-Vector Rerankers (e.g., ColBERT)**: These models take a more efficient route. They encode your query and the documents separately and only compare them later, reducing the computational load. This means document representations can be precomputed, speeding up retrieval times +3. **Large Language Models (LLMs) as Rerankers**: This is a newer, smarter way to rerank. LLMs, like GPT, are getting better by the day. With the right instructions, they can prioritize the most relevant documents for you, leveraging their massive understanding of language to deliver even more accurate results. -```rust -use qdrant_client::Qdrant; +Each of these rerankers has its own special way of making sure you get the best search results, fast and relevant to what you need. -let client = Qdrant::from_url("https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6334") - .api_key("") - .build()?; +## Importance -``` +In the previous section, we explored the background and mechanics of reranking, but now let’s talk about the three big wins you get from using it: -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +- **Enhancing Search Accuracy:** Reranking is all about making your search results sharper and more relevant. After the initial ranking, rerankers step in, reshuffling the results based on deeper analysis to ensure that the most crucial information is front and center. 
[Research shows that rerankers](https://cohere.com/blog/rerank) can pull off a serious boost—improving the top results for about 72% of search queries. That’s a huge leap in precision. +- **Reducing Information Overload:** If you feel like you’re drowning in a sea of search results, rerankers can come to your rescue. They filter and fine-tune the flood of information so you get exactly what you need, without the overwhelm. It makes your search experience more focused and way less chaotic. +- **Balancing Speed and Relevance:** First stage retrieval and second stage reranking strike the perfect balance between speed and accuracy. Sure, the second stage may add a bit of latency due to their processing power, but the trade-off is worth it. You get highly relevant results, and in the end, that’s what matters most. -QdrantClient client = - new QdrantClient( - QdrantGrpcClient.newBuilder( - "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", - 6334, - true) - .withApiKey("") - .build()); +Now that you know why reranking is such a game-changer, let’s dive into the practical side of things. -``` +# Implementing Vector Search with Reranking -```csharp -using Qdrant.Client; +In this section, you’re going to see how to implement vector search with reranking using Cohere. But first, let’s break it down. -var client = new QdrantClient( - host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", - https: true, - apiKey: "" -); +## Overview -``` +A typical search system works in two main stages: Ingestion and Retrieval. Think of ingestion as the process where your data gets prepped and loaded into the system, and retrieval as the part where the magic happens—where your queries pull out the most relevant documents. -```go -import "github.com/qdrant/go-client/qdrant" +Check out the architectural diagram below to visualize how these stages work together. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", - Port: 6334, - APIKey: "", - UseTLS: true, -}) +![image1.png](/documentation/examples/reranking-semantic-search/image1.png) -``` +The two essential stages of a search system: Ingestion and Retrieval Process -##### Was this page useful? +### Ingestion Stage -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +- **Documents:** This is where it all starts. The system takes in raw data or documents that need to be prepped for search—this is your initial input. +- **Embeddings:** Next, these documents are transformed into sparse or dense [embeddings](https://qdrant.tech/documentation/embeddings/), which are basically vector representations. These vectors capture the deep, underlying meaning of the text, allowing your system to perform smart, efficient searches and comparisons based on semantic meaning +- **Vector Database:** Once your documents are converted into these embeddings, they get stored in a vector database—essentially the powerhouse behind fast, accurate similarity searches. Here, we’ll see the capabilities of the Qdrant vector database. -Thank you for your feedback! 🙏 +### Retrieval Stage -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/authentication.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +- **User's Query:** Now we enter the retrieval phase. 
The user submits a query, and it’s time to match that query against the stored documents. +- **Embeddings:** Just like with the documents, the user’s query is converted into a sparse or dense embedding. This enables the system to compare the query's meaning with the meanings of the stored documents. +- **Vector Search:** The system searches for the most relevant documents by comparing the query’s embedding to those in the vector database, and it pulls up the closest matches. +- **Rerank:** Once the initial results are in, the reranking process kicks in to ensure you get the best results on top. We’ll be using **Cohere’s** rerank-english-v3.0 model, which excels at reordering English language documents to prioritize relevance. It can handle up to 4096 tokens, giving it plenty of context to work with. And if you’re dealing with multi-lingual data, don’t worry—Cohere’s got reranking models for other languages too. -On this page: +## Implementation -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/authentication.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Now it’s time to dive into the actual implementation. -× +### Setup -[Powered by](https://qdrant.tech/) +To follow along with this tutorial, you'll need a few key tools:: -<|page-49-lllmstxt|> -## web-ui-gsoc -- [Articles](https://qdrant.tech/articles/) -- Google Summer of Code 2023 - Web UI for Visualization and Exploration +- Python Client for Qdrant +- Cohere -[Back to Ecosystem](https://qdrant.tech/articles/ecosystem/) +Let’s install everything you need in one go using the Python package manager:: -# Google Summer of Code 2023 - Web UI for Visualization and Exploration +```jsx +pip install qdrant-client cohere +``` -Kartik Gupta +--- -· +Now, let’s bring in all the necessary components in one tidy block: -August 28, 2023 +```jsx +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams, PointStruct +import cohere +``` -![Google Summer of Code 2023 - Web UI for Visualization and Exploration](https://qdrant.tech/articles_data/web-ui-gsoc/preview/title.jpg) +--- -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#introduction) Introduction +Qdrant is a powerful vector similarity search engine that gives you a production-ready service with an easy-to-use API for storing, searching, and managing data. You can interact with Qdrant through a local or cloud setup, but since we’re working in Colab, let’s go with the cloud setup. -Hello everyone! My name is Kartik Gupta, and I am thrilled to share my coding journey as part of the Google Summer of Code 2023 program. This summer, I had the incredible opportunity to work on an exciting project titled “Web UI for Visualization and Exploration” for Qdrant, a vector search engine. In this article, I will take you through my experience, challenges, and achievements during this enriching coding journey. +### **Steps to Set Up Qdrant Cloud:** -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#project-overview) Project Overview +1. **Sign Up**: Head to Qdrant’s website and sign up for a cloud account using your email, Google, or GitHub credentials. +2. **Create Your First Cluster**: Once you’re in, navigate to the Overview section and follow the onboarding steps under Create First Cluster. +3. **Get Your API Key**: After creating your cluster, an API key will be generated. This key will let you interact with the cluster using the Python client. +4. 
**Check Your Cluster**: Your new cluster will appear under the Clusters section. From here, you’re all set to start interacting with your data. -Qdrant is a powerful vector search engine widely used for similarity search and clustering. However, it lacked a user-friendly web-based UI for data visualization and exploration. My project aimed to bridge this gap by developing a web-based user interface that allows users to easily interact with and explore their vector data. +Finally, under the Overview section, you’ll see the following code snippet: -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#milestones-and-achievements) Milestones and Achievements +![image7.png](/documentation/examples/reranking-semantic-search/image7.png) -The project was divided into six milestones, each focusing on a specific aspect of the web UI development. Let’s go through each of them and my achievements during the coding period. +Qdrant Overview Section -**1\. Designing a friendly UI on Figma** +Add your API keys. This will let your Python client connect to Qdrant and Cohere. -I started by designing the user interface on Figma, ensuring it was easy to use, visually appealing, and responsive on different devices. I focused on usability and accessibility to create a seamless user experience. ( [Figma Design](https://www.figma.com/file/z54cAcOErNjlVBsZ1DrXyD/Qdant?type=design&node-id=0-1&mode=design&t=Pu22zO2AMFuGhklG-0)) +```jsx +client = QdrantClient( + url="", + api_key="", +) -**2\. Building the layout** +print(client.get_collections()) +``` -The layout route served as a landing page with an overview of the application’s features and navigation links to other routes. +--- -**3\. Creating a view collection route** +Next, we’ll set up Cohere for reranking. Log in to your Cohere account, generate an API key, and add it like this:: -This route enabled users to view a list of collections available in the application. Users could click on a collection to see more details, including the data and vectors associated with it. +```jsx +co = cohere.Client("") +``` -![Collection Page](https://qdrant.tech/articles_data/web-ui-gsoc/collections-page.png) +--- -Collection Page +### Ingestion -**4\. Developing a data page with “find similar” functionality** +### There are three key parts to ingestion: Creating a Collection, Converting Documents to Embeddings, and Upserting the Data. Let’s break it down. -I implemented a data page where users could search for data and find similar data using a recommendation API. The recommendation API suggested similar data based on the Data’s selected ID, providing valuable insights. +### Creating a Collection -![Points Page](https://qdrant.tech/articles_data/web-ui-gsoc/points-page.png) +A collection is basically a named group of points (vectors with data) that you can search through. All the vectors in a collection need to have the same size and be compared using one distance metric. Here’s how to create one: -Points Page +```jsx +client.create_collection( + collection_name="basic-search-rerank", + vectors_config=VectorParams(size=1024, distance=Distance.DOT), +) +``` -**5\. Developing query editor page libraries** +--- -This milestone involved creating a query editor page that allowed users to write queries in a custom language. The editor provided syntax highlighting, autocomplete, and error-checking features for a seamless query writing experience. 
+Here, the vector size is set to 1024 to match our dense embeddings, and we’re using dot product as the distance metric—perfect for capturing the similarity between vectors, especially when they’re normalized. -![Query Editor Page](https://qdrant.tech/articles_data/web-ui-gsoc/console-page.png) +### Documents to Embeddings -Query Editor Page +Let’s set up some example data. Here’s a query and a few documents for demonstration: -**6\. Developing a route for visualizing vector data points** +```jsx +query = "What is the purpose of feature scaling in machine learning?" -This is done by the reduction of n-dimensional vector in 2-D points and they are displayed with their respective payloads. +documents = [ + "In machine learning, feature scaling is the process of normalizing the range of independent variables or features. The goal is to ensure that all features contribute equally to the model, especially in algorithms like SVM or k-nearest neighbors where distance calculations matter.", + + "Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. This is particularly important for gradient descent-based algorithms where features with larger scales could disproportionately impact the cost function.", + + "In data science, feature extraction is the process of transforming raw data into a set of engineered features that can be used in predictive models. Feature scaling is related but focuses on adjusting the values of these features.", + + "Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling as it ensures that features with larger numerical ranges don't dominate the learning process.", + + "One common data preprocessing technique in data science is feature selection. Unlike feature scaling, feature selection aims to reduce the number of input variables used in a model to avoid overfitting.", + + "Principal component analysis (PCA) is a dimensionality reduction technique used in data science to reduce the number of variables. PCA works best when data is scaled, as it relies on variance which can be skewed by features on different scales.", + + "Min-max scaling is a common feature scaling technique that usually transforms features to a fixed range [0, 1]. This method is useful when the distribution of data is not Gaussian.", + + "Standardization, or z-score normalization, is another technique that transforms features into a mean of 0 and a standard deviation of 1. 
This method is effective for data that follows a normal distribution.", + + "Feature scaling is critical when using algorithms that rely on distances, such as k-means clustering, as unscaled features can lead to misleading results.", + + "Scaling can improve the convergence speed of gradient descent algorithms by preventing issues with different feature scales affecting the cost function's landscape.", + + "In deep learning, feature scaling helps in stabilizing the learning process, allowing for better performance and faster convergence during training.", + + "Robust scaling is another method that uses the median and the interquartile range to scale features, making it less sensitive to outliers.", + + "When working with time series data, feature scaling can help in standardizing the input data, improving model performance across different periods.", + + "Normalization is often used in image processing to scale pixel values to a range that enhances model performance in computer vision tasks.", + + "Feature scaling is significant when features have different units of measurement, such as height in centimeters and weight in kilograms.", + + "In recommendation systems, scaling features such as user ratings can improve the model's ability to find similar users or items.", + + "Dimensionality reduction techniques, like t-SNE and UMAP, often require feature scaling to visualize high-dimensional data in lower dimensions effectively.", + + "Outlier detection techniques can also benefit from feature scaling, as they can be influenced by unscaled features that have extreme values.", + + "Data preprocessing steps, including feature scaling, can significantly impact the performance of machine learning models, making it a crucial part of the modeling pipeline.", + + "In ensemble methods, like random forests, feature scaling is not strictly necessary, but it can still enhance interpretability and comparison of feature importance.", + + "Feature scaling should be applied consistently across training and test datasets to avoid data leakage and ensure reliable model evaluation.", + + "In natural language processing (NLP), scaling can be useful when working with numerical features derived from text data, such as word counts or term frequencies.", + + "Log transformation is a technique that can be applied to skewed data to stabilize variance and make the data more suitable for scaling.", + + "Data augmentation techniques in machine learning may also include scaling to ensure consistency across training datasets, especially in computer vision tasks." +] +``` -![visualization-page](https://qdrant.tech/articles_data/web-ui-gsoc/visualization-page.png) +--- -Vector Visuliztion Page +We’ll generate embeddings for these documents using Cohere’s embed-english-v3.0 model, which produces 1024-dimensional vectors: -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#challenges-and-learning) Challenges and Learning +```python +model="embed-english-v3.0" -Throughout the project, I encountered a series of challenges that stretched my engineering capabilities and provided unique growth opportunities. From mastering new libraries and technologies to ensuring the user interface (UI) was both visually appealing and user-friendly, every obstacle became a stepping stone toward enhancing my skills as a developer. However, each challenge provided an opportunity to learn and grow as a developer. I acquired valuable experience in vector search and dimension reduction techniques. 
+doc_embeddings = co.embed(texts=documents, + model=model, + input_type="search_document", + embedding_types=['float']) +``` -The most significant learning for me was the importance of effective project management. Setting realistic timelines, collaborating with mentors, and staying proactive with feedback allowed me to complete the milestones efficiently. +--- -### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#technical-learning-and-skill-development) Technical Learning and Skill Development +This code taps into the power of the Cohere API to generate embeddings for your list of documents. It uses the embed-english-v3.0 model, sets the input type to "search_document," and asks for the embeddings in float format. The result? A set of dense embeddings, each one representing the deep semantic meaning of your documents. These embeddings will be stored in doc_embeddings, ready for action. -One of the most significant aspects of this journey was diving into the intricate world of vector search and dimension reduction techniques. These areas, previously unfamiliar to me, required rigorous study and exploration. Learning how to process vast amounts of data efficiently and extract meaningful insights through these techniques was both challenging and rewarding. +### Upsert Data -### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#effective-project-management) Effective Project Management +We need to transform those dense embeddings into a format Qdrant can work with, and that’s where Points come in. Points are the building blocks of Qdrant—they’re records made up of a vector (the embedding) and an optional payload (like your document text). -Undoubtedly, the most impactful lesson was the art of effective project management. I quickly grasped the importance of setting realistic timelines and goals. Collaborating closely with mentors and maintaining proactive communication proved indispensable. This approach enabled me to navigate the complex development process and successfully achieve the project’s milestones. +Here’s how we convert those embeddings into Points: -### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#overcoming-technical-challenges) Overcoming Technical Challenges +```python +points = [] +for idx, (embedding, doc) in enumerate(zip(doc_embeddings.embeddings.float_, documents)): + point = PointStruct( + id=idx, + vector=embedding, + payload={"document": doc} + ) + points.append(point) +``` -#### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#autocomplete-feature-in-console) Autocomplete Feature in Console +--- -One particularly intriguing challenge emerged while working on the autocomplete feature within the console. Finding a solution was proving elusive until a breakthrough came from an unexpected direction. My mentor, Andrey, proposed creating a separate module that could support autocomplete based on OpenAPI for our custom language. This ingenious approach not only resolved the issue but also showcased the power of collaborative problem-solving. +What’s happening here? We’re building a list of Points from the embeddings: -#### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#optimization-with-web-workers) Optimization with Web Workers +- First, we start with an empty list. +- Then, we loop through both **doc_embeddings** and **documents** at the same time using enumerate() to grab the index (idx) along the way. +- For each pair (an embedding and its corresponding document), we create a PointStruct. Each point gets: + - An id (from idx). 
+ - A vector (the embedding). + - A payload (the actual document text). +- Each Point is added to our list. -The high-processing demands of vector reduction posed another significant challenge. Initially, this task was straining browsers and causing performance issues. The solution materialized in the form of web workers—an independent processing instance that alleviated the strain on browsers. However, a new question arose: how to terminate these workers effectively? With invaluable insights from my mentor, I gained a deeper understanding of web worker dynamics and successfully tackled this challenge. +Once that’s done, it’s time to send these Points into your Qdrant collection with the upsert() function: -#### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#console-integration-complexity) Console Integration Complexity +```python +operation_info = client.upsert( + collection_name="basic-search-rerank", + points=points +) +``` -Integrating the console interaction into the application presented multifaceted challenges. Crafting a custom language in Monaco, parsing text to make API requests, and synchronizing the entire process demanded meticulous attention to detail. Overcoming these hurdles was a testament to the complexity of real-world engineering endeavours. +--- -#### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#codelens-multiplicity-issue) Codelens Multiplicity Issue +### Now your embeddings are all set in Qdrant, ready to power your search. -An unexpected issue cropped up during the development process: the codelen (run button) registered multiple times, leading to undesired behaviour. This hiccup underscored the importance of thorough testing and debugging, even in seemingly straightforward features. +### Retrieval -### [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#key-learning-points) Key Learning Points +The first few steps here mirror what we did during ingestion—just like before, we need to convert the query into an embedding: -Amidst these challenges, I garnered valuable insights that have significantly enriched my engineering prowess: +```python +query_embeddings = co.embed(texts=[query], + model=model, + input_type="search_query", + embedding_types=['float']) +``` -**Vector Reduction Techniques**: Navigating the realm of vector reduction techniques provided a deep understanding of how to process and interpret data efficiently. This knowledge opens up new avenues for developing data-driven applications in the future. +--- -**Web Workers Efficiency**: Mastering the intricacies of web workers not only resolved performance concerns but also expanded my repertoire of optimization strategies. This newfound proficiency will undoubtedly find relevance in various future projects. +After that, we'll move on to retrieve results using vector search and apply reranking on the results. This two-stage process is super efficient because we’re grabbing a small set of the most relevant documents first, which is much faster than reranking a huge dataset. -**Monaco Editor and UI Frameworks**: Working extensively with the Monaco Editor, Material-UI (MUI), and Vite enriched my familiarity with these essential tools. I honed my skills in integrating complex UI components seamlessly into applications. +### Vector Search -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#areas-for-improvement-and-future-enhancements) Areas for Improvement and Future Enhancements +This snippet grabs the top 10 most relevant points from your Qdrant collection using the query embedding. 
-While reflecting on this transformative journey, I recognize several areas that offer room for improvement and future enhancements: +```python +search_result = client.query_points( + collection_name="basic-search-rerank", query=query_embeddings.embeddings.float_[0], limit=10 +).points +``` -1. Enhanced Autocomplete: Further refining the autocomplete feature to support key-value suggestions in JSON structures could greatly enhance the user experience. +--- -2. Error Detection in Console: Integrating the console’s error checker with OpenAPI could enhance its accuracy in identifying errors and offering precise suggestions for improvement. +Here’s how it works: we use the query_points method to search within the "basic-search-rerank" collection. It compares the query embedding (the first embedding in query_embeddings) against all the document embeddings, pulling up the 10 closest matches. The matching points get stored in search_result. -3. Expanded Vector Visualization: Exploring additional visualization methods and optimizing their performance could elevate the utility of the vector visualization route. +And here’s a sneak peek at what you’ll get from the vector search: +| **ID** | **Document** | **Score** | +| --- | --- | --- | +| 0 | In machine learning, feature scaling is the process of normalizing the range of independent... | 0.71 | +| 10 | In deep learning, feature scaling helps stabilize the learning process, allowing for... | 0.69 | +| 1 | Feature scaling is commonly used in data preprocessing to ensure that features are on the... | 0.68 | +| 23 | Data augmentation techniques in machine learning may also include scaling to ensure... | 0.64 | +| 3 | Unsupervised learning algorithms, such as clustering methods, may benefit from feature... | 0.64 | +| 12 | When working with time series data, feature scaling can help standardize the input... | 0.62 | +| 19 | In ensemble methods, like random forests, feature scaling is not strictly necessary... | 0.61 | +| 21 | In natural language processing (NLP), scaling can be useful when working with numerical... | 0.61 | +| 20 | Feature scaling should be applied consistently across training and test datasets... | 0.61 | +| 18 | Data preprocessing steps, including feature scaling, can significantly impact the performance... | 0.61 | -## [Anchor](https://qdrant.tech/articles/web-ui-gsoc/\#conclusion) Conclusion +From the looks of it, the data pulled up is highly relevant to your query. Now, with this solid base of results, it’s time to refine them further with reranking. -Participating in the Google Summer of Code 2023 and working on the “Web UI for Visualization and Exploration” project has been an immensely rewarding experience. I am grateful for the opportunity to contribute to Qdrant and develop a user-friendly interface for vector data exploration. +### Rerank -I want to express my gratitude to my mentors and the entire Qdrant community for their support and guidance throughout this journey. This experience has not only improved my coding skills but also instilled a deeper passion for web development and data analysis. +This code takes the documents from the search results and reranks them based on your query, making sure you get the most relevant ones right at the top. -As my coding journey continues beyond this project, I look forward to applying the knowledge and experience gained here to future endeavours. I am excited to see how Qdrant evolves with the newly developed web UI and how it positively impacts users worldwide. 
+First, we pull out the documents from the search results. Then we use Cohere’s rerank model to refine these results: -Thank you for joining me on this coding adventure, and I hope to share more exciting projects in the future! Happy coding! +```python +document_list = [point.payload['document'] for point in search_result] -##### Was this page useful? +rerank_results = co.rerank( + model="rerank-english-v3.0", + query=query, + documents=document_list, + top_n=5, +) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +--- -Thank you for your feedback! 🙏 +What’s happening here? In the first line, we’re building a list of documents by grabbing the 'document' field from each search result point. Then, we pass this list, along with the original query, to Cohere’s rerank method. Using the **rerank-english-v3.0** model, it reshuffles the documents and gives you back the top 5, ranked by their relevance to the query. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/web-ui-gsoc.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Here’s the reranked result table, with the new order and their relevance scores: -On this page: +| **Index** | **Document** | **Relevance Score** | +| --- | --- | --- | +| 0 | In machine learning, feature scaling is the process of normalizing the range of independent variables or features. | 0.99995166 | +| 1 | Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. | 0.99929035 | +| 10 | In deep learning, feature scaling helps stabilize the learning process, allowing for better performance and faster convergence. | 0.998675 | +| 23 | Data augmentation techniques in machine learning may also include scaling to ensure consistency across training datasets. | 0.998043 | +| 3 | Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling. | 0.9979967 | -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/web-ui-gsoc.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +As you can see, the reranking did its job. Positions for documents 10 and 1 got swapped, showing that the reranker has fine-tuned the results to give you the most relevant content at the top. -× +## Conclusion -[Powered by](https://qdrant.tech/) +Reranking is a powerful way to boost the relevance and precision of search results in RAG systems. By combining Qdrant’s vector search capabilities with tools like Cohere’s Rerank model or ColBERT, you can refine search outputs, ensuring the most relevant information rises to the top. -<|page-50-lllmstxt|> -## food-discovery-demo -- [Articles](https://qdrant.tech/articles/) -- Food Discovery Demo +This guide demonstrated how reranking enhances precision without sacrificing recall, delivering sharper, context-rich results. With these tools, you’re equipped to create search systems that provide meaningful and impactful user experiences. Start implementing reranking to take your applications to the next level! 
-[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +<|page-85-lllmstxt|> +# Role Management -# Food Discovery Demo +> 💡 You can access this in **Access Management > User & Role Management** *if available see [this page for details](/documentation/cloud-rbac/).* -Kacper Ɓukawski +A **Role** contains a set of **permissions** that define the ability to perform or control specific actions in Qdrant Cloud. Permissions are accessible through the Permissions tab in the Role Details page and offer fine-grained access control, logically grouped for easy identification. -· +## Built-In Roles -September 05, 2023 +Qdrant Cloud includes some built-in roles for common use-cases. The permissions for these built-in roles cannot be changed. -![Food Discovery Demo](https://qdrant.tech/articles_data/food-discovery-demo/preview/title.jpg) +There are three types:  -Not every search journey begins with a specific destination in mind. Sometimes, you just want to explore and see what’s out there and what you might like. -This is especially true when it comes to food. You might be craving something sweet, but you don’t know what. You might be also looking for a new dish to try, -and you just want to see the options available. In these cases, it’s impossible to express your needs in a textual query, as the thing you are looking for is not -yet defined. Qdrant’s semantic search for images is useful when you have a hard time expressing your tastes in words. +- The **Base Role** is assigned to all users, and provides the minimum privileges required to access Qdrant Cloud. +- The **Admin Role**  has all available permissions, except for account write permissions. +- The **Owner Role** has all available permissions assigned, including account write permissions. There can only be one Owner per account currently. -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#general-architecture) General architecture +![image.png](/documentation/cloud/role-based-access-control/built-in-roles.png) -We are happy to announce a refreshed version of our [Food Discovery Demo](https://food-discovery.qdrant.tech/). This time available as an open source project, -so you can easily deploy it on your own and play with it. If you prefer to dive into the source code directly, then feel free to check out the [GitHub repository](https://github.com/qdrant/demo-food-discovery/). -Otherwise, read on to learn more about the demo and how it works! +## Custom Roles -In general, our application consists of three parts: a [FastAPI](https://fastapi.tiangolo.com/) backend, a [React](https://react.dev/) frontend, and -a [Qdrant](https://qdrant.tech/) instance. The architecture diagram below shows how these components interact with each other: +An authorized user can create their own custom roles with specific sets of permissions, giving them more control over who has what access to which resource. -![Archtecture diagram](https://qdrant.tech/articles_data/food-discovery-demo/architecture-diagram.png) +![image.png]( /documentation/cloud/role-based-access-control/custom-roles.png) -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#why-did-we-use-a-clip-model) Why did we use a CLIP model? +### Creating a Custom Role -CLIP is a neural network that can be used to encode both images and texts into vectors. And more importantly, both images and texts are vectorized into the same -latent space, so we can compare them directly. 
This lets you perform semantic search on images using text queries and the other way around. For example, if -you search for “flat bread with toppings”, you will get images of pizza. Or if you search for “pizza”, you will get images of some flat bread with toppings, even -if they were not labeled as “pizza”. This is because CLIP embeddings capture the semantics of the images and texts and can find the similarities between them -no matter the wording. +To create a new custom role, click on the **Add** button at the top-right corner of the **Custom Roles** list. -![CLIP model](https://qdrant.tech/articles_data/food-discovery-demo/clip-model.png) +- **Role Name**: Must be unique across roles. +- **Role Description**: Brief description of the role’s purpose. -CLIP is available in many different ways. We used the pretrained `clip-ViT-B-32` model available in the [Sentence-Transformers](https://www.sbert.net/examples/applications/image-search/README.html) -library, as this is the easiest way to get started. +Once created, the new role will appear under the **Custom Roles** section in the navigation. -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#the-dataset) The dataset +![image.png](/documentation/cloud/role-based-access-control/create-custom-role.png) -The demo is based on the [Wolt](https://wolt.com/) dataset. It contains over 2M images of dishes from different restaurants along with some additional metadata. -This is how a payload for a single dish looks like: +### Editing a Custom Role -```json -{ - "cafe": { - "address": "VGX7+6R2 Vecchia Napoli, Valletta", - "categories": ["italian", "pasta", "pizza", "burgers", "mediterranean"], - "location": {"lat": 35.8980154, "lon": 14.5145106}, - "menu_id": "610936a4ee8ea7a56f4a372a", - "name": "Vecchia Napoli Is-Suq Tal-Belt", - "rating": 9, - "slug": "vecchia-napoli-skyparks-suq-tal-belt" - }, - "description": "Tomato sauce, mozzarella fior di latte, crispy guanciale, Pecorino Romano cheese and a hint of chilli", - "image": "https://wolt-menu-images-cdn.wolt.com/menu-images/610936a4ee8ea7a56f4a372a/005dfeb2-e734-11ec-b667-ced7a78a5abd_l_amatriciana_pizza_joel_gueller1.jpeg", - "name": "L'Amatriciana" -} +To update a specific role's permissions, select it from the list and click on the **Permissions** tab. Here, you'll find logically grouped options that are easy to identify and edit as needed. Once you've made your changes, save them to apply the updated permissions to the role. -``` +![image.png](/documentation/cloud/role-based-access-control/update-permission.png) -Processing this amount of records takes some time, so we precomputed the CLIP embeddings, stored them in a Qdrant collection and exported the collection as -a snapshot. You may [download it here](https://storage.googleapis.com/common-datasets-snapshots/wolt-clip-ViT-B-32.snapshot). +### Renaming, Deleting and Duplicating a Custom Role -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#different-search-modes) Different search modes +Each custom role can be renamed, duplicated or deleted via the action buttons located to the right of the role title bar. -The FastAPI backend [exposes just a single endpoint](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/main.py#L37), -however it handles multiple scenarios. Let’s dive into them one by one and understand why they are needed. +- **Rename**: Opens a dialog allowing users to update both the role name and description. 
+- **Delete**: Triggers a confirmation prompt to confirm the deletion. Once confirmed, this action is irreversible. Any users assigned to the deleted role will automatically be unassigned from it. +- **Duplicate:** Opens a dialog asking for a confirmation and also allowing users to view the list of permissions that will be assigned to the duplicated role -### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#cold-start) Cold start +![image.png](/documentation/cloud/role-based-access-control/role-actions.png) -Recommendation systems struggle with a cold start problem. When a new user joins the system, there is no data about their preferences, so it’s hard to recommend -anything. The same applies to our demo. When you open it, you will see a random selection of dishes, and it changes every time you refresh the page. Internally, -the demo [chooses some random points](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L70) in the -vector space. +<|page-86-lllmstxt|> +# Build Your First Semantic Search Engine in 5 Minutes -![Random points selection](https://qdrant.tech/articles_data/food-discovery-demo/random-results.png) +| Time: 5 - 15 min | Level: Beginner | | | +| --- | ----------- | ----------- |----------- | -That procedure should result in returning diverse results, so we have a higher chance of showing something interesting to the user. +

-### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#textual-search) Textual search +## Overview -Since the demo suffers from the cold start problem, we implemented a textual search mode that is useful to start exploring the data. You can type in any text query -by clicking a search icon in the top right corner. The demo will use the CLIP model to encode the query into a vector and then search for the nearest neighbors -in the vector space. +If you are new to vector databases, this tutorial is for you. In 5 minutes you will build a semantic search engine for science fiction books. After you set it up, you will ask the engine about an impending alien threat. Your creation will recommend books as preparation for a potential space attack. -![Random points selection](https://qdrant.tech/articles_data/food-discovery-demo/textual-search.png) +Before you begin, you need to have a [recent version of Python](https://www.python.org/downloads/) installed. If you don't know how to run this code in a virtual environment, follow Python documentation for [Creating Virtual Environments](https://docs.python.org/3/tutorial/venv.html#creating-virtual-environments) first. -This is implemented as [a group search query to Qdrant](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L44). -We didn’t use a simple search, but performed grouping by the restaurant to get more diverse results. [Search groups](https://qdrant.tech/documentation/concepts/search/#search-groups) -is a mechanism similar to `GROUP BY` clause in SQL, and it’s useful when you want to get a specific number of result per group (in our case just one). +This tutorial assumes you're in the bash shell. Use the Python documentation to activate a virtual environment, with commands such as: -```python -import settings +```bash +source tutorial-env/bin/activate +``` -# Encode query into a vector, model is an instance of -# sentence_transformers.SentenceTransformer that loaded CLIP model -query_vector = model.encode(query).tolist() +## 1. Installation -# Search for nearest neighbors, client is an instance of -# qdrant_client.QdrantClient that has to be initialized before -response = client.search_groups( - settings.QDRANT_COLLECTION, - query_vector=query_vector, - group_by=settings.GROUP_BY_FIELD, - limit=search_query.limit, -) +You need to process your data so that the search engine can work with it. The [Sentence Transformers](https://www.sbert.net/) framework gives you access to common Large Language Models that turn raw data into embeddings. +```bash +pip install -U sentence-transformers ``` -### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#exploring-the-results) Exploring the results +Once encoded, this data needs to be kept somewhere. Qdrant lets you store data as embeddings. You can also use Qdrant to run search queries against this data. This means that you can ask the engine to give you relevant answers that go way beyond keyword matching. -The main feature of the demo is the ability to explore the space of the dishes. You can click on any of them to see more details, but first of all you can like or dislike it, -and the demo will update the search results accordingly. 
+```bash +pip install -U qdrant-client +``` -![Recommendation results](https://qdrant.tech/articles_data/food-discovery-demo/recommendation-results.png) + -#### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#negative-feedback-only) Negative feedback only +### Import the models -Qdrant [Recommendation API](https://qdrant.tech/documentation/concepts/search/#recommendation-api) needs at least one positive example to work. However, in our demo -we want to be able to provide only negative examples. This is because we want to be able to say “I don’t like this dish” without having to like anything first. -To achieve this, we use a trick. We negate the vectors of the disliked dishes and use their mean as a query. This way, the disliked dishes will be pushed away -from the search results. **This works because the cosine distance is based on the angle between two vectors, and the angle between a vector and its negation is 180 degrees.** +Once the two main frameworks are defined, you need to specify the exact models this engine will use. -![CLIP model](https://qdrant.tech/articles_data/food-discovery-demo/negated-vector.png) +```python +from qdrant_client import models, QdrantClient +from sentence_transformers import SentenceTransformer +``` -Food Discovery Demo [implements that trick](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L122) -by calling Qdrant twice. Initially, we use the [Scroll API](https://qdrant.tech/documentation/concepts/points/#scroll-points) to find disliked items, -and then calculate a negated mean of all their vectors. That allows using the [Search Groups API](https://qdrant.tech/documentation/concepts/search/#search-groups) -to find the nearest neighbors of the negated mean vector. +The [Sentence Transformers](https://www.sbert.net/index.html) framework contains many embedding models. We'll take [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) as it has a good balance between speed and embedding quality for this tutorial. ```python -import numpy as np - -# Retrieve the disliked points based on their ids -disliked_points, _ = client.scroll( - settings.QDRANT_COLLECTION, - scroll_filter=models.Filter( - must=[\ - models.HasIdCondition(has_id=search_query.negative),\ - ] - ), - with_vectors=True, -) +encoder = SentenceTransformer("all-MiniLM-L6-v2") +``` -# Calculate a mean vector of disliked points -disliked_vectors = np.array([point.vector for point in disliked_points]) -mean_vector = np.mean(disliked_vectors, axis=0) -negated_vector = -mean_vector +## 2. Add the dataset -# Search for nearest neighbors of the negated mean vector -response = client.search_groups( - settings.QDRANT_COLLECTION, - query_vector=negated_vector.tolist(), - group_by=settings.GROUP_BY_FIELD, - limit=search_query.limit, -) +[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) will encode the data you provide. Here you will list all the science fiction books in your library. Each book has metadata, a name, author, publication year and a short description. +```python +documents = [ + { + "name": "The Time Machine", + "description": "A man travels through time and witnesses the evolution of humanity.", + "author": "H.G. 
Wells", + "year": 1895, + }, + { + "name": "Ender's Game", + "description": "A young boy is trained to become a military leader in a war against an alien race.", + "author": "Orson Scott Card", + "year": 1985, + }, + { + "name": "Brave New World", + "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.", + "author": "Aldous Huxley", + "year": 1932, + }, + { + "name": "The Hitchhiker's Guide to the Galaxy", + "description": "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.", + "author": "Douglas Adams", + "year": 1979, + }, + { + "name": "Dune", + "description": "A desert planet is the site of political intrigue and power struggles.", + "author": "Frank Herbert", + "year": 1965, + }, + { + "name": "Foundation", + "description": "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.", + "author": "Isaac Asimov", + "year": 1951, + }, + { + "name": "Snow Crash", + "description": "A futuristic world where the internet has evolved into a virtual reality metaverse.", + "author": "Neal Stephenson", + "year": 1992, + }, + { + "name": "Neuromancer", + "description": "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.", + "author": "William Gibson", + "year": 1984, + }, + { + "name": "The War of the Worlds", + "description": "A Martian invasion of Earth throws humanity into chaos.", + "author": "H.G. Wells", + "year": 1898, + }, + { + "name": "The Hunger Games", + "description": "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.", + "author": "Suzanne Collins", + "year": 2008, + }, + { + "name": "The Andromeda Strain", + "description": "A deadly virus from outer space threatens to wipe out humanity.", + "author": "Michael Crichton", + "year": 1969, + }, + { + "name": "The Left Hand of Darkness", + "description": "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.", + "author": "Ursula K. Le Guin", + "year": 1969, + }, + { + "name": "The Three-Body Problem", + "description": "Humans encounter an alien civilization that lives in a dying system.", + "author": "Liu Cixin", + "year": 2008, + }, +] ``` -#### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#positive-and-negative-feedback) Positive and negative feedback +## 3. Define storage location -Since the [Recommendation API](https://qdrant.tech/documentation/concepts/search/#recommendation-api) requires at least one positive example, we can use it only when -the user has liked at least one dish. We could theoretically use the same trick as above and negate the disliked dishes, but it would be a bit weird, as Qdrant has -that feature already built-in, and we can call it just once to do the job. It’s always better to perform the search server-side. Thus, in this case [we just call\\ -the Qdrant server with a list of positive and negative examples](https://github.com/qdrant/demo-food-discovery/blob/6b49e11cfbd6412637d527cdd62fe9b9f74ac699/backend/discovery.py#L166), -so it can find some points which are close to the positive examples and far from the negative ones. +You need to tell Qdrant where to store embeddings. This is a basic demo, so your local computer will use its memory as temporary storage. 
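+The quickest way to follow along is the in-memory client shown below. If you would rather keep the vectors around between runs, the same client can also write to a local folder or talk to a Qdrant server you run yourself — a small sketch of those alternatives, assuming a writable local path and, for the server option, a Qdrant instance listening on the default port:
+
+```python
+from qdrant_client import QdrantClient
+
+# Persist the data in a local folder instead of process memory
+client = QdrantClient(path="./my_books_db")
+
+# Or connect to a Qdrant server running locally (e.g. started with Docker)
+# client = QdrantClient(url="http://localhost:6333")
+```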
```python -response = client.recommend_groups( - settings.QDRANT_COLLECTION, - positive=search_query.positive, - negative=search_query.negative, - group_by=settings.GROUP_BY_FIELD, - limit=search_query.limit, -) - +client = QdrantClient(":memory:") ``` -From the user perspective nothing changes comparing to the previous case. - -### [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#location-based-search) Location-based search +## 4. Create a collection -Last but not least, location plays an important role in the food discovery process. You are definitely looking for something you can find nearby, not on the other -side of the globe. Therefore, your current location can be toggled as a filtering condition. You can enable it by clicking on “Find near me” icon -in the top right. This way you can find the best pizza in your neighborhood, not in the whole world. Qdrant [geo radius filter](https://qdrant.tech/documentation/concepts/filtering/#geo-radius) is a perfect choice for this. It lets you -filter the results by distance from a given point. +All data in Qdrant is organized by collections. In this case, you are storing books, so we are calling it `my_books`. ```python -from qdrant_client import models - -# Create a geo radius filter -query_filter = models.Filter( - must=[\ - models.FieldCondition(\ - key="cafe.location",\ - geo_radius=models.GeoRadius(\ - center=models.GeoPoint(\ - lon=location.longitude,\ - lat=location.latitude,\ - ),\ - radius=location.radius_km * 1000,\ - ),\ - )\ - ] +client.create_collection( + collection_name="my_books", + vectors_config=models.VectorParams( + size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model + distance=models.Distance.COSINE, + ), ) - ``` -Such a filter needs [a payload index](https://qdrant.tech/documentation/concepts/indexing/#payload-index) to work efficiently, and it was created on a collection -we used to create the snapshot. When you import it into your instance, the index will be already there. +- The `vector_size` parameter defines the size of the vectors for a specific collection. If their size is different, it is impossible to calculate the distance between them. 384 is the encoder output dimensionality. You can also use model.get_sentence_embedding_dimension() to get the dimensionality of the model you are using. + +- The `distance` parameter lets you specify the function used to measure the distance between two points. -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#using-the-demo) Using the demo -The Food Discovery Demo [is available online](https://food-discovery.qdrant.tech/), but if you prefer to run it locally, you can do it with Docker. The -[README](https://github.com/qdrant/demo-food-discovery/blob/main/README.md) describes all the steps more in detail, but here is a quick start: +## 5. Upload data to collection -```bash -git clone git@github.com:qdrant/demo-food-discovery.git -cd demo-food-discovery -# Create .env file based on .env.example -docker-compose up -d +Tell the database to upload `documents` to the `my_books` collection. This will give each record an id and a payload. The payload is just the metadata from the dataset. 
+```python +client.upload_points( + collection_name="my_books", + points=[ + models.PointStruct( + id=idx, vector=encoder.encode(doc["description"]).tolist(), payload=doc + ) + for idx, doc in enumerate(documents) + ], +) ``` -The demo will be available at `http://localhost:8001`, but you won’t be able to search anything until you [import the snapshot into your Qdrant\\ -instance](https://qdrant.tech/documentation/concepts/snapshots/#recover-via-api). If you don’t want to bother with hosting a local one, you can use the [Qdrant\\ -Cloud](https://cloud.qdrant.io/) cluster. 4 GB RAM is enough to load all the 2 million entries. +## 6. Ask the engine a question -## [Anchor](https://qdrant.tech/articles/food-discovery-demo/\#fork-and-reuse) Fork and reuse +Now that the data is stored in Qdrant, you can ask it questions and receive semantically relevant results. -Our demo is completely open-source. Feel free to fork it, update with your own dataset or adapt the application to your use case. Whether you’re looking to understand the mechanics -of semantic search or to have a foundation to build a larger project, this demo can serve as a starting point. Check out the [Food Discovery Demo repository](https://github.com/qdrant/demo-food-discovery/) to get started. If you have any questions, feel free to reach out [through Discord](https://qdrant.to/discord). +```python +hits = client.query_points( + collection_name="my_books", + query=encoder.encode("alien invasion").tolist(), + limit=3, +).points -##### Was this page useful? +for hit in hits: + print(hit.payload, "score:", hit.score) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +**Response:** -Thank you for your feedback! 🙏 +The search engine shows three of the most likely responses that have to do with the alien invasion. Each of the responses is assigned a score to show how close the response is to the original inquiry. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/food-discovery-demo.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```text +{'name': 'The War of the Worlds', 'description': 'A Martian invasion of Earth throws humanity into chaos.', 'author': 'H.G. Wells', 'year': 1898} score: 0.570093257022374 +{'name': "The Hitchhiker's Guide to the Galaxy", 'description': 'A comedic science fiction series following the misadventures of an unwitting human and his alien friend.', 'author': 'Douglas Adams', 'year': 1979} score: 0.5040468703143637 +{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.45902943411768216 +``` -On this page: +### Narrow down the query -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/food-discovery-demo.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +How about the most recent book from the early 2000s? 
-× +```python +hits = client.query_points( + collection_name="my_books", + query=encoder.encode("alien invasion").tolist(), + query_filter=models.Filter( + must=[models.FieldCondition(key="year", range=models.Range(gte=2000))] + ), + limit=1, +).points -[Powered by](https://qdrant.tech/) +for hit in hits: + print(hit.payload, "score:", hit.score) +``` -<|page-51-lllmstxt|> -## capacity-planning -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Capacity Planning +**Response:** -# [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#capacity-planning) Capacity Planning +The query has been narrowed down to one result from 2008. -When setting up your cluster, you’ll need to figure out the right balance of **RAM** and **disk storage**. The best setup depends on a few things: +```text +{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.45902943411768216 +``` -- How many vectors you have and their dimensions. -- The amount of payload data you’re using and their indexes. -- What data you want to store in memory versus on disk. -- Your cluster’s replication settings. -- Whether you’re using quantization and how you’ve set it up. +## Next Steps -## [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#calculating-ram-size) Calculating RAM size +Congratulations, you have just created your very first search engine! Trust us, the rest of Qdrant is not that complicated, either. For your next tutorial you should try building an actual [Neural Search Service with a complete API and a dataset](/documentation/tutorials/neural-search/). -You should store frequently accessed data in RAM for faster retrieval. If you want to keep all vectors in memory for optimal performance, you can use this rough formula for estimation: +<|page-87-lllmstxt|> +# Creating a Hybrid Cloud Environment -```text -memory_size = number_of_vectors * vector_dimension * 4 bytes * 1.5 +The following instruction set will show you how to properly set up a **Qdrant cluster** in your **Hybrid Cloud Environment**. -``` +You can also watch a video demo on how to set up a Hybrid Cloud Environment: +

-At the end, we multiply everything by 1.5. This extra 50% accounts for metadata (such as indexes and point versions) and temporary segments created during optimization. +To learn how Hybrid Cloud works, [read the overview document](/documentation/hybrid-cloud/). -Let’s say you want to store 1 million vectors with 1024 dimensions: +## Prerequisites -```text -memory_size = 1,000,000 * 1024 * 4 bytes * 1.5 +- **Kubernetes cluster:** To create a Hybrid Cloud Environment, you need a [standard compliant](https://www.cncf.io/training/certification/software-conformance/) Kubernetes cluster. You can run this cluster in any cloud, on-premise or edge environment, with distributions that range from AWS EKS to VMWare vSphere. See [Deployment Platforms](/documentation/hybrid-cloud/platform-deployment-options/) for more information. +- **Storage:** For storage, you need to set up the Kubernetes cluster with a Container Storage Interface (CSI) driver that provides block storage. For vertical scaling, the CSI driver needs to support volume expansion. The `StorageClass` needs to be created beforehand. For backups and restores, the driver needs to support CSI snapshots and restores. The `VolumeSnapshotClass` needs to be created beforehand. See [Deployment Platforms](/documentation/hybrid-cloud/platform-deployment-options/) for more information. -``` + -The memory\_size is approximately 6,144,000,000 bytes, or about 5.72 GB. +- **Kubernetes nodes:** You need enough CPU and memory capacity for the Qdrant database clusters that you create. A small amount of resources is also needed for the Hybrid Cloud control plane components. Qdrant Hybrid Cloud supports x86_64 and ARM64 architectures. +- **Permissions:** To install the Qdrant Cloud Agent you need to have `cluster-admin` access in your Kubernetes cluster. +- **Connection:** The Qdrant Cloud Agent in your cluster needs to be able to connect to Qdrant Cloud. It will create an outgoing connection to `grpc.cloud.qdrant.io` and `api.cloud.qdrant.io` on port `443`. +- **Locations:** By default, the Qdrant services (like Qdrant Cloud Agent, Operator and Cluster-Manager) pulls Helm charts and container images from `registry.cloud.qdrant.io`. The Qdrant database container image is pulled from `docker.io`. For a complete list see [Mirroring images and charts](#mirroring-images-and-charts). -Depending on the use case, large datasets can benefit from reduced memory requirements via [quantization](https://qdrant.tech/documentation/guides/quantization/). +> **Note:** You can also mirror these images and charts into your own registry and pull them from there. -## [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#calculating-payload-size) Calculating payload size +### CLI tools -This is always different. The size of the payload depends on the [structure and content of your data](https://qdrant.tech/documentation/concepts/payload/#payload-types). For instance: +During the onboarding, you will need to deploy the Qdrant Cloud Agent using Helm. Make sure you have the following tools installed: -- **Text fields** consume space based on length and encoding (e.g. a large chunk of text vs a few words). -- **Floats** have fixed sizes of 8 bytes for `int64` or `float64`. -- **Boolean fields** typically consume 1 byte. +* [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) +* [helm](https://helm.sh/docs/intro/install/) -Calculating total payload size is similar to vectors. We have to multiply it by 1.5 for back-end indexing processes. 
+You will need to have access to the Kubernetes cluster with `kubectl` and `helm` configured to connect to it. Please refer the documentation of your Kubernetes distribution for more information. -```text -total_payload_size = number_of_points * payload_size * 1.5 +## Installation -``` +1. To set up Hybrid Cloud, open the Qdrant Cloud Console at [cloud.qdrant.io](https://cloud.qdrant.io). On the dashboard, select **Hybrid Cloud**. -Let’s say you want to store 1 million points with JSON payloads of 5KB: +2. Before creating your first Hybrid Cloud Environment, you have to provide billing information and accept the Hybrid Cloud license agreement. The installation wizard will guide you through the process. -```text -total_payload_size = 1,000,000 * 5KB * 1.5 +> **Note:** You will only be charged for the Qdrant cluster you create in a Hybrid Cloud Environment, but not for the environment itself. -``` +3. Now you can specify the following: -The total\_payload\_size is approximately 5,000,000 bytes, or about 4.77 GB. +- **Name:** A name for the Hybrid Cloud Environment +- **Kubernetes Namespace:** The Kubernetes namespace for the services (like agent and operator). Once you select a namespace, you can't change it. -## [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#choosing-disk-over-ram) Choosing disk over RAM +You can also configure the StorageClass and VolumeSnapshotClass to use for the Qdrant databases, if you want to deviate from the default settings of your cluster. -For optimal performance, you should store only frequently accessed data in RAM. The rest should be offloaded to the disk. For example, extra payload fields that you don’t use for filtering can be stored on disk. +![Create Hybrid Cloud Environment](/documentation/cloud/hybrid_cloud_env_create.png) -Only [indexed fields](https://qdrant.tech/documentation/concepts/indexing/#payload-index) should be stored in RAM. You can read more about payload storage in the [Storage](https://qdrant.tech/documentation/concepts/storage/#payload-storage) section. +4. You can then enter the YAML configuration for your Kubernetes operator. Qdrant supports a specific list of configuration options, as described in the [Qdrant Operator configuration](/documentation/hybrid-cloud/operator-configuration/) section. -### [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#storage-focused-configuration) Storage-focused configuration +5. (Optional) If you have special requirements for any of the following, activate the **Show advanced configuration** option: -If your priority is to handle large volumes of vectors with average search latency, it’s recommended to configure [memory-mapped (mmap) storage](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage). In this setup, vectors are stored on disk in memory-mapped files, while only the most frequently accessed vectors are cached in RAM. +- If you use a proxy to connect from your infrastructure to the Qdrant Cloud API, you can specify the proxy URL, credentials and cetificates. +- Container registry URL for Qdrant services (like Agent, Operator, Cluster-manager and monitoring stack) images. The default is . +- Helm chart repository URL for the Qdrant services. The default is . +- An optional secret with credentials to access your own container registry. +- Log level for the operator and agent. +- Node selectors and tolerations for the operater, agent, cluster-manager and monitoring stack. 
-The amount of available RAM greatly impacts search performance. As a general rule, if you store half as many vectors in RAM, search latency will roughly double. +![Create Hybrid Cloud Environment - Advanced Configuration](/documentation/cloud/hybrid_cloud_advanced_configuration.png) -Disk speed is also crucial. [Contact us](https://qdrant.tech/documentation/support/) if you have specific requirements for high-volume searches in our Cloud. +6. Once complete, click **Create**. -### [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#subgroup-oriented-configuration) Subgroup-oriented configuration +> **Note:** All settings but the Kubernetes namespace can be changed later. -If your use case involves splitting vectors into multiple collections or subgroups based on payload values (e.g., serving searches for multiple users, each with their own subset of vectors), memory-mapped storage is recommended. +### Generate Installation Command -In this scenario, only the active subset of vectors will be cached in RAM, allowing for fast searches for the most recent and active users. You can estimate the required memory size as: +After creating your Hybrid Cloud, select **Generate Installation Command** to generate a script that you can run in your Kubernetes cluster which will perform the initial installation of the Kubernetes operator and agent. -```text -memory_size = number_of_active_vectors * vector_dimension * 4 bytes * 1.5 +![Rotate Hybrid Cloud Secrets](/documentation/cloud/hybrid_cloud_create_command.png) -``` +It will: -Please refer to our [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/) documentation for more details on partitioning data in a Qdrant. +- Create the Kubernetes namespace, if not present. +- Set up the necessary secrets with credentials to access the Qdrant container registry and the Qdrant Cloud API. +- Sign in to the Helm registry at `registry.cloud.qdrant.io`. +- Install the Qdrant cloud agent and Kubernetes operator chart. -## [Anchor](https://qdrant.tech/documentation/guides/capacity-planning/\#scaling-disk-space-in-qdrant-cloud) Scaling disk space in Qdrant Cloud +You need this command only for the initial installation. After that, you can update the agent and operator using the Qdrant Cloud Console. -Clusters supporting vector search require substantial disk space compared to other search systems. If you’re running low on disk space, you can use the UI at [cloud.qdrant.io](https://cloud.qdrant.io/) to **Scale Up** your cluster. +> **Note:** If you generate the installation command a second time, it will re-generate the included secrets, and you will have to apply the command again to update them. -When running low on disk space, consider the following benefits of scaling up: +## Advanced configuration -- **Larger Datasets**: Supports larger datasets, which can improve the relevance and quality of search results. -- **Improved Indexing**: Enables the use of advanced indexing strategies like HNSW. -- **Caching**: Enhances speed by having more RAM, allowing more frequently accessed data to be cached. -- **Backups and Redundancy**: Facilitates more frequent backups, which is a key advantage for data safety. +### Mirroring images and charts -Always remember to add 50% of the vector size. This would account for things like indexes and auxiliary data used during operations such as vector insertion, deletion, and search. 
Thus, the estimated memory size including metadata is: +#### Required artifacts -```text -total_vector_size = number_of_dimensions * 4 bytes * 1.5 +Container images: -``` +- `registry.cloud.qdrant.io/qdrant/qdrant` +- `registry.cloud.qdrant.io/qdrant/qdrant-cloud-agent` +- `registry.cloud.qdrant.io/qdrant/operator` +- `registry.cloud.qdrant.io/qdrant/cluster-manager` +- `registry.cloud.qdrant.io/qdrant/prometheus` +- `registry.cloud.qdrant.io/qdrant/prometheus-config-reloader` +- `registry.cloud.qdrant.io/qdrant/kube-state-metrics` +- `registry.cloud.qdrant.io/qdrant/kubernetes-event-exporter` +- `registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter` -**Disclaimer** +Open Containers Initiative (OCI) Helm charts: -The above calculations are estimates at best. If you’re looking for more accurate numbers, you should always test your data set in practice. +- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cloud-agent` +- `registry.cloud.qdrant.io/qdrant-charts/operator` +- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-manager` +- `registry.cloud.qdrant.io/qdrant-charts/prometheus` +- `registry.cloud.qdrant.io/qdrant-charts/kubernetes-event-exporter` +- `registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-exporter` -##### Was this page useful? +To mirror all necessary container images and Helm charts into your own registry, you should use an automatic replication feature that your registry provides, so that you have new image versions available automatically. Alternatively you can manually sync the images with tools like [Skopeo](https://github.com/containers/skopeo). When syncing images manually, make sure that you sync then with all, or with the right CPU architecture. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +##### Automatic replication -Thank you for your feedback! 🙏 +Ensure that you have both the container images in the `/qdrant/` repository, and the helm charts in the `/qdrant-charts/` repository synced. Then go to the advanced section of your Hybrid Cloud Environment and configure your registry locations: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/capacity-planning.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +* Container registry URL: `your-registry.example.com/qdrant` (this will for example result in `your-registry.example.com/qdrant/qdrant-cloud-agent`) +* Chart repository URL: `oci://your-registry.example.com/qdrant-charts` (this will for example result in `oci://your-registry.example.com/qdrant-charts/qdrant-cloud-agent`) -On this page: +If you registry requires authentication, you have to create your own secrets with authentication information into your `the-qdrant-namespace` namespace. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/capacity-planning.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Example: -× +```shell +kubectl --namespace the-qdrant-namespace create secret docker-registry my-creds --docker-server='your-registry.example.com' --docker-username='your-username' --docker-password='your-password' +``` -[Powered by](https://qdrant.tech/) +You can then reference they secret in the advanced section of your Hybrid Cloud Environment. 
-<|page-52-lllmstxt|> -## machine-learning -- [Articles](https://qdrant.tech/articles/) -- Machine Learning - -#### Machine Learning - -Explore Machine Learning principles and practices which make modern semantic similarity search possible. Apply Qdrant and vector search capabilities to your ML projects. - -[![Preview](https://qdrant.tech/articles_data/minicoil/preview/preview.jpg)\\ -**miniCOIL: on the Road to Usable Sparse Neural Retrieval** \\ -Introducing miniCOIL, a lightweight sparse neural retriever capable of generalization.\\ -\\ -Evgeniya Sukhodolskaya\\ -\\ -May 13, 2025](https://qdrant.tech/articles/minicoil/)[![Preview](https://qdrant.tech/articles_data/search-feedback-loop/preview/preview.jpg)\\ -**Relevance Feedback in Informational Retrieval** \\ -Relerance feedback: from ancient history to LLMs. Why relevance feedback techniques are good on paper but not popular in neural search, and what we can do about it.\\ -\\ -Evgeniya Sukhodolskaya\\ -\\ -March 27, 2025](https://qdrant.tech/articles/search-feedback-loop/)[![Preview](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/preview/preview.jpg)\\ -**Modern Sparse Neural Retrieval: From Theory to Practice** \\ -A comprehensive guide to modern sparse neural retrievers: COIL, TILDEv2, SPLADE, and more. Find out how they work and learn how to use them effectively.\\ -\\ -Evgeniya Sukhodolskaya\\ -\\ -October 23, 2024](https://qdrant.tech/articles/modern-sparse-neural-retrieval/)[![Preview](https://qdrant.tech/articles_data/cross-encoder-integration-gsoc/preview/preview.jpg)\\ -**Qdrant Summer of Code 2024 - ONNX Cross Encoders in Python** \\ -A summary of my work and experience at Qdrant Summer of Code 2024.\\ -\\ -Huong (Celine) Hoang\\ -\\ -October 14, 2024](https://qdrant.tech/articles/cross-encoder-integration-gsoc/)[![Preview](https://qdrant.tech/articles_data/late-interaction-models/preview/preview.jpg)\\ -**Any\* Embedding Model Can Become a Late Interaction Model... If You Give It a Chance!** \\ -We recently discovered that embedding models can become late interaction models & can perform surprisingly well in some scenarios. See what we learned here.\\ -\\ -Kacper Ɓukawski\\ -\\ -August 14, 2024](https://qdrant.tech/articles/late-interaction-models/)[![Preview](https://qdrant.tech/articles_data/bm42/preview/preview.jpg)\\ -**BM42: New Baseline for Hybrid Search** \\ -Introducing BM42 - a new sparse embedding approach, which combines the benefits of exact keyword search with the intelligence of transformers.\\ -\\ -Andrey Vasnetsov\\ -\\ -July 01, 2024](https://qdrant.tech/articles/bm42/)[![Preview](https://qdrant.tech/articles_data/embedding-recycling/preview/preview.jpg)\\ -**Layer Recycling and Fine-tuning Efficiency** \\ -Learn when and how to use layer recycling to achieve different performance targets.\\ -\\ -Yusuf Sarıgöz\\ -\\ -August 23, 2022](https://qdrant.tech/articles/embedding-recycler/)[![Preview](https://qdrant.tech/articles_data/cars-recognition/preview/preview.jpg)\\ -**Fine Tuning Similar Cars Search** \\ -Learn how to train a similarity model that can retrieve similar car images in novel categories.\\ -\\ -Yusuf Sarıgöz\\ -\\ -June 28, 2022](https://qdrant.tech/articles/cars-recognition/)[![Preview](https://qdrant.tech/articles_data/detecting-coffee-anomalies/preview/preview.jpg)\\ -**Metric Learning for Anomaly Detection** \\ -Practical use of metric learning for anomaly detection. 
A way to match the results of a classification-based approach with only ~0.6% of the labeled data.\\ -\\ -Yusuf Sarıgöz\\ -\\ -May 04, 2022](https://qdrant.tech/articles/detecting-coffee-anomalies/)[![Preview](https://qdrant.tech/articles_data/triplet-loss/preview/preview.jpg)\\ -**Triplet Loss - Advanced Intro** \\ -What are the advantages of Triplet Loss over Contrastive loss and how to efficiently implement it?\\ -\\ -Yusuf Sarıgöz\\ -\\ -March 24, 2022](https://qdrant.tech/articles/triplet-loss/)[![Preview](https://qdrant.tech/articles_data/metric-learning-tips/preview/preview.jpg)\\ -**Metric Learning Tips & Tricks** \\ -Practical recommendations on how to train a matching model and serve it in production. Even with no labeled data.\\ -\\ -Andrei Vasnetsov\\ -\\ -May 15, 2021](https://qdrant.tech/articles/metric-learning-tips/) - -× - -[Powered by](https://qdrant.tech/) +##### Manual replication -<|page-53-lllmstxt|> -## support -- [Documentation](https://qdrant.tech/documentation/) -- Support +This example uses Skopeo. -# [Anchor](https://qdrant.tech/documentation/support/\#qdrant-cloud-support-and-troubleshooting) Qdrant Cloud Support and Troubleshooting +You can find your personal credentials for the Qdrant Cloud registry in the onboarding command, or you can fetch them with `kubectl`: -## [Anchor](https://qdrant.tech/documentation/support/\#community-support) Community Support +```shell +kubectl get secrets qdrant-registry-creds --namespace the-qdrant-namespace -o jsonpath='{.data.\.dockerconfigjson}' | base64 --decode | jq -r '.' +``` -All Qdrant Cloud users are welcome to join our [Discord community](https://qdrant.to/discord/). +First login to the source registry: -![Discord](https://qdrant.tech/documentation/cloud/discord.png) +```shell +skopeo login registry.cloud.qdrant.io +``` -## [Anchor](https://qdrant.tech/documentation/support/\#qdrant-cloud-support) Qdrant Cloud Support +Then login to your own registry: -Paying customers have access to our Support team. Links to the support portal are available in the Qdrant Cloud Console. +```shell +skopeo login your-registry.example.com +``` -![Support Portal](https://qdrant.tech/documentation/cloud/support-portal.png) +To sync all container images: -When creating a support ticket, please provide as much information as possible to help us understand your issue. 
+```shell +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/operator your-registry.example.com/qdrant/operator +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant-cloud-agent your-registry.example.com/qdrant/qdrant-cloud-agent +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/prometheus your-registry.example.com/qdrant/prometheus +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/prometheus-config-reloader your-registry.example.com/qdrant/prometheus-config-reloader +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/kube-state-metrics your-registry.example.com/qdrant/kube-state-metrics +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant your-registry.example.com/qdrant/qdrant +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/cluster-manager your-registry.example.com/qdrant/cluster-manager +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter your-registry.example.com/qdrant/qdrant-cluster-exporter +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/kubernetes-event-exporter your-registry.example.com/qdrant/kubernetes-event-exporter +``` -This includes but is not limited to: +To sync all helm charts: -- The ID of your Qdrant Cloud cluster, if it’s not filled out by the UI automatically. You can find the ID on your cluster’s detail page. -- Which collection(s) are affected -- Code examples on how you are interacting with the Qdrant API -- Logs or error messages from your application -- Relevant telemetry from your application +```shell +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/prometheus your-registry.example.com/qdrant-charts/prometheus +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/operator your-registry.example.com/qdrant-charts/operator +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api your-registry.example.com/qdrant-charts/qdrant-kubernetes-api +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-cloud-agent your-registry.example.com/qdrant-charts/qdrant-cloud-agent +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-cluster-exporter your-registry.example.com/qdrant-charts/qdrant-cluster-exporter +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/kubernetes-event-exporter your-registry.example.com/qdrant-charts/kubernetes-event-exporter +``` -You can also choose a severity, when creating a ticket. This helps us prioritize your issue correctly. Please refer to the [Qdrant Cloud SLA](https://qdrant.to/sla/) for a definition of these severity levels and their corresponding response time SLA for your respective [support tier](https://qdrant.tech/documentation/cloud/premium/). +With the above configuration, you can add the following values to the advanced section of your Hybrid Cloud Environment: -If you are opening a ticket for a Hybrid Cloud or Private Cloud environment, we may ask for additional information about your environment, such as detailed logs of the Qdrant databases or operator and the state of your Kubernetes cluster. 
+* Container registry URL: `your-registry.example.com/qdrant` +* Chart repository URL: `oci://your-registry.example.com/qdrant-charts` -We have prepared a support bundle script that can help you with collecting all this information. A support bundle will not contain any user data or sensitive information like api keys. It will contain the names and configuration of Qdrant collections though. For more information see the [support bundle documentation](https://github.com/qdrant/qdrant-cloud-support-tools/tree/main/support-bundle). We recommend creating one and attaching it to your support ticket, so that we can help you faster. +If your registry requires authentication, you can create and reference the secret the same way as described above. -##### Was this page useful? +### Rate limits at `docker.io` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +By default, the Qdrant database image will be fetched from Docker Hub, which is the main source of truth. Docker Hub has rate limits for anonymous users. If you have larger setups and also fetch other images from their, you may run into these limits. To solve this, you can provide authentication information for Docker Hub. -Thank you for your feedback! 🙏 +First, create a secret with your Docker Hub credentials into your `the-qdrant-namespace` namespace: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/support.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```shell +kubectl create secret docker-registry dockerhub-registry-secret --namespace the-qdrant-namespace --docker-server=https://index.docker.io/v1/ --docker-username= --docker-password= --docker-email= +``` -On this page: +Then, you can reference this secret by adding the following configuration in the operator configuration YAML editor in the advanced section of the Hybrid Cloud Environment: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/support.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```yaml +qdrant: + image: + pull_secret: "dockerhub-registry-secret" +``` -× +## Rotating Secrets -[Powered by](https://qdrant.tech/) +If you need to rotate the secrets to pull container images and charts from the Qdrant registry and to authenticate at the Qdrant Cloud API, you can do so by following these steps: -<|page-54-lllmstxt|> -## filtered-search-intro -# Filtered search benchmark +* Go to the Hybrid Cloud environment list or the detail page of the environment. +* In the actions menu, choose "Rotate Secrets" +* Confirm the action +* You will receive a new installation command that you can run in your Kubernetes cluster to update the secrets. -February 13, 2023 +If you don't run the installation command, the secrets will not be updated and the communication between your Hybrid Cloud Environment and the Qdrant Cloud API will not work. -# [Anchor](https://qdrant.tech/benchmarks/filtered-search-intro/\#filtered-search-benchmark) Filtered search benchmark +![Rotate Hybrid Cloud Secrets](/documentation/cloud/hybrid_cloud_rotate_secrets.png) -Applying filters to search results brings a whole new level of complexity. -It is no longer enough to apply one algorithm to plain data. 
With filtering, it becomes a matter of the _cross-integration_ of the different indices. +## Deleting a Hybrid Cloud Environment -To measure how well different search engines perform in this scenario, we have prepared a set of **Filtered ANN Benchmark Datasets** - -[https://github.com/qdrant/ann-filtering-benchmark-datasets](https://github.com/qdrant/ann-filtering-benchmark-datasets) +To delete a Hybrid Cloud Environment, first delete all Qdrant database clusters in it. Then you can delete the environment itself. -It is similar to the ones used in the [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks/) but enriched with payload metadata and pre-generated filtering requests. It includes synthetic and real-world datasets with various filters, from keywords to geo-spatial queries. +To clean up your Kubernetes cluster, after deleting the Hybrid Cloud Environment, you can download the script from https://github.com/qdrant/qdrant-cloud-support-tools/tree/main/hybrid-cloud-cleanup to remove all Qdrant related resources. -### [Anchor](https://qdrant.tech/benchmarks/filtered-search-intro/\#why-filtering-is-not-trivial) Why filtering is not trivial? + -Not many ANN algorithms are compatible with filtering. -HNSW is one of the few of them, but search engines approach its integration in different ways: +Run the following command while being connected to your Kubernetes cluster. The script requires `kubectl` and `helm` to be installed. -- Some use **post-filtering**, which applies filters after ANN search. It doesn’t scale well as it either loses results or requires many candidates on the first stage. -- Others use **pre-filtering**, which requires a binary mask of the whole dataset to be passed into the ANN algorithm. It is also not scalable, as the mask size grows linearly with the dataset size. +```shell +./hybrid-cloud-cleanup.sh your-qdrant-namespace +``` -On top of it, there is also a problem with search accuracy. -It appears if too many vectors are filtered out, so the HNSW graph becomes disconnected. +<|page-88-lllmstxt|> +# Qdrant Private Cloud Setup -Qdrant uses a different approach, not requiring pre- or post-filtering while addressing the accuracy problem. -Read more about the Qdrant approach in our [Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/) article. +## Requirements -Share this article +- **Kubernetes cluster:** To install Qdrant Private Cloud, you need a [standard compliant](https://www.cncf.io/training/certification/software-conformance/) Kubernetes cluster. You can run this cluster in any cloud, on-premise or edge environment, with distributions that range from AWS EKS to VMWare vSphere. See [Deployment Platforms](/documentation/hybrid-cloud/platform-deployment-options/) for more information. +- **Storage:** For storage, you need to set up the Kubernetes cluster with a Container Storage Interface (CSI) driver that provides block storage. For vertical scaling, the CSI driver needs to support volume expansion. For backups and restores, the driver needs to support CSI snapshots and restores. -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Ffiltered-search-intro%2F&text=Filtered%20search%20benchmark "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Ffiltered-search-intro%2F "LinkedIn") + -Up! +- **Permissions:** To install the Qdrant Kubernetes Operator you need to have `cluster-admin` access in your Kubernetes cluster. 
+- **Locations:** By default, the Qdrant Operator Helm charts and container images are served from `registry.cloud.qdrant.io`. -<|page-55-lllmstxt|> -## qdrant-internals -- [Articles](https://qdrant.tech/articles/) -- Qdrant Internals - -#### Qdrant Internals - -Take a look under the hood of Qdrant’s high-performance vector search engine. Explore the architecture, components, and design principles the Qdrant Vector Search Engine is built on. - -[![Preview](https://qdrant.tech/articles_data/dedicated-vector-search/preview/preview.jpg)\\ -**Built for Vector Search** \\ -Why add-on vector search looks good — until you actually use it.\\ -\\ -Evgeniya Sukhodolskaya & Andrey Vasnetsov\\ -\\ -February 17, 2025](https://qdrant.tech/articles/dedicated-vector-search/)[![Preview](https://qdrant.tech/articles_data/gridstore-key-value-storage/preview/preview.jpg)\\ -**Introducing Gridstore: Qdrant's Custom Key-Value Store** \\ -Why and how we built our own key-value store. A short technical report on our procedure and results.\\ -\\ -Luis Cossio, Arnaud Gourlay & David Myriel\\ -\\ -February 05, 2025](https://qdrant.tech/articles/gridstore-key-value-storage/)[![Preview](https://qdrant.tech/articles_data/immutable-data-structures/preview/preview.jpg)\\ -**Qdrant Internals: Immutable Data Structures** \\ -Learn how immutable data structures improve vector search performance in Qdrant.\\ -\\ -Andrey Vasnetsov\\ -\\ -August 20, 2024](https://qdrant.tech/articles/immutable-data-structures/)[![Preview](https://qdrant.tech/articles_data/dedicated-service/preview/preview.jpg)\\ -**Vector Search as a dedicated service** \\ -Why vector search requires a dedicated service.\\ -\\ -Andrey Vasnetsov\\ -\\ -November 30, 2023](https://qdrant.tech/articles/dedicated-service/)[![Preview](https://qdrant.tech/articles_data/geo-polygon-filter-gsoc/preview/preview.jpg)\\ -**Google Summer of Code 2023 - Polygon Geo Filter for Qdrant Vector Database** \\ -A Summary of my work and experience at Qdrant's Gsoc '23.\\ -\\ -Zein Wen\\ -\\ -October 12, 2023](https://qdrant.tech/articles/geo-polygon-filter-gsoc/)[![Preview](https://qdrant.tech/articles_data/binary-quantization/preview/preview.jpg)\\ -**Binary Quantization - Vector Search, 40x Faster** \\ -Binary Quantization is a newly introduced mechanism of reducing the memory footprint and increasing performance\\ -\\ -Nirant Kasliwal\\ -\\ -September 18, 2023](https://qdrant.tech/articles/binary-quantization/)[![Preview](https://qdrant.tech/articles_data/io_uring/preview/preview.jpg)\\ -**Qdrant under the hood: io\_uring** \\ -Slow disk decelerating your Qdrant deployment? Get on top of IO overhead with this one trick!\\ -\\ -Andre Bogus\\ -\\ -June 21, 2023](https://qdrant.tech/articles/io_uring/)[![Preview](https://qdrant.tech/articles_data/product-quantization/preview/preview.jpg)\\ -**Product Quantization in Vector Search \| Qdrant** \\ -Discover product quantization in vector search technology. Learn how it optimizes storage and accelerates search processes for high-dimensional data.\\ -\\ -Kacper Ɓukawski\\ -\\ -May 30, 2023](https://qdrant.tech/articles/product-quantization/)[![Preview](https://qdrant.tech/articles_data/scalar-quantization/preview/preview.jpg)\\ -**Scalar Quantization: Background, Practices & More \| Qdrant** \\ -Discover the efficiency of scalar quantization for optimized data storage and enhanced performance. 
Learn about its data compression benefits and efficiency improvements.\\ -\\ -Kacper Ɓukawski\\ -\\ -March 27, 2023](https://qdrant.tech/articles/scalar-quantization/)[![Preview](https://qdrant.tech/articles_data/memory-consumption/preview/preview.jpg)\\ -**Minimal RAM you need to serve a million vectors** \\ -How to properly measure RAM usage and optimize Qdrant for memory consumption.\\ -\\ -Andrei Vasnetsov\\ -\\ -December 07, 2022](https://qdrant.tech/articles/memory-consumption/)[![Preview](https://qdrant.tech/articles_data/filtrable-hnsw/preview/preview.jpg)\\ -**Filtrable HNSW** \\ -How to make ANN search with custom filtering? Search in selected subsets without loosing the results.\\ -\\ -Andrei Vasnetsov\\ -\\ -November 24, 2019](https://qdrant.tech/articles/filtrable-hnsw/) - -× - -[Powered by](https://qdrant.tech/) +> **Note:** You can also mirror these images and charts into your own registry and pull them from there. -<|page-56-lllmstxt|> -## send-data -- [Documentation](https://qdrant.tech/documentation/) -- Send Data to Qdrant +### CLI tools -## [Anchor](https://qdrant.tech/documentation/send-data/\#how-to-send-your-data-to-a-qdrant-cluster) How to Send Your Data to a Qdrant Cluster +During the onboarding, you will need to deploy the Qdrant Kubernetes Operator using Helm. Make sure you have the following tools installed: -| Example | Description | Stack | -| --- | --- | --- | -| [Pinecone to Qdrant Data Transfer](https://githubtocolab.com/qdrant/examples/blob/master/data-migration/from-pinecone-to-qdrant.ipynb) | Migrate your vector data from Pinecone to Qdrant. | Qdrant, Vector-io | -| [Stream Data to Qdrant with Kafka](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/) | Use Confluent to Stream Data to Qdrant via Managed Kafka. | Qdrant, Kafka | -| [Qdrant on Databricks](https://qdrant.tech/documentation/send-data/databricks/) | Learn how to use Qdrant on Databricks using the Spark connector | Qdrant, Databricks, Apache Spark | -| [Qdrant with Airflow and Astronomer](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/) | Build a semantic querying system using Airflow and Astronomer | Qdrant, Airflow, Astronomer | +* [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) +* [helm](https://helm.sh/docs/intro/install/) -##### Was this page useful? +You will need to have access to the Kubernetes cluster with `kubectl` and `helm` configured to connect to it. Please refer the documentation of your Kubernetes distribution for more information. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Required artifacts -Thank you for your feedback! 🙏 +Container images: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+- `registry.cloud.qdrant.io/qdrant/qdrant` +- `registry.cloud.qdrant.io/qdrant/operator` +- `registry.cloud.qdrant.io/qdrant/cluster-manager` +- `registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter` -On this page: +Open Containers Initiative (OCI) Helm charts: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +- `registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud` +- `registry.cloud.qdrant.io/library/qdrant-kubernetes-api` -× +- The specific versions for every private cloud version are documented in the [Private Cloud Changelog](/documentation/private-cloud/changelog/). -[Powered by](https://qdrant.tech/) +## Installation -<|page-57-lllmstxt|> -## neural-search -- [Documentation](https://qdrant.tech/documentation/) -- [Beginner tutorials](https://qdrant.tech/documentation/beginner-tutorials/) -- Build a Neural Search Service +Once onboarded to Qdrant Private Cloud, you will receive credentials to access the Qdrant Cloud Registry. You can use these credentials to install the Qdrant Private Cloud solution using the following commands: -# [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#build-a-neural-search-service-with-sentence-transformers-and-qdrant) Build a Neural Search Service with Sentence Transformers and Qdrant +1. Create the namespace for your Private Cloud deployment. You can use any name for the namespace, but you will need to update the later steps to reflect this. E.g. -| Time: 30 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) | -| --- | --- | --- | --- | +```bash +kubectl create namespace qdrant-private-cloud +``` -This tutorial shows you how to build and deploy your own neural search service to look through descriptions of companies from [startups-list.com](https://www.startups-list.com/) and pick the most similar ones to your query. The website contains the company names, descriptions, locations, and a picture for each entry. +2. Create a Kubernetes secret with your Qdrant Cloud Registry credentials, to allow your Kubernetes cluster to pull the necessary container images: +``` +kubectl create secret docker-registry qdrant-registry-creds --docker-server=registry.cloud.qdrant.io --docker-username='your-username' --docker-password='your-password' --namespace qdrant-private-cloud +``` -A neural search service uses artificial neural networks to improve the accuracy and relevance of search results. Besides offering simple keyword results, this system can retrieve results by meaning. It can understand and interpret complex search queries and provide more contextually relevant output, effectively enhancing the user’s search experience. +3. Log in to the Qdrant Cloud Registry using Helm: -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#workflow) Workflow +```bash +helm registry login 'registry.cloud.qdrant.io' --username 'your-username' --password 'your-password' +``` -To create a neural search service, you will need to transform your raw data and then create a search function to manipulate it. First, you will 1) download and prepare a sample dataset using a modified version of the BERT ML model. 
Then, you will 2) load the data into Qdrant, 3) create a neural search API and 4) serve it using FastAPI. +4. Install the Qdrant Kubernetes Operator Custom Resource Definitions (CRDs): -![Neural Search Workflow](https://qdrant.tech/docs/workflow-neural-search.png) +```bash +helm upgrade --install qdrant-private-cloud-crds oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api --namespace qdrant-private-cloud --version v1.17.2 --wait +``` -> **Note**: The code for this tutorial can be found here: \| [Step 1: Data Preparation Process](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) \| [Step 2: Full Code for Neural Search](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers). \| +5. Install Qdrant Private Cloud: -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#prerequisites) Prerequisites +```bash +helm upgrade --install qdrant-private-cloud oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud --namespace qdrant-private-cloud --version 1.8.0 +``` -To complete this tutorial, you will need: +Ensure that the `qdrant-kubernetes-api` version is compatible with the `qdrant-private-cloud` version you are installing. -- Docker - The easiest way to use Qdrant is to run a pre-built Docker image. -- [Raw parsed data](https://storage.googleapis.com/generall-shared-data/startups_demo.json) from startups-list.com. -- Python version >=3.8 +For a list of available versions consult the [Private Cloud Changelog](/documentation/private-cloud/changelog/). -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#prepare-sample-dataset) Prepare sample dataset +Current default versions are: -To conduct a neural search on startup descriptions, you must first encode the description data into vectors. To process text, you can use a pre-trained models like [BERT](https://en.wikipedia.org/wiki/BERT_%28language_model%29) or sentence transformers. The [sentence-transformers](https://github.com/UKPLab/sentence-transformers) library lets you conveniently download and use many pre-trained models, such as DistilBERT, MPNet, etc. +* qdrant-kubernetes-api v1.17.2 +* qdrant-private-cloud 1.8.0 -1. First you need to download the dataset. +For more information also see the [Helm Install Documentation](https://helm.sh/docs/helm/helm_install/). -```bash -wget https://storage.googleapis.com/generall-shared-data/startups_demo.json +## Configuring Private Cloud -``` +The Qdrant Private Cloud Helm chart comes with a set of default values which are suitable for most deployments. However, you are able to customize the configuration further to fit your specific needs. See the [Private Cloud Configuration](/documentation/private-cloud/configuration/) page for all available configuration options. -2. Install the SentenceTransformer library as well as other relevant packages. +You must ensure that the default `StorageClasses` and corresponding `VolumeSnapshotClass` are set appropriately for your environment. -```bash -pip install sentence-transformers numpy pandas tqdm +When creating your own `values.yaml` file, as a best practice, only include the values you are actually changing, e.g. with this `values.yaml` file: +```yaml +operator: + settings: + features: + clusterManagement: + storageClass: + database: your-storage-class-name + snapshot: your-storage-class-name + backupManagement: + snapshots: + volumeSnapshotClass: your-volume-snapshot-class-name ``` -3. Import the required modules. 
- -```python -from sentence_transformers import SentenceTransformer -import numpy as np -import json -import pandas as pd -from tqdm.notebook import tqdm +You can configure Qdrant Private Cloud like this: +```bash +helm upgrade --install qdrant-private-cloud oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud --namespace qdrant-private-cloud --version 1.8.0 -f values.yaml ``` -You will be using a pre-trained model called `all-MiniLM-L6-v2`. -This is a performance-optimized sentence embedding model and you can read more about it and other available models [here](https://www.sbert.net/docs/pretrained_models.html). - -4. Download and create a pre-trained sentence encoder. +## Upgrades -```python -model = SentenceTransformer( - "all-MiniLM-L6-v2", device="cuda" -) # or device="cpu" if you don't have a GPU +To upgrade Qdrant Private Cloud to a new version, first upgrade the Qdrant Kubernetes Operator Custom Resource Definitions (CRDs): +```bash +helm upgrade --install qdrant-private-cloud-crds oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api --namespace qdrant-private-cloud --version v1.17.2 --wait ``` -5. Read the raw data file. - -```python -df = pd.read_json("./startups_demo.json", lines=True) +Then upgrade the Qdrant Private Cloud Helm chart using the same configuration values, e.g.: +```bash +helm upgrade --install qdrant-private-cloud oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud --namespace qdrant-private-cloud --version 1.8.0 -f values.yaml ``` -6. Encode all startup descriptions to create an embedding vector for each. Internally, the `encode` function will split the input into batches, which will significantly speed up the process. +Note, that the image tag values are automatically derived from the chart's appVersions and should not be overridden in the `values.yaml`. -```python -vectors = model.encode( - [row.alt + ". " + row.description for row in df.itertuples()], - show_progress_bar=True, -) +For more information also see the [Helm Upgrade Documentation](https://helm.sh/docs/helm/upgrade/). -``` +### Mirroring images and charts -All of the descriptions are now converted into vectors. There are 40474 vectors of 384 dimensions. The output layer of the model has this dimension +To mirror all necessary container images and Helm charts into your own registry, you can either use a replication feature that your registry provides, or you can manually sync the images with [Skopeo](https://github.com/containers/skopeo): -```python -vectors.shape -# > (40474, 384) +First login to the source registry: +```shell +skopeo login registry.cloud.qdrant.io ``` -7. Download the saved vectors into a new file named `startup_vectors.npy` - -```python -np.save("startup_vectors.npy", vectors, allow_pickle=False) +Then login to your own registry: +```shell +skopeo login your-registry.example.com ``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#run-qdrant-in-docker) Run Qdrant in Docker +To sync all container images: -Next, you need to manage all of your data using a vector engine. Qdrant lets you store, update or delete created vectors. Most importantly, it lets you search for the nearest vectors via a convenient API. 
+```shell +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant your-registry.example.com/qdrant/qdrant +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/cluster-manager your-registry.example.com/qdrant/cluster-manager +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/operator your-registry.example.com/qdrant/operator +``` -> **Note:** Before you begin, create a project directory and a virtual python environment in it. +To sync all helm charts: -1. Download the Qdrant image from DockerHub. +```shell +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud your-registry.example.com/qdrant-charts/qdrant-private-cloud +skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api your-registry.example.com/qdrant-charts/qdrant-kubernetes-api +``` -```bash -docker pull qdrant/qdrant +During the installation or upgrade, you will need to adapt the image repository and imagePullSecret information in the Helm chart values, e.g.: +```yaml +operator: + image: + repository: your-registry.example.com/qdrant/operator + imagePullSecrets: + - name: your-registry-creds + settings: + features: + clusterManagement: + qdrant: + image: + repository: your-registry.example.com/qdrant/qdrant + pullSecretName: your-registry-creds + +qdrant-cluster-manager: + image: + repository: your-registry.example.com/qdrant/cluster-manager + imagePullSecrets: + - name: your-registry-creds + +qdrant-cluster-exporter: + image: + repository: your-registry.example.com/qdrant/qdrant-cluster-exporter + imagePullSecrets: + - name: your-registry-creds ``` -2. Start Qdrant inside of Docker. - -```bash -docker run -p 6333:6333 \ - -v $(pwd)/qdrant_storage:/qdrant/storage \ - qdrant/qdrant +See [Private Cloud Configuration](/documentation/private-cloud/configuration/) for details. -``` +### Scope of the operator -You should see output like this +By default, the Qdrant Operator will only manage Qdrant clusters in the same Kubernetes namespace, where it is already deployed. The RoleBindings are also limited to this specific namespace. This default is chosen to limit the operator to the least amount of permissions necessary within a Kubernetes cluster. -```text -... -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 +If you want to manage Qdrant clusters in multiple namespaces with the same operator, you can either configure a list of namespaces that the operator should watch: +```yaml +operator: + watch: + # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces + onlyReleaseNamespace: false + # an empty list watches all namespaces. + namespaces: + - qdrant-private-cloud + - some-other-namespase + limitRBAC: true ``` -Test the service by going to [http://localhost:6333/](http://localhost:6333/). You should see the Qdrant version info in your browser. +Or you can configure the operator to watch all namespaces: -All data uploaded to Qdrant is saved inside the `./qdrant_storage` directory and will be persisted even if you recreate the container. 
+```yaml +operator: + watch: + # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces + onlyReleaseNamespace: false + # an empty list watches all namespaces. + namespaces: [] + limitRBAC: false +``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#upload-data-to-qdrant) Upload data to Qdrant +## Uninstallation -1. Install the official Python client to best interact with Qdrant. +To uninstall the Qdrant Private Cloud solution, you can use the following command: ```bash -pip install qdrant-client - +helm uninstall qdrant-private-cloud --namespace qdrant-private-cloud +helm uninstall qdrant-private-cloud-crds --namespace qdrant-private-cloud +kubectl delete namespace qdrant-private-cloud ``` -At this point, you should have startup records in the `startups_demo.json` file, encoded vectors in `startup_vectors.npy` and Qdrant running on a local machine. - -Now you need to write a script to upload all startup data and vectors into the search engine. - -2. Create a client object for Qdrant. - -```python -# Import client library -from qdrant_client import QdrantClient -from qdrant_client.models import VectorParams, Distance - -client = QdrantClient("http://localhost:6333") +Note that uninstalling the `qdrant-private-cloud-crds` Helm chart will remove all Custom Resource Definitions (CRDs) will also remove all Qdrant clusters that were managed by the operator. -``` +<|page-89-lllmstxt|> +# How Does Vector Search Work in Qdrant? -3. Related vectors need to be added to a collection. Create a new collection for your startup vectors. +

-```python -if not client.collection_exists("startups"): - client.create_collection( - collection_name="startups", - vectors_config=VectorParams(size=384, distance=Distance.COSINE), - ) -``` -4. Create an iterator over the startup data and vectors. +If you are still trying to figure out how vector search works, please read ahead. This document describes how vector search is used, covers Qdrant's place in the larger ecosystem, and outlines how you can use Qdrant to augment your existing projects. -The Qdrant client library defines a special function that allows you to load datasets into the service. -However, since there may be too much data to fit a single computer memory, the function takes an iterator over the data as input. +For those who want to start writing code right away, visit our [Complete Beginners tutorial](/documentation/tutorials/search-beginners/) to build a search engine in 5-15 minutes. -```python -fd = open("./startups_demo.json") +## A Brief History of Search -# payload is now an iterator over startup data -payload = map(json.loads, fd) +Human memory is unreliable. Thus, as long as we have been trying to collect ‘knowledge’ in written form, we had to figure out how to search for relevant content without rereading the same books repeatedly. That’s why some brilliant minds introduced the inverted index. In the simplest form, it’s an appendix to a book, typically put at its end, with a list of the essential terms-and links to pages they occur at. Terms are put in alphabetical order. Back in the day, that was a manually crafted list requiring lots of effort to prepare. Once digitalization started, it became a lot easier, but still, we kept the same general principles. That worked, and still, it does. -# Load all vectors into memory, numpy array works as iterable for itself. -# Other option would be to use Mmap, if you don't want to load all data into RAM -vectors = np.load("./startup_vectors.npy") +If you are looking for a specific topic in a particular book, you can try to find a related phrase and quickly get to the correct page. Of course, assuming you know the proper term. If you don’t, you must try and fail several times or find somebody else to help you form the correct query. -``` +{{< figure src=/docs/gettingstarted/inverted-index.png caption="A simplified version of the inverted index." >}} -5. Upload the data +Time passed, and we haven’t had much change in that area for quite a long time. But our textual data collection started to grow at a greater pace. So we also started building up many processes around those inverted indexes. For example, we allowed our users to provide many words and started splitting them into pieces. That allowed finding some documents which do not necessarily contain all the query words, but possibly part of them. We also started converting words into their root forms to cover more cases, removing stopwords, etc. Effectively we were becoming more and more user-friendly. Still, the idea behind the whole process is derived from the most straightforward keyword-based search known since the Middle Ages, with some tweaks. -```python -client.upload_collection( - collection_name="startups", - vectors=vectors, - payload=payload, - ids=None, # Vector ids will be assigned automatically - batch_size=256, # How many vectors will be uploaded in a single request? -) +{{< figure src=/docs/gettingstarted/tokenization.png caption="The process of tokenization with an additional stopwords removal and converstion to root form of a word." 
>}} -``` +Technically speaking, we encode the documents and queries into so-called sparse vectors where each position has a corresponding word from the whole dictionary. If the input text contains a specific word, it gets a non-zero value at that position. But in reality, none of the texts will contain more than hundreds of different words. So the majority of vectors will have thousands of zeros and a few non-zero values. That’s why we call them sparse. And they might be already used to calculate some word-based similarity by finding the documents which have the biggest overlap. -Vectors are now uploaded to Qdrant. +{{< figure src=/docs/gettingstarted/query.png caption="An example of a query vectorized to sparse format." >}} -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#build-the-search-api) Build the search API +Sparse vectors have relatively **high dimensionality**; equal to the size of the dictionary. And the dictionary is obtained automatically from the input data. So if we have a vector, we are able to partially reconstruct the words used in the text that created that vector. -Now that all the preparations are complete, let’s start building a neural search class. +## The Tower of Babel -In order to process incoming requests, neural search will need 2 things: 1) a model to convert the query into a vector and 2) the Qdrant client to perform search queries. +Every once in a while, when we discover new problems with inverted indexes, we come up with a new heuristic to tackle it, at least to some extent. Once we realized that people might describe the same concept with different words, we started building lists of synonyms to convert the query to a normalized form. But that won’t work for the cases we didn’t foresee. Still, we need to craft and maintain our dictionaries manually, so they can support the language that changes over time. Another difficult issue comes to light with multilingual scenarios. Old methods require setting up separate pipelines and keeping humans in the loop to maintain the quality. -1. Create a file named `neural_searcher.py` and specify the following. +{{< figure src=/docs/gettingstarted/babel.jpg caption="The Tower of Babel, Pieter Bruegel." >}} -```python -from qdrant_client import QdrantClient -from sentence_transformers import SentenceTransformer +## The Representation Revolution -class NeuralSearcher: - def __init__(self, collection_name): - self.collection_name = collection_name - # Initialize encoder model - self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu") - # initialize Qdrant client - self.qdrant_client = QdrantClient("http://localhost:6333") +The latest research in Machine Learning for NLP is heavily focused on training Deep Language Models. In this process, the neural network takes a large corpus of text as input and creates a mathematical representation of the words in the form of vectors. These vectors are created in such a way that words with similar meanings and occurring in similar contexts are grouped together and represented by similar vectors. And we can also take, for example, an average of all the word vectors to create the vector for a whole text (e.g query, sentence, or paragraph). -``` +![deep neural](/docs/gettingstarted/deep-neural.png) -2. Write the search function. +We can take those **dense vectors** produced by the network and use them as a **different data representation**. They are dense because neural networks will rarely produce zeros at any position. 
In contrast to sparse ones, they have a relatively low dimensionality — hundreds or a few thousand only. Unfortunately, we can no longer understand the content of a document simply by looking at its vector. Dimensions no longer represent the presence of specific words.

Dense vectors can capture the meaning, not the words used in a text. That being said, **Large Language Models can automatically handle synonyms**. More so, since those neural networks might have been trained with multilingual corpora, they translate the same sentence, written in different languages, to similar vector representations, also called **embeddings**. And we can compare them to find similar pieces of text by calculating the distance to other vectors in our database.

{{< figure src=/docs/gettingstarted/input.png caption="Input queries contain different words, but they are still converted into similar vector representations, because the neural encoder can capture the meaning of the sentences. That feature can capture synonyms but also different languages." >}}

**Vector search** is a process of finding similar objects based on the similarity of their embeddings. The good thing is, you don’t have to design and train your neural network on your own. Many pre-trained models are available, either on **HuggingFace** or by using libraries like [SentenceTransformers](https://www.sbert.net/?ref=hackernoon.com). If you, however, prefer not to get your hands dirty with neural models, you can also create the embeddings with SaaS tools, like [co.embed API](https://docs.cohere.com/reference/embed?ref=hackernoon.com).

## Why Qdrant?

The challenge with vector search arises when we need to find similar documents in a big set of objects. If we want to find the closest examples, the naive approach would require calculating the distance to every document. That might work with dozens or even hundreds of examples but may become a bottleneck if we have more than that. When we work with relational data, we set up database indexes to speed things up and avoid full table scans. And the same is true for vector search. Qdrant is a fully-fledged vector database that speeds up the search process by using a graph-like structure to find the closest objects in sublinear time. So you don’t calculate the distance to every object from the database, but some candidates only.

{{< figure src=/docs/gettingstarted/vector-search.png caption="Vector search with Qdrant. Thanks to the HNSW graph we are able to compare the distance to some of the objects from the database, not to all of them." >}}

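+The difference is easy to see in code. The sketch below is only an illustration: the collection name, vector size, and random data are made up, but it contrasts a naive full scan over every stored vector with letting Qdrant pick the candidates through its index.

+```python
+import numpy as np
+from qdrant_client import QdrantClient
+
+query = np.random.rand(384)             # embedding of the query text
+vectors = np.random.rand(100_000, 384)  # embeddings of every stored document
+
+# Naive approach: score all 100,000 vectors for every single query.
+scores = vectors @ query / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query))
+top_naive = np.argsort(scores)[-5:][::-1]
+
+# Qdrant: the index narrows the search down to a small set of candidates.
+client = QdrantClient("http://localhost:6333")
+top_qdrant = client.query_points(
+    collection_name="documents",  # hypothetical collection holding the same vectors
+    query=query.tolist(),
+    limit=5,
+).points
+```

+Both approaches return the closest points; the difference is how much work is done per query, and that is exactly what starts to hurt once the collection grows.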
+While doing a semantic search at scale, because this is what we sometimes call the vector search done on texts, we need a specialized tool to do it effectively — a tool like Qdrant. - city_of_interest = "Berlin" +## Next Steps - # Define a filter for cities - city_filter = Filter(**{ - "must": [{\ - "key": "city", # Store city information in a field of the same name\ - "match": { # This condition checks if payload field has the requested value\ - "value": city_of_interest\ - }\ - }] - }) +Vector search is an exciting alternative to sparse methods. It solves the issues we had with the keyword-based search without needing to maintain lots of heuristics manually. It requires an additional component, a neural encoder, to convert text into vectors. - search_result = self.qdrant_client.query_points( - collection_name=self.collection_name, - query=vector, - query_filter=city_filter, - limit=5 - ).points - ... +[**Tutorial 1 - Qdrant for Complete Beginners**](/documentation/tutorials/search-beginners/) +Despite its complicated background, vectors search is extraordinarily simple to set up. With Qdrant, you can have a search engine up-and-running in five minutes. Our [Complete Beginners tutorial](/documentation/tutorials/search-beginners/) will show you how. -``` +[**Tutorial 2 - Question and Answer System**](/articles/qa-with-cohere-and-qdrant/) +However, you can also choose SaaS tools to generate them and avoid building your model. Setting up a vector search project with Qdrant Cloud and Cohere co.embed API is fairly easy if you follow the [Question and Answer system tutorial](/articles/qa-with-cohere-and-qdrant/). -You have now created a class for neural search queries. Now wrap it up into a service. +There is another exciting thing about vector search. You can search for any kind of data as long as there is a neural network that would vectorize your data type. Do you think about a reverse image search? That’s also possible with vector embeddings. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#deploy-the-search-with-fastapi) Deploy the search with FastAPI +<|page-90-lllmstxt|> +THIS CONTENT IS GOING TO BE IGNORED FOR NOW -To build the service you will use the FastAPI framework. +# Documentation -1. Install FastAPI. +Qdrant is an AI-native vector database and a semantic search engine. You can use it to extract meaningful information from unstructured data. Want to see how it works? [Clone this repo now](https://github.com/qdrant/qdrant_demo/) and build a search engine in five minutes. -To install it, use the command +||| +|-:|:-| +|[Cloud Quickstart](/documentation/quickstart-cloud/)|[Local Quickstart](/documentation/quick-start/)| -```bash -pip install fastapi uvicorn -``` +## Ready to start developing? -2. Implement the service. +***

Qdrant is open-source and can be self-hosted. However, the quickest way to get started is with our [free tier](https://qdrant.to/cloud) on Qdrant Cloud. It scales easily and provides a UI where you can interact with data.

*** -Create a file named `service.py` and specify the following. +[![Hybrid Cloud](/docs/homepage/cloud-cta.png)](https://qdrant.to/cloud) -The service will have only one API endpoint and will look like this: +## Qdrant's most popular features: +|||| +|:-|:-|:-| +|[Filtrable HNSW](/documentation/filtering/)
Single-stage payload filtering | [Recommendations & Context Search](/documentation/concepts/explore/#explore-the-data)
Exploratory advanced search| [Pure-Vector Hybrid Search](/documentation/hybrid-queries/)
Full text and semantic search in one| +|[Multitenancy](/documentation/guides/multiple-partitions/)
Payload-based partitioning|[Custom Sharding](/documentation/guides/distributed_deployment/#sharding)
For data isolation and distribution|[Role Based Access Control](/documentation/guides/security/?q=jwt#granular-access-control-with-jwt)
Secure JWT-based access | +|[Quantization](/documentation/guides/quantization/)
Compress data for drastic speedups|[Multivector Support](/documentation/concepts/vectors/?q=multivect#multivectors)
For ColBERT late interaction |[Built-in IDF](/documentation/concepts/indexing/?q=inverse+docu#idf-modifier)
Advanced similarity calculation| -```python -from fastapi import FastAPI +## Developer guidebooks: -# The file where NeuralSearcher is stored -from neural_searcher import NeuralSearcher +| [A Complete Guide to Filtering in Vector Search](/articles/vector-search-filtering/)
Beginner & advanced examples showing how to improve precision in vector search.| [Building Hybrid Search with Query API](/articles/hybrid-search/)
Build a pure vector-based hybrid search system with our new fusion feature.| +|----------------------------------------------|-------------------------------| +| [Multitenancy and Sharding: Best Practices](/articles/multitenancy/)
Combine two powerful features for complete data isolation and scaling.| [Benefits of Binary Quantization in Vector Search](/articles/binary-quantization/)
Compress data points while retaining essential meaning for extreme search performance.| -app = FastAPI() +<|page-91-lllmstxt|> +# Automate filtering with LLMs -# Create a neural searcher instance -neural_searcher = NeuralSearcher(collection_name="startups") +Our [complete guide to filtering in vector search](/articles/vector-search-filtering/) describes why filtering is +important, and how to implement it with Qdrant. However, applying filters is easier when you build an application +with a traditional interface. Your UI may contain a form with checkboxes, sliders, and other elements that users can +use to set their criteria. But what if you want to build a RAG-powered application with just the conversational +interface, or even voice commands? In this case, you need to automate the filtering process! -@app.get("/api/search") -def search_startup(q: str): - return {"result": neural_searcher.search(text=q)} +LLMs seem to be particularly good at this task. They can understand natural language and generate structured output +based on it. In this tutorial, we'll show you how to use LLMs to automate filtering in your vector search application. -if __name__ == "__main__": - import uvicorn +## Few notes on Qdrant filters - uvicorn.run(app, host="0.0.0.0", port=8000) +Qdrant Python SDK defines the models using [Pydantic](https://docs.pydantic.dev/latest/). This library is de facto +standard for data validation and serialization in Python. It allows you to define the structure of your data using +Python type hints. For example, our `Filter` model is defined as follows: +```python +class Filter(BaseModel, extra="forbid"): + should: Optional[Union[List["Condition"], "Condition"]] = Field( + default=None, description="At least one of those conditions should match" + ) + min_should: Optional["MinShould"] = Field( + default=None, description="At least minimum amount of given conditions should match" + ) + must: Optional[Union[List["Condition"], "Condition"]] = Field(default=None, description="All conditions must match") + must_not: Optional[Union[List["Condition"], "Condition"]] = Field( + default=None, description="All conditions must NOT match" + ) ``` -3. Run the service. - -```bash -python service.py +Qdrant filters may be nested, and you can express even the most complex conditions using the `must`, `should`, and +`must_not` notation. -``` +## Structured output from LLMs -4. Open your browser at [http://localhost:8000/docs](http://localhost:8000/docs). +It isn't an uncommon practice to use LLMs to generate structured output. It is primarily useful if their output is +intended for further processing by a different application. For example, you can use LLMs to generate SQL queries, +JSON objects, and most importantly, Qdrant filters. Pydantic got adopted by the LLM ecosystem quite well, so there is +plenty of libraries which uses Pydantic models to define the structure of the output for the Language Models. -You should be able to see a debug interface for your service. +One of the interesting projects in this area is [Instructor](https://python.useinstructor.com/) that allows you to +play with different LLM providers and restrict their output to a specific structure. Let's install the library and +already choose a provider we'll use in this tutorial: -![FastAPI Swagger interface](https://qdrant.tech/docs/fastapi_neural_search.png) +```shell +pip install "instructor[anthropic]" +``` -Feel free to play around with it, make queries regarding the companies in our corpus, and check out the results. 
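+Before handing this job to a model, it helps to remember what such a filter looks like when written by hand. The snippet below is a made-up example (the field names `color`, `type` and `fabric` are hypothetical); the rest of this tutorial is about getting an LLM to produce this kind of object from plain text.

+```python
+from qdrant_client import models
+
+# Hand-written filter: red or blue T-shirts, but never polyester.
+manual_filter = models.Filter(
+    should=[
+        models.FieldCondition(key="color", match=models.MatchValue(value="red")),
+        models.FieldCondition(key="color", match=models.MatchValue(value="blue")),
+    ],
+    must=[
+        models.FieldCondition(key="type", match=models.MatchValue(value="t-shirt")),
+    ],
+    must_not=[
+        models.FieldCondition(key="fabric", match=models.MatchValue(value="polyester")),
+    ],
+)
+```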
+Anthropic is not the only option out there, as Instructor supports many other providers including OpenAI, Ollama, +Llama, Gemini, Vertex AI, Groq, Litellm and others. You can choose the one that fits your needs the best, or the one +you already use in your RAG. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/neural-search/\#next-steps) Next steps +## Using Instructor to generate Qdrant filters -The code from this tutorial has been used to develop a [live online demo](https://qdrant.to/semantic-search-demo). -You can try it to get an intuition for cases when the neural search is useful. -The demo contains a switch that selects between neural and full-text searches. -You can turn the neural search on and off to compare your result with a regular full-text search. +Instructor has some helper methods to decorate the LLM APIs, so you can interact with them as if you were using their +normal SDKs. In case of Anthropic, you just pass an instance of `Anthropic` class to the `from_anthropic` function: -> **Note**: The code for this tutorial can be found here: \| [Step 1: Data Preparation Process](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) \| [Step 2: Full Code for Neural Search](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers). \| +```python +import instructor +from anthropic import Anthropic -Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, publish other examples of neural networks and neural search applications. +anthropic_client = instructor.from_anthropic( + client=Anthropic( + api_key="YOUR_API_KEY", + ) +) +``` -##### Was this page useful? +A decorated client slightly modifies the original API, so you can pass the `response_model` parameter to the +`.messages.create` method. This parameter should be a Pydantic model that defines the structure of the output. In case +of Qdrant filters, it should be a `Filter` model: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +from qdrant_client import models -Thank you for your feedback! 🙏 +qdrant_filter = anthropic_client.messages.create( + model="claude-3-5-sonnet-latest", + response_model=models.Filter, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": "red T-shirt" + } + ], +) +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/neural-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +The output of this code will be a Pydantic model that represents a Qdrant filter. Surprisingly, there is no need to pass +additional instructions to already figure out that the user wants to filter by the color and the type of the product. 
+Here is how the output looks like: -On this page: +```python +Filter( + should=None, + min_should=None, + must=[ + FieldCondition( + key="color", + match=MatchValue(value="red"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ), + FieldCondition( + key="type", + match=MatchValue(value="t-shirt"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ) + ], + must_not=None +) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/neural-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Obviously, giving the model complete freedom to generate the filter may lead to unexpected results, or no results at +all. Your collection probably has payloads with a specific structure, so it doesn't make sense to use anything else. +Moreover, **it's considered a good practice to filter by the fields that have been indexed**. That's why it makes sense +to automatically determine the indexed fields and restrict the output to them. -× +### Restricting the available fields -[Powered by](https://qdrant.tech/) +Qdrant collection info contains a list of the indexes created on a particular collection. You can use this information +to automatically determine the fields that can be used for filtering. Here is how you can do it: -<|page-58-lllmstxt|> -## observability -- [Documentation](https://qdrant.tech/documentation/) -- Observability +```python +from qdrant_client import QdrantClient -## [Anchor](https://qdrant.tech/documentation/observability/\#observability-integrations) Observability Integrations +client = QdrantClient("http://localhost:6333") +collection_info = client.get_collection(collection_name="test_filter") +indexes = collection_info.payload_schema +print(indexes) +``` -| Tool | Description | -| --- | --- | -| [OpenLIT](https://qdrant.tech/documentation/observability/openlit/) | Platform for OpenTelemetry-native Observability & Evals for LLMs and Vector Databases. | -| [OpenLLMetry](https://qdrant.tech/documentation/observability/openllmetry/) | Set of OpenTelemetry extensions to add Observability for your LLM application. | -| [Datadog](https://qdrant.tech/documentation/observability/datadog/) | Cloud-based monitoring and analytics platform. | +Output: -##### Was this page useful? +```python +{ + "city.location": PayloadIndexInfo( + data_type=PayloadSchemaType.GEO, + ... + ), + "city.name": PayloadIndexInfo( + data_type=PayloadSchemaType.KEYWORD, + ... + ), + "color": PayloadIndexInfo( + data_type=PayloadSchemaType.KEYWORD, + ... + ), + "fabric": PayloadIndexInfo( + data_type=PayloadSchemaType.KEYWORD, + ... + ), + "price": PayloadIndexInfo( + data_type=PayloadSchemaType.FLOAT, + ... + ), +} +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Our LLM should know the names of the fields it can use, but also their type, as e.g., range filtering only makes sense +for numerical fields, and geo filtering on non-geo fields won't yield anything meaningful. You can pass this information +as a part of the prompt to the LLM, so let's encode it as a string: -Thank you for your feedback! 🙏 +```python +formatted_indexes = "\n".join([ + f"- {index_name} - {index.data_type.name}" + for index_name, index in indexes.items() +]) +print(formatted_indexes) +``` -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/observability/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Output: -On this page: +```text +- fabric - KEYWORD +- city.name - KEYWORD +- color - KEYWORD +- price - FLOAT +- city.location - GEO +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/observability/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +**It's a good idea to cache the list of the available fields and their types**, as they are not supposed to change +often. Our interactions with the LLM should be slightly different now: -× +```python +qdrant_filter = anthropic_client.messages.create( + model="claude-3-5-sonnet-latest", + response_model=models.Filter, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": ( + "color is red" + f"\n{formatted_indexes}\n" + ) + } + ], +) +``` -[Powered by](https://qdrant.tech/) +Output: -<|page-59-lllmstxt|> -## cloud-quickstart -- [Documentation](https://qdrant.tech/documentation/) -- Cloud Quickstart +```python +Filter( + should=None, + min_should=None, + must=FieldCondition( + key="color", + match=MatchValue(value="red"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ), + must_not=None +) +``` -# [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#how-to-get-started-with-qdrant-cloud) How to Get Started With Qdrant Cloud +The same query, restricted to the available fields, now generates better criteria, as it doesn't try to filter by the +fields that don't exist in the collection. -How to Get Started With Qdrant Cloud - YouTube +### Testing the LLM output -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +Although the LLMs are quite powerful, they are not perfect. If you plan to automate filtering, it makes sense to run +some tests to see how well they perform. Especially edge cases, like queries that cannot be expressed as filters. Let's +see how the LLM will handle the following query: -Qdrant - Vector Database & Search Engine +```python +qdrant_filter = anthropic_client.messages.create( + model="claude-3-5-sonnet-latest", + response_model=models.Filter, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": ( + "fruit salad with no more than 100 calories" + f"\n{formatted_indexes}\n" + ) + } + ], +) +``` -8.12K subscribers +Output: -[How to Get Started With Qdrant Cloud](https://www.youtube.com/watch?v=3hrQP3hh69Y) +```python +Filter( + should=None, + min_should=None, + must=FieldCondition( + key="price", + match=None, + range=Range(lt=None, gt=None, gte=None, lte=100.0), + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ), + must_not=None +) +``` -Qdrant - Vector Database & Search Engine +Surprisingly, the LLM extracted the calorie information from the query and generated a filter based on the price field. +It somehow extracts any numerical information from the query and tries to match it with the available fields. -Search +Generally, giving model some more guidance on how to interpret the query may lead to better results. Adding a system +prompt that defines the rules for the query interpretation may help the model to do a better job. 
Here is how you can +do it: -Watch later +```python +SYSTEM_PROMPT = """ +You are extracting filters from a text query. Please follow the following rules: +1. Query is provided in the form of a text enclosed in tags. +2. Available indexes are put at the end of the text in the form of a list enclosed in tags. +3. You cannot use any field that is not available in the indexes. +4. Generate a filter only if you are certain that user's intent matches the field name. +5. Prices are always in USD. +6. It's better not to generate a filter than to generate an incorrect one. +""" -Share +qdrant_filter = anthropic_client.messages.create( + model="claude-3-5-sonnet-latest", + response_model=models.Filter, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": SYSTEM_PROMPT.strip(), + }, + { + "role": "assistant", + "content": "Okay, I will follow all the rules." + }, + { + "role": "user", + "content": ( + "fruit salad with no more than 100 calories" + f"\n{formatted_indexes}\n" + ) + } + ], +) +``` -Copy link +Current output: -Info +```python +Filter( + should=None, + min_should=None, + must=None, + must_not=None +) +``` -Shopping +### Handling complex queries -Tap to unmute +We have a bunch of indexes created on the collection, and it is quite interesting to see how the LLM will handle more +complex queries. For example, let's see how it will handle the following query: -If playback doesn't begin shortly, try restarting your device. +```python +qdrant_filter = anthropic_client.messages.create( + model="claude-3-5-sonnet-latest", + response_model=models.Filter, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": SYSTEM_PROMPT.strip(), + }, + { + "role": "assistant", + "content": "Okay, I will follow all the rules." + }, + { + "role": "user", + "content": ( + "" + "white T-shirt available no more than 30 miles from London, " + "but not in the city itself, below $15.70, not made from polyester" + "\n" + "\n" + f"{formatted_indexes}\n" + "" + ) + }, + ], +) +``` -More videos +It might be surprising, but Anthropic Claude is able to generate even such complex filters. Here is the output: -## More videos +```python +Filter( + should=None, + min_should=None, + must=[ + FieldCondition( + key="color", + match=MatchValue(value="white"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ), + FieldCondition( + key="city.location", + match=None, + range=None, + geo_bounding_box=None, + geo_radius=GeoRadius( + center=GeoPoint(lon=-0.1276, lat=51.5074), + radius=48280.0 + ), + geo_polygon=None, + values_count=None + ), + FieldCondition( + key="price", + match=None, + range=Range(lt=15.7, gt=None, gte=None, lte=None), + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ) + ], must_not=[ + FieldCondition( + key="city.name", + match=MatchValue(value="London"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ), + FieldCondition( + key="fabric", + match=MatchValue(value="polyester"), + range=None, + geo_bounding_box=None, + geo_radius=None, + geo_polygon=None, + values_count=None + ) + ] +) +``` -You're signed out +The model even knows the coordinates of London and uses them to generate the geo filter. It isn't the best idea to +rely on the model to generate such complex filters, but it's quite impressive that it can do it. -Videos you watch may be added to the TV's watch history and influence TV recommendations. 
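+If the location really matters, it is safer to resolve the coordinates yourself and attach the geo condition explicitly, letting the model handle only the rest of the query. A minimal sketch, assuming the filter returned above keeps its conditions in a `must` list; the coordinates and the ~30 mile radius are hard-coded here for illustration:

+```python
+from qdrant_client import models
+
+# Resolved with your own geocoder instead of trusting the LLM.
+london_condition = models.FieldCondition(
+    key="city.location",
+    geo_radius=models.GeoRadius(
+        center=models.GeoPoint(lon=-0.1276, lat=51.5074),
+        radius=48280.0,  # roughly 30 miles, in meters
+    ),
+)
+
+# Append it to the conditions the model produced.
+qdrant_filter.must = list(qdrant_filter.must or []) + [london_condition]
+```

+This keeps the part of the filter that depends on external knowledge out of the model's hands.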
To avoid this, cancel and sign in to YouTube on your computer. +## Further steps -CancelConfirm +Real production systems would rather require more testing and validation of the LLM output. Building a ground truth +dataset with the queries and the expected filters would be a good idea. You can use this dataset to evaluate the model +performance and to see how it behaves in different scenarios. -Share +<|page-92-lllmstxt|> +# Build a Neural Search Service with Sentence Transformers and Qdrant -Include playlist +| Time: 30 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) | +| --- | ----------- | ----------- |----------- | -An error occurred while retrieving sharing information. Please try again later. +This tutorial shows you how to build and deploy your own neural search service to look through descriptions of companies from [startups-list.com](https://www.startups-list.com/) and pick the most similar ones to your query. The website contains the company names, descriptions, locations, and a picture for each entry. -[Why am I seeing this?](https://support.google.com/youtube/answer/9004474?hl=en) +A neural search service uses artificial neural networks to improve the accuracy and relevance of search results. Besides offering simple keyword results, this system can retrieve results by meaning. It can understand and interpret complex search queries and provide more contextually relevant output, effectively enhancing the user's search experience. -[Watch on](https://www.youtube.com/watch?v=3hrQP3hh69Y&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) + -0:00 -0:00 / 1:53 -‱Live +## Workflow -‱ +To create a neural search service, you will need to transform your raw data and then create a search function to manipulate it. First, you will 1) download and prepare a sample dataset using a modified version of the BERT ML model. Then, you will 2) load the data into Qdrant, 3) create a neural search API and 4) serve it using FastAPI. -[Watch on YouTube](https://www.youtube.com/watch?v=3hrQP3hh69Y "Watch on YouTube") +![Neural Search Workflow](/docs/workflow-neural-search.png) -You can try vector search on Qdrant Cloud in three steps. +> **Note**: The code for this tutorial can be found here: | [Step 1: Data Preparation Process](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) | [Step 2: Full Code for Neural Search](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers). | -Instructions are below, but the video is faster: +## Prerequisites -## [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#setup-a-qdrant-cloud-cluster) Setup a Qdrant Cloud Cluster +To complete this tutorial, you will need: -1. Register for a [Cloud account](https://cloud.qdrant.io/signup) with your email, Google or Github credentials. -2. Go to **Clusters** and follow the onboarding instructions under **Create First Cluster**. +- Docker - The easiest way to use Qdrant is to run a pre-built Docker image. +- [Raw parsed data](https://storage.googleapis.com/generall-shared-data/startups_demo.json) from startups-list.com. +- Python version >=3.8 -![create a cluster](https://qdrant.tech/docs/gettingstarted/gui-quickstart/create-cluster.png) +## Prepare sample dataset -3. When you create it, you will receive an API key. 
You will need to copy it and store it somewhere self. It will not be displayed again. If you loose it, you can always create a new one on the **Cluster Detail Page** later. +To conduct a neural search on startup descriptions, you must first encode the description data into vectors. To process text, you can use a pre-trained models like [BERT](https://en.wikipedia.org/wiki/BERT_(language_model)) or sentence transformers. The [sentence-transformers](https://github.com/UKPLab/sentence-transformers) library lets you conveniently download and use many pre-trained models, such as DistilBERT, MPNet, etc. -![get api key](https://qdrant.tech/docs/gettingstarted/gui-quickstart/api-key.png) +1. First you need to download the dataset. -## [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#access-the-cluster-ui) Access the Cluster UI +```bash +wget https://storage.googleapis.com/generall-shared-data/startups_demo.json +``` -1. Click on **Cluster UI** on the **Cluster Detail Page** to access the cluster UI dashboard. -2. Paste your new API key here. You can revoke and create new API keys in the **API Keys** tab on your **Cluster Detail Page**. -3. The key will grant you access to your Qdrant instance. Now you can see the cluster Dashboard. +2. Install the SentenceTransformer library as well as other relevant packages. -![access the dashboard](https://qdrant.tech/docs/gettingstarted/gui-quickstart/access-dashboard.png) +```bash +pip install sentence-transformers numpy pandas tqdm +``` -## [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#authenticate-via-sdks) Authenticate via SDKs +3. Import the required modules. -Now that you have your cluster and key, you can use our official SDKs to access Qdrant Cloud from within your application. +```python +from sentence_transformers import SentenceTransformer +import numpy as np +import json +import pandas as pd +from tqdm.notebook import tqdm +``` -bashpythontypescriptrustjavacsharpgo +You will be using a pre-trained model called `all-MiniLM-L6-v2`. +This is a performance-optimized sentence embedding model and you can read more about it and other available models [here](https://www.sbert.net/docs/pretrained_models.html). -```bash -curl \ - -X GET https://xyz-example.eu-central.aws.cloud.qdrant.io:6333 \ - --header 'api-key: ' -# Alternatively, you can use the `Authorization` header with the `Bearer` prefix -curl \ - -X GET https://xyz-example.eu-central.aws.cloud.qdrant.io:6333 \ - --header 'Authorization: Bearer ' +4. Download and create a pre-trained sentence encoder. +```python +model = SentenceTransformer( + "all-MiniLM-L6-v2", device="cuda" +) # or device="cpu" if you don't have a GPU ``` +5. Read the raw data file. ```python -from qdrant_client import QdrantClient +df = pd.read_json("./startups_demo.json", lines=True) +``` +6. Encode all startup descriptions to create an embedding vector for each. Internally, the `encode` function will split the input into batches, which will significantly speed up the process. -qdrant_client = QdrantClient( - host="xyz-example.eu-central.aws.cloud.qdrant.io", - api_key="", +```python +vectors = model.encode( + [row.alt + ". " + row.description for row in df.itertuples()], + show_progress_bar=True, ) - ``` +All of the descriptions are now converted into vectors. There are 40474 vectors of 384 dimensions. 
The output layer of the model has this dimension -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +```python +vectors.shape +# > (40474, 384) +``` -const client = new QdrantClient({ - host: "xyz-example.eu-central.aws.cloud.qdrant.io", - apiKey: "", -}); +7. Download the saved vectors into a new file named `startup_vectors.npy` +```python +np.save("startup_vectors.npy", vectors, allow_pickle=False) ``` -```rust -use qdrant_client::Qdrant; +## Run Qdrant in Docker -let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") - .api_key("") - .build()?; +Next, you need to manage all of your data using a vector engine. Qdrant lets you store, update or delete created vectors. Most importantly, it lets you search for the nearest vectors via a convenient API. -``` +> **Note:** Before you begin, create a project directory and a virtual python environment in it. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +1. Download the Qdrant image from DockerHub. -QdrantClient client = - new QdrantClient( - QdrantGrpcClient.newBuilder( - "xyz-example.eu-central.aws.cloud.qdrant.io", - 6334, - true) - .withApiKey("") - .build()); +```bash +docker pull qdrant/qdrant +``` +2. Start Qdrant inside of Docker. + +```bash +docker run -p 6333:6333 \ + -v $(pwd)/qdrant_storage:/qdrant/storage \ + qdrant/qdrant +``` +You should see output like this +```text +... +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 ``` -```csharp -using Qdrant.Client; +Test the service by going to [http://localhost:6333/](http://localhost:6333/). You should see the Qdrant version info in your browser. -var client = new QdrantClient( - host: "xyz-example.eu-central.aws.cloud.qdrant.io", - https: true, - apiKey: "" -); +All data uploaded to Qdrant is saved inside the `./qdrant_storage` directory and will be persisted even if you recreate the container. + +## Upload data to Qdrant + +1. Install the official Python client to best interact with Qdrant. +```bash +pip install qdrant-client ``` -```go -import "github.com/qdrant/go-client/qdrant" +At this point, you should have startup records in the `startups_demo.json` file, encoded vectors in `startup_vectors.npy` and Qdrant running on a local machine. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "xyz-example.eu-central.aws.cloud.qdrant.io", - Port: 6334, - APIKey: "", - UseTLS: true, -}) +Now you need to write a script to upload all startup data and vectors into the search engine. -``` +2. Create a client object for Qdrant. -## [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#try-the-tutorial-sandbox) Try the Tutorial Sandbox +```python +# Import client library +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance -1. Open the interactive **Tutorial**. Here, you can test basic Qdrant API requests. -2. Using the **Quickstart** instructions, create a collection, add vectors and run a search. -3. The output on the right will show you some basic semantic search results. +client = QdrantClient("http://localhost:6333") +``` -![interactive-tutorial](https://qdrant.tech/docs/gettingstarted/gui-quickstart/interactive-tutorial.png) +3. Related vectors need to be added to a collection. Create a new collection for your startup vectors. 
-## [Anchor](https://qdrant.tech/documentation/cloud-quickstart/\#thats-vector-search) That’s Vector Search! +```python +if not client.collection_exists("startups"): + client.create_collection( + collection_name="startups", + vectors_config=VectorParams(size=384, distance=Distance.COSINE), + ) +``` + -Now that you have a Qdrant Cloud cluster up and running, you should [test remote access](https://qdrant.tech/documentation/cloud/authentication/#test-cluster-access) with a Qdrant Client. +4. Create an iterator over the startup data and vectors. -For more about Qdrant Cloud, check our [dedicated documentation](https://qdrant.tech/documentation/cloud-intro/). +The Qdrant client library defines a special function that allows you to load datasets into the service. +However, since there may be too much data to fit a single computer memory, the function takes an iterator over the data as input. -##### Was this page useful? +```python +fd = open("./startups_demo.json") -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +# payload is now an iterator over startup data +payload = map(json.loads, fd) -Thank you for your feedback! 🙏 +# Load all vectors into memory, numpy array works as iterable for itself. +# Other option would be to use Mmap, if you don't want to load all data into RAM +vectors = np.load("./startup_vectors.npy") +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-quickstart.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +5. Upload the data -On this page: +```python +client.upload_collection( + collection_name="startups", + vectors=vectors, + payload=payload, + ids=None, # Vector ids will be assigned automatically + batch_size=256, # How many vectors will be uploaded in a single request? +) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-quickstart.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Vectors are now uploaded to Qdrant. -× +## Build the search API -[Powered by](https://qdrant.tech/) +Now that all the preparations are complete, let's start building a neural search class. -<|page-60-lllmstxt|> -## documentation -# Qdrant Documentation +In order to process incoming requests, neural search will need 2 things: 1) a model to convert the query into a vector and 2) the Qdrant client to perform search queries. -Qdrant is an AI-native vector database and a semantic search engine. You can use it to extract meaningful information from unstructured data. +1. Create a file named `neural_searcher.py` and specify the following. -[Clone this repo now](https://github.com/qdrant/qdrant_demo/) and build a search engine in five minutes. +```python +from qdrant_client import QdrantClient +from sentence_transformers import SentenceTransformer -[Cloud Quickstart](https://qdrant.tech/documentation/quickstart-cloud/) [Local Quickstart](https://qdrant.tech/documentation/quickstart/) -## Ready to start developing? 
+class NeuralSearcher: + def __init__(self, collection_name): + self.collection_name = collection_name + # Initialize encoder model + self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu") + # initialize Qdrant client + self.qdrant_client = QdrantClient("http://localhost:6333") +``` -Qdrant is open-source and can be self-hosted. However, the quickest way to get started is with our [free tier](https://qdrant.to/cloud) on Qdrant Cloud. It scales easily and provides a UI where you can interact with data. +2. Write the search function. -### Create your first Qdrant Cloud cluster today +```python +def search(self, text: str): + # Convert text query into vector + vector = self.model.encode(text).tolist() -[Get Started](https://qdrant.to/cloud) + # Use `vector` for search for closest vectors in the collection + search_result = self.qdrant_client.query_points( + collection_name=self.collection_name, + query=vector, + query_filter=None, # If you don't want any filters for now + limit=5, # 5 the most closest results is enough + ).points + # `search_result` contains found vector ids with similarity scores along with the stored payload + # In this function you are interested in payload only + payloads = [hit.payload for hit in search_result] + return payloads +``` -![](https://qdrant.tech/img/rocket.svg) +3. Add search filters. -## Optimize Qdrant's performance +With Qdrant it is also feasible to add some conditions to the search. +For example, if you wanted to search for startups in a certain city, the search query could look like this: -Boost search speed, reduce latency, and improve the accuracy and memory usage of your Qdrant deployment. +```python +from qdrant_client.models import Filter -[Learn More](https://qdrant.tech/documentation/guides/optimize/) + ... -[![Documents](https://qdrant.tech/icons/outline/documentation-blue.svg)Documents\\ -**Distributed Deployment** \\ -Scale Qdrant beyond a single node and optimize for high availability, fault tolerance, and billion-scale performance.\\ -Read More](https://qdrant.tech/documentation/guides/distributed_deployment/) + city_of_interest = "Berlin" -[![Documents](https://qdrant.tech/icons/outline/documentation-blue.svg)Documents\\ -**Multitenancy** \\ -Build vector search apps that serve millions of users. Learn about data isolation, security, and performance tuning.\\ -Read More](https://qdrant.tech/documentation/guides/multiple-partitions/) + # Define a filter for cities + city_filter = Filter(**{ + "must": [{ + "key": "city", # Store city information in a field of the same name + "match": { # This condition checks if payload field has the requested value + "value": city_of_interest + } + }] + }) -[![Blog](https://qdrant.tech/icons/outline/blog-purple.svg)Blog\\ -**Vector Quantization** \\ -Learn about cutting-edge techniques for vector quantization and how they can be used to improve search performance.\\ -Read More](https://qdrant.tech/articles/what-is-vector-quantization/) + search_result = self.qdrant_client.query_points( + collection_name=self.collection_name, + query=vector, + query_filter=city_filter, + limit=5 + ).points + ... +``` -× +You have now created a class for neural search queries. Now wrap it up into a service. -[Powered by](https://qdrant.tech/) +## Deploy the search with FastAPI -<|page-61-lllmstxt|> -## cars-recognition -- [Articles](https://qdrant.tech/articles/) -- Fine Tuning Similar Cars Search +To build the service you will use the FastAPI framework. 
-[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +1. Install FastAPI. -# Fine Tuning Similar Cars Search +To install it, use the command -Yusuf Sarıgöz +```bash +pip install fastapi uvicorn +``` -· +2. Implement the service. -June 28, 2022 +Create a file named `service.py` and specify the following. -![Fine Tuning Similar Cars Search](https://qdrant.tech/articles_data/cars-recognition/preview/title.jpg) +The service will have only one API endpoint and will look like this: -Supervised classification is one of the most widely used training objectives in machine learning, -but not every task can be defined as such. For example, +```python +from fastapi import FastAPI -1. Your classes may change quickly —e.g., new classes may be added over time, -2. You may not have samples from every possible category, -3. It may be impossible to enumerate all the possible classes during the training time, -4. You may have an essentially different task, e.g., search or retrieval. +# The file where NeuralSearcher is stored +from neural_searcher import NeuralSearcher -All such problems may be efficiently solved with similarity learning. +app = FastAPI() -N.B.: If you are new to the similarity learning concept, checkout the [awesome-metric-learning](https://github.com/qdrant/awesome-metric-learning) repo for great resources and use case examples. +# Create a neural searcher instance +neural_searcher = NeuralSearcher(collection_name="startups") -However, similarity learning comes with its own difficulties such as: -1. Need for larger batch sizes usually, -2. More sophisticated loss functions, -3. Changing architectures between training and inference. +@app.get("/api/search") +def search_startup(q: str): + return {"result": neural_searcher.search(text=q)} -Quaterion is a fine tuning framework built to tackle such problems in similarity learning. -It uses [PyTorch Lightning](https://www.pytorchlightning.ai/) -as a backend, which is advertized with the motto, “spend more time on research, less on engineering.” -This is also true for Quaterion, and it includes: -1. Trainable and servable model classes, -2. Annotated built-in loss functions, and a wrapper over [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/) when you need even more, -3. Sample, dataset and data loader classes to make it easier to work with similarity learning data, -4. A caching mechanism for faster iterations and less memory footprint. +if __name__ == "__main__": + import uvicorn -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#a-closer-look-at-quaterion) A closer look at Quaterion + uvicorn.run(app, host="0.0.0.0", port=8000) +``` -Let’s break down some important modules: +3. Run the service. -- `TrainableModel`: A subclass of `pl.LightNingModule` that has additional hook methods such as `configure_encoders`, `configure_head`, `configure_metrics` and others -to define objects needed for training and evaluation —see below to learn more on these. -- `SimilarityModel`: An inference-only export method to boost code transfer and lower dependencies during the inference time. -In fact, Quaterion is composed of two packages: -1. `quaterion_models`: package that you need for inference. -2. `quaterion`: package that defines objects needed for training and also depends on `quaterion_models`. -- `Encoder` and `EncoderHead`: Two objects that form a `SimilarityModel`. 
-In most of the cases, you may use a frozen pretrained encoder, e.g., ResNets from `torchvision`, or language modelling -models from `transformers`, with a trainable `EncoderHead` stacked on top of it. -`quaterion_models` offers several ready-to-use `EncoderHead` implementations, -but you may also create your own by subclassing a parent class or easily listing PyTorch modules in a `SequentialHead`. +```bash +python service.py +``` -Quaterion has other objects such as distance functions, evaluation metrics, evaluators, convenient dataset and data loader classes, but these are mostly self-explanatory. -Thus, they will not be explained in detail in this article for brevity. -However, you can always go check out the [documentation](https://quaterion.qdrant.tech/) to learn more about them. +4. Open your browser at [http://localhost:8000/docs](http://localhost:8000/docs). -The focus of this tutorial is a step-by-step solution to a similarity learning problem with Quaterion. -This will also help us better understand how the abovementioned objects fit together in a real project. -Let’s start walking through some of the important parts of the code. +You should be able to see a debug interface for your service. -If you are looking for the complete source code instead, you can find it under the [examples](https://github.com/qdrant/quaterion/tree/master/examples/cars) -directory in the Quaterion repo. +![FastAPI Swagger interface](/docs/fastapi_neural_search.png) -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#dataset) Dataset +Feel free to play around with it, make queries regarding the companies in our corpus, and check out the results. -In this tutorial, we will use the [Stanford Cars](https://pytorch.org/vision/main/generated/torchvision.datasets.StanfordCars.html) -dataset. +## Next steps -![Stanford Cars Dataset](https://storage.googleapis.com/quaterion/docs/class_montage.jpg) +The code from this tutorial has been used to develop a [live online demo](https://qdrant.to/semantic-search-demo). +You can try it to get an intuition for cases when the neural search is useful. +The demo contains a switch that selects between neural and full-text searches. +You can turn the neural search on and off to compare your result with a regular full-text search. -Stanford Cars Dataset +> **Note**: The code for this tutorial can be found here: | [Step 1: Data Preparation Process](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) | [Step 2: Full Code for Neural Search](https://github.com/qdrant/qdrant_demo/tree/sentense-transformers). | -It has 16185 images of cars from 196 classes, -and it is split into training and testing subsets with almost a 50-50% split. -To make things even more interesting, however, we will first merge training and testing subsets, -then we will split it into two again in such a way that the half of the 196 classes will be put into the training set and the other half will be in the testing set. -This will let us test our model with samples from novel classes that it has never seen in the training phase, -which is what supervised classification cannot achieve but similarity learning can. +Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, publish other examples of neural networks and neural search applications. 
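If you prefer to exercise the `/api/search` endpoint from code rather than through the Swagger UI, a minimal request sketch looks like this. It assumes the `service.py` app above is running locally on port 8000, and the query text is only an example:

```python
import requests

response = requests.get(
    "http://localhost:8000/api/search",
    params={"q": "biotech startups working on gene therapy"},  # example query text
    timeout=10,
)
response.raise_for_status()

# The endpoint wraps the matched payloads in a "result" key, as defined in service.py
for startup in response.json()["result"]:
    print(startup)
```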
-In the following code borrowed from [`data.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/data.py): +<|page-93-lllmstxt|> +# Private Cloud Configuration -- `get_datasets()` function performs the splitting task described above. -- `get_dataloaders()` function creates `GroupSimilarityDataLoader` instances from training and testing datasets. -- Datasets are regular PyTorch datasets that emit `SimilarityGroupSample` instances. +The Qdrant Private Cloud helm chart has several configuration options. The following YAML shows all configuration options with their default values: -N.B.: Currently, Quaterion has two data types to represent samples in a dataset. To learn more about `SimilarityPairSample`, check out the [NLP tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html) +```yaml +operator: + # Amount of replicas for the Qdrant operator (v2) + replicaCount: 1 -```python -import numpy as np -import os -import tqdm -from torch.utils.data import Dataset, Subset -from torchvision import datasets, transforms -from typing import Callable -from pytorch_lightning import seed_everything + image: + # Image repository for the qdrant operator + repository: registry.cloud.qdrant.io/qdrant/operator + # Image pullPolicy + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" -from quaterion.dataset import ( - GroupSimilarityDataLoader, - SimilarityGroupSample, -) + # Optional image pull secrets + imagePullSecrets: + - name: qdrant-registry-creds -# set seed to deterministically sample train and test categories later on -seed_everything(seed=42) + nameOverride: "" + fullnameOverride: "operator" -# dataset will be downloaded to this directory under local directory -dataset_path = os.path.join(".", "torchvision", "datasets") + # Service account configuration + serviceAccount: + create: true + annotations: {} -def get_datasets(input_size: int): - # Use Mean and std values for the ImageNet dataset as the base model was pretrained on it. - # taken from https://www.geeksforgeeks.org/how-to-normalize-images-in-pytorch/ - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] + # Additional pod annotations + podAnnotations: {} - # create train and test transforms - transform = transforms.Compose( - [\ - transforms.Resize((input_size, input_size)),\ - transforms.ToTensor(),\ - transforms.Normalize(mean, std),\ - ] - ) + # pod security context + podSecurityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 20001 + fsGroup: 30001 - # we need to merge train and test splits into a full dataset first, - # and then we will split it to two subsets again with each one composed of distinct labels. 
- full_dataset = datasets.StanfordCars( - root=dataset_path, split="train", download=True - ) + datasets.StanfordCars(root=dataset_path, split="test", download=True) + # container security context + securityContext: + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 20001 + allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault - # full_dataset contains examples from 196 categories labeled with an integer from 0 to 195 - # randomly sample half of it to be used for training - train_categories = np.random.choice(a=196, size=196 // 2, replace=False) + # Configuration for the Qdrant operator service to expose metrics + service: + enabled: true + type: ClusterIP + metricsPort: 9290 - # get a list of labels for all samples in the dataset - labels_list = np.array([label for _, label in tqdm.tqdm(full_dataset)]) + # Configuration for the Qdrant operator service monitor to scrape metrics + serviceMonitor: + enabled: false - # get a mask for indices where label is included in train_categories - labels_mask = np.isin(labels_list, train_categories) + # Resource requests and limits for the Qdrant operator + resources: {} - # get a list of indices to be used as train samples - train_indices = np.argwhere(labels_mask).squeeze() + # Node selector for the Qdrant operator + nodeSelector: {} - # others will be used as test samples - test_indices = np.argwhere(np.logical_not(labels_mask)).squeeze() + # Tolerations for the Qdrant operator + tolerations: [] - # now that we have distinct indices for train and test sets, we can use `Subset` to create new datasets - # from `full_dataset`, which contain only the samples at given indices. - # finally, we apply transformations created above. - train_dataset = CarsDataset( - Subset(full_dataset, train_indices), transform=transform - ) + # Affinity configuration for the Qdrant operator + affinity: {} - test_dataset = CarsDataset( - Subset(full_dataset, test_indices), transform=transform - ) + watch: + # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces + onlyReleaseNamespace: true + # an empty list watches all namespaces. + namespaces: [] - return train_dataset, test_dataset + limitRBAC: true -def get_dataloaders( - batch_size: int, - input_size: int, - shuffle: bool = False, -): - train_dataset, test_dataset = get_datasets(input_size) + # Configuration for the Qdrant operator (v2) + settings: + # Does the operator run inside of a Kubernetes cluster (kubernetes) or outside (local) + appEnvironment: kubernetes + # The log level for the operator + # Available options: DEBUG | INFO | WARN | ERROR + logLevel: INFO + # Metrics contains the operator config related the metrics + metrics: + # The port used for metrics + port: 9290 + # Health contains the operator config related the health probe + healthz: + # The port used for the health probe + port: 8285 + # Controller related settings + controller: + # The period a forced recync is done by the controller (if watches are missed / nothing happened) + forceResyncPeriod: 10h + # QPS indicates the maximum QPS to the master from this client. + # Default is 200 + qps: 200 + # Maximum burst for throttle. + # Default is 500. 
+ burst: 500 + # Features contains the settings for enabling / disabling the individual features of the operator + features: + # ClusterManagement contains the settings for qdrant (database) cluster management + clusterManagement: + # Whether or not the Qdrant cluster features are enabled. + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. + # Default is true. + enable: true + # The StorageClass used to make database and snapshot PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + storageClass: + # The StorageClass used to make database PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + #database: + # The StorageClass used to make snapshot PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + #snapshot: + # Qdrant config contains settings specific for the database + qdrant: + # The config where to find the image for qdrant + image: + # The repository where to find the image for qdrant + # Default is "qdrant/qdrant" + repository: registry.cloud.qdrant.io/qdrant/qdrant + # Docker image pull policy + # Default "IfNotPresent", unless the tag is dev, master or latest. Then "Always" + #pullPolicy: + # Docker image pull secret name + # This secret should be available in the namespace where the cluster is running + # Default not set + pullSecretName: qdrant-registry-creds + # storage contains the settings for the storage of the Qdrant cluster + storage: + performance: + # CPU budget, how many CPUs (threads) to allocate for an optimization job. + # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size + # If negative - subtract this number of CPUs from the available CPUs. + # If positive - use this exact number of CPUs. + optimizerCpuBudget: 0 + # Enable async scorer which uses io_uring when rescoring. + # Only supported on Linux, must be enabled in your kernel. + # See: + asyncScorer: false + # Qdrant DB log level + # Available options: DEBUG | INFO | WARN | ERROR + # Default is "INFO" + logLevel: INFO + # Default Qdrant security context configuration + securityContext: + # Enable default security context + # Default is false + enabled: false + # Default user for qdrant container + # Default not set + #user: 1000 + # Default fsGroup for qdrant container + # Default not set + #fsUser: 2000 + # Default group for qdrant container + # Default not set + #group: 3000 + # Network policies configuration for the Qdrant databases + networkPolicies: + # Whether or not NetworkPolicy management is enabled. + # If set to false, no NetworkPolicies will be created. + # Default is true. + enable: true + ingress: + - ports: + - protocol: TCP + port: 6333 + - protocol: TCP + port: 6334 + # Allow DNS resolution from qdrant pods at Kubernetes internal DNS server + egress: + - ports: + - protocol: UDP + port: 53 + # Scheduling config contains the settings specific for scheduling + scheduling: + # Default topology spread constraints (list from type corev1.TopologySpreadConstraint) + # Default is an empty list + topologySpreadConstraints: [] + # Default pod disruption budget (object from type policyv1.PodDisruptionBudgetSpec) + # Default is not set + podDisruptionBudget: {} + # ClusterManager config contains the settings specific for cluster manager + clusterManager: + # Whether or not the cluster manager (on operator level). + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. 
+ # Default is false. + enable: true + # The endpoint address where the cluster manager can be reached + endpointAddress: "http://qdrant-cluster-manager" + # InvocationInterval is the interval between calls (started after the previous call is retured) + # Default is 10 seconds + invocationInterval: 10s + # Timeout is the duration a single call to the cluster manager is allowed to take. + # Default is 30 seconds + timeout: 30s + # Specifies overrides for the manage rules + manageRulesOverrides: + #dry_run: + #max_transfers: + #max_transfers_per_collection: + #rebalance: + #replicate: + # Ingress config contains the settings specific for ingress + ingress: + # Whether or not the Ingress feature is enabled. + # Default is true. + enable: false + # Which specific ingress provider should be used + # Default is KubernetesIngress + provider: KubernetesIngress + # The specific settings when the Provider is QdrantCloudTraefik + qdrantCloudTraefik: + # Enable tls + # Default is false + tls: false + # Secret with TLS certificate + # Default is None + secretName: "" + # List of Traefik middlewares to apply + # Default is an empty list + middlewares: [] + # IP Allowlist Strategy for Traefik + # Default is None + ipAllowlistStrategy: + # Enable body validator plugin and matching ingressroute rules + # Default is false + enableBodyValidatorPlugin: false + # The specific settings when the Provider is KubernetesIngress + kubernetesIngress: + # Name of the ingress class + # Default is None + #ingressClassName: + # TelemetryTimeout is the duration a single call to the cluster telemetry endpoint is allowed to take. + # Default is 3 seconds + telemetryTimeout: 3s + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 20. + maxConcurrentReconciles: 20 + # VolumeExpansionMode specifies the expansion mode, which can be online or offline (e.g. in case of Azure). + # Available options: Online, Offline + # Default is Online + volumeExpansionMode: Online + # BackupManagementConfig contains the settings for backup management + backupManagement: + # Whether or not the backup features are enabled. + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. + # Default is true. + enable: true + # Snapshots contains the settings for snapshots as part of backup management. + snapshots: + # Whether or not the Snapshot feature is enabled. + # Default is true. + enable: true + # The VolumeSnapshotClass used to make VolumeSnapshots. + # Default is "csi-snapclass". + volumeSnapshotClass: "csi-snapclass" + # The duration a snapshot is retained when the phase becomes Failed or Skipped + # Default is 72h (3d). + retainUnsuccessful: 72h + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. + maxConcurrentReconciles: 1 + # ScheduledSnapshots contains the settings for scheduled snapshot as part of backup management. + scheduledSnapshots: + # Whether or not the ScheduledSnapshot feature is enabled. + # Default is true. + enable: true + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. + maxConcurrentReconciles: 1 + # Restores contains the settings for restoring (a snapshot) as part of backup management. + restores: + # Whether or not the Restore feature is enabled. + # Default is true. + enable: true + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. 
+ maxConcurrentReconciles: 1 - train_dataloader = GroupSimilarityDataLoader( - train_dataset, batch_size=batch_size, shuffle=shuffle - ) +qdrant-cluster-manager: + replicaCount: 1 - test_dataloader = GroupSimilarityDataLoader( - test_dataset, batch_size=batch_size, shuffle=False - ) + image: + repository: registry.cloud.qdrant.io/qdrant/cluster-manager + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" - return train_dataloader, test_dataloader + imagePullSecrets: + - name: qdrant-registry-creds + nameOverride: "" + fullnameOverride: "qdrant-cluster-manager" -class CarsDataset(Dataset): - def __init__(self, dataset: Dataset, transform: Callable): - self._dataset = dataset - self._transform = transform + serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" - def __len__(self) -> int: - return len(self._dataset) + podAnnotations: {} + podLabels: {} - def __getitem__(self, index) -> SimilarityGroupSample: - image, label = self._dataset[index] - image = self._transform(image) + podSecurityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 20001 + fsGroup: 30001 - return SimilarityGroupSample(obj=image, group=label) + securityContext: + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 20001 + allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault -``` + service: + type: ClusterIP -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#trainable-model) Trainable Model + networkPolicy: + create: true -Now it’s time to review one of the most exciting building blocks of Quaterion: [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#module-quaterion.train.trainable_model). -It is the base class for models you would like to configure for training, -and it provides several hook methods starting with `configure_` to set up every aspect of the training phase -just like [`pl.LightningModule`](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html), its own base class. -It is central to fine tuning with Quaterion, so we will break down this essential code in [`models.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/models.py) -and review each method separately. Let’s begin with the imports: + resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi -```python -import torch -import torchvision -from quaterion_models.encoders import Encoder -from quaterion_models.heads import EncoderHead, SkipConnectionHead -from torch import nn -from typing import Dict, Union, Optional, List + nodeSelector: {} -from quaterion import TrainableModel -from quaterion.eval.attached_metric import AttachedMetric -from quaterion.eval.group import RetrievalRPrecision -from quaterion.loss import SimilarityLoss, TripletLoss -from quaterion.train.cache import CacheConfig, CacheType + tolerations: [] -from .encoders import CarsEncoder + affinity: {} -``` +qdrant-cluster-exporter: + image: + repository: registry.cloud.qdrant.io/qdrant/qdrant-cluster-exporter + pullPolicy: Always + # Overrides the image tag. Defaults to the chart appVersion. + tag: "" -In the following code snippet, we subclass `TrainableModel`. -You may use `__init__()` to store some attributes to be used in various `configure_*` methods later on. -The more interesting part is, however, in the [`configure_encoders()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_encoders) method. -We need to return an instance of [`Encoder`](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html#quaterion_models.encoders.encoder.Encoder) (or a dictionary with `Encoder` instances as values) from this method. -In our case, it is an instance of `CarsEncoders`, which we will review soon. -Notice now how it is created with a pretrained ResNet152 model whose classification layer is replaced by an identity function. + imagePullSecrets: + - name: qdrant-registry-creds -```python -class Model(TrainableModel): - def __init__(self, lr: float, mining: str): - self._lr = lr - self._mining = mining - super().__init__() + nameOverride: "" + fullnameOverride: "" - def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: - pre_trained_encoder = torchvision.models.resnet152(pretrained=True) - pre_trained_encoder.fc = nn.Identity() - return CarsEncoder(pre_trained_encoder) + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" -``` + rbac: + create: true -In Quaterion, a [`SimilarityModel`](https://quaterion-models.qdrant.tech/quaterion_models.model.html#quaterion_models.model.SimilarityModel) is composed of one or more `Encoder` s -and an [`EncoderHead`](https://quaterion-models.qdrant.tech/quaterion_models.heads.encoder_head.html#quaterion_models.heads.encoder_head.EncoderHead). -`quaterion_models` has [several `EncoderHead` implementations](https://quaterion-models.qdrant.tech/quaterion_models.heads.html#module-quaterion_models.heads) -with a unified API such as a configurable dropout value. -You may use one of them or create your own subclass of `EncoderHead`. -In either case, you need to return an instance of it from [`configure_head`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_head) -In this example, we will use a `SkipConnectionHead`, which is lightweight and more resistant to overfitting. 
+ podAnnotations: {} -```python - def configure_head(self, input_embedding_size) -> EncoderHead: - return SkipConnectionHead(input_embedding_size, dropout=0.1) + podSecurityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + fsGroup: 65534 -``` + securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 -Quaterion has implementations of [some popular loss functions](https://quaterion.qdrant.tech/quaterion.loss.html) for similarity learning, all of which subclass either [`GroupLoss`](https://quaterion.qdrant.tech/quaterion.loss.group_loss.html#quaterion.loss.group_loss.GroupLoss) -or [`PairwiseLoss`](https://quaterion.qdrant.tech/quaterion.loss.pairwise_loss.html#quaterion.loss.pairwise_loss.PairwiseLoss). -In this example, we will use [`TripletLoss`](https://quaterion.qdrant.tech/quaterion.loss.triplet_loss.html#quaterion.loss.triplet_loss.TripletLoss), -which is a subclass of `GroupLoss`. In general, subclasses of `GroupLoss` are used with -datasets in which samples are assigned with some group (or label). In our example label is a make of the car. -Those datasets should emit `SimilarityGroupSample`. -Other alternatives are implementations of `PairwiseLoss`, which consume `SimilarityPairSample` \- pair of objects for which similarity is specified individually. -To see an example of the latter, you may need to check out the [NLP Tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html) + service: + enabled: true + type: ClusterIP + port: 9090 + portName: metrics -```python - def configure_loss(self) -> SimilarityLoss: - return TripletLoss(mining=self._mining, margin=0.5) + strategy: + # Prevents double-scraping by terminating the old pod before creating a new one + # The pod scrapes a large volume of metrics with high cardinality + type: Recreate -``` + resources: {} + # We usually recommend not setting default resources and to leave this as a conscious + # choice for the user. This allows charts to run on environments with fewer + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi -`configure_optimizers()` may be familiar to PyTorch Lightning users, -but there is a novel `self.model` used inside that method. -It is an instance of `SimilarityModel` and is automatically created by Quaterion from the return values of `configure_encoders()` and `configure_head()`. + nodeSelector: {} -```python - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.model.parameters(), self._lr) - return optimizer + tolerations: [] -``` + affinity: {} -Caching in Quaterion is used for avoiding calculation of outputs of a frozen pretrained `Encoder` in every epoch. -When it is configured, outputs will be computed once and cached in the preferred device for direct usage later on. -It provides both a considerable speedup and less memory footprint. -However, it is quite a bit versatile and has several knobs to tune. -To get the most out of its potential, it’s recommended that you check out the [cache tutorial](https://quaterion.qdrant.tech/tutorials/cache_tutorial.html). 
-For the sake of making this article self-contained, you need to return a [`CacheConfig`](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheConfig) -instance from [`configure_caches()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.configure_caches) -to specify cache-related preferences such as: + serviceMonitor: + enabled: true + honorLabels: true + scrapeInterval: 60s + scrapeTimeout: 55s -- [`CacheType`](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheType), i.e., whether to store caches on CPU or GPU, -- `save_dir`, i.e., where to persist caches for subsequent runs, -- `batch_size`, i.e., batch size to be used only when creating caches - the batch size to be used during the actual training might be different. + # Limit RBAC to the release namespace + limitRBAC: false -```python - def configure_caches(self) -> Optional[CacheConfig]: - return CacheConfig( - cache_type=CacheType.AUTO, save_dir="./cache_dir", batch_size=32 - ) + # Watched Namespaces Configuration + watch: + # If true, only the namespace where the exporter is deployed is watched, otherwise it watches the namespaces defined in watch.namespaces + onlyReleaseNamespace: false + # an empty list watches all namespaces + namespaces: [] + # Configuration for the qdrant cluster exporter + config: + # The log level for the cluster-exporter + # Available options: DEBUG | INFO | WARN | ERROR + logLevel: INFO + # Controller related settings + controller: + # Schedule for the controller to do a forced resync (if watches are missed / nothing happened) + forceResyncPeriod: 10h + # Indicates the maximum QPS from this client to the master + # Default is 200 + qps: 200 + # Maximum burst for throttle. + # Default is 500. + burst: 500 + # Maximum number of concurrent reconciliations + maxConcurrentReconciles: 20 + # Controller's object requeueing interval + requeueInterval: 30s + # Exporter Metrics Configuration + metrics: + # The port on which the metrics are exposed + port: 9090 + # The path on which the metrics are exposed + path: /metrics + # Exporter Health Check Configuration + healthz: + # The port used for the health probe + port: 8085 + # Qdrant Telemetry and Metrics Cache Configuration + cache: + # The period after which the cache is invalidated + ttl: 60s + # Qdrant Rest Client Configuration + qdrant: + restAPI: + # The qdrant rest api port + port: 6333 + # Qdrant API Request Timeout after which requests to Qdrant are canceled if not completed + timeout: 20s + # Path where qdrant exposes metrics + metricsPath: "metrics" + # Qdrant Telemetry Configuration + telemetry: + # Path where qdrant exposes telemetry + path: "telemetry" + # The level of details for telemetry + detailsLevel: 6 + # Whether to anonymize the telemetry data + anonymize: true ``` -We have just configured the training-related settings of a `TrainableModel`. -However, evaluation is an integral part of experimentation in machine learning, -and you may configure evaluation metrics by returning one or more [`AttachedMetric`](https://quaterion.qdrant.tech/quaterion.eval.attached_metric.html#quaterion.eval.attached_metric.AttachedMetric) -instances from `configure_metrics()`. Quaterion has several built-in [group](https://quaterion.qdrant.tech/quaterion.eval.group.html) -and [pairwise](https://quaterion.qdrant.tech/quaterion.eval.pair.html) -evaluation metrics. 
- -```python - def configure_metrics(self) -> Union[AttachedMetric, List[AttachedMetric]]: - return AttachedMetric( - "rrp", - metric=RetrievalRPrecision(), - prog_bar=True, - on_epoch=True, - on_step=False, - ) +<|page-94-lllmstxt|> +# Backup and Restore Qdrant Collections Using Snapshots -``` +| Time: 20 min | Level: Beginner | | | +|--------------|-----------------|--|----| -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#encoder) Encoder +A collection is a basic unit of data storage in Qdrant. It contains vectors, their IDs, and payloads. However, keeping the search efficient requires additional data structures to be built on top of the data. Building these data structures may take a while, especially for large collections. +That's why using snapshots is the best way to export and import Qdrant collections, as they contain all the bits and pieces required to restore the entire collection efficiently. -As previously stated, a `SimilarityModel` is composed of one or more `Encoder` s and an `EncoderHead`. -Even if we freeze pretrained `Encoder` instances, -`EncoderHead` is still trainable and has enough parameters to adapt to the new task at hand. -It is recommended that you set the `trainable` property to `False` whenever possible, -as it lets you benefit from the caching mechanism described above. -Another important property is `embedding_size`, which will be passed to `TrainableModel.configure_head()` as `input_embedding_size` -to let you properly initialize the head layer. -Let’s see how an `Encoder` is implemented in the following code borrowed from [`encoders.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/encoders.py): +This tutorial will show you how to create a snapshot of a collection and restore it. Since working with snapshots in a distributed environment might be thought to be a bit more complex, we will use a 3-node Qdrant cluster. However, the same approach applies to a single-node setup. -```python -import os + -import torch -import torch.nn as nn -from quaterion_models.encoders import Encoder +You can use the techniques described in this page to migrate a cluster. Follow the instructions +in this tutorial to create and download snapshots. When you [Restore from snapshot](#restore-from-snapshot), restore your data to the new cluster. -class CarsEncoder(Encoder): - def __init__(self, encoder_model: nn.Module): - super().__init__() - self._encoder = encoder_model - self._embedding_size = 2048 # last dimension from the ResNet model +## Prerequisites - @property - def trainable(self) -> bool: - return False +Let's assume you already have a running Qdrant instance or a cluster. If not, you can follow the [installation guide](/documentation/guides/installation/) to set up a local Qdrant instance or use [Qdrant Cloud](https://cloud.qdrant.io/) to create a cluster in a few clicks. - @property - def embedding_size(self) -> int: - return self._embedding_size +Once the cluster is running, let's install the required dependencies: +```shell +pip install qdrant-client datasets ``` -An `Encoder` is a regular `torch.nn.Module` subclass, -and we need to implement the forward pass logic in the `forward` method. 
-Depending on how you create your submodules, this method may be more complex; -however, we simply pass the input through a pretrained ResNet152 backbone in this example: - -```python - def forward(self, images): - embeddings = self._encoder.forward(images) - return embeddings - -``` +### Establish a connection to Qdrant -An important step of machine learning development is proper saving and loading of models. -Quaterion lets you save your `SimilarityModel` with [`TrainableModel.save_servable()`](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel.save_servable) -and restore it with [`SimilarityModel.load()`](https://quaterion-models.qdrant.tech/quaterion_models.model.html#quaterion_models.model.SimilarityModel.load). -To be able to use these two methods, you need to implement `save()` and `load()` methods in your `Encoder`. -Additionally, it is also important that you define your subclass of `Encoder` outside the `__main__` namespace, -i.e., in a separate file from your main entry point. -It may not be restored properly otherwise. +We are going to use the Python SDK and raw HTTP calls to interact with Qdrant. Since we are going to use a 3-node cluster, we need to know the URLs of all the nodes. For the simplicity, let's keep them all in constants, along with the API key, so we can refer to them later: ```python - def save(self, output_path: str): - os.makedirs(output_path, exist_ok=True) - torch.save(self._encoder, os.path.join(output_path, "encoder.pth")) - - @classmethod - def load(cls, input_path): - encoder_model = torch.load(os.path.join(input_path, "encoder.pth")) - return CarsEncoder(encoder_model) - +QDRANT_MAIN_URL = "https://my-cluster.com:6333" +QDRANT_NODES = ( + "https://node-0.my-cluster.com:6333", + "https://node-1.my-cluster.com:6333", + "https://node-2.my-cluster.com:6333", +) +QDRANT_API_KEY = "my-api-key" ``` -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#training) Training - -With all essential objects implemented, it is easy to bring them all together and run a training loop with the [`Quaterion.fit()`](https://quaterion.qdrant.tech/quaterion.main.html#quaterion.main.Quaterion.fit) -method. It expects: - -- A `TrainableModel`, -- A [`pl.Trainer`](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html), -- A [`SimilarityDataLoader`](https://quaterion.qdrant.tech/quaterion.dataset.similarity_data_loader.html#quaterion.dataset.similarity_data_loader.SimilarityDataLoader) for training data, -- And optionally, another `SimilarityDataLoader` for evaluation data. + -We need to import a few objects to prepare all of these: +We can now create a client instance: ```python -import os -import pytorch_lightning as pl -import torch -from pytorch_lightning.callbacks import EarlyStopping, ModelSummary - -from quaterion import Quaterion -from .data import get_dataloaders -from .models import Model +from qdrant_client import QdrantClient +client = QdrantClient(QDRANT_MAIN_URL, api_key=QDRANT_API_KEY) ``` -The `train()` function in the following code snippet expects several hyperparameter values as arguments. -They can be defined in a `config.py` or passed from the command line. -However, that part of the code is omitted for brevity. -Instead let’s focus on how all the building blocks are initialized and passed to `Quaterion.fit()`, -which is responsible for running the whole loop. 
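As an optional sanity check that is not part of the original tutorial, you can confirm that the client created above actually reaches the cluster before creating any collections. The snippet only uses the constants defined earlier:

```python
import requests

# Fails fast if the main URL or the API key is wrong
print(client.get_collections())

# The individual nodes are contacted directly over HTTP later on, so check them as well
for node_url in QDRANT_NODES:
    response = requests.get(
        f"{node_url}/collections",
        headers={"api-key": QDRANT_API_KEY},
        timeout=5,
    )
    print(node_url, response.status_code)
```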
-When the training loop is complete, you can simply call `TrainableModel.save_servable()` -to save the current state of the `SimilarityModel` instance: - -```python -def train( - lr: float, - mining: str, - batch_size: int, - epochs: int, - input_size: int, - shuffle: bool, - save_dir: str, -): - model = Model( - lr=lr, - mining=mining, - ) +First of all, we are going to create a collection from a precomputed dataset. If you already have a collection, you can skip this step and start by [creating a snapshot](#create-and-download-snapshots). - train_dataloader, val_dataloader = get_dataloaders( - batch_size=batch_size, input_size=input_size, shuffle=shuffle - ) +
+ (Optional) Create collection and import data - early_stopping = EarlyStopping( - monitor="validation_loss", - patience=50, - ) +### Load the dataset - trainer = pl.Trainer( - gpus=1 if torch.cuda.is_available() else 0, - max_epochs=epochs, - callbacks=[early_stopping, ModelSummary(max_depth=3)], - enable_checkpointing=False, - log_every_n_steps=1, - ) +We are going to use a dataset with precomputed embeddings, available on Hugging Face Hub. The dataset is called [Qdrant/arxiv-titles-instructorxl-embeddings](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) and was created using the [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) model. It contains 2.25M embeddings for the titles of the papers from the [arXiv](https://arxiv.org/) dataset. - Quaterion.fit( - trainable_model=model, - trainer=trainer, - train_dataloader=train_dataloader, - val_dataloader=val_dataloader, - ) +Loading the dataset is as simple as: - model.save_servable(save_dir) +```python +from datasets import load_dataset +dataset = load_dataset( + "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True +) ``` -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#evaluation) Evaluation - -Let’s see what we have achieved with these simple steps. -[`evaluate.py`](https://github.com/qdrant/quaterion/blob/master/examples/cars/evaluate.py) has two functions to evaluate both the baseline model and the tuned similarity model. -We will review only the latter for brevity. -In addition to the ease of restoring a `SimilarityModel`, this code snippet also shows -how to use [`Evaluator`](https://quaterion.qdrant.tech/quaterion.eval.evaluator.html#quaterion.eval.evaluator.Evaluator) -to evaluate the performance of a `SimilarityModel` on a given dataset -by given evaluation metrics. - -![Comparison of original and tuned models for retrieval](https://storage.googleapis.com/quaterion/docs/original_vs_tuned_cars.png) - -Comparison of original and tuned models for retrieval - -Full evaluation of a dataset usually grows exponentially, -and thus you may want to perform a partial evaluation on a sampled subset. -In this case, you may use [samplers](https://quaterion.qdrant.tech/quaterion.eval.samplers.html) -to limit the evaluation. -Similar to `Quaterion.fit()` used for training, [`Quaterion.evaluate()`](https://quaterion.qdrant.tech/quaterion.main.html#quaterion.main.Quaterion.evaluate) -runs a complete evaluation loop. It takes the following as arguments: - -- An `Evaluator` instance created with given evaluation metrics and a `Sampler`, -- The `SimilarityModel` to be evaluated, -- And the evaluation dataset. +We used the streaming mode, so the dataset is not loaded into memory. 
Instead, we can iterate through it and extract the id and vector embedding: ```python -def eval_tuned_encoder(dataset, device): - print("Evaluating tuned encoder...") - tuned_cars_model = SimilarityModel.load( - os.path.join(os.path.dirname(__file__), "cars_encoders") - ).to(device) - tuned_cars_model.eval() - - result = Quaterion.evaluate( - evaluator=Evaluator( - metrics=RetrievalRPrecision(), - sampler=GroupSampler(sample_size=1000, device=device, log_progress=True), - ), - model=tuned_cars_model, - dataset=dataset, - ) - - print(result) - +for payload in dataset: + id_ = payload.pop("id") + vector = payload.pop("vector") + print(id_, vector, payload) ``` -## [Anchor](https://qdrant.tech/articles/cars-recognition/\#conclusion) Conclusion - -In this tutorial, we trained a similarity model to search for similar cars from novel categories unseen in the training phase. -Then, we evaluated it on a test dataset by the Retrieval R-Precision metric. -The base model scored 0.1207, -and our tuned model hit 0.2540, a twice higher score. -These scores can be seen in the following figure: - -![Metrics for the base and tuned models](https://qdrant.tech/articles_data/cars-recognition/cars_metrics.png) - -Metrics for the base and tuned models - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/cars-recognition.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/cars-recognition.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-62-lllmstxt|> -## rag-chatbot-scaleway -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Blog-Reading Chatbot with GPT-4o - -# [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#blog-reading-chatbot-with-gpt-4o) Blog-Reading Chatbot with GPT-4o - -| Time: 90 min | Level: Advanced | [GitHub](https://github.com/qdrant/examples/blob/langchain-lcel-rag/langchain-lcel-rag/Langchain-LCEL-RAG-Demo.ipynb) | | -| --- | --- | --- | --- | - -In this tutorial, you will build a RAG system that combines blog content ingestion with the capabilities of semantic search. **OpenAI’s GPT-4o LLM** is powerful, but scaling its use requires us to supply context systematically. - -RAG enhances the LLM’s generation of answers by retrieving relevant documents to aid the question-answering process. This setup showcases the integration of advanced search and AI language processing to improve information retrieval and generation tasks. - -A notebook for this tutorial is available on [GitHub](https://github.com/qdrant/examples/blob/langchain-lcel-rag/langchain-lcel-rag/Langchain-LCEL-RAG-Demo.ipynb). - -**Data Privacy and Sovereignty:** RAG applications often rely on sensitive or proprietary internal data. Running the entire stack within your own environment becomes crucial for maintaining control over this data. 
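Before moving on to the snapshot itself, an optional quick check, not part of the original tutorial, confirms that the uploaded subset actually landed in the collection:

```python
# Exact count of the points stored in the collection; 1000 is expected for the subset above
print(client.count(collection_name="test_collection", exact=True))
```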
Qdrant Hybrid Cloud deployed on [Scaleway](https://www.scaleway.com/) addresses this need perfectly, offering a secure, scalable platform that still leverages the full potential of RAG. Scaleway offers serverless [Functions](https://www.scaleway.com/en/serverless-functions/) and serverless [Jobs](https://www.scaleway.com/en/serverless-jobs/), both of which are ideal for embedding creation in large-scale RAG cases. - -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#components) Components - -- **Cloud Host:** [Scaleway on managed Kubernetes](https://www.scaleway.com/en/kubernetes-kapsule/) for compatibility with Qdrant Hybrid Cloud. -- **Vector Database:** Qdrant Hybrid Cloud as the vector search engine for retrieval. -- **LLM:** GPT-4o, developed by OpenAI is utilized as the generator for producing answers. -- **Framework:** [LangChain](https://www.langchain.com/) for extensive RAG capabilities. - -![Architecture diagram](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/architecture-diagram.png) - -> Langchain [supports a wide range of LLMs](https://python.langchain.com/docs/integrations/chat/), and GPT-4o is used as the main generator in this tutorial. You can easily swap it out for your preferred model that might be launched on your premises to complete the fully private setup. For the sake of simplicity, we used the OpenAI APIs, but LangChain makes the transition seamless. - -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#deploying-qdrant-hybrid-cloud-on-scaleway) Deploying Qdrant Hybrid Cloud on Scaleway - -[Scaleway Kapsule](https://www.scaleway.com/en/kubernetes-kapsule/) and [Kosmos](https://www.scaleway.com/en/kubernetes-kosmos/) are managed Kubernetes services from [Scaleway](https://www.scaleway.com/en/). They abstract away the complexities of managing and operating a Kubernetes cluster. The primary difference being, Kapsule clusters are composed solely of Scaleway Instances. Whereas, a Kosmos cluster is a managed multi-cloud Kubernetes engine that allows you to connect instances from any cloud provider to a single managed Control-Plane. - -1. To start using managed Kubernetes on Scaleway, follow the [platform-specific documentation](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#scaleway). -2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/). - -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#prerequisites) Prerequisites - -To prepare the environment for working with Qdrant and related libraries, it’s necessary to install all required Python packages. This can be done using Poetry, a tool for dependency management and packaging in Python. The code snippet imports various libraries essential for the tasks ahead, including `bs4` for parsing HTML and XML documents, `langchain` and its community extensions for working with language models and document loaders, and `Qdrant` for vector storage and retrieval. These imports lay the groundwork for utilizing Qdrant alongside other tools for natural language processing and machine learning tasks. - -Qdrant will be running on a specific URL and access will be restricted by the API key. 
Make sure to store them both as environment variables as well: - -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +A single payload looks like this: +```json +{ + 'title': 'Dynamics of partially localized brane systems', + 'DOI': '1109.1415' +} ``` -_Optional:_ Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). - -```shell -export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY="your-api-key" -export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" -``` +### Create a collection -Now you can get started: +First things first, we need to create our collection. We're not going to play with the configuration of it, but it makes sense to do it right now. +The configuration is also a part of the collection snapshot. ```python -import getpass -import os - -import bs4 -from langchain import hub -from langchain_community.document_loaders import WebBaseLoader -from langchain_qdrant import Qdrant -from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnablePassthrough -from langchain_openai import ChatOpenAI, OpenAIEmbeddings -from langchain_text_splitters import RecursiveCharacterTextSplitter +from qdrant_client import models +if not client.collection_exists("test_collection"): + client.create_collection( + collection_name="test_collection", + vectors_config=models.VectorParams( + size=768, # Size of the embedding vector generated by the InstructorXL model + distance=models.Distance.COSINE + ), + ) ``` -Set up the OpenAI API key: +### Upload the dataset -```python -os.environ["OPENAI_API_KEY"] = getpass.getpass() +Calculating the embeddings is usually a bottleneck of the vector search pipelines, but we are happy to have them in place already. Since the goal of this tutorial is to show how to create a snapshot, **we are going to upload only a small part of the dataset**. -``` +```python +ids, vectors, payloads = [], [], [] +for payload in dataset: + id_ = payload.pop("id") + vector = payload.pop("vector") -Initialize the language model: + ids.append(id_) + vectors.append(vector) + payloads.append(payload) -```python -llm = ChatOpenAI(model="gpt-4o") + # We are going to upload only 1000 vectors + if len(ids) == 1000: + break +client.upsert( + collection_name="test_collection", + points=models.Batch( + ids=ids, + vectors=vectors, + payloads=payloads, + ), +) ``` -It is here that we configure both the Embeddings and LLM. You can replace this with your own models using Ollama or other services. Scaleway has some great [L4 GPU Instances](https://www.scaleway.com/en/l4-gpu-instance/) you can use for compute here. - -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#download-and-parse-data) Download and parse data +Our collection is now ready to be used for search. Let's create a snapshot of it. -To begin working with blog post contents, the process involves loading and parsing the HTML content. This is achieved using `urllib` and `BeautifulSoup`, which are tools designed for such tasks. After the content is loaded and parsed, it is indexed using Qdrant, a powerful tool for managing and querying vector data. The code snippet demonstrates how to load, chunk, and index the contents of a blog post by specifying the URL of the blog and the specific HTML elements to parse. 
This step is crucial for preparing the data for further processing and analysis with Qdrant. +
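+Before taking the snapshot, you may optionally want to confirm that the upload went through. This quick check is not part of the original walkthrough; it simply reuses the `client` instance created earlier, and with the loop above the exact count should be 1000:
+
+```python
+# Optional sanity check (not in the original tutorial): count the points that were just upserted
+count_result = client.count(collection_name="test_collection", exact=True)
+print(count_result.count)
+```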
-```python -# Load, chunk and index the contents of the blog. -loader = WebBaseLoader( - web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), - bs_kwargs=dict( - parse_only=bs4.SoupStrainer( - class_=("post-content", "post-title", "post-header") - ) - ), -) -docs = loader.load() +If you already have a collection, you can skip the previous step and start by [creating a snapshot](#create-and-download-snapshots). -``` +## Create and download snapshots -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#chunking-data) Chunking data +Qdrant exposes an HTTP endpoint to request creating a snapshot, but we can also call it with the Python SDK. +Our setup consists of 3 nodes, so we need to call the endpoint **on each of them** and create a snapshot on each node. While using Python SDK, that means creating a separate client instance for each node. -When dealing with large documents, such as a blog post exceeding 42,000 characters, it’s crucial to manage the data efficiently for processing. Many models have a limited context window and struggle with long inputs, making it difficult to extract or find relevant information. To overcome this, the document is divided into smaller chunks. This approach enhances the model’s ability to process and retrieve the most pertinent sections of the document effectively. -In this scenario, the document is split into chunks using the `RecursiveCharacterTextSplitter` with a specified chunk size and overlap. This method ensures that no critical information is lost between chunks. Following the splitting, these chunks are then indexed into Qdrant—a vector database for efficient similarity search and storage of embeddings. The `Qdrant.from_documents` function is utilized for indexing, with documents being the split chunks and embeddings generated through `OpenAIEmbeddings`. The entire process is facilitated within an in-memory database, signifying that the operations are performed without the need for persistent storage, and the collection is named “lilianweng” for reference. + -This chunking and indexing strategy significantly improves the management and retrieval of information from large documents, making it a practical solution for handling extensive texts in data processing workflows. ```python -text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) -text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) -splits = text_splitter.split_documents(docs) - -vectorstore = Qdrant.from_documents( - documents=splits, - embedding=OpenAIEmbeddings(), - collection_name="lilianweng", - url=os.environ["QDRANT_URL"], - api_key=os.environ["QDRANT_API_KEY"], -) +snapshot_urls = [] +for node_url in QDRANT_NODES: + node_client = QdrantClient(node_url, api_key=QDRANT_API_KEY) + snapshot_info = node_client.create_snapshot(collection_name="test_collection") + snapshot_url = f"{node_url}/collections/test_collection/snapshots/{snapshot_info.name}" + snapshot_urls.append(snapshot_url) ``` -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#retrieve-and-generate-content) Retrieve and generate content +```http +// for `https://node-0.my-cluster.com:6333` +POST /collections/test_collection/snapshots -The `vectorstore` is used as a retriever to fetch relevant documents based on vector similarity. 
The `hub.pull("rlm/rag-prompt")` function is used to pull a specific prompt from a repository, which is designed to work with retrieved documents and a question to generate a response. +// for `https://node-1.my-cluster.com:6333` +POST /collections/test_collection/snapshots -The `format_docs` function formats the retrieved documents into a single string, preparing them for further processing. This formatted string, along with a question, is passed through a chain of operations. Firstly, the context (formatted documents) and the question are processed by the retriever and the prompt. Then, the result is fed into a large language model ( `llm`) for content generation. Finally, the output is parsed into a string format using `StrOutputParser()`. +// for `https://node-2.my-cluster.com:6333` +POST /collections/test_collection/snapshots +``` -This chain of operations demonstrates a sophisticated approach to information retrieval and content generation, leveraging both the semantic understanding capabilities of vector search and the generative prowess of large language models. +
+ Response -Now, retrieve and generate data using relevant snippets from the blogL +```json +{ + "result": { + "name": "test_collection-559032209313046-2024-01-03-13-20-11.snapshot", + "creation_time": "2024-01-03T13:20:11", + "size": 18956800 + }, + "status": "ok", + "time": 0.307644965 +} +``` +
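+If you want to double-check that a snapshot exists on a node before downloading it, one option (not shown in the original text) is to list the snapshots of the collection, reusing the same per-node client setup as above:
+
+```python
+from qdrant_client import QdrantClient
+
+# Illustrative check: list existing snapshots for the collection on the first node
+node_client = QdrantClient(QDRANT_NODES[0], api_key=QDRANT_API_KEY)
+for snapshot in node_client.list_snapshots(collection_name="test_collection"):
+    print(snapshot.name, snapshot.creation_time, snapshot.size)
+```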
-```python -retriever = vectorstore.as_retriever() -prompt = hub.pull("rlm/rag-prompt") -def format_docs(docs): - return "\n\n".join(doc.page_content for doc in docs) -rag_chain = ( - {"context": retriever | format_docs, "question": RunnablePassthrough()} - | prompt - | llm - | StrOutputParser() -) +Once we have the snapshot URLs, we can download them. Please make sure to include the API key in the request headers. +Downloading the snapshot **can be done only through the HTTP API**, so we are going to use the `requests` library. -``` +```python +import requests +import os -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#invoking-the-rag-chain) Invoking the RAG Chain +# Create a directory to store snapshots +os.makedirs("snapshots", exist_ok=True) -```python -rag_chain.invoke("What is Task Decomposition?") +local_snapshot_paths = [] +for snapshot_url in snapshot_urls: + snapshot_name = os.path.basename(snapshot_url) + local_snapshot_path = os.path.join("snapshots", snapshot_name) + + response = requests.get( + snapshot_url, headers={"api-key": QDRANT_API_KEY} + ) + with open(local_snapshot_path, "wb") as f: + response.raise_for_status() + f.write(response.content) + local_snapshot_paths.append(local_snapshot_path) ``` -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/\#next-steps) Next steps: +Alternatively, you can use the `wget` command: -We built a solid foundation for a simple chatbot, but there is still a lot to do. If you want to make the -system production-ready, you should consider implementing the mechanism into your existing stack. We recommend +```bash +wget https://node-0.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313046-2024-01-03-13-20-11.snapshot \ + --header="api-key: ${QDRANT_API_KEY}" \ + -O node-0-shapshot.snapshot -Our vector database can easily be hosted on [Scaleway](https://www.scaleway.com/), our trusted [Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) partner. This means that Qdrant can be run from your Scaleway region, but the database itself can still be managed from within Qdrant Cloud’s interface. Both products have been tested for compatibility and scalability, and we recommend their [managed Kubernetes](https://www.scaleway.com/en/kubernetes-kapsule/) service. -Their French deployment regions e.g. France are excellent for network latency and data sovereignty. For hosted GPUs, try [rendering with L4 GPU instances](https://www.scaleway.com/en/l4-gpu-instance/). +wget https://node-1.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313047-2024-01-03-13-20-12.snapshot \ + --header="api-key: ${QDRANT_API_KEY}" \ + -O node-1-shapshot.snapshot -If you have any questions, feel free to ask on our [Discord community](https://qdrant.to/discord). +wget https://node-2.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313048-2024-01-03-13-20-13.snapshot \ + --header="api-key: ${QDRANT_API_KEY}" \ + -O node-2-shapshot.snapshot +``` -##### Was this page useful? +The snapshots are now stored locally. We can use them to restore the collection to a different Qdrant instance, or treat them as a backup. We will create another collection using the same data on the same cluster. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Restore from snapshot -Thank you for your feedback! 
🙏 +Our brand-new snapshot is ready to be restored. Typically, it is used to move a collection to a different Qdrant instance, but we are going to use it to create a new collection on the same cluster. +It is just going to have a different name, `test_collection_import`. We do not need to create a collection first, as it is going to be created automatically. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-scaleway.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Restoring collection is also done separately on each node, but our Python SDK does not support it yet. We are going to use the HTTP API instead, +and send a request to each node using `requests` library. -On this page: +```python +for node_url, snapshot_path in zip(QDRANT_NODES, local_snapshot_paths): + snapshot_name = os.path.basename(snapshot_path) + requests.post( + f"{node_url}/collections/test_collection_import/snapshots/upload?priority=snapshot", + headers={ + "api-key": QDRANT_API_KEY, + }, + files={"snapshot": (snapshot_name, open(snapshot_path, "rb"))}, + ) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-scaleway.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Alternatively, you can use the `curl` command: -× +```bash +curl -X POST 'https://node-0.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ + -H 'api-key: ${QDRANT_API_KEY}' \ + -H 'Content-Type:multipart/form-data' \ + -F 'snapshot=@node-0-shapshot.snapshot' -[Powered by](https://qdrant.tech/) +curl -X POST 'https://node-1.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ + -H 'api-key: ${QDRANT_API_KEY}' \ + -H 'Content-Type:multipart/form-data' \ + -F 'snapshot=@node-1-shapshot.snapshot' -<|page-63-lllmstxt|> -## operator-configuration -- [Documentation](https://qdrant.tech/documentation/) -- [Hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/) -- Configure the Qdrant Operator +curl -X POST 'https://node-2.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ + -H 'api-key: ${QDRANT_API_KEY}' \ + -H 'Content-Type:multipart/form-data' \ + -F 'snapshot=@node-2-shapshot.snapshot' +``` -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/\#configuring-qdrant-operator-advanced-options) Configuring Qdrant Operator: Advanced Options -The Qdrant Operator has several configuration options, which can be configured in the advanced section of your Hybrid Cloud Environment. +**Important:** We selected `priority=snapshot` to make sure that the snapshot is preferred over the data stored on the node. You can read mode about the priority in the [documentation](/documentation/concepts/snapshots/#snapshot-priority). -The following YAML shows all configuration options with their default values: +Apart from Snapshots, Qdrant also provides the [Qdrant Migration Tool](https://github.com/qdrant/migration) that supports: +- Migration between Qdrant Cloud instances. +- Migrating vectors from other providers into Qdrant. +- Migrating from Qdrant OSS to Qdrant Cloud. 
-```yaml -# Additional pod annotations -podAnnotations: {} +Follow our [migration guide](/documentation/database-tutorials/migration/) to learn how to effectively use the Qdrant Migration tool. -# Configuration for the Qdrant operator service monitor to scrape metrics -serviceMonitor: - enabled: false +<|page-95-lllmstxt|> +# Creating a Qdrant Cluster in Hybrid Cloud -# Resource requests and limits for the Qdrant operator -resources: {} +Once a Hybrid Cloud Environment has been created you can follow the normal process to [create a Qdrant cluster](/documentation/cloud/create-cluster/) in that environment. This page also contains additional information on how to create a [production-ready cluster](/documentation/cloud/create-cluster/#creating-a-production-ready-cluster). -# Node selector for the Qdrant operator -nodeSelector: {} +Make sure to select your Hybrid Cloud Environment as the target. -# Tolerations for the Qdrant operator -tolerations: [] +![Create Hybrid Cloud Cluster](/documentation/cloud/hybrid_cloud_create_cluster.png) -# Affinity configuration for the Qdrant operator -affinity: {} +Note that in the "Kubernetes Configuration" section you can additionally configure: -# Configuration for the Qdrant operator (v2) -settings: - # The log level for the operator - # Available options: DEBUG | INFO | WARN | ERROR - logLevel: INFO - # Controller related settings - controller: - # The period a forced recync is done by the controller (if watches are missed / nothing happened) - forceResyncPeriod: 10h - # QPS indicates the maximum QPS to the master from this client. - # Default is 200 - qps: 200 - # Maximum burst for throttle. - # Default is 500. - burst: 500 - # Features contains the settings for enabling / disabling the individual features of the operator - features: - # ClusterManagement contains the settings for qdrant (database) cluster management - clusterManagement: - # Whether or not the Qdrant cluster features are enabled. - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is true. - enable: true - # The StorageClass used to make database and snapshot PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - storageClass: - # The StorageClass used to make database PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - #database: - # The StorageClass used to make snapshot PVCs. - # Default is nil, meaning the default storage class of Kubernetes. - #snapshot: - # Qdrant config contains settings specific for the database - qdrant: - # The config where to find the image for qdrant - image: - # The repository where to find the image for qdrant - # Default is "qdrant/qdrant" - repository: qdrant/qdrant - # Docker image pull policy - # Default "IfNotPresent", unless the tag is dev, master or latest. Then "Always" - #pullPolicy: - # Docker image pull secret name - # This secret should be available in the namespace where the cluster is running - # Default not set - #pullSecretName: - # storage contains the settings for the storage of the Qdrant cluster - storage: - performance: - # CPU budget, how many CPUs (threads) to allocate for an optimization job. - # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size - # If negative - subtract this number of CPUs from the available CPUs. - # If positive - use this exact number of CPUs. - optimizerCpuBudget: 0 - # Enable async scorer which uses io_uring when rescoring. 
- # Only supported on Linux, must be enabled in your kernel. - # See: - asyncScorer: false - # Qdrant DB log level - # Available options: DEBUG | INFO | WARN | ERROR - # Default is "INFO" - logLevel: INFO - # Default Qdrant security context configuration - securityContext: - # Enable default security context - # Default is false - enabled: false - # Default user for qdrant container - # Default not set - #user: 1000 - # Default fsGroup for qdrant container - # Default not set - #fsUser: 2000 - # Default group for qdrant container - # Default not set - #group: 3000 - # Network policies configuration for the Qdrant databases - networkPolicies: - ingress: - - ports: - - protocol: TCP - port: 6333 - - protocol: TCP - port: 6334 - # Allow DNS resolution from qdrant pods at Kubernetes internal DNS server - egress: - - ports: - - protocol: UDP - port: 53 - # Scheduling config contains the settings specific for scheduling - scheduling: - # Default topology spread constraints (list from type corev1.TopologySpreadConstraint) - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: "kubernetes.io/hostname" - whenUnsatisfiable: "ScheduleAnyway" - # Default pod disruption budget (object from type policyv1.PodDisruptionBudgetSpec) - podDisruptionBudget: - maxUnavailable: 1 - # ClusterManager config contains the settings specific for cluster manager - clusterManager: - # Whether or not the cluster manager (on operator level). - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is false. - enable: true - # The endpoint address the cluster manager could be reached - # If set, this should be a full URL like: http://cluster-manager.qdrant-cloud-ns.svc.cluster.local:7333 - endpointAddress: http://qdrant-cluster-manager:80 - # InvocationInterval is the interval between calls (started after the previous call is retured) - # Default is 10 seconds - invocationInterval: 10s - # Timeout is the duration a single call to the cluster manager is allowed to take. - # Default is 30 seconds - timeout: 30s - # Specifies overrides for the manage rules - manageRulesOverrides: - #dry_run: - #max_transfers: - #max_transfers_per_collection: - #rebalance: - #replicate: - # Ingress config contains the settings specific for ingress - ingress: - # Whether or not the Ingress feature is enabled. - # Default is true. - enable: false - # Which specific ingress provider should be used - # Default is KubernetesIngress - provider: KubernetesIngress - # The specific settings when the Provider is QdrantCloudTraefik - qdrantCloudTraefik: - # Enable tls - # Default is false - tls: false - # Secret with TLS certificate - # Default is None - secretName: "" - # List of Traefik middlewares to apply - # Default is an empty list - middlewares: [] - # IP Allowlist Strategy for Traefik - # Default is None - ipAllowlistStrategy: - # Enable body validator plugin and matching ingressroute rules - # Default is false - enableBodyValidatorPlugin: false - # The specific settings when the Provider is KubernetesIngress - kubernetesIngress: - # Name of the ingress class - # Default is None - #ingressClassName: - # TelemetryTimeout is the duration a single call to the cluster telemetry endpoint is allowed to take. - # Default is 3 seconds - telemetryTimeout: 3s - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 20. 
- maxConcurrentReconciles: 20 - # VolumeExpansionMode specifies the expansion mode, which can be online or offline (e.g. in case of Azure). - # Available options: Online, Offline - # Default is Online - volumeExpansionMode: Online - # BackupManagementConfig contains the settings for backup management - backupManagement: - # Whether or not the backup features are enabled. - # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. - # Default is true. - enable: true - # Snapshots contains the settings for snapshots as part of backup management. - snapshots: - # Whether or not the Snapshot feature is enabled. - # Default is true. - enable: true - # The VolumeSnapshotClass used to make VolumeSnapshots. - # Default is "csi-snapclass". - volumeSnapshotClass: "csi-snapclass" - # The duration a snapshot is retained when the phase becomes Failed or Skipped - # Default is 72h (3d). - retainUnsuccessful: 72h - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 - # ScheduledSnapshots contains the settings for scheduled snapshot as part of backup management. - scheduledSnapshots: - # Whether or not the ScheduledSnapshot feature is enabled. - # Default is true. - enable: true - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 - # Restores contains the settings for restoring (a snapshot) as part of backup management. - restores: - # Whether or not the Restore feature is enabled. - # Default is true. - enable: true - # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. - maxConcurrentReconciles: 1 +* Node selectors for the Qdrant database pods +* Toleration for the Qdrant database pods +* Additional labels for the Qdrant database pods +* A service type and annotations for the Qdrant database service -``` +These settings can also be changed after the cluster is created on the cluster detail page. -##### Was this page useful? +![Create Hybrid Cloud Cluster - Kubernetes Configuration](/documentation/cloud/hybrid_cloud_kubernetes_configuration.png) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Scheduling Configuration -Thank you for your feedback! 🙏 +When creating or editing a cluster, you can configure how the database Pods get scheduled in your Kubernetes cluster. This can be useful to ensure that the Qdrant databases will run on dedicated nodes. You can configure the necessary node selectors and tolerations in the "Kubernetes Configuration" section during cluster creation, or on the cluster detail page. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/operator-configuration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Authentication to your Qdrant Clusters -On this page: + -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/operator-configuration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +In Hybrid Cloud the authentication information is provided by Kubernetes secrets. 
-× +You can configure authentication for your Qdrant clusters in the "Configuration" section of the Qdrant Cluster detail page. There you can configure the Kubernetes secret name and key to be used as an API key and/or read-only API key. -[Powered by](https://qdrant.tech/) +![Hybrid Cloud API Key configuration](/documentation/cloud/hybrid_cloud_api_key.png) -<|page-64-lllmstxt|> -## fastembed-colbert -- [Documentation](https://qdrant.tech/documentation/) -- [Fastembed](https://qdrant.tech/documentation/fastembed/) -- Working with ColBERT +One way to create a secret is with kubectl: -# [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#how-to-generate-colbert-multivectors-with-fastembed) How to Generate ColBERT Multivectors with FastEmbed +```shell +kubectl create secret generic qdrant-api-key --from-literal=api-key=your-secret-api-key --namespace the-qdrant-namespace +``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#colbert) ColBERT +The resulting secret will look like this: -ColBERT is an embedding model that produces a matrix (multivector) representation of input text, -generating one vector per token (a token being a meaningful text unit for a machine learning model). -This approach allows ColBERT to capture more nuanced input semantics than many dense embedding models, -which represent an entire input with a single vector. By producing more granular input representations, -ColBERT becomes a strong retriever. However, this advantage comes at the cost of increased resource consumption compared to -traditional dense embedding models, both in terms of speed and memory. +```yaml +apiVersion: v1 +data: + api-key: ... +kind: Secret +metadata: + name: qdrant-api-key + namespace: the-qdrant-namespace +type: kubernetes.io/generic +``` -Despite ColBERT being a powerful retriever, its speed limitation might make it less suitable for large-scale retrieval. -Therefore, we generally recommend using ColBERT for reranking a small set of already retrieved examples, rather than for first-stage retrieval. -A simple dense retriever can initially retrieve around 100-500 candidates, which can then be reranked with ColBERT to bring the most relevant results -to the top. +With this command the secret name would be `qdrant-api-key` and the key would be `api-key`. -ColBERT is a considerable alternative of a reranking model to [cross-encoders](https://sbert.net/examples/applications/cross-encoder/README.html), since -it tends to be faster on inference time due to its `late interaction` mechanism. +If you want to retrieve the secret again, you can also use `kubectl`: -How does `late interaction` work? Cross-encoders ingest a query and a document glued together as one input. -A cross-encoder model divides this input into meaningful (for the model) parts and checks how these parts relate. -So, all interactions between the query and the document happen “early” inside the model. -Late interaction models, such as ColBERT, only do the first part, generating document and query parts suitable for comparison. -All interactions between these parts are expected to be done “later” outside the model. 
+```shell +kubectl get secret qdrant-api-key -o jsonpath="{.data.api-key}" --namespace the-qdrant-namespace | base64 --decode +``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#using-colbert-in-qdrant) Using ColBERT in Qdrant +#### Watch the Video -Qdrant supports [multivector representations](https://qdrant.tech/documentation/concepts/vectors/#multivectors) out of the box so that you can use any late interaction model as `ColBERT` or `ColPali` in Qdrant without any additional pre/post-processing. +In this tutorial, we walk you through the steps to expose your Qdrant database cluster running on Qdrant Hybrid Cloud to external applications or users outside your Kubernetes cluster. Learn how to configure TLS certificates for secure communication, set up authentication, and explore different methods like load balancers, ingress, and port configurations. -This tutorial uses ColBERT as a first-stage retriever on a toy dataset. -You can see how to use ColBERT as a reranker in our [multi-stage queries documentation](https://qdrant.tech/documentation/concepts/hybrid-queries/#multi-stage-queries). + -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#setup) Setup +### Exposing Qdrant clusters to your client applications -Install `fastembed`. +You can expose your Qdrant clusters to your client applications using Kubernetes services and ingresses. By default, a `ClusterIP` service is created for each Qdrant cluster. -```python -pip install fastembed +Within your Kubernetes cluster, you can access the Qdrant cluster using the service name and port: +``` +http://qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24.qdrant-namespace.svc:6333 ``` -Imports late interaction models for text embedding. +This endpoint is also visible on the cluster detail page. -```python -from fastembed import LateInteractionTextEmbedding +If you want to access the database from your local developer machine, you can use `kubectl port-forward` to forward the service port to your local machine: +``` +kubectl --namespace your-qdrant-namespace port-forward service/qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 6333:6333 ``` -You can list which late interaction models are supported in FastEmbed. - -```python -LateInteractionTextEmbedding.list_supported_models() +You can also expose the database outside the Kubernetes cluster with a `LoadBalancer` (if supported in your Kubernetes environment) or `NodePort` service or an ingress. -``` +The service type and necessary annotations can be configured in the "Kubernetes Configuration" section during cluster creation, or on the cluster detail page. -This command displays the available models. The output shows details about the model, including output embedding dimensions, model description, model size, model sources, and model file. 
+![Hybrid Cloud API Key configuration](/documentation/cloud/hybrid_cloud_service.png) -```python -[{'model': 'colbert-ir/colbertv2.0',\ - 'dim': 128,\ - 'description': 'Late interaction model',\ - 'size_in_GB': 0.44,\ - 'sources': {'hf': 'colbert-ir/colbertv2.0'},\ - 'model_file': 'model.onnx'},\ - {'model': 'answerdotai/answerai-colbert-small-v1',\ - 'dim': 96,\ - 'description': 'Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, 2024 year',\ - 'size_in_GB': 0.13,\ - 'sources': {'hf': 'answerdotai/answerai-colbert-small-v1'},\ - 'model_file': 'vespa_colbert.onnx'}] +Especially if you create a LoadBalancer Service, you may need to provide annotations for the loadbalancer configration. Please refer to the documention of your cloud provider for more details. -``` +Examples: -Now, load the model. +* [AWS EKS LoadBalancer annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/) +* [Azure AKS Public LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/load-balancer-standard) +* [Azure AKS Internal LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/internal-lb) +* [GCP GKE LoadBalancer annotations](https://cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer-parameters) -```python -model_name = "colbert-ir/colbertv2.0" -embedding_model = LateInteractionTextEmbedding(model_name) +You could also create a Loadbalancer service manually like this: +```yaml +apiVersion: v1 +kind: Service +metadata: + name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24-lb + namespace: qdrant-namespace +spec: + type: LoadBalancer + ports: + - name: http + port: 6333 + - name: grpc + port: 6334 + selector: + app: qdrant + cluster-id: 9a9f48c7-bb90-4fb2-816f-418a46a74b24 ``` -The model files will be fetched and downloaded, with progress showing. - -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#embed-data) Embed data +An ingress could look like this: -We will vectorize a toy movie description dataset with ColBERT: +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 + namespace: qdrant-namespace +spec: + rules: + - host: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24.your-domain.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 + port: + number: 6333 +``` -Movie description dataset - -```python -descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.",\ - "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.",\ - "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.",\ - "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. 
When the warlord later dies the thief is forced to take up arms in his place.",\ - "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.",\ - "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.",\ - "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.",\ - "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.",\ - "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.",\ - "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.",\ - "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).",\ - "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.",\ - "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.",\ - "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.",\ - "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.",\ - "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.",\ - "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.",\ - "Story of 40-man Turkish task force who must defend a relay station.",\ - "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.",\ - "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] +Please refer to the Kubernetes, ingress controller and cloud provider documentation for more details. -``` +If you expose the database like this, you will be able to see this also reflected as an endpoint on the cluster detail page. And will see the Qdrant database dashboard link pointing to it. -The vectorization is done with an `embed` generator function. + -```python -descriptions_embeddings = list( - embedding_model.embed(descriptions) -) +### Configuring TLS -``` +If you want to configure TLS for accessing your Qdrant database in Hybrid Cloud, there are two options: -Let’s check the size of one of the produced embeddings. +* You can offload TLS at the ingress or loadbalancer level. +* You can configure TLS directly in the Qdrant database. -```python -descriptions_embeddings[0].shape +If you want to offload TLS at the ingress or loadbancer level, please refer to their respective documents. 
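+As a rough illustration of the first option, the example ingress shown earlier could terminate TLS by referencing a certificate secret. The secret name below is a placeholder and the host is taken from the ingress example above; treat this as a sketch rather than a verified configuration:
+
+```yaml
+# Hypothetical sketch: a `tls` section added to the ingress spec shown above
+spec:
+  tls:
+    - hosts:
+        - qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24.your-domain.com
+      secretName: qdrant-ingress-tls  # placeholder kubernetes.io/tls secret in the same namespace
+```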
-``` +If you want to configure TLS directly in the Qdrant database, you can reference a secret containing the TLS certificate and key in the "Configuration" section of the Qdrant Cluster detail page. -We get the following result +![Hybrid Cloud API Key configuration](/documentation/cloud/hybrid_cloud_tls.png) -```bash -(48, 128) +To create such a secret, you can use `kubectl`: +```shell + kubectl create secret tls qdrant-tls --cert=mydomain.com.crt --key=mydomain.com.key --namespace the-qdrant-namespace ``` -That means that for the first description, we have **48** vectors of lengths **128** representing it. +The resulting secret will look like this: -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#upload-embeddings-to-qdrant) Upload embeddings to Qdrant +```yaml +apiVersion: v1 +data: + tls.crt: ... + tls.key: ... +kind: Secret +metadata: + name: qdrant-tls + namespace: the-qdrant-namespace +type: kubernetes.io/tls +``` -Install `qdrant-client` +With this command the secret name to enter into the UI would be `qdrant-tls` and the keys would be `tls.crt` and `tls.key`. -```python -pip install "qdrant-client>=1.14.2" +### Configuring CPU and memory resource reservations -``` +When creating a Qdrant database cluster, Qdrant Cloud schedules Pods with specific CPU and memory requests and limits to ensure optimal performance. It will use equal requests and limits for stability. Ideally, Kubernetes nodes should match the Pod size, with one database Pod per VM. -Qdrant Client has a simple in-memory mode that allows you to experiment locally on small data volumes. -Alternatively, you could use for experiments [a free cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) in Qdrant Cloud. +By default, Qdrant Cloud will reserve 20% of available CPU and memory on each Pod. This is done to leave room for the operating system, Kubernetes, and system components. This conservative default may need adjustment depending on node size, whereby smaller nodes might require more, and larger nodes less resources reserved. -```python -from qdrant_client import QdrantClient, models +You can modify this reservation in the “Configuration” section of the Qdrant Cluster detail page. -qdrant_client = QdrantClient(":memory:") # Qdrant is running from RAM. +If you want to check how much resources are availabe on an empty Kubernetes node, you can use the following command: +```shell +kubectl describe node ``` -Now, let’s create a small [collection](https://qdrant.tech/documentation/concepts/collections/) with our movie data. -For that, we will use the [multivectors](https://qdrant.tech/documentation/concepts/vectors/#multivectors) functionality supported in Qdrant. -To configure multivector collection, we need to specify: +This will give you a breakdown of the available resources to Kubernetes and how much is already reserved and used for system Pods. -- similarity metric between vectors; -- the size of each vector (for ColBERT, it’s **128**); -- similarity metric between multivectors (matrices), for example, `maximum`, so for vector from matrix A, we find the most similar vector from matrix B, and their similarity score will be out matrix similarity. 
+<|page-96-lllmstxt|> +![data-ingestion-beginners-7](/documentation/examples/data-ingestion-beginners/data-ingestion-7.png) -```python -qdrant_client.create_collection( - collection_name="movies", - vectors_config=models.VectorParams( - size=128, #size of each vector produced by ColBERT - distance=models.Distance.COSINE, #similarity metric between each vector - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM #similarity metric between multivectors (matrices) - ), - ), -) +# Send S3 Data to Qdrant Vector Store with LangChain + +| Time: 30 min | Level: Beginner | | | +| --- | ----------- | ----------- |----------- | -``` +**Data ingestion into a vector store** is essential for building effective search and retrieval algorithms, especially since nearly 80% of data is unstructured, lacking any predefined format. -To make this collection human-readable, let’s save movie metadata (name, description in text form and movie’s length) together with an embedded description. +In this tutorial, we’ll create a streamlined data ingestion pipeline, pulling data directly from **AWS S3** and feeding it into Qdrant. We’ll dive into vector embeddings, transforming unstructured data into a format that allows you to search documents semantically. Prepare to discover new ways to uncover insights hidden within unstructured data! -Movie metadata +## Ingestion Workflow Architecture -```python -metadata = [{"movie_name": "The Passion of Joan of Arc", "movie_watch_time_min": 114, "movie_description": "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions."},\ -{"movie_name": "Sherlock Jr.", "movie_watch_time_min": 45, "movie_description": "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch."},\ -{"movie_name": "Heat", "movie_watch_time_min": 170, "movie_description": "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist."},\ -{"movie_name": "Kagemusha", "movie_watch_time_min": 162, "movie_description": "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place."},\ -{"movie_name": "Kubo and the Two Strings", "movie_watch_time_min": 101, "movie_description": "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past."},\ -{"movie_name": "Sardar Udham", "movie_watch_time_min": 164, "movie_description": "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre."},\ -{"movie_name": "Paprika", "movie_watch_time_min": 90, "movie_description": "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. 
Only a young female therapist, Paprika, can stop it."},\ -{"movie_name": "After Hours", "movie_watch_time_min": 97, "movie_description": "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop."},\ -{"movie_name": "Udta Punjab", "movie_watch_time_min": 148, "movie_description": "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline."},\ -{"movie_name": "Philomena", "movie_watch_time_min": 98, "movie_description": "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent."},\ -{"movie_name": "Neon Genesis Evangelion: The End of Evangelion", "movie_watch_time_min": 87, "movie_description": "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995)."},\ -{"movie_name": "The Dirty Dozen", "movie_watch_time_min": 150, "movie_description": "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers."},\ -{"movie_name": "Toy Story 3", "movie_watch_time_min": 103, "movie_description": "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home."},\ -{"movie_name": "Edge of Tomorrow", "movie_watch_time_min": 113, "movie_description": "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies."},\ -{"movie_name": "Some Like It Hot", "movie_watch_time_min": 121, "movie_description": "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in."},\ -{"movie_name": "Snow White and the Seven Dwarfs", "movie_watch_time_min": 83, "movie_description": "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household."},\ -{"movie_name": "It Happened One Night", "movie_watch_time_min": 105, "movie_description": "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops."},\ -{"movie_name": "Nefes: Vatan Sagolsun", "movie_watch_time_min": 128, "movie_description": "Story of 40-man Turkish task force who must defend a relay station."},\ -{"movie_name": "This Is Spinal Tap", "movie_watch_time_min": 82, "movie_description": "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour."},\ -{"movie_name": "Let the Right One In", "movie_watch_time_min": 114, "movie_description": "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."}] +We’ll set up a powerful document ingestion and analysis pipeline in this workflow using cloud storage, natural language processing (NLP) tools, and embedding technologies. Starting with raw data in an S3 bucket, we'll preprocess it with LangChain, apply embedding APIs for both text and images and store the results in Qdrant – a vector database optimized for similarity search. 
-``` +**Figure 1: Data Ingestion Workflow Architecture** -```python -qdrant_client.upload_points( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload=metadata[idx],\ - vector=vector\ - )\ - for idx, vector in enumerate(descriptions_embeddings)\ - ], -) +![data-ingestion-beginners-5](/documentation/examples/data-ingestion-beginners/data-ingestion-5.png) -``` +Let's break down each component of this workflow: -Upload with implicit embeddings computation +- **S3 Bucket:** This is our starting point—a centralized, scalable storage solution for various file types like PDFs, images, and text. +- **LangChain:** Acting as the pipeline’s orchestrator, LangChain handles extraction, preprocessing, and manages data flow for embedding generation. It simplifies processing PDFs, so you won’t need to worry about applying OCR (Optical Character Recognition) here. +- **Qdrant:** As your vector database, Qdrant stores embeddings and their [payloads](https://qdrant.tech/documentation/concepts/payload/), enabling efficient similarity search and retrieval across all content types. -```python -description_documents = [models.Document(text=description, model=model_name) for description in descriptions] -qdrant_client.upload_points( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload=metadata[idx],\ - vector=description_document\ - )\ - for idx, description_document in enumerate(description_documents)\ - ], -) +## Prerequisites -``` +![data-ingestion-beginners-11](/documentation/examples/data-ingestion-beginners/data-ingestion-11.png) -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-colbert/\#querying) Querying +In this section, you’ll get a step-by-step guide on ingesting data from an S3 bucket. But before we dive in, let’s make sure you’re set up with all the prerequisites: -ColBERT uses two distinct methods for embedding documents and queries, as do we in Fastembed. However, we altered query pre-processing used in ColBERT, so we don’t have to cut all queries after 32-token length but ingest longer queries directly. +| | | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| Sample Data | We’ll use a sample dataset, where each folder includes product reviews in text format along with corresponding images. | +| AWS Account | An active [AWS account](https://aws.amazon.com/free/) with access to S3 services. | +| Qdrant Cloud | A [Qdrant Cloud account](https://cloud.qdrant.io) with access to the WebUI for managing collections and running queries. | +| LangChain | You will use this [popular framework](https://www.langchain.com) to tie everything together. | -```python -qdrant_client.query_points( - collection_name="movies", - query=list(embedding_model.query_embed("A movie for kids with fantasy elements and wonders"))[0], #converting generator object into numpy.ndarray - limit=1, #How many closest to the query movies we would like to get - #with_vectors=True, #If this option is used, vectors will also be returned - with_payload=True #So metadata is provided in the output -) -``` +#### Supported Document Types -Query points with implicit embeddings computation +The documents used for ingestion can be of various types, such as PDFs, text files, or images. We will organize a structured S3 bucket with folders with the supported document types for testing and experimentation. 
-```python -query_document = models.Document(text="A movie for kids with fantasy elements and wonders", model=model_name) -qdrant_client.query_points( - collection_name="movies", - query=query_document, - limit=1, -) +#### Python Environment + +Ensure you have a Python environment (Python 3.9 or higher) with these libraries installed: +```python +boto3 +langchain-community +langchain +python-dotenv +unstructured +unstructured[pdf] +qdrant_client +fastembed ``` -The result is the following: +--- -```bash -QueryResponse(points=[ScoredPoint(id=4, version=0, score=12.063469,\ -payload={'movie_name': 'Kubo and the Two Strings', 'movie_watch_time_min': 101,\ -'movie_description': 'A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.'},\ -vector=None, shard_key=None, order_value=None)]) +**Access Keys:** Store your AWS access key, S3 secret key, and Qdrant API key in a .env file for easy access. Here’s a sample `.env` file. +```text +ACCESS_KEY = "" +SECRET_ACCESS_KEY = "" +QDRANT_KEY = "" ``` -##### Was this page useful? +--- -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + -Thank you for your feedback! 🙏 +## Step 1: Ingesting Data from S3 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-colbert.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +![data-ingestion-beginners-9.png](/documentation/examples/data-ingestion-beginners/data-ingestion-9.png) -On this page: +The LangChain framework makes it easy to ingest data from storage services like AWS S3, with built-in support for loading documents in formats such as PDFs, images, and text files. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-colbert.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +To connect LangChain with S3, you’ll use the `S3DirectoryLoader`, which lets you load files directly from an S3 bucket into LangChain’s pipeline. -× +### Example: Configuring LangChain to Load Files from S3 -[Powered by](https://qdrant.tech/) +Here’s how to set up LangChain to ingest data from an S3 bucket: -<|page-65-lllmstxt|> -## cross-encoder-integration-gsoc -- [Articles](https://qdrant.tech/articles/) -- Qdrant Summer of Code 2024 - ONNX Cross Encoders in Python +```python +from langchain_community.document_loaders import S3DirectoryLoader -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +# Initialize the S3 document loader +loader = S3DirectoryLoader( + "product-dataset", # S3 bucket name + "p_1", #S3 Folder name containing the data for the first product + aws_access_key_id=aws_access_key_id, # AWS Access Key + aws_secret_access_key=aws_secret_access_key # AWS Secret Access Key +) -# Qdrant Summer of Code 2024 - ONNX Cross Encoders in Python +# Load documents from the specified S3 bucket +docs = loader.load() +``` -Huong (Celine) Hoang +--- -· +## Step 2. Turning Documents into Embeddings -October 14, 2024 +[Embeddings](/articles/what-are-embeddings/) are the secret sauce here—they’re numerical representations of data (like text, images, or audio) that capture the “meaning” in a form that’s easy to compare. 
By converting text and images into embeddings, you’ll be able to perform similarity searches quickly and efficiently. Think of embeddings as the bridge to storing and retrieving meaningful insights from your data in Qdrant. -![Qdrant Summer of Code 2024 - ONNX Cross Encoders in Python](https://qdrant.tech/articles_data/cross-encoder-integration-gsoc/preview/title.jpg) +### Models We’ll Use for Generating Embeddings -## [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#introduction) Introduction +To get things rolling, we’ll use two powerful models: -Hi everyone! I’m Huong (Celine) Hoang, and I’m thrilled to share my experience working at Qdrant this summer as part of their Summer of Code 2024 program. During my internship, I worked on integrating cross-encoders into the FastEmbed library for re-ranking tasks. This enhancement widened the capabilities of the Qdrant ecosystem, enabling developers to build more context-aware search applications, such as question-answering systems, using Qdrant’s suite of libraries. +1. **`sentence-transformers/all-MiniLM-L6-v2` Embeddings** for transforming text data. +2. **`CLIP` (Contrastive Language-Image Pretraining)** for image data. -This project was both technically challenging and rewarding, pushing me to grow my skills in handling large-scale ONNX (Open Neural Network Exchange) model integrations, tokenization, and more. Let me take you through the journey, the lessons learned, and where things are headed next. +--- -## [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#project-overview) Project Overview +### Document Processing Function -Qdrant is well known for its vector search capabilities, but my task was to go one step further — introducing cross-encoders for re-ranking. Traditionally, the FastEmbed library would generate embeddings, but cross-encoders don’t do that. Instead, they provide a list of scores based on how well a query matches a list of documents. This kind of re-ranking is critical when you want to refine search results and bring the most relevant answers to the top. +![data-ingestion-beginners-8.png](/documentation/examples/data-ingestion-beginners/data-ingestion-8.png) -The project revolved around creating a new input-output scheme: text data to scores. For this, I designed a family of classes to support ONNX models. Some of the key models I worked with included Xenova/ms-marco-MiniLM-L-6-v2, Xenova/ms-marco-MiniLM-L-12-v2, and BAAI/bge-reranker, all designed for re-ranking tasks. +Next, we’ll define two functions — `process_text` and `process_image` to handle different file types in our document pipeline. The `process_text` function extracts and returns the raw content from a text-based document, while `process_image` retrieves an image from an S3 source and loads it into memory. -An important point to mention is that FastEmbed is a minimalistic library: it doesn’t have heavy dependencies like PyTorch or TensorFlow, and as a result, it is lightweight, occupying far less storage space. 
+```python +from PIL import Image -Below is a diagram that represents the overall workflow for this project, detailing the key steps from user interaction to the final output validation: +def process_text(doc): + source = doc.metadata['source'] # Extract document source (e.g., S3 URL) -![Search workflow with reranking](https://qdrant.tech/articles_data/cross-encoder-integration-gsoc/rerank-workflow.png) + text = doc.page_content # Extract the content from the text file + print(f"Processing text from {source}") + return source, text -Search workflow with reranking +def process_image(doc): + source = doc.metadata['source'] # Extract document source (e.g., S3 URL) + print(f"Processing image from {source}") -## [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#technical-challenges) Technical Challenges + bucket_name, object_key = parse_s3_url(source) # Parse the S3 URL + response = s3.get_object(Bucket=bucket_name, Key=object_key) # Fetch image from S3 + img_bytes = response['Body'].read() -### [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#1-building-a-new-input-output-scheme) 1\. Building a New Input-Output Scheme + img = Image.open(io.BytesIO(img_bytes)) + return source, img +``` -FastEmbed already had support for embeddings, but re-ranking with cross-encoders meant building a completely new family of classes. These models accept a query and a set of documents, then return a list of relevance scores. For that, I created the base classes like `TextCrossEncoderBase` and `OnnxCrossEncoder`, taking inspiration from existing text embedding models. +### Helper Functions for Document Processing -One thing I had to ensure was that the new class hierarchy was user-friendly. Users should be able to work with cross-encoders without needing to know the complexities of the underlying models. For instance, they should be able to just write: +To retrieve images from S3, a helper function `parse_s3_url` breaks down the S3 URL into its bucket and critical components. This is essential for fetching the image from S3 storage. ```python -from fastembed.rerank.cross_encoder import TextCrossEncoder - -encoder = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-6-v2") -scores = encoder.rerank(query, documents) - +def parse_s3_url(s3_url): + parts = s3_url.replace("s3://", "").split("/", 1) + bucket_name = parts[0] + object_key = parts[1] + return bucket_name, object_key ``` -Meanwhile, behind the scenes, we manage all the model loading, tokenization, and scoring. - -### [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#2-handling-tokenization-for-cross-encoders) 2\. Handling Tokenization for Cross-Encoders +--- -Cross-encoders require careful tokenization because they need to distinguish between the query and the documents. This is done using token type IDs, which help the model differentiate between the two. To implement this, I configured the tokenizer to handle pairs of inputs—concatenating the query with each document and assigning token types accordingly. +## Step 3: Loading Embeddings into Qdrant -Efficient tokenization is critical to ensure the performance of the models, and I optimized it specifically for ONNX models. +![data-ingestion-beginners-10](/documentation/examples/data-ingestion-beginners/data-ingestion-10.png) -### [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#3-model-loading-and-integration) 3\. 
Model Loading and Integration +Now that your documents have been processed and converted into embeddings, the next step is to load these embeddings into Qdrant. -One of the most rewarding parts of the project was integrating the ONNX models into the FastEmbed library. ONNX models need to be loaded into a runtime environment that efficiently manages the computations. +### Creating a Collection in Qdrant -While PyTorch is a common framework for these types of tasks, FastEmbed exclusively supports ONNX models, making it both lightweight and efficient. I focused on extensive testing to ensure that the ONNX models performed equivalently to their PyTorch counterparts, ensuring users could trust the results. +In Qdrant, data is organized in collections, each representing a set of embeddings (or points) and their associated metadata (payload). To store the embeddings generated earlier, you’ll first need to create a collection. -I added support for batching as well, allowing users to re-rank large sets of documents without compromising speed. +Here’s how to create a collection in Qdrant to store both text and image embeddings: -### [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#4-debugging-and-code-reviews) 4\. Debugging and Code Reviews +```python +def create_collection(collection_name): + qdrant_client.create_collection( + collection_name, + vectors_config={ + "text_embedding": models.VectorParams( + size=384, # Dimension of text embeddings + distance=models.Distance.COSINE, # Cosine similarity is used for comparison + ), + "image_embedding": models.VectorParams( + size=512, # Dimension of image embeddings + distance=models.Distance.COSINE, # Cosine similarity is used for comparison + ), + }, + ) -During the project, I encountered a number of challenges, including issues with model configurations, tokenizers, and test cases. With the help of my mentor, George Panchuk, I was able to resolve these issues and improve my understanding of best practices, particularly around code readability, maintainability, and style. +create_collection("products-data") +``` -One notable lesson was the importance of keeping the code organized and maintainable, with a strong focus on readability. This included properly structuring modules and ensuring the entire codebase followed a clear, consistent style. +--- -### [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#5-testing-and-validation) 5\. Testing and Validation +This function creates a collection for storing text (384 dimensions) and image (512 dimensions) embeddings, using cosine similarity to compare embeddings within the collection. -To ensure the accuracy and performance of the models, I conducted extensive testing. I compared the output of ONNX models with their PyTorch counterparts, ensuring the conversion to ONNX was correct. A key part of this process was rigorous testing to verify the outputs and identify potential issues, such as incorrect conversions or bugs in our implementation. +Once the collection is set up, you can load the embeddings into Qdrant. This involves inserting (or updating) the embeddings and their associated metadata (payload) into the specified collection. 
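Before running the ingestion code below, it can be worth confirming that the collection from the previous step actually exists, otherwise the upsert will fail with a not-found error. This is a small sketch, assuming your cluster URL and API key are available as `QDRANT_URL` and `QDRANT_KEY` environment variables (`QDRANT_URL` is not part of the sample `.env` shown earlier, so adjust it to however you store your endpoint).

```python
import os

from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),    # assumed variable name for your cluster endpoint
    api_key=os.getenv("QDRANT_KEY"),
)

# Fail early if the collection from the previous step was not created
if not qdrant_client.collection_exists("products-data"):
    raise RuntimeError("Create the 'products-data' collection before ingesting data")

info = qdrant_client.get_collection("products-data")
print(info.status, info.points_count)  # e.g. green, 0 before the first upsert
```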
-For instance, a test to validate the model’s output was structured as follows: +Here’s the code for loading embeddings into Qdrant: ```python -def test_rerank(): - is_ci = os.getenv("CI") +def ingest_data(points): + operation_info = qdrant_client.upsert( + collection_name="products-data", # Collection where data is being inserted + points=points + ) + return operation_info +``` - for model_desc in TextCrossEncoder.list_supported_models(): - if not is_ci and model_desc["size_in_GB"] > 1: - continue +--- - model_name = model_desc["model"] - model = TextCrossEncoder(model_name=model_name) +**Explanation of Ingestion** - query = "What is the capital of France?" - documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] - scores = np.array(model.rerank(query, documents)) +1. **Upserting the Data Point:** The upsert method on the `qdrant_client` inserts each PointStruct into the specified collection. If a point with the same ID already exists, it will be updated with the new values. +2. **Operation Info:** The function returns `operation_info`, which contains details about the upsert operation, such as success status or any potential errors. - canonical_scores = CANONICAL_SCORE_VALUES[model_name] - assert np.allclose( - scores, canonical_scores, atol=1e-3 - ), f"Model: {model_name}, Scores: {scores}, Expected: {canonical_scores}" +**Running the Ingestion Code** -``` +Here’s how to call the function and ingest data: -The `CANONICAL_SCORE_VALUES` were retrieved directly from the result of applying the original PyTorch models to the same input +```python +from qdrant_client import models -## [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#outcomes-and-future-improvements) Outcomes and Future Improvements +if __name__ == "__main__": + collection_name = "products-data" + create_collection(collection_name) + for i in range(1,6): # Five documents + folder = f"p_{i}" + loader = S3DirectoryLoader( + "product-dataset", + folder, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key + ) + docs = loader.load() + points, text_review, product_image = [], "", "" + for idx, doc in enumerate(docs): + source = doc.metadata['source'] + if source.endswith(".txt") or source.endswith(".pdf"): + _text_review_source, text_review = process_text(doc) + elif source.endswith(".png"): + product_image_source, product_image = process_image(doc) + if text_review: + point = models.PointStruct( + id=idx, # Unique identifier for each point + vector={ + "text_embedding": models.Document( + text=text_review, model="sentence-transformers/all-MiniLM-L6-v2" + ), + "image_embedding": models.Image( + image=product_image, model="Qdrant/clip-ViT-B-32-vision" + ), + }, + payload={"review": text_review, "product_image": product_image_source}, + ) + points.append(point) + operation_info = ingest_data(points) + print(operation_info) +``` -By the end of my project, I successfully added cross-encoders to the FastEmbed library, allowing users to re-rank search results based on relevance scores. This enhancement opens up new possibilities for applications that rely on contextual ranking, such as search engines and recommendation systems. -This functionality will be available as of FastEmbed `0.4.0`. +The `PointStruct` is instantiated with these key parameters: -Some areas for future improvements include: +- **id:** A unique identifier for each embedding, typically an incremental index. 
-- Expanding Model Support: We could add more cross-encoder models, especially from the sentence transformers library, to give users more options. -- Parallelization: Optimizing batch processing to handle even larger datasets could further improve performance. -- Custom Tokenization: For models with non-standard tokenization, like BAAI/bge-reranker, more specific tokenizer configurations could be added. +- **vector:** A dictionary holding the text and image inputs to be embedded. `qdrant-client` uses [FastEmbed](https://github.com/qdrant/fastembed) under the hood to automatically generate vector representations from these inputs locally. -## [Anchor](https://qdrant.tech/articles/cross-encoder-integration-gsoc/\#overall-experience-and-wrapping-up) Overall Experience and Wrapping Up +- **payload:** A dictionary storing additional metadata, like product reviews and image references, which is invaluable for retrieval and context during searches. -Looking back, this internship has been an incredibly valuable experience. I’ve grown not only as a developer but also as someone who can take on complex projects and see them through from start to finish. The Qdrant team has been so supportive, especially during the debugging and review stages. I’ve learned so much about model integration, ONNX, and how to build tools that are user-friendly and scalable. +The code dynamically loads folders from an S3 bucket, processes text and image files separately, and stores their embeddings and associated data in dedicated lists. It then creates a `PointStruct` for each data entry and calls the ingestion function to load it into Qdrant. -One key takeaway for me is the importance of understanding the user experience. It’s not just about getting the models to work but making sure they are easy to use and integrate into real-world applications. This experience has solidified my passion for building solutions that truly make an impact, and I’m excited to continue working on projects like this in the future. +### Exploring the Qdrant WebUI Dashboard -Thank you for taking the time to read about my journey with Qdrant and the FastEmbed library. I’m excited to see how this work will continue to improve search experiences for users! +Once the embeddings are loaded into Qdrant, you can use the WebUI dashboard to visualize and manage your collections. The dashboard provides a clear, structured interface for viewing collections and their data. Let’s take a closer look in the next section. -##### Was this page useful? +## Step 4: Visualizing Data in Qdrant WebUI -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +To start visualizing your data in the Qdrant WebUI, head to the **Overview** section and select **Access the database**. -Thank you for your feedback! 🙏 +**Figure 2: Accessing the Database from the Qdrant UI** +![data-ingestion-beginners-2.png](/documentation/examples/data-ingestion-beginners/data-ingestion-2.png) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/cross-encoder-integration-gsoc.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +When prompted, enter your API key. Once inside, you’ll be able to view your collections and the corresponding data points. 
You should see your collection displayed like this: -On this page: +**Figure 3: The product-data Collection in Qdrant** +![data-ingestion-beginners-4.png](/documentation/examples/data-ingestion-beginners/data-ingestion-4.png) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/cross-encoder-integration-gsoc.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Here’s a look at the most recent point ingested into Qdrant: -× +**Figure 4: The Latest Point Added to the product-data Collection** +![data-ingestion-beginners-6.png](/documentation/examples/data-ingestion-beginners/data-ingestion-6.png) -[Powered by](https://qdrant.tech/) +The Qdrant WebUI’s search functionality allows you to perform vector searches across your collections. With options to apply filters and parameters, retrieving relevant embeddings and exploring relationships within your data becomes easy. To start, head over to the **Console** in the left panel, where you can create queries: -<|page-66-lllmstxt|> -## qdrant-0-10-release -- [Articles](https://qdrant.tech/articles/) -- Qdrant 0.10 released +**Figure 5: Overview of Console in Qdrant** +![data-ingestion-beginners-1.png](/documentation/examples/data-ingestion-beginners/data-ingestion-1.png) -[Back to Qdrant Articles](https://qdrant.tech/articles/) +The first query retrieves all collections, the second fetches points from the product-data collection, and the third performs a sample query. This demonstrates how straightforward it is to interact with your data in the Qdrant UI. -# Qdrant 0.10 released +Now, let’s retrieve some documents from the database using a query!. -Kacper Ɓukawski +**Figure 6: Querying the Qdrant Client to Retrieve Relevant Documents** +![data-ingestion-beginners-3.png](/documentation/examples/data-ingestion-beginners/data-ingestion-3.png) -· +In this example, we queried **Phones with improved design**. Then, we converted the text to vectors using OpenAI and retrieved a relevant phone review highlighting design improvements. -September 19, 2022 +## Conclusion -![Qdrant 0.10 released](https://qdrant.tech/articles_data/qdrant-0-10-release/preview/title.jpg) +In this guide, we set up an S3 bucket, ingested various data types, and stored embeddings in Qdrant. Using LangChain, we dynamically processed text and image files, making it easy to work with each file type. -[Qdrant 0.10 is a new version](https://github.com/qdrant/qdrant/releases/tag/v0.10.0) that brings a lot of performance -improvements, but also some new features which were heavily requested by our users. Here is an overview of what has changed. +Now, it’s your turn. Try experimenting with different data types, such as videos, and explore Qdrant’s advanced features to enhance your applications. To get started, [sign up](https://cloud.qdrant.io/signup) for Qdrant today. -## [Anchor](https://qdrant.tech/articles/qdrant-0-10-release/\#storing-multiple-vectors-per-object) Storing multiple vectors per object +![data-ingestion-beginners-12](/documentation/examples/data-ingestion-beginners/data-ingestion-12.png) -Previously, if you wanted to use semantic search with multiple vectors per object, you had to create separate collections -for each vector type. This was even if the vectors shared some other attributes in the payload. With Qdrant 0.10, you can -now store all of these vectors together in the same collection, which allows you to share a single copy of the payload. 
-This makes it easier to use semantic search with multiple vector types, and reduces the amount of work you need to do to -set up your collections. +<|page-97-lllmstxt|> +# Frequently Asked Questions: Database Optimization -## [Anchor](https://qdrant.tech/articles/qdrant-0-10-release/\#batch-vector-search) Batch vector search +### How do I reduce memory usage? -Previously, you had to send multiple requests to the Qdrant API to perform multiple non-related tasks. However, this -can cause significant network overhead and slow down the process, especially if you have a poor connection speed. -Fortunately, the [new batch search feature](https://qdrant.tech/documentation/concepts/search/#batch-search-api) allows -you to avoid this issue. With just one API call, Qdrant will handle multiple search requests in the most efficient way -possible. This means that you can perform multiple tasks simultaneously without having to worry about network overhead -or slow performance. +The primary source of memory usage is vector data. There are several ways to address that: -## [Anchor](https://qdrant.tech/articles/qdrant-0-10-release/\#built-in-arm-support) Built-in ARM support +- Configure [Quantization](/documentation/guides/quantization/) to reduce the memory usage of vectors. +- Configure on-disk vector storage -To make our application accessible to ARM users, we have compiled it specifically for that platform. If it is not -compiled for ARM, the device will have to emulate it, which can slow down performance. To ensure the best possible -experience for ARM users, we have created Docker images specifically for that platform. Keep in mind that using -a limited set of processor instructions may affect the performance of your vector search. Therefore, we have tested -both ARM and non-ARM architectures using similar setups to understand the potential impact on performance. +The choice of the approach depends on your requirements. +Read more about [configuring the optimal](/documentation/tutorials/optimize/) use of Qdrant. -## [Anchor](https://qdrant.tech/articles/qdrant-0-10-release/\#full-text-filtering) Full-text filtering +### How do you choose the machine configuration? -Qdrant is a vector database that allows you to quickly search for the nearest neighbors. However, you may need to apply -additional filters on top of the semantic search. Up until version 0.10, Qdrant only supported keyword filters. With the -release of Qdrant 0.10, [you can now use full-text filters](https://qdrant.tech/documentation/concepts/filtering/#full-text-match) -as well. This new filter type can be used on its own or in combination with other filter types to provide even more -flexibility in your searches. +There are two main scenarios of Qdrant usage in terms of resource consumption: -##### Was this page useful? +- **Performance-optimized** -- when you need to serve vector search as fast (many) as possible. In this case, you need to have as much vector data in RAM as possible. Use our [calculator](https://cloud.qdrant.io/calculator) to estimate the required RAM. +- **Storage-optimized** -- when you need to store many vectors and minimize costs by compromising some search speed. In this case, pay attention to the disk speed instead. More about it in the article about [Memory Consumption](/articles/memory-consumption/). 
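As a concrete illustration of the memory-saving options above (quantization and on-disk vector storage), and of the storage-optimized scenario just described, here is a minimal sketch of a collection configured to keep only a compressed copy of the vectors in RAM. The collection name, URL, and vector size are placeholders; pick the size that matches your embedding model.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # adjust to your deployment

client.create_collection(
    collection_name="storage_optimized",   # placeholder name
    vectors_config=models.VectorParams(
        size=768,                           # placeholder, match your model
        distance=models.Distance.COSINE,
        on_disk=True,                       # keep original vectors on disk
    ),
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,    # 4x smaller than float32 in RAM
            always_ram=True,                # only the compressed copy stays in RAM
        )
    ),
)
```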
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### I configured on-disk vector storage, but memory usage is still high. Why? -Thank you for your feedback! 🙏 +Firstly, memory usage metrics as reported by `top` or `htop` may be misleading. They are not showing the minimal amount of memory required to run the service. +If the RSS memory usage is 10 GB, it doesn't mean that it won't work on a machine with 8 GB of RAM. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-0-10-release.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Qdrant uses many techniques to reduce search latency, including caching disk data in RAM and preloading data from disk to RAM. +As a result, the Qdrant process might use more memory than the minimum required to run the service. -On this page: +> Unused RAM is wasted RAM -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-0-10-release.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +If you want to limit the memory usage of the service, we recommend using [limits in Docker](https://docs.docker.com/config/containers/resource_constraints/#memory) or Kubernetes. -× +### My requests are very slow or time out. What should I do? -[Powered by](https://qdrant.tech/) +There are several possible reasons for that: -<|page-67-lllmstxt|> -## installation -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Installation +- **Using filters without payload index** -- If you're performing a search with a filter but you don't have a payload index, Qdrant will have to load whole payload data from disk to check the filtering condition. Ensure you have adequately configured [payload indexes](/documentation/concepts/indexing/#payload-index). +- **Usage of on-disk vector storage with slow disks** -- If you're using on-disk vector storage, ensure you have fast enough disks. We recommend using local SSDs with at least 50k IOPS. Read more about the influence of the disk speed on the search latency in the article about [Memory Consumption](/articles/memory-consumption/). +- **Large limit or non-optimal query parameters** -- A large limit or offset might lead to significant performance degradation. Please pay close attention to the query/collection parameters that significantly diverge from the defaults. They might be the reason for the performance issues. -# [Anchor](https://qdrant.tech/documentation/guides/installation/\#installation-requirements) Installation requirements +<|page-98-lllmstxt|> +# How to Effectively Use Multivector Representations in Qdrant for Reranking +Multivector Representations are one of the most powerful features of Qdrant. However, most people don't use them effectively, resulting in massive RAM overhead, slow inserts, and wasted compute. -The following sections describe the requirements for deploying Qdrant. +In this tutorial, you'll discover how to effectively use multivector representations in Qdrant. -## [Anchor](https://qdrant.tech/documentation/guides/installation/\#cpu-and-memory) CPU and memory +## What are Multivector Representations? 
+In most vector engines, each document is represented by a single vector - an approach that works well for short texts but often struggles with longer documents. Single vector representations perform pooling of the token-level embeddings, which obviously leads to losing some information. -The preferred size of your CPU and RAM depends on: +Multivector representations offer a more fine-grained alternative where a single document is represented using multiple vectors, often at the token or phrase level. This enables more precise matching between specific query terms and relevant parts of the document. Matching is especially effective in Late Interaction models like [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/), which retain token-level embeddings and perform interaction during query time leading to relevance scoring. -- Number of vectors -- Vector dimensions -- [Payloads](https://qdrant.tech/documentation/concepts/payload/) and their indexes -- Storage -- Replication -- How you configure quantization +![Multivector Representations](/documentation/advanced-tutorials/multivectors.png) -Our [Cloud Pricing Calculator](https://cloud.qdrant.io/calculator) can help you estimate required resources without payload or index data. +As you will see later in the tutorial, Qdrant supports multivectors and thus late interaction models natively. -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#supported-cpu-architectures) Supported CPU architectures: +## Why Token-level Vectors are Useful -**64-bit system:** +With token-level vectors, models like ColBERT can match specific query tokens to the most relevant parts of a document, enabling high-accuracy retrieval through Late Interaction. -- x86\_64/amd64 -- AArch64/arm64 +In late interaction, each document is converted into multiple token-level vectors instead of a single vector. The query is also tokenized and embedded into various vectors. Then, the query and document vectors are matched using a similarity function: MaxSim. You can see how it is calculated [here](https://qdrant.tech/documentation/concepts/vectors/#multivectors). -**32-bit system:** +In traditional retrieval, the query and document are converted into single embeddings, after which similarity is computed. This is an early interaction because the information is compressed before retrieval. -- Not supported +## What is Rescoring, and Why is it Used? +Rescoring is two-fold: +- Retrieve relevant documents using a fast model. +- Rerank them using a more accurate but slower model such as ColBERT. -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#storage) Storage +## Why Indexing Every Vector by Default is a Problem +In multivector representations (such as those used by Late Interaction models like ColBERT), a single logical document results in hundreds of token-level vectors. Indexing each of these vectors individually with HNSW in Qdrant can lead to: -For persistent storage, Qdrant requires block-level access to storage devices with a [POSIX-compatible file system](https://www.quobyte.com/storage-explained/posix-filesystem/). Network systems such as [iSCSI](https://en.wikipedia.org/wiki/ISCSI) that provide block-level access are also acceptable. -Qdrant won’t work with [Network file systems](https://en.wikipedia.org/wiki/File_system#Network_file_systems) such as NFS, or [Object storage](https://en.wikipedia.org/wiki/Object_storage) systems such as S3. 
+- High RAM usage +- Slow insert times due to the complexity of maintaining the HNSW graph -If you offload vectors to a local disk, we recommend you use a solid-state (SSD or NVMe) drive. +However, because multivectors are typically used in the reranking stage (after a first-pass retrieval using dense vectors), there's often no need to index these token-level vectors with HNSW. -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#networking) Networking +Instead, they can be stored as multi-vector fields (without HNSW indexing) and used at query-time for reranking, which reduces resource overhead and improves performance. -Each Qdrant instance requires three open ports: +For more on this, check out Qdrant's detailed breakdown in our [Scaling PDF Retrieval with Qdrant tutorial](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/#math-behind-the-scaling). -- `6333` \- For the HTTP API, for the [Monitoring](https://qdrant.tech/documentation/guides/monitoring/) health and metrics endpoints -- `6334` \- For the [gRPC](https://qdrant.tech/documentation/interfaces/#grpc-interface) API -- `6335` \- For [Distributed deployment](https://qdrant.tech/documentation/guides/distributed_deployment/) +With Qdrant, you have full control of how indexing works. You can disable indexing by setting the HNSW `m` parameter to `0`: +```python +from qdrant_client import QdrantClient, models -All Qdrant instances in a cluster must be able to: +client = QdrantClient("http://localhost:6333") +collection_name = "dense_multivector_demo" +client.create_collection( + collection_name=collection_name, + vectors_config={ + "dense": models.VectorParams( + size=384, + distance=models.Distance.COSINE + # Leave HNSW indexing ON for dense + ), + "colbert": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking + ) + } +) +``` +By disabling HNSW on multivectors, you: +- Save compute. +- Reduce memory usage. +- Speed up vector uploads. -- Communicate with each other over these ports -- Allow incoming connections to ports `6333` and `6334` from clients that use Qdrant. +## How to Generate Multivectors Using FastEmbed +Let's demonstrate how to effectively use multivectors using [FastEmbed](https://github.com/qdrant/fastembed), which wraps ColBERT into a simple API. -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#security) Security +Install FastEmbed and Qdrant: -The default configuration of Qdrant might not be secure enough for every situation. Please see [our security documentation](https://qdrant.tech/documentation/guides/security/) for more information. +```bash +pip install qdrant-client[fastembed]>=1.14.2 +``` -## [Anchor](https://qdrant.tech/documentation/guides/installation/\#installation-options) Installation options +## Step-by-Step: ColBERT + Qdrant Setup +Ensure that Qdrant is running and create a client: +```python +from qdrant_client import QdrantClient, models -Qdrant can be installed in different ways depending on your needs: +# 1. Connect to Qdrant server +client = QdrantClient("http://localhost:6333") +``` +## 1. 
Encode Documents +Next, encode your documents: +```python +from fastembed import TextEmbedding, LateInteractionTextEmbedding +# Example documents and query +documents = [ + "Artificial intelligence is used in hospitals for cancer diagnosis and treatment.", + "Self-driving cars use AI to detect obstacles and make driving decisions.", + "AI is transforming customer service through chatbots and automation.", + # ... +] +query_text = "How does AI help in medicine?" -For production, you can use our Qdrant Cloud to run Qdrant either fully managed in our infrastructure or with Hybrid Cloud in yours. +dense_documents = [ + models.Document(text=doc, model="BAAI/bge-small-en") + for doc in documents +] +dense_query = models.Document(text=query_text, model="BAAI/bge-small-en") -If you want to run Qdrant in your own infrastructure, without any cloud connection, we recommend to install Qdrant in a Kubernetes cluster with our Qdrant Private Cloud Enterprise Operator. +colbert_documents = [ + models.Document(text=doc, model="colbert-ir/colbertv2.0") + for doc in documents +] +colbert_query = models.Document(text=query_text, model="colbert-ir/colbertv2.0") -For testing or development setups, you can run the Qdrant container or as a binary executable. We also provide a Helm chart for an easy installation in Kubernetes. +``` -## [Anchor](https://qdrant.tech/documentation/guides/installation/\#production) Production +### 2. Create a Qdrant collection +Then create a Qdrant collection with both vector types. Note that we leave indexing on for the `dense` vector but turn it off for the `colbert` vector that will be used for reranking. +```python +collection_name = "dense_multivector_demo" +client.create_collection( + collection_name=collection_name, + vectors_config={ + "dense": models.VectorParams( + size=384, + distance=models.Distance.COSINE + # Leave HNSW indexing ON for dense + ), + "colbert": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking + ) + } +) -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#qdrant-cloud) Qdrant Cloud +``` -You can set up production with the [Qdrant Cloud](https://qdrant.to/cloud), which provides fully managed Qdrant databases. -It provides horizontal and vertical scaling, one click installation and upgrades, monitoring, logging, as well as backup and disaster recovery. For more information, see the [Qdrant Cloud documentation](https://qdrant.tech/documentation/cloud/). +### 3. Upload Documents (Dense + Multivector) +Now upload the vectors, with `batch_size=8`. We do not have many documents, but batching is always recommended. +```python +points = [ + models.PointStruct( + id=i, + vector={ + "dense": dense_documents[i], + "colbert": colbert_documents[i] + }, + payload={"text": documents[i]} + ) for i in range(len(documents)) +] +client.upload_points( + collection_name="dense_multivector_demo", + points=points, + batch_size=8 +) +``` -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#qdrant-kubernetes-operator) Qdrant Kubernetes Operator +### Query with Retrieval + Reranking in One Call +Now let’s run a search: -We provide a Qdrant Enterprise Operator for Kubernetes installations as part of our [Qdrant Private Cloud](https://qdrant.tech/documentation/private-cloud/) offering. 
For more information, [use this form](https://qdrant.to/contact-us) to contact us. +```python +results = client.query_points( + collection_name="dense_multivector_demo", + prefetch=models.Prefetch( + query=dense_query, + using="dense", + ), + query=colbert_query, + using="colbert", + limit=3, + with_payload=True +) -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#kubernetes) Kubernetes +``` -You can use a ready-made [Helm Chart](https://helm.sh/docs/) to run Qdrant in your Kubernetes cluster. While it is possible to deploy Qdrant in a distributed setup with the Helm chart, it does not come with the same level of features for zero-downtime upgrades, up and down-scaling, monitoring, logging, and backup and disaster recovery as the Qdrant Cloud offering or the Qdrant Private Cloud Enterprise Operator. Instead you must manage and set this up [yourself](https://qdrant.tech/documentation/guides/distributed_deployment/). Support for the Helm chart is limited to community support. +- The dense vector retrieves the top candidates quickly. +- The Colbert multivector reranks them using token-level `MaxSim` with fine-grained precision. +- Returns the top 3 results. -The following table gives you an overview about the feature differences between the Qdrant Cloud and the Helm chart: +## Conclusion +Multivector search is one of the most powerful features of a vector database when used correctly. With this functionality in Qdrant, you can: +- Store token-level embeddings natively. +- Disable indexing to reduce overhead. +- Run fast retrieval and accurate reranking in one API call. +- Efficiently scale late interaction. -| Feature | Qdrant Helm Chart | Qdrant Cloud | -| --- | --- | --- | -| Open-source | ✅ | | -| Community support only | ✅ | | -| Quick to get started | ✅ | ✅ | -| Vertical and horizontal scaling | ✅ | ✅ | -| API keys with granular access control | ✅ | ✅ | -| Qdrant version upgrades | ✅ | ✅ | -| Support for transit and storage encryption | ✅ | ✅ | -| Zero-downtime upgrades with optimized restart strategy | | ✅ | -| Production ready out-of the box | | ✅ | -| Dataloss prevention on downscaling | | ✅ | -| Full cluster backup and disaster recovery | | ✅ | -| Automatic shard rebalancing | | ✅ | -| Re-sharding support | | ✅ | -| Automatic persistent volume scaling | | ✅ | -| Advanced telemetry | | ✅ | -| One-click API key revoking | | ✅ | -| Recreating nodes with new volumes in existing cluster | | ✅ | -| Enterprise support | | ✅ | +Combining FastEmbed and Qdrant leads to a production-ready pipeline for ColBERT-style reranking without wasting resources. You can do this locally or use Qdrant Cloud. Qdrant offers an easy-to-use API to get started with your search engine, so if you’re ready to dive in, sign up for free at [Qdrant Cloud](https://qdrant.tech/cloud/) and start building. -To install the helm chart: +<|page-99-lllmstxt|> +# Upload and Search Large collections cost-efficiently -```bash -helm repo add qdrant https://qdrant.to/helm -helm install qdrant qdrant/qdrant +| Time: 2 days | Level: Advanced | | | +|--------------|-----------------|--|----| -``` -For more information, see the [qdrant-helm](https://github.com/qdrant/qdrant-helm/tree/main/charts/qdrant) README. +In this tutorial, we will describe an approach to upload, index, and search a large volume of data cost-efficiently, +on an example of the real-world dataset [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/). 
-### [Anchor](https://qdrant.tech/documentation/guides/installation/\#docker-and-docker-compose) Docker and Docker Compose +The goal of this tutorial is to demonstrate what minimal amount of resources is required to index and search a large dataset, +while still maintaining a reasonable search latency and accuracy. -Usually, we recommend to run Qdrant in Kubernetes, or use the Qdrant Cloud for production setups. This makes setting up highly available and scalable Qdrant clusters with backups and disaster recovery a lot easier. +All relevant code snippets are available in the [GitHub repository](https://github.com/qdrant/laion-400m-benchmark). -However, you can also use Docker and Docker Compose to run Qdrant in production, by following the setup instructions in the [Docker](https://qdrant.tech/documentation/guides/installation/#docker) and [Docker Compose](https://qdrant.tech/documentation/guides/installation/#docker-compose) Development sections. -In addition, you have to make sure: +The recommended Qdrant version for this tutorial is `v1.13.5` and higher. -- To use a performant [persistent storage](https://qdrant.tech/documentation/guides/installation/#storage) for your data -- To configure the [security settings](https://qdrant.tech/documentation/guides/security/) for your deployment -- To set up and configure Qdrant on multiple nodes for a highly available [distributed deployment](https://qdrant.tech/documentation/guides/distributed_deployment/) -- To set up a load balancer for your Qdrant cluster -- To create a [backup and disaster recovery strategy](https://qdrant.tech/documentation/concepts/snapshots/) for your data -- To integrate Qdrant with your [monitoring](https://qdrant.tech/documentation/guides/monitoring/) and logging solutions -## [Anchor](https://qdrant.tech/documentation/guides/installation/\#development) Development +## Dataset -For development and testing, we recommend that you set up Qdrant in Docker. We also have different client libraries. +The dataset we will use is [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/), a collection of approximately 400 million vectors obtained from +images extracted from a Common Crawl dataset. Each vector is 512-dimensional and generated using a [CLIP](https://openai.com/blog/clip/) model. -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#docker) Docker +Vectors are associated with a number of metadata fields, such as `url`, `caption`, `LICENSE`, etc. -The easiest way to start using Qdrant for testing or development is to run the Qdrant container image. -The latest versions are always available on [DockerHub](https://hub.docker.com/r/qdrant/qdrant/tags?page=1&ordering=last_updated). +The overall payload size is approximately 200 GB, and the vectors are 400 GB. -Make sure that [Docker](https://docs.docker.com/engine/install/), [Podman](https://podman.io/docs/installation) or the container runtime of your choice is installed and running. The following instructions use Docker. + -Pull the image: +The dataset is available in the form of 409 chunks, each containing approximately 1M vectors. +We will use the following [python script](https://github.com/qdrant/laion-400m-benchmark/blob/master/upload.py) to upload dataset chunks one by one. -```bash -docker pull qdrant/qdrant +## Hardware -``` +After some initial experiments, we figured out a minimal hardware configuration for the task: -In the following command, revise `$(pwd)/path/to/data` for your Docker configuration. 
Then use the updated command to run the container: +- 8 CPU cores +- 64Gb RAM +- 650Gb Disk space -```bash -docker run -p 6333:6333 \ - -v $(pwd)/path/to/data:/qdrant/storage \ - qdrant/qdrant +{{< figure src="/documentation/tutorials/large-scale-search/hardware.png" caption="Hardware configuration" >}} -``` -With this command, you start a Qdrant instance with the default configuration. -It stores all data in the `./path/to/data` directory. +This configuration is enough to index and explore the dataset in a single-user mode; latency is reasonable enough to build interactive graphs and navigate in the dashboard. -By default, Qdrant uses port 6333, so at [localhost:6333](http://localhost:6333/) you should see the welcome message. +Naturally, you might need more CPU cores and RAM for production-grade configurations. -To change the Qdrant configuration, you can overwrite the production configuration: +It is important to ensure high network bandwidth for this experiment so you are running the client and server in the same region. -```bash -docker run -p 6333:6333 \ - -v $(pwd)/path/to/data:/qdrant/storage \ - -v $(pwd)/path/to/custom_config.yaml:/qdrant/config/production.yaml \ - qdrant/qdrant -``` +## Uploading and Indexing -Alternatively, you can use your own `custom_config.yaml` configuration file: +We will use the following [python script](https://github.com/qdrant/laion-400m-benchmark/blob/master/upload.py) to upload dataset chunks one by one. ```bash -docker run -p 6333:6333 \ - -v $(pwd)/path/to/data:/qdrant/storage \ - -v $(pwd)/path/to/custom_config.yaml:/qdrant/config/custom_config.yaml \ - qdrant/qdrant \ - ./qdrant --config-path config/custom_config.yaml +export QDRANT_URL="https://xxxx-xxxx.xxxx.cloud.qdrant.io" +export QDRANT_API_KEY="xxxx-xxxx-xxxx-xxxx" +python upload.py ``` -For more information, see the [Configuration](https://qdrant.tech/documentation/guides/configuration/) documentation. - -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#docker-compose) Docker Compose - -You can also use [Docker Compose](https://docs.docker.com/compose/) to run Qdrant. - -Here is an example customized compose file for a single node Qdrant cluster: - -```yaml -services: - qdrant: - image: qdrant/qdrant:latest - restart: always - container_name: qdrant - ports: - - 6333:6333 - - 6334:6334 - expose: - - 6333 - - 6334 - - 6335 - configs: - - source: qdrant_config - target: /qdrant/config/production.yaml - volumes: - - ./qdrant_data:/qdrant/storage +This script will download chunks of the LAION dataset one by one and upload them to Qdrant. Intermediate data is not persisted on disk, so the script doesn't require much disk space on the client side. 
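The actual `upload.py` lives in the linked repository; the sketch below only shows the shape of its main loop. The collection name, helper signature, and batching values here are illustrative assumptions, not the exact ones used in the benchmark script.

```python
import os

import numpy as np
from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    timeout=300,  # large uploads benefit from a generous timeout
)

def upload_chunk(chunk_id: int, vectors: np.ndarray, payloads: list[dict]) -> None:
    """Upload one ~1M-vector LAION chunk; point IDs stay unique across chunks."""
    offset = chunk_id * 1_000_000
    client.upload_collection(
        collection_name="laion-400m",   # placeholder collection name
        vectors=vectors,                # shape (N, 512), one row per image
        payload=payloads,               # url, caption, LICENSE, ...
        ids=range(offset, offset + len(vectors)),
        batch_size=256,                 # points per request
        parallel=4,                     # several upload processes
    )
```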
-configs: - qdrant_config: - content: | - log_level: INFO +Let's take a look at the collection configuration we used: +```python +client.create_collection( + QDRANT_COLLECTION_NAME, + vectors_config=models.VectorParams( + size=512, # CLIP model output size + distance=models.Distance.COSINE, # CLIP model uses cosine distance + datatype=models.Datatype.FLOAT16, # We only need 16 bits for float, otherwise disk usage would be 800Gb instead of 400Gb + on_disk=True # We don't need original vectors in RAM + ), + # Even though CLIP vectors don't work well with binary quantization, out of the box, + # we can rely on query-time oversampling to get more accurate results + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, + ) + ), + optimizers_config=models.OptimizersConfigDiff( + # Bigger size of segments are desired for faster search + # However it might be slower for indexing + max_segment_size=5_000_000, + ), + # Having larger M value is desirable for higher accuracy, + # but in our case we care more about memory usage + # We could still achieve reasonable accuracy even with M=6 + oversampling + hnsw_config=models.HnswConfigDiff( + m=6, # decrease M for lower memory usage + on_disk=False + ), + ) ``` -### [Anchor](https://qdrant.tech/documentation/guides/installation/\#from-source) From source - -Qdrant is written in Rust and can be compiled into a binary executable. -This installation method can be helpful if you want to compile Qdrant for a specific processor architecture or if you do not want to use Docker. - -Before compiling, make sure that the necessary libraries and the [rust toolchain](https://www.rust-lang.org/tools/install) are installed. -The current list of required libraries can be found in the [Dockerfile](https://github.com/qdrant/qdrant/blob/master/Dockerfile). +There are a few important points to note: -Build Qdrant with Cargo: +- We use `FLOAT16` datatype for vectors, which allows us to store vectors in half the size compared to `FLOAT32`. There are no significant accuracy losses for this dataset. +- We use `BinaryQuantization` with `always_ram=True` to enable query-time oversampling. This allows us to get an accurate and resource-efficient search, even though 512d CLIP vectors don't work well with binary quantization out of the box. +- We use `HnswConfig` with `m=6` to reduce memory usage. We will look deeper into memory usage in the next section. -```bash -cargo build --release --bin qdrant +Goal of this configuration is to ensure that prefetch component of the search never needs to load data from disk, and at least a minimal version of vectors and vector index is always in RAM. +The second stage of the search can explicitly determine how many times we can afford to load data from a disk. -``` -After a successful build, you can find the binary in the following subdirectory `./target/release/qdrant`. +In our experiment, the upload process was going at 5000 points per second. +The indexation process was going in parallel with the upload and was happening at the rate of approximately 4000 points per second. -## [Anchor](https://qdrant.tech/documentation/guides/installation/\#client-libraries) Client libraries -In addition to the service, Qdrant provides a variety of client libraries for different programming languages. For a full list, see our [Client libraries](https://qdrant.tech/documentation/interfaces/#client-libraries) documentation. 
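If you want to follow these numbers on your own run, polling the collection info is a simple way to do it. A small sketch, assuming the same placeholder collection name as in the upload sketch above:

```python
import os
import time

from qdrant_client import QdrantClient, models

client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)

# Poll collection info to follow upload and indexing progress side by side.
while True:
    info = client.get_collection("laion-400m")  # placeholder collection name
    print(f"status={info.status} points={info.points_count} "
          f"indexed_vectors={info.indexed_vectors_count}")
    if info.status == models.CollectionStatus.GREEN:  # indexing caught up
        break
    time.sleep(60)
```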
+{{< figure src="/documentation/tutorials/large-scale-search/upload_process.png" caption="Upload and indexation process" >}} -##### Was this page useful? +## Memory Usage -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +After the upload and indexation process is finished, let's take a detailed look at the memory usage of the Qdrant server. -Thank you for your feedback! 🙏 +{{< figure src="/documentation/tutorials/large-scale-search/memory_usage.png" caption="Memory usage" >}} -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/installation.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +On the high level, memory usage consists of 3 components: -On this page: +- System memory - 8.34Gb - this is memory reserved for internal systems and OS, it doesn't depend on the dataset size. +- Data memory - 39.27Gb - this is a resident memory of qdrant process, it can't be evicter and qdrant process will crash if it exceeds the limit. +- Cache memory - 14.54Gb - this is a disk cache qdrant uses. It is necessary for fast search but can be evicted if needed. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/installation.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× +The most interest for us is Data and Cache memory. Let's look what exactly is stored in these components. -[Powered by](https://qdrant.tech/) +In our scenario, Qdrant uses memory to store the following components: -<|page-68-lllmstxt|> -## permission-reference -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud rbac](https://qdrant.tech/documentation/cloud-rbac/) -- Permission Reference +- Storing vectors +- Storing vector index +- Storing information about IDs and versions of points -# [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#permission-reference)**Permission Reference** + -This document outlines the permissions available in Qdrant Cloud. +### Size of vectors -* * * +In our scenario, we store only quantized vectors in RAM, so it is relatively easy to calculate the required size: -> 💡 When enabling `write:*` permissions in the UI, the corresponding `read:*` permission will also be enabled and non-actionable. This guarantees access to resources after creating and/or updating them. +```text +400_000_000 * 512d / 8 bits / 1024 (Kb) / 1024 (Mb) / 1024 (Gb) = 23.84Gb +``` -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#identity-and-access-management)**Identity and Access Management** +### Size of vector index -Permissions for users, user roles, management keys, and invitations. +Vector index is a bit more complicated, as it is not a simple matrix. -| Permission | Description | -| --- | --- | -| `read:roles` | View roles in the Access Management page. | -| `write:roles` | Create and modify roles in the Access Management page. | -| `delete:roles` | Remove roles in the Access Management page. | -| `read:management_keys` | View Cloud Management Keys in the Access Management page. | -| `write:management_keys` | Create and manage Cloud Management Keys. | -| `delete:management_keys` | Remove Cloud Management Keys in the Access Management page. 
| -| `write:invites` | Invite new users to an account and revoke invitations. | -| `read:invites` | View pending invites in an account. | -| `delete:invites` | Remove an invitation. | -| `read:users` | View user details in the profile page.
\- Also applicable in User Management and Role details (User tab). | -| `delete:users` | Remove users from an account.
\- Applicable in User Management and Role details (User tab). | +Internally, it is stored as a list of connections in a graph, and each connection is a 4-byte integer. -* * * +The number of connections is defined by the `M` parameter of the HNSW index, and in our case, it is `6` on the high level and `2 x M` on level 0. -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#cluster)**Cluster** +This gives us the following estimation: -Permissions for API Keys, backups, clusters, and backup schedules. +```text +400_000_000 * (6 * 2) * 4 bytes / 1024 (Kb) / 1024 (Mb) / 1024 (Gb) = 17.881Gb +``` -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#api-keys)**API Keys** +In practice the size of index is a bit smaller due to the [compression](https://qdrant.tech/blog/qdrant-1.13.x/#hnsw-graph-compression) we implemented in Qdrant v1.13.0, but it is still a good estimation. -| Permission | Description | -| --- | --- | -| `read:api_keys` | View Database API Keys for Managed Cloud clusters. | -| `write:api_keys` | Create new Database API Keys for Managed Cloud clusters. | -| `delete:api_keys` | Remove Database API Keys for Managed Cloud clusters. | +The HNSW index in Qdrant is stored as a mmap, and it can be evicted from RAM if needed. +So, the memory consumption of HNSW falls under the category of `Cache memory`. -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#backups)**Backups** -| Permission | Description | -| --- | --- | -| `read:backups` | View backups in the **Backups page** and **Cluster details > Backups tab**. | -| `write:backups` | Create backups from the **Backups page** and **Cluster details > Backups tab**. | -| `delete:backups` | Remove backups from the **Backups page** and **Cluster details > Backups tab**. | +### Size of IDs and versions -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#clusters)**Clusters** +Qdrant must store additional information about each point, such as ID and version. +This information is needed on each request, so it is very important to keep it in RAM for fast access. -| Permission | Description | -| --- | --- | -| `read:clusters` | View cluster details. | -| `write:clusters` | Modify cluster settings. | -| `delete:clusters` | Delete clusters. | +Let's take a look at Qdrant internals to understand how much memory is required for this information. -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#backup-schedules)**Backup Schedules** +```rust -| Permission | Description | -| --- | --- | -| `read:backup_schedules` | View backup schedules in the **Backups page** and **Cluster details > Backups tab**. | -| `write:backup_schedules` | Create backup schedules from the **Backups page** and **Cluster details > Backups tab**. | -| `delete:backup_schedules` | Remove backup schedules from the **Backups page** and **Cluster details > Backups tab**. | +// This is s simplified version of the IdTracker struct +// It omits all optimizations and small details, +// but gives a good estimation of memory usage +IdTracker { + // Mapping of internal id to version (u64), compressed to 4 bytes + // Required for versioning and conflict resolution between segments + internal_to_version, // 400M x 4 = 1.5Gb -* * * + // Mapping of external id to internal id, 4 bytes per point. 
+ // Required to determine original point ID after search inside the segment + internal_to_external: Vec, // 400M x 16 = 6.4Gb -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#hybrid-cloud)**Hybrid Cloud** + // Mapping of external id to internal id. For numeric ids it uses 8 bytes, + // UUIDs are stored as 16 bytes. + // Required to determine sequential point ID inside the segment + external_to_internal: Vec, // 400M x (8 + 4) = 4.5Gb +} +``` -Permissions for Hybrid Cloud environments. +In the v1.13.5 we introduced a [significant optimization](https://github.com/qdrant/qdrant/pull/6023) to reduce the memory usage of `IdTracker` by approximately 2 times. +So the total memory usage of `IdTracker` in our case is approximately `12.4Gb`. -| Permission | Description | -| --- | --- | -| `read:hybrid_cloud_environments` | View Hybrid Cloud environment details. | -| `write:hybrid_cloud_environments` | Modify Hybrid Cloud environment settings. | -| `delete:hybrid_cloud_environments` | Delete Hybrid Cloud environments. | +So total expected RAM usage of Qdrant server in our case is approximately `23.84Gb + 17.881Gb + 12.4Gb = 54.121Gb`, which is very close to the actual memory usage we observed: `39.27Gb + 14.54Gb = 53.81Gb`. -* * * +We had to apply some simplifications to the estimations, but they are good enough to understand the memory usage of the Qdrant server. -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#payment--billing)**Payment & Billing** -Permissions for payment methods and billing information. +## Search -| Permission | Description | -| --- | --- | -| `read:payment_information` | View payment methods and billing details. | -| `write:payment_information` | Modify or remove payment methods and billing details. | +After the dataset is uploaded and indexed, we can start searching for similar vectors. -* * * +We can start by exploring the dataset in Web-UI. So you can get an intuition into the search performance, not just table numbers. -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#account-management)**Account Management** +{{< figure src="/documentation/tutorials/large-scale-search/web-ui-bear1.png" caption="Web-UI Bear image" width="80%" >}} -Permissions for managing user accounts. +{{< figure src="/documentation/tutorials/large-scale-search/web-ui-bear2.png" caption="Web-UI similar Bear image" width="80%" >}} -| Permission | Description | -| --- | --- | -| `read:account` | View account details that the user is a part of. | -| `write:account` | Modify account details such as:
\- Editing the account name
\- Setting an account as default
\- Leaving an account
**(Only available to Owners)** | -| `delete:account` | Remove an account from:
\- The **Profile page** (list of user accounts).
\- The **active account** (if the user is an owner/admin). | +Web-UI default requests do not use oversampling, but the observable results are still good enough to see the resemblance between images. -* * * -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/permission-reference/\#profile)**Profile** +### Ground truth data -Permissions for accessing personal profile information. +However, to estimate the search performance more accurately, we need to compare search results with the ground truth. +Unfortunately, the LAION dataset doesn't contain usable ground truth, so we had to generate it ourselves. -| Permission | Description | -| --- | --- | -| `read:profile` | View the user’s own profile information.
**(Assigned to all users by default)** | +To do this, we need to perform a full-scan search for each vector in the dataset and store the results in a separate file. +Unfortunately, this process is very time-consuming and requires a lot of resources, so we had to limit the number of queries to 100, +we provide a ready-to-use [ground truth file](https://github.com/qdrant/laion-400m-benchmark/blob/master/expected.py) and the [script](https://github.com/qdrant/laion-400m-benchmark/blob/master/full_scan.py) to generate it (requires 512Gb RAM machine and about 20 hours of execution time). -* * * -##### Was this page useful? +Our ground truth file contains 100 queries, each with 50 results. The first 100 vectors of the dataset itself were used to generate queries. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/permission-reference.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Search Query -On this page: +To precisely control the amount of oversampling, we will use the following search query: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/permission-reference.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python -× +limit = 50 +rescore_limit = 1000 # oversampling factor is 20 -[Powered by](https://qdrant.tech/) +query = vectors[query_id] # One of existing vectors -<|page-69-lllmstxt|> -## vector-search-manuals -- [Articles](https://qdrant.tech/articles/) -- Vector Search Manuals - -#### Vector Search Manuals - -Take full control of your vector data with Qdrant. Learn how to easily store, organize, and optimize vectors for high-performance similarity search. - -[![Preview](https://qdrant.tech/articles_data/vector-search-production/preview/preview.jpg)\\ -**Vector Search in Production** \\ -We gathered our most recommended tips and tricks to make your production deployment run smoothly.\\ -\\ -David Myriel\\ -\\ -April 30, 2025](https://qdrant.tech/articles/vector-search-production/)[![Preview](https://qdrant.tech/articles_data/indexing-optimization/preview/preview.jpg)\\ -**Optimizing Memory for Bulk Uploads** \\ -Efficient memory management is key when handling large-scale vector data. Learn how to optimize memory consumption during bulk uploads in Qdrant and keep your deployments performant under heavy load.\\ -\\ -Sabrina Aquino\\ -\\ -February 13, 2025](https://qdrant.tech/articles/indexing-optimization/)[![Preview](https://qdrant.tech/articles_data/vector-search-resource-optimization/preview/preview.jpg)\\ -**Vector Search Resource Optimization Guide** \\ -Learn how to get the most from Qdrant's optimization features. 
Discover key tricks and best practices to boost vector search performance and reduce Qdrant's resource usage.\\ -\\ -David Myriel\\ -\\ -February 09, 2025](https://qdrant.tech/articles/vector-search-resource-optimization/)[![Preview](https://qdrant.tech/articles_data/what-is-a-vector-database/preview/preview.jpg)\\ -**An Introduction to Vector Databases** \\ -Discover what a vector database is, its core functionalities, and real-world applications.\\ -\\ -Sabrina Aquino\\ -\\ -October 09, 2024](https://qdrant.tech/articles/what-is-a-vector-database/)[![Preview](https://qdrant.tech/articles_data/what-is-vector-quantization/preview/preview.jpg)\\ -**What is Vector Quantization?** \\ -In this article, we'll teach you about compression methods like Scalar, Product, and Binary Quantization. Learn how to choose the best method for your specific application.\\ -\\ -Sabrina Aquino\\ -\\ -September 25, 2024](https://qdrant.tech/articles/what-is-vector-quantization/)[![Preview](https://qdrant.tech/articles_data/vector-search-filtering/preview/preview.jpg)\\ -**A Complete Guide to Filtering in Vector Search** \\ -Learn everything about filtering in Qdrant. Discover key tricks and best practices to boost semantic search performance and reduce Qdrant's resource usage.\\ -\\ -Sabrina Aquino, David Myriel\\ -\\ -September 10, 2024](https://qdrant.tech/articles/vector-search-filtering/)[![Preview](https://qdrant.tech/articles_data/hybrid-search/preview/preview.jpg)\\ -**Hybrid Search Revamped - Building with Qdrant's Query API** \\ -Our new Query API allows you to build a hybrid search system that uses different search methods to improve search quality & experience. Learn more here.\\ -\\ -Kacper Ɓukawski\\ -\\ -July 25, 2024](https://qdrant.tech/articles/hybrid-search/)[![Preview](https://qdrant.tech/articles_data/data-privacy/preview/preview.jpg)\\ -**Data Privacy with Qdrant: Implementing Role-Based Access Control (RBAC)** \\ -Discover how Qdrant's Role-Based Access Control (RBAC) ensures data privacy and compliance for your AI applications. Build secure and scalable systems with ease. Read more now!\\ -\\ -Qdrant Team\\ -\\ -June 18, 2024](https://qdrant.tech/articles/data-privacy/)[![Preview](https://qdrant.tech/articles_data/what-are-embeddings/preview/preview.jpg)\\ -**What are Vector Embeddings? - Revolutionize Your Search Experience** \\ -Discover the power of vector embeddings. Learn how to harness the potential of numerical machine learning representations to create a personalized Neural Search Service with FastEmbed.\\ -\\ -Sabrina Aquino\\ -\\ -February 06, 2024](https://qdrant.tech/articles/what-are-embeddings/)[![Preview](https://qdrant.tech/articles_data/multitenancy/preview/preview.jpg)\\ -**How to Implement Multitenancy and Custom Sharding in Qdrant** \\ -Discover how multitenancy and custom sharding in Qdrant can streamline your machine-learning operations. Learn how to scale efficiently and manage data securely.\\ -\\ -David Myriel\\ -\\ -February 06, 2024](https://qdrant.tech/articles/multitenancy/)[![Preview](https://qdrant.tech/articles_data/sparse-vectors/preview/preview.jpg)\\ -**What is a Sparse Vector? How to Achieve Vector-based Hybrid Search** \\ -Learn what sparse vectors are, how they work, and their importance in modern data processing. 
Explore methods like SPLADE for creating and leveraging sparse vectors efficiently.\\ -\\ -Nirant Kasliwal\\ -\\ -December 09, 2023](https://qdrant.tech/articles/sparse-vectors/)[![Preview](https://qdrant.tech/articles_data/storing-multiple-vectors-per-object-in-qdrant/preview/preview.jpg)\\ -**Optimizing Semantic Search by Managing Multiple Vectors** \\ -Discover the power of vector storage optimization and learn how to efficiently manage multiple vectors per object for enhanced semantic search capabilities.\\ -\\ -Kacper Ɓukawski\\ -\\ -October 05, 2022](https://qdrant.tech/articles/storing-multiple-vectors-per-object-in-qdrant/)[![Preview](https://qdrant.tech/articles_data/batch-vector-search-with-qdrant/preview/preview.jpg)\\ -**Mastering Batch Search for Vector Optimization** \\ -Discover how to optimize your vector search capabilities with efficient batch search. Learn optimization strategies for faster, more accurate results.\\ -\\ -Kacper Ɓukawski\\ -\\ -September 26, 2022](https://qdrant.tech/articles/batch-vector-search-with-qdrant/)[![Preview](https://qdrant.tech/articles_data/neural-search-tutorial/preview/preview.jpg)\\ -**Neural Search 101: A Complete Guide and Step-by-Step Tutorial** \\ -Discover the power of neural search. Learn what neural search is and follow our tutorial to build a neural search service using BERT, Qdrant, and FastAPI.\\ -\\ -Andrey Vasnetsov\\ -\\ -June 10, 2021](https://qdrant.tech/articles/neural-search-tutorial/) - -× - -[Powered by](https://qdrant.tech/) +response = client.query_points( + collection_name=QDRANT_COLLECTION_NAME, + query=query, + limit=limit, + # Go to disk + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + rescore=True, + ), + ), + # Prefetch is performed using only in-RAM data, + # so querying even large amount of data is fast + prefetch=models.Prefetch( + query=query, + limit=rescore_limit, + params=models.SearchParams( + quantization=models.QuantizationSearchParams( + # Avoid rescoring in prefetch + # We should do it explicitly on the second stage + rescore=False, + ), + ) + ) + ) +``` -<|page-70-lllmstxt|> -## snapshots -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Snapshots +As you can see, this query contains two stages: -# [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#snapshots) Snapshots +- First stage is a prefetch, which is performed using only in-RAM data. It is very fast and allows us to get a large amount of candidates. +- The second stage is a rescore, which is performed with full-size vectors stored on disks. -_Available as of v0.8.4_ +By using 2-stage search we can precisely control the amount of data loaded from disk and ensure the balance between search speed and accuracy. -Snapshots are `tar` archive files that contain data and configuration of a specific collection on a specific node at a specific time. In a distributed setup, when you have multiple nodes in your cluster, you must create snapshots for each node separately when dealing with a single collection. +You can find the complete code of the search process in the [eval.py](https://github.com/qdrant/laion-400m-benchmark/blob/master/eval.py) -This feature can be used to archive data or easily replicate an existing deployment. For disaster recovery, Qdrant Cloud users may prefer to use [Backups](https://qdrant.tech/documentation/cloud/backups/) instead, which are physical disk-level copies of your data. 
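For example, in a multi-node cluster you would repeat the snapshot call on every node that hosts the collection. A minimal sketch with the Python client, using placeholder node URLs:

```python
from qdrant_client import QdrantClient

# Placeholder node URLs; substitute the actual URLs of the nodes in your cluster
node_urls = ["http://qdrant-node-1:6333", "http://qdrant-node-2:6333"]

for url in node_urls:
    # Each node holds its own shards, so each node needs its own snapshot
    QdrantClient(url=url).create_snapshot(collection_name="{collection_name}")
```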
-For a step-by-step guide on how to use snapshots, see our [tutorial](https://qdrant.tech/documentation/tutorials/create-snapshot/). +## Performance tweak -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#create-snapshot) Create snapshot +One important performance tweak we found useful for this dataset is to enable [Async IO](https://qdrant.tech/articles/io_uring) in Qdrant. -To create a new snapshot for an existing collection: +By default, Qdrant uses synchronous IO, which is good for in-memory datasets but can be a bottleneck when we want to read a lot of data from a disk. -httppythontypescriptrustjavacsharpgo +Async IO (implemented with `io_uring`) allows to send parallel requests to the disk and saturate the disk bandwidth. -```http -POST /collections/{collection_name}/snapshots +This is exactly what we are looking for when performing large-scale re-scoring with original vectors. -``` +Instead of reading vectors one by one and waiting for the disk response 1000 times, we can send 1000 requests to the disk and wait for all of them to complete. This allows us to saturate the disk bandwidth and get faster results. -```python -from qdrant_client import QdrantClient +To enable Async IO in Qdrant, you need to set the following environment variable: -client = QdrantClient(url="http://localhost:6333") +```bash +QDRANT__STORAGE__PERFORMANCE__ASYNC_SCORER=true +``` -client.create_snapshot(collection_name="{collection_name}") +Or set parameter in config file: +```yaml +storage: + performance: + async_scorer: true ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +In Qdrant Managed cloud Async IO can be enabled via `Advanced optimizations` section in cluster `Configuration` tab. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +{{< figure src="/documentation/tutorials/large-scale-search/async_io.png" caption="Async IO configuration in Cloud" width="80%" >}} -client.createSnapshot("{collection_name}"); -``` +## Running search requests -```rust -use qdrant_client::Qdrant; +Once all the preparations are done, we can run the search requests and evaluate the results. -let client = Qdrant::from_url("http://localhost:6334").build()?; +You can find the full code of the search process in the [eval.py](https://github.com/qdrant/laion-400m-benchmark/blob/master/eval.py) -client.create_snapshot("{collection_name}").await?; +This script will run 100 search requests with configured oversampling factor and compare the results with the ground truth. +```bash +python eval.py --rescore_limit 1000 ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +In our request we achieved the following results: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +| Rescore Limit | Precision@50 | Time per request | +|---------------|--------------|------------------| +| 1000 | 75.2% | 0.7s | +| 5000 | 81.0% | 2.2s | -client.createSnapshotAsync("{collection_name}").get(); +Additional experiments with `m=16` demonstrated that we can achieve `85%` precision with `rescore_limit=1000`, but they would require slightly more memory. 
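For reference, the `Precision@50` column can be read as the overlap between the 50 points returned by Qdrant and the ground-truth top 50 for the same query. A minimal sketch of that metric, assuming `expected_ids` and `found_ids` hold the ground-truth and retrieved point IDs (the actual evaluation logic lives in `eval.py`):

```python
def precision_at_k(expected_ids: list, found_ids: list, k: int = 50) -> float:
    # Share of the ground-truth top-k neighbors that the search actually returned
    return len(set(expected_ids[:k]) & set(found_ids[:k])) / k
```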
-``` +{{< figure src="/documentation/tutorials/large-scale-search/precision.png" caption="Log of search evaluation" width="50%">}} -```csharp -using Qdrant.Client; -var client = new QdrantClient("localhost", 6334); +## Conclusion -await client.CreateSnapshotAsync("{collection_name}"); +In this tutorial we demonstrated how to upload, index and search a large dataset in Qdrant cost-efficiently. +Binary quantization can be applied even on 512d vectors, if combined with query-time oversampling. -``` +Qdrant allows to precisely control where each part of storage is located, which allows to achieve a good balance between search speed and memory usage. -```go -import ( - "context" +### Potential improvements - "github.com/qdrant/go-client/qdrant" -) +In this experiment, we investigated in detail which parts of the storage are responsible for memory usage and how to control them. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +One especially interesting part is the `VectorIndex` component, which is responsible for storing the graph of connections between vectors. -client.CreateSnapshot(context.Background(), "{collection_name}") +In our further research, we will investigate the possibility of making HNSW more disk-friendly so it can be offloaded to disk without significant performance losses. -``` +<|page-100-lllmstxt|> +# How to Generate Text Embedings with FastEmbed -This is a synchronous operation for which a `tar` archive file will be generated into the `snapshot_path`. +## Install FastEmbed +```python +pip install fastembed +``` +Just for demo purposes, you will use Lists and NumPy to work with sample data. +```python +from typing import List +import numpy as np +``` -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#delete-snapshot) Delete snapshot +## Load default model -_Available as of v1.0.0_ +In this example, you will use the default text embedding model, `BAAI/bge-small-en-v1.5`. +```python +from fastembed import TextEmbedding +``` -httppythontypescriptrustjavacsharpgo +## Add sample data -```http -DELETE /collections/{collection_name}/snapshots/{snapshot_name} +Now, add two sample documents. Your documents must be in a list, and each document must be a string +```python +documents: List[str] = [ + "FastEmbed is lighter than Transformers & Sentence-Transformers.", + "FastEmbed is supported by and maintained by Qdrant.", +] +``` +Download and initialize the model. Print a message to verify the process. +```python +embedding_model = TextEmbedding() +print("The model BAAI/bge-small-en-v1.5 is ready to use.") ``` +## Embed data +Generate embeddings for both documents. ```python -from qdrant_client import QdrantClient +embeddings_generator = embedding_model.embed(documents) +embeddings_list = list(embeddings_generator) +len(embeddings_list[0]) +``` +Here is the sample document list. The default model creates vectors with 384 dimensions. -client = QdrantClient(url="http://localhost:6333") +```bash +Document: This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc. +Vector of type: with shape: (384,) +Document: fastembed is supported by and maintained by Qdrant. +Vector of type: with shape: (384,) +``` -client.delete_snapshot( - collection_name="{collection_name}", snapshot_name="{snapshot_name}" -) +## Visualize embeddings +```python +print("Embeddings:\n", embeddings_list) +``` +The embeddings don't look too interesting, but here is a visual. 
+```bash +Embeddings: + [[-0.11154681 0.00976555 0.00524559 0.01951888 -0.01934952 0.02943449 + -0.10519084 -0.00890122 0.01831438 0.01486796 -0.05642502 0.02561352 + -0.00120165 0.00637456 0.02633459 0.0089221 0.05313658 0.03955453 + -0.04400245 -0.02929407 0.04691846 -0.02515868 0.00778646 -0.05410657 +... + -0.00243012 -0.01820582 0.02938612 0.02108984 -0.02178085 0.02971899 + -0.00790564 0.03561783 0.0652488 -0.04371546 -0.05550042 0.02651665 + -0.01116153 -0.01682246 -0.05976734 -0.03143916 0.06522726 0.01801389 + -0.02611006 0.01627177 -0.0368538 0.03968835 0.027597 0.03305927]] ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +<|page-101-lllmstxt|> +# Reranking Hybrid Search Results with Qdrant Vector Database -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Hybrid search combines dense and sparse retrieval to deliver precise and comprehensive results. By adding reranking with ColBERT, you can further refine search outputs for maximum relevance. -client.deleteSnapshot("{collection_name}", "{snapshot_name}"); +In this guide, we’ll show you how to implement hybrid search with reranking in Qdrant, leveraging dense, sparse, and late interaction embeddings to create an efficient, high-accuracy search system. Let’s get started! -``` +## Overview -```rust -use qdrant_client::qdrant::DeleteSnapshotRequestBuilder; -use qdrant_client::Qdrant; +Let’s start by breaking down the architecture: -let client = Qdrant::from_url("http://localhost:6334").build()?; +![image3.png](/documentation/examples/reranking-hybrid-search/image3.png) -client - .delete_snapshot(DeleteSnapshotRequestBuilder::new( - "{collection_name}", - "{snapshot_name}", - )) - .await?; +Processing Dense, Sparse, and Late Interaction Embeddings in Vector Databases (VDB) -``` +### Ingestion Stage -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +Here’s how we’re going to set up the advanced hybrid search. The process is similar to what we did earlier but with a few powerful additions: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +1. **Documents**: Just like before, we start with the raw input—our set of documents that need to be indexed for search. +2. **Dense Embeddings**: We’ll generate dense embeddings for each document, just like in the basic search. These embeddings capture the deeper, semantic meanings behind the text. +3. **Sparse Embeddings**: This is where it gets interesting. Alongside dense embeddings, we’ll create sparse embeddings using more traditional, keyword-based methods. Specifically, we’ll use BM25, a probabilistic retrieval model. BM25 ranks documents based on how relevant their terms are to a given query, taking into account how often terms appear, document length, and how common the term is across all documents. It’s perfect for keyword-heavy searches. +4. **Late Interaction Embeddings**: Now, we add the magic of ColBERT. ColBERT uses a two-stage approach. First, it generates contextualized embeddings for both queries and documents using BERT, and then it performs late interaction—matching those embeddings efficiently using a dot product to fine-tune relevance. This step allows for deeper, contextual understanding, making sure you get the most precise results. +5. **Vector Database**: All of these embeddings—dense, sparse, and late interaction—are stored in a vector database like Qdrant. 
This allows you to efficiently search, retrieve, and rerank your documents based on multiple layers of relevance. -client.deleteSnapshotAsync("{collection_name}", "{snapshot_name}").get(); +![image2.png](/documentation/examples/reranking-hybrid-search/image2.png) -``` +Query Retrieval and Reranking Process in Search Systems -```csharp -using Qdrant.Client; +### Retrieval Stage -var client = new QdrantClient("localhost", 6334); +Now, let's talk about how we’re going to pull the best results once the user submits a query: -await client.DeleteSnapshotAsync(collectionName: "{collection_name}", snapshotName: "{snapshot_name}"); +1. **User’s Query**: The user enters a query, and that query is transformed into multiple types of embeddings. We’re talking about representations that capture both the deeper meaning (dense) and specific keywords (sparse). +2. **Embeddings**: The query gets converted into various embeddings—some for understanding the semantics (dense embeddings) and others for focusing on keyword matches (sparse embeddings). +3. **Hybrid Search**: Our hybrid search uses both dense and sparse embeddings to find the most relevant documents. The dense embeddings ensure we capture the overall meaning of the query, while sparse embeddings make sure we don’t miss out on those key, important terms. +4. **Rerank**: Once we’ve got a set of documents, the final step is reranking. This is where late interaction embeddings come into play, giving you results that are not only relevant but tuned to your query by prioritizing the documents that truly meet the user's intent. -``` +## Implementation -```go -import ( - "context" +Let’s see it in action in this section. - "github.com/qdrant/go-client/qdrant" -) +### Additional Setup -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +This time around, we’re using FastEmbed—a lightweight Python library designed for generating embeddings, and it supports popular text models right out of the box. First things first, you’ll need to install it: -client.DeleteSnapshot(context.Background(), "{collection_name}", "{snapshot_name}") +```python +pip install fastembed +``` + +--- +Here are the models we’ll be pulling from FastEmbed: + +```python +from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding ``` -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#list-snapshot) List snapshot +--- -List of snapshots for a collection: +### Ingestion -httppythontypescriptrustjavacsharpgo +As before, we’ll convert our documents into embeddings, but thanks to FastEmbed, the process is even more straightforward because all the models you need are conveniently available in one location. 
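The snippets below reuse a `documents` list, as defined earlier in this guide. If you are following along in isolation, a tiny illustrative list is enough to run the code (the sentences are borrowed from the example results shown later):

```python
documents = [
    "In machine learning, feature scaling is the process of normalizing the range of independent variables or features.",
    "Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale.",
    "Data preprocessing steps, including feature scaling, can significantly impact the performance of machine learning models.",
]

# An illustrative query to use in the retrieval examples below
query = "Why is feature scaling important?"
```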
-```http -GET /collections/{collection_name}/snapshots +### Embeddings -``` +First, let’s load the models we need: ```python -from qdrant_client import QdrantClient +dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2") +bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25") +late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0") +``` -client = QdrantClient(url="http://localhost:6333") +--- -client.list_snapshots(collection_name="{collection_name}") +Now, let’s convert our documents into embeddings: +```python +dense_embeddings = list(dense_embedding_model.embed(doc for doc in documents)) +bm25_embeddings = list(bm25_embedding_model.embed(doc for doc in documents)) +late_interaction_embeddings = list(late_interaction_embedding_model.embed(doc for doc in documents)) ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +--- -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Since we’re dealing with multiple types of embeddings (dense, sparse, and late interaction), we’ll need to store them in a collection that supports a multi-vector setup. The previous collection we created won’t work here, so we’ll create a new one designed specifically for handling these different types of embeddings. -client.listSnapshots("{collection_name}"); +### Create Collection + +Now, we’re setting up a new collection in Qdrant for our hybrid search with the right configurations to handle all the different vector types we’re working with. + +Here’s how you do it: + +```python +from qdrant_client.models import Distance, VectorParams, models +client.create_collection( + "hybrid-search", + vectors_config={ + "all-MiniLM-L6-v2": models.VectorParams( + size=len(dense_embeddings[0]), + distance=models.Distance.COSINE, + ), + "colbertv2.0": models.VectorParams( + size=len(late_interaction_embeddings[0][0]), + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM, + ), + hnsw_config=models.HnswConfigDiff(m=0) # Disable HNSW for reranking + ), + }, + sparse_vectors_config={ + "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF + ) + } +) ``` -```rust -use qdrant_client::Qdrant; +--- -let client = Qdrant::from_url("http://localhost:6334").build()?; +What’s happening here? We’re creating a collection called "hybrid-search", and we’re configuring it to handle: -client.list_snapshots("{collection_name}").await?; +- **Dense embeddings** from the model all-MiniLM-L6-v2 using cosine distance for comparisons. +- **Late interaction embeddings** from colbertv2.0, also using cosine distance, but with a multivector configuration to use the maximum similarity comparator. Note that we set `m=0` in the `colbertv2.0` vector to prevent indexing since it's not needed for reranking. +- **Sparse embeddings** from BM25 for keyword-based searches. They use `dot_product` for similarity calculation. -``` +This setup ensures that all the different types of vectors are stored and compared correctly for your hybrid search. 
-```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +### Upsert Data -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Next, we need to insert the documents along with their multiple embeddings into the **hybrid-search** collection: -client.listSnapshotAsync("{collection_name}").get(); +```python +from qdrant_client.models import PointStruct +points = [] +for idx, (dense_embedding, bm25_embedding, late_interaction_embedding, doc) in enumerate(zip(dense_embeddings, bm25_embeddings, late_interaction_embeddings, documents)): + + point = PointStruct( + id=idx, + vector={ + "all-MiniLM-L6-v2": dense_embedding, + "bm25": bm25_embedding.as_object(), + "colbertv2.0": late_interaction_embedding, + }, + payload={"document": doc} + ) + points.append(point) +operation_info = client.upsert( + collection_name="hybrid-search", + points=points +) ``` -```csharp -using Qdrant.Client; + -var client = new QdrantClient("localhost", 6334); +
+ Upload with implicit embeddings computation -await client.ListSnapshotsAsync("{collection_name}"); -``` +```python +from qdrant_client.models import PointStruct +points = [] -```go -import ( - "context" +for idx, doc in enumerate(documents): + point = PointStruct( + id=idx, + vector={ + "all-MiniLM-L6-v2": models.Document(text=doc, model="sentence-transformers/all-MiniLM-L6-v2"), + "bm25": models.Document(text=doc, model="Qdrant/bm25"), + "colbertv2.0": models.Document(text=doc, model="colbert-ir/colbertv2.0"), + }, + payload={"document": doc} + ) + points.append(point) - "github.com/qdrant/go-client/qdrant" +operation_info = client.upsert( + collection_name="hybrid-search", + points=points ) +``` +
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +--- -client.ListSnapshots(context.Background(), "{collection_name}") +This code pulls everything together by creating a list of **PointStruct** objects, each containing the embeddings and corresponding documents. -``` +For each document, it adds: -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#retrieve-snapshot) Retrieve snapshot +- **Dense embeddings** for the deep, semantic meaning. +- **BM25 embeddings** for powerful keyword-based search. +- **ColBERT embeddings** for precise contextual interactions. -To download a specified snapshot from a collection as a file: +Once that’s done, the points are uploaded into our **"hybrid-search"** collection using the upsert method, ensuring everything’s in place. -httpshell +### Retrieval -```http -GET /collections/{collection_name}/snapshots/{snapshot_name} +For retrieval, it’s time to convert the user’s query into the required embeddings. Here’s how you can do it: +```python +dense_vectors = next(dense_embedding_model.query_embed(query)) +sparse_vectors = next(bm25_embedding_model.query_embed(query)) +late_vectors = next(late_interaction_embedding_model.query_embed(query)) ``` -```shell -curl 'http://{qdrant-url}:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.snapshot' \ - -H 'api-key: ********' \ - --output 'filename.snapshot' +--- -``` +The real magic of hybrid search lies in the **prefetch** parameter. This lets you run multiple sub-queries in one go, combining the power of dense and sparse embeddings. Here’s how to set it up, after which we execute the hybrid search: -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#restore-snapshot) Restore snapshot +```python +prefetch = [ + models.Prefetch( + query=dense_vectors, + using="all-MiniLM-L6-v2", + limit=20, + ), + models.Prefetch( + query=models.SparseVector(**sparse_vectors.as_object()), + using="bm25", + limit=20, + ), + ] +``` -Snapshots can be restored in three possible ways: +--- -1. [Recovering from a URL or local file](https://qdrant.tech/documentation/concepts/snapshots/#recover-from-a-url-or-local-file) (useful for restoring a snapshot file that is on a remote server or already stored on the node) -2. [Recovering from an uploaded file](https://qdrant.tech/documentation/concepts/snapshots/#recover-from-an-uploaded-file) (useful for migrating data to a new cluster) -3. [Recovering during start-up](https://qdrant.tech/documentation/concepts/snapshots/#recover-during-start-up) (useful when running a self-hosted single-node Qdrant instance) +This code kicks off a hybrid search by running two sub-queries: -Regardless of the method used, Qdrant will extract the shard data from the snapshot and properly register shards in the cluster. -If there are other active replicas of the recovered shards in the cluster, Qdrant will replicate them to the newly recovered node by default to maintain data consistency. +- One using dense embeddings from "all-MiniLM-L6-v2" to capture the semantic meaning of the query. +- The other using sparse embeddings from BM25 for strong keyword matching. -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#recover-from-a-url-or-local-file) Recover from a URL or local file +Each sub-query is limited to 20 results. These sub-queries are bundled together using the prefetch parameter, allowing them to run in parallel. 
-_Available as of v0.11.3_ +### Rerank -This method of recovery requires the snapshot file to be downloadable from a URL or exist as a local file on the node (like if you [created the snapshot](https://qdrant.tech/documentation/concepts/snapshots/#create-snapshot) on this node previously). If instead you need to upload a snapshot file, see the next section. +Now that we've got our initial hybrid search results, it’s time to rerank them using late interaction embeddings for maximum precision. Here’s how you can do it: -To recover from a URL or local file use the [snapshot recovery endpoint](https://api.qdrant.tech/master/api-reference/snapshots/recover-from-snapshot). This endpoint accepts either a URL like `https://example.com` or a [file URI](https://en.wikipedia.org/wiki/File_URI_scheme) like `file:///tmp/snapshot-2022-10-10.snapshot`. If the target collection does not exist, it will be created. +```python +results = client.query_points( + "hybrid-search", + prefetch=prefetch, + query=late_vectors, + using="colbertv2.0", + with_payload=True, + limit=10, +) +``` -httppythontypescript + -```http -PUT /collections/{collection_name}/snapshots/recover -{ - "location": "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.shapshot" -} +
+ Query points with implicit embeddings computation -``` ```python -from qdrant_client import QdrantClient +prefetch = [ + models.Prefetch( + query=models.Document(text=query, model="sentence-transformers/all-MiniLM-L6-v2"), + using="all-MiniLM-L6-v2", + limit=20, + ), + models.Prefetch( + query=models.Document(text=query, model="Qdrant/bm25"), + using="bm25", + limit=20, + ), + ] +results = client.query_points( + "hybrid-search", + prefetch=prefetch, + query=models.Document(text=query, model="colbert-ir/colbertv2.0"), + using="colbertv2.0", + with_payload=True, + limit=10, +) +``` +
-client = QdrantClient(url="http://qdrant-node-2:6333") -client.recover_snapshot( - "{collection_name}", - "http://qdrant-node-1:6333/collections/collection_name/snapshots/snapshot-2022-10-10.shapshot", -) +--- -``` +Let’s look at how the positions change after applying reranking. Notice how some documents shift in rank based on their relevance according to the late interaction embeddings. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +| | **Document** | **First Query Rank** | **Second Query Rank** | **Rank Change** | +| --- | --- | --- | --- | --- | +| | In machine learning, feature scaling is the process of normalizing the range of independent variables or features. The goal is to ensure that all features contribute equally to the model, especially in algorithms like SVM or k-nearest neighbors where distance calculations matter. | 1 | 1 | No Change | +| | Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. This is particularly important for gradient descent-based algorithms where features with larger scales could disproportionately impact the cost function. | 2 | 6 | Moved Down | +| | Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling, which ensures that features with larger numerical ranges don't dominate the learning process. | 3 | 4 | Moved Down | +| | Data preprocessing steps, including feature scaling, can significantly impact the performance of machine learning models, making it a crucial part of the modeling pipeline. | 5 | 2 | Moved Up | -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Great! We've now explored how reranking works and successfully implemented it. -client.recoverSnapshot("{collection_name}", { - location: "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.shapshot", -}); +## Best Practices in Reranking -``` +Reranking can dramatically improve the relevance of search results, especially when combined with hybrid search. Here are some best practices to keep in mind: -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#recover-from-an-uploaded-file) Recover from an uploaded file +- **Implement Hybrid Reranking**: Blend keyword-based (sparse) and vector-based (dense) search results for a more comprehensive ranking system. +- **Continuous Testing and Monitoring**: Regularly evaluate your reranking models to avoid overfitting and make timely adjustments to maintain performance. +- **Balance Relevance and Latency**: Reranking can be computationally expensive, so aim for a balance between relevance and speed. Therefore, the first step is to retrieve the relevant documents and then use reranking on it. -The snapshot file can also be uploaded as a file and restored using the [recover from uploaded snapshot](https://api.qdrant.tech/master/api-reference/snapshots/recover-from-uploaded-snapshot). This endpoint accepts the raw snapshot data in the request body. If the target collection does not exist, it will be created. +## Conclusion -```bash -curl -X POST 'http://{qdrant-url}:6333/collections/{collection_name}/snapshots/upload?priority=snapshot' \ - -H 'api-key: ********' \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@/path/to/snapshot-2022-10-10.shapshot' +Reranking is a powerful tool that boosts the relevance of search results, especially when combined with hybrid search methods. 
While it can add some latency due to its complexity, applying it to a smaller, pre-filtered subset of results ensures both speed and relevance.

Qdrant offers an easy-to-use API to get started with your own search engine, so if you’re ready to dive in, sign up for free at [Qdrant Cloud](https://qdrant.tech/) and start building.

<|page-102-lllmstxt|>
# Navigate Your Codebase with Semantic Search and Qdrant

| Time: 45 min | Level: Intermediate | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/qdrant/examples/blob/master/code-search/code-search.ipynb) | |
|--------------|---------------------|--|----|

You too can enrich your applications with Qdrant semantic search. In this tutorial, we describe how you can use Qdrant to navigate a codebase and help you find relevant code snippets. As an example, we will use the [Qdrant](https://github.com/qdrant/qdrant) source code itself, which is mostly written in Rust.

## The approach

We want to search codebases using natural semantic queries, and to search for code based on similar logic. You can set up both tasks with embeddings:

1. A general-purpose neural encoder for Natural Language Processing (NLP), in our case `sentence-transformers/all-MiniLM-L6-v2`.
2. Specialized embeddings for code-to-code similarity search. We use the `jina-embeddings-v2-base-code` model.

To prepare our code for `all-MiniLM-L6-v2`, we preprocess the code into text that more closely resembles natural language. The Jina embeddings model supports a variety of standard programming languages, so there is no need to preprocess the snippets. We can use the code as is.

NLP-based search is based on function signatures, but code search may return smaller pieces, such as loops. So, if we receive a particular function signature from the NLP model and part of its implementation from the code model, we merge the results and highlight the overlap.

## Data preparation

Chunking the application sources into smaller parts is a non-trivial task.
In general, functions, class methods, structs, enums, and all the other language-specific constructs are good candidates for chunks. They are big enough to contain some meaningful information, but small enough to be processed by embedding models with a limited context window. Docstrings, comments, and other metadata can also be used to enrich the chunks with additional information.

![Code chunking strategy](/documentation/tutorials/code-search/data-chunking.png)

### Parsing the codebase

While our example uses Rust, you can use our approach with any other language. You can parse code with a [Language Server Protocol](https://microsoft.github.io/language-server-protocol/) (**LSP**) compatible tool, use the LSP to build a graph of the codebase, and then extract chunks. We did our work with the [rust-analyzer](https://rust-analyzer.github.io/). We exported the parsed codebase into the [LSIF](https://microsoft.github.io/language-server-protocol/specifications/lsif/0.4.0/specification/) format, a standard for code intelligence data. Next, we used the LSIF data to navigate the codebase and extract the chunks. For details, see our [code search demo](https://github.com/qdrant/demo-code-search).

We then exported the chunks into JSON documents with not only the code itself, but also context with the location of the code in the project.
For example, see +the description of the `await_ready_for_timeout` function from the `IsReady` +struct in the `common` module: -```http -PUT /collections/{collection_name}/snapshots/recover +```json { - "location": "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.shapshot", - "priority": "snapshot" + "name":"await_ready_for_timeout", + "signature":"fn await_ready_for_timeout (& self , timeout : Duration) -> bool", + "code_type":"Function", + "docstring":"= \" Return `true` if ready, `false` if timed out.\"", + "line":44, + "line_from":43, + "line_to":51, + "context":{ + "module":"common", + "file_path":"lib/collection/src/common/is_ready.rs", + "file_name":"is_ready.rs", + "struct_name":"IsReady", + "snippet":" /// Return `true` if ready, `false` if timed out.\n pub fn await_ready_for_timeout(&self, timeout: Duration) -> bool {\n let mut is_ready = self.value.lock();\n if !*is_ready {\n !self.condvar.wait_for(&mut is_ready, timeout).timed_out()\n } else {\n true\n }\n }\n" + } } - ``` -```bash -curl -X POST 'http://qdrant-node-1:6333/collections/{collection_name}/snapshots/upload?priority=snapshot' \ - -H 'api-key: ********' \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@/path/to/snapshot-2022-10-10.shapshot' +You can examine the Qdrant structures, parsed in JSON, in the [`structures.jsonl` +file](https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl) +in our Google Cloud Storage bucket. Download it and use it as a source of data for our code search. +```shell +wget https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl ``` -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://qdrant-node-2:6333") +Next, load the file and parse the lines into a list of dictionaries: -client.recover_snapshot( - "{collection_name}", - "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.shapshot", - priority=models.SnapshotPriority.SNAPSHOT, -) +```python +import json +structures = [] +with open("structures.jsonl", "r") as fp: + for i, row in enumerate(fp): + entry = json.loads(row) + structures.append(entry) ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); - -client.recoverSnapshot("{collection_name}", { - location: "http://qdrant-node-1:6333/collections/{collection_name}/snapshots/snapshot-2022-10-10.shapshot", - priority: "snapshot" -}); +### Code to *natural language* conversion -``` +Each programming language has its own syntax which is not a part of the natural +language. Thus, a general-purpose model probably does not understand the code +as is. We can, however, normalize the data by removing code specifics and +including additional context, such as module, class, function, and file name. +We took the following steps: -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#snapshots-for-the-whole-storage) Snapshots for the whole storage +1. Extract the signature of the function, method, or other code construct. +2. Divide camel case and snake case names into separate words. +3. Take the docstring, comments, and other important metadata. +4. Build a sentence from the extracted data using a predefined template. +5. Remove the special characters and replace them with spaces. -_Available as of v0.8.5_ +As input, expect dictionaries with the same structure. Define a `textify` +function to do the conversion. 
We'll use an `inflection` library to convert +with different naming conventions. -Sometimes it might be handy to create snapshot not just for a single collection, but for the whole storage, including collection aliases. -Qdrant provides a dedicated API for that as well. It is similar to collection-level snapshots, but does not require `collection_name`. +```shell +pip install inflection +``` -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#create-full-storage-snapshot) Create full storage snapshot +Once all dependencies are installed, we define the `textify` function: -httppythontypescriptrustjavacsharpgo +```python +import inflection +import re -```http -POST /snapshots +from typing import Dict, Any -``` +def textify(chunk: Dict[str, Any]) -> str: + # Get rid of all the camel case / snake case + # - inflection.underscore changes the camel case to snake case + # - inflection.humanize converts the snake case to human readable form + name = inflection.humanize(inflection.underscore(chunk["name"])) + signature = inflection.humanize(inflection.underscore(chunk["signature"])) -```python -from qdrant_client import QdrantClient + # Check if docstring is provided + docstring = "" + if chunk["docstring"]: + docstring = f"that does {chunk['docstring']} " -client = QdrantClient(url="http://localhost:6333") + # Extract the location of that snippet of code + context = ( + f"module {chunk['context']['module']} " + f"file {chunk['context']['file_name']}" + ) + if chunk["context"]["struct_name"]: + struct_name = inflection.humanize( + inflection.underscore(chunk["context"]["struct_name"]) + ) + context = f"defined in struct {struct_name} {context}" -client.create_full_snapshot() + # Combine all the bits and pieces together + text_representation = ( + f"{chunk['code_type']} {name} " + f"{docstring}" + f"defined as {signature} " + f"{context}" + ) + # Remove any special characters and concatenate the tokens + tokens = re.split(r"\W", text_representation) + tokens = filter(lambda x: x, tokens) + return " ".join(tokens) ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Now we can use `textify` to convert all chunks into text representations: -const client = new QdrantClient({ host: "localhost", port: 6333 }); +```python +text_representations = list(map(textify, structures)) +``` -client.createFullSnapshot(); +This is how the `await_ready_for_timeout` function description appears: +```text +Function Await ready for timeout that does Return true if ready false if timed out defined as Fn await ready for timeout self timeout duration bool defined in struct Is ready module common file is_ready rs ``` -```rust -use qdrant_client::Qdrant; - -let client = Qdrant::from_url("http://localhost:6334").build()?; +## Ingestion pipeline -client.create_full_snapshot().await?; +Next, we'll build a pipeline for vectorizing the data and set up a semantic search mechanism for both embedding models. -``` +### Building Qdrant collection -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +We use the `qdrant-client` library with the `fastembed` extra to interact with the Qdrant server and generate vector embeddings locally. Let's install it: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +```shell +pip install "qdrant-client[fastembed]" +``` -client.createFullSnapshotAsync().get(); +Of course, we need a running Qdrant server for vector search. 
If you need one, +you can [use a local Docker container](/documentation/quick-start/) +or deploy it using the [Qdrant Cloud](https://cloud.qdrant.io/). +You can use either to follow this tutorial. Configure the connection parameters: +```python +QDRANT_URL = "https://my-cluster.cloud.qdrant.io:6333" # http://localhost:6333 for local instance +QDRANT_API_KEY = "THIS_IS_YOUR_API_KEY" # None for local instance ``` -```csharp -using Qdrant.Client; - -var client = new QdrantClient("localhost", 6334); +Then use the library to create a collection: -await client.CreateFullSnapshotAsync(); +```python +from qdrant_client import QdrantClient, models +client = QdrantClient(QDRANT_URL, api_key=QDRANT_API_KEY) +client.create_collection( + "qdrant-sources", + vectors_config={ + "text": models.VectorParams( + size=client.get_embedding_size( + model_name="sentence-transformers/all-MiniLM-L6-v2" + ), + distance=models.Distance.COSINE, + ), + "code": models.VectorParams( + size=client.get_embedding_size( + model_name="jinaai/jina-embeddings-v2-base-code" + ), + distance=models.Distance.COSINE, + ), + }, +) ``` -```go -import ( - "context" +Our newly created collection is ready to accept the data. Let's upload the embeddings: - "github.com/qdrant/go-client/qdrant" -) +```python +import uuid -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +# Extract the code snippets from the structures to a separate list +code_snippets = [ + structure["context"]["snippet"] for structure in structures +] -client.CreateFullSnapshot(context.Background()) +points = [ + models.PointStruct( + id=uuid.uuid4().hex, + vector={ + "text": models.Document( + text=text, model="sentence-transformers/all-MiniLM-L6-v2" + ), + "code": models.Document( + text=code, model="jinaai/jina-embeddings-v2-base-code" + ), + }, + payload=structure, + ) + for text, code, structure in zip(text_representations, code_snippets, structures) +] +# Note: This might take a while since inference happens implicitly. +# Parallel processing can help. +# But too many processes may trigger swap memory and hurt performance. +client.upload_points("qdrant-sources", points=points, batch_size=64) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#delete-full-storage-snapshot) Delete full storage snapshot - -_Available as of v1.0.0_ +Internally, `qdrant-client` uses [FastEmbed](https://github.com/qdrant/fastembed) to implicitly convert our documents into their vector representations. +The uploaded points are immediately available for search. Next, query the +collection to find relevant code snippets. -httppythontypescriptrustjavacsharpgo +## Querying the codebase -```http -DELETE /snapshots/{snapshot_name} +We use one of the models to search the collection. Start with text embeddings. +Run the following query "*How do I count points in a collection?*". Review the +results. -``` + ```python -from qdrant_client import QdrantClient - -client = QdrantClient(url="http://localhost:6333") - -client.delete_full_snapshot(snapshot_name="{snapshot_name}") +query = "How do I count points in a collection?" +hits = client.query_points( + "qdrant-sources", + query=models.Document(text=query, model="sentence-transformers/all-MiniLM-L6-v2"), + using="text", + limit=5, +).points ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Now, review the results. The following table lists the module, the file name +and score. Each line includes a link to the signature, as a code block from +the file. 
-const client = new QdrantClient({ host: "localhost", port: 6333 }); +| module | file_name | score | signature | +|--------------------|---------------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| toc | point_ops.rs | 0.59448624 | [ `pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/storage/src/content_manager/toc/point_ops.rs#L120) | +| operations | types.rs | 0.5493385 | [ `pub struct CountRequestInternal`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/operations/types.rs#L831) | +| collection_manager | segments_updater.rs | 0.5121002 | [ `pub(crate) fn upsert_points<'a, T>`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection_manager/segments_updater.rs#L339) | +| collection | point_ops.rs | 0.5063539 | [ `pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection/point_ops.rs#L213) | +| map_index | mod.rs | 0.49973983 | [ `fn get_points_with_value_count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/map_index/mod.rs#L88) | -client.deleteFullSnapshot("{snapshot_name}"); +It seems we were able to find some relevant code structures. Let's try the same with the code embeddings: +```python +hits = client.query_points( + "qdrant-sources", + query=models.Document(text=query, model="jinaai/jina-embeddings-v2-base-code"), + using="code", + limit=5, +).points ``` -```rust -use qdrant_client::Qdrant; +Output: -let client = Qdrant::from_url("http://localhost:6334").build()?; +| module | file_name | score | signature | +|---------------|----------------------------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| field_index | geo_index.rs | 0.73278356 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | +| numeric_index | mod.rs | 0.7254976 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | +| map_index | mod.rs | 0.7124739 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | +| map_index | mod.rs | 0.7124739 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L429) | +| fixtures | payload_context_fixture.rs | 0.706204 | [ `fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | -client.delete_full_snapshot("{snapshot_name}").await?; +While the scores retrieved by different models are not comparable, but we can +see that the results are different. 
Code and text embeddings can capture +different aspects of the codebase. We can use both models to query the collection +and then combine the results to get the most relevant code snippets, from a single batch request. -``` +```python +responses = client.query_batch_points( + collection_name="qdrant-sources", + requests=[ + models.QueryRequest( + query=models.Document( + text=query, model="sentence-transformers/all-MiniLM-L6-v2" + ), + using="text", + with_payload=True, + limit=5, + ), + models.QueryRequest( + query=models.Document( + text=query, model="jinaai/jina-embeddings-v2-base-code" + ), + using="code", + with_payload=True, + limit=5, + ), + ], +) -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +results = [response.points for response in responses] +``` -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Output: -client.deleteFullSnapshotAsync("{snapshot_name}").get(); +| module | file_name | score | signature | +|--------------------|----------------------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| toc | point_ops.rs | 0.59448624 | [ `pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/storage/src/content_manager/toc/point_ops.rs#L120) | +| operations | types.rs | 0.5493385 | [ `pub struct CountRequestInternal`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/operations/types.rs#L831) | +| collection_manager | segments_updater.rs | 0.5121002 | [ `pub(crate) fn upsert_points<'a, T>`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection_manager/segments_updater.rs#L339) | +| collection | point_ops.rs | 0.5063539 | [ `pub async fn count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/collection/src/collection/point_ops.rs#L213) | +| map_index | mod.rs | 0.49973983 | [ `fn get_points_with_value_count`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/map_index/mod.rs#L88) | +| field_index | geo_index.rs | 0.73278356 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | +| numeric_index | mod.rs | 0.7254976 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | +| map_index | mod.rs | 0.7124739 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | +| map_index | mod.rs | 0.7124739 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L429) | +| fixtures | payload_context_fixture.rs | 0.706204 | [ `fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | -``` +This is one example of how you can use different models and combine the 
results. +In a real-world scenario, you might run some reranking and deduplication, as +well as additional processing of the results. -```csharp -using Qdrant.Client; +### Code search demo -var client = new QdrantClient("localhost", 6334); +Our [Code search demo](https://code-search.qdrant.tech/) uses the following process: -await client.DeleteFullSnapshotAsync("{snapshot_name}"); +1. The user sends a query. +1. Both models vectorize that query simultaneously. We get two different + vectors. +1. Both vectors are used in parallel to find relevant snippets. We expect + 5 examples from the NLP search and 20 examples from the code search. +1. Once we retrieve results for both vectors, we merge them in one of the + following scenarios: + 1. If both methods return different results, we prefer the results from + the general usage model (NLP). + 1. If there is an overlap between the search results, we merge overlapping + snippets. -``` +In the screenshot, we search for `flush of wal`. The result +shows relevant code, merged from both models. Note the highlighted +code in lines 621-629. It's where both models agree. -```go -import ( - "context" +![Results from both models, with overlap](/documentation/tutorials/code-search/code-search-demo-example.png) - "github.com/qdrant/go-client/qdrant" -) +Now you see semantic code intelligence, in action. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +### Grouping the results -client.DeleteFullSnapshot(context.Background(), "{snapshot_name}") +You can improve the search results, by grouping them by payload properties. +In our case, we can group the results by the module. If we use code embeddings, +we can see multiple results from the `map_index` module. Let's group the +results and assume a single result per module: +```python +results = client.query_points_groups( + collection_name="qdrant-sources", + using="code", + query=models.Document(text=query, model="jinaai/jina-embeddings-v2-base-code"), + group_by="context.module", + limit=5, + group_size=1, +) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#list-full-storage-snapshots) List full storage snapshots +Output: -httppythontypescriptrustjavacsharpgo +| module | file_name | score | signature | +|---------------|----------------------------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| field_index | geo_index.rs | 0.73278356 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/7aa164bd2dda1c0fc9bf3a0da42e656c95c2e52a/lib/segment/src/index/field_index/geo_index.rs#L612) | +| numeric_index | mod.rs | 0.7254976 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/numeric_index/mod.rs#L322) | +| map_index | mod.rs | 0.7124739 | [ `fn count_indexed_points`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/field_index/map_index/mod.rs#L315) | +| fixtures | payload_context_fixture.rs | 0.706204 | [ `fn total_point_count`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/fixtures/payload_context_fixture.rs#L122) | +| hnsw_index | graph_links.rs | 0.6998417 | [ `fn num_points 
`](https://github.com/qdrant/qdrant/blob/3fbe1cae6cb7f51a0c5bb4b45cfe6749ac76ed59/lib/segment/src/index/hnsw_index/graph_links.rs#L477) | -```http -GET /snapshots +With the grouping feature, we get more diverse results. -``` +## Summary -```python -from qdrant_client import QdrantClient +This tutorial demonstrates how to use Qdrant to navigate a codebase. For an +end-to-end implementation, review the [code search +notebook](https://colab.research.google.com/github/qdrant/examples/blob/master/code-search/code-search.ipynb) and the +[code-search-demo](https://github.com/qdrant/demo-code-search). You can also check out [a running version of the code +search demo](https://code-search.qdrant.tech/) which exposes Qdrant codebase for search with a web interface. -client = QdrantClient("localhost", port=6333) +<|page-103-lllmstxt|> +![agentic-rag-crewai-zoom](/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-1.png) -client.list_full_snapshots() +# Agentic RAG With CrewAI & Qdrant Vector Database -``` +| Time: 45 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai) | +| --- | ----------- | ----------- |----------- | -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +By combining the power of Qdrant for vector search and CrewAI for orchestrating modular agents, you can build systems that don't just answer questions but analyze, interpret, and act. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Traditional RAG systems focus on fetching data and generating responses, but they lack the ability to reason deeply or handle multi-step processes. -client.listFullSnapshots(); +In this tutorial, we'll walk you through building an Agentic RAG system step by step. By the end, you'll have a working framework for storing data in a Qdrant Vector Database and extracting insights using CrewAI agents in conjunction with Vector Search over your data. -``` +We already built this app for you. [Clone this repository](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai) and follow along with the tutorial. -```rust -use qdrant_client::Qdrant; +## What You'll Build +In this hands-on tutorial, we'll create a system that: -let client = Qdrant::from_url("http://localhost:6334").build()?; +1. Uses Qdrant to store and retrieve meeting transcripts as vector embeddings +2. Leverages CrewAI agents to analyze and summarize meeting data +3. Presents insights in a simple Streamlit interface for easy interaction -client.list_full_snapshots().await?; +This project demonstrates how to build a Vector Search powered Agentic workflow to extract insights from meeting recordings. By combining Qdrant's vector search capabilities with CrewAI agents, users can search through and analyze their own meeting content. -``` +The application first converts the meeting transcript into vector embeddings and stores them in a Qdrant vector database. It then uses CrewAI agents to query the vector database and extract insights from the meeting content. Finally, it uses Anthropic Claude to generate natural language responses to user queries based on the extracted insights from the vector database. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +### How Does It Work? 
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +When you interact with the system, here's what happens behind the scenes: -client.listFullSnapshotAsync().get(); +First the user submits a query to the system. In this example, we want to find out the average length of Marketing meetings. Since one of the data points from the meetings is the duration of the meeting, the agent can calculate the average duration of the meetings by averaging the duration of all meetings with the keyword "Marketing" in the topic or content. -``` +![User Query Interface](/articles_data/agentic-rag-crewai-zoom/query1.png) -```csharp -using Qdrant.Client; +Next, the agent used the `search_meetings` tool to search the Qdrant vector database for the most semantically similar meeting points. We asked about Marketing meetings, so the agent searched the database with the search meeting tool for all meetings with the keyword "Marketing" in the topic or content. -var client = new QdrantClient("localhost", 6334); +![Vector Search Results](/articles_data/agentic-rag-crewai-zoom/output0.png) -await client.ListFullSnapshotsAsync(); -``` +Next, the agent used the `calculator` tool to find the average duration of the meetings. -```go -import ( - "context" +![Duration Calculation](/articles_data/agentic-rag-crewai-zoom/output.png) - "github.com/qdrant/go-client/qdrant" -) +Finally, the agent used the `Information Synthesizer` tool to synthesize the analysis and present it in a natural language format. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +![Synthesized Analysis](/articles_data/agentic-rag-crewai-zoom/output4.png) -client.ListFullSnapshots(context.Background()) +The user sees the final output in a chat-like interface. -``` +![Chat Interface](/articles_data/agentic-rag-crewai-zoom/app.png) -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#download-full-storage-snapshot) Download full storage snapshot +The user can then continue to interact with the system by asking more questions. -```http -GET /snapshots/{snapshot_name} -``` +### Architecture -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#restore-full-storage-snapshot) Restore full storage snapshot +The system is built on three main components: +- **Qdrant Vector Database**: Stores meeting transcripts and summaries as vector embeddings, enabling semantic search +- **CrewAI Framework**: Coordinates AI agents that handle different aspects of meeting analysis +- **Anthropic Claude**: Provides natural language understanding and response generation -Restoring snapshots can only be done through the Qdrant CLI at startup time. +1. **Data Processing Pipeline** + - Processes meeting transcripts and metadata + - Creates embeddings with SentenceTransformer + - Manages Qdrant collection and data upload -For example: +2. **AI Agent System** + - Implements CrewAI agent logic + - Handles vector search integration + - Processes queries with Claude -```bash -./qdrant --storage-snapshot /snapshots/full-snapshot-2022-07-18-11-20-51.snapshot +3. **User Interface** + - Provides chat-like web interface + - Shows real-time processing feedback + - Maintains conversation history -``` +--- -## [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#storage) Storage +## Getting Started -Created, uploaded and recovered snapshots are stored as `.snapshot` files. 
By -default, they’re stored on the [local file system](https://qdrant.tech/documentation/concepts/snapshots/#local-file-system). You may -also configure to use an [S3 storage](https://qdrant.tech/documentation/concepts/snapshots/#s3) service for them. +![agentic-rag-crewai-zoom](/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-2.png) -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#local-file-system) Local file system +1. **Get API Credentials for Qdrant**: + - Sign up for an account at [Qdrant Cloud](https://cloud.qdrant.io/signup). + - Create a new cluster and copy the **Cluster URL** (format: https://xxx.gcp.cloud.qdrant.io). + - Go to **Data Access Control** and generate an **API key**. -By default, snapshots are stored at `./snapshots` or at `/qdrant/snapshots` when -using our Docker image. +2. **Get API Credentials for AI Services**: + - Get an API key from [Anthropic](https://www.anthropic.com/) + - Get an API key from [OpenAI](https://platform.openai.com/) -The target directory can be controlled through the [configuration](https://qdrant.tech/documentation/guides/configuration/): +--- -```yaml -storage: - # Specify where you want to store snapshots. - snapshots_path: ./snapshots +## Setup +1. **Clone the Repository**: +```bash +git clone https://github.com/qdrant/examples.git +cd agentic_rag_zoom_crewai ``` -Alternatively you may use the environment variable `QDRANT__STORAGE__SNAPSHOTS_PATH=./snapshots`. - -_Available as of v1.3.0_ +2. **Create and Activate a Python Virtual Environment with Python 3.10 for compatibility**: +```bash +python3.10 -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate +``` -While a snapshot is being created, temporary files are placed in the configured -storage directory by default. In case of limited capacity or a slow -network attached disk, you can specify a separate location for temporary files: +3. **Install Dependencies**: +```bash +pip install -r requirements.txt +``` -```yaml -storage: - # Where to store temporary files - temp_path: /tmp +4. **Configure Environment Variables**: +Create a `.env.local` file with: +```bash +openai_api_key=your_openai_key_here +anthropic_api_key=your_anthropic_key_here +qdrant_url=your_qdrant_url_here +qdrant_api_key=your_qdrant_api_key_here ``` -### [Anchor](https://qdrant.tech/documentation/concepts/snapshots/\#s3) S3 +--- -_Available as of v1.10.0_ +## Usage -Rather than storing snapshots on the local file system, you may also configure -to store snapshots in an S3-compatible storage service. To enable this, you must -configure it in the [configuration](https://qdrant.tech/documentation/guides/configuration/) file. +### 1. Process Meeting Data +The [`data_loader.py`](https://github.com/qdrant/examples/blob/master/agentic_rag_zoom_crewai/vector/data_loader.py) script processes meeting data and stores it in Qdrant: -For example, to configure for AWS S3: +```bash +python vector/data_loader.py +``` -```yaml -storage: - snapshots_config: - # Use 's3' to store snapshots on S3 - snapshots_storage: s3 +After this script has run, you should see a new collection in your Qdrant Cloud account called `zoom_recordings`. This collection contains the vector embeddings of the meeting transcripts. The points in the collection contain the original meeting data, including the topic, content, and summary. - s3_config: - # Bucket name - bucket: your_bucket_here +### 2. 
Launch the Interface +The [`streamlit_app.py`](https://github.com/qdrant/examples/blob/master/agentic_rag_zoom_crewai/vector/streamlit_app.py) is located in the `vector` folder. To launch it, run: - # Bucket region (e.g. eu-central-1) - region: your_bucket_region_here +```bash +streamlit run vector/streamlit_app.py +``` +When you run this script, you will be able to interact with the system through a chat-like interface. Ask questions about the meeting content, and the system will use the AI agents to find the most relevant information and present it in a natural language format. - # Storage access key - # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__ACCESS_KEY` environment variable. - access_key: your_access_key_here - # Storage secret key - # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__SECRET_KEY` environment variable. - secret_key: your_secret_key_here +### The Data Pipeline - # S3-Compatible Storage URL - # Can be specified either here or in the `QDRANT__STORAGE__SNAPSHOTS_CONFIG__S3_CONFIG__ENDPOINT_URL` environment variable. - endpoint_url: your_url_here +At the heart of our system is the data processing pipeline: +```python +class MeetingData: + def _initialize(self): + self.data_dir = Path(__file__).parent.parent / 'data' + self.meetings = self._load_meetings() + + self.qdrant_client = QdrantClient( + url=os.getenv('qdrant_url'), + api_key=os.getenv('qdrant_api_key') + ) + self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') ``` +The singleton pattern in data_loader.py is implemented through a MeetingData class that uses Python's __new__ and __init__ methods. The class maintains a private _instance variable to track if an instance exists, and a _initialized flag to ensure the initialization code only runs once. When creating a new instance with MeetingData(), __new__ first checks if _instance exists - if it doesn't, it creates one and sets the initialization flag to False. The __init__ method then checks this flag, and if it's False, runs the initialization code and sets the flag to True. This ensures that all subsequent calls to MeetingData() return the same instance with the same initialized resources. -Apart from Snapshots, Qdrant also provides the [Qdrant Migration Tool](https://github.com/qdrant/migration) that supports: +When processing meetings, we need to consider both the content and context. Each meeting gets converted into a rich text representation before being transformed into a vector: -- Migration between Qdrant Cloud instances. -- Migrating vectors from other providers into Qdrant. -- Migrating from Qdrant OSS to Qdrant Cloud. +```python +text_to_embed = f""" + Topic: {meeting.get('topic', '')} + Content: {meeting.get('vtt_content', '')} + Summary: {json.dumps(meeting.get('summary', {}))} +""" +``` + +This structured format ensures our vector embeddings capture the full context of each meeting. But processing meetings one at a time would be inefficient. Instead, we batch process our data: -Follow our [migration guide](https://qdrant.tech/documentation/database-tutorials/migration/) to learn how to effectively use the Qdrant Migration tool. +```python +batch_size = 100 +for i in range(0, len(points), batch_size): + batch = points[i:i + batch_size] + self.qdrant_client.upsert( + collection_name='zoom_recordings', + points=batch + ) +``` -##### Was this page useful? 
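+
+The tutorial does not show the step that turns each `text_to_embed` string into a Qdrant point before the batched `upsert`. The sketch below fills that gap under stated assumptions: the `upload_meetings` helper, the integer point IDs, and the 384-dimension cosine collection for `all-MiniLM-L6-v2` are illustrative and may differ from the repository's `data_loader.py`.
+
+```python
+import json
+
+from qdrant_client import QdrantClient, models
+from sentence_transformers import SentenceTransformer
+
+
+def upload_meetings(client: QdrantClient, meetings: list[dict]) -> None:
+    """Hypothetical helper: embed each meeting and upsert it into Qdrant."""
+    model = SentenceTransformer('all-MiniLM-L6-v2')  # produces 384-dimensional vectors
+
+    # Create the collection on first use; cosine distance suits sentence embeddings.
+    if not client.collection_exists('zoom_recordings'):
+        client.create_collection(
+            collection_name='zoom_recordings',
+            vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
+        )
+
+    points = []
+    for idx, meeting in enumerate(meetings):
+        # Same rich text representation as above: topic, transcript, and summary.
+        text_to_embed = (
+            f"Topic: {meeting.get('topic', '')}\n"
+            f"Content: {meeting.get('vtt_content', '')}\n"
+            f"Summary: {json.dumps(meeting.get('summary', {}))}"
+        )
+        points.append(
+            models.PointStruct(
+                id=idx,
+                vector=model.encode(text_to_embed).tolist(),
+                payload=meeting,  # keep the original fields so search results carry them back
+            )
+        )
+
+    # Upsert in batches of 100, mirroring the loop shown earlier.
+    batch_size = 100
+    for i in range(0, len(points), batch_size):
+        client.upsert(collection_name='zoom_recordings', points=points[i:i + batch_size])
+```
+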
+### Building the AI Agent System -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Our AI system uses a tool-based approach. Let's start with the simplest tool - a calculator for meeting statistics: -Thank you for your feedback! 🙏 +```python +class CalculatorTool(BaseTool): + name: str = "calculator" + description: str = "Perform basic mathematical calculations" + + def _run(self, a: int, b: int) -> dict: + return { + "addition": a + b, + "multiplication": a * b + } +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/snapshots.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +But the real power comes from our vector search integration. This tool converts natural language queries into vector representations and searches our meeting database: -On this page: +```python +class SearchMeetingsTool(BaseTool): + def _run(self, query: str) -> List[Dict]: + response = openai_client.embeddings.create( + model="text-embedding-ada-002", + input=query + ) + query_vector = response.data[0].embedding + + return self.qdrant_client.search( + collection_name='zoom_recordings', + query_vector=query_vector, + limit=10 + ) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/snapshots.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The search results then feed into our analysis tool, which uses Claude to provide deeper insights: -× +```python +class MeetingAnalysisTool(BaseTool): + def _run(self, meeting_data: dict) -> Dict: + meetings_text = self._format_meetings(meeting_data) + + message = client.messages.create( + model="claude-3-sonnet-20240229", + messages=[{ + "role": "user", + "content": f"Analyze these meetings:\n\n{meetings_text}" + }] + ) +``` -[Powered by](https://qdrant.tech/) +### Orchestrating the Workflow -<|page-71-lllmstxt|> -## examples -- [Documentation](https://qdrant.tech/documentation/) -- Build Prototypes +The magic happens when we bring these tools together under our agent framework. We create two specialized agents: -# [Anchor](https://qdrant.tech/documentation/examples/\#examples) Examples +```python +researcher = Agent( + role='Research Assistant', + goal='Find and analyze relevant information', + tools=[calculator, searcher, analyzer] +) -| End-to-End Code Samples | Description | Stack | -| --- | --- | --- | -| [Multitenancy with LlamaIndex](https://qdrant.tech/documentation/examples/llama-index-multitenancy/) | Handle data coming from multiple users in LlamaIndex. 
| Qdrant, Python, LlamaIndex | -| [Implement custom connector for Cohere RAG](https://qdrant.tech/documentation/examples/cohere-rag-connector/) | Bring data stored in Qdrant to Cohere RAG | Qdrant, Cohere, FastAPI | -| [Chatbot for Interactive Learning](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/) | Build a Private RAG Chatbot for Interactive Learning | Qdrant, Haystack, OpenShift | -| [Information Extraction Engine](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/) | Build a Private RAG Information Extraction Engine | Qdrant, Vultr, DSPy, Ollama | -| [System for Employee Onboarding](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/) | Build a RAG System for Employee Onboarding | Qdrant, Cohere, LangChain | -| [System for Contract Management](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/) | Build a Region-Specific RAG System for Contract Management | Qdrant, Aleph Alpha, STACKIT | -| [Question-Answering System for Customer Support](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/) | Build a RAG System for AI Customer Support | Qdrant, Cohere, Airbyte, AWS | -| [Hybrid Search on PDF Documents](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/) | Develop a Hybrid Search System for Product PDF Manuals | Qdrant, LlamaIndex, Jina AI | -| [Blog-Reading RAG Chatbot](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/) | Develop a RAG-based Chatbot on Scaleway and with LangChain | Qdrant, LangChain, GPT-4o | -| [Movie Recommendation System](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/) | Build a Movie Recommendation System with LlamaIndex and With JinaAI | Qdrant | -| [GraphRAG Agent](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/) | Build a GraphRAG Agent with Neo4J and Qdrant | Qdrant, Neo4j | -| [Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy](https://qdrant.tech/documentation/examples/Qdrant-DSPy-medicalbot/) | How to build a medical chatbot grounded in medical literature with Qdrant and DSPy. | Qdrant, DSPy | - -## [Anchor](https://qdrant.tech/documentation/examples/\#notebooks) Notebooks +synthesizer = Agent( + role='Information Synthesizer', + goal='Create comprehensive and clear responses' +) +``` -Our Notebooks offer complex instructions that are supported with a throrough explanation. Follow along by trying out the code and get the most out of each example. +These agents work together in a coordinated workflow. The researcher gathers and analyzes information, while the synthesizer creates clear, actionable responses. This separation of concerns allows each agent to focus on its strengths. -| Example | Description | Stack | -| --- | --- | --- | -| [Intro to Semantic Search and Recommendations Systems](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_getting_started/getting_started.ipynb) | Learn how to get started building semantic search and recommendation systems. | Qdrant | -| [Search and Recommend Newspaper Articles](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_text_data/qdrant_and_text_data.ipynb) | Work with text data to develop a semantic search and a recommendation engine for news articles. 
| Qdrant | -| [Recommendation System for Songs](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_audio_data/03_qdrant_101_audio.ipynb) | Use Qdrant to develop a music recommendation engine based on audio embeddings. | Qdrant | -| [Image Comparison System for Skin Conditions](https://colab.research.google.com/github/qdrant/examples/blob/master/qdrant_101_image_data/04_qdrant_101_cv.ipynb) | Use Qdrant to compare challenging images with labels representing different skin diseases. | Qdrant | -| [Question and Answer System with LlamaIndex](https://github.com/qdrant/examples/blob/949669f001a03131afebf2ecd1e0ce63cab01c81/llama_index_recency/Qdrant%20and%20LlamaIndex%20%E2%80%94%20A%20new%20way%20to%20keep%20your%20Q%26A%20systems%20up-to-date.ipynb) | Combine Qdrant and LlamaIndex to create a self-updating Q&A system. | Qdrant, LlamaIndex, Cohere | -| [Extractive QA System](https://githubtocolab.com/qdrant/examples/blob/master/extractive_qa/extractive-question-answering.ipynb) | Extract answers directly from context to generate highly relevant answers. | Qdrant | -| [Ecommerce Reverse Image Search](https://githubtocolab.com/qdrant/examples/blob/master/ecommerce_reverse_image_search/ecommerce-reverse-image-search.ipynb) | Accept images as search queries to receive semantically appropriate answers. | Qdrant | -| [Basic RAG](https://githubtocolab.com/qdrant/examples/blob/master/rag-openai-qdrant/rag-openai-qdrant.ipynb) | Basic RAG pipeline with Qdrant and OpenAI SDKs. | OpenAI, Qdrant, FastEmbed | +### Building the User Interface -##### Was this page useful? +The Streamlit interface provides a clean, chat-like experience for interacting with our AI system. Let's start with the basic setup: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +st.set_page_config( + page_title="Meeting Assistant", + page_icon="đŸ€–", + layout="wide" +) +``` -Thank you for your feedback! 🙏 +To make the interface more engaging, we add custom styling that makes the output easier to read: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +st.markdown(""" + +""", unsafe_allow_html=True) +``` -On this page: +One of the key features is real-time feedback during processing. We achieve this with a custom output handler: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python +class ConsoleOutput: + def __init__(self, placeholder): + self.placeholder = placeholder + self.buffer = [] + self.update_interval = 0.5 # seconds + self.last_update = time.time() -× + def write(self, text): + self.buffer.append(text) + if time.time() - self.last_update > self.update_interval: + self._update_display() +``` -[Powered by](https://qdrant.tech/) +This handler buffers the output and updates the display periodically, creating a smooth user experience. 
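+
+The snippet above references a `_update_display` method that the tutorial does not show, and file-like objects passed to `contextlib.redirect_stdout` are usually expected to provide `flush()` as well. A minimal sketch of the complete handler (repeating the two methods shown above for context) could look like this; rendering the buffer with `placeholder.code` is an assumption, not necessarily what the demo app does:
+
+```python
+import time
+
+
+class ConsoleOutput:
+    def __init__(self, placeholder):
+        self.placeholder = placeholder  # e.g. the st.empty() slot reserved for console output
+        self.buffer = []
+        self.update_interval = 0.5  # seconds
+        self.last_update = time.time()
+
+    def write(self, text):
+        self.buffer.append(text)
+        if time.time() - self.last_update > self.update_interval:
+            self._update_display()
+
+    def _update_display(self):
+        # Re-render everything captured so far into the reserved Streamlit placeholder.
+        self.placeholder.code("".join(self.buffer))
+        self.last_update = time.time()
+
+    def flush(self):
+        # Called by print() and redirect_stdout teardown; push whatever is still buffered.
+        self._update_display()
+```
+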
When a user sends a query, we process it with visual feedback: -<|page-72-lllmstxt|> -## scalar-quantization -- [Articles](https://qdrant.tech/articles/) -- Scalar Quantization: Background, Practices & More \| Qdrant +```python +with st.chat_message("assistant"): + message_placeholder = st.empty() + progress_bar = st.progress(0) + console_placeholder = st.empty() + + try: + console_output = ConsoleOutput(console_placeholder) + with contextlib.redirect_stdout(console_output): + progress_bar.progress(0.3) + full_response = get_crew_response(prompt) + progress_bar.progress(1.0) +``` -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +The interface maintains a chat history, making it feel like a natural conversation: -# Scalar Quantization: Background, Practices & More \| Qdrant +```python +if "messages" not in st.session_state: + st.session_state.messages = [] -Kacper Ɓukawski +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) +``` -· +We also include helpful examples and settings in the sidebar: -March 27, 2023 +```python +with st.sidebar: + st.header("Settings") + search_limit = st.slider("Number of results", 1, 10, 5) + + analysis_depth = st.select_slider( + "Analysis Depth", + options=["Basic", "Standard", "Detailed"], + value="Standard" + ) +``` -![Scalar Quantization: Background, Practices & More | Qdrant](https://qdrant.tech/articles_data/scalar-quantization/preview/title.jpg) +This combination of features creates an interface that's both powerful and approachable. Users can see their query being processed in real-time, adjust settings to their needs, and maintain context through the chat history. -# [Anchor](https://qdrant.tech/articles/scalar-quantization/\#efficiency-unleashed-the-power-of-scalar-quantization) Efficiency Unleashed: The Power of Scalar Quantization +--- +## Conclusion -High-dimensional vector embeddings can be memory-intensive, especially when working with -large datasets consisting of millions of vectors. Memory footprint really starts being -a concern when we scale things up. A simple choice of the data type used to store a single -number impacts even billions of numbers and can drive the memory requirements crazy. The -higher the precision of your type, the more accurately you can represent the numbers. -The more accurate your vectors, the more precise is the distance calculation. But the -advantages stop paying off when you need to order more and more memory. +![agentic-rag-crewai-zoom](/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-3.png) -Qdrant chose `float32` as a default type used to store the numbers of your embeddings. -So a single number needs 4 bytes of the memory and a 512-dimensional vector occupies -2 kB. That’s only the memory used to store the vector. There is also an overhead of the -HNSW graph, so as a rule of thumb we estimate the memory size with the following formula: +This tutorial has demonstrated how to build a sophisticated meeting analysis system that combines vector search with AI agents. Let's recap the key components we've covered: -```text -memory_size = 1.5 * number_of_vectors * vector_dimension * 4 bytes +1. **Vector Search Integration** + - Efficient storage and retrieval of meeting content using Qdrant + - Semantic search capabilities through vector embeddings + - Batched processing for optimal performance -``` +2. 
**AI Agent Framework** + - Tool-based approach for modular functionality + - Specialized agents for research and analysis + - Integration with Claude for intelligent insights -While Qdrant offers various options to store some parts of the data on disk, starting -from version 1.1.0, you can also optimize your memory by compressing the embeddings. -We’ve implemented the mechanism of **Scalar Quantization**! It turns out to have not -only a positive impact on memory but also on the performance. +3. **Interactive Interface** + - Real-time feedback and progress tracking + - Persistent chat history + - Configurable search and analysis settings -## [Anchor](https://qdrant.tech/articles/scalar-quantization/\#scalar-quantization) Scalar quantization +The resulting system demonstrates the power of combining vector search with AI agents to create an intelligent meeting assistant. By following this tutorial, you've learned how to: +- Process and store meeting data efficiently +- Implement semantic search capabilities +- Create specialized AI agents for analysis +- Build an intuitive user interface -Scalar quantization is a data compression technique that converts floating point values -into integers. In case of Qdrant `float32` gets converted into `int8`, so a single number -needs 75% less memory. It’s not a simple rounding though! It’s a process that makes that -transformation partially reversible, so we can also revert integers back to floats with -a small loss of precision. +This foundation can be extended in many ways, such as: +- Adding more specialized agents +- Implementing additional analysis tools +- Enhancing the user interface +- Integrating with other data sources -### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#theoretical-background) Theoretical background +The code is available in the [repository](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai), and we encourage you to experiment with your own modifications and improvements. -Assume we have a collection of `float32` vectors and denote a single value as `f32`. -In reality neural embeddings do not cover a whole range represented by the floating -point numbers, but rather a small subrange. Since we know all the other vectors, we can -establish some statistics of all the numbers. For example, the distribution of the values -will be typically normal: +--- -![A distribution of the vector values](https://qdrant.tech/articles_data/scalar-quantization/float32-distribution.png) +<|page-104-lllmstxt|> +# User Management -Our example shows that 99% of the values come from a `[-2.0, 5.0]` range. And the -conversion to `int8` will surely lose some precision, so we rather prefer keeping the -representation accuracy within the range of 99% of the most probable values and ignoring -the precision of the outliers. There might be a different choice of the range width, -actually, any value from a range `[0, 1]`, where `0` means empty range, and `1` would -keep all the values. That’s a hyperparameter of the procedure called `quantile`. A value -of `0.95` or `0.99` is typically a reasonable choice, but in general `quantile ∈ [0, 1]`. +> 💡 You can access this in **Access Management > User & Role Management** *if available see [this page for details](/documentation/cloud-rbac/).* -#### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#conversion-to-integers) Conversion to integers +## Inviting Users to an Account -Let’s talk about the conversion to `int8`. 
Integers also have a finite set of values that -might be represented. Within a single byte they may represent up to 256 different values, -either from `[-128, 127]` or `[0, 255]`. +Users can be invited via the **User Management** section, where they are assigned the **Base role** by default. Additionally, users have the option to select a specific role when inviting another user. The **Base role** is a predefined role with minimal permissions, granting users access to the platform while restricting them to viewing only their own profile. -![Value ranges represented by int8](https://qdrant.tech/articles_data/scalar-quantization/int8-value-range.png) +![image.png](/documentation/cloud/role-based-access-control/user-invitation.png) -Since we put some boundaries on the numbers that might be represented by the `f32`, and -`i8` has some natural boundaries, the process of converting the values between those -two ranges is quite natural: +### Inviting Users from a Role -f32=α×i8+offset +Users can be invited attached to a specific role by inviting them through the **Role Details** page - just click on the Users tab and follow the prompts. -i8=f32−offsetα +Once accepted, they'll be assigned that role's permissions, along with the base role. -The parameters α and offset has to be calculated for a given set of vectors, -but that comes easily by putting the minimum and maximum of the represented range for -both `f32` and `i8`. +![image.png](/documentation/cloud/role-based-access-control/invite-user.png) -![Float32 to int8 conversion](https://qdrant.tech/articles_data/scalar-quantization/float32-to-int8-conversion.png) +### Revoking an Invitation -For the unsigned `int8` it will go as following: +Before being accepted, an Admin/Owner can cancel a pending invite directly on either the **User Management** or **Role Details** page. -{−2=α×0+offset5=α×255+offset +![image.png](/documentation/cloud/role-based-access-control/revoke-invite.png) -In case of signed `int8`, we’ll just change the represented range boundaries: +## Updating a User’s Roles -{−2=α×(−128)+offset5=α×127+offset +Authorized users can give or take away roles from users in **User Management**. -For any set of vector values we can simply calculate the α and offset and -those values have to be stored along with the collection to enable to conversion between -the types. +![image.png](/documentation/cloud/role-based-access-control/update-user-role.png) -#### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#distance-calculation) Distance calculation +![image.png](/documentation/cloud/role-based-access-control/update-user-role-edit-dialog.png) -We do not store the vectors in the collections represented by `int8` instead of `float32` -just for the sake of compressing the memory. But the coordinates are being used while we -calculate the distance between the vectors. Both dot product and cosine distance requires -multiplying the corresponding coordinates of two vectors, so that’s the operation we -perform quite often on `float32`. Here is how it would look like if we perform the -conversion to `int8`: +## Removing a User from an Account -f32×f32â€Č==(α×i8+offset)×(α×i8â€Č+offset)==α2×i8×i8â€Č+offset×α×i8â€Č+offset×α×i8+offset2⏟pre-compute +Users can be removed from an account by clicking on their name in either **User Management** (via Actions). This option is only available after they've accepted the invitation to join, ensuring that only active users can be removed. 
-The first term, α2×i8×i8â€Č has to be calculated when we measure the -distance as it depends on both vectors. However, both the second and the third term -(offset×α×i8â€Č and offset×α×i8 respectively), -depend only on a single vector and those might be precomputed and kept for each vector. -The last term, offset2 does not depend on any of the values, so it might be even -computed once and reused. +![image.png](/documentation/cloud/role-based-access-control/remove-user.png) -If we had to calculate all the terms to measure the distance, the performance could have -been even worse than without the conversion. But thanks for the fact we can precompute -the majority of the terms, things are getting simpler. And in turns out the scalar -quantization has a positive impact not only on the memory usage, but also on the -performance. As usual, we performed some benchmarks to support this statement! +<|page-105-lllmstxt|> +# Agentic RAG With LangGraph and Qdrant -## [Anchor](https://qdrant.tech/articles/scalar-quantization/\#benchmarks) Benchmarks +Traditional Retrieval-Augmented Generation (RAG) systems follow a straightforward path: query → retrieve → generate. Sure, this works well for many scenarios. But let’s face it—this linear approach often struggles when you're dealing with complex queries that demand multiple steps or pulling together diverse types of information. -We simply used the same approach as we use in all [the other benchmarks we publish](https://qdrant.tech/benchmarks/). -Both [Arxiv-titles-384-angular-no-filters](https://github.com/qdrant/ann-filtering-benchmark-datasets) -and [Gist-960](https://github.com/erikbern/ann-benchmarks/) datasets were chosen to make -the comparison between non-quantized and quantized vectors. The results are summarized -in the tables: +[Agentic RAG](https://qdrant.tech/articles/agentic-rag/) takes things up a notch by introducing AI agents that can orchestrate multiple retrieval steps and smartly decide how to gather and use the information you need. Think of it this way: in an Agentic RAG workflow, RAG becomes just one powerful tool in a much bigger and more versatile toolkit. -#### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#arxiv-titles-384-angular-no-filters) Arxiv-titles-384-angular-no-filters +By combining LangGraph’s robust state management with Qdrant’s cutting-edge vector search, we’ll build a system that doesn’t just answer questions—it tackles complex, multi-step information retrieval tasks with finesse. -| | ef = 128 | ef = 256 | ef = 512 | -| --- | --- | --- | --- | -| | Upload and indexing time | Mean search precision | Mean search time | Mean search precision | Mean search time | Mean search precision | Mean search time | -| --- | --- | --- | --- | --- | --- | --- | --- | -| Non-quantized vectors | 649 s | 0.989 | 0.0094 | 0.994 | 0.0932 | 0.996 | 0.161 | -| Scalar Quantization | 496 s | 0.986 | 0.0037 | 0.993 | 0.060 | 0.996 | 0.115 | -| Difference | -23.57% | -0.3% | -60.64% | -0.1% | -35.62% | 0% | -28.57% | - -A slight decrease in search precision results in a considerable improvement in the -latency. Unless you aim for the highest precision possible, you should not notice the -difference in your search quality. +## What We’ll Build -#### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#gist-960) Gist-960 +We’re building an AI agent to answer questions about Hugging Face and Transformers documentation using LangGraph. At the heart of our AI agent lies LangGraph, which acts like a conductor in an orchestra. 
It directs the flow between various components—deciding when to retrieve information, when to perform a web search, and when to generate responses. -| | ef = 128 | ef = 256 | ef = 512 | -| --- | --- | --- | --- | -| | Upload and indexing time | Mean search precision | Mean search time | Mean search precision | Mean search time | Mean search precision | Mean search time | -| --- | --- | --- | --- | --- | --- | --- | --- | -| Non-quantized vectors | 452 | 0.802 | 0.077 | 0.887 | 0.135 | 0.941 | 0.231 | -| Scalar Quantization | 312 | 0.802 | 0.043 | 0.888 | 0.077 | 0.941 | 0.135 | -| Difference | -30.79% | 0% | -44,16% | +0.11% | -42.96% | 0% | -41,56% | +The components are: two Qdrant vector stores and the Brave web search engine. However, our agent doesn’t just blindly follow one path. Instead, it evaluates each query and decides whether to tap into the first vector store, the second one, or search the web. -In all the cases, the decrease in search precision is negligible, but we keep a latency -reduction of at least 28.57%, even up to 60,64%, while searching. As a rule of thumb, -the higher the dimensionality of the vectors, the lower the precision loss. +This selective approach gives your system the flexibility to choose the best data source for the job, rather than being locked into the same retrieval process every time, like traditional RAG. While we won’t dive into query refinement in this tutorial, the concepts you’ll learn here are a solid foundation for adding that functionality down the line. -### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#oversampling-and-rescoring) Oversampling and rescoring +## Workflow -A distinctive feature of the Qdrant architecture is the ability to combine the search for quantized and original vectors in a single query. -This enables the best combination of speed, accuracy, and RAM usage. +![image1](/documentation/examples/agentic-rag-langgraph/image1.png) -Qdrant stores the original vectors, so it is possible to rescore the top-k results with -the original vectors after doing the neighbours search in quantized space. That obviously -has some impact on the performance, but in order to measure how big it is, we made the -comparison in different search scenarios. -We used a machine with a very slow network-mounted disk and tested the following scenarios with different amounts of allowed RAM: +| **Step** | **Description** | +|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **1. User Input** | You start by entering a query or request through an interface, like a chatbot or a web form. This query is sent straight to the AI Agent, the brain of the operation.| +| **2. AI Agent Processes the Query** | The AI Agent analyzes your query, figuring out what you’re asking and which tools or data sources will best answer your question. | +| **3. Tool Selection** | Based on its analysis, the AI Agent picks the right tool for the job. Your data is spread across two vector databases, and depending on the query, it chooses the appropriate one. For queries needing real-time or external web data, the agent taps into a web search tool powered by BraveSearchAPI. | +| **4. Query Execution** | The AI Agent then puts its chosen tool to work:
- **RAG Tool 1** queries Vector Database 1. <br> - **RAG Tool 2** queries Vector Database 2. <br> - **Web Search Tool** dives into the internet using the search API. |
+| **5. Data Retrieval** | The results roll in: <br> - Vector Database 1 and 2 return the most relevant documents for your query. <br>
- The Web Search Tool provides up-to-date or external information.| +| **6. Response Generation** | Using a text generation model (like GPT), the AI Agent crafts a detailed and accurate response tailored to your query. | +| **7. User Response** | The polished response is sent back to you through the interface, ready to use. | -| Setup | RPS | Precision | -| --- | --- | --- | -| 4.5GB memory | 600 | 0.99 | -| 4.5GB memory + SQ + rescore | 1000 | 0.989 | +## The Stack -And another group with more strict memory limits: +The architecture taps into cutting-edge tools to power efficient Agentic RAG workflows. Here’s a quick overview of its components and the technologies you’ll need: -| Setup | RPS | Precision | -| --- | --- | --- | -| 2GB memory | 2 | 0.99 | -| 2GB memory + SQ + rescore | 30 | 0.989 | -| 2GB memory + SQ + no rescore | 1200 | 0.974 | +- **AI Agent:** The mastermind of the system, this agent parses your queries, picks the right tools, and integrates the responses. We’ll use OpenAI’s *gpt-4o* as the reasoning engine, managed seamlessly by LangGraph. +- **Embedding:** Queries are transformed into vector embeddings using OpenAI’s *text-embedding-3-small* model. +- **Vector Database:** Embeddings are stored and used for similarity searches, with Qdrant stepping in as our database of choice. +- **LLM:** Responses are generated using OpenAI’s *gpt-4o*, ensuring answers are accurate and contextually grounded. +- **Search Tools:** To extend RAG’s capabilities, we’ve added a web search component powered by BraveSearchAPI, perfect for real-time and external data retrieval. +- **Workflow Management:** The entire orchestration and decision-making flow is built with LangGraph, providing the flexibility and intelligence needed to handle complex workflows. -In those experiments, throughput was mainly defined by the number of disk reads, and quantization efficiently reduces it by allowing more vectors in RAM. -Read more about on-disk storage in Qdrant and how we measure its performance in our article: [Minimal RAM you need to serve a million vectors](https://qdrant.tech/articles/memory-consumption/). +Ready to start building this system from the ground up? Let’s get to it! -The mechanism of Scalar Quantization with rescoring disabled pushes the limits of low-end -machines even further. It seems like handling lots of requests does not require an -expensive setup if you can agree to a small decrease in the search precision. +## Implementation -### [Anchor](https://qdrant.tech/articles/scalar-quantization/\#accessing-best-practices) Accessing best practices +Before we dive into building our agent, let’s get everything set up. -Qdrant documentation on [Scalar Quantization](https://qdrant.tech/documentation/quantization/#setting-up-quantization-in-qdrant) -is a great resource describing different scenarios and strategies to achieve up to 4x -lower memory footprint and even up to 2x performance increase. +### Imports -##### Was this page useful? 
+Here’s a list of key imports required: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +import os +import json +from typing import Annotated, TypedDict +from dotenv import load_dotenv +from langchain.embeddings import OpenAIEmbeddings +from langgraph import StateGraph, tool, ToolNode, ToolMessage +from langchain.document_loaders import HuggingFaceDatasetLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.llms import ChatOpenAI +from qdrant_client import QdrantClient +from qdrant_client.http.models import VectorParams +from brave_search import BraveSearch +``` -Thank you for your feedback! 🙏 +### Qdrant Vector Database Setup -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/scalar-quantization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +We’ll use **Qdrant Cloud** as our vector store for document embeddings. Here’s how to set it up: -On this page: +| **Step** | **Description** | +|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **1. Create an Account** | If you don’t already have one, head to Qdrant Cloud and sign up. | +| **2. Set Up a Cluster** | Log in to your account and find the **Create New Cluster** button on the dashboard. Follow the prompts to configure:
- Select your **preferred region**. <br> - Choose the **free tier** for testing. |
+| **3. Secure Your Details** | Once your cluster is ready, note these details: <br> - **Cluster URL** (e.g., https://xxx-xxx-xxx.aws.cloud.qdrant.io) <br>
- **API Key** | -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/scalar-quantization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Save these securely for future use! -× +### OpenAI API Configuration -[Powered by](https://qdrant.tech/) +Your OpenAI API key will power both embedding generation and language model interactions. Visit [OpenAI's platform](https://platform.openai.com/) and sign up for an account. In the API section of your dashboard, create a new API key. We'll use the text-embedding-3-small model for embeddings and GPT-4 as the language model. -<|page-73-lllmstxt|> -## faq-question-answering -- [Articles](https://qdrant.tech/articles/) -- Q&A with Similarity Learning +### Brave Search -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +To enhance search capabilities, we’ll integrate Brave Search. Visit the [Brave API](https://api.search.brave.com/) and complete their API access request process to obtain an API key. This key will enable web search functionality for our agent. -# Q&A with Similarity Learning +For added security, store all API keys in a .env file. -George Panchuk +```json +OPENAI_API_KEY = +QDRANT_KEY = +QDRANT_URL = +BRAVE_API_KEY = +``` -· +--- -June 28, 2022 +Then load the environment variables: -![Q&A with Similarity Learning](https://qdrant.tech/articles_data/faq-question-answering/preview/title.jpg) +```python +load_dotenv() +qdrant_key = os.getenv("QDRANT_KEY") +qdrant_url = os.getenv("QDRANT_URL") +brave_key = os.getenv("BRAVE_API_KEY") +``` -# [Anchor](https://qdrant.tech/articles/faq-question-answering/\#question-answering-system-with-similarity-learning-and-quaterion) Question-answering system with Similarity Learning and Quaterion +--- -Many problems in modern machine learning are approached as classification tasks. -Some are the classification tasks by design, but others are artificially transformed into such. -And when you try to apply an approach, which does not naturally fit your problem, you risk coming up with over-complicated or bulky solutions. -In some cases, you would even get worse performance. +### Document Processing -Imagine that you got a new task and decided to solve it with a good old classification approach. -Firstly, you will need labeled data. -If it came on a plate with the task, you’re lucky, but if it didn’t, you might need to label it manually. -And I guess you are already familiar with how painful it might be. +Before we can create our agent, we need to process and store the documentation. We’ll be working with two datasets from Hugging Face: their general documentation and Transformers-specific documentation. -Assuming you somehow labeled all required data and trained a model. -It shows good performance - well done! -But a day later, your manager told you about a bunch of new data with new classes, which your model has to handle. -You repeat your pipeline. -Then, two days later, you’ve been reached out one more time. -You need to update the model again, and again, and again. -Sounds tedious and expensive for me, does not it for you? 
+Here’s our document preprocessing function: -## [Anchor](https://qdrant.tech/articles/faq-question-answering/\#automating-customer-support) Automating customer support +```python +def preprocess_dataset(docs_list): + text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=700, + chunk_overlap=50, + disallowed_special=() + ) + doc_splits = text_splitter.split_documents(docs_list) + return doc_splits +``` -Let’s now take a look at the concrete example. There is a pressing problem with automating customer support. -The service should be capable of answering user questions and retrieving relevant articles from the documentation without any human involvement. +--- -With the classification approach, you need to build a hierarchy of classification models to determine the question’s topic. -You have to collect and label a whole custom dataset of your private documentation topics to train that. -And then, each time you have a new topic in your documentation, you have to re-train the whole pile of classifiers with additionally labeled data. -Can we make it easier? +This function processes our documents by splitting them into manageable chunks, ensuring important context is preserved at the chunk boundaries through overlap. We’ll use the HuggingFaceDatasetLoader to load the datasets into Hugging Face documents. -## [Anchor](https://qdrant.tech/articles/faq-question-answering/\#similarity-option) Similarity option +```python +hugging_face_doc = HuggingFaceDatasetLoader("m-ric/huggingface_doc","text") +transformers_doc = HuggingFaceDatasetLoader("m-ric/transformers_documentation_en","text") +``` +--- -One of the possible alternatives is Similarity Learning, which we are going to discuss in this article. -It suggests getting rid of the classes and making decisions based on the similarity between objects instead. -To do it quickly, we would need some intermediate representation - embeddings. -Embeddings are high-dimensional vectors with semantic information accumulated in them. +In this demo, we are selecting the first 50 documents from the dataset and passing them to the processing function. -As embeddings are vectors, one can apply a simple function to calculate the similarity score between them, for example, cosine or euclidean distance. -So with similarity learning, all we need to do is provide pairs of correct questions and answers. -And then, the model will learn to distinguish proper answers by the similarity of embeddings. +```python +hf_splits = preprocess_dataset(hugging_face_doc.load()[:number_of_docs]) +transformer_splits = preprocess_dataset(transformers_doc.load()[:number_of_docs]) +``` +--- -> If you want to learn more about similarity learning and applications, check out this [article](https://qdrant.tech/documentation/tutorials/neural-search/) which might be an asset. +Our splits are ready. Let’s create a collection in Qdrant to store them. -## [Anchor](https://qdrant.tech/articles/faq-question-answering/\#lets-build) Let’s build +### Defining the State -Similarity learning approach seems a lot simpler than classification in this case, and if you have some -doubts on your mind, let me dispel them. +In LangGraph, a **state** refers to the data or information stored and maintained at a specific point during the execution of a process or a series of operations. States capture the intermediate or final results that the system needs to keep track of to manage and control the flow of tasks, -As I have no any resource with exhaustive F.A.Q. 
which might serve as a dataset, I’ve scrapped it from sites of popular cloud providers. -The dataset consists of just 8.5k pairs of question and answers, you can take a closer look at it [here](https://github.com/qdrant/demo-cloud-faq). +LangGraph works with a state-based system. We define our state like this: -Once we have data, we need to obtain embeddings for it. -It is not a novel technique in NLP to represent texts as embeddings. -There are plenty of algorithms and models to calculate them. -You could have heard of Word2Vec, GloVe, ELMo, BERT, all these models can provide text embeddings. +```python +class State(TypedDict): + messages: Annotated[list, add_messages] +``` +--- -However, it is better to produce embeddings with a model trained for semantic similarity tasks. -For instance, we can find such models at [sentence-transformers](https://www.sbert.net/docs/pretrained_models.html). -Authors claim that `all-mpnet-base-v2` provides the best quality, but let’s pick `all-MiniLM-L6-v2` for our tutorial -as it is 5x faster and still offers good results. +Let’s build our tools. -Having all this, we can test our approach. We won’t take all our dataset at the moment, but only -a part of it. To measure model’s performance we will use two metrics - -[mean reciprocal rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) and -[precision@1](https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Precision_at_k). -We have a [ready script](https://github.com/qdrant/demo-cloud-faq/blob/experiments/faq/baseline.py) -for this experiment, let’s just launch it now. +### Building the Tools -| precision@1 | reciprocal\_rank | -| --- | --- | -| 0.564 | 0.663 | +Our agent is equipped with three powerful tools: -That’s already quite decent quality, but maybe we can do better? +1. **Hugging Face Documentation Retriever** +2. **Transformers Documentation Retriever** +3. **Web Search Tool** -## [Anchor](https://qdrant.tech/articles/faq-question-answering/\#improving-results-with-fine-tuning) Improving results with fine-tuning +Let’s start by defining a retriever that takes documents and a collection name, then returns a retriever. The query is transformed into vectors using **OpenAIEmbeddings**. -Actually, we can! Model we used has a good natural language understanding, but it has never seen -our data. An approach called `fine-tuning` might be helpful to overcome this issue. With -fine-tuning you don’t need to design a task-specific architecture, but take a model pre-trained on -another task, apply a couple of layers on top and train its parameters. +```python +def create_retriever(collection_name, doc_splits): + vectorstore = QdrantVectorStore.from_documents( + doc_splits, + OpenAIEmbeddings(model="text-embedding-3-small"), + url=qdrant_url, + api_key=qdrant_key, + collection_name=collection_name, + ) + return vectorstore.as_retriever() +``` -Sounds good, but as similarity learning is not as common as classification, it might be a bit inconvenient to fine-tune a model with traditional tools. -For this reason we will use [Quaterion](https://github.com/qdrant/quaterion) \- a framework for fine-tuning similarity learning models. -Let’s see how we can train models with it +--- -First, create our project and call it `faq`. +Both the Hugging Face documentation retriever and the Transformers documentation retriever use this same function. With this setup, it’s incredibly simple to create separate tools for each. 
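+The `hf_retriever` and `transformer_retriever` objects used below are not shown being created here; presumably they come from the `create_retriever` helper defined above, roughly like this (the collection names are an assumption):
+
+```python
+# Assumed instantiation of the two retrievers; collection names are illustrative.
+hf_retriever = create_retriever("hf_documentation", hf_splits)
+transformer_retriever = create_retriever("transformers_documentation", transformer_splits)
+```
+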
-> All project dependencies, utils scripts not covered in the tutorial can be found in the -> [repository](https://github.com/qdrant/demo-cloud-faq/tree/tutorial). +```python +hf_retriever_tool = create_retriever_tool( + hf_retriever, + "retriever_hugging_face_documentation", + "Search and return information about hugging face documentation, it includes the guide and Python code.", +) -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#configure-training) Configure training +transformer_retriever_tool = create_retriever_tool( + transformer_retriever, + "retriever_transformer", + "Search and return information specifically about transformers library", +) +``` -The main entity in Quaterion is [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html). -This class makes model’s building process fast and convenient. +--- -`TrainableModel` is a wrapper around [pytorch\_lightning.LightningModule](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html). +For web search, we create a simple yet effective tool using Brave Search: -[Lightning](https://www.pytorchlightning.ai/) handles all the training process complexities, like training loop, device managing, etc. and saves user from a necessity to implement all this routine manually. -Also Lightning’s modularity is worth to be mentioned. -It improves separation of responsibilities, makes code more readable, robust and easy to write. -All these features make Pytorch Lightning a perfect training backend for Quaterion. +```python +@tool("web_search_tool") +def search_tool(query): + search = BraveSearch.from_api_key(api_key=brave_key, search_kwargs={"count": 3}) + return search.run(query) +``` -To use `TrainableModel` you need to inherit your model class from it. -The same way you would use `LightningModule` in pure `pytorch_lightning`. -Mandatory methods are `configure_loss`, `configure_encoders`, `configure_head`, -`configure_optimizers`. +--- -The majority of mentioned methods are quite easy to implement, you’ll probably just need a couple of -imports to do that. But `configure_encoders` requires some code:) +The search_tool function leverages the BraveSearch API to perform a search. It takes a query, retrieves the top 3 search results using the API key, and returns the results. -Let’s create a `model.py` with model’s template and a placeholder for `configure_encoders` -for the moment. +Next, we’ll set up and integrate our tools with a language model: ```python -from typing import Union, Dict, Optional +tools = [hf_retriever_tool, transformer_retriever_tool, search_tool] -from torch.optim import Adam +tool_node = ToolNode(tools=tools) -from quaterion import TrainableModel -from quaterion.loss import MultipleNegativesRankingLoss, SimilarityLoss -from quaterion_models.encoders import Encoder -from quaterion_models.heads import EncoderHead -from quaterion_models.heads.skip_connection_head import SkipConnectionHead +llm = ChatOpenAI(model="gpt-4o", temperature=0) -class FAQModel(TrainableModel): - def __init__(self, lr=10e-5, *args, **kwargs): - self.lr = lr - super().__init__(*args, **kwargs) +llm_with_tools = llm.bind_tools(tools) +``` - def configure_optimizers(self): - return Adam(self.model.parameters(), lr=self.lr) +--- - def configure_loss(self) -> SimilarityLoss: - return MultipleNegativesRankingLoss(symmetric=True) +Here, the ToolNode class handles and orchestrates our tools: - def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: - ... 
# ToDo +```python +class ToolNode: + def __init__(self, tools: list) -> None: + self.tools_by_name = {tool.name: tool for tool in tools} - def configure_head(self, input_embedding_size: int) -> EncoderHead: - return SkipConnectionHead(input_embedding_size) + def __call__(self, inputs: dict): + if messages := inputs.get("messages", []): + message = messages[-1] + else: + raise ValueError("No message found in input") + + outputs = [] + for tool_call in message.tool_calls: + tool_result = self.tools_by_name[tool_call["name"]].invoke( + tool_call["args"] + ) + outputs.append( + ToolMessage( + content=json.dumps(tool_result), + name=tool_call["name"], + tool_call_id=tool_call["id"], + ) + ) + return {"messages": outputs} ``` -- `configure_optimizers` is a method provided by Lightning. An eagle-eye of you could notice -mysterious `self.model`, it is actually a [SimilarityModel](https://quaterion-models.qdrant.tech/quaterion_models.model.html) instance. We will cover it later. -- `configure_loss` is a loss function to be used during training. You can choose a ready-made implementation from Quaterion. -However, since Quaterion’s purpose is not to cover all possible losses, or other entities and -features of similarity learning, but to provide a convenient framework to build and use such models, -there might not be a desired loss. In this case it is possible to use [PytorchMetricLearningWrapper](https://quaterion.qdrant.tech/quaterion.loss.extras.pytorch_metric_learning_wrapper.html) -to bring required loss from [pytorch-metric-learning](https://kevinmusgrave.github.io/pytorch-metric-learning/) library, which has a rich collection of losses. -You can also implement a custom loss yourself. -- `configure_head` \- model built via Quaterion is a combination of encoders and a top layer - head. -As with losses, some head implementations are provided. They can be found at [quaterion\_models.heads](https://quaterion-models.qdrant.tech/quaterion_models.heads.html). +--- -At our example we use [MultipleNegativesRankingLoss](https://quaterion.qdrant.tech/quaterion.loss.multiple_negatives_ranking_loss.html). -This loss is especially good for training retrieval tasks. -It assumes that we pass only positive pairs (similar objects) and considers all other objects as negative examples. +The ToolNode class handles tool execution by initializing a list of tools and mapping tool names to their corresponding functions. It processes input dictionaries, extracts the last message, and checks for tool_calls from LLM tool-calling capability providers such as Anthropic, OpenAI, and others. -`MultipleNegativesRankingLoss` use cosine to measure distance under the hood, but it is a configurable parameter. -Quaterion provides implementation for other distances as well. You can find available ones at [quaterion.distances](https://quaterion.qdrant.tech/quaterion.distances.html). +### Routing and Decision Making -Now we can come back to `configure_encoders`:) +Our agent needs to determine when to use tools and when to end the cycle. This decision is managed by the routing function: -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#configure-encoder) Configure Encoder +```python +def route(state: State): + if isinstance(state, list): + ai_message = state[-1] + elif messages := state.get("messages", []): + ai_message = messages[-1] + else: + raise ValueError(f"No messages found in input state to tool_edge: {state}") -The encoder task is to convert objects into embeddings. 
-They usually take advantage of some pre-trained models, in our case `all-MiniLM-L6-v2` from `sentence-transformers`. -In order to use it in Quaterion, we need to create a wrapper inherited from the [Encoder](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html) class. + if hasattr(ai_message, "tool_calls") and len(ai_message.tool_calls) > 0: + return "tools" -Let’s create our encoder in `encoder.py` + return END +``` -```python -import os +--- -from torch import Tensor, nn -from sentence_transformers.models import Transformer, Pooling +## Putting It All Together: The Graph -from quaterion_models.encoders import Encoder -from quaterion_models.types import TensorInterchange, CollateFnType +Finally, we’ll construct the graph that ties everything together: -class FAQEncoder(Encoder): - def __init__(self, transformer, pooling): - super().__init__() - self.transformer = transformer - self.pooling = pooling - self.encoder = nn.Sequential(self.transformer, self.pooling) +```python +graph_builder = StateGraph(State) - @property - def trainable(self) -> bool: - # Defines if we want to train encoder itself, or head layer only - return False +graph_builder.add_node("agent", agent) +graph_builder.add_node("tools", tool_node) - @property - def embedding_size(self) -> int: - return self.transformer.get_word_embedding_dimension() +graph_builder.add_conditional_edges( + "agent", + route, + {"tools": "tools", END: END}, +) - def forward(self, batch: TensorInterchange) -> Tensor: - return self.encoder(batch)["sentence_embedding"] +graph_builder.add_edge("tools", "agent") +graph_builder.add_edge(START, "agent") +``` - def get_collate_fn(self) -> CollateFnType: - return self.transformer.tokenize +--- - @staticmethod - def _transformer_path(path: str): - return os.path.join(path, "transformer") +This is what the graph looks like: - @staticmethod - def _pooling_path(path: str): - return os.path.join(path, "pooling") +![image2](/documentation/examples/agentic-rag-langgraph/image2.jpg) - def save(self, output_path: str): - transformer_path = self._transformer_path(output_path) - os.makedirs(transformer_path, exist_ok=True) - pooling_path = self._pooling_path(output_path) - os.makedirs(pooling_path, exist_ok=True) - self.transformer.save(transformer_path) - self.pooling.save(pooling_path) +Fig. 3: Agentic RAG with LangGraph - @classmethod - def load(cls, input_path: str) -> Encoder: - transformer = Transformer.load(cls._transformer_path(input_path)) - pooling = Pooling.load(cls._pooling_path(input_path)) - return cls(transformer=transformer, pooling=pooling) +### Running the Agent + +With everything set up, we can run our agent using a simple function: +```python +def run_agent(user_input: str): + for event in graph.stream({"messages": [("user", user_input)]}): + for value in event.values(): + print("Assistant:", value["messages"][-1].content) ``` -As you can notice, there are more methods implemented, then we’ve already discussed. Let’s go -through them now! +--- + +Now, you’re ready to ask questions about Hugging Face and Transformers! Our agent will intelligently combine information from the documentation with web search results when needed. -- In `__init__` we register our pre-trained layers, similar as you do in [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) descendant. +For example, you can ask: -- `trainable` defines whether current `Encoder` layers should be updated during training or not. If `trainable=False`, then all layers will be frozen. 
+```txt +In the Transformers library, are there any multilingual models? +``` -- `embedding_size` is a size of encoder’s output, it is required for proper `head` configuration. +The agent will dive into the Transformers documentation, extract relevant details about multilingual models, and deliver a clear, comprehensive answer. -- `get_collate_fn` is a tricky one. Here you should return a method which prepares a batch of raw -data into the input, suitable for the encoder. If `get_collate_fn` is not overridden, then the [default\_collate](https://pytorch.org/docs/stable/data.html#torch.utils.data.default_collate) will be used. +Here’s what the response might look like: +```txt +Yes, the Transformers library includes several multilingual models. Here are some examples: -The remaining methods are considered self-describing. +BERT Multilingual: +Models like `bert-base-multilingual-uncased` can be used just like monolingual models. -As our encoder is ready, we now are able to fill `configure_encoders`. -Just insert the following code into `model.py`: +XLM (Cross-lingual Language Model): +Models like `xlm-mlm-ende-1024` (English-German), `xlm-mlm-enfr-1024` (English-French), and others use language embeddings to specify the language used at inference. -```python -... -from sentence_transformers import SentenceTransformer -from sentence_transformers.models import Transformer, Pooling -from faq.encoder import FAQEncoder +M2M100: +Models like `facebook/m2m100_418M` and `facebook/m2m100_1.2B` are used for multilingual translation. -class FAQModel(TrainableModel): - ... - def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: - pre_trained_model = SentenceTransformer("all-MiniLM-L6-v2") - transformer: Transformer = pre_trained_model[0] - pooling: Pooling = pre_trained_model[1] - encoder = FAQEncoder(transformer, pooling) - return encoder +MBart: +Models like `facebook/mbart-large-50-one-to-many-mmt` and `facebook/mbart-large-50-many-to-many-mmt` are used for multilingual machine translation across 50 languages. +These models are designed to handle multiple languages and can be used for tasks like translation, classification, and more. ``` -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#data-preparation) Data preparation +--- -Okay, we have raw data and a trainable model. But we don’t know yet how to feed this data to our model. +## Conclusion -Currently, Quaterion takes two types of similarity representation - pairs and groups. +We’ve successfully implemented Agentic RAG. But this is just the beginning—there’s plenty more you can explore to take your system to the next level. -The groups format assumes that all objects split into groups of similar objects. All objects inside -one group are similar, and all other objects outside this group considered dissimilar to them. +Agentic RAG is transforming how businesses connect data sources with AI, enabling smarter and more dynamic interactions. In this tutorial, you’ve learned how to build an Agentic RAG system that combines the power of LangGraph, Qdrant, and web search into one seamless workflow. -But in the case of pairs, we can only assume similarity between explicitly specified pairs of objects. +This system doesn’t just stop at retrieving relevant information from Hugging Face and Transformers documentation. It also smartly falls back to web search when needed, ensuring no query goes unanswered. 
With Qdrant as the vector database backbone, you get fast, scalable semantic search that excels at retrieving precise information—even from massive datasets. -We can apply any of the approaches with our data, but pairs one seems more intuitive. +To truly grasp the potential of this approach, why not apply these concepts to your own projects? Customize the template we’ve shared to fit your unique use case, and unlock the full potential of Agentic RAG for your business needs. The possibilities are endless. -The format in which Similarity is represented determines which loss can be used. -For example, _ContrastiveLoss_ and _MultipleNegativesRankingLoss_ works with pairs format. +<|page-106-lllmstxt|> +# Use Collaborative Filtering to Build a Movie Recommendation System with Qdrant -[SimilarityPairSample](https://quaterion.qdrant.tech/quaterion.dataset.similarity_samples.html#quaterion.dataset.similarity_samples.SimilarityPairSample) could be used to represent pairs. -Let’s take a look at it: +| Time: 45 min | Level: Intermediate | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb) | | +|--------------|---------------------|--|----| -```python -@dataclass -class SimilarityPairSample: - obj_a: Any - obj_b: Any - score: float = 1.0 - subgroup: int = 0 +Every time Spotify recommends the next song from a band you've never heard of, it uses a recommendation algorithm based on other users' interactions with that song. This type of algorithm is known as **collaborative filtering**. -``` +Unlike content-based recommendations, collaborative filtering excels when the objects' semantics are loosely or unrelated to users' preferences. This adaptability is what makes it so fascinating. Movie, music, or book recommendations are good examples of such use cases. After all, we rarely choose which book to read purely based on the plot twists. -Here might be some questions: what `score` and `subgroup` are? +The traditional way to build a collaborative filtering engine involves training a model that converts the sparse matrix of user-to-item relations into a compressed, dense representation of user and item vectors. Some of the most commonly referenced algorithms for this purpose include [SVD (Singular Value Decomposition)](https://en.wikipedia.org/wiki/Singular_value_decomposition) and [Factorization Machines](https://en.wikipedia.org/wiki/Matrix_factorization_(recommender_systems)). However, the model training approach requires significant resource investments. Model training necessitates data, regular re-training, and a mature infrastructure. -Well, `score` is a measure of expected samples similarity. -If you only need to specify if two samples are similar or not, you can use `1.0` and `0.0` respectively. +## Methodology -`subgroups` parameter is required for more granular description of what negative examples could be. -By default, all pairs belong the subgroup zero. -That means that we would need to specify all negative examples manually. -But in most cases, we can avoid this by enabling different subgroups. -All objects from different subgroups will be considered as negative examples in loss, and thus it -provides a way to set negative examples implicitly. +Fortunately, there is a way to build collaborative filtering systems without any model training. You can obtain interpretable recommendations and have a scalable system using a technique based on similarity search. 
Let’s explore how this works with an example of building a movie recommendation system.

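To make the idea concrete before the implementation, here is a minimal conceptual sketch (an assumption-level illustration, not the tutorial code that follows): each user becomes a sparse vector keyed by movie ID, the closest user vectors are retrieved, and candidate movies are ranked by the similarity scores of the users who rated them.

```python
# Conceptual sketch only - the Qdrant-based implementation follows below.
from collections import defaultdict


def rank_candidates(seen_movie_ids: set, similar_users: list) -> list:
    """similar_users: [{"score": similarity_to_me, "movie_ids": [...]}, ...]"""
    movie_scores = defaultdict(float)
    for user in similar_users:
        for movie_id in user["movie_ids"]:
            if movie_id not in seen_movie_ids:  # only score movies we haven't rated
                movie_scores[movie_id] += user["score"]
    return sorted(movie_scores, key=movie_scores.get, reverse=True)
```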

-```python -import json -from typing import List, Dict +## Implementation -from torch.utils.data import Dataset -from quaterion.dataset.similarity_samples import SimilarityPairSample +To implement this, you will use a simple yet powerful resource: [Qdrant with Sparse Vectors](https://qdrant.tech/articles/sparse-vectors/). -class FAQDataset(Dataset): - """Dataset class to process .jsonl files with FAQ from popular cloud providers.""" +Notebook: [You can try this code here](https://githubtocolab.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb) - def __init__(self, dataset_path): - self.dataset: List[Dict[str, str]] = self.read_dataset(dataset_path) - def __getitem__(self, index) -> SimilarityPairSample: - line = self.dataset[index] - question = line["question"] - # All questions have a unique subgroup - # Meaning that all other answers are considered negative pairs - subgroup = hash(question) - return SimilarityPairSample( - obj_a=question, - obj_b=line["answer"], - score=1, - subgroup=subgroup - ) +### Setup - def __len__(self): - return len(self.dataset) +You have to first import the necessary libraries and define the environment. - @staticmethod - def read_dataset(dataset_path) -> List[Dict[str, str]]: - """Read jsonl-file into a memory.""" - with open(dataset_path, "r") as fd: - return [json.loads(json_line) for json_line in fd] +```python +import os +import pandas as pd +import requests +from qdrant_client import QdrantClient, models +from qdrant_client.models import PointStruct, SparseVector, NamedSparseVector +from collections import defaultdict -``` +# OMDB API Key - for movie posters +omdb_api_key = os.getenv("OMDB_API_KEY") -We assigned a unique subgroup for each question, so all other objects which have different question will be considered as negative examples. +# Collection name +collection_name = "movies" -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#evaluation-metric) Evaluation Metric +# Set Qdrant Client +qdrant_client = QdrantClient( + os.getenv("QDRANT_HOST"), + api_key=os.getenv("QDRANT_API_KEY") +) +``` -We still haven’t added any metrics to the model. For this purpose Quaterion provides `configure_metrics`. -We just need to override it and attach interested metrics. +### Define output -Quaterion has some popular retrieval metrics implemented - such as _precision @ k_ or _mean reciprocal rank_. -They can be found in [quaterion.eval](https://quaterion.qdrant.tech/quaterion.eval.html) package. -But there are just a few metrics, it is assumed that desirable ones will be made by user or taken from another libraries. -You will probably need to inherit from `PairMetric` or `GroupMetric` to implement a new one. +Here, you will configure the recommendation engine to retrieve movie posters as output. -In `configure_metrics` we need to return a list of `AttachedMetric`. -They are just wrappers around metric instances and helps to log metrics more easily. -Under the hood `logging` is handled by `pytorch-lightning`. -You can configure it as you want - pass required parameters as keyword arguments to `AttachedMetric`. 
-For additional info visit [logging documentation page](https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html) +```python +# Function to get movie poster using OMDB API +def get_movie_poster(imdb_id, api_key): + url = f"https://www.omdbapi.com/?i={imdb_id}&apikey={api_key}" + data = requests.get(url).json() + return data.get('Poster'), data +``` -Let’s add mentioned metrics for our `FAQModel`. -Add this code to `model.py`: +### Prepare the data + +Load the movie datasets. These include three main CSV files: user ratings, movie titles, and OMDB IDs. ```python -... -from quaterion.eval.pair import RetrievalPrecision, RetrievalReciprocalRank -from quaterion.eval.attached_metric import AttachedMetric +# Load CSV files +ratings_df = pd.read_csv('data/ratings.csv', low_memory=False) +movies_df = pd.read_csv('data/movies.csv', low_memory=False) -class FAQModel(TrainableModel): - def __init__(self, lr=10e-5, *args, **kwargs): - self.lr = lr - super().__init__(*args, **kwargs) +# Convert movieId in ratings_df and movies_df to string +ratings_df['movieId'] = ratings_df['movieId'].astype(str) +movies_df['movieId'] = movies_df['movieId'].astype(str) - ... - def configure_metrics(self): - return [\ - AttachedMetric(\ - "RetrievalPrecision",\ - RetrievalPrecision(k=1),\ - prog_bar=True,\ - on_epoch=True,\ - ),\ - AttachedMetric(\ - "RetrievalReciprocalRank",\ - RetrievalReciprocalRank(),\ - prog_bar=True,\ - on_epoch=True\ - ),\ - ] +rating = ratings_df['rating'] -``` +# Normalize ratings +ratings_df['rating'] = (rating - rating.mean()) / rating.std() -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#fast-training-with-cache) Fast training with Cache +# Merge ratings with movie metadata to get movie titles +merged_df = ratings_df.merge( + movies_df[['movieId', 'title']], + left_on='movieId', right_on='movieId', how='inner' +) -Quaterion has one more cherry on top of the cake when it comes to non-trainable encoders. -If encoders are frozen, they are deterministic and emit the exact embeddings for the same input data on each epoch. -It provides a way to avoid repeated calculations and reduce training time. -For this purpose Quaterion has a cache functionality. +# Aggregate ratings to handle duplicate (userId, title) pairs +ratings_agg_df = merged_df.groupby(['userId', 'movieId']).rating.mean().reset_index() -Before training starts, the cache runs one epoch to pre-calculate all embeddings with frozen encoders and then store them on a device you chose (currently CPU or GPU). -Everything you need is to define which encoders are trainable or not and set cache settings. -And that’s it: everything else Quaterion will handle for you. +ratings_agg_df.head() +``` -To configure cache you need to override `configure_cache` method in `TrainableModel`. -This method should return an instance of [CacheConfig](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheConfig). +| |userId |movieId |rating | +|---|-----------|---------|---------| +|0 |1 |1 |0.429960 | +|1 |1 |1036 |1.369846 | +|2 |1 |1049 |-0.509926| +|3 |1 |1066 |0.429960 | +|4 |1 |110 |0.429960 | -Let’s add cache to our model: +### Convert to sparse -```python -... -from quaterion.train.cache import CacheConfig, CacheType -... -class FAQModel(TrainableModel): - ... - def configure_caches(self) -> Optional[CacheConfig]: - return CacheConfig(CacheType.AUTO) - ... 
+If you want to search across numerous reviews from different users, you can represent these reviews in a sparse matrix. +```python +# Convert ratings to sparse vectors +user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []}) +for row in ratings_agg_df.itertuples(): + user_sparse_vectors[row.userId]["values"].append(row.rating) + user_sparse_vectors[row.userId]["indices"].append(int(row.movieId)) ``` -[CacheType](https://quaterion.qdrant.tech/quaterion.train.cache.cache_config.html#quaterion.train.cache.cache_config.CacheType) determines how the cache will be stored in memory. +![collaborative-filtering](/blog/collaborative-filtering/collaborative-filtering.png) -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#training) Training -Now we need to combine all our code together in `train.py` and launch a training process. +### Upload the data -```python -import torch -import pytorch_lightning as pl +Here, you will initialize the Qdrant client and create a new collection to store the data. +Convert the user ratings to sparse vectors and include the `movieId` in the payload. -from quaterion import Quaterion -from quaterion.dataset import PairsSimilarityDataLoader +```python +# Define a data generator +def data_generator(): + for user_id, sparse_vector in user_sparse_vectors.items(): + yield PointStruct( + id=user_id, + vector={"ratings": SparseVector( + indices=sparse_vector["indices"], + values=sparse_vector["values"] + )}, + payload={"user_id": user_id, "movie_id": sparse_vector["indices"]} + ) -from faq.dataset import FAQDataset +# Upload points using the data generator +qdrant_client.upload_points( + collection_name=collection_name, + points=data_generator() +) +``` -def train(model, train_dataset_path, val_dataset_path, params): - use_gpu = params.get("cuda", torch.cuda.is_available()) +### Define query - trainer = pl.Trainer( - min_epochs=params.get("min_epochs", 1), - max_epochs=params.get("max_epochs", 500), - auto_select_gpus=use_gpu, - log_every_n_steps=params.get("log_every_n_steps", 1), - gpus=int(use_gpu), - ) - train_dataset = FAQDataset(train_dataset_path) - val_dataset = FAQDataset(val_dataset_path) - train_dataloader = PairsSimilarityDataLoader( - train_dataset, batch_size=1024 - ) - val_dataloader = PairsSimilarityDataLoader( - val_dataset, batch_size=1024 - ) +In order to get recommendations, we need to find users with similar tastes to ours. +Let's describe our preferences by providing ratings for some of our favorite movies. - Quaterion.fit(model, trainer, train_dataloader, val_dataloader) +`1` indicates that we like the movie, `-1` indicates that we dislike it. 
-if __name__ == "__main__": - import os - from pytorch_lightning import seed_everything - from faq.model import FAQModel - from faq.config import DATA_DIR, ROOT_DIR - seed_everything(42, workers=True) - faq_model = FAQModel() - train_path = os.path.join( - DATA_DIR, - "train_cloud_faq_dataset.jsonl" - ) - val_path = os.path.join( - DATA_DIR, - "val_cloud_faq_dataset.jsonl" - ) - train(faq_model, train_path, val_path, {}) - faq_model.save_servable(os.path.join(ROOT_DIR, "servable")) +```python +my_ratings = { + 603: 1, # Matrix + 13475: 1, # Star Trek + 11: 1, # Star Wars + 1091: -1, # The Thing + 862: 1, # Toy Story + 597: -1, # Titanic + 680: -1, # Pulp Fiction + 13: 1, # Forrest Gump + 120: 1, # Lord of the Rings + 87: -1, # Indiana Jones + 562: -1 # Die Hard +} ``` -Here are a couple of unseen classes, `PairsSimilarityDataLoader`, which is a native dataloader for -`SimilarityPairSample` objects, and `Quaterion` is an entry point to the training process. +
+Click to see the code for to_vector -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#dataset-wise-evaluation) Dataset-wise evaluation +```python +# Create sparse vector from my_ratings +def to_vector(ratings): + vector = SparseVector( + values=[], + indices=[] + ) + for movie_id, rating in ratings.items(): + vector.values.append(rating) + vector.indices.append(movie_id) + return vector +``` -Up to this moment we’ve calculated only batch-wise metrics. -Such metrics can fluctuate a lot depending on a batch size and can be misleading. -It might be helpful if we can calculate a metric on a whole dataset or some large part of it. -Raw data may consume a huge amount of memory, and usually we can’t fit it into one batch. -Embeddings, on the contrary, most probably will consume less. +
-That’s where `Evaluator` enters the scene. -At first, having dataset of `SimilaritySample`, `Evaluator` encodes it via `SimilarityModel` and compute corresponding labels. -After that, it calculates a metric value, which could be more representative than batch-wise ones. -However, you still can find yourself in a situation where evaluation becomes too slow, or there is no enough space left in the memory. -A bottleneck might be a squared distance matrix, which one needs to calculate to compute a retrieval metric. -You can mitigate this bottleneck by calculating a rectangle matrix with reduced size. -`Evaluator` accepts `sampler` with a sample size to select only specified amount of embeddings. -If sample size is not specified, evaluation is performed on all embeddings. +### Run the query -Fewer words! Let’s add evaluator to our code and finish `train.py`. +From the uploaded list of movies with ratings, we can perform a search in Qdrant to get the top most similar users to us. ```python -... -from quaterion.eval.evaluator import Evaluator -from quaterion.eval.pair import RetrievalReciprocalRank, RetrievalPrecision -from quaterion.eval.samplers.pair_sampler import PairSampler -... +# Perform the search +results = qdrant_client.query_points( + collection_name=collection_name, + query=to_vector(my_ratings), + using="ratings", + limit=20 +).points +``` -def train(model, train_dataset_path, val_dataset_path, params): - ... +Now we can find the movies liked by the other similar users, but we haven't seen yet. +Let's combine the results from found users, filter out seen movies, and sort by the score. - metrics = { - "rrk": RetrievalReciprocalRank(), - "rp@1": RetrievalPrecision(k=1) - } - sampler = PairSampler() - evaluator = Evaluator(metrics, sampler) - results = Quaterion.evaluate(evaluator, val_dataset, model.model) - print(f"results: {results}") +```python +# Convert results to scores and sort by score +def results_to_scores(results): + movie_scores = defaultdict(lambda: 0) + for result in results: + for movie_id in result.payload["movie_id"]: + movie_scores[movie_id] += result.score + return movie_scores +# Convert results to scores and sort by score +movie_scores = results_to_scores(results) +top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True) ``` -### [Anchor](https://qdrant.tech/articles/faq-question-answering/\#train-results) Train Results +
-At this point we can train our model, I do it via `python3 -m faq.train`. + Visualize results in Jupyter Notebook + +Finally, we display the top 5 recommended movies along with their posters and titles. -| epoch | train\_precision@1 | train\_reciprocal\_rank | val\_precision@1 | val\_reciprocal\_rank | -| --- | --- | --- | --- | --- | -| 0 | 0.650 | 0.732 | 0.659 | 0.741 | -| 100 | 0.665 | 0.746 | 0.673 | 0.754 | -| 200 | 0.677 | 0.757 | 0.682 | 0.763 | -| 300 | 0.686 | 0.765 | 0.688 | 0.768 | -| 400 | 0.695 | 0.772 | 0.694 | 0.773 | -| 500 | 0.701 | 0.778 | 0.700 | 0.777 | +```python +# Create HTML to display top 5 results +html_content = "
" -Results obtained with `Evaluator`: +for movie_id, score in top_movies[:5]: + imdb_id_row = links.loc[links['movieId'] == int(movie_id), 'imdbId'] + if not imdb_id_row.empty: + imdb_id = imdb_id_row.values[0] + poster_url, movie_info = get_movie_poster(imdb_id, omdb_api_key) + movie_title = movie_info.get('Title', 'Unknown Title') + + html_content += f""" +
+ Poster +
{movie_title}
+
Score: {score}
+
+ """ + else: + continue # Skip if imdb_id is not found -| precision@1 | reciprocal\_rank | -| --- | --- | -| 0.577 | 0.675 | +html_content += "
" -After training all the metrics have been increased. -And this training was done in just 3 minutes on a single gpu! -There is no overfitting and the results are steadily growing, although I think there is still room for improvement and experimentation. +display(HTML(html_content)) +``` -## [Anchor](https://qdrant.tech/articles/faq-question-answering/\#model-serving) Model serving +
-As you could already notice, Quaterion framework is split into two separate libraries: `quaterion` -and [quaterion-models](https://quaterion-models.qdrant.tech/). -The former one contains training related stuff like losses, cache, `pytorch-lightning` dependency, etc. -While the latter one contains only modules necessary for serving: encoders, heads and `SimilarityModel` itself. +## Recommendations -The reasons for this separation are: +For a complete display of movie posters, check the [notebook output](https://github.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb). Here are the results without html content. -- less amount of entities you need to operate in a production environment -- reduced memory footprint +```text +Toy Story, Score: 131.2033799 +Monty Python and the Holy Grail, Score: 131.2033799 +Star Wars: Episode V - The Empire Strikes Back, Score: 131.2033799 +Star Wars: Episode VI - Return of the Jedi, Score: 131.2033799 +Men in Black, Score: 131.2033799 +``` -It is essential to isolate training dependencies from the serving environment cause the training step is usually more complicated. -Training dependencies are quickly going out of control, significantly slowing down the deployment and serving timings and increasing unnecessary resource usage. +On top of collaborative filtering, we can further enhance the recommendation system by incorporating other features like user demographics, movie genres, or movie tags. -The very last row of `train.py` \- `faq_model.save_servable(...)` saves encoders and the model in a fashion that eliminates all Quaterion dependencies and stores only the most necessary data to run a model in production. +Or, for example, only consider recent ratings via a time-based filter. This way, we can recommend movies that are currently popular among users. -In `serve.py` we load and encode all the answers and then look for the closest vectors to the questions we are interested in: +## Conclusion -```python -import os -import json +As demonstrated, it is possible to build an interesting movie recommendation system without intensive model training using Qdrant and Sparse Vectors. This approach not only simplifies the recommendation process but also makes it scalable and interpretable. In future tutorials, we can experiment more with this combination to further enhance our recommendation systems. -import torch -from quaterion_models.model import SimilarityModel -from quaterion.distances import Distance +<|page-107-lllmstxt|> +# Configure, Scale & Update Qdrant Hybrid Cloud Clusters -from faq.config import DATA_DIR, ROOT_DIR +## Configure Clusters -if __name__ == "__main__": - device = "cuda:0" if torch.cuda.is_available() else "cpu" - model = SimilarityModel.load(os.path.join(ROOT_DIR, "servable")) - model.to(device) - dataset_path = os.path.join(DATA_DIR, "val_cloud_faq_dataset.jsonl") +Alongside Hybrid Cloud specific scheduling options, you can also adjust various other advanced configuration options for your clusters. See [Configure Clusters](/documentation/cloud/configure-cluster/) for more details. - with open(dataset_path) as fd: - answers = [json.loads(json_line)["answer"] for json_line in fd] +## Scale Clusters - # everything is ready, let's encode our answers - answer_embeddings = model.encode(answers, to_numpy=False) +Hybrid cloud clusters can be scaled up and down, horizontall and vertically, at any time. For more details see [Scale Clusters](/documentation/cloud/cluster-scaling/). 
- # Some prepared questions and answers to ensure that our model works as intended - questions = [\ - "what is the pricing of aws lambda functions powered by aws graviton2 processors?",\ - "can i run a cluster or job for a long time?",\ - "what is the dell open manage system administrator suite (omsa)?",\ - "what are the differences between the event streams standard and event streams enterprise plans?",\ - ] - ground_truth_answers = [\ - "aws lambda functions powered by aws graviton2 processors are 20% cheaper compared to x86-based lambda functions",\ - "yes, you can run a cluster for as long as is required",\ - "omsa enables you to perform certain hardware configuration tasks and to monitor the hardware directly via the operating system",\ - "to find out more information about the different event streams plans, see choosing your plan",\ - ] +### Automatic Shard Rebalancing - # encode our questions and find the closest to them answer embeddings - question_embeddings = model.encode(questions, to_numpy=False) - distance = Distance.get_by_name(Distance.COSINE) - question_answers_distances = distance.distance_matrix( - question_embeddings, answer_embeddings - ) - answers_indices = question_answers_distances.min(dim=1)[1] - for q_ind, a_ind in enumerate(answers_indices): - print("Q:", questions[q_ind]) - print("A:", answers[a_ind], end="\n\n") - assert ( - answers[a_ind] == ground_truth_answers[q_ind] - ), f"<{answers[a_ind]}> != <{ground_truth_answers[q_ind]}>" +Qdrant Cloud supports automatic shard rebalancing when scaling your cluster horizontally. This ensures that data is evenly distributed across the nodes, optimizing performance and resource utilization. For more details see [Shard Rebalancing](/documentation/cloud/configure-cluster/#shard-rebalancing). -``` +### Resharding -We stored our collection of answer embeddings in memory and perform search directly in Python. -For production purposes, it’s better to use some sort of vector search engine like [Qdrant](https://github.com/qdrant/qdrant). -It provides durability, speed boost, and a bunch of other features. +In Qdrant Cloud, you can change the number of shards in your existing collections without having to recreate the collection from scratch. This feature is called resharding and allows you to scale your collections up or down as needed. For more details see [Resharding](/documentation/cloud/cluster-scaling/#resharding). -So far, we’ve implemented a whole training process, prepared model for serving and even applied a -trained model today with `Quaterion`. +## Update Clusters -Thank you for your time and attention! -I hope you enjoyed this huge tutorial and will use `Quaterion` for your similarity learning projects. +You can update the version of your cluster at any time. For more details see [Update Clusters](/documentation/cloud/cluster-upgrades/). -All ready to use code can be found [here](https://github.com/qdrant/demo-cloud-faq/tree/tutorial). +<|page-108-lllmstxt|> +# Using FastEmbed with Qdrant for Vector Search -Stay tuned!:) +## Install Qdrant Client and FastEmbed +```python +pip install "qdrant-client[fastembed]>=1.14.2" +``` -##### Was this page useful? +## Initialize the client +Qdrant Client has a simple in-memory mode that lets you try semantic search locally. 

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")  # Qdrant is running from RAM.
```

## Add data

Now you can add two sample documents, their associated metadata, and a point `id` for each.

```python
docs = [
    "Qdrant has a LangChain integration for chatbots.",
    "Qdrant has a LlamaIndex integration for agents.",
]
metadata = [
    {"source": "langchain-docs"},
    {"source": "llamaindex-docs"},
]
ids = [42, 2]
```

## Create a collection

Qdrant stores vectors and associated metadata in collections.
A collection requires vector parameters to be set during creation.
In this tutorial, we'll be using `BAAI/bge-small-en` to compute embeddings.

```python
model_name = "BAAI/bge-small-en"
client.create_collection(
    collection_name="test_collection",
    vectors_config=models.VectorParams(
        size=client.get_embedding_size(model_name),
        distance=models.Distance.COSINE
    ),  # size and distance are model dependent
)
```

## Upsert documents to the collection

The Qdrant client can run inference implicitly within its methods via the FastEmbed integration.
It requires wrapping your data in models, like `models.Document` (or `models.Image` if you're working with images).

```python
metadata_with_docs = [
    {"document": doc, "source": meta["source"]} for doc, meta in zip(docs, metadata)
]
client.upload_collection(
    collection_name="test_collection",
    vectors=[models.Document(text=doc, model=model_name) for doc in docs],
    payload=metadata_with_docs,
    ids=ids,
)
```

## Run vector search

Here, you will ask a dummy question that will allow you to retrieve a semantically relevant result.
-## [Anchor](https://qdrant.tech/documentation/overview/vector-search/\#a-brief-history-of-search) A Brief History of Search +```python +search_result = client.query_points( + collection_name="test_collection", + query=models.Document( + text="Which integration is best for agents?", + model=model_name + ) +).points +print(search_result) +``` +The semantic search engine will retrieve the most similar result in order of relevance. In this case, the second statement about LlamaIndex is more relevant. -Human memory is unreliable. Thus, as long as we have been trying to collect ‘knowledge’ in written form, we had to figure out how to search for relevant content without rereading the same books repeatedly. That’s why some brilliant minds introduced the inverted index. In the simplest form, it’s an appendix to a book, typically put at its end, with a list of the essential terms-and links to pages they occur at. Terms are put in alphabetical order. Back in the day, that was a manually crafted list requiring lots of effort to prepare. Once digitalization started, it became a lot easier, but still, we kept the same general principles. That worked, and still, it does. +```python +[ + ScoredPoint( + id=2, + score=0.87491801319731, + payload={ + "document": "Qdrant has a LlamaIndex integration for agents.", + "source": "llamaindex-docs", + }, + ... + ), + ScoredPoint( + id=42, + score=0.8351846627714035, + payload={ + "document": "Qdrant has a LangChain integration for chatbots.", + "source": "langchain-docs", + }, + ... + ), +] +``` -If you are looking for a specific topic in a particular book, you can try to find a related phrase and quickly get to the correct page. Of course, assuming you know the proper term. If you don’t, you must try and fail several times or find somebody else to help you form the correct query. +<|page-109-lllmstxt|> +# Load and Search Hugging Face Datasets with Qdrant -![A simplified version of the inverted index.](https://qdrant.tech/docs/gettingstarted/inverted-index.png) +[Hugging Face](https://huggingface.co/) provides a platform for sharing and using ML models and +datasets. [Qdrant](https://huggingface.co/Qdrant) also publishes datasets along with the +embeddings that you can use to practice with Qdrant and build your applications based on semantic +search. **Please [let us know](https://qdrant.to/discord) if you'd like to see a specific dataset!** -A simplified version of the inverted index. +## arxiv-titles-instructorxl-embeddings -Time passed, and we haven’t had much change in that area for quite a long time. But our textual data collection started to grow at a greater pace. So we also started building up many processes around those inverted indexes. For example, we allowed our users to provide many words and started splitting them into pieces. That allowed finding some documents which do not necessarily contain all the query words, but possibly part of them. We also started converting words into their root forms to cover more cases, removing stopwords, etc. Effectively we were becoming more and more user-friendly. Still, the idea behind the whole process is derived from the most straightforward keyword-based search known since the Middle Ages, with some tweaks. +[This dataset](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) contains +embeddings generated from the paper titles only. Each vector has a payload with the title used to +create it, along with the DOI (Digital Object Identifier). 
-![The process of tokenization with an additional stopwords removal and converstion to root form of a word.](https://qdrant.tech/docs/gettingstarted/tokenization.png) +```json +{ + "title": "Nash Social Welfare for Indivisible Items under Separable, Piecewise-Linear Concave Utilities", + "DOI": "1612.05191" +} +``` -The process of tokenization with an additional stopwords removal and converstion to root form of a word. +You can find a detailed description of the dataset in the [Practice Datasets](/documentation/datasets/#journal-article-titles) +section. If you prefer loading the dataset from a Qdrant snapshot, it also linked there. -Technically speaking, we encode the documents and queries into so-called sparse vectors where each position has a corresponding word from the whole dictionary. If the input text contains a specific word, it gets a non-zero value at that position. But in reality, none of the texts will contain more than hundreds of different words. So the majority of vectors will have thousands of zeros and a few non-zero values. That’s why we call them sparse. And they might be already used to calculate some word-based similarity by finding the documents which have the biggest overlap. +Loading the dataset is as simple as using the `load_dataset` function from the `datasets` library: -![An example of a query vectorized to sparse format.](https://qdrant.tech/docs/gettingstarted/query.png) +```python +from datasets import load_dataset -An example of a query vectorized to sparse format. +dataset = load_dataset("Qdrant/arxiv-titles-instructorxl-embeddings") +``` -Sparse vectors have relatively **high dimensionality**; equal to the size of the dictionary. And the dictionary is obtained automatically from the input data. So if we have a vector, we are able to partially reconstruct the words used in the text that created that vector. + -## [Anchor](https://qdrant.tech/documentation/overview/vector-search/\#the-tower-of-babel) The Tower of Babel +The dataset contains 2,250,000 vectors. This is how you can check the list of the features in the dataset: -Every once in a while, when we discover new problems with inverted indexes, we come up with a new heuristic to tackle it, at least to some extent. Once we realized that people might describe the same concept with different words, we started building lists of synonyms to convert the query to a normalized form. But that won’t work for the cases we didn’t foresee. Still, we need to craft and maintain our dictionaries manually, so they can support the language that changes over time. Another difficult issue comes to light with multilingual scenarios. Old methods require setting up separate pipelines and keeping humans in the loop to maintain the quality. +```python +dataset.features +``` -![The Tower of Babel, Pieter Bruegel.](https://qdrant.tech/docs/gettingstarted/babel.jpg) +### Streaming the dataset -The Tower of Babel, Pieter Bruegel. +Dataset streaming lets you work with a dataset without downloading it. The data is streamed as +you iterate over the dataset. You can read more about it in the [Hugging Face +documentation](https://huggingface.co/docs/datasets/stream). -## [Anchor](https://qdrant.tech/documentation/overview/vector-search/\#the-representation-revolution) The Representation Revolution +```python +from datasets import load_dataset -The latest research in Machine Learning for NLP is heavily focused on training Deep Language Models. 
In this process, the neural network takes a large corpus of text as input and creates a mathematical representation of the words in the form of vectors. These vectors are created in such a way that words with similar meanings and occurring in similar contexts are grouped together and represented by similar vectors. And we can also take, for example, an average of all the word vectors to create the vector for a whole text (e.g query, sentence, or paragraph). +dataset = load_dataset( + "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True +) +``` -![deep neural](https://qdrant.tech/docs/gettingstarted/deep-neural.png) +### Loading the dataset into Qdrant -We can take those **dense vectors** produced by the network and use them as a **different data representation**. They are dense because neural networks will rarely produce zeros at any position. In contrary to sparse ones, they have a relatively low dimensionality — hundreds or a few thousand only. Unfortunately, if we want to have a look and understand the content of the document by looking at the vector it’s no longer possible. Dimensions are no longer representing the presence of specific words. +You can load the dataset into Qdrant using the [Python SDK](https://github.com/qdrant/qdrant-client). +The embeddings are already precomputed, so you can store them in a collection, that we're going +to create in a second: -Dense vectors can capture the meaning, not the words used in a text. That being said, **Large Language Models can automatically handle synonyms**. Moreso, since those neural networks might have been trained with multilingual corpora, they translate the same sentence, written in different languages, to similar vector representations, also called **embeddings**. And we can compare them to find similar pieces of text by calculating the distance to other vectors in our database. +```python +from qdrant_client import QdrantClient, models -![Input queries contain different words, but they are still converted into similar vector representations, because the neural encoder can capture the meaning of the sentences. That feature can capture synonyms but also different languages..](https://qdrant.tech/docs/gettingstarted/input.png) +client = QdrantClient("http://localhost:6333") -Input queries contain different words, but they are still converted into similar vector representations, because the neural encoder can capture the meaning of the sentences. That feature can capture synonyms but also different languages.. +client.create_collection( + collection_name="arxiv-titles-instructorxl-embeddings", + vectors_config=models.VectorParams( + size=768, + distance=models.Distance.COSINE, + ), +) +``` -**Vector search** is a process of finding similar objects based on their embeddings similarity. The good thing is, you don’t have to design and train your neural network on your own. Many pre-trained models are available, either on **HuggingFace** or by using libraries like [SentenceTransformers](https://www.sbert.net/?ref=hackernoon.com). If you, however, prefer not to get your hands dirty with neural models, you can also create the embeddings with SaaS tools, like [co.embed API](https://docs.cohere.com/reference/embed?ref=hackernoon.com). +It is always a good idea to use batching, while loading a large dataset, so let's do that. +We are going to need a helper function to split the dataset into batches: -## [Anchor](https://qdrant.tech/documentation/overview/vector-search/\#why-qdrant) Why Qdrant? 
+```python +from itertools import islice -The challenge with vector search arises when we need to find similar documents in a big set of objects. If we want to find the closest examples, the naive approach would require calculating the distance to every document. That might work with dozens or even hundreds of examples but may become a bottleneck if we have more than that. When we work with relational data, we set up database indexes to speed things up and avoid full table scans. And the same is true for vector search. Qdrant is a fully-fledged vector database that speeds up the search process by using a graph-like structure to find the closest objects in sublinear time. So you don’t calculate the distance to every object from the database, but some candidates only. +def batched(iterable, n): + iterator = iter(iterable) + while batch := list(islice(iterator, n)): + yield batch +``` -![Vector search with Qdrant. Thanks to HNSW graph we are able to compare the distance to some of the objects from the database, not to all of them.](https://qdrant.tech/docs/gettingstarted/vector-search.png) +If you are a happy user of Python 3.12+, you can use the [`batched` function from the `itertools` +](https://docs.python.org/3/library/itertools.html#itertools.batched) package instead. -Vector search with Qdrant. Thanks to HNSW graph we are able to compare the distance to some of the objects from the database, not to all of them. +No matter what Python version you are using, you can use the `upsert` method to load the dataset, +batch by batch, into Qdrant: + +```python +batch_size = 100 + +for batch in batched(dataset, batch_size): + ids = [point.pop("id") for point in batch] + vectors = [point.pop("vector") for point in batch] -While doing a semantic search at scale, because this is what we sometimes call the vector search done on texts, we need a specialized tool to do it effectively — a tool like Qdrant. + client.upsert( + collection_name="arxiv-titles-instructorxl-embeddings", + points=models.Batch( + ids=ids, + vectors=vectors, + payloads=batch, + ), + ) +``` -## [Anchor](https://qdrant.tech/documentation/overview/vector-search/\#next-steps) Next Steps +Your collection is ready to be used for search! Please [let us know using Discord](https://qdrant.to/discord) +if you would like to see more datasets published on Hugging Face hub. -Vector search is an exciting alternative to sparse methods. It solves the issues we had with the keyword-based search without needing to maintain lots of heuristics manually. It requires an additional component, a neural encoder, to convert text into vectors. +<|page-110-lllmstxt|> +# Managing a Qdrant Cluster -[**Tutorial 1 - Qdrant for Complete Beginners**](https://qdrant.tech/documentation/tutorials/search-beginners/) -Despite its complicated background, vectors search is extraordinarily simple to set up. With Qdrant, you can have a search engine up-and-running in five minutes. Our [Complete Beginners tutorial](https://qdrant.tech/documentation/tutorials/search-beginners/) will show you how. +The most minimal QdrantCluster configuration is: -[**Tutorial 2 - Question and Answer System**](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/) -However, you can also choose SaaS tools to generate them and avoid building your model. Setting up a vector search project with Qdrant Cloud and Cohere co.embed API is fairly easy if you follow the [Question and Answer system tutorial](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/). 
+```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.11.3" + size: 1 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" +``` -There is another exciting thing about vector search. You can search for any kind of data as long as there is a neural network that would vectorize your data type. Do you think about a reverse image search? That’s also possible with vector embeddings. +The `id` should be unique across all Qdrant clusters in the same namespace, the `name` must follow the above pattern and the `cluster-id` and `customer-id` labels are mandatory. -##### Was this page useful? +There are lots more configuration options to configure scheduling, security, networking, and more. For full details see the [Qdrant Private Cloud API Reference](/documentation/private-cloud/api-reference/). -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Scaling a Cluster -Thank you for your feedback! 🙏 +To scale a cluster, update the CPU, memory and storage resources in the QdrantCluster spec. The Qdrant operator will automatically adjust the cluster configuration. This operation is highly available on a multi-node cluster with replicated collections. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/overview/vector-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + -On this page: +## Upgrading the Qdrant version -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/overview/vector-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +To upgrade the Qdrant version of a database cluster, update the `version` field in the QdrantCluster spec. The Qdrant operator will automatically upgrade the cluster to the new version. The upgrade process is highly available on a multi-node cluster with replicated collections. -× +Note, that you should not skip minor versions when upgrading. For example, if you are running version `v1.11.3`, you can upgrade to `v1.11.5` or `v1.12.6`, but not directly to `v1.13.0`. -[Powered by](https://qdrant.tech/) +## Exposing a Cluster -<|page-75-lllmstxt|> -## benchmarks -# Vector Database Benchmarks +By default, a QdrantCluster will be exposed through an internal `ClusterIP` service. To expose the cluster to the outside world, you can create a `NodePort` service, a `LoadBalancer` service or an `Ingress` resource. -# [Anchor](https://qdrant.tech/benchmarks/\#benchmarking-vector-databases) Benchmarking Vector Databases +This is an example on how to create a QdrantCluster with a `LoadBalancer` service: -At Qdrant, performance is the top-most priority. We always make sure that we use system resources efficiently so you get the **fastest and most accurate results at the cheapest cloud costs**. 
So all of our decisions from [choosing Rust](https://qdrant.tech/articles/why-rust/), [io optimisations](https://qdrant.tech/articles/io_uring/), [serverless support](https://qdrant.tech/articles/serverless/), [binary quantization](https://qdrant.tech/articles/binary-quantization/), to our [fastembed library](https://qdrant.tech/articles/fastembed/) are all based on our principle. In this article, we will compare how Qdrant performs against the other vector search engines. +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.11.3" + size: 1 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: nlb +``` -Here are the principles we followed while designing these benchmarks: +Especially if you create a LoadBalancer Service, you may need to provide annotations for the loadbalancer configration. Please refer to the documention of your cloud provider for more details. -- We do comparative benchmarks, which means we focus on **relative numbers** rather than absolute numbers. -- We use affordable hardware, so that you can reproduce the results easily. -- We run benchmarks on the same exact machines to avoid any possible hardware bias. -- All the benchmarks are [open-sourced](https://github.com/qdrant/vector-db-benchmark), so you can contribute and improve them. +Examples: -Scenarios we tested +* [AWS EKS LoadBalancer annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/) +* [Azure AKS Public LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/load-balancer-standard) +* [Azure AKS Internal LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/internal-lb) +* [GCP GKE LoadBalancer annotations](https://cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer-parameters) +* + -1. Upload & Search benchmark on single node [Benchmark](https://qdrant.tech/benchmarks/single-node-speed-benchmark/) -2. Filtered search benchmark - [Benchmark](https://qdrant.tech/benchmarks/#filtered-search-benchmark) -3. Memory consumption benchmark - Coming soon -4. Cluster mode benchmark - Coming soon +## Authentication and Authorization -Some of our experiment design decisions are described in the [F.A.Q Section](https://qdrant.tech/benchmarks/#benchmarks-faq). -Reach out to us on our [Discord channel](https://qdrant.to/discord) if you want to discuss anything related Qdrant or these benchmarks. + -## [Anchor](https://qdrant.tech/benchmarks/\#single-node-benchmarks) Single node benchmarks +Authentication information is provided by Kubernetes secrets. -We benchmarked several vector databases using various configurations of them on different datasets to check how the results may vary. Those datasets may have different vector dimensionality but also vary in terms of the distance function being used. We also tried to capture the difference we can expect while using some different configuration parameters, for both the engine itself and the search operation separately. 
+One way to create a secret is with kubectl: -**Updated: January/June 2024** +```shell +kubectl create secret generic qdrant-api-key --from-literal=api-key=your-secret-api-key --from-literal=read-only-api-key=your-secret-read-only-api-key --namespace qdrant-private-cloud +``` -Dataset:dbpedia-openai-1M-1536-angulardeep-image-96-angulargist-960-euclideanglove-100-angular +The resulting secret will look like this: -Search threads:1001 +```yaml +apiVersion: v1 +data: + api-key: ... + read-only-api-key: ... +kind: Secret +metadata: + name: qdrant-api-key + namespace: qdrant-private-cloud +type: kubernetes.io/generic +``` -Plot values: +You can reference the secret in the QdrantCluster spec: -RPS +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.11.3" + size: 1 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" + config: + service: + api_key: + secretKeyRef: + name: qdrant-api-key + key: api-key + read_only_api_key: + secretKeyRef: + name: qdrant-api-key + key: read-only-api-key + jwt_rbac: true +``` -Latency +If you set the `jwt_rbac` flag, you will also be able to create granular [JWT tokens for role based access control](/documentation/guides/security/#granular-access-control-with-jwt). -p95 latency +### Configuring TLS for Database Access -Index time +If you want to configure TLS for accessing your Qdrant database, there are two options: -| Engine | Setup | Dataset | Upload Time(m) | Upload + Index Time(m) | Latency(ms) | P95(ms) | P99(ms) | RPS | Precision | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| qdrant | qdrant-sq-rps-m-64-ef-512 | dbpedia-openai-1M-1536-angular | 3.51 | 24.43 | 3.54 | 4.95 | 8.62 | 1238.0016 | 0.99 | -| weaviate | latest-weaviate-m32 | dbpedia-openai-1M-1536-angular | 13.94 | 13.94 | 4.99 | 7.16 | 11.33 | 1142.13 | 0.97 | -| elasticsearch | elasticsearch-m-32-ef-128 | dbpedia-openai-1M-1536-angular | 19.18 | 83.72 | 22.10 | 72.53 | 135.68 | 716.80 | 0.98 | -| redis | redis-m-32-ef-256 | dbpedia-openai-1M-1536-angular | 92.49 | 92.49 | 140.65 | 160.85 | 167.35 | 625.27 | 0.97 | -| milvus | milvus-m-16-ef-128 | dbpedia-openai-1M-1536-angular | 0.27 | 1.16 | 393.31 | 441.32 | 576.65 | 219.11 | 0.99 | +* You can offload TLS at the ingress or loadbalancer level. +* You can configure TLS directly in the Qdrant database. -_Download raw data: [here](https://qdrant.tech/benchmarks/results-1-100-thread-2024-06-15.json)_ +If you want to configure TLS directly in the Qdrant database, you can provide this as a secret. -## [Anchor](https://qdrant.tech/benchmarks/\#observations) Observations +To create such a secret, you can use `kubectl`: -Most of the engines have improved since [our last run](https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022/). Both life and software have trade-offs but some clearly do better: +```shell + kubectl create secret tls qdrant-tls --cert=mydomain.com.crt --key=mydomain.com.key --namespace the-qdrant-namespace +``` -- **`Qdrant` achives highest RPS and lowest latencies in almost all the scenarios, no matter the precision threshold and the metric we choose.** It has also shown 4x RPS gains on one of the datasets. -- `Elasticsearch` has become considerably fast for many cases but it’s very slow in terms of indexing time. It can be 10x slower when storing 10M+ vectors of 96 dimensions! 
(32mins vs 5.5 hrs) -- `Milvus` is the fastest when it comes to indexing time and maintains good precision. However, it’s not on-par with others when it comes to RPS or latency when you have higher dimension embeddings or more number of vectors. -- `Redis` is able to achieve good RPS but mostly for lower precision. It also achieved low latency with single thread, however its latency goes up quickly with more parallel requests. Part of this speed gain comes from their custom protocol. -- `Weaviate` has improved the least since our last run. +The resulting secret will look like this: -## [Anchor](https://qdrant.tech/benchmarks/\#how-to-read-the-results) How to read the results +```yaml +apiVersion: v1 +data: + tls.crt: ... + tls.key: ... +kind: Secret +metadata: + name: qdrant-tls + namespace: the-qdrant-namespace +type: kubernetes.io/tls +``` +You can reference the secret in the QdrantCluster spec: -- Choose the dataset and the metric you want to check. -- Select a precision threshold that would be satisfactory for your usecase. This is important because ANN search is all about trading precision for speed. This means in any vector search benchmark, **two results must be compared only when you have similar precision**. However most benchmarks miss this critical aspect. -- The table is sorted by the value of the selected metric (RPS / Latency / p95 latency / Index time), and the first entry is always the winner of the category 🏆 +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: test-cluster +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.11.3" + size: 1 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" + config: + service: + enable_tls: true + tls: + cert: + secretKeyRef: + name: qdrant-tls + key: tls.crt + key: + secretKeyRef: + name: qdrant-tls + key: tls.key +``` -### [Anchor](https://qdrant.tech/benchmarks/\#latency-vs-rps) Latency vs RPS +### Configuring TLS for Inter-cluster Communication -In our benchmark we test two main search usage scenarios that arise in practice. +*Available as of Operator v2.2.0* -- **Requests-per-Second (RPS)**: Serve more requests per second in exchange of individual requests taking longer (i.e. higher latency). This is a typical scenario for a web application, where multiple users are searching at the same time. -To simulate this scenario, we run client requests in parallel with multiple threads and measure how many requests the engine can handle per second. -- **Latency**: React quickly to individual requests rather than serving more requests in parallel. This is a typical scenario for applications where server response time is critical. Self-driving cars, manufacturing robots, and other real-time systems are good examples of such applications. -To simulate this scenario, we run client in a single thread and measure how long each request takes. + -### [Anchor](https://qdrant.tech/benchmarks/\#tested-datasets) Tested datasets +If you want to encrypt communication between Qdrant nodes, you need to enable TLS by providing +certificate, key, and root CA certificate used for generating the former. -Our [benchmark tool](https://github.com/qdrant/vector-db-benchmark) is inspired by [github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/). 
We used the following datasets to test the performance of the engines on ANN Search tasks: +Similar to the instruction stated in the previous section, you need to create a secret: -| Datasets | \# Vectors | Dimensions | Distance | -| --- | --- | --- | --- | -| [dbpedia-openai-1M-angular](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) | 1M | 1536 | cosine | -| [deep-image-96-angular](http://sites.skoltech.ru/compvision/noimi/) | 10M | 96 | cosine | -| [gist-960-euclidean](http://corpus-texmex.irisa.fr/) | 1M | 960 | euclidean | -| [glove-100-angular](https://nlp.stanford.edu/projects/glove/) | 1.2M | 100 | cosine | +```shell + kubectl create secret generic qdrant-p2p-tls \ + --from-file=tls.crt=qdrant-nodes.crt \ + --from-file=tls.key=qdrant-nodes.key \ + --from-file=ca.crt=root-ca.crt + --namespace the-qdrant-namespace +``` -### [Anchor](https://qdrant.tech/benchmarks/\#setup) Setup +The resulting secret will look like this: -![Benchmarks configuration](https://qdrant.tech/benchmarks/client-server.png) +```yaml +apiVersion: v1 +data: + tls.crt: ... + tls.key: ... + ca.crt: ... +kind: Secret +metadata: + name: qdrant-p2p-tls + namespace: the-qdrant-namespace +type: Opaque +``` +You can reference the secret in the QdrantCluster spec: -Benchmarks configuration +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: test-cluster + labels: + cluster-id: "my-cluster" + customer-id: "acme-industries" +spec: + id: "my-cluster" + version: "v1.13.3" + size: 2 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" + config: + service: + enable_tls: true + tls: + caCert: + secretKeyRef: + name: qdrant-p2p-tls + key: ca.crt + cert: + secretKeyRef: + name: qdrant-p2p-tls + key: tls.crt + key: + secretKeyRef: + name: qdrant-p2p-tls + key: tls.key +``` -- This was our setup for this experiment: - - Client: 8 vcpus, 16 GiB memory, 64GiB storage ( `Standard D8ls v5` on Azure Cloud) - - Server: 8 vcpus, 32 GiB memory, 64GiB storage ( `Standard D8s v3` on Azure Cloud) -- The Python client uploads data to the server, waits for all required indexes to be constructed, and then performs searches with configured number of threads. We repeat this process with different configurations for each engine, and then select the best one for a given precision. -- We ran all the engines in docker and limited their memory to 25GB. This was used to ensure fairness by avoiding the case of some engine configs being too greedy with RAM usage. This 25 GB limit is completely fair because even to serve the largest `dbpedia-openai-1M-1536-angular` dataset, one hardly needs `1M * 1536 * 4bytes * 1.5 = 8.6GB` of RAM (including vectors + index). Hence, we decided to provide all the engines with ~3x the requirement. + -To measure how well different search engines perform in this scenario, we have prepared a set of **Filtered ANN Benchmark Datasets** - -[https://github.com/qdrant/ann-filtering-benchmark-datasets](https://github.com/qdrant/ann-filtering-benchmark-datasets) +## GPU support -It is similar to the ones used in the [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks/) but enriched with payload metadata and pre-generated filtering requests. It includes synthetic and real-world datasets with various filters, from keywords to geo-spatial queries. +Starting with Qdrant 1.13 and private-cloud version 1.6.1 you can create a cluster that uses GPUs to accelarate indexing. 
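To give a concrete sense of what the pre-generated filtering requests in these benchmark datasets exercise, here is a minimal sketch of a filtered query using the Qdrant Python client. The collection name, payload fields, and values are purely illustrative and are not part of the benchmark suite itself:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Hypothetical example: an ANN query constrained by a keyword match and a
# geo-radius condition, the two kinds of filters mentioned above.
hits = client.query_points(
    collection_name="benchmark-demo",
    query=[0.1, 0.2, 0.3, 0.4],  # query vector; dimensionality depends on the collection
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="category",
                match=models.MatchValue(value="science"),
            ),
            models.FieldCondition(
                key="location",
                geo_radius=models.GeoRadius(
                    center=models.GeoPoint(lon=13.40, lat=52.52),
                    radius=10_000.0,  # meters
                ),
            ),
        ]
    ),
    limit=10,
).points
```

Each benchmark request pairs a filter like this with a query vector, which is exactly the combination that makes filtered search harder than plain ANN search.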
-### [Anchor](https://qdrant.tech/benchmarks/\#why-filtering-is-not-trivial) Why filtering is not trivial? +As a prerequisite, you need to have a Kubernetes cluster with GPU support. You can check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) for generic information on GPUs and Kubernetes, or the documentation of your specific Kubernetes distribution. -Not many ANN algorithms are compatible with filtering. -HNSW is one of the few of them, but search engines approach its integration in different ways: +Examples: -- Some use **post-filtering**, which applies filters after ANN search. It doesn’t scale well as it either loses results or requires many candidates on the first stage. -- Others use **pre-filtering**, which requires a binary mask of the whole dataset to be passed into the ANN algorithm. It is also not scalable, as the mask size grows linearly with the dataset size. +* [AWS EKS GPU support](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html) +* [Azure AKS GPU support](https://docs.microsoft.com/en-us/azure/aks/gpu-cluster) +* [GCP GKE GPU support](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus) +* [Vultr Kubernetes GPU support](https://blogs.vultr.com/whats-new-vultr-q2-2023) -On top of it, there is also a problem with search accuracy. -It appears if too many vectors are filtered out, so the HNSW graph becomes disconnected. +Once you have a Kubernetes cluster with GPU support, you can create a QdrantCluster with GPU support: -Qdrant uses a different approach, not requiring pre- or post-filtering while addressing the accuracy problem. -Read more about the Qdrant approach in our [Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/) article. +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.13.4" + size: 1 + resources: + cpu: 2 + memory: "8Gi" + storage: "40Gi" + gpu: + gpuType: "nvidia" +``` -## [Anchor](https://qdrant.tech/benchmarks/\#) +Once the cluster Pod has started, you can check in the logs if the GPU is detected: -**Updated: Feb 2023** +```shell +$ kubectl logs qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-0 -Dataset:keyword-100range-100int-2048100-kw-small-vocabkeyword-2048geo-radius-100range-2048geo-radius-2048int-100h-and-m-2048arxiv-titles-384 +Starting initializing for pod 0 + _ _ + __ _ __| |_ __ __ _ _ __ | |_ + / _` |/ _` | '__/ _` | '_ \| __| +| (_| | (_| | | | (_| | | | | |_ + \__, |\__,_|_| \__,_|_| |_|\__| + |_| -Plot values: +Version: 1.13.4, build: 7abc6843 +Access web UI at http://localhost:6333/dashboard -Regular search +2025-03-14T10:25:30.509636Z INFO gpu::instance: Found GPU device: NVIDIA A16-2Q +2025-03-14T10:25:30.509679Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 15.0.7, 256 bits) +2025-03-14T10:25:30.509734Z INFO gpu::device: Create GPU device NVIDIA A16-2Q +... +``` -Filter search +For more GPU configuration options, see the [Qdrant Private Cloud API Reference](/documentation/private-cloud/api-reference/). 
-_Download raw data: [here](https://qdrant.tech/benchmarks/filter-result-2023-02-03.json)_ +## Ephemeral Snapshot Volumes -## [Anchor](https://qdrant.tech/benchmarks/\#filtered-results) Filtered Results +If you do not [create snapshots](https://api.qdrant.tech/api-reference/snapshots/create-snapshot), or there is no need +to keep them available after cluster restart, the snapshot storage classname can be set to `emptyDir`: -As you can see from the charts, there are three main patterns: +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.13.4" + size: 1 + resources: + cpu: 2 + memory: "8Gi" + storage: "40Gi" + storageClassNames: + snapshots: emptyDir +``` -- **Speed boost** \- for some engines/queries, the filtered search is faster than the unfiltered one. It might happen if the filter is restrictive enough, to completely avoid the usage of the vector index. +See [Kubernetes docs on emptyDir volumes](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) for more details, +on how k8s node ephemeral storage is allocated and used. -- **Speed downturn** \- some engines struggle to keep high RPS, it might be related to the requirement of building a filtering mask for the dataset, as described above. +## Automatic Shard Rebalancing -- **Accuracy collapse** \- some engines are loosing accuracy dramatically under some filters. It is related to the fact that the HNSW graph becomes disconnected, and the search becomes unreliable. +Qdrant Private Cloud supports automatic shard rebalancing. This means that when you scale up or down the number of nodes in a cluster, the operator will automatically redistribute the shards across the available nodes to ensure an even distribution of data. +To enable automatic shard rebalancing, you can set the `rebalancestrategy` field in the QdrantCluster spec: -Qdrant avoids all these problems and also benefits from the speed boost, as it implements an advanced [query planning strategy](https://qdrant.tech/documentation/search/#query-planning). +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.15.1" + size: 3 + rebalanceStrategy: by_count_and_size + resources: + cpu: 2 + memory: "8Gi" + storage: "40Gi" +``` -# [Anchor](https://qdrant.tech/benchmarks/\#benchmarks-faq) Benchmarks F.A.Q. +For a list of all available rebalancing strategies, see the [Qdrant Private Cloud API Reference](/documentation/private-cloud/api-reference/#rebalancestrategy). -## [Anchor](https://qdrant.tech/benchmarks/\#are-we-biased) Are we biased? +## Resharding -Probably, yes. Even if we try to be objective, we are not experts in using all the existing vector databases. -We build Qdrant and know the most about it. -Due to that, we could have missed some important tweaks in different vector search engines. +In Qdrant Cloud, you can change the number of shards in your existing collections without having to recreate the collection from scratch. This feature is called resharding and allows you to scale your collections up or down as needed. For more details see [Resharding](/documentation/cloud/cluster-scaling/#resharding). 
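For context, the shards and replicas that rebalancing and resharding operate on are defined per collection when it is created. Below is a minimal sketch with the Qdrant Python client; the collection name, vector size, and shard/replica counts are illustrative values rather than recommendations:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Hypothetical collection split into 6 shards, each kept in 2 copies, so a
# multi-node cluster can move and replicate whole shards between nodes.
client.create_collection(
    collection_name="documents",
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
    shard_number=6,
    replication_factor=2,
)
```

With more shards than nodes, the operator has units it can redistribute when the cluster grows or shrinks, which is what the rebalancing strategies described above act on.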
-However, we tried our best, kept scrolling the docs up and down, experimented with combinations of different configurations, and gave all of them an equal chance to stand out. If you believe you can do it better than us, our **benchmarks are fully [open-sourced](https://github.com/qdrant/vector-db-benchmark), and contributions are welcome**! +<|page-111-lllmstxt|> +# **Permission Reference** -## [Anchor](https://qdrant.tech/benchmarks/\#what-do-we-measure) What do we measure? +This document outlines the permissions available in Qdrant Cloud. -There are several factors considered while deciding on which database to use. -Of course, some of them support a different subset of functionalities, and those might be a key factor to make the decision. -But in general, we all care about the search precision, speed, and resources required to achieve it. +--- -There is one important thing - **the speed of the vector databases should to be compared only if they achieve the same precision**. Otherwise, they could maximize the speed factors by providing inaccurate results, which everybody would rather avoid. Thus, our benchmark results are compared only at a specific search precision threshold. +> 💡 When enabling `write:*` permissions in the UI, the corresponding `read:*` permission will also be enabled and non-actionable. This guarantees access to resources after creating and/or updating them. -## [Anchor](https://qdrant.tech/benchmarks/\#how-we-select-hardware) How we select hardware? +## **Identity and Access Management** +Permissions for users, user roles, management keys, and invitations. -In our experiments, we are not focusing on the absolute values of the metrics but rather on a relative comparison of different engines. -What is important is the fact we used the same machine for all the tests. -It was just wiped off between launching different engines. +| Permission | Description | +|------------|------------| +| `read:roles` | View roles in the Access Management page. | +| `write:roles` | Create and modify roles in the Access Management page. | +| `delete:roles` | Remove roles in the Access Management page. | +| `read:management_keys` | View Cloud Management Keys in the Access Management page. | +| `write:management_keys` | Create and manage Cloud Management Keys. | +| `delete:management_keys` | Remove Cloud Management Keys in the Access Management page. | +| `write:invites` | Invite new users to an account and revoke invitations. | +| `read:invites` | View pending invites in an account. | +| `delete:invites` | Remove an invitation. | +| `read:users` | View user details in the profile page.
- Also applicable in User Management and Role details (User tab). | +| `delete:users` | Remove users from an account.
- Applicable in User Management and Role details (User tab). | -We selected an average machine, which you can easily rent from almost any cloud provider. No extra quota or custom configuration is required. +--- -## [Anchor](https://qdrant.tech/benchmarks/\#why-you-are-not-comparing-with-faiss-or-annoy) Why you are not comparing with FAISS or Annoy? +## **Cluster** +Permissions for API Keys, backups, clusters, and backup schedules. -Libraries like FAISS provide a great tool to do experiments with vector search. But they are far away from real usage in production environments. -If you are using FAISS in production, in the best case, you never need to update it in real-time. In the worst case, you have to create your custom wrapper around it to support CRUD, high availability, horizontal scalability, concurrent access, and so on. +### **API Keys** +| Permission | Description | +|------------|------------| +| `read:api_keys` | View Database API Keys for Managed Cloud clusters. | +| `write:api_keys` | Create new Database API Keys for Managed Cloud clusters. | +| `delete:api_keys` | Remove Database API Keys for Managed Cloud clusters. | -Some vector search engines even use FAISS under the hood, but a search engine is much more than just an indexing algorithm. +### **Backups** +| Permission | Description | +|------------|------------| +| `read:backups` | View backups in the **Backups page** and **Cluster details > Backups tab**. | +| `write:backups` | Create backups from the **Backups page** and **Cluster details > Backups tab**. | +| `delete:backups` | Remove backups from the **Backups page** and **Cluster details > Backups tab**. | -We do, however, use the same benchmark datasets as the famous [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks), so you can align your expectations for any practical reasons. +### **Clusters** +| Permission | Description | +|------------|------------| +| `read:clusters` | View cluster details. | +| `write:clusters` | Modify cluster settings. | +| `delete:clusters` | Delete clusters. | -### [Anchor](https://qdrant.tech/benchmarks/\#why-we-decided-to-test-with-the-python-client) Why we decided to test with the Python client +### **Cluster Data** +| Permission | Description | +|------------|------------| +| `read:cluster_data` | View cluster data, used for the Cluster UI button on Cluster Details. [Maps to global `read-only` JWT access for the cluster.](/documentation/guides/security/) | +| `write:cluster_data` | View and modify cluster data, used for the Cluster UI button on Cluster Details. [Maps to global `read-write` JWT access for the cluster.](/documentation/guides/security/) | -There is no consensus when it comes to the best technology to run benchmarks. You’re free to choose Go, Java or Rust-based systems. But there are two main reasons for us to use Python for this: +### **Backup Schedules** +| Permission | Description | +|------------|------------| +| `read:backup_schedules` | View backup schedules in the **Backups page** and **Cluster details > Backups tab**. | +| `write:backup_schedules` | Create backup schedules from the **Backups page** and **Cluster details > Backups tab**. | +| `delete:backup_schedules` | Remove backup schedules from the **Backups page** and **Cluster details > Backups tab**. | -1. While generating embeddings you’re most likely going to use Python and python based ML frameworks. -2. Based on GitHub stars, python clients are one of the most popular clients across all the engines. 
+--- -From the user’s perspective, the crucial thing is the latency perceived while using a specific library - in most cases a Python client. -Nobody can and even should redefine the whole technology stack, just because of using a specific search tool. -That’s why we decided to focus primarily on official Python libraries, provided by the database authors. -Those may use some different protocols under the hood, but at the end of the day, we do not care how the data is transferred, as long as it ends up in the target location. +## **Hybrid Cloud** +Permissions for Hybrid Cloud environments. -## [Anchor](https://qdrant.tech/benchmarks/\#what-about-closed-source-saas-platforms) What about closed-source SaaS platforms? +| Permission | Description | +|------------|------------| +| `read:hybrid_cloud_environments` | View Hybrid Cloud environment details. | +| `write:hybrid_cloud_environments` | Modify Hybrid Cloud environment settings. | +| `delete:hybrid_cloud_environments` | Delete Hybrid Cloud environments. | -There are some vector databases available as SaaS only so that we couldn’t test them on the same machine as the rest of the systems. -That makes the comparison unfair. That’s why we purely focused on testing the Open Source vector databases, so everybody may reproduce the benchmarks easily. +--- -This is not the final list, and we’ll continue benchmarking as many different engines as possible. +## **Payment & Billing** +Permissions for payment methods and billing information. -## [Anchor](https://qdrant.tech/benchmarks/\#how-to-reproduce-the-benchmark) How to reproduce the benchmark? +| Permission | Description | +|------------|------------| +| `read:payment_information` | View payment methods and billing details. | +| `write:payment_information` | Modify or remove payment methods and billing details. | -The source code is available on [Github](https://github.com/qdrant/vector-db-benchmark) and has a `README.md` file describing the process of running the benchmark for a specific engine. +--- -## [Anchor](https://qdrant.tech/benchmarks/\#how-to-contribute) How to contribute? +## **Account Management** +Permissions for managing user accounts. -We made the benchmark Open Source because we believe that it has to be transparent. We could have misconfigured one of the engines or just done it inefficiently. If you feel like you could help us out, check out our [benchmark repository](https://github.com/qdrant/vector-db-benchmark). +| Permission | Description | +|------------|------------| +| `read:account` | View account details that the user is a part of. | +| `write:account` | Modify account details such as:
- Editing the account name
- Setting an account as default
- Leaving an account
**(Only available to Owners)** | +| `delete:account` | Remove an account from:
- The **Profile page** (list of user accounts).
- The **active account** (if the user is an owner/admin). | -Up! +--- -<|page-76-lllmstxt|> -## fastembed-quickstart -- [Documentation](https://qdrant.tech/documentation/) -- [Fastembed](https://qdrant.tech/documentation/fastembed/) -- Quickstart +## **Profile** +Permissions for accessing personal profile information. -# [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#how-to-generate-text-embedings-with-fastembed) How to Generate Text Embedings with FastEmbed +| Permission | Description | +|------------|------------| +| `read:profile` | View the user’s own profile information.
**(Assigned to all users by default)** | -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#install-fastembed) Install FastEmbed +--- -```python -pip install fastembed +<|page-112-lllmstxt|> +# Build a Hybrid Search Service with FastEmbed and Qdrant -``` +| Time: 20 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/qdrant_demo/) | +| --- | ----------- | ----------- |----------- | -Just for demo purposes, you will use Lists and NumPy to work with sample data. +This tutorial shows you how to build and deploy your own hybrid search service to look through descriptions of companies from [startups-list.com](https://www.startups-list.com/) and pick the most similar ones to your query. +The website contains the company names, descriptions, locations, and a picture for each entry. -```python -from typing import List -import numpy as np +As we have already written on our [blog](/articles/hybrid-search/), there is no single definition of hybrid search. +In this tutorial we are covering the case with a combination of dense and [sparse embeddings](/articles/sparse-vectors/). +The former ones refer to the embeddings generated by such well-known neural networks as BERT, while the latter ones are more related to a traditional full-text search approach. -``` +Our hybrid search service will use [Fastembed](https://github.com/qdrant/fastembed) package to generate embeddings of text descriptions and [FastAPI](https://fastapi.tiangolo.com/) to serve the search API. +Fastembed natively integrates with Qdrant client, so you can easily upload the data into Qdrant and perform search queries. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#load-default-model) Load default model +![Hybrid Search Schema](/documentation/tutorials/hybrid-search-with-fastembed/hybrid-search-schema.png) -In this example, you will use the default text embedding model, `BAAI/bge-small-en-v1.5`. -```python -from fastembed import TextEmbedding +## Workflow -``` +To create a hybrid search service, you will need to transform your raw data and then create a search function to manipulate it. +First, you will 1) download and prepare a sample dataset using a modified version of the BERT ML model. Then, you will 2) load the data into Qdrant, 3) create a hybrid search API and 4) serve it using FastAPI. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#add-sample-data) Add sample data +![Hybrid Search Workflow](/docs/workflow-neural-search.png) -Now, add two sample documents. Your documents must be in a list, and each document must be a string +## Prerequisites -```python -documents: List[str] = [\ - "FastEmbed is lighter than Transformers & Sentence-Transformers.",\ - "FastEmbed is supported by and maintained by Qdrant.",\ -] +To complete this tutorial, you will need: -``` +- Docker - The easiest way to use Qdrant is to run a pre-built Docker image. +- [Raw parsed data](https://storage.googleapis.com/generall-shared-data/startups_demo.json) from startups-list.com. +- Python version >=3.9 -Download and initialize the model. Print a message to verify the process. +## Prepare sample dataset -```python -embedding_model = TextEmbedding() -print("The model BAAI/bge-small-en-v1.5 is ready to use.") +To conduct a hybrid search on startup descriptions, you must first encode the description data into vectors. +Fastembed integration into qdrant client combines encoding and uploading into a single step. 
-``` +It also takes care of batching and parallelization, so you don't have to worry about it. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#embed-data) Embed data +Let's start by downloading the data and installing the necessary packages. -Generate embeddings for both documents. -```python -embeddings_generator = embedding_model.embed(documents) -embeddings_list = list(embeddings_generator) -len(embeddings_list[0]) +1. First you need to download the dataset. +```bash +wget https://storage.googleapis.com/generall-shared-data/startups_demo.json ``` -Here is the sample document list. The default model creates vectors with 384 dimensions. - -```bash -Document: This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc. -Vector of type: with shape: (384,) -Document: fastembed is supported by and maintained by Qdrant. -Vector of type: with shape: (384,) +## Run Qdrant in Docker -``` +Next, you need to manage all of your data using a vector engine. Qdrant lets you store, update or delete created vectors. Most importantly, it lets you search for the nearest vectors via a convenient API. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/\#visualize-embeddings) Visualize embeddings +> **Note:** Before you begin, create a project directory and a virtual python environment in it. -```python -print("Embeddings:\n", embeddings_list) +1. Download the Qdrant image from DockerHub. +```bash +docker pull qdrant/qdrant ``` - -The embeddings don’t look too interesting, but here is a visual. +2. Start Qdrant inside of Docker. ```bash -Embeddings: - [[-0.11154681 0.00976555 0.00524559 0.01951888 -0.01934952 0.02943449\ - -0.10519084 -0.00890122 0.01831438 0.01486796 -0.05642502 0.02561352\ - -0.00120165 0.00637456 0.02633459 0.0089221 0.05313658 0.03955453\ - -0.04400245 -0.02929407 0.04691846 -0.02515868 0.00778646 -0.05410657\ -...\ - -0.00243012 -0.01820582 0.02938612 0.02108984 -0.02178085 0.02971899\ - -0.00790564 0.03561783 0.0652488 -0.04371546 -0.05550042 0.02651665\ - -0.01116153 -0.01682246 -0.05976734 -0.03143916 0.06522726 0.01801389\ - -0.02611006 0.01627177 -0.0368538 0.03968835 0.027597 0.03305927]] +docker run -p 6333:6333 \ + -v $(pwd)/qdrant_storage:/qdrant/storage \ + qdrant/qdrant +``` +You should see output like this +```text +... +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers +[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 ``` -##### Was this page useful? +Test the service by going to [http://localhost:6333/](http://localhost:6333/). You should see the Qdrant version info in your browser. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +All data uploaded to Qdrant is saved inside the `./qdrant_storage` directory and will be persisted even if you recreate the container. -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-quickstart.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Upload data to Qdrant -On this page: +1. Install the official Python client to best interact with Qdrant. 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-quickstart.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```bash +pip install "qdrant-client[fastembed]>=1.14.2" +``` +> **Note:** This tutorial requires fastembed of version >=0.6.1. -× +At this point, you should have startup records in the `startups_demo.json` file and Qdrant running on a local machine. -[Powered by](https://qdrant.tech/) +Now you need to write a script to upload all startup data and vectors into the search engine. -<|page-77-lllmstxt|> -## collaborative-filtering -- [Documentation](https://qdrant.tech/documentation/) -- [Advanced tutorials](https://qdrant.tech/documentation/advanced-tutorials/) -- Build a Recommendation System with Collaborative Filtering +2. Create a client object for Qdrant. -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#use-collaborative-filtering-to-build-a-movie-recommendation-system-with-qdrant) Use Collaborative Filtering to Build a Movie Recommendation System with Qdrant +```python +# Import client library +from qdrant_client import QdrantClient, models -| Time: 45 min | Level: Intermediate | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb) | | -| --- | --- | --- | --- | +client = QdrantClient(url="http://localhost:6333") +``` -Every time Spotify recommends the next song from a band you’ve never heard of, it uses a recommendation algorithm based on other users’ interactions with that song. This type of algorithm is known as **collaborative filtering**. -Unlike content-based recommendations, collaborative filtering excels when the objects’ semantics are loosely or unrelated to users’ preferences. This adaptability is what makes it so fascinating. Movie, music, or book recommendations are good examples of such use cases. After all, we rarely choose which book to read purely based on the plot twists. +3. Choose models to encode your data and prepare collections. -The traditional way to build a collaborative filtering engine involves training a model that converts the sparse matrix of user-to-item relations into a compressed, dense representation of user and item vectors. Some of the most commonly referenced algorithms for this purpose include [SVD (Singular Value Decomposition)](https://en.wikipedia.org/wiki/Singular_value_decomposition) and [Factorization Machines](https://en.wikipedia.org/wiki/Matrix_factorization_%28recommender_systems%29). However, the model training approach requires significant resource investments. Model training necessitates data, regular re-training, and a mature infrastructure. +In this tutorial, we will be using two pre-trained models to compute dense and sparse vectors correspondingly +The models are: `sentence-transformers/all-MiniLM-L6-v2` and `prithivida/Splade_PP_en_v1`. +As soon as the choice is made, we need to configure a collection in Qdrant. 
-## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#methodology) Methodology +```python +dense_vector_name = "dense" +sparse_vector_name = "sparse" +dense_model_name = "sentence-transformers/all-MiniLM-L6-v2" +sparse_model_name = "prithivida/Splade_PP_en_v1" +if not client.collection_exists("startups"): + client.create_collection( + collection_name="startups", + vectors_config={ + dense_vector_name: models.VectorParams( + size=client.get_embedding_size(dense_model_name), + distance=models.Distance.COSINE + ) + }, # size and distance are model dependent + sparse_vectors_config={sparse_vector_name: models.SparseVectorParams()}, + ) +``` -Fortunately, there is a way to build collaborative filtering systems without any model training. You can obtain interpretable recommendations and have a scalable system using a technique based on similarity search. Let’s explore how this works with an example of building a movie recommendation system. +Qdrant requires vectors to have their own names and configurations. +Parameters `size` and `distance` are mandatory, however, you can additionaly specify extended configuration for your vectors, like `quantization_config` or `hnsw_config`. -Recommendation system with Qdrant and sparse vectors (Collaborative Filtering) - YouTube -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +4. Read data from the file. -Qdrant - Vector Database & Search Engine +```python +import json -8.12K subscribers +payload_path = "startups_demo.json" +documents = [] +metadata = [] -[Recommendation system with Qdrant and sparse vectors (Collaborative Filtering)](https://www.youtube.com/watch?v=9B7RrmQCQeQ) +with open(payload_path) as fd: + for line in fd: + obj = json.loads(line) + description = obj["description"] + dense_document = models.Document(text=description, model=dense_model_name) + sparse_document = models.Document(text=description, model=sparse_model_name) + documents.append( + { + dense_vector_name: dense_document, + sparse_vector_name: sparse_document, + } + ) + metadata.append(obj) +``` -Qdrant - Vector Database & Search Engine +In this block of code, we read data from `startups_demo.json` file and split it into two list: `documents` and `metadata`. +Documents are models with descriptions of startups and model names to embed data. Metadata is payload associated with each startup, such as the name, location, and picture. +We will use `documents` to encode the data into vectors. -Search +6. Encode and upload data. -Watch later +```python + client.upload_collection( + collection_name="startups", + vectors=tqdm.tqdm(documents), + payload=metadata, + parallel=4, # Use 4 CPU cores to encode data. + # This will spawn a model per process, which might be memory expensive + # Make sure that your system does not use swap, and reduce the amount + # # of processes if it does. + # Otherwise, it might significantly slow down the process. + # Requires wrapping code into if __name__ == '__main__' block + ) +``` -Share + -Copy link +
+ Upload processed data -Info +Download and unpack the processed data from [here](https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz) or use the following script: -Shopping +```bash +wget https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz +tar -xvf startups_hybrid_search_processed_40k.tar.gz +``` -Tap to unmute +Then you can upload the data to Qdrant. -If playback doesn't begin shortly, try restarting your device. +```python +import json +import numpy as np -More videos -## More videos +def named_vectors( + vectors: list[float], + sparse_vectors: list[models.SparseVector] +) -> dict: + for vector, sparse_vector in zip(vectors, sparse_vectors): + yield { + dense_vector_name: vector, + sparse_vector_name: models.SparseVector(**sparse_vector), + } -You're signed out -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +with open("dense_vectors.npy", "rb") as f: + vectors = np.load(f) +with open("sparse_vectors.json", "r") as f: + sparse_vectors = json.load(f) -CancelConfirm +with open("payload.json", "r") as f: + payload = json.load(f) -Share +client.upload_collection( + "startups", + vectors=named_vectors(vectors, sparse_vectors), + payload=payload +) +``` +
-Include playlist +The `upload_collection` method will encode all documents and upload them to Qdrant. -An error occurred while retrieving sharing information. Please try again later. +The `parallel` parameter enables data-parallelism instead of built-in ONNX parallelism. -[Watch on](https://www.youtube.com/watch?v=9B7RrmQCQeQ&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +Additionally, you can specify ids for each document, if you want to use them later to update or delete documents. +If you don't specify ids, they will be generated automatically. -0:00 +You can monitor the progress of the encoding by passing tqdm progress bar to the `upload_collection` method. -0:00 / 3:55 -‱Live +```python +from tqdm import tqdm -‱ +client.upload_collection( + collection_name="startups", + vectors=documents, + payload=metadata, + ids=tqdm(range(len(documents))), +) +``` -[Watch on YouTube](https://www.youtube.com/watch?v=9B7RrmQCQeQ "Watch on YouTube") +## Build the search API -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#implementation) Implementation +Now that all the preparations are complete, let's start building a neural search class. -To implement this, you will use a simple yet powerful resource: [Qdrant with Sparse Vectors](https://qdrant.tech/articles/sparse-vectors/). +In order to process incoming requests, the hybrid search class will need 3 things: 1) models to convert the query into a vector, 2) the Qdrant client to perform search queries, 3) fusion function to re-rank dense and sparse search results. -Notebook: [You can try this code here](https://githubtocolab.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb) +Qdrant supports 2 fusion functions for combining the results: [reciprocal rank fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) and [distribution based score fusion](https://qdrant.tech/documentation/concepts/hybrid-queries/?q=distribution+based+sc#:~:text=Distribution%2DBased%20Score%20Fusion) -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#setup) Setup -You have to first import the necessary libraries and define the environment. +1. Create a file named `hybrid_searcher.py` and specify the following. ```python -import os -import pandas as pd -import requests from qdrant_client import QdrantClient, models -from qdrant_client.models import PointStruct, SparseVector, NamedSparseVector -from collections import defaultdict -# OMDB API Key - for movie posters -omdb_api_key = os.getenv("OMDB_API_KEY") - -# Collection name -collection_name = "movies" - -# Set Qdrant Client -qdrant_client = QdrantClient( - os.getenv("QDRANT_HOST"), - api_key=os.getenv("QDRANT_API_KEY") -) +class HybridSearcher: + DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2" + SPARSE_MODEL = "prithivida/Splade_PP_en_v1" + + def __init__(self, collection_name): + self.collection_name = collection_name + self.qdrant_client = QdrantClient() ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#define-output) Define output - -Here, you will configure the recommendation engine to retrieve movie posters as output. +2. Write the search function. 
```python -# Function to get movie poster using OMDB API -def get_movie_poster(imdb_id, api_key): - url = f"https://www.omdbapi.com/?i={imdb_id}&apikey={api_key}" - data = requests.get(url).json() - return data.get('Poster'), data +def search(self, text: str): + search_result = self.qdrant_client.query_points( + collection_name=self.collection_name, + query=models.FusionQuery( + fusion=models.Fusion.RRF # we are using reciprocal rank fusion here + ), + prefetch=[ + models.Prefetch( + query=models.Document(text=text, model=self.DENSE_MODEL), + using=dense_vector_name, + ), + models.Prefetch( + query=models.Document(text=text, model=self.SPARSE_MODEL), + using=sparse_vector_name, + ), + ], + query_filter=None, # If you don't want any filters for now + limit=5, # 5 the closest results + ).points + # `search_result` contains models.QueryResponse structure + # We can access list of scored points with the corresponding similarity scores, + # vectors (if `with_vectors` was set to `True`), and payload via `points` attribute. + + # Select and return metadata + metadata = [point.payload for point in search_result] + return metadata ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#prepare-the-data) Prepare the data +3. Add search filters. -Load the movie datasets. These include three main CSV files: user ratings, movie titles, and OMDB IDs. +With Qdrant it is also feasible to add some conditions to the search. +For example, if you wanted to search for startups in a certain city, the search query could look like this: ```python -# Load CSV files -ratings_df = pd.read_csv('data/ratings.csv', low_memory=False) -movies_df = pd.read_csv('data/movies.csv', low_memory=False) - -# Convert movieId in ratings_df and movies_df to string -ratings_df['movieId'] = ratings_df['movieId'].astype(str) -movies_df['movieId'] = movies_df['movieId'].astype(str) - -rating = ratings_df['rating'] - -# Normalize ratings -ratings_df['rating'] = (rating - rating.mean()) / rating.std() - -# Merge ratings with movie metadata to get movie titles -merged_df = ratings_df.merge( - movies_df[['movieId', 'title']], - left_on='movieId', right_on='movieId', how='inner' -) - -# Aggregate ratings to handle duplicate (userId, title) pairs -ratings_agg_df = merged_df.groupby(['userId', 'movieId']).rating.mean().reset_index() - -ratings_agg_df.head() - -``` - -| | userId | movieId | rating | -| --- | --- | --- | --- | -| 0 | 1 | 1 | 0.429960 | -| 1 | 1 | 1036 | 1.369846 | -| 2 | 1 | 1049 | -0.509926 | -| 3 | 1 | 1066 | 0.429960 | -| 4 | 1 | 110 | 0.429960 | - -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#convert-to-sparse) Convert to sparse + ... -If you want to search across numerous reviews from different users, you can represent these reviews in a sparse matrix. + city_of_interest = "Berlin" -```python -# Convert ratings to sparse vectors -user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []}) -for row in ratings_agg_df.itertuples(): - user_sparse_vectors[row.userId]["values"].append(row.rating) - user_sparse_vectors[row.userId]["indices"].append(int(row.movieId)) + # Define a filter for cities + city_filter = models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue(value=city_of_interest) + ) + ] + ) + # NOTE: it is not a hybrid search! 
It's just a dense query for simplicity + search_result = self.qdrant_client.query_points( + collection_name=self.collection_name, + query=models.Document(text=text, model=self.DENSE_MODEL), + query_filter=city_filter, + limit=5 + ).points + ... ``` -![collaborative-filtering](https://qdrant.tech/blog/collaborative-filtering/collaborative-filtering.png) +You have now created a class for neural search queries. Now wrap it up into a service. -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#upload-the-data) Upload the data +## Deploy the search with FastAPI -Here, you will initialize the Qdrant client and create a new collection to store the data. -Convert the user ratings to sparse vectors and include the `movieId` in the payload. +To build the service you will use the FastAPI framework. -```python -# Define a data generator -def data_generator(): - for user_id, sparse_vector in user_sparse_vectors.items(): - yield PointStruct( - id=user_id, - vector={"ratings": SparseVector( - indices=sparse_vector["indices"], - values=sparse_vector["values"] - )}, - payload={"user_id": user_id, "movie_id": sparse_vector["indices"]} - ) +1. Install FastAPI. -# Upload points using the data generator -qdrant_client.upload_points( - collection_name=collection_name, - points=data_generator() -) +To install it, use the command +```bash +pip install fastapi uvicorn ``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#define-query) Define query +2. Implement the service. -In order to get recommendations, we need to find users with similar tastes to ours. -Let’s describe our preferences by providing ratings for some of our favorite movies. +Create a file named `service.py` and specify the following. -`1` indicates that we like the movie, `-1` indicates that we dislike it. +The service will have only one API endpoint and will look like this: ```python -my_ratings = { - 603: 1, # Matrix - 13475: 1, # Star Trek - 11: 1, # Star Wars - 1091: -1, # The Thing - 862: 1, # Toy Story - 597: -1, # Titanic - 680: -1, # Pulp Fiction - 13: 1, # Forrest Gump - 120: 1, # Lord of the Rings - 87: -1, # Indiana Jones - 562: -1 # Die Hard -} +from fastapi import FastAPI -``` +# The file where HybridSearcher is stored +from hybrid_searcher import HybridSearcher -Click to see the code for `to_vector` +app = FastAPI() -```python -# Create sparse vector from my_ratings -def to_vector(ratings): - vector = SparseVector( - values=[], - indices=[] - ) - for movie_id, rating in ratings.items(): - vector.values.append(rating) - vector.indices.append(movie_id) - return vector +# Create a neural searcher instance +hybrid_searcher = HybridSearcher(collection_name="startups") -``` -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#run-the-query) Run the query +@app.get("/api/search") +def search_startup(q: str): + return {"result": hybrid_searcher.search(text=q)} -From the uploaded list of movies with ratings, we can perform a search in Qdrant to get the top most similar users to us. -```python -# Perform the search -results = qdrant_client.query_points( - collection_name=collection_name, - query=to_vector(my_ratings), - using="ratings", - limit=20 -).points +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) ``` -Now we can find the movies liked by the other similar users, but we haven’t seen yet. 
-Let’s combine the results from found users, filter out seen movies, and sort by the score. - -```python -# Convert results to scores and sort by score -def results_to_scores(results): - movie_scores = defaultdict(lambda: 0) - for result in results: - for movie_id in result.payload["movie_id"]: - movie_scores[movie_id] += result.score - return movie_scores - -# Convert results to scores and sort by score -movie_scores = results_to_scores(results) -top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True) +3. Run the service. +```bash +python service.py ``` -Visualize results in Jupyter Notebook - -Finally, we display the top 5 recommended movies along with their posters and titles. +4. Open your browser at [http://localhost:8000/docs](http://localhost:8000/docs). -```python -# Create HTML to display top 5 results -html_content = "
" +You should be able to see a debug interface for your service. -for movie_id, score in top_movies[:5]: - imdb_id_row = links.loc[links['movieId'] == int(movie_id), 'imdbId'] - if not imdb_id_row.empty: - imdb_id = imdb_id_row.values[0] - poster_url, movie_info = get_movie_poster(imdb_id, omdb_api_key) - movie_title = movie_info.get('Title', 'Unknown Title') +![FastAPI Swagger interface](/docs/fastapi_neural_search.png) - html_content += f""" -
-        [HTML card markup stripped: poster image, {movie_title}, Score: {score}]
- """ - else: - continue # Skip if imdb_id is not found +Feel free to play around with it, make queries regarding the companies in our corpus, and check out the results. -html_content += "
" +Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, publish other examples of neural networks and neural search applications. -display(HTML(html_content)) +<|page-113-lllmstxt|> +# Introduction -``` +Vector databases are a relatively new way for interacting with abstract data representations +derived from opaque machine learning models such as deep learning architectures. These +representations are often called vectors or embeddings and they are a compressed version of +the data used to train a machine learning model to accomplish a task like sentiment analysis, +speech recognition, object detection, and many others. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#recommendations) Recommendations +These new databases shine in many applications like [semantic search](https://en.wikipedia.org/wiki/Semantic_search) +and [recommendation systems](https://en.wikipedia.org/wiki/Recommender_system), and here, we'll +learn about one of the most popular and fastest growing vector databases in the market, [Qdrant](https://github.com/qdrant/qdrant). -For a complete display of movie posters, check the [notebook output](https://github.com/qdrant/examples/blob/master/collaborative-filtering/collaborative-filtering.ipynb). Here are the results without html content. +## What is Qdrant? -```text -Toy Story, Score: 131.2033799 -Monty Python and the Holy Grail, Score: 131.2033799 -Star Wars: Episode V - The Empire Strikes Back, Score: 131.2033799 -Star Wars: Episode VI - Return of the Jedi, Score: 131.2033799 -Men in Black, Score: 131.2033799 +[Qdrant](https://github.com/qdrant/qdrant) "is a vector similarity search engine that provides a production-ready +service with a convenient API to store, search, and manage points (i.e. vectors) with an additional +payload." You can think of the payloads as additional pieces of information that can help you +hone in on your search and also receive useful information that you can give to your users. -``` +You can get started using Qdrant with the Python `qdrant-client`, by pulling the latest docker +image of `qdrant` and connecting to it locally, or by trying out [Qdrant's Cloud](https://cloud.qdrant.io/) +free tier option until you are ready to make the full switch. -On top of collaborative filtering, we can further enhance the recommendation system by incorporating other features like user demographics, movie genres, or movie tags. +With that out of the way, let's talk about what are vector databases. -Or, for example, only consider recent ratings via a time-based filter. This way, we can recommend movies that are currently popular among users. +## What Are Vector Databases? -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/\#conclusion) Conclusion +![dbs](https://raw.githubusercontent.com/ramonpzg/mlops-sydney-2023/main/images/databases.png) -As demonstrated, it is possible to build an interesting movie recommendation system without intensive model training using Qdrant and Sparse Vectors. This approach not only simplifies the recommendation process but also makes it scalable and interpretable. In future tutorials, we can experiment more with this combination to further enhance our recommendation systems. +Vector databases are a type of database designed to store and query high-dimensional vectors +efficiently. 
In traditional [OLTP](https://www.ibm.com/topics/oltp) and [OLAP](https://www.ibm.com/topics/olap) +databases (as seen in the image above), data is organized in rows and columns (and these are +called **Tables**), and queries are performed based on the values in those columns. However, +in certain applications including image recognition, natural language processing, and recommendation +systems, data is often represented as vectors in a high-dimensional space, and these vectors, plus +an id and a payload we call a point. These points are the elements we store in something called a **Collection** within a vector +database like Qdrant. -##### Was this page useful? +A vector in this context is a mathematical representation of an object or data point, where elements of +the vector implicitly or explicitly correspond to specific features or attributes of the object. For example, +in an image recognition system, a vector could represent an image, with each element of the vector +representing a pixel value or a descriptor/characteristic of that pixel. In a music recommendation +system, each vector could represent a song, and elements of the vector would capture song characteristics +such as tempo, genre, lyrics, and so on. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Vector databases are optimized for **storing** and **querying** these high-dimensional vectors +efficiently, and they often use specialized data structures and indexing techniques such as +Hierarchical Navigable Small World (HNSW) -- which is used to implement Approximate Nearest +Neighbors -- and Product Quantization, among others. These databases enable fast similarity +and semantic search while allowing users to find vectors that are the closest to a given query +vector based on some distance metric. The most commonly used distance metrics are Euclidean +Distance, Cosine Similarity, and Dot Product, and these three are fully supported Qdrant. -Thank you for your feedback! 🙏 +Here's a quick overview of the three: +- [**Cosine Similarity**](https://en.wikipedia.org/wiki/Cosine_similarity) - Cosine similarity +is a way to measure how similar two vectors are. To simplify, it reflects whether the vectors +have the same direction (similar) or are poles apart. Cosine similarity is often used with text representations +to compare how similar two documents or sentences are to each other. The output of cosine similarity ranges +from -1 to 1, where -1 means the two vectors are completely dissimilar, and 1 indicates maximum similarity. +- [**Dot Product**](https://en.wikipedia.org/wiki/Dot_product) - The dot product similarity metric is another way +of measuring how similar two vectors are. Unlike cosine similarity, it also considers the length of the vectors. +This might be important when, for example, vector representations of your documents are built +based on the term (word) frequencies. The dot product similarity is calculated by multiplying the respective values +in the two vectors and then summing those products. The higher the sum, the more similar the two vectors are. +If you normalize the vectors (so the numbers in them sum up to 1), the dot product similarity will become +the cosine similarity. +- [**Euclidean Distance**](https://en.wikipedia.org/wiki/Euclidean_distance) - Euclidean +distance is a way to measure the distance between two points in space, similar to how we +measure the distance between two places on a map. 
It's calculated by finding the square root +of the sum of the squared differences between the two points' coordinates. This distance metric +is also commonly used in machine learning to measure how similar or dissimilar two vectors are. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/collaborative-filtering.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Now that we know what vector databases are and how they are structurally different than other +databases, let's go over why they are important. -On this page: +## Why do we need Vector Databases? -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/collaborative-filtering.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Vector databases play a crucial role in various applications that require similarity search, such +as recommendation systems, content-based image retrieval, and personalized search. By taking +advantage of their efficient indexing and searching techniques, vector databases enable faster +and more accurate retrieval of unstructured data already represented as vectors, which can +help put in front of users the most relevant results to their queries. -× +In addition, other benefits of using vector databases include: +1. Efficient storage and indexing of high-dimensional data. +3. Ability to handle large-scale datasets with billions of data points. +4. Support for real-time analytics and queries. +5. Ability to handle vectors derived from complex data types such as images, videos, and natural language text. +6. Improved performance and reduced latency in machine learning and AI applications. +7. Reduced development and deployment time and cost compared to building a custom solution. -[Powered by](https://qdrant.tech/) +Keep in mind that the specific benefits of using a vector database may vary depending on the +use case of your organization and the features of the database you ultimately choose. -<|page-78-lllmstxt|> -## search-as-you-type -- [Articles](https://qdrant.tech/articles/) -- Semantic Search As You Type +Let's now evaluate, at a high-level, the way Qdrant is architected. -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +## High-Level Overview of Qdrant's Architecture -# Semantic Search As You Type +![qdrant](https://raw.githubusercontent.com/ramonpzg/mlops-sydney-2023/main/images/qdrant_overview_high_level.png) -Andre Bogus +The diagram above represents a high-level overview of some of the main components of Qdrant. Here +are the terminologies you should get familiar with. -· +- [Collections](/documentation/concepts/collections/): A collection is a named set of points (vectors with a payload) among which you can search. The vector of each point within the same collection must have the same dimensionality and be compared by a single metric. [Named vectors](/documentation/concepts/collections/#collection-with-multiple-vectors) can be used to have multiple vectors in a single point, each of which can have their own dimensionality and metric requirements. +- [Distance Metrics](https://en.wikipedia.org/wiki/Metric_space): These are used to measure +similarities among vectors and they must be selected at the same time you are creating a +collection. 
The choice of metric depends on the way the vectors were obtained and, in particular, +on the neural network that will be used to encode new queries. +- [Points](/documentation/concepts/points/): The points are the central entity that +Qdrant operates with and they consist of a vector and an optional id and payload. + - id: a unique identifier for your vectors. + - Vector: a high-dimensional representation of data, for example, an image, a sound, a document, a video, etc. + - [Payload](/documentation/concepts/payload/): A payload is a JSON object with additional data you can add to a vector. +- [Storage](/documentation/concepts/storage/): Qdrant can use one of two options for +storage, **In-memory** storage (Stores all vectors in RAM, has the highest speed since disk +access is required only for persistence), or **Memmap** storage, (creates a virtual address +space associated with the file on disk). +- Clients: the programming languages you can use to connect to Qdrant. -August 14, 2023 +## Next Steps -![Semantic Search As You Type](https://qdrant.tech/articles_data/search-as-you-type/preview/title.jpg) +Now that you know more about vector databases and Qdrant, you are ready to get started with one +of our tutorials. If you've never used a vector database, go ahead and jump straight into +the **Getting Started** section. Conversely, if you are a seasoned developer in these +technology, jump to the section most relevant to your use case. -Qdrant is one of the fastest vector search engines out there, so while looking for a demo to show off, we came upon the idea to do a search-as-you-type box with a fully semantic search backend. Now we already have a semantic/keyword hybrid search on our website. But that one is written in Python, which incurs some overhead for the interpreter. Naturally, I wanted to see how fast I could go using Rust. +As you go through the tutorials, please let us know if any questions come up in our +[Discord channel here](https://qdrant.to/discord). 😎 -Since Qdrant doesn’t embed by itself, I had to decide on an embedding model. The prior version used the [SentenceTransformers](https://www.sbert.net/) package, which in turn employs Bert-based [All-MiniLM-L6-V2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/tree/main) model. This model is battle-tested and delivers fair results at speed, so not experimenting on this front I took an [ONNX version](https://huggingface.co/optimum/all-MiniLM-L6-v2/tree/main) and ran that within the service. +<|page-114-lllmstxt|> +![agentic-rag-camelai-astronaut](/documentation/examples/agentic-rag-camelai-discord/astronaut-main.png) -The workflow looks like this: +# Agentic RAG Discord ChatBot with Qdrant, CAMEL-AI, & OpenAI -![Search Qdrant by Embedding](https://qdrant.tech/articles_data/search-as-you-type/Qdrant_Search_by_Embedding.png) +| Time: 45 min | Level: Intermediate | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Ymqzm6ySoyVOekY7fteQBCFCXYiYyHxw#scrollTo=QQZXwzqmNfaS) | +| --- | ----------- | ----------- |----------- | -This will, after tokenizing and embedding send a `/collections/site/points/search` POST request to Qdrant, sending the following JSON: -```json -POST collections/site/points/search -{ - "vector": [-0.06716014,-0.056464013, ...(382 values omitted)], - "limit": 5, - "with_payload": true, -} -``` -Even with avoiding a network round-trip, the embedding still takes some time. 
As always in optimization, if you cannot do the work faster, a good solution is to avoid work altogether (please don’t tell my employer). This can be done by pre-computing common prefixes and calculating embeddings for them, then storing them in a `prefix_cache` collection. Now the [`recommend`](https://api.qdrant.tech/api-reference/search/recommend-points) API method can find the best matches without doing any embedding. For now, I use short (up to and including 5 letters) prefixes, but I can also parse the logs to get the most common search terms and add them to the cache later. +Unlike traditional RAG techniques, which passively retrieve context and generate responses, **agentic RAG** involves active decision-making and multi-step reasoning by the chatbot. Instead of just fetching data, the chatbot makes decisions, dynamically interacts with various data sources, and adapts based on context, giving it a much more dynamic and intelligent approach. -![Qdrant Recommendation](https://qdrant.tech/articles_data/search-as-you-type/Qdrant_Recommendation.png) +In this tutorial, we’ll develop a fully functional chatbot using Qdrant, [CAMEL-AI](https://www.camel-ai.org/), and [OpenAI](https://openai.com/). -Making that work requires setting up the `prefix_cache` collection with points that have the prefix as their `point_id` and the embedding as their `vector`, which lets us do the lookup with no search or index. The `prefix_to_id` function currently uses the `u64` variant of `PointId`, which can hold eight bytes, enough for this use. If the need arises, one could instead encode the names as UUID, hashing the input. Since I know all our prefixes are within 8 bytes, I decided against this for now. +Let’s get started! -The `recommend` endpoint works roughly the same as `search_points`, but instead of searching for a vector, Qdrant searches for one or more points (you can also give negative example points the search engine will try to avoid in the results). It was built to help drive recommendation engines, saving the round-trip of sending the current point’s vector back to Qdrant to find more similar ones. However Qdrant goes a bit further by allowing us to select a different collection to lookup the points, which allows us to keep our `prefix_cache` collection separate from the site data. So in our case, Qdrant first looks up the point from the `prefix_cache`, takes its vector and searches for that in the `site` collection, using the precomputed embeddings from the cache. The API endpoint expects a POST of the following JSON to `/collections/site/points/recommend`: +--- -```json -POST collections/site/points/recommend -{ - "positive": [1936024932], - "limit": 5, - "with_payload": true, - "lookup_from": { - "collection": "prefix_cache" - } -} +## Workflow Overview -``` +Below is a high-level look at our Agentic RAG workflow: -Now I have, in the best Rust tradition, a blazingly fast semantic search. -To demo it, I used our [Qdrant documentation website](https://qdrant.tech/documentation/)’s page search, replacing our previous Python implementation. So in order to not just spew empty words, here is a benchmark, showing different queries that exercise different code paths. +| Step | Description | +|------|-------------------------------------------------------------------------------------------------------------------| +| **1. Environment Setup** | Install required libraries (`camel-ai`, `qdrant-client`, `discord.py`) and set up the Python environment. | +| **2. 
Set Up the OpenAI Embedding Instance** | Create an OpenAI account, generate an API key, and configure the embedding model. | +| **3. Configure the Qdrant Client** | Sign up for Qdrant Cloud, create a cluster, configure `QdrantStorage`, and set up the API connection. | +| **4. Scrape and Process Data** | Use `VectorRetriever` to scrape Qdrant documentation, chunk text, and store embeddings in Qdrant. | +| **5. Set Up the CAMEL-AI ChatAgent** | Instantiate a CAMEL-AI `ChatAgent` with OpenAI models for multi-step reasoning and context-aware responses. | +| **6. Create and Configure the Discord Bot** | Register a new bot in the Discord Developer Portal, invite it to a server, and enable permissions. | +| **7. Build the Discord Bot** | Integrate Discord.py with CAMEL-AI and Qdrant to retrieve context and generate intelligent responses. | +| **8. Test the Bot** | Run the bot in a live Discord server and verify that it provides relevant, context-rich answers. | -Since the operations themselves are far faster than the network whose fickle nature would have swamped most measurable differences, I benchmarked both the Python and Rust services locally. I’m measuring both versions on the same AMD Ryzen 9 5900HX with 16GB RAM running Linux. The table shows the average time and error bound in milliseconds. I only measured up to a thousand concurrent requests. None of the services showed any slowdown with more requests in that range. I do not expect our service to become DDOS’d, so I didn’t benchmark with more load. -Without further ado, here are the results: +## Architecture Diagram -| query length | Short | Long | -| --- | --- | --- | -| Python 🐍 | 16 ± 4 ms | 16 ± 4 ms | -| Rust 🩀 | 1œ ± œ ms | 5 ± 1 ms | +Below is the architecture diagram representing the workflow and interactions of the chatbot: -The Rust version consistently outperforms the Python version and offers a semantic search even on few-character queries. If the prefix cache is hit (as in the short query length), the semantic search can even get more than ten times faster than the Python version. The general speed-up is due to both the relatively lower overhead of Rust + Actix Web compared to Python + FastAPI (even if that already performs admirably), as well as using ONNX Runtime instead of SentenceTransformers for the embedding. The prefix cache gives the Rust version a real boost by doing a semantic search without doing any embedding work. +![Architecture Diagram](/documentation/examples/agentic-rag-camelai-discord/diagram_discord_bot.png) -As an aside, while the millisecond differences shown here may mean relatively little for our users, whose latency will be dominated by the network in between, when typing, every millisecond more or less can make a difference in user perception. Also search-as-you-type generates between three and five times as much load as a plain search, so the service will experience more traffic. Less time per request means being able to handle more of them. +The workflow starts by **scraping, chunking, and upserting** content from URLs using the `vector_retriever.process()` method, which generates embeddings with the **OpenAI embedding instance**. These embeddings, along with their metadata, are then indexed and stored in **Qdrant** via the `QdrantStorage` class. -Mission accomplished! But wait, there’s more! 
+When a user sends a query through the **Discord bot**, it is processed by `vector_retriever.query()`, which first embeds the query using **OpenAI Embeddings** and then retrieves the most relevant matches from Qdrant via `QdrantStorage`. The retrieved context (e.g., relevant documentation snippets) is then passed to an **OpenAI-powered Qdrant Agent** under **CAMEL-AI**, which generates a final, context-aware response. -### [Anchor](https://qdrant.tech/articles/search-as-you-type/\#prioritizing-exact-matches-and-headings) Prioritizing Exact Matches and Headings +The Qdrant Agent processes the retrieved vectors using the `GPT_4O_MINI` language model, producing a response that is contextually relevant to the user's query. This response is then sent back to the user through the **Discord bot**, completing the flow. -To improve on the quality of the results, Qdrant can do multiple searches in parallel, and then the service puts the results in sequence, taking the first best matches. The extended code searches: +--- -1. Text matches in titles -2. Text matches in body (paragraphs or lists) -3. Semantic matches in titles -4. Any Semantic matches +## **Step 1: Environment Setup** -Those are put together by taking them in the above order, deduplicating as necessary. +Before diving into the implementation, here's a high-level overview of the stack we'll use: -![merge workflow](https://qdrant.tech/articles_data/search-as-you-type/sayt_merge.png) +| **Component** | **Purpose** | +|-----------------|-------------------------------------------------------------------------------------------------------| +| **Qdrant** | Vector database for storing and querying document embeddings. | +| **OpenAI** | Embedding and language model for generating vector representations and chatbot responses. | +| **CAMEL-AI** | Framework for managing dialogue flow, retrieval, and AI agent interactions. | +| **Discord API** | Platform for deploying and interacting with the chatbot. | -Instead of sending a `search` or `recommend` request, one can also send a `search/batch` or `recommend/batch` request, respectively. Each of those contain a `"searches"` property with any number of search/recommend JSON requests: +### Install Dependencies -```json -POST collections/site/points/search/batch -{ - "searches": [\ - {\ - "vector": [-0.06716014,-0.056464013, ...],\ - "filter": {\ - "must": [\ - { "key": "text", "match": { "text": }},\ - { "key": "tag", "match": { "any": ["h1", "h2", "h3"] }},\ - ]\ - }\ - ...,\ - },\ - {\ - "vector": [-0.06716014,-0.056464013, ...],\ - "filter": {\ - "must": [ { "key": "body", "match": { "text": }} ]\ - }\ - ...,\ - },\ - {\ - "vector": [-0.06716014,-0.056464013, ...],\ - "filter": {\ - "must": [ { "key": "tag", "match": { "any": ["h1", "h2", "h3"] }} ]\ - }\ - ...,\ - },\ - {\ - "vector": [-0.06716014,-0.056464013, ...],\ - ...,\ - },\ - ] -} +We’ll install CAMEL-AI, which includes all necessary dependencies: +```python +!pip install camel-ai[all]==0.2.17 ``` -As the queries are done in a batch request, there isn’t any additional network overhead and only very modest computation overhead, yet the results will be better in many cases. - -The only additional complexity is to flatten the result lists and take the first 5 results, deduplicating by point ID. Now there is one final problem: The query may be short enough to take the recommend code path, but still not be in the prefix cache. In that case, doing the search _sequentially_ would mean two round-trips between the service and the Qdrant instance. 
The solution is to _concurrently_ start both requests and take the first successful non-empty result. -![sequential vs. concurrent flow](https://qdrant.tech/articles_data/search-as-you-type/sayt_concurrency.png) - -While this means more load for the Qdrant vector search engine, this is not the limiting factor. The relevant data is already in cache in many cases, so the overhead stays within acceptable bounds, and the maximum latency in case of prefix cache misses is measurably reduced. +--- -The code is available on the [Qdrant github](https://github.com/qdrant/page-search) +## **Step 2: Set Up the OpenAI Embedding Instance** -To sum up: Rust is fast, recommend lets us use precomputed embeddings, batch requests are awesome and one can do a semantic search in mere milliseconds. +1. **Create an OpenAI Account**: Go to [OpenAI](https://platform.openai.com/signup) and sign up for an account if you don’t already have one. -##### Was this page useful? +2. **Generate an API Key**: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + - After logging in, click on your profile icon in the top-right corner and select **API keys**. -Thank you for your feedback! 🙏 + - Click **Create new secret key**. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/search-as-you-type.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + - Copy the generated API key and store it securely. You won’t be able to see it again. -On this page: +Here’s how to set up the OpenAI client in your code: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/search-as-you-type.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Create a `.env` file in your project directory and add your API key: -× +```bash +OPENAI_API_KEY= +``` -[Powered by](https://qdrant.tech/) +Make sure to replace `` with your actual API key. -<|page-79-lllmstxt|> -## multimodal-search -- [Documentation](https://qdrant.tech/documentation/) -- Multilingual & Multimodal RAG with LlamaIndex +Now, start the OpenAI Client -# [Anchor](https://qdrant.tech/documentation/multimodal-search/\#multilingual--multimodal-search-with-llamaindex) Multilingual & Multimodal Search with LlamaIndex +```python +import openai +import os +from dotenv import load_dotenv -![Snow prints](https://qdrant.tech/documentation/examples/multimodal-search/image-1.png) +load_dotenv() -| Time: 15 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/examples/blob/master/multimodal-search/Multimodal_Search_with_LlamaIndex.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/multimodal-search/Multimodal_Search_with_LlamaIndex.ipynb) | -| --- | --- | --- | --- | +openai_client = openai.Client( + api_key=os.getenv("OPENAI_API_KEY") +) +``` -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#overview) Overview +To set up the embedding instance, we will use text embedding 3 large: -We often understand and share information more effectively when combining different types of data. For example, the taste of comfort food can trigger childhood memories. We might describe a song with just “pam pam clap” sounds. Instead of writing paragraphs. 
Sometimes, we may use emojis and stickers to express how we feel or to share complex ideas. +```python +from camel.embeddings import OpenAIEmbedding +from camel.types import EmbeddingModelType -Modalities of data such as **text**, **images**, **video** and **audio** in various combinations form valuable use cases for Semantic Search applications. +embedding_instance = OpenAIEmbedding(model_type=EmbeddingModelType.TEXT_EMBEDDING_3_LARGE) +``` -Vector databases, being **modality-agnostic**, are perfect for building these applications. +## **Step 3: Configure the Qdrant Client** -In this simple tutorial, we are working with two simple modalities: **image** and **text** data. However, you can create a Semantic Search application with any combination of modalities if you choose the right embedding model to bridge the **semantic gap**. +For this tutorial, we will be using the **Qdrant Cloud Free Tier**. Here's how to set it up: -> The **semantic gap** refers to the difference between low-level features (aka brightness) and high-level concepts (aka cuteness). +1. **Create an Account**: Sign up for a Qdrant Cloud account at [Qdrant Cloud](https://cloud.qdrant.io). -For example, the [vdr-2b-multi-v1 model](https://huggingface.co/llamaindex/vdr-2b-multi-v1) from LlamaIndex is designed for multilingual embedding, particularly effective for visual document retrieval across multiple languages and domains. It allows for searching and querying visually rich multilingual documents without the need for OCR or other data extraction pipelines. +2. **Create a Cluster**: + - Navigate to the **Overview** section. + - Follow the onboarding instructions under **Create First Cluster** to set up your cluster. + - When you create the cluster, you will receive an **API Key**. Copy and securely store it, as you will need it later. -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#setup) Setup +3. **Wait for the Cluster to Provision**: + - Your new cluster will appear under the **Clusters** section. -First, install the required libraries `qdrant-client` and `llama-index-embeddings-huggingface`. +After obtaining your Qdrant Cloud details, add to your `.env` file: ```bash -pip install qdrant-client llama-index-embeddings-huggingface - +QDRANT_CLOUD_URL= +QDRANT_CLOUD_API_KEY= ``` -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#dataset) Dataset +### Configure the QdrantStorage -To make the demonstration simple, we created a tiny dataset of images and their captions for you. +The `QdrantStorage` will deal with connecting with the Qdrant Client for all necessary operations to your collection. + +```python +from camel.retrievers import VectorRetriever + +# Define collection name +collection_name = "qdrant-agent" -Images can be downloaded from [here](https://github.com/qdrant/examples/tree/master/multimodal-search/images). It’s **important** to place them in the same folder as your code/notebook, in the folder named `images`. +storage_instance = QdrantStorage( + vector_dim=embedding_instance.get_output_dim(), + url_and_api_key=( + qdrant_cloud_url, + qdrant_api_key, + ), + collection_name=collection_name, +) +``` +Make sure to update the `` and `` fields. -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#vectorize-data) Vectorize data +--- -`LlamaIndex`’s `vdr-2b-multi-v1` model supports cross-lingual retrieval, allowing for effective searches across languages and domains. 
It encodes document page screenshots into dense single-vector representations, eliminating the need for OCR and other complex data extraction processes. +## **Step 4: Scrape and Process Data** -Let’s embed the images and their captions in the **shared embedding space**. +We'll use CamelAI `VectorRetriever` library to help us to It processes content from a file or URL, divides it into chunks, and stores the embeddings in the specified Qdrant collection. ```python -from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from camel.retrievers import VectorRetriever -model = HuggingFaceEmbedding( - model_name="llamaindex/vdr-2b-multi-v1", - device="cpu", # "mps" for mac, "cuda" for nvidia GPUs - trust_remote_code=True, -) +vector_retriever = VectorRetriever(embedding_model=embedding_instance, + storage=storage_instance) -documents = [\ - {"caption": "An image about plane emergency safety.", "image": "images/image-1.png"},\ - {"caption": "An image about airplane components.", "image": "images/image-2.png"},\ - {"caption": "An image about COVID safety restrictions.", "image": "images/image-3.png"},\ - {"caption": "An confidential image about UFO sightings.", "image": "images/image-4.png"},\ - {"caption": "An image about unusual footprints on Aralar 2011.", "image": "images/image-5.png"},\ +qdrant_urls = [ + "https://qdrant.tech/documentation/overview", + "https://qdrant.tech/documentation/guides/installation", + "https://qdrant.tech/documentation/concepts/filtering", + "https://qdrant.tech/documentation/concepts/indexing", + "https://qdrant.tech/documentation/guides/distributed_deployment", + "https://qdrant.tech/documentation/guides/quantization" + # Add more URLs as needed ] -text_embeddings = model.get_text_embedding_batch([doc["caption"] for doc in documents]) -image_embeddings = model.get_image_embedding_batch([doc["image"] for doc in documents]) +for qdrant_url in qdrant_urls: + vector_retriever.process( + content=qdrant_url, + ) ``` -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#upload-data-to-qdrant) Upload data to Qdrant +--- -1. **Create a client object for Qdrant**. +## **Step 5: Setup the CAMEL-AI ChatAgent Instance** + +Define the OpenAI model and create a CAMEL-AI ChatAgent instance. ```python -from qdrant_client import QdrantClient, models +from camel.configs import ChatGPTConfig +from camel.models import ModelFactory +from camel.types import ModelPlatformType, ModelType +from camel.agents import ChatAgent -# docker run -p 6333:6333 qdrant/qdrant -client = QdrantClient(url="http://localhost:6333/") +# Create a ChatGPT configuration +config = ChatGPTConfig(temperature=0.2).as_dict() + +# Create an OpenAI model using the configuration +openai_model = ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O_MINI, + model_config_dict=config, +) + +assistant_sys_msg = """You are a helpful assistant to answer question, + I will give you the Original Query and Retrieved Context, + answer the Original Query based on the Retrieved Context, + if you can't answer the question just say I don't know.""" + +qdrant_agent = ChatAgent(system_message=assistant_sys_msg, model=openai_model) ``` -2. **Create a new collection for the images with captions**. 
+--- -```python -COLLECTION_NAME = "llama-multi" +## **Step 6: Create and Configure the Discord Bot** -if not client.collection_exists(COLLECTION_NAME): - client.create_collection( - collection_name=COLLECTION_NAME, - vectors_config={ - "image": models.VectorParams(size=len(image_embeddings[0]), distance=models.Distance.COSINE), - "text": models.VectorParams(size=len(text_embeddings[0]), distance=models.Distance.COSINE), - } - ) +Now let's bring the bot to life! It will serve as the interface through which users can interact with the agentic RAG system you’ve built. -``` +### Create a New Discord Bot -3. **Upload our images with captions to the Collection**. +1. Go to the [Discord Developer Portal](https://discord.com/developers/applications) and log in with your Discord account. -```python -client.upload_points( - collection_name=COLLECTION_NAME, - points=[\ - models.PointStruct(\ - id=idx,\ - vector={\ - "text": text_embeddings[idx],\ - "image": image_embeddings[idx],\ - },\ - payload=doc\ - )\ - for idx, doc in enumerate(documents)\ - ] -) +2. Click on the **New Application** button. -``` +3. Give your application a name and click **Create**. -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#search) Search +4. Navigate to the **Bot** tab on the left sidebar and click **Add Bot**. -### [Anchor](https://qdrant.tech/documentation/multimodal-search/\#text-to-image) Text-to-Image +5. Once the bot is created, click **Reset Token** under the **Token** section to generate a new bot token. Copy this token securely as you will need it later. -Let’s see what image we will get to the query “ _Adventures on snow hills_”. +### Invite the Bot to Your Server -```python -from PIL import Image +1. Go to the **OAuth2** tab and then to the **URL Generator** section. -find_image = model.get_query_embedding("Adventures on snow hills") +2. Under **Scopes**, select **bot**. -Image.open(client.query_points( - collection_name=COLLECTION_NAME, - query=find_image, - using="image", - with_payload=["image"], - limit=1 -).points[0].payload['image']) +3. Under **Bot Permissions**, select the necessary permissions: -``` + - Send Messages -Let’s also run the same query in Italian and compare the results. + - Read Message History -### [Anchor](https://qdrant.tech/documentation/multimodal-search/\#multilingual-search) Multilingual Search +4. Copy the generated URL and paste it into your browser. -Now, let’s do a multilingual search using an Italian query: +5. Select the server where you want to invite the bot and click **Authorize**. -```python -Image.open(client.query_points( - collection_name=COLLECTION_NAME, - query=model.get_query_embedding("Avventure sulle colline innevate"), - using="image", - with_payload=["image"], - limit=1 -).points[0].payload['image']) +### Grant the Bot Permissions -``` +1. Go back to the **Bot** tab. -**Response:** +2. Enable the following under **Privileged Gateway Intents**: -![Snow prints](https://qdrant.tech/documentation/advanced-tutorials/snow-prints.png) + - Server Members Intent -### [Anchor](https://qdrant.tech/documentation/multimodal-search/\#image-to-text) Image-to-Text + - Message Content Intent -Now, let’s do a reverse search with the following image: +Now, the bot is ready to be integrated with your code. 
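Before wiring up the bot, it can be worth a quick sanity check that Step 4 actually populated the collection. Below is a minimal sketch using the plain `qdrant-client`; it reuses the `QDRANT_CLOUD_URL`/`QDRANT_CLOUD_API_KEY` variables and the `qdrant-agent` collection name from the earlier steps, so adjust those if yours differ:

```python
import os

from dotenv import load_dotenv
from qdrant_client import QdrantClient

load_dotenv()

# Connect with the plain client just to inspect the collection
client = QdrantClient(
    url=os.getenv("QDRANT_CLOUD_URL"),
    api_key=os.getenv("QDRANT_CLOUD_API_KEY"),
)

# Count the points upserted by vector_retriever.process() in Step 4
print(client.count(collection_name="qdrant-agent", exact=True))
```

If the count comes back as zero, re-run the scraping step before moving on.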
-![Airplane](https://qdrant.tech/documentation/advanced-tutorials/airplane.png) +## **Step 7: Build the Discord Bot** -```python -client.query_points( - collection_name=COLLECTION_NAME, - query=model.get_image_embedding("images/image-2.png"), - # Now we are searching only among text vectors with our image query - using="text", - with_payload=["caption"], - limit=1 -).points[0].payload['caption'] +Add to your `.env` file: +```bash +DISCORD_BOT_TOKEN= ``` -**Response:** +We'll use `discord.py` to create a simple Discord bot that interacts with users and retrieves context from Qdrant before responding. -```text -'An image about plane emergency safety.' +```python +from camel.bots import DiscordApp +import nest_asyncio +import discord -``` +nest_asyncio.apply() +discord_q_bot = DiscordApp(token=os.getenv("DISCORD_BOT_TOKEN")) -## [Anchor](https://qdrant.tech/documentation/multimodal-search/\#next-steps) Next steps +@discord_q_bot.client.event # triggers when a message is sent in the channel +async def on_message(message: discord.Message): + if message.author == discord_q_bot.client.user: + return -Use cases of even just Image & Text Multimodal Search are countless: E-Commerce, Media Management, Content Recommendation, Emotion Recognition Systems, Biomedical Image Retrieval, Spoken Sign Language Transcription, etc. + if message.type != discord.MessageType.default: + return -Imagine a scenario: a user wants to find a product similar to a picture they have, but they also have specific textual requirements, like “ _in beige colour_”. You can search using just texts or images and combine their embeddings in a **late fusion manner** (summing and weighting might work surprisingly well). + if message.author.bot: + return + user_input = message.content -Moreover, using [Discovery Search](https://qdrant.tech/articles/discovery-search/) with both modalities, you can provide users with information that is impossible to retrieve unimodally! + retrieved_info = vector_retriever.query( + query=user_input, top_k=10, similarity_threshold=0.6 + ) -Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, experiment, and have fun! + user_msg = str(retrieved_info) + assistant_response = qdrant_agent.step(user_msg) + response_content = assistant_response.msgs[0].content -##### Was this page useful? + if len(response_content) > 2000: # discord message length limit + for chunk in [response_content[i:i+2000] for i in range(0, len(response_content), 2000)]: + await message.channel.send(chunk) + else: + await message.channel.send(response_content) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +discord_q_bot.run() +``` +--- -Thank you for your feedback! 🙏 +## **Step 9: Test the Bot** -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/multimodal-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +1. Invite your bot to your Discord server using the OAuth2 URL from the Discord Developer Portal. -On this page: +2. Run the notebook. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/multimodal-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +3. Start chatting with the bot in your Discord server. 
It will retrieve context from Qdrant and provide relevant answers based on your queries. -× +![agentic-rag-discord-bot-what-is-quantization](/documentation/examples/agentic-rag-camelai-discord/example.png) -[Powered by](https://qdrant.tech/) +--- -<|page-80-lllmstxt|> -## concepts -- [Documentation](https://qdrant.tech/documentation/) -- Concepts -# [Anchor](https://qdrant.tech/documentation/concepts/\#concepts) Concepts +## Conclusion -Think of these concepts as a glossary. Each of these concepts include a link to -detailed information, usually with examples. If you’re new to AI, these concepts -can help you learn more about AI and the Qdrant approach. +Nice work! You've built an agentic RAG-powered Discord bot that retrieves relevant information with Qdrant, generates smart responses with OpenAI, and handles multi-step reasoning using CAMEL-AI. Here’s a quick recap: -## [Anchor](https://qdrant.tech/documentation/concepts/\#collections) Collections -[Collections](https://qdrant.tech/documentation/concepts/collections/) define a named set of points that you can use for your search. +- **Smart Knowledge Retrieval:** Your chatbot can now pull relevant info from large datasets using Qdrant’s vector search. -## [Anchor](https://qdrant.tech/documentation/concepts/\#payload) Payload +- **Autonomous Reasoning with CAMEL-AI:** Enables multi-step reasoning instead of just regurgitating text. -A [Payload](https://qdrant.tech/documentation/concepts/payload/) describes information that you can store with vectors. +- **Live Discord Deployment:** You launched the chatbot on Discord, making it interactive and ready to help real users. -## [Anchor](https://qdrant.tech/documentation/concepts/\#points) Points +One of the biggest advantages of CAMEL-AI is the abstraction it provides, allowing you to focus on designing intelligent interactions rather than worrying about low-level implementation details. -[Points](https://qdrant.tech/documentation/concepts/points/) are a record which consists of a vector and an optional payload. +You’re now well-equipped to tackle more complex real-world problems that require scalable, autonomous knowledge systems. -## [Anchor](https://qdrant.tech/documentation/concepts/\#search) Search +<|page-115-lllmstxt|> +# Backups -[Search](https://qdrant.tech/documentation/concepts/search/) describes _similarity search_, which set up related objects close to each other in vector space. +To create a one-time backup, create a `QdrantClusterSnapshot` resource: -## [Anchor](https://qdrant.tech/documentation/concepts/\#explore) Explore +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantClusterSnapshot +metadata: + name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp" + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + retention: 1h +``` -[Explore](https://qdrant.tech/documentation/concepts/explore/) includes several APIs for exploring data in your collections. 
+You can also create a recurring backup with the `QdrantClusterScheduledSnapshot` resource: -## [Anchor](https://qdrant.tech/documentation/concepts/\#hybrid-queries) Hybrid Queries +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantClusterScheduledSnapshot +metadata: + name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp" + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + scheduleShortId: a7d8d973 + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + # every hour + schedule: "0 * * * *" + retention: 1h +``` -[Hybrid Queries](https://qdrant.tech/documentation/concepts/hybrid-queries/) combines multiple queries or performs them in more than one stage. +To resture from a backup, create a `QdrantClusterRestore` resource: -## [Anchor](https://qdrant.tech/documentation/concepts/\#filtering) Filtering +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantClusterRestore +metadata: + name: "qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-restore-01" + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + source: + snapshotName: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840-snapshot-timestamp + namespace: qdrant-private-cloud + destination: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + namespace: qdrant-private-cloud +``` -[Filtering](https://qdrant.tech/documentation/concepts/filtering/) defines various database-style clauses, conditions, and more. +Note that with all resources `cluster-id` and `customer-id` label must be set to the values of the corresponding `QdrantCluster` resource. -## [Anchor](https://qdrant.tech/documentation/concepts/\#optimizer) Optimizer +<|page-116-lllmstxt|> +# Using Qdrant’s Async API for Efficient Python Applications -[Optimizer](https://qdrant.tech/documentation/concepts/optimizer/) describes options to rebuild -database structures for faster search. They include a vacuum, a merge, and an -indexing optimizer. +Asynchronous programming is being broadly adopted in the Python ecosystem. Tools such as FastAPI [have embraced this new +paradigm](https://fastapi.tiangolo.com/async/), but it is also becoming a standard for ML models served as SaaS. For example, the Cohere SDK +[provides an async client](https://github.com/cohere-ai/cohere-python/blob/856a4c3bd29e7a75fa66154b8ac9fcdf1e0745e0/src/cohere/client.py#L189) next to its synchronous counterpart. -## [Anchor](https://qdrant.tech/documentation/concepts/\#storage) Storage +Databases are often launched as separate services and are accessed via a network. All the interactions with them are IO-bound and can +be performed asynchronously so as not to waste time actively waiting for a server response. In Python, this is achieved by +using [`async/await`](https://docs.python.org/3/library/asyncio-task.html) syntax. That lets the interpreter switch to another task +while waiting for a response from the server. -[Storage](https://qdrant.tech/documentation/concepts/storage/) describes the configuration of storage in segments, which include indexes and an ID mapper. +## When to use async API -## [Anchor](https://qdrant.tech/documentation/concepts/\#indexing) Indexing +There is no need to use async API if the application you are writing will never support multiple users at once (e.g it is a script that runs once per day). 
However, if you are writing a web service that multiple users will use simultaneously, you shouldn't be +blocking the threads of the web server as it limits the number of concurrent requests it can handle. In this case, you should use +the async API. -[Indexing](https://qdrant.tech/documentation/concepts/indexing/) lists and describes available indexes. They include payload, vector, sparse vector, and a filterable index. +Modern web frameworks like [FastAPI](https://fastapi.tiangolo.com/) and [Quart](https://quart.palletsprojects.com/en/latest/) support +async API out of the box. Mixing asynchronous code with an existing synchronous codebase might be a challenge. The `async/await` syntax +cannot be used in synchronous functions. On the other hand, calling an IO-bound operation synchronously in async code is considered +an antipattern. Therefore, if you build an async web service, exposed through an [ASGI](https://asgi.readthedocs.io/en/latest/) server, +you should use the async API for all the interactions with Qdrant. -## [Anchor](https://qdrant.tech/documentation/concepts/\#snapshots) Snapshots + -[Snapshots](https://qdrant.tech/documentation/concepts/snapshots/) describe the backup/restore process (and more) for each node at specific times. +### Using Qdrant asynchronously -##### Was this page useful? +The simplest way of running asynchronous code is to use define `async` function and use the `asyncio.run` in the following way to run it: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +from qdrant_client import models -Thank you for your feedback! 🙏 +import qdrant_client +import asyncio -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. -On this page: +async def main(): + client = qdrant_client.AsyncQdrantClient("localhost") -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + # Create a collection + await client.create_collection( + collection_name="my_collection", + vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE), + ) -× + # Insert a vector + await client.upsert( + collection_name="my_collection", + points=[ + models.PointStruct( + id="5c56c793-69f3-4fbf-87e6-c4bf54c28c26", + payload={ + "color": "red", + }, + vector=[0.9, 0.1, 0.1, 0.5], + ), + ], + ) -[Powered by](https://qdrant.tech/) + # Search for nearest neighbors + points = await client.query_points( + collection_name="my_collection", + query=[0.9, 0.1, 0.1, 0.5], + limit=2, + ).points -<|page-81-lllmstxt|> -## what-is-vector-quantization -- [Articles](https://qdrant.tech/articles/) -- What is Vector Quantization? + # Your async code using AsyncQdrantClient might be put here + # ... -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) -# What is Vector Quantization? +asyncio.run(main()) +``` -Sabrina Aquino +The `AsyncQdrantClient` provides the same methods as the synchronous counterpart `QdrantClient`. If you already have a synchronous +codebase, switching to async API is as simple as replacing `QdrantClient` with `AsyncQdrantClient` and adding `await` before each +method call. 
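As a small illustration (not from the original article), the async client drops straight into an ASGI framework such as FastAPI. The endpoint below is only a sketch and assumes a Qdrant instance reachable on localhost:

```python
from fastapi import FastAPI
from qdrant_client import AsyncQdrantClient

app = FastAPI()
client = AsyncQdrantClient("localhost")  # assumes a local Qdrant instance

@app.get("/collections")
async def list_collections() -> list[str]:
    # The event loop stays free to handle other requests while Qdrant responds
    response = await client.get_collections()
    return [collection.name for collection in response.collections]
```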
-· + -September 25, 2024 +<|page-117-lllmstxt|> +# How to Get Started With Qdrant Cloud -![What is Vector Quantization?](https://qdrant.tech/articles_data/what-is-vector-quantization/preview/title.jpg) +

+

You can try vector search on Qdrant Cloud in three steps. +
Instructions are below, but the video is faster:

-Vector quantization is a data compression technique used to reduce the size of high-dimensional data. Compressing vectors reduces memory usage while maintaining nearly all of the essential information. This method allows for more efficient storage and faster search operations, particularly in large datasets. +## Setup a Qdrant Cloud Cluster -When working with high-dimensional vectors, such as embeddings from providers like OpenAI, a single 1536-dimensional vector requires **6 KB of memory**. +1. Register for a [Cloud account](https://cloud.qdrant.io/signup) with your email, Google or Github credentials. +2. Go to **Clusters** and follow the onboarding instructions under **Create First Cluster**. -![1536-dimensional vector size is 6 KB](https://qdrant.tech/articles_data/what-is-vector-quantization/vector-size.png) +![create a cluster](/docs/gettingstarted/gui-quickstart/create-cluster.png) -With 1 million vectors needing around 6 GB of memory, as your dataset grows to multiple **millions of vectors**, the memory and processing demands increase significantly. +3. When you create it, you will receive an API key. You will need to copy it and store it somewhere self. It will not be displayed again. If you loose it, you can always create a new one on the **Cluster Detail Page** later. -To understand why this process is so computationally demanding, let’s take a look at the nature of the [HNSW index](https://qdrant.tech/documentation/concepts/indexing/#vector-index). +![get api key](/docs/gettingstarted/gui-quickstart/api-key.png) -The **HNSW (Hierarchical Navigable Small World) index** organizes vectors in a layered graph, connecting each vector to its nearest neighbors. At each layer, the algorithm narrows down the search area until it reaches the lower layers, where it efficiently finds the closest matches to the query. -![HNSW Search visualization](https://qdrant.tech/articles_data/what-is-vector-quantization/hnsw.png) +## Access the Cluster UI -Each time a new vector is added, the system must determine its position in the existing graph, a process similar to searching. This makes both inserting and searching for vectors complex operations. +1. Click on **Cluster UI** on the **Cluster Detail Page** to access the cluster UI dashboard. +2. Paste your new API key here. You can revoke and create new API keys in the **API Keys** tab on your **Cluster Detail Page**. +3. The key will grant you access to your Qdrant instance. Now you can see the cluster Dashboard. -One of the key challenges with the HNSW index is that it requires a lot of **random reads** and **sequential traversals** through the graph. This makes the process computationally expensive, especially when you’re dealing with millions of high-dimensional vectors. +![access the dashboard](/docs/gettingstarted/gui-quickstart/access-dashboard.png) -The system has to jump between various points in the graph in an unpredictable manner. This unpredictability makes optimization difficult, and as the dataset grows, the memory and processing requirements increase significantly. +## Authenticate via SDKs -![HNSW Search visualization](https://qdrant.tech/articles_data/what-is-vector-quantization/hnsw-search2.png) +Now that you have your cluster and key, you can use our official SDKs to access Qdrant Cloud from within your application. -Since vectors need to be stored in **fast storage** like **RAM** or **SSD** for low-latency searches, as the size of the data grows, so does the cost of storing and processing it efficiently. 
+```bash +curl \ + -X GET https://xyz-example.eu-central.aws.cloud.qdrant.io:6333 \ + --header 'api-key: ' -**Quantization** offers a solution by compressing vectors to smaller memory sizes, making the process more efficient. +# Alternatively, you can use the `Authorization` header with the `Bearer` prefix +curl \ + -X GET https://xyz-example.eu-central.aws.cloud.qdrant.io:6333 \ + --header 'Authorization: Bearer ' +``` -There are several methods to achieve this, and here we will focus on three main ones: +```python +from qdrant_client import QdrantClient -![Types of Quantization: 1. Scalar Quantization, 2. Product Quantization, 3. Binary Quantization](https://qdrant.tech/articles_data/what-is-vector-quantization/types-of-quant.png) +qdrant_client = QdrantClient( + host="xyz-example.eu-central.aws.cloud.qdrant.io", + api_key="", +) +``` -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#1-what-is-scalar-quantization) 1\. What is Scalar Quantization? +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -![](https://qdrant.tech/articles_data/what-is-vector-quantization/astronaut-mars.jpg) +const client = new QdrantClient({ + host: "xyz-example.eu-central.aws.cloud.qdrant.io", + apiKey: "", +}); +``` -In Qdrant, each dimension is represented by a `float32` value, which uses **4 bytes** of memory. When using [Scalar Quantization](https://qdrant.tech/documentation/guides/quantization/#scalar-quantization), we map our vectors to a range that the smaller `int8` type can represent. An `int8` is only **1 byte** and can represent 256 values (from -128 to 127, or 0 to 255). This results in a **75% reduction** in memory size. +```rust +use qdrant_client::Qdrant; -For example, if our data lies in the range of -1.0 to 1.0, Scalar Quantization will transform these values to a range that `int8` can represent, i.e., within -128 to 127. The system **maps** the `float32` values into this range. 
+let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") + .api_key("") + .build()?; +``` -Here’s a simple linear example of what this process looks like: +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -![Scalar Quantization example](https://qdrant.tech/articles_data/what-is-vector-quantization/scalar-quant.png) +QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder( + "xyz-example.eu-central.aws.cloud.qdrant.io", + 6334, + true) + .withApiKey("") + .build()); +``` -To set up Scalar Quantization in Qdrant, you need to include the `quantization_config` section when creating or updating a collection: +```csharp +using Qdrant.Client; -httppython +var client = new QdrantClient( + host: "xyz-example.eu-central.aws.cloud.qdrant.io", + https: true, + apiKey: "" +); +``` -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 128, - "distance": "Cosine" - }, - "quantization_config": { - "scalar": { - "type": "int8", - "quantile": 0.99, - "always_ram": true - } - } -} +```go +import "github.com/qdrant/go-client/qdrant" +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.eu-central.aws.cloud.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, +}) ``` -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=128, distance=models.Distance.COSINE), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - quantile=0.99, - always_ram=True, - ), - ), -) +## Try the Tutorial Sandbox -``` +1. Open the interactive **Tutorial**. Here, you can test basic Qdrant API requests. +2. Using the **Quickstart** instructions, create a collection, add vectors and run a search. +3. The output on the right will show you some basic semantic search results. -The `quantile` parameter is used to calculate the quantization bounds. For example, if you specify a `0.99` quantile, the most extreme 1% of values will be excluded from the quantization bounds. +![interactive-tutorial](/docs/gettingstarted/gui-quickstart/interactive-tutorial.png) -This parameter only affects the resulting precision, not the memory footprint. You can adjust it if you experience a significant decrease in search quality. +## That's Vector Search! +You can stay in the sandbox and continue trying our different API calls.
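+For example, a minimal create/upsert/search round trip of the kind the Quickstart runs looks roughly like this in the console (collection name, vector size, and values are illustrative):
+
+```http
+PUT collections/demo_collection
+{
+  "vectors": { "size": 4, "distance": "Dot" }
+}
+
+PUT collections/demo_collection/points
+{
+  "points": [
+    { "id": 1, "vector": [0.05, 0.61, 0.76, 0.74], "payload": { "city": "Berlin" } },
+    { "id": 2, "vector": [0.19, 0.81, 0.75, 0.11], "payload": { "city": "London" } }
+  ]
+}
+
+POST collections/demo_collection/points/search
+{
+  "vector": [0.2, 0.1, 0.9, 0.7],
+  "limit": 3
+}
+```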
+When ready, use the Console and our complete REST API to try other operations. -Scalar Quantization is a great choice if you’re looking to boost search speed and compression without losing much accuracy. It also slightly improves performance, as distance calculations (such as dot product or cosine similarity) using `int8` values are computationally simpler than using `float32` values. +## What's Next? -While the performance gains of Scalar Quantization may not match those achieved with Binary Quantization (which we’ll discuss later), it remains an excellent default choice when Binary Quantization isn’t suitable for your use case. +Now that you have a Qdrant Cloud cluster up and running, you should [test remote access](/documentation/cloud/authentication/#test-cluster-access) with a Qdrant Client. -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#2-what-is-binary-quantization) 2\. What is Binary Quantization? +For more about Qdrant Cloud, check our [dedicated documentation](/documentation/cloud-intro/). -![Astronaut in surreal white environment](https://qdrant.tech/articles_data/what-is-vector-quantization/astronaut-white-surreal.jpg) +<|page-118-lllmstxt|> +# Configuring Qdrant Operator: Advanced Options -[Binary Quantization](https://qdrant.tech/documentation/guides/quantization/#binary-quantization) is an excellent option if you’re looking to **reduce memory** usage while also achieving a significant **boost in speed**. It works by converting high-dimensional vectors into simple binary (0 or 1) representations. +The Qdrant Operator has several configuration options, which can be configured in the advanced section of your Hybrid Cloud Environment. -- Values greater than zero are converted to 1. -- Values less than or equal to zero are converted to 0. +The following YAML shows all configuration options with their default values: -Let’s consider our initial example of a 1536-dimensional vector that requires **6 KB** of memory (4 bytes for each `float32` value). +```yaml +# Additional pod annotations +podAnnotations: {} -After Binary Quantization, each dimension is reduced to 1 bit (1/8 byte), so the memory required is: +# Configuration for the Qdrant operator service monitor to scrape metrics +serviceMonitor: + enabled: false -1536 dimensions8 bits per byte=192 bytes +# Resource requests and limits for the Qdrant operator +resources: {} -This leads to a **32x** memory reduction. +# Node selector for the Qdrant operator +nodeSelector: {} -![Binary Quantization example](https://qdrant.tech/articles_data/what-is-vector-quantization/binary-quant.png) +# Tolerations for the Qdrant operator +tolerations: [] -Qdrant automates the Binary Quantization process during indexing. As vectors are added to your collection, each 32-bit floating-point component is converted into a binary value according to the configuration you define. +# Affinity configuration for the Qdrant operator +affinity: {} -Here’s how you can set it up: +# Configuration for the Qdrant operator (v2) +settings: + # The log level for the operator + # Available options: DEBUG | INFO | WARN | ERROR + logLevel: INFO + # Controller related settings + controller: + # The period a forced recync is done by the controller (if watches are missed / nothing happened) + forceResyncPeriod: 10h + # QPS indicates the maximum QPS to the master from this client. + # Default is 200 + qps: 200 + # Maximum burst for throttle. + # Default is 500. 
+ burst: 500 + # Features contains the settings for enabling / disabling the individual features of the operator + features: + # ClusterManagement contains the settings for qdrant (database) cluster management + clusterManagement: + # Whether or not the Qdrant cluster features are enabled. + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. + # Default is true. + enable: true + # The StorageClass used to make database and snapshot PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + storageClass: + # The StorageClass used to make database PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + #database: + # The StorageClass used to make snapshot PVCs. + # Default is nil, meaning the default storage class of Kubernetes. + #snapshot: + # Qdrant config contains settings specific for the database + qdrant: + # The config where to find the image for qdrant + image: + # The repository where to find the image for qdrant + # Default is "qdrant/qdrant" + repository: qdrant/qdrant + # Docker image pull policy + # Default "IfNotPresent", unless the tag is dev, master or latest. Then "Always" + #pullPolicy: + # Docker image pull secret name + # This secret should be available in the namespace where the cluster is running + # Default not set + #pullSecretName: + # storage contains the settings for the storage of the Qdrant cluster + storage: + performance: + # CPU budget, how many CPUs (threads) to allocate for an optimization job. + # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size + # If negative - subtract this number of CPUs from the available CPUs. + # If positive - use this exact number of CPUs. + optimizerCpuBudget: 0 + # Enable async scorer which uses io_uring when rescoring. + # Only supported on Linux, must be enabled in your kernel. + # See: + asyncScorer: false + # Qdrant DB log level + # Available options: DEBUG | INFO | WARN | ERROR + # Default is "INFO" + logLevel: INFO + # Default Qdrant security context configuration + securityContext: + # Enable default security context + # Default is false + enabled: false + # Default user for qdrant container + # Default not set + #user: 1000 + # Default fsGroup for qdrant container + # Default not set + #fsUser: 2000 + # Default group for qdrant container + # Default not set + #group: 3000 + # Network policies configuration for the Qdrant databases + networkPolicies: + ingress: + - ports: + - protocol: TCP + port: 6333 + - protocol: TCP + port: 6334 + # Allow DNS resolution from qdrant pods at Kubernetes internal DNS server + egress: + - ports: + - protocol: UDP + port: 53 + # Scheduling config contains the settings specific for scheduling + scheduling: + # Default topology spread constraints (list from type corev1.TopologySpreadConstraint) + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: "ScheduleAnyway" + # Default pod disruption budget (object from type policyv1.PodDisruptionBudgetSpec) + podDisruptionBudget: + maxUnavailable: 1 + # ClusterManager config contains the settings specific for cluster manager + clusterManager: + # Whether or not the cluster manager (on operator level). + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. + # Default is false. 
+ enable: true + # The endpoint address the cluster manager could be reached + # If set, this should be a full URL like: http://cluster-manager.qdrant-cloud-ns.svc.cluster.local:7333 + endpointAddress: http://qdrant-cluster-manager:80 + # InvocationInterval is the interval between calls (started after the previous call is retured) + # Default is 10 seconds + invocationInterval: 10s + # Timeout is the duration a single call to the cluster manager is allowed to take. + # Default is 30 seconds + timeout: 30s + # Specifies overrides for the manage rules + manageRulesOverrides: + #dry_run: + #max_transfers: + #max_transfers_per_collection: + #rebalance: + #replicate: + # Ingress config contains the settings specific for ingress + ingress: + # Whether or not the Ingress feature is enabled. + # Default is true. + enable: false + # Which specific ingress provider should be used + # Default is KubernetesIngress + provider: KubernetesIngress + # The specific settings when the Provider is QdrantCloudTraefik + qdrantCloudTraefik: + # Enable tls + # Default is false + tls: false + # Secret with TLS certificate + # Default is None + secretName: "" + # List of Traefik middlewares to apply + # Default is an empty list + middlewares: [] + # IP Allowlist Strategy for Traefik + # Default is None + ipAllowlistStrategy: + # Enable body validator plugin and matching ingressroute rules + # Default is false + enableBodyValidatorPlugin: false + # The specific settings when the Provider is KubernetesIngress + kubernetesIngress: + # Name of the ingress class + # Default is None + #ingressClassName: + # TelemetryTimeout is the duration a single call to the cluster telemetry endpoint is allowed to take. + # Default is 3 seconds + telemetryTimeout: 3s + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 20. + maxConcurrentReconciles: 20 + # VolumeExpansionMode specifies the expansion mode, which can be online or offline (e.g. in case of Azure). + # Available options: Online, Offline + # Default is Online + volumeExpansionMode: Online + # BackupManagementConfig contains the settings for backup management + backupManagement: + # Whether or not the backup features are enabled. + # If disabled, all other properties in this struct are disregarded. Otherwise, the individual features will be inspected. + # Default is true. + enable: true + # Snapshots contains the settings for snapshots as part of backup management. + snapshots: + # Whether or not the Snapshot feature is enabled. + # Default is true. + enable: true + # The VolumeSnapshotClass used to make VolumeSnapshots. + # Default is "csi-snapclass". + volumeSnapshotClass: "csi-snapclass" + # The duration a snapshot is retained when the phase becomes Failed or Skipped + # Default is 72h (3d). + retainUnsuccessful: 72h + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. + maxConcurrentReconciles: 1 + # ScheduledSnapshots contains the settings for scheduled snapshot as part of backup management. + scheduledSnapshots: + # Whether or not the ScheduledSnapshot feature is enabled. + # Default is true. + enable: true + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. + maxConcurrentReconciles: 1 + # Restores contains the settings for restoring (a snapshot) as part of backup management. + restores: + # Whether or not the Restore feature is enabled. + # Default is true. 
+ enable: true + # MaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run. Defaults to 1. + maxConcurrentReconciles: 1 +``` + +<|page-119-lllmstxt|> +# Configuring Logging & Monitoring in Qdrant Private Cloud -httppython +## Logging -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1536, - "distance": "Cosine" - }, - "quantization_config": { - "binary": { - "always_ram": true - } - } -} +You can access the logs with kubectl or the Kubernetes log management tool of your choice. For example: +```bash +kubectl -n qdrant-private-cloud logs -l app=qdrant,cluster-id=a7d8d973-0cc5-42de-8d7b-c29d14d24840 ``` -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ), - ), -) +**Configuring log levels:** You can configure log levels for the databases individually through the QdrantCluster spec. Example: +```yaml +apiVersion: qdrant.io/v1 +kind: QdrantCluster +metadata: + name: qdrant-a7d8d973-0cc5-42de-8d7b-c29d14d24840 + labels: + cluster-id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + customer-id: "acme-industries" +spec: + id: "a7d8d973-0cc5-42de-8d7b-c29d14d24840" + version: "v1.11.3" + size: 1 + resources: + cpu: 100m + memory: "1Gi" + storage: "2Gi" + config: + log_level: "DEBUG" ``` -Binary Quantization is by far the quantization method that provides the most significant processing **speed gains** compared to Scalar and Product Quantizations. This is because the binary representation allows the system to use highly optimized CPU instructions, such as [XOR](https://en.wikipedia.org/wiki/XOR_gate#:~:text=XOR%20represents%20the%20inequality%20function,the%20other%20but%20not%20both%22) and [Popcount](https://en.wikipedia.org/wiki/Hamming_weight), for fast distance computations. - -It can speed up search operations by **up to 40x**, depending on the dataset and hardware. - -Not all models are equally compatible with Binary Quantization, and in the comparison above, we are only using models that are compatible. Some models may experience a greater loss in accuracy when quantized. We recommend using Binary Quantization with models that have **at least 1024 dimensions** to minimize accuracy loss. +### Integrating with a log management system -The models that have shown the best compatibility with this method include: +You can integrate the logs into any log management system that supports Kubernetes. There are no Qdrant specific configurations necessary. Just configure the agents of your system to collect the logs from all Pods in the Qdrant namespace. -- **OpenAI text-embedding-ada-002** (1536 dimensions) -- **Cohere AI embed-english-v2.0** (4096 dimensions) +## Monitoring -These models demonstrate minimal accuracy loss while still benefiting from substantial speed and memory gains. +The Qdrant Cloud console gives you access to basic metrics about CPU, memory and disk usage of your Qdrant clusters. -Even though Binary Quantization is incredibly fast and memory-efficient, the trade-offs are in **precision** and **model compatibility**, so you may need to ensure search quality using techniques like oversampling and rescoring. 
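+If your monitoring system is Prometheus (or accepts Prometheus-style scrape configs), a scrape job along these lines can collect the database-pod endpoint listed below. This is only a sketch: the namespace and the `app=qdrant` label mirror the kubectl example above, and you should adapt it to your own deployment.
+
+```yaml
+scrape_configs:
+  - job_name: qdrant-databases            # illustrative job name
+    metrics_path: /metrics                # exposed on port 6333 of every database pod
+    kubernetes_sd_configs:
+      - role: pod
+        namespaces:
+          names: ["qdrant-private-cloud"] # namespace from the kubectl example above
+    relabel_configs:
+      # keep only pods labelled as Qdrant databases
+      - source_labels: [__meta_kubernetes_pod_label_app]
+        regex: qdrant
+        action: keep
+```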
+If you want to integrate the Qdrant metrics into your own monitoring system, you can instruct it to scrape the following endpoints that provide metrics in a Prometheus/OpenTelemetry compatible format: -If you’re interested in exploring Binary Quantization in more detail—including implementation examples, benchmark results, and usage recommendations—check out our dedicated article on [Binary Quantization - Vector Search, 40x Faster](https://qdrant.tech/articles/binary-quantization/). +* `/metrics` on port 6333 of every Qdrant database Pod, this provides metrics about each the database and its internals itself +* `/metrics` on port 9290 of the Qdrant Operator Pod, this provides metrics about the Operator, as well as the status of Qdrant Clusters and Snapshots +* For metrics about the state of Kubernetes resources like Pods and PersistentVolumes within the Qdrant Hybrid Cloud namespace, we recommend using [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#3-what-is-product-quantization) 3\. What is Product Quantization? +### Grafana dashboard -![](https://qdrant.tech/articles_data/what-is-vector-quantization/astronaut-centroids.jpg) +If you scrape the above metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. -[Product Quantization](https://qdrant.tech/documentation/guides/quantization/#product-quantization) is a method used to compress high-dimensional vectors by representing them with a smaller set of representative points. +![Grafa dashboard](/documentation/cloud/cloud-grafana-dashboard.png) -The process begins by splitting the original high-dimensional vectors into smaller **sub-vectors.** Each sub-vector represents a segment of the original vector, capturing different characteristics of the data. +<|page-120-lllmstxt|> +# Measure and Improve Retrieval Quality in Semantic Search -![Creation of the Sub-vector](https://qdrant.tech/articles_data/what-is-vector-quantization/subvec.png) +| Time: 30 min | Level: Intermediate | | | +|--------------|---------------------|--|----| -For each sub-vector, a separate **codebook** is created, representing regions in the data space where common patterns occur. +Semantic search pipelines are as good as the embeddings they use. If your model cannot properly represent input data, similar objects might +be far away from each other in the vector space. No surprise, that the search results will be poor in this case. There is, however, another +component of the process which can also degrade the quality of the search results. It is the ANN algorithm itself. -The codebook in Qdrant is trained automatically during the indexing process. As vectors are added to the collection, Qdrant uses your specified quantization settings in the `quantization_config` to build the codebook and quantize the vectors. Here’s how you can set it up: +In this tutorial, we will show how to measure the quality of the semantic retrieval and how to tune the parameters of the HNSW, the ANN +algorithm used in Qdrant, to obtain the best results. -httppython +## Embeddings quality -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1024, - "distance": "Cosine" - }, - "quantization_config": { - "product": { - "compression": "x32", - "always_ram": true - } - } -} +The quality of the embeddings is a topic for a separate tutorial. 
In a nutshell, it is usually measured and compared by benchmarks, such as +[Massive Text Embedding Benchmark (MTEB)](https://huggingface.co/spaces/mteb/leaderboard). The evaluation process itself is pretty +straightforward and is based on a ground truth dataset built by humans. We have a set of queries and a set of the documents we would expect +to receive for each of them. In the [evaluation process](https://qdrant.tech/rag/rag-evaluation-guide/), we take a query, find the most similar documents in the vector space and compare +them with the ground truth. In that setup, **finding the most similar documents is implemented as full kNN search, without any approximation**. +As a result, we can measure the quality of the embeddings themselves, without the influence of the ANN algorithm. -``` +## Retrieval quality -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=1024, distance=models.Distance.COSINE), - quantization_config=models.ProductQuantization( - product=models.ProductQuantizationConfig( - compression=models.CompressionRatio.X32, - always_ram=True, - ), - ), -) +Embeddings quality is indeed the most important factor in the semantic search quality. However, vector search engines, such as Qdrant, do not +perform pure kNN search. Instead, they use **Approximate Nearest Neighbors** (ANN) algorithms, which are much faster than the exact search, +but can return suboptimal results. We can also **measure the retrieval quality of that approximation** which also contributes to the overall +search quality. -``` +### Quality metrics -Each region in the codebook is defined by a **centroid**, which serves as a representative point summarizing the characteristics of that region. Instead of treating every single data point as equally important, we can group similar sub-vectors together and represent them with a single centroid that captures the general characteristics of that group. +There are various ways of how quantify the quality of semantic search. Some of them, such as [Precision@k](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_k), +are based on the number of relevant documents in the top-k search results. Others, such as [Mean Reciprocal Rank (MRR)](https://en.wikipedia.org/wiki/Mean_reciprocal_rank), +take into account the position of the first relevant document in the search results. [DCG and NDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) +metrics are, in turn, based on the relevance score of the documents. -The centroids used in Product Quantization are determined using the **[K-means clustering algorithm](https://en.wikipedia.org/wiki/K-means_clustering)**. +If we treat the search pipeline as a whole, we could use them all. The same is true for the embeddings quality evaluation. However, for the +ANN algorithm itself, anything based on the relevance score or ranking is not applicable. Ranking in vector search relies on the distance +between the query and the document in the vector space, however distance is not going to change due to approximation, as the function is +still the same. -![Codebook and Centroids example](https://qdrant.tech/articles_data/what-is-vector-quantization/code-book.png) +Therefore, it only makes sense to measure the quality of the ANN algorithm by the number of relevant documents in the top-k search results, +such as `precision@k`. It is calculated as the number of relevant documents in the top-k search results divided by `k`. 
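+For instance, if 4 of the top-5 approximate results also appear in the exact top-5, then:
+
+$$
+\text{precision@5} = \frac{4}{5} = 0.8
+$$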
In case of testing +just the ANN algorithm, we can use the exact kNN search as a ground truth, with `k` being fixed. It will be a measure on **how well the ANN +algorithm approximates the exact search**. -Qdrant always selects **K = 256** as the number of centroids in its implementation, based on the fact that 256 is the maximum number of unique values that can be represented by a single byte. +## Measure the quality of the search results -This makes the compression process efficient because each centroid index can be stored in a single byte. +Let's build a quality [evaluation](https://qdrant.tech/rag/rag-evaluation-guide/) of the ANN algorithm in Qdrant. We will, first, call the search endpoint in a standard way to obtain +the approximate search results. Then, we will call the exact search endpoint to obtain the exact matches, and finally compare both results +in terms of precision. -The original high-dimensional vectors are quantized by mapping each sub-vector to the nearest centroid in its respective codebook. +Before we start, let's create a collection, fill it with some data and then start our evaluation. We will use the same dataset as in the +[Loading a dataset from Hugging Face hub](/documentation/tutorials/huggingface-datasets/) tutorial, `Qdrant/arxiv-titles-instructorxl-embeddings` +from the [Hugging Face hub](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings). Let's download it in a streaming +mode, as we are only going to use part of it. -![Vectors being mapped to their corresponding centroids example](https://qdrant.tech/articles_data/what-is-vector-quantization/mapping.png) +```python +from datasets import load_dataset -The compressed vector stores the index of the closest centroid for each sub-vector. +dataset = load_dataset( + "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True +) +``` -Here’s how a 1024-dimensional vector, originally taking up 4096 bytes, is reduced to just 128 bytes by representing it as 128 indexes, each pointing to the centroid of a sub-vector: +We need some data to be indexed and another set for the testing purposes. Let's get the first 50000 items for the training and the next 1000 +for the testing. -![Product Quantization example](https://qdrant.tech/articles_data/what-is-vector-quantization/product-quant.png) +```python +dataset_iterator = iter(dataset) +train_dataset = [next(dataset_iterator) for _ in range(60000)] +test_dataset = [next(dataset_iterator) for _ in range(1000)] +``` -After setting up quantization and adding your vectors, you can perform searches as usual. Qdrant will automatically use the quantized vectors, optimizing both speed and memory usage. Optionally, you can enable rescoring for better accuracy. +Now, let's create a collection and index the training data. This collection will be created with the default configuration. Please be aware that +it might be different from your collection settings, and it's always important to test exactly the same configuration you are going to use later +in production. 
-httppython + -```http -POST /collections/{collection_name}/points/search -{ - "query": [0.22, -0.01, -0.98, 0.37], - "params": { - "quantization": { - "rescore": true - } - }, - "limit": 10 -} +```python +from qdrant_client import QdrantClient, models +client = QdrantClient("http://localhost:6333") +client.create_collection( + collection_name="arxiv-titles-instructorxl-embeddings", + vectors_config=models.VectorParams( + size=768, # Size of the embeddings generated by InstructorXL model + distance=models.Distance.COSINE, + ), +) ``` +We are now ready to index the training data. Uploading the records is going to trigger the indexing process, which will build the HNSW graph. +The indexing process may take some time, depending on the size of the dataset, but your data is going to be available for search immediately +after receiving the response from the `upsert` endpoint. **As long as the indexing is not finished, and HNSW not built, Qdrant will perform +the exact search**. We have to wait until the indexing is finished to be sure that the approximate search is performed. + ```python -client.query_points( - collection_name="my_collection", - query_vector=[0.22, -0.01, -0.98, 0.37], # Your query vector - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - rescore=True # Enables rescoring with original vectors +client.upload_points( # upload_points is available as of qdrant-client v1.7.1 + collection_name="arxiv-titles-instructorxl-embeddings", + points=[ + models.PointStruct( + id=item["id"], + vector=item["vector"], + payload=item, ) - ), - limit=10 # Return the top 10 results + for item in train_dataset + ] ) +while True: + collection_info = client.get_collection(collection_name="arxiv-titles-instructorxl-embeddings") + if collection_info.status == models.CollectionStatus.GREEN: + # Collection status is green, which means the indexing is finished + break ``` -Product Quantization can significantly reduce memory usage, potentially offering up to **64x** compression in certain configurations. However, it’s important to note that this level of compression can lead to a noticeable drop in quality. - -If your application requires high precision or real-time performance, Product Quantization may not be the best choice. However, if **memory savings** are critical and some accuracy loss is acceptable, it could still be an ideal solution. - -Here’s a comparison of speed, accuracy, and compression for all three methods, adapted from [Qdrant’s documentation](https://qdrant.tech/documentation/guides/quantization/#how-to-choose-the-right-quantization-method): +## Standard mode vs exact search -| Quantization method | Accuracy | Speed | Compression | -| --- | --- | --- | --- | -| Scalar | 0.99 | up to x2 | 4 | -| Product | 0.7 | 0.5 | up to 64 | -| Binary | 0.95\* | up to x40 | 32 | +Qdrant has a built-in exact search mode, which can be used to measure the quality of the search results. In this mode, Qdrant performs a +full kNN search for each query, without any approximation. It is not suitable for production use with high load, but it is perfect for the +evaluation of the ANN algorithm and its parameters. It might be triggered by setting the `exact` parameter to `True` in the search request. +We are simply going to use all the examples from the test dataset as queries and compare the results of the approximate search with the +results of the exact search. 
Let's create a helper function with `k` being a parameter, so we can calculate the `precision@k` for different +values of `k`. -\\* \- for compatible models +```python +def avg_precision_at_k(k: int): + precisions = [] + for item in test_dataset: + ann_result = client.query_points( + collection_name="arxiv-titles-instructorxl-embeddings", + query=item["vector"], + limit=k, + ).points + + knn_result = client.query_points( + collection_name="arxiv-titles-instructorxl-embeddings", + query=item["vector"], + limit=k, + search_params=models.SearchParams( + exact=True, # Turns on the exact search mode + ), + ).points -For a more in-depth understanding of the benchmarks you can expect, check out our dedicated article on [Product Quantization in Vector Search](https://qdrant.tech/articles/product-quantization/). + # We can calculate the precision@k by comparing the ids of the search results + ann_ids = set(item.id for item in ann_result) + knn_ids = set(item.id for item in knn_result) + precision = len(ann_ids.intersection(knn_ids)) / k + precisions.append(precision) + + return sum(precisions) / len(precisions) +``` -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#rescoring-oversampling-and-reranking) Rescoring, Oversampling, and Reranking +Calculating the `precision@5` is as simple as calling the function with the corresponding parameter: -When we use quantization methods like Scalar, Binary, or Product Quantization, we’re compressing our vectors to save memory and improve performance. However, this compression removes some detail from the original vectors. +```python +print(f"avg(precision@5) = {avg_precision_at_k(k=5)}") +``` -This can slightly reduce the accuracy of our similarity searches because the quantized vectors are approximations of the original data. To mitigate this loss of accuracy, you can use **oversampling** and **rescoring**, which help improve the accuracy of the final search results. +Response: -The original vectors are never deleted during this process, and you can easily switch between quantization methods or parameters by updating the collection configuration at any time. +```text +avg(precision@5) = 0.9935999999999995 +``` -Here’s how the process works, step by step: +As we can see, the precision of the approximate search vs exact search is pretty high. There are, however, some scenarios when we +need higher precision and can accept higher latency. HNSW is pretty tunable, and we can increase the precision by changing its parameters. + +## Tweaking the HNSW parameters -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#1-initial-quantized-search) 1\. Initial Quantized Search +HNSW is a hierarchical graph, where each node has a set of links to other nodes. The number of edges per node is called the `m` parameter. +The larger the value of it, the higher the precision of the search, but more space required. The `ef_construct` parameter is the number of +neighbours to consider during the index building. Again, the larger the value, the higher the precision, but the longer the indexing time. +The default values of these parameters are `m=16` and `ef_construct=100`. Let's try to increase them to `m=32` and `ef_construct=200` and +see how it affects the precision. Of course, we need to wait until the indexing is finished before we can perform the search. -When you perform a search, Qdrant retrieves the top candidates using the quantized vectors based on their similarity to the query vector, as determined by the quantized data. 
This step is fast because we’re using the quantized vectors. +```python +client.update_collection( + collection_name="arxiv-titles-instructorxl-embeddings", + hnsw_config=models.HnswConfigDiff( + m=32, # Increase the number of edges per node from the default 16 to 32 + ef_construct=200, # Increase the number of neighbours from the default 100 to 200 + ) +) -![ANN Search with Quantization](https://qdrant.tech/articles_data/what-is-vector-quantization/ann-search-quantized.png) +while True: + collection_info = client.get_collection(collection_name="arxiv-titles-instructorxl-embeddings") + if collection_info.status == models.CollectionStatus.GREEN: + # Collection status is green, which means the indexing is finished + break +``` -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#2-oversampling) 2\. Oversampling +The same function can be used to calculate the average `precision@5`: -Oversampling is a technique that helps compensate for any precision lost due to quantization. Since quantization simplifies vectors, some relevant matches could be missed in the initial search. To avoid this, you can **retrieve more candidates**, increasing the chances that the most relevant vectors make it into the final results. +```python +print(f"avg(precision@5) = {avg_precision_at_k(k=5)}") +``` -You can control the number of extra candidates by setting an `oversampling` parameter. For example, if your desired number of results ( `limit`) is 4 and you set an `oversampling` factor of 2, Qdrant will retrieve 8 candidates (4 × 2). +Response: -![ANN Search with Quantization and Oversampling](https://qdrant.tech/articles_data/what-is-vector-quantization/ann-search-quantized-oversampling.png) +```text +avg(precision@5) = 0.9969999999999998 +``` -You can adjust the oversampling factor to control how many extra vectors Qdrant includes in the initial pool. More candidates mean a better chance of obtaining high-quality top-K results, especially after rescoring with the original vectors. +The precision has obviously increased, and we know how to control it. However, there is a trade-off between the precision and the search +latency and memory requirements. In some specific cases, we may want to increase the precision as much as possible, so now we know how +to do it. -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#3-rescoring-with-original-vectors) 3\. Rescoring with Original Vectors +## Wrapping up -After oversampling to gather more potential matches, each candidate is re-evaluated based on additional criteria to ensure higher accuracy and relevance to the query. +Assessing the quality of retrieval is a critical aspect of [evaluating](https://qdrant.tech/rag/rag-evaluation-guide/) semantic search performance. It is imperative to measure retrieval quality when aiming for optimal quality of. +your search results. Qdrant provides a built-in exact search mode, which can be used to measure the quality of the ANN algorithm itself, +even in an automated way, as part of your CI/CD pipeline. -The rescoring process **maps** the quantized vectors to their corresponding original vectors, allowing you to consider factors like context, metadata, or additional relevance that wasn’t included in the initial search, leading to more accurate results. +Again, **the quality of the embeddings is the most important factor**. HNSW does a pretty good job in terms of precision, and it is +parameterizable and tunable, when required. 
There are some other ANN algorithms available out there, such as [IVF*](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes#cell-probe-methods-indexivf-indexes), +but they usually [perform worse than HNSW in terms of quality and performance](https://nirantk.com/writing/pgvector-vs-qdrant/#correctness). -![Rescoring with Original Vectors](https://qdrant.tech/articles_data/what-is-vector-quantization/rescoring.png) +<|page-121-lllmstxt|> +# Configuring Networking, Logging & Monitoring in Qdrant Hybrid Cloud -During rescoring, one of the lower-ranked candidates from oversampling might turn out to be a better match than some of the original top-K candidates. +## Configure network policies -Even though rescoring uses the original, larger vectors, the process remains much faster because only a very small number of vectors are read. The initial quantized search already identifies the specific vectors to read, rescore, and rerank. +For security reasons, each database cluster is secured with network policies. By default, database pods only allow egress traffic between each and allow ingress traffic to ports 6333 (rest) and 6334 (grpc) from within the Kubernetes cluster. -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#4-reranking) 4\. Reranking +You can modify the default network policies in the Hybrid Cloud environment configuration: -With the new similarity scores from rescoring, **reranking** is where the final top-K candidates are determined based on the updated similarity scores. +```yaml +qdrant: + networkPolicies: + ingress: + - from: + - ipBlock: + cidr: 192.168.0.0/22 + - podSelector: + matchLabels: + app: client-app + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: client-namespace + - podSelector: + matchLabels: + app: traefik + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + ports: + - port: 6333 + protocol: TCP + - port: 6334 + protocol: TCP +``` -For example, in our case with a limit of 4, a candidate that ranked 6th in the initial quantized search might improve its score after rescoring because the original vectors capture more context or metadata. As a result, this candidate could move into the final top 4 after reranking, replacing a less relevant option from the initial search. +## Logging -![Reranking with Original Vectors](https://qdrant.tech/articles_data/what-is-vector-quantization/reranking.png) +You can access the logs with kubectl or the Kubernetes log management tool of your choice. For example: -Here’s how you can set it up: +```bash +kubectl -n qdrant-namespace logs -l app=qdrant,cluster-id=9a9f48c7-bb90-4fb2-816f-418a46a74b24 +``` -httppython +**Configuring log levels:** You can configure log levels for the databases individually in the configuration section of the Qdrant Cluster detail page. The log level for the **Qdrant Cloud Agent** and **Operator** can be set in the [Hybrid Cloud Environment configuration](/documentation/hybrid-cloud/operator-configuration/). -```http -POST /collections/{collection_name}/points/search +### Integrating with a log management system -{ - "query": [0.22, -0.01, -0.98, 0.37], - "params": { - "quantization": { - "rescore": true, - "oversampling": 2 - } - }, - "limit": 4 -} +You can integrate the logs into any log management system that supports Kubernetes. There are no Qdrant specific configurations necessary. Just configure the agents of your system to collect the logs from all Pods in the Qdrant namespace. 
-``` +## Monitoring -```python -client.query_points( - collection_name="my_collection", - query_vector=[0.22, -0.01, -0.98, 0.37], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - rescore=True, # Enables rescoring with original vectors - oversampling=2 # Retrieves extra candidates for rescoring - ) - ), - limit=4 # Desired number of final results -) +The Qdrant Cloud console gives you access to basic metrics about CPU, memory and disk usage of your Qdrant clusters. -``` +If you want to integrate the Qdrant metrics into your own monitoring system, you can instruct it to scrape the following endpoints that provide metrics in a Prometheus/OpenTelemetry compatible format: -You can adjust the `oversampling` factor to find the right balance between search speed and result accuracy. +* `/metrics` on port 6333 of every Qdrant database Pod, this provides metrics about each the database and its internals itself +* `/metrics` on port 9290 of the Qdrant Operator Pod, this provides metrics about the Operator, as well as the status of Qdrant Clusters and Snapshots +* `/metrics` on port 9090 of the Qdrant Cloud Agent Pod, this provides metrics about the Agent and its connection to the Qdrant Cloud control plane +* `/metrics` on port 8080 of the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) Pod, this provides metrics about the state of Kubernetes resources like Pods and PersistentVolumes within the Qdrant Hybrid Cloud namespace (useful, if you are not running kube-state-metrics cluster-wide anyway) -If quantization is impacting performance in an application that requires high accuracy, combining oversampling with rescoring is a great choice. However, if you need faster searches and can tolerate some loss in accuracy, you might choose to use oversampling without rescoring, or adjust the oversampling factor to a lower value. +### Grafana dashboard -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#distributing-resources-between-disk--memory) Distributing Resources Between Disk & Memory +If you scrape the above metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. -Qdrant stores both the quantized and original vectors. When you enable quantization, both the original and quantized vectors are stored in RAM by default. You can move the original vectors to disk to significantly reduce RAM usage and lower system costs. Simply enabling quantization is not enough—you need to explicitly move the original vectors to disk by setting `on_disk=True`. 
+![Grafa dashboard](/documentation/cloud/cloud-grafana-dashboard.png) -Here’s an example configuration: +<|page-122-lllmstxt|> +# Scaling PDF Retrieval with Qdrant -httppython +![scaling-pdf-retrieval-qdrant](/documentation/tutorials/pdf-retrieval-at-scale/image1.png) -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1536, - "distance": "Cosine", - "on_disk": true # Move original vectors to disk - }, - "quantization_config": { - "binary": { - "always_ram": true # Store only quantized vectors in RAM - } - } -} +| Time: 30 min | Level: Intermediate |Output: [GitHub](https://github.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb) | +| --- | ----------- | ----------- | ----------- | -``` +Efficient PDF documents retrieval is a common requirement in tasks like **(agentic) retrieval-augmented generation (RAG)** and many other search-based applications. At the same time, setting up PDF documents retrieval is rarely possible without additional challenges. -```python -client.update_collection( - collection_name="my_collection", - vectors_config=models.VectorParams( - size=1536, - distance=models.Distance.COSINE, - on_disk=True # Move original vectors to disk - ), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True # Store only quantized vectors in RAM - ) - ) -) +Many traditional PDF retrieval solutions rely on **optical character recognition (OCR)** together with use case-specific heuristics to handle visually complex elements like tables, images and charts. These algorithms are often non-transferable -- even within the same domain -- with their task-customized parsing and chunking strategies, labor-intensive, prone to errors, and difficult to scale. -``` +Recent advancements in **Vision Large Language Models (VLLMs)**, such as [**ColPali**](https://huggingface.co/blog/manu/colpali) and its successor [**ColQwen**](https://huggingface.co/vidore/colqwen2-v0.1), started the transformation of the PDF retrieval. These multimodal models work directly with PDF pages as inputs, no pre-processing required. Anything that can be converted into an **image** (think of PDFs as screenshots of document pages) can be effectively processed by these models. Being far simpler in use, VLLMs achieve state-of-the-art performance in PDF retrieval benchmarks like the [Visual Document Retrieval (ViDoRe) Benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard). -Without explicitly setting `on_disk=True`, you won’t see any RAM savings, even with quantization enabled. So, make sure to configure both storage and quantization options based on your memory and performance needs. If your storage has high disk latency, consider disabling rescoring to maintain speed. +## How VLLMs Work for PDF Retrieval -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#speeding-up-rescoring-with-io_uring) Speeding Up Rescoring with io\_uring +VLLMs like **ColPali** and **ColQwen** generate **multivector representations** for each PDF page; the representations are stored and indexed in a vector database. 
During the retrieval process, models dynamically create multivector representations for (textual) user queries, and precise retrieval -- matching between PDF pages and queries -- is achieved through [late-interaction mechanism](/blog/qdrant-colpali/#how-colpali-works-under-the-hood).
-When dealing with large collections of quantized vectors, frequent disk reads are required to retrieve both original and compressed data for rescoring operations. While `mmap` helps with efficient I/O by reducing user-to-kernel transitions, rescoring can still be slowed down when working with large datasets on disk due to the need for frequent disk reads.
+
-On Linux-based systems, `io_uring` allows multiple disk operations to be processed in parallel, significantly reducing I/O overhead. This optimization is particularly effective during rescoring, where multiple vectors need to be re-evaluated after the initial search. With io\_uring, Qdrant can retrieve and rescore vectors from disk in the most efficient way, improving overall search performance.
+## Challenges of Scaling VLLMs
-When you perform vector quantization and store data on disk, Qdrant often needs to access multiple vectors in parallel. Without io\_uring, this process can be slowed down due to the system’s limitations in handling many disk accesses.
+The heavy multivector representations produced by VLLMs make PDF retrieval at scale computationally intensive. These models are inefficient for large-scale PDF retrieval tasks if used without optimization.
-To enable `io_uring` in Qdrant, add the following to your storage configuration:
+### Math Behind the Scaling
-```yaml
-storage:
-  async_scorer: true # Enable io_uring for async storage
+**ColPali** generates over **1,000 vectors per PDF page**, while its successor, **ColQwen**, generates slightly fewer — up to **768 vectors**, dynamically adjusted based on the image size. Typically, ColQwen produces **~700 vectors per page**.
-```
+To understand the impact, consider the construction of an [**HNSW index**](/articles/what-is-a-vector-database/#1-indexing-hnsw-index-and-sending-data-to-qdrant), a common indexing algorithm for vector databases. Let's roughly estimate the number of comparisons needed to insert a new PDF page into the index.
-Without this configuration, Qdrant will default to using `mmap` for disk I/O operations.
+- **Vectors per page:** ~700 (ColQwen) or ~1,000 (ColPali)
+- **[ef_construct](/documentation/concepts/indexing/#vector-index):** 100 (default)
-For more information and benchmarks comparing io\_uring with traditional I/O approaches like mmap, check out [Qdrant’s io\_uring implementation article.](https://qdrant.tech/articles/io_uring/)
+The lower bound estimation for the number of vector comparisons would be:
-## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#performance-of-quantized-vs-non-quantized-data) Performance of Quantized vs. Non-Quantized Data
+$$
+700 \times 700 \times 100 = 49\ \text{million}
+$$
-Qdrant uses the quantized vectors by default if they are available. If you want to evaluate how quantization affects your search results, you can temporarily disable it to compare results from quantized and non-quantized searches. To do this, set `ignore: true` in the query:
+Now imagine how long it will take to build an index on **20,000 pages**!
-httppython
+For ColPali, this number doubles. The result is **extremely slow index construction time**.
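+For comparison, with the mean-pooled representation introduced in the next section (roughly 32 vectors per page for ColPali), the same rough estimate drops to:
+
+$$
+32 \times 32 \times 100 \approx 10^{5}
+$$
+
+comparisons per page, which is why the two-stage approach described below scales so much better.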
-```http -POST /collections/{collection_name}/points/query -{ - "query": [0.22, -0.01, -0.98, 0.37], - "params": { - "quantization": { - "ignore": true, - } - }, - "limit": 4 -} +### Our Solution -``` +We recommend reducing the number of vectors in a PDF page representation for the **first-stage retrieval**. After the first stage retrieval with a reduced amount of vectors, we propose to **rerank** retrieved subset with the original uncompressed representation. -```python -client.query_points( - collection_name="{collection_name}", - query=[0.22, -0.01, -0.98, 0.37], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - ignore=True - ) - ), -) + -``` +The reduction of vectors can be achieved by applying a **mean pooling operation** to the multivector VLLM-generated outputs. Mean pooling averages the values across all vectors within a selected subgroup, condensing multiple vectors into a single representative vector. If done right, it allows the preservation of important information from the original page while significantly reducing the number of vectors. -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#switching-between-quantization-methods) Switching Between Quantization Methods +VLLMs generate vectors corresponding to patches that represent different portions of a PDF page. These patches can be grouped in columns and rows of a PDF page. -Not sure if you’ve chosen the right quantization method? In Qdrant, you have the flexibility to remove quantization and rely solely on the original vectors, adjust the quantization type, or change compression parameters at any time without affecting your original vectors. +For example: +- ColPali divides PDF page into **1,024 patches**. +- Applying mean pooling by rows (or columns) of this patch matrix reduces the page representation to just **32 vectors**. -To switch to binary quantization and adjust the compression rate, for example, you can update the collection’s quantization configuration using the `update_collection` method: +![ColPali patching of a PDF page](/documentation/tutorials/pdf-retrieval-at-scale/pooling-by-rows.png) -httppython +We tested this approach with the ColPali model, mean pooling its multivectors by PDF page rows. The results showed: +- **Indexing time faster by an order of magnitude** +- **Retrieval quality comparable to the original model** -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1536, - "distance": "Cosine" - }, - "quantization_config": { - "binary": { - "always_ram": true, - "compression_rate": 0.8 # Set the new compression rate - } - } -} +For details of this experiment refer to our [gitHub repository](https://github.com/qdrant/demo-colpali-optimized), [ColPali optimization blog post](/blog/colpali-qdrant-optimization/) or [webinar "PDF Retrieval at Scale"](https://www.youtube.com/watch?v=_h6SN1WwnLs) -``` +## Goal of This Tutorial -```python -client.update_collection( - collection_name="my_collection", - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, # Store only quantized vectors in RAM - compression_rate=0.8 # Set the new compression rate - ) - ), -) +In this tutorial, we will demonstrate a scalable approach to PDF retrieval using **Qdrant** and **ColPali** & **ColQwen2** VLLMs. +The presented approach is **highly recommended** to avoid the common pitfalls of long indexing times and slow retrieval speeds. 
-``` +In the following sections, we will demonstrate an optimized retrieval algorithm born out of our successful experimentation: -If you decide to **turn off quantization** and use only the original vectors, you can remove the quantization settings entirely with `quantization_config=None`: +**First-Stage Retrieval with Mean-Pooled Vectors:** + - Construct an HNSW index using **only mean-pooled vectors**. + - Use them for the first-stage retrieval. -httppython +**Reranking with Original Model Multivectors:** + - Use the original multivectors from ColPali or ColQwen2 **to rerank** the results retrieved in the first stage. -```http -PUT /collections/my_collection -{ - "vectors": { - "size": 1536, - "distance": "Cosine" - }, - "quantization_config": null # Remove quantization and use original vectors only -} +## Setup +Install & import required libraries +```python +# pip install colpali_engine>=0.3.1 +from colpali_engine.models import ColPali, ColPaliProcessor +# pip install qdrant-client>=1.12.0 +from qdrant_client import QdrantClient, models ``` +To run these experiments, we’re using a **Qdrant cluster**. If you’re just getting started, you can set up a **free-tier cluster** for testing and exploration. Follow the instructions in the documentation ["How to Create a Free-Tier Qdrant Cluster"](/documentation/cloud/create-cluster/#free-clusters) + ```python -client.update_collection( - collection_name="my_collection", - quantization_config=None # Remove quantization and rely on original vectors only +client = QdrantClient( + url=, + api_key= ) - ``` -## [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#wrapping-up) Wrapping Up - -![](https://qdrant.tech/articles_data/what-is-vector-quantization/astronaut-running.jpg) - -Quantization methods like Scalar, Product, and Binary Quantization offer powerful ways to optimize memory usage and improve search performance when dealing with large datasets of high-dimensional vectors. Each method comes with its own trade-offs between memory savings, computational speed, and accuracy. - -Here are some final thoughts to help you choose the right quantization method for your needs: - -| **Quantization Method** | **Key Features** | **When to Use** | -| --- | --- | --- | -| **Binary Quantization** | ‱ **Fastest method and most memory-efficient**
‱ Up to **40x** faster search and **32x** reduced memory footprint | ‱ Use with tested models like OpenAI’s `text-embedding-ada-002` and Cohere’s `embed-english-v2.0`
‱ When speed and memory efficiency are critical | -| **Scalar Quantization** | ‱ **Minimal loss of accuracy**
‱ Up to **4x** reduced memory footprint | ‱ Safe default choice for most applications.
‱ Offers a good balance between accuracy, speed, and compression. | -| **Product Quantization** | ‱ **Highest compression ratio**
‱ Up to **64x** reduced memory footprint | ‱ When minimizing memory usage is the top priority
‱ Acceptable if some loss of accuracy and slower indexing is tolerable | - -### [Anchor](https://qdrant.tech/articles/what-is-vector-quantization/\#learn-more) Learn More - -If you want to learn more about improving accuracy, memory efficiency, and speed when using quantization in Qdrant, we have a dedicated [Quantization tips](https://qdrant.tech/documentation/guides/quantization/#quantization-tips) section in our docs that explains all the quantization tips you can use to enhance your results. +Download **ColPali** model along with its input processors. Make sure to select the backend that suits your setup. -Learn more about optimizing real-time precision with oversampling in Binary Quantization by watching this interview with Qdrant’s CTO, Andrey Vasnetsov: +```python +colpali_model = ColPali.from_pretrained( + "vidore/colpali-v1.3", + torch_dtype=torch.bfloat16, + device_map="mps", # Use "cuda:0" for GPU, "cpu" for CPU, or "mps" for Apple Silicon + ).eval() -Binary Quantization - Andrey Vasnetsov \| Vector Space Talk #001 - YouTube +colpali_processor = ColPaliProcessor.from_pretrained("vidore/colpali-v1.3") +``` -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +
+ For ColQwen model -Qdrant - Vector Database & Search Engine +```python +from colpali_engine.models import ColQwen2, ColQwen2Processor -8.12K subscribers +colqwen_model = ColQwen2.from_pretrained( + "vidore/colqwen2-v0.1", + torch_dtype=torch.bfloat16, + device_map="mps", # Use "cuda:0" for GPU, "cpu" for CPU, or "mps" for Apple Silicon + ).eval() -[Binary Quantization - Andrey Vasnetsov \| Vector Space Talk #001](https://www.youtube.com/watch?v=4aUq5VnR_VI) +colqwen_processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v0.1") +``` +
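+The download snippets above rely on `torch` for `torch.bfloat16`, which the Setup block never imports, and the client call in Setup leaves `url` and `api_key` empty. A minimal, self-contained version of that setup is sketched below; the URL and API key are placeholders to replace with your own values:
+
+```python
+import torch  # needed for torch.bfloat16 in the model loading snippets
+from qdrant_client import QdrantClient
+
+# Placeholders: point this at your cluster, or use
+# QdrantClient(url="http://localhost:6333") for a local instance.
+client = QdrantClient(
+    url="https://YOUR-CLUSTER-URL.cloud.qdrant.io",
+    api_key="YOUR_API_KEY",
+)
+```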
-Qdrant - Vector Database & Search Engine -Search -Watch later +## Create Qdrant Collections -Share +We can now create a collection in Qdrant to store the multivector representations of PDF pages generated by **ColPali** or **ColQwen**. -Copy link +Collection will include **mean pooled** by rows and columns representations of a PDF page, as well as the **original** multivector representation. -Info + -Shopping +```python +client.create_collection( + collection_name=collection_name, + vectors_config={ + "original": + models.VectorParams( #switch off HNSW + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + hnsw_config=models.HnswConfigDiff( + m=0 #switching off HNSW + ) + ), + "mean_pooling_columns": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ) + ), + "mean_pooling_rows": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ) + ) + } +) +``` +## Choose a dataset -Tap to unmute +We’ll use the **UFO Dataset** by Daniel van Strien for this tutorial. It’s available on Hugging Face; you can download it directly from there. -If playback doesn't begin shortly, try restarting your device. +```python +from datasets import load_dataset +ufo_dataset = "davanstrien/ufo-ColPali" +dataset = load_dataset(ufo_dataset, split="train") +``` -More videos +## Embedding and Mean Pooling -## More videos +We'll use a function that generates multivector representations and their mean pooled versions of each PDF page (aka image) in batches. +For complete understanding, it's important to consider the following specifics of **ColPali** and **ColQwen**: -You're signed out +**ColPali:** +In theory, ColPali is designed to generate 1,024 vectors per PDF page, but in practice, it produces 1,030 vectors. This discrepancy is due to ColPali's pre-processor, which appends the text `Describe the image.` to each input. This additional text generates an extra 6 multivectors. -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +**ColQwen:** +ColQwen dynamically determines the number of patches in "rows and columns" of a PDF page based on its size. Consequently, the number of multivectors can vary between inputs. ColQwen pre-processor prepends `<|im_start|>user<|vision_start|>` and appends `<|vision_end|>Describe the image.<|im_end|><|endoftext|>`. -CancelConfirm +For example, that's how ColQwen multivector output is formed. -Share +![that's how ColQwen multivector output is formed](/documentation/tutorials/pdf-retrieval-at-scale/ColQwen-preprocessing.png) -Include playlist +The `get_patches` function is to get the number of `x_patches` (rows) and `y_patches` (columns) ColPali/ColQwen2 models will divide a PDF page into. +For ColPali, the numbers will always be 32 by 32; ColQwen will define them dynamically based on the PDF page size. -An error occurred while retrieving sharing information. Please try again later. +```python +x_patches, y_patches = model_processor.get_n_patches( + image_size, + patch_size=model.patch_size +) +``` -[Watch on](https://www.youtube.com/watch?v=4aUq5VnR_VI&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +
+ For ColQwen model -0:00 +```python +model_processor.get_n_patches( + image_size, + patch_size=model.patch_size, + spatial_merge_size=model.spatial_merge_size +) +``` +
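+As a quick sanity check of the numbers mentioned above (plain arithmetic, not model code): ColPali's grid is fixed at 32 by 32 patches, and the pre-processor's appended instruction accounts for the extra multivectors.
+
+```python
+x_patches, y_patches = 32, 32          # fixed patch grid for ColPali
+image_tokens = x_patches * y_patches   # 1024 patch multivectors
+special_tokens = 6                     # added by the "Describe the image." suffix
+assert image_tokens + special_tokens == 1030
+```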
-0:00 / 20:44 -‱Live -‱ +We choose to **preserve prefix and postfix multivectors**. Our **pooling** operation compresses the multivectors representing **the image tokens** based on the number of rows and columns determined by the model (static 32x32 for ColPali, dynamic XxY for ColQwen). Function retains and integrates the additional multivectors produced by the model back to pooled representations. -[Watch on YouTube](https://www.youtube.com/watch?v=4aUq5VnR_VI "Watch on YouTube") -Stay up-to-date on the latest in [vector search](https://qdrant.tech/advanced-search/) and quantization, share your projects, ask questions, [join our vector search community](https://discord.com/invite/qdrant)! +Simplified version of pooling for **ColPali** model: -##### Was this page useful? +(see the full version -- also applicable for **ColQwen** -- in the [tutorial notebook](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb)) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python -Thank you for your feedback! 🙏 +processed_images = model_processor.process_images(image_batch) +# Image embeddings of shape (batch_size, 1030, 128) +image_embeddings = model(**processed_images) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-is-quantization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +# (1030, 128) +image_embedding = image_embeddings[0] # take the first element of the batch -On this page: +# Now we need to identify vectors that correspond to the image tokens +# It can be done by selecting tokens corresponding to special `image_token_id` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-is-quantization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +# (1030, ) - boolean mask (for the first element in the batch), True for image tokens +mask = processed_images.input_ids[0] == model_processor.image_token_id -× +# For convenience, we now select only image tokens +# and reshape them to (x_patches, y_patches, dim) -[Powered by](https://qdrant.tech/) +# (x_patches, y_patches, 128) +image_patch_embeddings = image_embedding[mask].view(x_patches, y_patches, model.dim) -![Company Logo](https://cdn.cookielaw.org/logos/static/ot_company_logo.png) +# Now we can apply mean pooling by rows and columns -## Privacy Preference Center +# (x_patches, 128) +pooled_by_rows = image_patch_embeddings.mean(dim=0) -Cookies used on the site are categorized, and below, you can read about each category and allow or deny some or all of them. When categories that have been previously allowed are disabled, all cookies assigned to that category will be removed from your browser. -Additionally, you can see a list of cookies assigned to each category and detailed information in the cookie declaration. 
+# (y_patches, 128) +pooled_by_columns = image_patch_embeddings.mean(dim=1) +# [Optionally] we can also concatenate special tokens to the pooled representations, +# For ColPali, it's only postfix -[More information](https://qdrant.tech/legal/privacy-policy/#cookies-and-web-beacons) +# (x_patches + 6, 128) +pooled_by_rows = torch.cat([pooled_by_rows, image_embedding[~mask]]) -Allow All +# (y_patches + 6, 128) +pooled_by_columns = torch.cat([pooled_by_columns, image_embedding[~mask]]) +``` -### Manage Consent Preferences + -#### Targeting Cookies +## Upload to Qdrant -Targeting Cookies +The upload process is trivial; the only thing to pay attention to is the compute cost for ColPali and ColQwen2 models. +In low-resource environments, it's recommended to use a smaller batch size for embedding and mean pooling. -These cookies may be set through our site by our advertising partners. They may be used by those companies to build a profile of your interests and show you relevant adverts on other sites. They do not store directly personal information, but are based on uniquely identifying your browser and internet device. If you do not allow these cookies, you will experience less targeted advertising. +Full version of the upload code is available in the [tutorial notebook](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb) -#### Functional Cookies -Functional Cookies +## Querying PDFs -These cookies enable the website to provide enhanced functionality and personalisation. They may be set by us or by third party providers whose services we have added to our pages. If you do not allow these cookies then some or all of these services may not function properly. +After indexing PDF documents, we can move on to querying them using our two-stage retrieval approach. -#### Strictly Necessary Cookies +```python +query = "Lee Harvey Oswald's involvement in the JFK assassination" +processed_queries = model_processor.process_queries([query]).to(model.device) -Always Active +# Resulting query embedding is a tensor of shape (22, 128) +query_embedding = model(**processed_queries)[0] +``` -These cookies are necessary for the website to function and cannot be switched off in our systems. They are usually only set in response to actions made by you which amount to a request for services, such as setting your privacy preferences, logging in or filling in forms. You can set your browser to block or alert you about these cookies, but some parts of the site will not then work. These cookies do not store any personally identifiable information. +Now let's design a function for the two-stage retrieval with multivectors produced by VLLMs: -#### Performance Cookies +- **Step 1:** Prefetch results using a compressed multivector representation & HNSW index. +- **Step 2:** Re-rank the prefetched results using the original multivector representation. -Performance Cookies +Let's query our collections using combined mean pooled representations for the first stage of retrieval. -These cookies allow us to count visits and traffic sources so we can measure and improve the performance of our site. They help us to know which pages are the most and least popular and see how visitors move around the site. All information these cookies collect is aggregated and therefore anonymous. If you do not allow these cookies we will not know when you have visited our site, and will not be able to monitor its performance. 
+```python +# Final amount of results to return +search_limit = 10 +# Amount of results to prefetch for reranking +prefetch_limit = 100 -Back Button +response = client.query_points( + collection_name=collection_name, + query=query_embedding, + prefetch=[ + models.Prefetch( + query=query_embedding, + limit=prefetch_limit, + using="mean_pooling_columns" + ), + models.Prefetch( + query=query_embedding, + limit=prefetch_limit, + using="mean_pooling_rows" + ), + ], + limit=search_limit, + with_payload=True, + with_vector=False, + using="original" +) +``` -### Cookie List +And check the top retrieved result to our query *"Lee Harvey Oswald's involvement in the JFK assassination"*. -Search Icon +```python +dataset[response.points[0].payload['index']]['image'] +``` -Filter Icon +![Results, ColPali](/documentation/tutorials/pdf-retrieval-at-scale/result-VLLMs.png) -Clear -checkbox labellabel +## Conclusion -ApplyCancel +In this tutorial, we demonstrated an optimized approach using **Qdrant for PDF retrieval at scale** with VLLMs producing **heavy multivector representations** like **ColPali** and **ColQwen2**. -ConsentLeg.Interest +Without such optimization, the performance of retrieval systems can degrade severely, both in terms of indexing time and query latency, especially as the dataset size grows. -checkbox labellabel +We **strongly recommend** implementing this approach in your workflows to ensure efficient and scalable PDF retrieval. Neglecting to optimize the retrieval process could result in unacceptably slow performance, hindering the usability of your system. -checkbox labellabel +Start scaling your PDF retrieval today! -checkbox labellabel +<|page-123-lllmstxt|> +# How to use miniCOIL, Qdrant's Sparse Neural Retriever -Reject AllConfirm My Choices +**miniCOIL** is an open-sourced sparse neural retrieval model that acts as if a BM25-based retriever understood the contextual meaning of keywords and ranked results accordingly. -[![Powered by Onetrust](https://cdn.cookielaw.org/logos/static/powered_by_logo.svg)](https://www.onetrust.com/products/cookie-consent/) +**miniCOIL** scoring is based on the BM25 formula scaled by the semantic similarity between matched keywords in a query and a document. +$$ +\text{miniCOIL}(D,Q) = \sum_{i=1}^{N} \text{IDF}(q_i) \cdot \text{Importance}^{q_i}_{D} \cdot {\color{YellowGreen}\text{Meaning}^{q_i \times d_j}} \text{, where keyword } d_j \in D \text{ equals } q_i +$$ -<|page-82-lllmstxt|> -## indexing -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Indexing +A detailed breakdown of the idea behind miniCOIL can be found in the +["miniCOIL: on the road to Usable Sparse Neural Retreival" article](https://qdrant.tech/articles/minicoil/) or, in a [recorded talk "miniCOIL: Sparse Neural Retrieval Done Right"](https://youtu.be/f1sBJMSgBXA?si=G3C5--UVRKAW5WJ0). -# [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#indexing) Indexing +This tutorial will demonstrate how miniCOIL-based sparse neural retrieval performs compared to BM25-based lexical retrieval. -A key feature of Qdrant is the effective combination of vector and traditional indexes. It is essential to have this because for vector search to work effectively with filters, having vector index only is not enough. In simpler terms, a vector index speeds up vector search, and payload indexes speed up filtering. 
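+To make the scoring formula above concrete, here is a toy calculation for a single matched keyword. The numbers are invented purely for illustration; real values come from the collection statistics and the miniCOIL model:
+
+```python
+# Toy miniCOIL contribution of one matched keyword (all values are made up)
+idf = 1.8          # inverse document frequency of the keyword, computed by Qdrant
+importance = 0.9   # BM25-style importance of the keyword in the document
+meaning = 0.75     # similarity of the keyword's meaning in the query vs. the document
+score = idf * importance * meaning
+print(score)       # 1.215
+```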
+## When to use miniCOIL -The indexes in the segments exist independently, but the parameters of the indexes themselves are configured for the whole collection. +When exact keyword matches in the retrieved results are a requirement, and all matches should be ranked based on the contextual meaning of keywords. -Not all segments automatically have indexes. -Their necessity is determined by the [optimizer](https://qdrant.tech/documentation/concepts/optimizer/) settings and depends, as a rule, on the number of stored points. +If results should be similar by meaning but are expressed differently, with no overlapping keywords, you should use dense embeddings or combine them with miniCOIL in a hybrid search setting. -## [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#payload-index) Payload Index +## Setup -Payload index in Qdrant is similar to the index in conventional document-oriented databases. -This index is built for a specific field and type, and is used for quick point requests by the corresponding filtering condition. +Install `qdrant-client` integration with `fastembed`. -The index is also used to accurately estimate the filter cardinality, which helps the [query planning](https://qdrant.tech/documentation/concepts/search/#query-planning) choose a search strategy. +```python +pip install "qdrant-client[fastembed]" +``` -Creating an index requires additional computational resources and memory, so choosing fields to be indexed is essential. Qdrant does not make this choice but grants it to the user. +Then, initialize the Qdrant client. You could use for experiments [a free cluster](https://qdrant.tech/documentation/cloud-quickstart/#authenticate-via-sdks) in Qdrant Cloud or run a [local Qdrant instance via Docker](https://qdrant.tech/documentation/quickstart/#initialize-the-client). + +We'll run our search on a list of book and article titles containing the keywords "*vector*" and "*search*" used in different contexts, to demonstrate how miniCOIL captures the meaning of these keywords as opposed to BM25. + +
+ A dataset + +```python +documents = [ + "Vector Graphics in Modern Web Design", + "The Art of Search and Self-Discovery", + "Efficient Vector Search Algorithms for Large Datasets", + "Searching the Soul: A Journey Through Mindfulness", + "Vector-Based Animations for User Interface Design", + "Search Engines: A Technical and Social Overview", + "The Rise of Vector Databases in AI Systems", + "Search Patterns in Human Behavior", + "Vector Illustrations: A Guide for Creatives", + "Search and Rescue: Technologies in Emergency Response", + "Vectors in Physics: From Arrows to Equations", + "Searching for Lost Time in the Digital Age", + "Vector Spaces and Linear Transformations", + "The Endless Search for Truth in Philosophy", + "3D Modeling with Vectors in Blender", + "Search Optimization Strategies for E-commerce", + "Vector Drawing Techniques with Open-Source Tools", + "In Search of Meaning: A Psychological Perspective", + "Advanced Vector Calculus for Engineers", + "Search Interfaces: UX Principles and Case Studies", + "The Use of Vector Fields in Meteorology", + "Search and Destroy: Cybersecurity in the 21st Century", + "From Bitmap to Vector: A Designer’s Guide", + "Search Engines and the Democratization of Knowledge", + "Vector Geometry in Game Development", + "The Human Search for Connection in a Digital World", + "AI-Powered Vector Search in Recommendation Systems", + "Searchable Archives: The History of Digital Retrieval", + "Vector Control Strategies in Public Health", + "The Search for Extraterrestrial Intelligence" +] +``` +
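+The setup step above mentions initializing the Qdrant client but does not include the call itself; a minimal sketch, assuming a local Docker instance (swap in your cluster URL and API key for Qdrant Cloud):
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+```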
-To mark a field as indexable, you can use the following: +## Create Collection +Let's create a collection to store and index titles. -httppythontypescriptrustjavacsharpgo +As miniCOIL was designed with Qdrant's ability to calculate the keywords Inverse Document Frequency (IDF) in mind, we need to configure miniCOIL sparse vectors with [IDF modifier](https://qdrant.tech/documentation/concepts/indexing/#idf-modifier). -```http -PUT /collections/{collection_name}/index -{ - "field_name": "name_of_the_field_to_index", - "field_schema": "keyword" -} + +```python +client.create_collection( + collection_name="{minicoil_collection_name}", + sparse_vectors_config={ + "minicoil": models.SparseVectorParams( + modifier=models.Modifier.IDF #Inverse Document Frequency + ) + } +) ``` +
+ Analogously, we configure a collection with BM25-based sparse vectors + ```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="name_of_the_field_to_index", - field_schema="keyword", +client.create_collection( + collection_name="{bm25_collection_name}", + sparse_vectors_config={ + "bm25": models.SparseVectorParams( + modifier=models.Modifier.IDF + ) + } ) - ``` -```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "name_of_the_field_to_index", - field_schema: "keyword", -}); +
-``` +## Convert to Sparse Vectors & Upload to Qdrant -```rust -use qdrant_client::qdrant::{CreateFieldIndexCollectionBuilder, FieldType}; +Next, we need to convert titles to miniCOIL sparse representations and upsert them into the configured collection. -client - .create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "name_of_the_field_to_index", - FieldType::Keyword, - ) - .wait(true), - ) - .await?; +Qdrant and FastEmbed integration allows for hiding the inference process under the hood. -``` +That means: -```java -import io.qdrant.client.grpc.Collections.PayloadSchemaType; +- FastEmbed downloads the selected model from Hugging Face; +- FastEmbed runs local inference under the hood; +- Inferenced sparse representations are uploaded to Qdrant. -client.createPayloadIndexAsync( - "{collection_name}", - "name_of_the_field_to_index", - PayloadSchemaType.Keyword, - null, - true, - null, - null); +```python +#Estimating the average length of the documents in the corpus +avg_documents_length = sum(len(document.split()) for document in documents) / len(documents) +client.upsert( + collection_name="{minicoil_collection_name}", + points=[ + models.PointStruct( + id=i, + payload={ + "text": documents[i] + }, + vector={ + # Sparse miniCOIL vectors + "minicoil": models.Document( + text=documents[i], + model="Qdrant/minicoil-v1", + options={"avg_len": avg_documents_length} + #Average length of documents in the corpus + # (a part of the BM25 formula on which miniCOIL is built) + ) + }, + ) + for i in range(len(documents)) + ], +) ``` -```csharp -using Qdrant.Client; - -var client = new QdrantClient("localhost", 6334); +
+ Analogously, we convert & upsert BM25-based sparse vectors -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "name_of_the_field_to_index" -); +```python +#Estimating the average length of the documents in the corpus +avg_documents_length = sum(len(document.split()) for document in documents) / len(documents) +client.upsert( + collection_name="{bm25_collection_name}", + points=[ + models.PointStruct( + id=i, + payload={ + "text": documents[i] + }, + vector={ + # Sparse vector from BM25 + "bm25": models.Document( + text=documents[i], + model="Qdrant/bm25", + options={"avg_len": avg_documents_length} + #Average length of documents in the corpus + # (a part of the BM25 formula) + ) + }, + ) + for i in range(len(documents)) + ], +) ``` -```go -import ( - "context" +
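+A quick way to confirm that both uploads went through is to count the points in each collection; a minimal check, using the same placeholder collection names as above:
+
+```python
+print(client.count(collection_name="{minicoil_collection_name}", exact=True))
+print(client.count(collection_name="{bm25_collection_name}", exact=True))
+```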
- "github.com/qdrant/go-client/qdrant" -) +## Retrieve with miniCOIL +Using query *"Vectors in Medicine"*, we'll demo the difference between miniCOIL and BM25-based retrieval. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +None of the indexed titles contain the keyword *"medicine"*, so it won't contribute to the similarity score. +At the same time, the word *"vector"* appears once in many titles, and its role is roughly equal in all of them from the perspective of the BM25-based retriever. +miniCOIL, however, can capture the meaning of the keyword *"vector"* in the context of *"medicine"* and match a document where *"vector"* is used in a medicine-related context. -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), -}) +For BM25-based retrieval: + +```python +query = "Vectors in Medicine" +client.query_points( + collection_name="{bm25_collection_name}", + query=models.Document( + text=query, + model="Qdrant/bm25" + ), + using="bm25", + limit=1, +) ``` -You can use dot notation to specify a nested field for indexing. Similar to specifying [nested filters](https://qdrant.tech/documentation/concepts/filtering/#nested-key). +Result will be: -Available field types are: +```bash +QueryResponse( + points=[ + ScoredPoint( + id=18, version=1, score=0.8405092, + payload={ + 'title': 'Advanced Vector Calculus for Engineers' + }, + vector=None, shard_key=None, order_value=None) + ] + ) +``` -- `keyword` \- for [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword) payload, affects [Match](https://qdrant.tech/documentation/concepts/filtering/#match) filtering conditions. -- `integer` \- for [integer](https://qdrant.tech/documentation/concepts/payload/#integer) payload, affects [Match](https://qdrant.tech/documentation/concepts/filtering/#match) and [Range](https://qdrant.tech/documentation/concepts/filtering/#range) filtering conditions. -- `float` \- for [float](https://qdrant.tech/documentation/concepts/payload/#float) payload, affects [Range](https://qdrant.tech/documentation/concepts/filtering/#range) filtering conditions. -- `bool` \- for [bool](https://qdrant.tech/documentation/concepts/payload/#bool) payload, affects [Match](https://qdrant.tech/documentation/concepts/filtering/#match) filtering conditions (available as of v1.4.0). -- `geo` \- for [geo](https://qdrant.tech/documentation/concepts/payload/#geo) payload, affects [Geo Bounding Box](https://qdrant.tech/documentation/concepts/filtering/#geo-bounding-box) and [Geo Radius](https://qdrant.tech/documentation/concepts/filtering/#geo-radius) filtering conditions. -- `datetime` \- for [datetime](https://qdrant.tech/documentation/concepts/payload/#datetime) payload, affects [Range](https://qdrant.tech/documentation/concepts/filtering/#range) filtering conditions (available as of v1.8.0). -- `text` \- a special kind of index, available for [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword) / string payloads, affects [Full Text search](https://qdrant.tech/documentation/concepts/filtering/#full-text-match) filtering conditions. -- `uuid` \- a special type of index, similar to `keyword`, but optimized for [UUID values](https://qdrant.tech/documentation/concepts/payload/#uuid). -Affects [Match](https://qdrant.tech/documentation/concepts/filtering/#match) filtering conditions. 
(available as of v1.11.0) - -Payload index may occupy some additional memory, so it is recommended to only use index for those fields that are used in filtering conditions. -If you need to filter by many fields and the memory limits does not allow to index all of them, it is recommended to choose the field that limits the search result the most. -As a rule, the more different values a payload value has, the more efficiently the index will be used. +While for miniCOIL-based retrieval: -### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#full-text-index) Full-text index +```python +query = "Vectors in Medicine" -_Available as of v0.10.0_ +client.query_points( + collection_name="{minicoil_collection_name}", + query=models.Document( + text=query, + model="Qdrant/minicoil-v1" + ), + using="minicoil", + limit=1 +) +``` -Qdrant supports full-text search for string payload. -Full-text index allows you to filter points by the presence of a word or a phrase in the payload field. +We will get: -Full-text index configuration is a bit more complex than other indexes, as you can specify the tokenization parameters. -Tokenization is the process of splitting a string into tokens, which are then indexed in the inverted index. +```bash +QueryResponse( + points=[ + ScoredPoint( + id=28, version=1, score=0.7005557, + payload={ + 'title': 'Vector Control Strategies in Public Health' + }, + vector=None, shard_key=None, order_value=None) + ] + ) +``` -To create a full-text index, you can use the following: +<|page-124-lllmstxt|> +# Interfaces -httppythontypescriptrustjavacsharpgo +Qdrant supports these "official" clients. -```http -PUT /collections/{collection_name}/index -{ - "field_name": "name_of_the_field_to_index", - "field_schema": { - "type": "text", - "tokenizer": "word", - "min_token_len": 2, - "max_token_len": 20, - "lowercase": true - } -} +> **Note:** If you are using a language that is not listed here, you can use the REST API directly or generate a client for your language +using [OpenAPI](https://github.com/qdrant/qdrant/blob/master/docs/redoc/master/openapi.json) +or [protobuf](https://github.com/qdrant/qdrant/tree/master/lib/api/src/grpc/proto) definitions. 
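+Using the REST API directly requires nothing more than an HTTP client; for example, listing collections on a local instance (a minimal sketch, assuming the default port and no API key):
+
+```python
+import requests
+
+# List all collections on a locally running Qdrant instance
+print(requests.get("http://localhost:6333/collections").json())
+```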
-``` +## Client Libraries -```python -from qdrant_client import QdrantClient, models +||Client Repository|Installation|Version| +|-|-|-|-| +|[![python](/docs/misc/python.webp)](https://python-client.qdrant.tech/)|**[Python](https://github.com/qdrant/qdrant-client)** + **[(Client Docs)](https://python-client.qdrant.tech/)**|`pip install qdrant-client[fastembed]`|[Latest Release](https://github.com/qdrant/qdrant-client/releases)| +|![typescript](/docs/misc/ts.webp)|**[JavaScript / Typescript](https://github.com/qdrant/qdrant-js)**|`npm install @qdrant/js-client-rest`|[Latest Release](https://github.com/qdrant/qdrant-js/releases)| +|![rust](/docs/misc/rust.png)|**[Rust](https://github.com/qdrant/rust-client)**|`cargo add qdrant-client`|[Latest Release](https://github.com/qdrant/rust-client/releases)| +|![golang](/docs/misc/go.webp)|**[Go](https://github.com/qdrant/go-client)**|`go get github.com/qdrant/go-client`|[Latest Release](https://github.com/qdrant/go-client/releases)| +|![.net](/docs/misc/dotnet.webp)|**[.NET](https://github.com/qdrant/qdrant-dotnet)**|`dotnet add package Qdrant.Client`|[Latest Release](https://github.com/qdrant/qdrant-dotnet/releases)| +|![java](/docs/misc/java.webp)|**[Java](https://github.com/qdrant/java-client)**|[Available on Maven Central](https://central.sonatype.com/artifact/io.qdrant/client)|[Latest Release](https://github.com/qdrant/java-client/releases)| -client = QdrantClient(url="http://localhost:6333") +## API Reference -client.create_payload_index( - collection_name="{collection_name}", - field_name="name_of_the_field_to_index", - field_schema=models.TextIndexParams( - type="text", - tokenizer=models.TokenizerType.WORD, - min_token_len=2, - max_token_len=15, - lowercase=True, - ), -) +All interaction with Qdrant takes place via the REST API. We recommend using REST API if you are using Qdrant for the first time or if you are working on a prototype. -``` +| API | Documentation | +| -------- | ------------------------------------------------------------------------------------ | +| REST API | [OpenAPI Specification](https://api.qdrant.tech/api-reference) | +| gRPC API | [gRPC Documentation](https://github.com/qdrant/qdrant/blob/master/docs/grpc/docs.md) | -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### gRPC Interface -const client = new QdrantClient({ host: "localhost", port: 6333 }); +The gRPC methods follow the same principles as REST. For each REST endpoint, there is a corresponding gRPC method. -client.createPayloadIndex("{collection_name}", { - field_name: "name_of_the_field_to_index", - field_schema: { - type: "text", - tokenizer: "word", - min_token_len: 2, - max_token_len: 15, - lowercase: true, - }, -}); +As per the [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml), the gRPC interface is available on the specified port. 
+```yaml +service: + grpc_port: 6334 ``` -```rust -use qdrant_client::qdrant::{ - payload_index_params::IndexParams, CreateFieldIndexCollectionBuilder, FieldType, - PayloadIndexParams, TextIndexParams, TokenizerType, -}; -use qdrant_client::Qdrant; - -let client = Qdrant::from_url("http://localhost:6334").build()?; + -client - .create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "name_of_the_field_to_index", - FieldType::Text, - ) - .field_index_params(PayloadIndexParams { - index_params: Some(IndexParams::TextIndexParams(TextIndexParams { - tokenizer: TokenizerType::Word as i32, - min_token_len: Some(2), - max_token_len: Some(10), - lowercase: Some(true), - })), - }), - ) - .await?; +Running the service inside of Docker will look like this: +```bash +docker run -p 6333:6333 -p 6334:6334 \ + -v $(pwd)/qdrant_storage:/qdrant/storage:z \ + qdrant/qdrant ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; -import io.qdrant.client.grpc.Collections.TextIndexParams; -import io.qdrant.client.grpc.Collections.TokenizerType; - -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); - -client - .createPayloadIndexAsync( - "{collection_name}", - "name_of_the_field_to_index", - PayloadSchemaType.Text, - PayloadIndexParams.newBuilder() - .setTextIndexParams( - TextIndexParams.newBuilder() - .setTokenizer(TokenizerType.Word) - .setMinTokenLen(2) - .setMaxTokenLen(10) - .setLowercase(true) - .build()) - .build(), - null, - null, - null) - .get(); +**When to use gRPC:** The choice between gRPC and the REST API is a trade-off between convenience and speed. gRPC is a binary protocol and can be more challenging to debug. We recommend using gRPC if you are already familiar with Qdrant and are trying to optimize the performance of your application. 
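+With the official Python client, switching to gRPC is a one-line change; a minimal sketch, assuming a local instance exposing the default ports from the configuration above:
+
+```python
+from qdrant_client import QdrantClient
+
+# prefer_grpc routes data operations over gRPC on port 6334
+client = QdrantClient(host="localhost", grpc_port=6334, prefer_grpc=True)
+print(client.get_collections())
+```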
-``` +<|page-125-lllmstxt|> +# API Reference -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +## Packages +- [qdrant.io/v1](#qdrantiov1) -var client = new QdrantClient("localhost", 6334); -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "name_of_the_field_to_index", - schemaType: PayloadSchemaType.Text, - indexParams: new PayloadIndexParams - { - TextIndexParams = new TextIndexParams - { - Tokenizer = TokenizerType.Word, - MinTokenLen = 2, - MaxTokenLen = 10, - Lowercase = true - } - } -); +## qdrant.io/v1 -``` +Package v1 contains API Schema definitions for the qdrant.io v1 API group -```go -import ( - "context" +### Resource Types +- [QdrantCloudRegion](#qdrantcloudregion) +- [QdrantCloudRegionList](#qdrantcloudregionlist) +- [QdrantCluster](#qdrantcluster) +- [QdrantClusterList](#qdrantclusterlist) +- [QdrantClusterRestore](#qdrantclusterrestore) +- [QdrantClusterRestoreList](#qdrantclusterrestorelist) +- [QdrantClusterScheduledSnapshot](#qdrantclusterscheduledsnapshot) +- [QdrantClusterScheduledSnapshotList](#qdrantclusterscheduledsnapshotlist) +- [QdrantClusterSnapshot](#qdrantclustersnapshot) +- [QdrantClusterSnapshotList](#qdrantclustersnapshotlist) +- [QdrantEntity](#qdrantentity) +- [QdrantEntityList](#qdrantentitylist) +- [QdrantRelease](#qdrantrelease) +- [QdrantReleaseList](#qdrantreleaselist) - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeText.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParamsText( - &qdrant.TextIndexParams{ - Tokenizer: qdrant.TokenizerType_Whitespace, - MinTokenLen: qdrant.PtrOf(uint64(2)), - MaxTokenLen: qdrant.PtrOf(uint64(10)), - Lowercase: qdrant.PtrOf(true), - }), -}) -``` -Available tokenizers are: +#### ClusterPhase -- `word` \- splits the string into words, separated by spaces, punctuation marks, and special characters. -- `whitespace` \- splits the string into words, separated by spaces. -- `prefix` \- splits the string into words, separated by spaces, punctuation marks, and special characters, and then creates a prefix index for each word. For example: `hello` will be indexed as `h`, `he`, `hel`, `hell`, `hello`. -- `multilingual` \- special type of tokenizer based on [charabia](https://github.com/meilisearch/charabia) package. It allows proper tokenization and lemmatization for multiple languages, including those with non-latin alphabets and non-space delimiters. See [charabia documentation](https://github.com/meilisearch/charabia) for full list of supported languages supported normalization options. In the default build configuration, qdrant does not include support for all languages, due to the increasing size of the resulting binary. Chinese, Japanese and Korean languages are not enabled by default, but can be enabled by building qdrant from source with `--features multiling-chinese,multiling-japanese,multiling-korean` flags. +_Underlying type:_ _string_ -See [Full Text match](https://qdrant.tech/documentation/concepts/filtering/#full-text-match) for examples of querying with full-text index. 
-### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#parameterized-index) Parameterized index -_Available as of v1.8.0_ -We’ve added a parameterized variant to the `integer` index, which allows -you to fine-tune indexing and search performance. -Both the regular and parameterized `integer` indexes use the following flags: +_Appears in:_ +- [QdrantClusterStatus](#qdrantclusterstatus) -- `lookup`: enables support for direct lookup using -[Match](https://qdrant.tech/documentation/concepts/filtering/#match) filters. -- `range`: enables support for -[Range](https://qdrant.tech/documentation/concepts/filtering/#range) filters. +| Field | Description | +| --- | --- | +| `Creating` | | +| `FailedToCreate` | | +| `Updating` | | +| `FailedToUpdate` | | +| `Scaling` | | +| `Upgrading` | | +| `Suspending` | | +| `Suspended` | | +| `FailedToSuspend` | | +| `Resuming` | | +| `FailedToResume` | | +| `Healthy` | | +| `NotReady` | | +| `RecoveryMode` | | +| `ManualMaintenance` | | -The regular `integer` index assumes both `lookup` and `range` are `true`. In -contrast, to configure a parameterized index, you would set only one of these -filters to `true`: -| `lookup` | `range` | Result | -| --- | --- | --- | -| `true` | `true` | Regular integer index | -| `true` | `false` | Parameterized integer index | -| `false` | `true` | Parameterized integer index | -| `false` | `false` | No integer index | +#### ComponentPhase -The parameterized index can enhance performance in collections with millions -of points. We encourage you to try it out. If it does not enhance performance -in your use case, you can always restore the regular `integer` index. +_Underlying type:_ _string_ -Note: If you set `"lookup": true` with a range filter, that may lead to -significant performance issues. 
-For example, the following code sets up a parameterized integer index which -supports only range filters: -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name}/index -{ - "field_name": "name_of_the_field_to_index", - "field_schema": { - "type": "integer", - "lookup": false, - "range": true - } -} -``` +_Appears in:_ +- [ComponentStatus](#componentstatus) -```python -from qdrant_client import QdrantClient, models +| Field | Description | +| --- | --- | +| `Ready` | | +| `NotReady` | | +| `Unknown` | | +| `NotFound` | | -client = QdrantClient(url="http://localhost:6333") -client.create_payload_index( - collection_name="{collection_name}", - field_name="name_of_the_field_to_index", - field_schema=models.IntegerIndexParams( - type=models.IntegerIndexType.INTEGER, - lookup=False, - range=True, - ), -) +#### ComponentReference -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createPayloadIndex("{collection_name}", { - field_name: "name_of_the_field_to_index", - field_schema: { - type: "integer", - lookup: false, - range: true, - }, -}); -``` -```rust -use qdrant_client::qdrant::{ - payload_index_params::IndexParams, CreateFieldIndexCollectionBuilder, FieldType, - IntegerIndexParams, PayloadIndexParams, -}; -use qdrant_client::Qdrant; -let client = Qdrant::from_url("http://localhost:6334").build()?; +_Appears in:_ +- [QdrantCloudRegionSpec](#qdrantcloudregionspec) -client - .create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "name_of_the_field_to_index", - FieldType::Integer, - ) - .field_index_params(PayloadIndexParams { - index_params: Some(IndexParams::IntegerIndexParams(IntegerIndexParams { - lookup: false, - range: true, - })), - }), - ) - .await?; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | APIVersion is the group and version of the component being referenced. | | | +| `kind` _string_ | Kind is the type of component being referenced | | | +| `name` _string_ | Name is the name of component being referenced | | | +| `namespace` _string_ | Namespace is the namespace of component being referenced. 
| | | +| `markedForDeletion` _boolean_ | MarkedForDeletion specifies whether the component is marked for deletion | | | -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.IntegerIndexParams; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; +#### ComponentStatus -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createPayloadIndexAsync( - "{collection_name}", - "name_of_the_field_to_index", - PayloadSchemaType.Integer, - PayloadIndexParams.newBuilder() - .setIntegerIndexParams( - IntegerIndexParams.newBuilder().setLookup(false).setRange(true).build()) - .build(), - null, - null, - null) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "name_of_the_field_to_index", - schemaType: PayloadSchemaType.Integer, - indexParams: new PayloadIndexParams - { - IntegerIndexParams = new() - { - Lookup = false, - Range = true - } - } -); -``` +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -```go -import ( - "context" +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name specifies the name of the component | | | +| `namespace` _string_ | Namespace specifies the namespace of the component | | | +| `version` _string_ | Version specifies the version of the component | | | +| `phase` _[ComponentPhase](#componentphase)_ | Phase specifies the current phase of the component | | | +| `message` _string_ | Message specifies the info explaining the current phase of the component | | | - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +#### EntityPhase -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeInteger.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParamsInt( - &qdrant.IntegerIndexParams{ - Lookup: false, - Range: true, - }), -}) +_Underlying type:_ _string_ -``` -### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#on-disk-payload-index) On-disk payload index -_Available as of v1.11.0_ -By default all payload-related structures are stored in memory. In this way, the vector index can quickly access payload values during search. -As latency in this case is critical, it is recommended to keep hot payload indexes in memory. -There are, however, cases when payload indexes are too large or rarely used. In those cases, it is possible to store payload indexes on disk. 
+_Appears in:_ +- [QdrantEntityStatus](#qdrantentitystatus) -To configure on-disk payload index, you can use the following index parameters: +| Field | Description | +| --- | --- | +| `Creating` | | +| `Ready` | | +| `Updating` | | +| `Failing` | | +| `Deleting` | | +| `Deleted` | | -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name}/index -{ - "field_name": "payload_field_name", - "field_schema": { - "type": "keyword", - "on_disk": true - } -} +#### EntityResult -``` +_Underlying type:_ _string_ -```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="payload_field_name", - field_schema=models.KeywordIndexParams( - type=models.KeywordIndexType.KEYWORD, - on_disk=True, - ), -) +EntityResult is the last result from the invocation to a manager -``` -```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "payload_field_name", - field_schema: { - type: "keyword", - on_disk: true - }, -}); -``` +_Appears in:_ +- [QdrantEntityStatusResult](#qdrantentitystatusresult) -```rust -use qdrant_client::qdrant::{ - CreateFieldIndexCollectionBuilder, - KeywordIndexParamsBuilder, - FieldType -}; -use qdrant_client::{Qdrant, QdrantError}; +| Field | Description | +| --- | --- | +| `Ok` | | +| `Pending` | | +| `Error` | | -let client = Qdrant::from_url("http://localhost:6334").build()?; -client.create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "payload_field_name", - FieldType::Keyword, - ) - .field_index_params( - KeywordIndexParamsBuilder::default() - .on_disk(true), - ), -); +#### GPU -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; -import io.qdrant.client.grpc.Collections.KeywordIndexParams; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createPayloadIndexAsync( - "{collection_name}", - "payload_field_name", - PayloadSchemaType.Keyword, - PayloadIndexParams.newBuilder() - .setKeywordIndexParams( - KeywordIndexParams.newBuilder() - .setOnDisk(true) - .build()) - .build(), - null, - null, - null) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "payload_field_name", - schemaType: PayloadSchemaType.Keyword, - indexParams: new PayloadIndexParams - { - KeywordIndexParams = new KeywordIndexParams - { - OnDisk = true - } - } -); +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `gpuType` _[GPUType](#gputype)_ | GPUType specifies the type of the GPU to use. If set, GPU indexing is enabled. | | Enum: [nvidia amd]
| +| `forceHalfPrecision` _boolean_ | ForceHalfPrecision for `f32` values while indexing.
`f16` conversion will take place
only inside GPU memory and won't affect storage type. | false | | +| `deviceFilter` _string array_ | DeviceFilter for GPU devices by hardware name. Case-insensitive.
List of substrings to match against the gpu device name.
Example: ["nvidia"]
If not specified, all devices are accepted. | | MinItems: 1
| +| `devices` _string array_ | Devices is a List of explicit GPU devices to use.
If the host has multiple GPUs, this option allows selecting specific devices
by their index in the list of found devices.
If `deviceFilter` is set, the indexes are applied after filtering.
If not specified, all devices are accepted. | | MinItems: 1
| +| `parallelIndexes` _integer_ | ParallelIndexes is the number of parallel indexes to run on the GPU. | 1 | Minimum: 1
| +| `groupsCount` _integer_ | GroupsCount is the amount of used vulkan "groups" of GPU.
In other words, how many points can be indexed by the GPU in parallel.
The optimal value might depend on the GPU model.
It is proportional to, but does not necessarily equal, the physical number of warps.
Do not change this value unless you know what you are doing. | | Minimum: 1
| +| `allowIntegrated` _boolean_ | AllowIntegrated specifies whether to allow integrated GPUs to be used. | false | | -``` -```go -import ( - "context" +#### GPUType - "github.com/qdrant/go-client/qdrant" -) +_Underlying type:_ _string_ -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +GPUType specifies the type of GPU to use. -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParamsKeyword( - &qdrant.KeywordIndexParams{ - OnDisk: qdrant.PtrOf(true), - }), -}) +_Validation:_ +- Enum: [nvidia amd] -``` +_Appears in:_ +- [GPU](#gpu) -Payload index on-disk is supported for following types: +| Field | Description | +| --- | --- | +| `nvidia` | | +| `amd` | | -- `keyword` -- `integer` -- `float` -- `datetime` -- `uuid` -- `text` -- `geo` -The list will be extended in future versions. +#### HelmRelease -### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#tenant-index) Tenant Index -_Available as of v1.11.0_ -Many vector search use-cases require multitenancy. In a multi-tenant scenario the collection is expected to contain multiple subsets of data, where each subset belongs to a different tenant. -Qdrant supports efficient multi-tenant search by enabling [special configuration](https://qdrant.tech/documentation/guides/multiple-partitions/) vector index, which disables global search and only builds sub-indexes for each tenant. -However, knowing that the collection contains multiple tenants unlocks more opportunities for optimization. -To optimize storage in Qdrant further, you can enable tenant indexing for payload fields. -This option will tell Qdrant which fields are used for tenant identification and will allow Qdrant to structure storage for faster search of tenant-specific data. -One example of such optimization is localizing tenant-specific data closer on disk, which will reduce the number of disk reads during search. -To enable tenant index for a field, you can use the following index parameters: +_Appears in:_ +- [QdrantCloudRegionSpec](#qdrantcloudregionspec) -httppythontypescriptrustjavacsharpgo +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `markedForDeletionAt` _string_ | MarkedForDeletionAt specifies the time when the helm release was marked for deletion | | | +| `object` _[HelmRelease](#helmrelease)_ | Object specifies the helm release object | | EmbeddedResource: \{\}
| -```http -PUT /collections/{collection_name}/index -{ - "field_name": "payload_field_name", - "field_schema": { - "type": "keyword", - "is_tenant": true - } -} -``` +#### HelmRepository -```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="payload_field_name", - field_schema=models.KeywordIndexParams( - type=models.KeywordIndexType.KEYWORD, - is_tenant=True, - ), -) -``` -```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "payload_field_name", - field_schema: { - type: "keyword", - is_tenant: true - }, -}); -``` -```rust -use qdrant_client::qdrant::{ - CreateFieldIndexCollectionBuilder, - KeywordIndexParamsBuilder, - FieldType -}; -use qdrant_client::{Qdrant, QdrantError}; -let client = Qdrant::from_url("http://localhost:6334").build()?; -client.create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "payload_field_name", - FieldType::Keyword, - ) - .field_index_params( - KeywordIndexParamsBuilder::default() - .is_tenant(true), - ), -); +_Appears in:_ +- [QdrantCloudRegionSpec](#qdrantcloudregionspec) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `markedForDeletionAt` _string_ | MarkedForDeletionAt specifies the time when the helm repository was marked for deletion | | | +| `object` _[HelmRepository](#helmrepository)_ | Object specifies the helm repository object | | EmbeddedResource: \{\}
| -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; -import io.qdrant.client.grpc.Collections.KeywordIndexParams; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +#### InferenceConfig -client - .createPayloadIndexAsync( - "{collection_name}", - "payload_field_name", - PayloadSchemaType.Keyword, - PayloadIndexParams.newBuilder() - .setKeywordIndexParams( - KeywordIndexParams.newBuilder() - .setIsTenant(true) - .build()) - .build(), - null, - null, - null) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "payload_field_name", - schemaType: PayloadSchemaType.Keyword, - indexParams: new PayloadIndexParams - { - KeywordIndexParams = new KeywordIndexParams - { - IsTenant = true - } - } -); -``` -```go -import ( - "context" +_Appears in:_ +- [QdrantConfiguration](#qdrantconfiguration) - "github.com/qdrant/go-client/qdrant" -) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Enabled specifies whether to enable inference for the cluster or not. | false | | -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParamsKeyword( - &qdrant.KeywordIndexParams{ - IsTenant: qdrant.PtrOf(true), - }), -}) +#### Ingress -``` -Tenant optimization is supported for the following datatypes: -- `keyword` -- `uuid` -### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#principal-index) Principal Index -_Available as of v1.11.0_ -Similar to the tenant index, the principal index is used to optimize storage for faster search, assuming that the search request is primarily filtered by the principal field. -A good example of a use case for the principal index is time-related data, where each point is associated with a timestamp. In this case, the principal index can be used to optimize storage for faster search with time-based filters. +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -httppythontypescriptrustjavacsharpgo +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Enabled specifies whether to enable ingress for the cluster or not. | | | +| `annotations` _object (keys:string, values:string)_ | Annotations specifies annotations for the ingress. | | | +| `ingressClassName` _string_ | IngressClassName specifies the name of the ingress class | | | +| `host` _string_ | Host specifies the host for the ingress. | | | +| `tls` _boolean_ | TLS specifies whether to enable tls for the ingress.
The default depends on the ingress provider:
- KubernetesIngress: False
- NginxIngress: False
- QdrantCloudTraefik: Depending on the config.tls setting of the operator. | | | +| `tlsSecretName` _string_ | TLSSecretName specifies the name of the secret containing the tls certificate. | | | +| `nginx` _[NGINXConfig](#nginxconfig)_ | NGINX specifies the nginx ingress specific configurations. | | | +| `traefik` _[TraefikConfig](#traefikconfig)_ | Traefik specifies the traefik ingress specific configurations. | | | -```http -PUT /collections/{collection_name}/index -{ - "field_name": "timestamp", - "field_schema": { - "type": "integer", - "is_principal": true - } -} -``` +#### KubernetesDistribution -```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="timestamp", - field_schema=models.IntegerIndexParams( - type=models.IntegerIndexType.INTEGER, - is_principal=True, - ), -) +_Underlying type:_ _string_ -``` -```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "timestamp", - field_schema: { - type: "integer", - is_principal: true - }, -}); -``` -```rust -use qdrant_client::qdrant::{ - CreateFieldIndexCollectionBuilder, - IntegerIndexParamsBuilder, - FieldType -}; -use qdrant_client::{Qdrant, QdrantError}; -let client = Qdrant::from_url("http://localhost:6334").build()?; +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -client.create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "timestamp", - FieldType::Integer, - ) - .field_index_params( - IntegerIndexParamsBuilder::default() - .is_principal(true), - ), -); +| Field | Description | +| --- | --- | +| `unknown` | | +| `aws` | | +| `gcp` | | +| `azure` | | +| `do` | | +| `scaleway` | | +| `openshift` | | +| `linode` | | +| `civo` | | +| `oci` | | +| `ovhcloud` | | +| `stackit` | | +| `vultr` | | +| `k3s` | | -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; -import io.qdrant.client.grpc.Collections.IntegerIndexParams; +#### KubernetesPod -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createPayloadIndexAsync( - "{collection_name}", - "timestamp", - PayloadSchemaType.Integer, - PayloadIndexParams.newBuilder() - .setIntegerIndexParams( - KeywordIndexParams.newBuilder() - .setIsPrincipa(true) - .build()) - .build(), - null, - null, - null) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "timestamp", - schemaType: PayloadSchemaType.Integer, - indexParams: new PayloadIndexParams - { - IntegerIndexParams = new IntegerIndexParams - { - IsPrincipal = true - } - } -); -``` +_Appears in:_ +- [KubernetesStatefulSet](#kubernetesstatefulset) -```go -import ( - "context" +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the Pods. | | | +| `labels` _object (keys:string, values:string)_ | Labels specifies the labels for the Pods. | | | +| `extraEnv` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | ExtraEnv specifies the extra environment variables for the Pods. 
| | | - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +#### KubernetesService -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeInteger.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParamsInt( - &qdrant.IntegerIndexParams{ - IsPrincipal: qdrant.PtrOf(true), - }), -}) -``` -Principal optimization is supported for following types: -- `integer` -- `float` -- `datetime` -## [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#vector-index) Vector Index -A vector index is a data structure built on vectors through a specific mathematical model. -Through the vector index, we can efficiently query several vectors similar to the target vector. -Qdrant currently only uses HNSW as a dense vector index. +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -[HNSW](https://arxiv.org/abs/1603.09320) (Hierarchical Navigable Small World Graph) is a graph-based indexing algorithm. It builds a multi-layer navigation structure for an image according to certain rules. In this structure, the upper layers are more sparse and the distances between nodes are farther. The lower layers are denser and the distances between nodes are closer. The search starts from the uppermost layer, finds the node closest to the target in this layer, and then enters the next layer to begin another search. After multiple iterations, it can quickly approach the target position. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `type` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | Type specifies the type of the Service: "ClusterIP", "NodePort", "LoadBalancer". | ClusterIP | | +| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the Service. | | | -In order to improve performance, HNSW limits the maximum degree of nodes on each layer of the graph to `m`. In addition, you can use `ef_construct` (when building index) or `ef` (when searching targets) to specify a search range. -The corresponding parameters could be configured in the configuration file: +#### KubernetesStatefulSet -```yaml -storage: - # Default parameters of HNSW Index. Could be overridden for each collection or named vector individually - hnsw_index: - # Number of edges per node in the index graph. - # Larger the value - more accurate the search, more space required. - m: 16 - # Number of neighbours to consider during the index building. - # Larger the value - more accurate the search, more time required to build index. - ef_construct: 100 - # Minimal size threshold (in KiloBytes) below which full-scan is preferred over HNSW search. - # This measures the total size of vectors being queried against. - # When the maximum estimated amount of points that a condition satisfies is smaller than - # `full_scan_threshold_kb`, the query planner will use full-scan search instead of HNSW index - # traversal for better performance. - # Note: 1Kb = 1 vector of size 256 - full_scan_threshold: 10000 -``` -And so in the process of creating a [collection](https://qdrant.tech/documentation/concepts/collections/). The `ef` parameter is configured during [the search](https://qdrant.tech/documentation/concepts/search/) and by default is equal to `ef_construct`. -HNSW is chosen for several reasons. 
-First, HNSW is well-compatible with the modification that allows Qdrant to use filters during a search. -Second, it is one of the most accurate and fastest algorithms, according to [public benchmarks](https://github.com/erikbern/ann-benchmarks). -_Available as of v1.1.1_ -The HNSW parameters can also be configured on a collection and named vector -level by setting [`hnsw_config`](https://qdrant.tech/documentation/concepts/indexing/#vector-index) to fine-tune search -performance. -## [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#sparse-vector-index) Sparse Vector Index +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -_Available as of v1.7.0_ +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the StatefulSet. | | | +| `pods` _[KubernetesPod](#kubernetespod)_ | Pods specifies the configuration of the Pods of the Qdrant StatefulSet. | | | -Sparse vectors in Qdrant are indexed with a special data structure, which is optimized for vectors that have a high proportion of zeroes. In some ways, this indexing method is similar to the inverted index, which is used in text search engines. -- A sparse vector index in Qdrant is exact, meaning it does not use any approximation algorithms. -- All sparse vectors added to the collection are immediately indexed in the mutable version of a sparse index. +#### MetricSource -With Qdrant, you can benefit from a more compact and efficient immutable sparse index, which is constructed during the same optimization process as the dense vector index. +_Underlying type:_ _string_ -This approach is particularly useful for collections storing both dense and sparse vectors. -To configure a sparse vector index, create a collection with the following parameters: -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name} -{ - "sparse_vectors": { - "text": { - "index": { - "on_disk": false - } - } - } -} -``` +_Appears in:_ +- [Monitoring](#monitoring) -```python -from qdrant_client import QdrantClient, models +| Field | Description | +| --- | --- | +| `kubelet` | | +| `api` | | -client = QdrantClient(url="http://localhost:6333") -client.create_collection( - collection_name="{collection_name}", - vectors_config={}, - sparse_vectors_config={ - "text": models.SparseVectorParams( - index=models.SparseIndexParams(on_disk=False), - ) - }, -) +#### Monitoring -``` -```typescript -import { QdrantClient, Schemas } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createCollection("{collection_name}", { - sparse_vectors: { - "splade-model-name": { - index: { - on_disk: false - } - } - } -}); -``` -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, SparseIndexConfigBuilder, SparseVectorParamsBuilder, - SparseVectorsConfigBuilder, -}; -use qdrant_client::Qdrant; -let client = Qdrant::from_url("http://localhost:6334").build()?; +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cAdvisorMetricSource` _[MetricSource](#metricsource)_ | CAdvisorMetricSource specifies the cAdvisor metric source | | | +| `nodeMetricSource` _[MetricSource](#metricsource)_ | NodeMetricSource specifies the node metric source | | | -sparse_vectors_config.add_named_vector_params( - "splade-model-name", - 
SparseVectorParamsBuilder::default() - .index(SparseIndexConfigBuilder::default().on_disk(true)), -); -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vectors_config), - ) - .await?; +#### NGINXConfig -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections; -QdrantClient client = new QdrantClient( - QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.createCollectionAsync( - Collections.CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setSparseVectorsConfig( - Collections.SparseVectorConfig.newBuilder().putMap( - "splade-model-name", - Collections.SparseVectorParams.newBuilder() - .setIndex( - Collections.SparseIndexConfig - .newBuilder() - .setOnDisk(false) - .build() - ).build() - ).build() - ).build() -).get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +_Appears in:_ +- [Ingress](#ingress) -var client = new QdrantClient("localhost", 6334); +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `allowedSourceRanges` _string array_ | AllowedSourceRanges specifies the allowed CIDR source ranges for the ingress. | | | +| `grpcHost` _string_ | GRPCHost specifies the host name for the GRPC ingress. | | | -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - sparseVectorsConfig: ("splade-model-name", new SparseVectorParams{ - Index = new SparseIndexConfig { - OnDisk = false, - } - }) -); -``` +#### NodeInfo -```go -import ( - "context" - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "splade-model-name": { - Index: &qdrant.SparseIndexConfig{ - OnDisk: qdrant.PtrOf(false), - }}, - }), -}) -``` -\` -The following parameters may affect performance: +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -- `on_disk: true` \- The index is stored on disk, which lets you save memory. This may slow down search performance. -- `on_disk: false` \- The index is still persisted on disk, but it is also loaded into memory for faster search. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name specifies the name of the node | | | +| `region` _string_ | Region specifies the region of the node | | | +| `zone` _string_ | Zone specifies the zone of the node | | | +| `instanceType` _string_ | InstanceType specifies the instance type of the node | | | +| `arch` _string_ | Arch specifies the CPU architecture of the node | | | +| `capacity` _[NodeResourceInfo](#noderesourceinfo)_ | Capacity specifies the capacity of the node | | | +| `allocatable` _[NodeResourceInfo](#noderesourceinfo)_ | Allocatable specifies the allocatable resources of the node | | | -Unlike a dense vector index, a sparse vector index does not require a pre-defined vector size. It automatically adjusts to the size of the vectors added to the collection. -**Note:** A sparse vector index only supports dot-product similarity searches. It does not support other distance metrics. 
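The examples above only show how the sparse index is created. As a minimal sketch of the query side (assuming a running instance on `localhost`, the sparse vector name `"text"` from the Python example above, and hand-picked indices and values standing in for a real sparse encoder), a dot-product query against that index could look like this in Python:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Only the non-zero dimensions of the query vector are sent;
# sparse vectors are scored with a dot product over matching indices.
results = client.query_points(
    collection_name="{collection_name}",
    query=models.SparseVector(
        indices=[17, 42, 512],      # placeholder token ids from a sparse encoder
        values=[0.91, 0.33, 0.15],  # corresponding non-zero weights
    ),
    using="text",                   # name of the sparse vector in the collection
    limit=10,
)
print(results.points)
```

Because only non-zero dimensions are transferred and matched, this query pattern is what the inverted-index-like structure described above is optimized for.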
+#### NodeResourceInfo -### [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#idf-modifier) IDF Modifier -_Available as of v1.10.0_ -For many search algorithms, it is important to consider how often an item occurs in a collection. -Intuitively speaking, the less frequently an item appears in a collection, the more important it is in a search. -This is also known as the Inverse Document Frequency (IDF). It is used in text search engines to rank search results based on the rarity of a word in a collection. -IDF depends on the currently stored documents and therefore can’t be pre-computed in the sparse vectors in streaming inference mode. -In order to support IDF in the sparse vector index, Qdrant provides an option to modify the sparse vector query with the IDF statistics automatically. -The only requirement is to enable the IDF modifier in the collection configuration: -httppythontypescriptrustjavacsharpgo +_Appears in:_ +- [NodeInfo](#nodeinfo) -```http -PUT /collections/{collection_name} -{ - "sparse_vectors": { - "text": { - "modifier": "idf" - } - } -} +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cpu` _string_ | CPU specifies the CPU resources of the node | | | +| `memory` _string_ | Memory specifies the memory resources of the node | | | +| `pods` _string_ | Pods specifies the pods resources of the node | | | +| `ephemeralStorage` _string_ | EphemeralStorage specifies the ephemeral storage resources of the node | | | -``` -```python -from qdrant_client import QdrantClient, models +#### NodeStatus -client = QdrantClient(url="http://localhost:6333") -client.create_collection( - collection_name="{collection_name}", - vectors_config={}, - sparse_vectors_config={ - "text": models.SparseVectorParams( - modifier=models.Modifier.IDF, - ), - }, -) -``` -```typescript -import { QdrantClient, Schemas } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createCollection("{collection_name}", { - sparse_vectors: { - "text": { - modifier: "idf" - } - } -}); -``` +_Appears in:_ +- [QdrantClusterStatus](#qdrantclusterstatus) -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Modifier, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, -}; -use qdrant_client::{Qdrant, QdrantError}; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name specifies the name of the node | | | +| `started_at` _string_ | StartedAt specifies the time when the node started (in RFC3339 format) | | | +| `state` _object (keys:[PodConditionType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podconditiontype-v1-core), values:[ConditionStatus](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#conditionstatus-v1-core))_ | States specifies the condition states of the node | | | +| `version` _string_ | Version specifies the version of Qdrant running on the node | | | -let client = Qdrant::from_url("http://localhost:6334").build()?; -let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); -sparse_vectors_config.add_named_vector_params( - "text", - SparseVectorParamsBuilder::default().modifier(Modifier::Idf), -); +#### Pause -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vectors_config), - ) - .await?; -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import 
io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Modifier; -import io.qdrant.client.grpc.Collections.SparseVectorConfig; -import io.qdrant.client.grpc.Collections.SparseVectorParams; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setSparseVectorsConfig( - SparseVectorConfig.newBuilder() - .putMap("text", SparseVectorParams.newBuilder().setModifier(Modifier.Idf).build())) - .build()) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -var client = new QdrantClient("localhost", 6334); +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `owner` _string_ | Owner specifies the owner of the pause request. | | | +| `reason` _string_ | Reason specifies the reason for the pause request. | | | +| `creationTimestamp` _string_ | CreationTimestamp specifies the time when the pause request was created. | | | -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - sparseVectorsConfig: ("text", new SparseVectorParams { - Modifier = Modifier.Idf, - }) -); -``` +#### QdrantCloudRegion -```go -import ( - "context" - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +QdrantCloudRegion is the Schema for the qdrantcloudregions API -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "text": { - Modifier: qdrant.Modifier_Idf.Enum(), - }, - }), -}) -``` -Qdrant uses the following formula to calculate the IDF modifier: +_Appears in:_ +- [QdrantCloudRegionList](#qdrantcloudregionlist) -IDF(qi)=ln⁡(N−n(qi)+0.5n(qi)+0.5+1) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantCloudRegion` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[QdrantCloudRegionSpec](#qdrantcloudregionspec)_ | | | | -Where: -- `N` is the total number of documents in the collection. -- `n` is the number of documents containing non-zero values for the given vector element. +#### QdrantCloudRegionList -## [Anchor](https://qdrant.tech/documentation/concepts/indexing/\#filtrable-index) Filtrable Index -Separately, a payload index and a vector index cannot solve the problem of search using the filter completely. -In the case of weak filters, you can use the HNSW index as it is. In the case of stringent filters, you can use the payload index and complete rescore. -However, for cases in the middle, this approach does not work well. +QdrantCloudRegionList contains a list of QdrantCloudRegion -On the one hand, we cannot apply a full scan on too many vectors. On the other hand, the HNSW graph starts to fall apart when using too strict filters. -![HNSW fail](https://qdrant.tech/docs/precision_by_m.png) -![hnsw graph](https://qdrant.tech/docs/graph.gif) -You can find more information on why this happens in our [blog post](https://blog.vasnetsov.com/posts/categorical-hnsw/). 
-Qdrant solves this problem by extending the HNSW graph with additional edges based on the stored payload values. -Extra edges allow you to efficiently search for nearby vectors using the HNSW index and apply filters as you search in the graph. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantCloudRegionList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantCloudRegion](#qdrantcloudregion) array_ | | | | -This approach minimizes the overhead on condition checks since you only need to calculate the conditions for a small fraction of the points involved in the search. -##### Was this page useful? +#### QdrantCloudRegionSpec -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/indexing.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +QdrantCloudRegionSpec defines the desired state of QdrantCloudRegion -On this page: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/indexing.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× +_Appears in:_ +- [QdrantCloudRegion](#qdrantcloudregion) -[Powered by](https://qdrant.tech/) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `id` _string_ | Id specifies the unique identifier of the region | | | +| `components` _[ComponentReference](#componentreference) array_ | Components specifies the list of components to be installed in the region | | | +| `helmRepositories` _[HelmRepository](#helmrepository) array_ | HelmRepositories specifies the list of helm repositories to be created to the region
Deprecated: Use "Components" instead | | | +| `helmReleases` _[HelmRelease](#helmrelease) array_ | HelmReleases specifies the list of helm releases to be created to the region
Deprecated: Use "Components" instead | | | -<|page-83-lllmstxt|> -## backups -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud](https://qdrant.tech/documentation/cloud/) -- Backup Clusters -# [Anchor](https://qdrant.tech/documentation/cloud/backups/\#backing-up-qdrant-cloud-clusters) Backing up Qdrant Cloud Clusters -Qdrant organizes cloud instances as clusters. On occasion, you may need to -restore your cluster because of application or system failure. -You may already have a source of truth for your data in a regular database. If you -have a problem, you could reindex the data into your Qdrant vector search cluster. -However, this process can take time. For high availability critical projects we -recommend replication. It guarantees the proper cluster functionality as long as -at least one replica is running. +#### QdrantCluster -For other use-cases such as disaster recovery, you can set up automatic or -self-service backups. -## [Anchor](https://qdrant.tech/documentation/cloud/backups/\#prerequisites) Prerequisites -You can back up your Qdrant clusters though the Qdrant Cloud -Dashboard at [https://cloud.qdrant.io](https://cloud.qdrant.io/). This section assumes that you’ve already -set up your cluster, as described in the following sections: +QdrantCluster is the Schema for the qdrantclusters API -- [Create a cluster](https://qdrant.tech/documentation/cloud/create-cluster/) -- Set up [Authentication](https://qdrant.tech/documentation/cloud/authentication/) -- Configure one or more [Collections](https://qdrant.tech/documentation/concepts/collections/) -## [Anchor](https://qdrant.tech/documentation/cloud/backups/\#automatic-backups) Automatic Backups -You can set up automatic backups of your clusters with our Cloud UI. With the -procedures listed in this page, you can set up -snapshots on a daily/weekly/monthly basis. You can keep as many snapshots as you -need. You can restore a cluster from the snapshot of your choice. +_Appears in:_ +- [QdrantClusterList](#qdrantclusterlist) -> Note: When you restore a snapshot, consider the following: -> -> - The affected cluster is not available while a snapshot is being restored. -> - If you changed the cluster setup after the copy was created, the cluster -> resets to the previous configuration. -> - The previous configuration includes: -> - CPU -> - Memory -> - Node count -> - Qdrant version +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantCluster` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[QdrantClusterSpec](#qdrantclusterspec)_ | | | | -### [Anchor](https://qdrant.tech/documentation/cloud/backups/\#configure-a-backup) Configure a Backup -After you have taken the prerequisite steps, you can configure a backup with the -[Qdrant Cloud Dashboard](https://cloud.qdrant.io/). To do so, take these steps: +#### QdrantClusterList -1. On the **Cluster Detail Page** and select the **Backups** tab. -2. Now you can set up a backup schedule. -The **Days of Retention** is the number of days after a backup snapshot is -deleted. -3. Alternatively, you can select **Backup now** to take an immediate snapshot. 
-![Configure a cluster backup](https://qdrant.tech/documentation/cloud/backup-schedule.png) -### [Anchor](https://qdrant.tech/documentation/cloud/backups/\#restore-a-backup) Restore a Backup +QdrantClusterList contains a list of QdrantCluster -If you have a backup, it appears in the list of **Available Backups**. You can -choose to restore or delete the backups of your choice. -![Restore or delete a cluster backup](https://qdrant.tech/documentation/cloud/restore-delete.png) -## [Anchor](https://qdrant.tech/documentation/cloud/backups/\#backups-with-a-snapshot) Backups With a Snapshot -Qdrant also offers a snapshot API which allows you to create a snapshot -of a specific collection or your entire cluster. For more information, see our -[snapshot documentation](https://qdrant.tech/documentation/concepts/snapshots/). -Here is how you can take a snapshot and recover a collection: +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantCluster](#qdrantcluster) array_ | | | | -1. Take a snapshot: - - For a single node cluster, call the snapshot endpoint on the exposed URL. - - For a multi node cluster call a snapshot on each node of the collection. - Specifically, prepend `node-{num}-` to your cluster URL. - Then call the [snapshot endpoint](https://qdrant.tech/documentation/concepts/snapshots/#create-snapshot) on the individual hosts. Start with node 0. - - In the response, you’ll see the name of the snapshot. -2. Delete and recreate the collection. -3. Recover the snapshot: - - Call the [recover endpoint](https://qdrant.tech/documentation/concepts/snapshots/#recover-in-cluster-deployment). Set a location which points to the snapshot file ( `file:///qdrant/snapshots/{collection_name}/{snapshot_file_name}`) for each host. -## [Anchor](https://qdrant.tech/documentation/cloud/backups/\#backup-considerations) Backup Considerations +#### QdrantClusterRestore -Backups are incremental for AWS and GCP clusters. For example, if you have two backups, backup number 2 -contains only the data that changed since backup number 1. This reduces the -total cost of your backups. -For Azure clusters, backups are based on total disk usage. The cost is calculated -as half of the disk usage when the backup was taken. -You can create multiple backup schedules. +QdrantClusterRestore is the Schema for the qdrantclusterrestores API -When you restore a snapshot, any changes made after the date of the snapshot -are lost. -##### Was this page useful? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +_Appears in:_ +- [QdrantClusterRestoreList](#qdrantclusterrestorelist) -Thank you for your feedback! 🙏 +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterRestore` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[QdrantClusterRestoreSpec](#qdrantclusterrestorespec)_ | | | | -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/backups.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. -On this page: +#### QdrantClusterRestoreList -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/backups.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× -[Powered by](https://qdrant.tech/) +QdrantClusterRestoreList contains a list of QdrantClusterRestore objects -<|page-84-lllmstxt|> -## qdrant-1.3.x -- [Articles](https://qdrant.tech/articles/) -- Introducing Qdrant 1.3.0 -[Back to Qdrant Articles](https://qdrant.tech/articles/) -# Introducing Qdrant 1.3.0 -David Sertic -· +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterRestoreList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantClusterRestore](#qdrantclusterrestore) array_ | | | | -June 26, 2023 -![Introducing Qdrant 1.3.0](https://qdrant.tech/articles_data/qdrant-1.3.x/preview/title.jpg) +#### QdrantClusterRestoreSpec -A brand-new [Qdrant 1.3.0 release](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) comes packed with a plethora of new features, performance improvements and bux fixes: -1. Asynchronous I/O interface: Reduce overhead by managing I/O operations asynchronously, thus minimizing context switches. -2. Oversampling for Quantization: Improve the accuracy and performance of your queries while using Scalar or Product Quantization. -3. Grouping API lookup: Storage optimization method that lets you look for points in another collection using group ids. -4. Qdrant Web UI: A convenient dashboard to help you manage data stored in Qdrant. -5. Temp directory for Snapshots: Set a separate storage directory for temporary snapshots on a faster disk. -6. Other important changes -Your feedback is valuable to us, and are always tying to include some of your feature requests into our roadmap. Join [our Discord community](https://qdrant.to/discord) and help us build Qdrant!. +QdrantClusterRestoreSpec defines the desired state of QdrantClusterRestore -## [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#new-features) New features -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#asychronous-io-interface) Asychronous I/O interface -Going forward, we will support the `io_uring` asychnronous interface for storage devices on Linux-based systems. Since its introduction, `io_uring` has been proven to speed up slow-disk deployments as it decouples kernel work from the IO process. +_Appears in:_ +- [QdrantClusterRestore](#qdrantclusterrestore) -This interface uses two ring buffers to queue and manage I/O operations asynchronously, avoiding costly context switches and reducing overhead. Unlike mmap, it frees the user threads to do computations instead of waiting for the kernel to complete. 
+| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `source` _[RestoreSource](#restoresource)_ | Source defines the source snapshot from which the restore will be done | | | +| `destination` _[RestoreDestination](#restoredestination)_ | Destination defines the destination cluster where the source data will end up | | | -![io_uring](https://qdrant.tech/articles_data/qdrant-1.3.x/io-uring.png) -#### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#enable-the-interface-from-your-config-file) Enable the interface from your config file: -```yaml -storage: - # enable the async scorer which uses io_uring - async_scorer: true -``` +#### QdrantClusterScheduledSnapshot -You can return to the mmap based backend by either deleting the `async_scorer` entry or setting the value to `false`. -This optimization will mainly benefit workloads with lots of disk IO (e.g. querying on-disk collections with rescoring). -Please keep in mind that this feature is experimental and that the interface may change in further versions. -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#oversampling-for-quantization) Oversampling for quantization +QdrantClusterScheduledSnapshot is the Schema for the qdrantclusterscheduledsnapshots API -We are introducing [oversampling](https://qdrant.tech/documentation/guides/quantization/#oversampling) as a new way to help you improve the accuracy and performance of similarity search algorithms. With this method, you are able to significantly compress high-dimensional vectors in memory and then compensate the accuracy loss by re-scoring additional points with the original vectors. -You will experience much faster performance with quantization due to parallel disk usage when reading vectors. Much better IO means that you can keep quantized vectors in RAM, so the pre-selection will be even faster. Finally, once pre-selection is done, you can use parallel IO to retrieve original vectors, which is significantly faster than traversing HNSW on slow disks. -#### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#set-the-oversampling-factor-via-query) Set the oversampling factor via query: +_Appears in:_ +- [QdrantClusterScheduledSnapshotList](#qdrantclusterscheduledsnapshotlist) -Here is how you can configure the oversampling factor - define how many extra vectors should be pre-selected using the quantized index, and then re-scored using original vectors. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterScheduledSnapshot` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. 
| | | +| `spec` _[QdrantClusterScheduledSnapshotSpec](#qdrantclusterscheduledsnapshotspec)_ | | | | -httppython -```http -POST /collections/{collection_name}/points/search -{ - "params": { - "quantization": { - "ignore": false, - "rescore": true, - "oversampling": 2.4 - } - }, - "vector": [0.2, 0.1, 0.9, 0.7], - "limit": 100 -} +#### QdrantClusterScheduledSnapshotList -``` -```python -from qdrant_client import QdrantClient -from qdrant_client.http import models -client = QdrantClient("localhost", port=6333) +QdrantClusterScheduledSnapshotList contains a list of QdrantCluster -client.search( - collection_name="{collection_name}", - query_vector=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - ignore=False, - rescore=True, - oversampling=2.4 - ) - ) -) -``` -In this case, if `oversampling` is 2.4 and `limit` is 100, then 240 vectors will be pre-selected using quantized index, and then the top 100 points will be returned after re-scoring with the unquantized vectors. -As you can see from the example above, this parameter is set during the query. This is a flexible method that will let you tune query accuracy. While the index is not changed, you can decide how many points you want to retrieve using quantized vectors. -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#grouping-api-lookup) Grouping API lookup +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterScheduledSnapshotList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantClusterScheduledSnapshot](#qdrantclusterscheduledsnapshot) array_ | | | | -In version 1.2.0, we introduced a mechanism for requesting groups of points. Our new feature extends this functionality by giving you the option to look for points in another collection using the group ids. We wanted to add this feature, since having a single point for the shared data of the same item optimizes storage use, particularly if the payload is large. -This has the extra benefit of having a single point to update when the information shared by the points in a group changes. +#### QdrantClusterScheduledSnapshotSpec -![Group Lookup](https://qdrant.tech/articles_data/qdrant-1.3.x/group-lookup.png) -For example, if you have a collection of documents, you may want to chunk them and store the points for the chunks in a separate collection, making sure that you store the point id from the document it belongs in the payload of the chunk point. 
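As a purely illustrative Python sketch of that layout (collection names, ids, vectors, and payload values below are placeholders, and both collections are assumed to exist already), the shared document is stored once in a `documents` collection while each chunk point in the `chunks` collection carries only a `document_id` payload reference:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# The shared document data is stored once, in the "documents" collection.
client.upsert(
    collection_name="documents",
    points=[
        models.PointStruct(
            id=42,
            vector=[0.1, 0.3, 0.2, 0.9],  # placeholder document-level vector
            payload={"title": "Example document", "text": "Full document text..."},
        )
    ],
)

# Each chunk references its document only through the `document_id` payload field.
client.upsert(
    collection_name="chunks",
    points=[
        models.PointStruct(
            id=1,
            vector=[0.2, 0.1, 0.9, 0.7],  # placeholder chunk vector
            payload={"document_id": 42},
        )
    ],
)
```

When the shared document data changes, only the single point in `documents` needs to be updated, which is the storage benefit described above.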
-#### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#adding-the-parameter-to-grouping-api-request) Adding the parameter to grouping API request: +QdrantClusterScheduledSnapshotSpec defines the desired state of QdrantCluster -When using the grouping API, add the `with_lookup` parameter to bring the information from those points into each group: -httppython -```http -POST /collections/chunks/points/search/groups -{ - // Same as in the regular search API - "vector": [1.1], - ..., +_Appears in:_ +- [QdrantClusterScheduledSnapshot](#qdrantclusterscheduledsnapshot) - // Grouping parameters - "group_by": "document_id", - "limit": 2, - "group_size": 2, +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cluster-id` _string_ | Id specifies the unique identifier of the cluster | | | +| `scheduleShortId` _string_ | Specifies short Id which identifies a schedule | | MaxLength: 8
| +| `schedule` _string_ | Cron expression for frequency of creating snapshots, see https://en.wikipedia.org/wiki/Cron.
The schedule is specified in UTC. | | Pattern: `^(@(annually\|yearly\|monthly\|weekly\|daily\|hourly\|reboot))\|(@every (\d+(ns\|us\|µs\|ms\|s\|m\|h))+)\|((((\d+,)+\d+\|([\d\*]+(\/\|-)\d+)\|\d+\|\*) ?)\{5,7\})$`
| +| `retention` _string_ | Retention of schedule in hours | | Pattern: `^[0-9]+h$`
| - // Lookup parameters - "with_lookup": { - // Name of the collection to look up points in - "collection_name": "documents", - // Options for specifying what to bring from the payload - // of the looked up point, true by default - "with_payload": ["title", "text"], - // Options for specifying what to bring from the vector(s) - // of the looked up point, true by default - "with_vectors: false, - } -} -``` +#### QdrantClusterSnapshot -```python -client.search_groups( - collection_name="chunks", - # Same as in the regular search() API - query_vector=[1.1], - ..., - # Grouping parameters - group_by="document_id", # Path of the field to group by - limit=2, # Max amount of groups - group_size=2, # Max amount of points per group +QdrantClusterSnapshot is the Schema for the qdrantclustersnapshots API - # Lookup parameters - with_lookup=models.WithLookup( - # Name of the collection to look up points in - collection_name="documents", - # Options for specifying what to bring from the payload - # of the looked up point, True by default - with_payload=["title", "text"] - # Options for specifying what to bring from the vector(s) - # of the looked up point, True by default - with_vectors=False, - ) -) +_Appears in:_ +- [QdrantClusterSnapshotList](#qdrantclustersnapshotlist) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterSnapshot` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[QdrantClusterSnapshotSpec](#qdrantclustersnapshotspec)_ | | | | -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#qdrant-web-user-interface) Qdrant web user interface -We are excited to announce a more user-friendly way to organize and work with your collections inside of Qdrant. Our dashboard’s design is simple, but very intuitive and easy to access. +#### QdrantClusterSnapshotList -Try it out now! If you have Docker running, you can [quickstart Qdrant](https://qdrant.tech/documentation/quick-start/) and access the Dashboard locally from [http://localhost:6333/dashboard](http://localhost:6333/dashboard). You should see this simple access point to Qdrant: -![Qdrant Web UI](https://qdrant.tech/articles_data/qdrant-1.3.x/web-ui.png) -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#temporary-directory-for-snapshots) Temporary directory for Snapshots +QdrantClusterSnapshotList contains a list of QdrantClusterSnapshot -Currently, temporary snapshot files are created inside the `/storage` directory. Oftentimes `/storage` is a network-mounted disk. Therefore, we found this method suboptimal because `/storage` is limited in disk size and also because writing data to it may affect disk performance as it consumes bandwidth. This new feature allows you to specify a different directory on another disk that is faster. We expect this feature to significantly optimize cloud performance. -To change it, access `config.yaml` and set `storage.temp_path` to another directory location. -## [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#important-changes) Important changes -The latest release focuses not only on the new features but also introduces some changes making -Qdrant even more reliable. 
-### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#optimizing-group-requests) Optimizing group requests +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantClusterSnapshotList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantClusterSnapshot](#qdrantclustersnapshot) array_ | | | | -Internally, `is_empty` was not using the index when it was called, so it had to deserialize the whole payload to see if the key had values or not. Our new update makes sure to check the index first, before confirming with the payload if it is actually `empty`/ `null`, so these changes improve performance only when the negated condition is true (e.g. it improves when the field is not empty). Going forward, this will improve the way grouping API requests are handled. -### [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#faster-read-access-with-mmap) Faster read access with mmap +#### QdrantClusterSnapshotPhase -If you used mmap, you most likely found that segments were always created with cold caches. The first request to the database needed to request the disk, which made startup slower despite plenty of RAM being available. We have implemeneted a way to ask the kernel to “heat up” the disk cache and make initialization much faster. +_Underlying type:_ _string_ -The function is expected to be used on startup and after segment optimization and reloading of newly indexed segment. So far this is only implemented for “immutable” memmaps. -## [Anchor](https://qdrant.tech/articles/qdrant-1.3.x/\#release-notes) Release notes -As usual, [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) describe all the changes -introduced in the latest version. -##### Was this page useful? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +_Appears in:_ +- [QdrantClusterSnapshotStatus](#qdrantclustersnapshotstatus) -Thank you for your feedback! 🙏 +| Field | Description | +| --- | --- | +| `Running` | | +| `Skipped` | | +| `Failed` | | +| `Succeeded` | | -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.3.x.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. -On this page: +#### QdrantClusterSnapshotSpec -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-1.3.x.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) -× -[Powered by](https://qdrant.tech/) -<|page-85-lllmstxt|> -## distributed_deployment -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Distributed Deployment -# [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#distributed-deployment) Distributed deployment -Since version v0.8.0 Qdrant supports a distributed deployment mode. -In this mode, multiple Qdrant services communicate with each other to distribute the data across the peers to extend the storage capabilities and increase stability. 
-## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#how-many-qdrant-nodes-should-i-run) How many Qdrant nodes should I run? +_Appears in:_ +- [QdrantClusterSnapshot](#qdrantclustersnapshot) -The ideal number of Qdrant nodes depends on how much you value cost-saving, resilience, and performance/scalability in relation to each other. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cluster-id` _string_ | The cluster ID for which a Snapshot need to be taken
The cluster should be located in the same namespace as this QdrantClusterSnapshot. | | | +| `creation-timestamp` _integer_ | The CreationTimestamp of the backup (expressed in Unix epoch format) | | | +| `scheduleShortId` _string_ | Specifies the short Id which identifies a schedule, if any.
This field should not be set if the backup is made manually. | | MaxLength: 8
| +| `retention` _string_ | The retention period of this snapshot in hours, if any.
If not set, the backup doesn't have a retention period, meaning it will not be removed. | | Pattern: `^[0-9]+h$`
| -- **Prioritizing cost-saving**: If cost is most important to you, run a single Qdrant node. This is not recommended for production environments. Drawbacks: - - Resilience: Users will experience downtime during node restarts, and recovery is not possible unless you have backups or snapshots. - - Performance: Limited to the resources of a single server. -- **Prioritizing resilience**: If resilience is most important to you, run a Qdrant cluster with three or more nodes and two or more shard replicas. Clusters with three or more nodes and replication can perform all operations even while one node is down. Additionally, they gain performance benefits from load-balancing and they can recover from the permanent loss of one node without the need for backups or snapshots (but backups are still strongly recommended). This is most recommended for production environments. Drawbacks: - - Cost: Larger clusters are more costly than smaller clusters, which is the only drawback of this configuration. -- **Balancing cost, resilience, and performance**: Running a two-node Qdrant cluster with replicated shards allows the cluster to respond to most read/write requests even when one node is down, such as during maintenance events. Having two nodes also means greater performance than a single-node cluster while still being cheaper than a three-node cluster. Drawbacks: - - Resilience (uptime): The cluster cannot perform operations on collections when one node is down. Those operations require >50% of nodes to be running, so this is only possible in a 3+ node cluster. Since creating, editing, and deleting collections are usually rare operations, many users find this drawback to be negligible. - - Resilience (data integrity): If the data on one of the two nodes is permanently lost or corrupted, it cannot be recovered aside from snapshots or backups. Only 3+ node clusters can recover from the permanent loss of a single node since recovery operations require >50% of the cluster to be healthy. - - Cost: Replicating your shards requires storing two copies of your data. - - Performance: The maximum performance of a Qdrant cluster increases as you add more nodes. +#### QdrantClusterSpec -In summary, single-node clusters are best for non-production workloads, replicated 3+ node clusters are the gold standard, and replicated 2-node clusters strike a good balance. -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#enabling-distributed-mode-in-self-hosted-qdrant) Enabling distributed mode in self-hosted Qdrant -To enable distributed deployment - enable the cluster mode in the [configuration](https://qdrant.tech/documentation/guides/configuration/) or using the ENV variable: `QDRANT__CLUSTER__ENABLED=true`. +QdrantClusterSpec defines the desired state of QdrantCluster -```yaml -cluster: - # Use `enabled: true` to run Qdrant in distributed deployment mode - enabled: true - # Configuration of the inter-cluster communication - p2p: - # Port for internal communication between peers - port: 6335 - # Configuration related to distributed consensus algorithm - consensus: - # How frequently peers should ping each other. - # Setting this parameter to lower value will allow consensus - # to detect disconnected node earlier, but too frequent - # tick period may create significant network and CPU overhead. - # We encourage you NOT to change this parameter unless you know what you are doing. 
- tick_period_ms: 100 -``` +_Appears in:_ +- [QdrantCluster](#qdrantcluster) -By default, Qdrant will use port `6335` for its internal communication. -All peers should be accessible on this port from within the cluster, but make sure to isolate this port from outside access, as it might be used to perform write operations. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `id` _string_ | Id specifies the unique identifier of the cluster | | | +| `version` _string_ | Version specifies the version of Qdrant to deploy | | | +| `size` _integer_ | Size specifies the desired number of Qdrant nodes in the cluster | | Maximum: 30
Minimum: 1
| +| `servicePerNode` _boolean_ | ServicePerNode specifies whether the cluster should start a dedicated service for each node. | true | | +| `clusterManager` _boolean_ | ClusterManager specifies whether to use the cluster manager for this cluster.
The Python-operator will deploy a dedicated cluster manager instance.
The Go-operator will use a shared instance.
If not set, the default will be taken from the operator config. | | | +| `suspend` _boolean_ | Suspend specifies whether to suspend the cluster.
If enabled, the cluster will be suspended and all related resources will be removed except the PVCs. | false | | +| `pauses` _[Pause](#pause) array_ | Pauses specifies a list of pause requests by developers for manual maintenance.
Operator will skip handling any changes in the CR if any pause request is present. | | | +| `image` _[QdrantImage](#qdrantimage)_ | Image specifies the image to use for each Qdrant node. | | | +| `resources` _[Resources](#resources)_ | Resources specifies the resources to allocate for each Qdrant node. | | | +| `security` _[QdrantSecurityContext](#qdrantsecuritycontext)_ | Security specifies the security context for each Qdrant node. | | | +| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations specifies the tolerations for each Qdrant node. | | | +| `nodeSelector` _object (keys:string, values:string)_ | NodeSelector specifies the node selector for each Qdrant node. | | | +| `config` _[QdrantConfiguration](#qdrantconfiguration)_ | Config specifies the Qdrant configuration settings for the clusters. | | | +| `ingress` _[Ingress](#ingress)_ | Ingress specifies the ingress for the cluster. | | | +| `service` _[KubernetesService](#kubernetesservice)_ | Service specifies the configuration of the Qdrant Kubernetes Service. | | | +| `gpu` _[GPU](#gpu)_ | GPU specifies GPU configuration for the cluster. If this field is not set, no GPU will be used. | | | +| `statefulSet` _[KubernetesStatefulSet](#kubernetesstatefulset)_ | StatefulSet specifies the configuration of the Qdrant Kubernetes StatefulSet. | | | +| `storageClassNames` _[StorageClassNames](#storageclassnames)_ | StorageClassNames specifies the storage class names for db and snapshots. | | | +| `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#topologyspreadconstraint-v1-core)_ | TopologySpreadConstraints specifies the topology spread constraints for the cluster. | | | +| `podDisruptionBudget` _[PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#poddisruptionbudgetspec-v1-policy)_ | PodDisruptionBudget specifies the pod disruption budget for the cluster. | | | +| `restartAllPodsConcurrently` _boolean_ | RestartAllPodsConcurrently specifies whether to restart all pods concurrently (also called one-shot-restart).
If enabled, all the pods in the cluster will be restarted concurrently in situations where multiple pods
need to be restarted, like when RestartedAtAnnotationKey is added/updated or the Qdrant version needs to be upgraded.
This helps sharded but not replicated clusters to reduce downtime to a possible minimum during restart.
If unset, the operator is going to restart nodes concurrently if none of the collections is replicated. | | | +| `startupDelaySeconds` _integer_ | If StartupDelaySeconds is set (> 0), an additional 'sleep ' will be emitted to the pod startup.
The sleep will be added when a pod is restarted; it will not force any pod to restart.
This feature can be used for debugging the core, e.g. if a pod is in a crash loop, it provides a way
to inspect the attached storage. | | | +| `rebalanceStrategy` _[RebalanceStrategy](#rebalancestrategy)_ | RebalanceStrategy specifies the strategy to use for automatically rebalancing shards in the cluster.
Cluster-manager needs to be enabled for this feature to work. | | Enum: [by_count by_size by_count_and_size]
| -Additionally, you must provide the `--uri` flag to the first peer so it can tell other nodes how it should be reached: -```bash -./qdrant --uri 'http://qdrant_node_1:6335' -``` -Subsequent peers in a cluster must know at least one node of the existing cluster to synchronize through it with the rest of the cluster. +#### QdrantConfiguration -To do this, they need to be provided with a bootstrap URL: -```bash -./qdrant --bootstrap 'http://qdrant_node_1:6335' -``` -The URL of the new peers themselves will be calculated automatically from the IP address of their request. -But it is also possible to provide them individually using the `--uri` argument. -```text -USAGE: - qdrant [OPTIONS] -OPTIONS: - --bootstrap - Uri of the peer to bootstrap from in case of multi-peer deployment. If not specified - - this peer will be considered as a first in a new deployment - --uri - Uri of this peer. Other peers should be able to reach it by this uri. +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) - This value has to be supplied if this is the first peer in a new deployment. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `collection` _[QdrantConfigurationCollection](#qdrantconfigurationcollection)_ | Collection specifies the default collection configuration for Qdrant. | | | +| `log_level` _string_ | LogLevel specifies the log level for Qdrant. | | | +| `service` _[QdrantConfigurationService](#qdrantconfigurationservice)_ | Service specifies the service level configuration for Qdrant. | | | +| `tls` _[QdrantConfigurationTLS](#qdrantconfigurationtls)_ | TLS specifies the TLS configuration for Qdrant. | | | +| `storage` _[StorageConfig](#storageconfig)_ | Storage specifies the storage configuration for Qdrant. | | | +| `inference` _[InferenceConfig](#inferenceconfig)_ | Inference configuration. This is used in Qdrant Managed Cloud only. If not set Inference is not available to this cluster. | | | - In case this is not the first peer and it bootstraps the value is optional. If not - supplied then qdrant will take internal grpc port from config and derive the IP address - of this peer on bootstrap peer (receiving side) -``` +#### QdrantConfigurationCollection -After a successful synchronization you can observe the state of the cluster through the [REST API](https://api.qdrant.tech/master/api-reference/distributed/cluster-status): -```http -GET /cluster -``` -Example result: -```json -{ - "result": { - "status": "enabled", - "peer_id": 11532566549086892000, - "peers": { - "9834046559507417430": { - "uri": "http://172.18.0.3:6335/" - }, - "11532566549086892528": { - "uri": "http://qdrant_node_1:6335/" - } - }, - "raft_info": { - "term": 1, - "commit": 4, - "pending_operations": 1, - "leader": 11532566549086892000, - "role": "Leader" - } - }, - "status": "ok", - "time": 5.731e-06 -} -``` -Note that enabling distributed mode does not automatically replicate your data. See the section on [making use of a new distributed Qdrant cluster](https://qdrant.tech/documentation/guides/distributed_deployment/#making-use-of-a-new-distributed-qdrant-cluster) for the next steps. 
+_Appears in:_ +- [QdrantConfiguration](#qdrantconfiguration) -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#enabling-distributed-mode-in-qdrant-cloud) Enabling distributed mode in Qdrant Cloud +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replication_factor` _integer_ | ReplicationFactor specifies the default number of replicas of each shard | | | +| `write_consistency_factor` _integer_ | WriteConsistencyFactor specifies how many replicas should apply the operation to consider it successful | | | +| `vectors` _[QdrantConfigurationCollectionVectors](#qdrantconfigurationcollectionvectors)_ | Vectors specifies the default parameters for vectors | | | -For best results, first ensure your cluster is running Qdrant v1.7.4 or higher. Older versions of Qdrant do support distributed mode, but improvements in v1.7.4 make distributed clusters more resilient during outages. -In the [Qdrant Cloud console](https://cloud.qdrant.io/), click “Scale Up” to increase your cluster size to >1. Qdrant Cloud configures the distributed mode settings automatically. +#### QdrantConfigurationCollectionVectors -After the scale-up process completes, you will have a new empty node running alongside your existing node(s). To replicate data into this new empty node, see the next section. -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#making-use-of-a-new-distributed-qdrant-cluster) Making use of a new distributed Qdrant cluster -When you enable distributed mode and scale up to two or more nodes, your data does not move to the new node automatically; it starts out empty. To make use of your new empty node, do one of the following: -- Create a new replicated collection by setting the [replication\_factor](https://qdrant.tech/documentation/guides/distributed_deployment/#replication-factor) to 2 or more and setting the [number of shards](https://qdrant.tech/documentation/guides/distributed_deployment/#choosing-the-right-number-of-shards) to a multiple of your number of nodes. -- If you have an existing collection which does not contain enough shards for each node, you must create a new collection as described in the previous bullet point. -- If you already have enough shards for each node and you merely need to replicate your data, follow the directions for [creating new shard replicas](https://qdrant.tech/documentation/guides/distributed_deployment/#creating-new-shard-replicas). -- If you already have enough shards for each node and your data is already replicated, you can move data (without replicating it) onto the new node(s) by [moving shards](https://qdrant.tech/documentation/guides/distributed_deployment/#moving-shards). -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#raft) Raft -Qdrant uses the [Raft](https://raft.github.io/) consensus protocol to maintain consistency regarding the cluster topology and the collections structure. -Operations on points, on the other hand, do not go through the consensus infrastructure. -Qdrant is not intended to have strong transaction guarantees, which allows it to perform point operations with low overhead. -In practice, it means that Qdrant does not guarantee atomic distributed updates but allows you to wait until the [operation is complete](https://qdrant.tech/documentation/concepts/points/#awaiting-result) to see the results of your writes. 
+_Appears in:_ +- [QdrantConfigurationCollection](#qdrantconfigurationcollection) -Operations on collections, on the contrary, are part of the consensus which guarantees that all operations are durable and eventually executed by all nodes. -In practice it means that a majority of nodes agree on what operations should be applied before the service will perform them. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `on_disk` _boolean_ | OnDisk specifies whether vectors should be stored in memory or on disk. | | | -Practically, it means that if the cluster is in a transition state - either electing a new leader after a failure or starting up, the collection update operations will be denied. -You may use the cluster [REST API](https://api.qdrant.tech/master/api-reference/distributed/cluster-status) to check the state of the consensus. +#### QdrantConfigurationService -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#sharding) Sharding -A Collection in Qdrant is made of one or more shards. -A shard is an independent store of points which is able to perform all operations provided by collections. -There are two methods of distributing points across shards: -- **Automatic sharding**: Points are distributed among shards by using a [consistent hashing](https://en.wikipedia.org/wiki/Consistent_hashing) algorithm, so that shards are managing non-intersecting subsets of points. This is the default behavior. -- **User-defined sharding**: _Available as of v1.7.0_ \- Each point is uploaded to a specific shard, so that operations can hit only the shard or shards they need. Even with this distribution, shards still ensure having non-intersecting subsets of points. [See more
](https://qdrant.tech/documentation/guides/distributed_deployment/#user-defined-sharding) -Each node knows where all parts of the collection are stored through the [consensus protocol](https://qdrant.tech/documentation/guides/distributed_deployment/#raft), so when you send a search request to one Qdrant node, it automatically queries all other nodes to obtain the full search result. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#choosing-the-right-number-of-shards) Choosing the right number of shards +_Appears in:_ +- [QdrantConfiguration](#qdrantconfiguration) -When you create a collection, Qdrant splits the collection into `shard_number` shards. If left unset, `shard_number` is set to the number of nodes in your cluster when the collection was created. The `shard_number` cannot be changed without recreating the collection. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `api_key` _[QdrantSecretKeyRef](#qdrantsecretkeyref)_ | ApiKey for the qdrant instance | | | +| `read_only_api_key` _[QdrantSecretKeyRef](#qdrantsecretkeyref)_ | ReadOnlyApiKey for the qdrant instance | | | +| `jwt_rbac` _boolean_ | JwtRbac specifies whether to enable jwt rbac for the qdrant instance
Default is false | | | +| `hide_jwt_dashboard` _boolean_ | HideJwtDashboard specifies whether to hide the JWT dashboard of the embedded UI
Default is false | | | +| `enable_tls` _boolean_ | EnableTLS specifies whether to enable tls for the qdrant instance
Default is false | | | +| `max_request_size_mb` _integer_ | MaxRequestSizeMb specifies the maximum size of POST data in a single request in megabytes
Default, if not set is 32 (MB) | | | -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 300, - "distance": "Cosine" - }, - "shard_number": 6 -} +#### QdrantConfigurationTLS -``` -```python -from qdrant_client import QdrantClient, models -client = QdrantClient(url="http://localhost:6333") -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), - shard_number=6, -) -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); +_Appears in:_ +- [QdrantConfiguration](#qdrantconfiguration) -client.createCollection("{collection_name}", { - vectors: { - size: 300, - distance: "Cosine", - }, - shard_number: 6, -}); +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cert` _[QdrantSecretKeyRef](#qdrantsecretkeyref)_ | Reference to the secret containing the server certificate chain file | | | +| `key` _[QdrantSecretKeyRef](#qdrantsecretkeyref)_ | Reference to the secret containing the server private key file | | | +| `caCert` _[QdrantSecretKeyRef](#qdrantsecretkeyref)_ | Reference to the secret containing the CA certificate file | | | -``` -```rust -use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; -use qdrant_client::Qdrant; +#### QdrantEntity -let client = Qdrant::from_url("http://localhost:6334").build()?; -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) - .shard_number(6), - ) - .await?; -``` +QdrantEntity is the Schema for the qdrantentities API -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(300) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setShardNumber(6) - .build()) - .get(); +_Appears in:_ +- [QdrantEntityList](#qdrantentitylist) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantEntity` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. 
| | | +| `spec` _[QdrantEntitySpec](#qdrantentityspec)_ | | | | -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +#### QdrantEntityList -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, - shardNumber: 6 -); -``` -```go -import ( - "context" +QdrantEntityList contains a list of QdrantEntity objects - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 300, - Distance: qdrant.Distance_Cosine, - }), - ShardNumber: qdrant.PtrOf(uint32(6)), -}) -``` -To ensure all nodes in your cluster are evenly utilized, the number of shards must be a multiple of the number of nodes you are currently running in your cluster. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantEntityList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `items` _[QdrantEntity](#qdrantentity) array_ | | | | -> Aside: Advanced use cases such as multitenancy may require an uneven distribution of shards. See [Multitenancy](https://qdrant.tech/articles/multitenancy/). -We recommend creating at least 2 shards per node to allow future expansion without having to re-shard. [Resharding](https://qdrant.tech/documentation/guides/distributed_deployment/#resharding) is possible when using our cloud offering, but should be avoided if hosting elsewhere as it would require creating a new collection. +#### QdrantEntitySpec -If you anticipate a lot of growth, we recommend 12 shards since you can expand from 1 node up to 2, 3, 6, and 12 nodes without having to re-shard. Having more than 12 shards in a small cluster may not be worth the performance overhead. -Shards are evenly distributed across all existing nodes when a collection is first created, but Qdrant does not automatically rebalance shards if your cluster size or replication factor changes (since this is an expensive operation on large clusters). See the next section for how to move shards after scaling operations. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#resharding) Resharding +QdrantEntitySpec defines the desired state of QdrantEntity -_Available as of v1.13.0 in Cloud_ -Resharding allows you to change the number of shards in your existing collections if you’re hosting with our [Cloud](https://qdrant.tech/documentation/cloud-intro/) offering. -Resharding can change the number of shards both up and down, without having to recreate the collection from scratch. +_Appears in:_ +- [QdrantEntity](#qdrantentity) -Please refer to the [Resharding](https://qdrant.tech/documentation/cloud/cluster-scaling/#resharding) section in our cloud documentation for more details. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `id` _string_ | The unique identifier of the entity (in UUID format). | | | +| `entityType` _string_ | The type of the entity. 
| | | +| `clusterId` _string_ | The optional cluster identifier | | | +| `createdAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was created. | | | +| `lastUpdatedAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was last updated. | | | +| `deletedAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was deleted (or when deletion was started).
If not set the entity is not deleted | | | +| `payload` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Generic payload for this entity | | | -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#moving-shards) Moving shards -_Available as of v0.9.0_ -Qdrant allows moving shards between nodes in the cluster and removing nodes from the cluster. This functionality unlocks the ability to dynamically scale the cluster size without downtime. It also allows you to upgrade or migrate nodes without downtime. -Qdrant provides the information regarding the current shard distribution in the cluster with the [Collection Cluster info API](https://api.qdrant.tech/master/api-reference/distributed/collection-cluster-info). +#### QdrantEntityStatusResult -Use the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to initiate the shard transfer: -```http -POST /collections/{collection_name}/cluster -{ - "move_shard": { - "shard_id": 0, - "from_peer_id": 381894127, - "to_peer_id": 467122995 - } -} -``` +QdrantEntityStatusResult is the last result from the invocation to a manager -After the transfer is initiated, the service will process it based on the used -[transfer method](https://qdrant.tech/documentation/guides/distributed_deployment/#shard-transfer-method) keeping both shards in sync. Once the -transfer is completed, the old shard is deleted from the source node. -In case you want to downscale the cluster, you can move all shards away from a peer and then remove the peer using the [remove peer API](https://api.qdrant.tech/master/api-reference/distributed/remove-peer). -```http -DELETE /cluster/peer/{peer_id} +_Appears in:_ +- [QdrantEntityStatus](#qdrantentitystatus) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `result` _[EntityResult](#entityresult)_ | The result of last reconcile of the entity | | Enum: [Ok Pending Error]
| +| `reason` _string_ | The reason of the result (e.g. in case of an error) | | | +| `payload` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | The optional payload of the status. | | | -After that, Qdrant will exclude the node from the consensus, and the instance will be ready for shutdown. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#user-defined-sharding) User-defined sharding +#### QdrantImage -_Available as of v1.7.0_ -Qdrant allows you to specify the shard for each point individually. This feature is useful if you want to control the shard placement of your data, so that operations can hit only the subset of shards they actually need. In big clusters, this can significantly improve the performance of operations that do not require the whole collection to be scanned. -A clear use-case for this feature is managing a multi-tenant collection, where each tenant (let it be a user or organization) is assumed to be segregated, so they can have their data stored in separate shards. -To enable user-defined sharding, set `sharding_method` to `custom` during collection creation: -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name} -{ - "shard_number": 1, - "sharding_method": "custom" - // ... other collection parameters -} -``` +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -```python -from qdrant_client import QdrantClient, models +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `repository` _string_ | Repository specifies the repository of the Qdrant image.
If not specified, defaults to the config of the operator (or qdrant/qdrant if not specified in the operator). | | | +| `pullPolicy` _[PullPolicy](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#pullpolicy-v1-core)_ | PullPolicy specifies the image pull policy for the Qdrant image.
If not specified defaults the config of the operator (or IfNotPresent if not specified in operator). | | | +| `pullSecretName` _string_ | PullSecretName specifies the pull secret for the Qdrant image. | | | -client = QdrantClient(url="http://localhost:6333") -client.create_collection( - collection_name="{collection_name}", - shard_number=1, - sharding_method=models.ShardingMethod.CUSTOM, - # ... other collection parameters -) -client.create_shard_key("{collection_name}", "{shard_key}") +#### QdrantRelease -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); +QdrantRelease describes an available Qdrant release -client.createCollection("{collection_name}", { - shard_number: 1, - sharding_method: "custom", - // ... other collection parameters -}); -client.createShardKey("{collection_name}", { - shard_key: "{shard_key}" -}); -``` +_Appears in:_ +- [QdrantReleaseList](#qdrantreleaselist) -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, CreateShardKeyBuilder, CreateShardKeyRequestBuilder, Distance, - ShardingMethod, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantRelease` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[QdrantReleaseSpec](#qdrantreleasespec)_ | | | | -let client = Qdrant::from_url("http://localhost:6334").build()?; -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) - .shard_number(1) - .sharding_method(ShardingMethod::Custom.into()), - ) - .await?; +#### QdrantReleaseList -client - .create_shard_key( - CreateShardKeyRequestBuilder::new("{collection_name}") - .request(CreateShardKeyBuilder::default().shard_key("{shard_key".to_string())), - ) - .await?; -``` -```java -import static io.qdrant.client.ShardKeyFactory.shardKey; +QdrantReleaseList contains a list of QdrantRelease -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.ShardingMethod; -import io.qdrant.client.grpc.Collections.CreateShardKey; -import io.qdrant.client.grpc.Collections.CreateShardKeyRequest; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - // ... other collection parameters - .setShardNumber(1) - .setShardingMethod(ShardingMethod.Custom) - .build()) - .get(); -client.createShardKeyAsync(CreateShardKeyRequest.newBuilder() - .setCollectionName("{collection_name}") - .setRequest(CreateShardKey.newBuilder() - .setShardKey(shardKey("{shard_key}")) - .build()) - .build()).get(); -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `qdrant.io/v1` | | | +| `kind` _string_ | `QdrantReleaseList` | | | +| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. 
| | | +| `items` _[QdrantRelease](#qdrantrelease) array_ | | | | -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +#### QdrantReleaseSpec -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - // ... other collection parameters - shardNumber: 1, - shardingMethod: ShardingMethod.Custom -); -await client.CreateShardKeyAsync( - "{collection_name}", - new CreateShardKey { ShardKey = new ShardKey { Keyword = "{shard_key}", } } - ); -``` +QdrantReleaseSpec defines the desired state of QdrantRelease -```go -import ( - "context" - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +_Appears in:_ +- [QdrantRelease](#qdrantrelease) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - // ... other collection parameters - ShardNumber: qdrant.PtrOf(uint32(1)), - ShardingMethod: qdrant.ShardingMethod_Custom.Enum(), -}) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `version` _string_ | Version number (should be semver compliant).
E.g. "v1.10.1" | | | +| `default` _boolean_ | If set, this version is default for new clusters on Cloud.
There should be only 1 Qdrant version in the platform set as default. | false | | +| `image` _string_ | Full docker image to use for this version.
If empty, a default image will be derived from Version (and qdrant/qdrant is assumed). | | | +| `unavailable` _boolean_ | If set, this version cannot be used for new clusters. | false | | +| `endOfLife` _boolean_ | If set, this version is no longer actively supported. | false | | +| `accountIds` _string array_ | If set, this version can only be used by accounts with given IDs. | | | +| `accountPrivileges` _string array_ | If set, this version can only be used by accounts that have been given the listed privileges. | | | +| `remarks` _string_ | General remarks for human reading | | | +| `releaseNotesURL` _string_ | Release Notes URL for the specified version | | | -client.CreateShardKey(context.Background(), "{collection_name}", &qdrant.CreateShardKey{ - ShardKey: qdrant.NewShardKey("{shard_key}"), -}) -``` +#### QdrantSecretKeyRef -In this mode, the `shard_number` means the number of shards per shard key, where points will be distributed evenly. For example, if you have 10 shard keys and a collection config with these settings: -```json -{ - "shard_number": 1, - "sharding_method": "custom", - "replication_factor": 2 -} -``` -Then you will have `1 * 10 * 2 = 20` total physical shards in the collection. -Physical shards require a large amount of resources, so make sure your custom sharding key has a low cardinality. -For large cardinality keys, it is recommended to use [partition by payload](https://qdrant.tech/documentation/guides/multiple-partitions/#partition-by-payload) instead. -To specify the shard for each point, you need to provide the `shard_key` field in the upsert request: +_Appears in:_ +- [QdrantConfigurationService](#qdrantconfigurationservice) +- [QdrantConfigurationTLS](#qdrantconfigurationtls) -httppythontypescriptrustjavacsharpgo +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `secretKeyRef` _[SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#secretkeyselector-v1-core)_ | SecretKeyRef to the secret containing data to configure the qdrant instance | | | -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1111,\ - "vector": [0.1, 0.2, 0.3]\ - },\ - ] - "shard_key": "user_1" -} -``` +#### QdrantSecurityContext -```python -from qdrant_client import QdrantClient, models -client = QdrantClient(url="http://localhost:6333") -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1111,\ - vector=[0.1, 0.2, 0.3],\ - ),\ - ], - shard_key_selector="user_1", -) -``` -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1111,\ - vector: [0.1, 0.2, 0.3],\ - },\ - ], - shard_key: "user_1", -}); -``` -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::Payload; +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![PointStruct::new(\ - 111,\ - vec![0.1, 0.2, 0.3],\ - Payload::default(),\ - )], - ) - .shard_key_selector("user_1".to_string()), - ) - .await?; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `user` _integer_ | User specifies the user to run the Qdrant process as. | | | +| `group` _integer_ | Group specifies the group to run the Qdrant process as. | | | +| `fsGroup` _integer_ | FsGroup specifies file system group to run the Qdrant process as. 
| | | -``` -```java -import java.util.List; +#### RebalanceStrategy -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ShardKeySelectorFactory.shardKeySelector; -import static io.qdrant.client.VectorsFactory.vectors; +_Underlying type:_ _string_ -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; -import io.qdrant.client.grpc.Points.UpsertPoints; +RebalanceStrategy specifies the strategy to use for automaticially rebalancing shards the cluster. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +_Validation:_ +- Enum: [by_count by_size by_count_and_size] -client - .upsertAsync( - UpsertPoints.newBuilder() - .setCollectionName("{collection_name}") - .addAllPoints( - List.of( - PointStruct.newBuilder() - .setId(id(111)) - .setVectors(vectors(0.1f, 0.2f, 0.3f)) - .build())) - .setShardKeySelector(shardKeySelector("user_1")) - .build()) - .get(); +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -``` +| Field | Description | +| --- | --- | +| `by_count` | | +| `by_size` | | +| `by_count_and_size` | | -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +#### RegionCapabilities -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() { Id = 111, Vectors = new[] { 0.1f, 0.2f, 0.3f } } - }, - shardKeySelector: new ShardKeySelector { ShardKeys = { new List { "user_1" } } } -); -``` -```go -import ( - "context" - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(111), - Vectors: qdrant.NewVectors(0.1, 0.2, 0.3), - }, - }, - ShardKeySelector: &qdrant.ShardKeySelector{ - ShardKeys: []*qdrant.ShardKey{ - qdrant.NewShardKey("user_1"), - }, - }, -}) -``` +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -**\*** When using custom sharding, IDs are only enforced to be unique within a shard key. This means that you can have multiple points with the same ID, if they have different shard keys. -This is a limitation of the current implementation, and is an anti-pattern that should be avoided because it can create scenarios of points with the same ID to have different contents. In the future, we plan to add a global ID uniqueness check. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `volumeSnapshot` _boolean_ | VolumeSnapshot specifies whether the Kubernetes cluster supports volume snapshot | | | +| `volumeExpansion` _boolean_ | VolumeExpansion specifies whether the Kubernetes cluster supports volume expansion | | | -Now you can target the operations to specific shard(s) by specifying the `shard_key` on any operation you do. Operations that do not specify the shard key will be executed on **all** shards. -Another use-case would be to have shards that track the data chronologically, so that you can do more complex itineraries like uploading live data in one shard and archiving it once a certain age has passed. 
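As a sketch of the chronological layout described above, you could derive the shard key from the current month, create it once, and route live writes to it; the key format and collection name are arbitrary placeholders, and older keys can later be archived or removed.

```python
from datetime import datetime, timezone

from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# One shard key per month, e.g. "2025-08"; the format is an arbitrary choice.
current_key = datetime.now(timezone.utc).strftime("%Y-%m")
client.create_shard_key("{collection_name}", current_key)

# Live writes only touch the shard(s) behind the current key.
client.upsert(
    collection_name="{collection_name}",
    points=[models.PointStruct(id=1111, vector=[0.1, 0.2, 0.3])],
    shard_key_selector=current_key,
)
```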
+#### RegionPhase -![Sharding per day](https://qdrant.tech/docs/sharding-per-day.png) +_Underlying type:_ _string_ -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#shard-transfer-method) Shard transfer method -_Available as of v1.7.0_ -There are different methods for transferring a shard, such as moving or -replicating, to another node. Depending on what performance and guarantees you’d -like to have and how you’d like to manage your cluster, you likely want to -choose a specific method. Each method has its own pros and cons. Which is -fastest depends on the size and state of a shard. -Available shard transfer methods are: -- `stream_records`: _(default)_ transfer by streaming just its records to the target node in batches. -- `snapshot`: transfer including its index and quantized data by utilizing a [snapshot](https://qdrant.tech/documentation/concepts/snapshots/) automatically. -- `wal_delta`: _(auto recovery default)_ transfer by resolving [WAL](https://qdrant.tech/documentation/concepts/storage/#versioning) difference; the operations that were missed. +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -Each has pros, cons and specific requirements, some of which are: +| Field | Description | +| --- | --- | +| `Ready` | | +| `NotReady` | | +| `FailedToSync` | | -| Method: | Stream records | Snapshot | WAL delta | -| --- | --- | --- | --- | -| **Version** | v0.8.0+ | v1.7.0+ | v1.8.0+ | -| **Target** | New/existing shard | New/existing shard | Existing shard | -| **Connectivity** | Internal gRPC API (6335) | REST API (6333)
Internal gRPC API (6335) | Internal gRPC API (6335) | -| **HNSW index** | Doesn’t transfer, will reindex on target. | Does transfer, immediately ready on target. | Doesn’t transfer, may index on target. | -| **Quantization** | Doesn’t transfer, will requantize on target. | Does transfer, immediately ready on target. | Doesn’t transfer, may quantize on target. | -| **Ordering** | Unordered updates on target[1](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:1) | Ordered updates on target[2](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:2) | Ordered updates on target[2](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:2) | -| **Disk space** | No extra required | Extra required for snapshot on both nodes | No extra required | -To select a shard transfer method, specify the `method` like: +#### ResourceRequests -```http -POST /collections/{collection_name}/cluster -{ - "move_shard": { - "shard_id": 0, - "from_peer_id": 381894127, - "to_peer_id": 467122995, - "method": "snapshot" - } -} -``` -The `stream_records` transfer method is the simplest available. It simply -transfers all shard records in batches to the target node until it has -transferred all of them, keeping both shards in sync. It will also make sure the -transferred shard indexing process is keeping up before performing a final -switch. The method has two common disadvantages: 1. It does not transfer index -or quantization data, meaning that the shard has to be optimized again on the -new node, which can be very expensive. 2. The ordering guarantees are -`weak` [1](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:1), which is not suitable for some applications. Because it is -so simple, it’s also very robust, making it a reliable choice if the above cons -are acceptable in your use case. If your cluster is unstable and out of -resources, it’s probably best to use the `stream_records` transfer method, -because it is unlikely to fail. -The `snapshot` transfer method utilizes [snapshots](https://qdrant.tech/documentation/concepts/snapshots/) -to transfer a shard. A snapshot is created automatically. It is then transferred -and restored on the target node. After this is done, the snapshot is removed -from both nodes. While the snapshot/transfer/restore operation is happening, the -source node queues up all new operations. All queued updates are then sent in -order to the target shard to bring it into the same state as the source. There -are two important benefits: 1. It transfers index and quantization data, so that -the shard does not have to be optimized again on the target node, making them -immediately available. This way, Qdrant ensures that there will be no -degradation in performance at the end of the transfer. Especially on large -shards, this can give a huge performance improvement. 2. The ordering guarantees -can be `strong` [2](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:2), required for some applications. -The `wal_delta` transfer method only transfers the difference between two -shards. More specifically, it transfers all operations that were missed to the -target shard. The [WAL](https://qdrant.tech/documentation/concepts/storage/#versioning) of both shards is used to resolve this. There are two -benefits: 1. It will be very fast because it only transfers the difference -rather than all data. 2. 
The ordering guarantees can be `strong` [2](https://qdrant.tech/documentation/guides/distributed_deployment/#fn:2), -required for some applications. Two disadvantages are: 1. It can only be used to -transfer to a shard that already exists on the other node. 2. Applicability is -limited because the WALs normally don’t hold more than 64MB of recent -operations. But that should be enough for a node that quickly restarts, to -upgrade for example. If a delta cannot be resolved, this method automatically -falls back to `stream_records` which equals transferring the full shard. -The `stream_records` method is currently used as default. This may change in the -future. As of Qdrant 1.9.0 `wal_delta` is used for automatic shard replications -to recover dead shards. -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#replication) Replication +_Appears in:_ +- [Resources](#resources) -Qdrant allows you to replicate shards between nodes in the cluster. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cpu` _string_ | CPU specifies the CPU request for each Qdrant node. | | | +| `memory` _string_ | Memory specifies the memory request for each Qdrant node. | | | -Shard replication increases the reliability of the cluster by keeping several copies of a shard spread across the cluster. -This ensures the availability of the data in case of node failures, except if all replicas are lost. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#replication-factor) Replication factor +#### Resources -When you create a collection, you can control how many shard replicas you’d like to store by changing the `replication_factor`. By default, `replication_factor` is set to “1”, meaning no additional copy is maintained automatically. The default can be changed in the [Qdrant configuration](https://qdrant.tech/documentation/guides/configuration/#configuration-options). You can change that by setting the `replication_factor` when you create a collection. -The `replication_factor` can be updated for an existing collection, but the effect of this depends on how you’re running Qdrant. If you’re hosting the open source version of Qdrant yourself, changing the replication factor after collection creation doesn’t do anything. You can manually [create](https://qdrant.tech/documentation/guides/distributed_deployment/#creating-new-shard-replicas) or drop shard replicas to achieve your desired replication factor. In Qdrant Cloud (including Hybrid Cloud, Private Cloud) your shards will automatically be replicated or dropped to match your configured replication factor. -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 300, - "distance": "Cosine" - }, - "shard_number": 6, - "replication_factor": 2 -} -``` -```python -from qdrant_client import QdrantClient, models -client = QdrantClient(url="http://localhost:6333") +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), - shard_number=6, - replication_factor=2, -) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cpu` _string_ | CPU specifies the CPU limit for each Qdrant node. | | | +| `memory` _string_ | Memory specifies the memory limit for each Qdrant node. | | | +| `storage` _string_ | Storage specifies the storage amount for each Qdrant node. 
| | | +| `requests` _[ResourceRequests](#resourcerequests)_ | Requests specifies the resource requests for each Qdrant node. | | | -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +#### RestoreDestination -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createCollection("{collection_name}", { - vectors: { - size: 300, - distance: "Cosine", - }, - shard_number: 6, - replication_factor: 2, -}); -``` -```rust -use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; -use qdrant_client::Qdrant; -let client = Qdrant::from_url("http://localhost:6334").build()?; -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) - .shard_number(6) - .replication_factor(2), - ) - .await?; -``` +_Appears in:_ +- [QdrantClusterRestoreSpec](#qdrantclusterrestorespec) -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name of the destination cluster | | | +| `namespace` _string_ | Namespace of the destination cluster | | | -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(300) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setShardNumber(6) - .setReplicationFactor(2) - .build()) - .get(); +#### RestorePhase -``` +_Underlying type:_ _string_ -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, - shardNumber: 6, - replicationFactor: 2 -); -``` -```go -import ( - "context" +_Appears in:_ +- [QdrantClusterRestoreStatus](#qdrantclusterrestorestatus) - "github.com/qdrant/go-client/qdrant" -) +| Field | Description | +| --- | --- | +| `Running` | | +| `Skipped` | | +| `Failed` | | +| `Succeeded` | | -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 300, - Distance: qdrant.Distance_Cosine, - }), - ShardNumber: qdrant.PtrOf(uint32(6)), - ReplicationFactor: qdrant.PtrOf(uint32(2)), -}) +#### RestoreSource -``` -This code sample creates a collection with a total of 6 logical shards backed by a total of 12 physical shards. -Since a replication factor of “2” would require twice as much storage space, it is advised to make sure the hardware can host the additional shard replicas beforehand. 
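To make the arithmetic explicit, here is the shard math from the example above as a tiny sketch; the 3-node cluster size is an assumption chosen for illustration.

```python
shard_number = 6        # logical shards requested at collection creation
replication_factor = 2  # copies kept of each shard
nodes = 3               # cluster size (assumed for illustration)

physical_shards = shard_number * replication_factor  # 6 * 2 = 12
shards_per_node = physical_shards // nodes           # 12 / 3 = 4, evenly divisible

print(physical_shards, shards_per_node)
```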
-### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#creating-new-shard-replicas) Creating new shard replicas -It is possible to create or delete replicas manually on an existing collection using the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster). This is usually only necessary if you run Qdrant open-source. In Qdrant Cloud shard replication is handled and updated automatically, matching the configured `replication_factor`. -A replica can be added on a specific peer by specifying the peer from which to replicate. -```http -POST /collections/{collection_name}/cluster -{ - "replicate_shard": { - "shard_id": 0, - "from_peer_id": 381894127, - "to_peer_id": 467122995 - } -} +_Appears in:_ +- [QdrantClusterRestoreSpec](#qdrantclusterrestorespec) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `snapshotName` _string_ | SnapshotName is the name of the snapshot from which we wish to restore | | | +| `namespace` _string_ | Namespace of the snapshot | | | -And a replica can be removed on a specific peer. -```http -POST /collections/{collection_name}/cluster -{ - "drop_replica": { - "shard_id": 0, - "peer_id": 381894127 - } -} +#### ScheduledSnapshotPhase -``` +_Underlying type:_ _string_ -Keep in mind that a collection must contain at least one active replica of a shard. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#error-handling) Error handling -Replicas can be in different states: -- Active: healthy and ready to serve traffic -- Dead: unhealthy and not ready to serve traffic -- Partial: currently under resynchronization before activation -A replica is marked as dead if it does not respond to internal healthchecks or if it fails to serve traffic. +_Appears in:_ +- [QdrantClusterScheduledSnapshotStatus](#qdrantclusterscheduledsnapshotstatus) -A dead replica will not receive traffic from other peers and might require a manual intervention if it does not recover automatically. +| Field | Description | +| --- | --- | +| `Active` | | +| `Disabled` | | -This mechanism ensures data consistency and availability if a subset of the replicas fail during an update operation. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#node-failure-recovery) Node Failure Recovery +#### StorageClass -Sometimes hardware malfunctions might render some nodes of the Qdrant cluster unrecoverable. -No system is immune to this. -But several recovery scenarios allow qdrant to stay available for requests and even avoid performance degradation. -Let’s walk through them from best to worst. -**Recover with replicated collection** -If the number of failed nodes is less than the replication factor of the collection, then your cluster should still be able to perform read, search and update queries. -Now, if the failed node restarts, consensus will trigger the replication process to update the recovering node with the newest updates it has missed. -If the failed node never restarts, you can recover the lost shards if you have a 3+ node cluster. You cannot recover lost shards in smaller clusters because recovery operations go through [raft](https://qdrant.tech/documentation/guides/distributed_deployment/#raft) which requires >50% of the nodes to be healthy. 
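Because dead replicas may need manual intervention, it can help to script a check against the Collection Cluster Info API mentioned above. The sketch below assumes the documented response fields (`peer_id`, `local_shards`, `remote_shards`, `state`); the URL and collection name are placeholders.

```python
import requests

QDRANT_URL = "http://localhost:6333"  # placeholder
COLLECTION = "{collection_name}"      # placeholder

info = requests.get(f"{QDRANT_URL}/collections/{COLLECTION}/cluster", timeout=5).json()["result"]

# Report every replica that is not serving traffic ("Dead", "Partial", ...).
for shard in info.get("local_shards", []) + info.get("remote_shards", []):
    if shard["state"] != "Active":
        peer = shard.get("peer_id", info["peer_id"])  # local shards live on this peer
        print(f"shard {shard['shard_id']} on peer {peer} is {shard['state']}")
```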
-**Recreate node with replicated collections** +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -If a node fails and it is impossible to recover it, you should exclude the dead node from the consensus and create an empty node. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name specifies the name of the storage class | | | +| `default` _boolean_ | Default specifies whether the storage class is the default storage class | | | +| `provisioner` _string_ | Provisioner specifies the provisioner of the storage class | | | +| `allowVolumeExpansion` _boolean_ | AllowVolumeExpansion specifies whether the storage class allows volume expansion | | | +| `reclaimPolicy` _string_ | ReclaimPolicy specifies the reclaim policy of the storage class | | | +| `parameters` _object (keys:string, values:string)_ | Parameters specifies the parameters of the storage class | | | -To exclude failed nodes from the consensus, use [remove peer](https://api.qdrant.tech/master/api-reference/distributed/remove-peer) API. -Apply the `force` flag if necessary. -When you create a new node, make sure to attach it to the existing cluster by specifying `--bootstrap` CLI parameter with the URL of any of the running cluster nodes. +#### StorageClassNames -Once the new node is ready and synchronized with the cluster, you might want to ensure that the collection shards are replicated enough. Remember that Qdrant will not automatically balance shards since this is an expensive operation. -Use the [Replicate Shard Operation](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to create another copy of the shard on the newly connected node. -It’s worth mentioning that Qdrant only provides the necessary building blocks to create an automated failure recovery. -Building a completely automatic process of collection scaling would require control over the cluster machines themself. -Check out our [cloud solution](https://qdrant.to/cloud), where we made exactly that. -**Recover from snapshot** -If there are no copies of data in the cluster, it is still possible to recover from a snapshot. -Follow the same steps to detach failed node and create a new one in the cluster: -- To exclude failed nodes from the consensus, use [remove peer](https://api.qdrant.tech/master/api-reference/distributed/remove-peer) API. Apply the `force` flag if necessary. -- Create a new node, making sure to attach it to the existing cluster by specifying the `--bootstrap` CLI parameter with the URL of any of the running cluster nodes. -Snapshot recovery, used in single-node deployment, is different from cluster one. -Consensus manages all metadata about all collections and does not require snapshots to recover it. -But you can use snapshots to recover missing shards of the collections. +_Appears in:_ +- [QdrantClusterSpec](#qdrantclusterspec) -Use the [Collection Snapshot Recovery API](https://qdrant.tech/documentation/concepts/snapshots/#recover-in-cluster-deployment) to do it. -The service will download the specified snapshot of the collection and recover shards with data from it. +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `db` _string_ | DB specifies the storage class name for db volume. | | | +| `snapshots` _string_ | Snapshots specifies the storage class name for snapshots volume. | | | -Once all shards of the collection are recovered, the collection will become operational again. 
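Scripted, the recovery path above boils down to two REST calls around restarting the node: removing the dead peer from consensus and recovering the collection from a snapshot. This is only a sketch; the peer ID, URLs, and snapshot location are placeholders, and the endpoints are the remove-peer and collection snapshot recovery APIs linked above.

```python
import requests

QDRANT_URL = "http://localhost:6333"                       # any healthy peer (placeholder)
DEAD_PEER_ID = 9834046559507417430                         # failed peer id (placeholder)
COLLECTION = "{collection_name}"                           # placeholder
SNAPSHOT_LOCATION = "https://example.com/backup.snapshot"  # URL or file:// path (placeholder)

# 1. Exclude the unrecoverable node from consensus; force if it cannot respond.
requests.delete(f"{QDRANT_URL}/cluster/peer/{DEAD_PEER_ID}", params={"force": "true"}, timeout=10)

# 2. Start a fresh node with --bootstrap pointing at a running peer, then
#    recover the missing shards of the collection from a snapshot.
requests.put(
    f"{QDRANT_URL}/collections/{COLLECTION}/snapshots/recover",
    json={"location": SNAPSHOT_LOCATION},
    timeout=600,
)
```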
-### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#temporary-node-failure) Temporary node failure +#### StorageConfig -If properly configured, running Qdrant in distributed mode can make your cluster resistant to outages when one node fails temporarily. -Here is how differently-configured Qdrant clusters respond: -- 1-node clusters: All operations time out or fail for up to a few minutes. It depends on how long it takes to restart and load data from disk. -- 2-node clusters where shards ARE NOT replicated: All operations will time out or fail for up to a few minutes. It depends on how long it takes to restart and load data from disk. -- 2-node clusters where all shards ARE replicated to both nodes: All requests except for operations on collections continue to work during the outage. -- 3+-node clusters where all shards are replicated to at least 2 nodes: All requests continue to work during the outage. -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#consistency-guarantees) Consistency guarantees -By default, Qdrant focuses on availability and maximum throughput of search operations. -For the majority of use cases, this is a preferable trade-off. -During the normal state of operation, it is possible to search and modify data from any peers in the cluster. -Before responding to the client, the peer handling the request dispatches all operations according to the current topology in order to keep the data synchronized across the cluster. +_Appears in:_ +- [QdrantConfiguration](#qdrantconfiguration) -- reads are using a partial fan-out strategy to optimize latency and availability -- writes are executed in parallel on all active sharded replicas +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `performance` _[StoragePerformanceConfig](#storageperformanceconfig)_ | Performance configuration | | | +| `maxCollections` _integer_ | MaxCollections represents the maximal number of collections allowed to be created.
It can be set for Qdrant version >= 1.14.1
Defaults to 1000 if omitted and Qdrant version >= 1.15.0 | | Minimum: 1
| -![Embeddings](https://qdrant.tech/docs/concurrent-operations-replicas.png) -However, in some cases, it is necessary to ensure additional guarantees during possible hardware instabilities, mass concurrent updates of same documents, etc. +#### StoragePerformanceConfig -Qdrant provides a few options to control consistency guarantees: -- `write_consistency_factor` \- defines the number of replicas that must acknowledge a write operation before responding to the client. Increasing this value will make write operations tolerant to network partitions in the cluster, but will require a higher number of replicas to be active to perform write operations. -- Read `consistency` param, can be used with search and retrieve operations to ensure that the results obtained from all replicas are the same. If this option is used, Qdrant will perform the read operation on multiple replicas and resolve the result according to the selected strategy. This option is useful to avoid data inconsistency in case of concurrent updates of the same documents. This options is preferred if the update operations are frequent and the number of replicas is low. -- Write `ordering` param, can be used with update and delete operations to ensure that the operations are executed in the same order on all replicas. If this option is used, Qdrant will route the operation to the leader replica of the shard and wait for the response before responding to the client. This option is useful to avoid data inconsistency in case of concurrent updates of the same documents. This options is preferred if read operations are more frequent than update and if search performance is critical. -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#write-consistency-factor) Write consistency factor -The `write_consistency_factor` represents the number of replicas that must acknowledge a write operation before responding to the client. It is set to 1 by default. -It can be configured at the collection’s creation or when updating the -collection parameters. -This value can range from 1 to the number of replicas you have for each shard. -httppythontypescriptrustjavacsharpgo -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 300, - "distance": "Cosine" - }, - "shard_number": 6, - "replication_factor": 2, - "write_consistency_factor": 2 -} +_Appears in:_ +- [StorageConfig](#storageconfig) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `optimizer_cpu_budget` _integer_ | OptimizerCPUBudget defines the number of CPU allocation.
If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size
If negative - subtract this number of CPUs from the available CPUs.
If positive - use this exact number of CPUs. | | | +| `async_scorer` _boolean_ | AsyncScorer enables io_uring when rescoring | | | -```python -from qdrant_client import QdrantClient, models -client = QdrantClient(url="http://localhost:6333") +#### TraefikConfig -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), - shard_number=6, - replication_factor=2, - write_consistency_factor=2, -) -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.createCollection("{collection_name}", { - vectors: { - size: 300, - distance: "Cosine", - }, - shard_number: 6, - replication_factor: 2, - write_consistency_factor: 2, -}); -``` -```rust -use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; -use qdrant_client::Qdrant; +_Appears in:_ +- [Ingress](#ingress) -let client = Qdrant::from_url("http://localhost:6334").build()?; +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `allowedSourceRanges` _string array_ | AllowedSourceRanges specifies the allowed CIDR source ranges for the ingress. | | | +| `entryPoints` _string array_ | EntryPoints is the list of traefik entry points to use for the ingress route.
If nothing is set, it will take the entryPoints configured in the operator config. | | | -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) - .shard_number(6) - .replication_factor(2) - .write_consistency_factor(2), - ) - .await?; -``` +#### VolumeSnapshotClass -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(300) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setShardNumber(6) - .setReplicationFactor(2) - .setWriteConsistencyFactor(2) - .build()) - .get(); -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, - shardNumber: 6, - replicationFactor: 2, - writeConsistencyFactor: 2 -); +_Appears in:_ +- [QdrantCloudRegionStatus](#qdrantcloudregionstatus) -``` +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name specifies the name of the volume snapshot class | | | +| `driver` _string_ | Driver specifies the driver of the volume snapshot class | | | -```go -import ( - "context" - "github.com/qdrant/go-client/qdrant" -) +#### VolumeSnapshotInfo -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 300, - Distance: qdrant.Distance_Cosine, - }), - ShardNumber: qdrant.PtrOf(uint32(6)), - ReplicationFactor: qdrant.PtrOf(uint32(2)), - WriteConsistencyFactor: qdrant.PtrOf(uint32(2)), -}) -``` -Write operations will fail if the number of active replicas is less than the -`write_consistency_factor`. In this case, the client is expected to send the -operation again to ensure a consistent state is reached. -Setting the `write_consistency_factor` to a lower value may allow accepting -writes even if there are unresponsive nodes. Unresponsive nodes are marked as -dead and will automatically be recovered once available to ensure data -consistency. -The configuration of the `write_consistency_factor` is important for adjusting the cluster’s behavior when some nodes go offline due to restarts, upgrades, or failures. -By default, the cluster continues to accept updates as long as at least one replica of each shard is online. However, this behavior means that once an offline replica is restored, it will require additional synchronization with the rest of the cluster. In some cases, this synchronization can be resource-intensive and undesirable. 
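Since a failed write may be rejected or only partially applied, the client-side pattern is to re-send the operation until enough replicas acknowledge it. A minimal retry sketch with the Python client; the backoff, attempt count, and collection name are placeholders, and the exception handling is intentionally generic.

```python
import time

from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")


def upsert_with_retry(points, attempts: int = 5) -> None:
    """Re-send the operation if not enough replicas acknowledged the write."""
    for attempt in range(attempts):
        try:
            client.upsert(
                collection_name="{collection_name}",
                points=points,
                wait=True,
            )
            return
        except Exception:
            # Not enough active replicas, or another transient failure: retry.
            if attempt == attempts - 1:
                raise
            time.sleep(2 ** attempt)


upsert_with_retry([models.PointStruct(id=1, vector=[0.1, 0.2, 0.3])])
```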
+_Appears in:_ +- [QdrantClusterSnapshotStatus](#qdrantclustersnapshotstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `volumeSnapshotName` _string_ | VolumeSnapshotName is the name of the volume snapshot | | | +| `volumeName` _string_ | VolumeName is the name of the volume that was backed up | | | +| `readyToUse` _boolean_ | ReadyToUse indicates if the volume snapshot is ready to use | | | +| `snapshotHandle` _string_ | SnapshotHandle is the identifier of the volume snapshot in the respective cloud provider | | | -Setting the `write_consistency_factor` to match the replication factor modifies the cluster’s behavior so that unreplicated updates are rejected, preventing the need for extra synchronization. +<|page-126-lllmstxt|> +# Changelog -If the update is applied to enough replicas - according to the `write_consistency_factor` \- the update will return a successful status. Any replicas that failed to apply the update will be temporarily disabled and are automatically recovered to keep data consistency. If the update could not be applied to enough replicas, it’ll return an error and may be partially applied. The user must submit the operation again to ensure data consistency. +## 1.8.0 (2025-08-08) -For asynchronous updates and injection pipelines capable of handling errors and retries, this strategy might be preferable. +| Component | Version | +|-------------------------|---------| +| qdrant-kubernetes-api | v1.17.2 | +| operator | 2.6.8 | +| qdrant-cluster-manager | v0.3.7 | +| qdrant-cluster-exporter | 1.7.1 | -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#read-consistency) Read consistency +* Add qdrant-cluster-exporter to provide OpenTelemetry/Prometheus metrics for Qdrant clusters -Read `consistency` can be specified for most read requests and will ensure that the returned result -is consistent across cluster nodes. 
+## 1.7.3 (2025-08-04) -- `all` will query all nodes and return points, which present on all of them -- `majority` will query all nodes and return points, which present on the majority of them -- `quorum` will query randomly selected majority of nodes and return points, which present on all of them -- `1`/ `2`/ `3`/etc - will query specified number of randomly selected nodes and return points which present on all of them -- default `consistency` is `1` +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.17.2 | +| operator | 2.6.8 | +| qdrant-cluster-manager | v0.3.7 | -httppythontypescriptrustjavacsharpgo +* Configurable max_request_size_mb -```http -POST /collections/{collection_name}/points/query?consistency=majority -{ - "query": [0.2, 0.1, 0.9, 0.7], - "filter": { - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ] - }, - "params": { - "hnsw_ef": 128, - "exact": false - }, - "limit": 3 -} +## 1.7.2 (2025-06-27) -``` +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.17.0 | +| operator | 2.6.4 | +| qdrant-cluster-manager | v0.3.7 | -```python -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(\ - value="London",\ - ),\ - )\ - ] - ), - search_params=models.SearchParams(hnsw_ef=128, exact=False), - limit=3, - consistency="majority", -) +* Performance and stability improvements +* Configurable maxCollection -``` +## 1.7.1 (2025-06-03) -```typescript -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - filter: { - must: [{ key: "city", match: { value: "London" } }], - }, - params: { - hnsw_ef: 128, - exact: false, - }, - limit: 3, - consistency: "majority", -}); +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.16.6 | +| operator | 2.6.0 | +| qdrant-cluster-manager | v0.3.6 | -``` +* Performance and stability improvements -```rust -use qdrant_client::qdrant::{ - read_consistency::Value, Condition, Filter, QueryPointsBuilder, ReadConsistencyType, - SearchParamsBuilder, -}; -use qdrant_client::{Qdrant, QdrantError}; +## 1.7.0 (2025-05-14) -let client = Qdrant::from_url("http://localhost:6334").build()?; +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.16.3 | +| operator | 2.4.2 | +| qdrant-cluster-manager | v0.3.5 | -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .filter(Filter::must([Condition::matches(\ - "city",\ - "London".to_string(),\ - )])) - .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)) - .read_consistency(Value::Type(ReadConsistencyType::Majority.into())), - ) - .await?; +* Add optional automatic shard balancing +* Set strict mode by default for new clusters to only allow queries with payload filters on fields that are indexed -``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.ReadConsistency; -import io.qdrant.client.grpc.Points.ReadConsistencyType; -import io.qdrant.client.grpc.Points.SearchParams; +## 1.6.4 (2025-04-17) -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.ConditionFactory.matchKeyword; +| Component | Version | 
+|------------------------|---------| +| qdrant-kubernetes-api | v1.15.5 | +| operator | 2.3.4 | +| qdrant-cluster-manager | v0.3.4 | -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +* Fix bug in operator Helm chart that caused role binding generation to fail when using `watch.namespaces` -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London")).build()) - .setQuery(nearest(.2f, 0.1f, 0.9f, 0.7f)) - .setParams(SearchParams.newBuilder().setHnswEf(128).setExact(false).build()) - .setLimit(3) - .setReadConsistency( - ReadConsistency.newBuilder().setType(ReadConsistencyType.Majority).build()) - .build()) - .get(); +## 1.6.3 (2025-03-28) -``` +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.15.0 | +| operator | 2.3.3 | +| qdrant-cluster-manager | v0.3.4 | -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +* Performance and stability improvements for collection re-sharding -var client = new QdrantClient("localhost", 6334); +## 1.6.2 (2025-03-21) -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - filter: MatchKeyword("city", "London"), - searchParams: new SearchParams { HnswEf = 128, Exact = false }, - limit: 3, - readConsistency: new ReadConsistency { Type = ReadConsistencyType.Majority } -); +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.15.0 | +| operator | 2.3.2 | +| qdrant-cluster-manager | v0.3.3 | -``` +* Allow disabling NetworkPolicy management in Qdrant Cluster operator -```go -import ( - "context" +## 1.6.1 (2025-03-14) - "github.com/qdrant/go-client/qdrant" -) +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.14.2 | +| operator | 2.3.2 | +| qdrant-cluster-manager | v0.3.3 | -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +* Add support for GPU instances +* Experimental support for automatic shard balancing -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, - }, - Params: &qdrant.SearchParams{ - HnswEf: qdrant.PtrOf(uint64(128)), - }, - Limit: qdrant.PtrOf(uint64(3)), - ReadConsistency: qdrant.NewReadConsistencyType(qdrant.ReadConsistencyType_Majority), -}) +## 1.5.1 (2025-03-04) + +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.12.0 | +| operator | 2.1.26 | +| qdrant-cluster-manager | v0.3.2 | -``` +* Fix scaling down clusters that have TLS with self-signed certificates configured +* Various performance improvements and stability fixes -### [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#write-ordering) Write ordering +## 1.5.0 (2025-02-21) -Write `ordering` can be specified for any write request to serialize it through a single “leader” node, -which ensures that all write operations (issued with the same `ordering`) are performed and observed -sequentially. 
+| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.12.0 | +| operator | 2.1.26 | +| qdrant-cluster-manager | v0.3.0 | -- `weak` _(default)_ ordering does not provide any additional guarantees, so write operations can be freely reordered. -- `medium` ordering serializes all write operations through a dynamically elected leader, which might cause minor inconsistencies in case of leader change. -- `strong` ordering serializes all write operations through the permanent leader, which provides strong consistency, but write operations may be unavailable if the leader is down. +* Added support for P2P TLS configuration +* Faster node removal on scale down +* Various performance improvements and stability fixes -httppythontypescriptrustjavacsharpgo +## 1.4.0 (2025-01-23) -```http -PUT /collections/{collection_name}/points?ordering=strong -{ - "batch": { - "ids": [1, 2, 3], - "payloads": [\ - {"color": "red"},\ - {"color": "green"},\ - {"color": "blue"}\ - ], - "vectors": [\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9]\ - ] - } -} +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.8.0 | +| operator | 2.1.26 | +| qdrant-cluster-manager | v0.3.0 | -``` +* Support deleting peers on horizontal scale down, even if they are already offline +* Support removing partially deleted peers -```python -client.upsert( - collection_name="{collection_name}", - points=models.Batch( - ids=[1, 2, 3], - payloads=[\ - {"color": "red"},\ - {"color": "green"},\ - {"color": "blue"},\ - ], - vectors=[\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9],\ - ], - ), - ordering=models.WriteOrdering.STRONG, -) +## 1.3.0 (2025-01-17) -``` +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.8.0 | +| operator | 2.1.21 | +| qdrant-cluster-manager | v0.2.10 | -```typescript -client.upsert("{collection_name}", { - batch: { - ids: [1, 2, 3], - payloads: [{ color: "red" }, { color: "green" }, { color: "blue" }], - vectors: [\ - [0.9, 0.1, 0.1],\ - [0.1, 0.9, 0.1],\ - [0.1, 0.1, 0.9],\ - ], - }, - ordering: "strong", -}); +* Support for re-sharding with Qdrant >= 1.13.0 -``` +## 1.2.0 (2025-01-16) -```rust -use qdrant_client::qdrant::{ - PointStruct, UpsertPointsBuilder, WriteOrdering, WriteOrderingType -}; -use qdrant_client::Qdrant; +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.8.0 | +| operator | 2.1.20 | +| qdrant-cluster-manager | v0.2.9 | -let client = Qdrant::from_url("http://localhost:6334").build()?; +* Performance and stability improvements -client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![\ - PointStruct::new(1, vec![0.9, 0.1, 0.1], [("color", "red".into())]),\ - PointStruct::new(2, vec![0.1, 0.9, 0.1], [("color", "green".into())]),\ - PointStruct::new(3, vec![0.1, 0.1, 0.9], [("color", "blue".into())]),\ - ], - ) - .ordering(WriteOrdering { - r#type: WriteOrderingType::Strong.into(), - }), - ) - .await?; +## 1.1.0 (2024-12-03) -``` +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.6.4 | +| operator | 2.1.10 | +| qdrant-cluster-manager | v0.2.6 | -```java -import java.util.List; -import java.util.Map; +* Activate cluster-manager for automatic shard replication -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +## 1.0.0 (2024-11-11) -import 
io.qdrant.client.grpc.Points.PointStruct; -import io.qdrant.client.grpc.Points.UpsertPoints; -import io.qdrant.client.grpc.Points.WriteOrdering; -import io.qdrant.client.grpc.Points.WriteOrderingType; +| Component | Version | +|------------------------|---------| +| qdrant-kubernetes-api | v1.2.7 | +| operator | 0.1.3 | +| qdrant-cluster-manager | v0.2.4 | -client - .upsertAsync( - UpsertPoints.newBuilder() - .setCollectionName("{collection_name}") - .addAllPoints( - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.9f, 0.1f, 0.1f)) - .putAllPayload(Map.of("color", value("red"))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors(vectors(0.1f, 0.9f, 0.1f)) - .putAllPayload(Map.of("color", value("green"))) - .build(), - PointStruct.newBuilder() - .setId(id(3)) - .setVectors(vectors(0.1f, 0.1f, 0.94f)) - .putAllPayload(Map.of("color", value("blue"))) - .build())) - .setOrdering(WriteOrdering.newBuilder().setType(WriteOrderingType.Strong).build()) - .build()) - .get(); +* Initial release -``` +<|page-127-lllmstxt|> +# Qdrant Hybrid Cloud: Hosting Platforms & Deployment Options -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +This page provides an overview of how to deploy Qdrant Hybrid Cloud on various managed Kubernetes platforms. -var client = new QdrantClient("localhost", 6334); +For a general list of prerequisites and installation steps, see our [Hybrid Cloud setup guide](/documentation/hybrid-cloud/hybrid-cloud-setup/). This platform specific documentation also applies to Qdrant Private Cloud. -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new[] { 0.9f, 0.1f, 0.1f }, - Payload = { ["color"] = "red" } - }, - new() - { - Id = 2, - Vectors = new[] { 0.1f, 0.9f, 0.1f }, - Payload = { ["color"] = "green" } - }, - new() - { - Id = 3, - Vectors = new[] { 0.1f, 0.1f, 0.9f }, - Payload = { ["color"] = "blue" } - } - }, - ordering: WriteOrderingType.Strong -); +![Akamai](/documentation/cloud/cloud-providers/akamai.jpg) -``` +## Akamai (Linode) -```go -import ( - "context" +[The Linode Kubernetes Engine (LKE)](https://www.linode.com/products/kubernetes/) is a managed container orchestration engine built on top of Kubernetes. LKE enables you to quickly deploy and manage your containerized applications without needing to build (and maintain) your own Kubernetes cluster. All LKE instances are equipped with a fully managed control plane at no additional cost. - "github.com/qdrant/go-client/qdrant" -) +First, consult Linode's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on LKE**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). 
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +### More on Linode Kubernetes Engine -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"color": "red"}), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"color": "green"}), - }, - { - Id: qdrant.NewIDNum(3), - Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), - Payload: qdrant.NewValueMap(map[string]any{"color": "blue"}), - }, - }, - Ordering: &qdrant.WriteOrdering{ - Type: qdrant.WriteOrderingType_Strong, - }, -}) +- [Getting Started with LKE](https://www.linode.com/docs/products/compute/kubernetes/get-started/) +- [LKE Guides](https://www.linode.com/docs/products/compute/kubernetes/guides/) +- [LKE API Reference](https://www.linode.com/docs/api/) -``` +At the time of writing, Linode [does not support CSI Volume Snapshots](https://github.com/linode/linode-blockstorage-csi-driver/issues/107). -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#listener-mode) Listener mode +![AWS](/documentation/cloud/cloud-providers/aws.jpg) -In some cases it might be useful to have a Qdrant node that only accumulates data and does not participate in search operations. -There are several scenarios where this can be useful: +## Amazon Web Services (AWS) -- Listener option can be used to store data in a separate node, which can be used for backup purposes or to store data for a long time. -- Listener node can be used to synchronize data into another region, while still performing search operations in the local region. +[Amazon Elastic Kubernetes Service (Amazon EKS)](https://aws.amazon.com/eks/) is a managed service to run Kubernetes in the AWS cloud and on-premises data centers which can then be paired with Qdrant's hybrid cloud. With Amazon EKS, you can take advantage of all the performance, scale, reliability, and availability of AWS infrastructure, as well as integrations with AWS networking and security services. -To enable listener mode, set `node_type` to `Listener` in the config file: +First, consult AWS' managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on AWS**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -```yaml -storage: - node_type: "Listener" +For a good balance between peformance and cost, we recommend: + +* Depending on your cluster resource configuration either general purpose (m6*, m7*, or m8*), memory optimized (r6*, r7*, or r8*) or cpu optimized (c6*, c7*, or c8*) instance types. Qdrant Hybrid Cloud also supports AWS Graviton ARM64 instances. 
+* At least gp3 EBS volumes for storage + +### More on Amazon Elastic Kubernetes Service + +- [Getting Started with Amazon EKS](https://docs.aws.amazon.com/eks/) +- [Amazon EKS User Guide](https://docs.aws.amazon.com/eks/latest/userguide/what-is-eks.html) +- [Amazon EKS API Reference](https://docs.aws.amazon.com/eks/latest/APIReference/Welcome.html) + +Your EKS cluster needs the EKS EBS CSI driver or a similar storage driver: +- [Amazon EBS CSI Driver](https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html) + +To allow vertical scaling, you need a StorageClass with volume expansion enabled: +- [Amazon EBS CSI Volume Resizing](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/examples/kubernetes/resizing/README.md) +```yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + annotations: + storageclass.kubernetes.io/is-default-class: "true" + name: ebs-sc +provisioner: ebs.csi.aws.com +reclaimPolicy: Delete +volumeBindingMode: WaitForFirstConsumer +allowVolumeExpansion: true ``` -Listener node will not participate in search operations, but will still accept write operations and will store the data in the local storage. +To allow backups and restores, your EKS cluster needs the CSI snapshot controller: +- [Amazon EBS CSI Snapshot Controller](https://docs.aws.amazon.com/eks/latest/userguide/csi-snapshot-controller.html) -All shards, stored on the listener node, will be converted to the `Listener` state. +And you need to create a VolumeSnapshotClass: -Additionally, all write requests sent to the listener node will be processed with `wait=false` option, which means that the write oprations will be considered successful once they are written to WAL. -This mechanism should allow to minimize upsert latency in case of parallel snapshotting. +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: csi-snapclass +deletionPolicy: Delete +driver: ebs.csi.aws.com +``` -## [Anchor](https://qdrant.tech/documentation/guides/distributed_deployment/\#consensus-checkpointing) Consensus Checkpointing +![Civo](/documentation/cloud/cloud-providers/civo.jpg) -Consensus checkpointing is a technique used in Raft to improve performance and simplify log management by periodically creating a consistent snapshot of the system state. -This snapshot represents a point in time where all nodes in the cluster have reached agreement on the state, and it can be used to truncate the log, reducing the amount of data that needs to be stored and transferred between nodes. +## Civo -For example, if you attach a new node to the cluster, it should replay all the log entries to catch up with the current state. -In long-running clusters, this can take a long time, and the log can grow very large. +[Civo Kubernetes](https://www.civo.com/kubernetes) is a robust, scalable, and managed Kubernetes service. Civo supplies a CNCF-compliant Kubernetes cluster and makes it easy to provide standard Kubernetes applications and containerized workloads. User-defined Kubernetes clusters can be created as self-service without complications using the Civo Portal. -To prevent this, one can use a special checkpointing mechanism, that will truncate the log and create a snapshot of the current state. +First, consult Civo's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Civo**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). 
-To use this feature, simply call the `/cluster/recover` API on required node: +### More on Civo Kubernetes -```http -POST /cluster/recover +- [Getting Started with Civo Kubernetes](https://www.civo.com/docs/kubernetes) +- [Civo Tutorials](https://www.civo.com/learn) +- [Frequently Asked Questions on Civo](https://www.civo.com/docs/faq) + +To allow backups and restores, you need to create a VolumeSnapshotClass: +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: csi-snapclass +deletionPolicy: Delete +driver: csi.civo.com ``` -This API can be triggered on any non-leader node, it will send a request to the current consensus leader to create a snapshot. The leader will in turn send the snapshot back to the requesting node for application. +![Digital Ocean](/documentation/cloud/cloud-providers/digital-ocean.jpg) -In some cases, this API can be used to recover from an inconsistent cluster state by forcing a snapshot creation. +## Digital Ocean -* * * +[DigitalOcean Kubernetes (DOKS)](https://www.digitalocean.com/products/kubernetes) is a managed Kubernetes service that lets you deploy Kubernetes clusters without the complexities of handling the control plane and containerized infrastructure. Clusters are compatible with standard Kubernetes toolchains and integrate natively with DigitalOcean Load Balancers and volumes. -1. Weak ordering for updates: All records are streamed to the target node in order. -New updates are received on the target node in parallel, while the transfer -of records is still happening. We therefore have `weak` ordering, regardless -of what [ordering](https://qdrant.tech/documentation/guides/distributed_deployment/#write-ordering) is used for updates. [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref:1) [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref1:1) +First, consult Digital Ocean's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on DigitalOcean**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -2. Strong ordering for updates: A snapshot of the shard -is created, it is transferred and recovered on the target node. That ensures -the state of the shard is kept consistent. New updates are queued on the -source node, and transferred in order to the target node. Updates therefore -have the same [ordering](https://qdrant.tech/documentation/guides/distributed_deployment/#write-ordering) as the user selects, making -`strong` ordering possible. [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref:2) [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref1:2) [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref2:2) [↩](https://qdrant.tech/documentation/guides/distributed_deployment/#fnref3:2) +### More on DigitalOcean Kubernetes +- [Getting Started with DOKS](https://docs.digitalocean.com/products/kubernetes/getting-started/quickstart/) +- [DOKS - How To Guides](https://docs.digitalocean.com/products/kubernetes/how-to/) +- [DOKS - Reference Manual](https://docs.digitalocean.com/products/kubernetes/reference/) -##### Was this page useful? +![Gcore](/documentation/cloud/cloud-providers/gcore.svg) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Gcore -Thank you for your feedback! 
🙏 +[Gcore Managed Kubernetes](https://gcore.com/cloud/managed-kubernetes) is a managed container orchestration engine built on top of Kubernetes. Gcore enables you to quickly deploy and manage your containerized applications without needing to build (and maintain) your own Kubernetes cluster. All Gcore instances are equipped with a fully managed control plane at no additional cost.
-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/distributed_deployment.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+First, consult Gcore's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Gcore**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/).
-On this page:
+### More on Gcore Kubernetes Engine
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/distributed_deployment.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+- [Getting Started with Kubernetes on Gcore](https://gcore.com/docs/cloud/kubernetes/about-gcore-kubernetes)
-×
+![Google Cloud Platform](/documentation/cloud/cloud-providers/gcp.jpg)
-[Powered by](https://qdrant.tech/)
+## Google Cloud Platform (GCP)
-<|page-86-lllmstxt|>
-## cluster-access
-- [Documentation](https://qdrant.tech/documentation/)
-- [Cloud](https://qdrant.tech/documentation/cloud/)
-- Cluster Access
+[Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) is a managed Kubernetes service that you can use to deploy and operate containerized applications at scale using Google's infrastructure. GKE provides the operational power of Kubernetes while managing many of the underlying components, such as the control plane and nodes, for you.
-# [Anchor](https://qdrant.tech/documentation/cloud/cluster-access/\#accessing-qdrant-cloud-clusters) Accessing Qdrant Cloud Clusters
+First, consult GCP's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on GCP**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/).
-Once you [created](https://qdrant.tech/documentation/cloud/create-cluster/) a cluster, and set up an [API key](https://qdrant.tech/documentation/cloud/authentication/), you can access your cluster through the integrated Cluster UI, the REST API and the GRPC API.
+For a good balance between performance and cost, we recommend:
-## [Anchor](https://qdrant.tech/documentation/cloud/cluster-access/\#cluster-ui) Cluster UI
+* Depending on your cluster resource configuration either general purpose (standard), memory optimized (highmem) or cpu optimized (highcpu) instance types of at least 2nd generation. Qdrant Hybrid Cloud also supports ARM64 instances.
+* At least pd-balanced disks for storage
-There is the convenient link on the cluster detail page in the Qdrant Cloud Console to access the [Cluster UI](https://qdrant.tech/documentation/web-ui/).
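+As a minimal sketch of how these recommendations translate into a manifest (the class name is an arbitrary example, and the `pd.csi.storage.gke.io` provisioner with the `pd-balanced` disk type follows standard GKE CSI driver conventions rather than anything Qdrant-specific), a StorageClass with volume expansion enabled might look like this:
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: qdrant-pd-balanced   # example name, not required by Qdrant
+provisioner: pd.csi.storage.gke.io
+parameters:
+  type: pd-balanced          # disk type recommended above
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true   # needed for vertical scaling of storage
+```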
+### More on the Google Kubernetes Engine
-![Cluster Cluster UI](https://qdrant.tech/documentation/cloud/cloud-db-dashboard.png)
+- [Getting Started with GKE](https://cloud.google.com/kubernetes-engine/docs/quickstart)
+- [GKE Tutorials](https://cloud.google.com/kubernetes-engine/docs/tutorials)
+- [GKE Documentation](https://cloud.google.com/kubernetes-engine/docs/)
-The Overview tab also contains direct links to explore Qdrant tutorials and sample datasets.
+To allow backups and restores, your GKE cluster needs the CSI VolumeSnapshot controller and class:
+- [Google GKE Volume Snapshots](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/volume-snapshots)
-![Cluster Cluster UI Tutorials](https://qdrant.tech/documentation/cloud/cloud-db-deeplinks.png)
+```yaml
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshotClass
+metadata:
+  name: csi-snapclass
+deletionPolicy: Delete
+driver: pd.csi.storage.gke.io
+```
+![Microsoft Azure](/documentation/cloud/cloud-providers/azure.jpg)
-## [Anchor](https://qdrant.tech/documentation/cloud/cluster-access/\#api) API
+## Microsoft Azure
-The REST API is exposed on your cluster endpoint at port `6333`. The GRPC API is exposed on your cluster endpoint at port `6334`. When accessing the cluster endpoint, traffic is automatically load balanced across all healthy Qdrant nodes in the cluster. For all operations, but the few mentioned at [Node specific endpoints](https://qdrant.tech/documentation/cloud/cluster-access/#node-specific-endpoints), you should use the cluster endpoint. It does not matter which node in the cluster you land on. All nodes can handle all search and write requests.
-![Cluster cluster endpoint](https://qdrant.tech/documentation/cloud/cloud-endpoint.png)
+With [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-in/products/kubernetes-service), you can start developing and deploying cloud-native apps in Azure, data centres, or at the edge. Get unified management and governance for on-premises, edge, and multi-cloud Kubernetes clusters. Interoperate with Azure security, identity, cost management, and migration services.
-Have a look at the [API reference](https://qdrant.tech/documentation/interfaces/#api-reference) and the official [client libraries](https://qdrant.tech/documentation/interfaces/#client-libraries) for more information on how to interact with the Qdrant Cloud API.
+First, consult Azure's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Azure**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/).
-## [Anchor](https://qdrant.tech/documentation/cloud/cluster-access/\#node-specific-endpoints) Node Specific Endpoints
+For a good balance between performance and cost, we recommend:
-Next to the cluster endpoint which loadbalances requests across all healthy Qdrant nodes, each node in the cluster has its own endpoint as well. This is mainly usefull for monitoring or manual shard management purpuses.
+* Depending on your cluster resource configuration either general purpose (D-family), memory optimized (E-family) or cpu optimized (F-family) instance types. Qdrant Hybrid Cloud also supports Azure Cobalt ARM64 instances.
+* At least Premium SSD v2 disks for storage
-You can finde the node specific endpoints on the cluster detail page in the Qdrant Cloud Console.
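+As a hedged sketch (the class name is arbitrary, and the SKU name and other parameters should be checked against the current AKS documentation), a StorageClass for Premium SSD v2 disks with volume expansion enabled could look like this:
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: qdrant-premium-v2    # example name, not required by Qdrant
+provisioner: disk.csi.azure.com
+parameters:
+  skuName: PremiumV2_LRS     # Premium SSD v2; verify against current AKS docs
+  cachingMode: None          # Premium SSD v2 does not support host caching
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
+```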
+### More on Azure Kubernetes Service -![Cluster node endpoints](https://qdrant.tech/documentation/cloud/cloud-node-endpoints.png) +- [Getting Started with AKS](https://learn.microsoft.com/en-us/azure/architecture/reference-architectures/containers/aks-start-here) +- [AKS Documentation](https://learn.microsoft.com/en-in/azure/aks/) +- [Best Practices with AKS](https://learn.microsoft.com/en-in/azure/aks/best-practices) -##### Was this page useful? +To allow backups and restores, your AKS cluster needs the CSI VolumeSnapshot controller and class: +- [Azure AKS Volume Snapshots](https://learn.microsoft.com/en-us/azure/aks/azure-disk-csi#create-a-volume-snapshot) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: csi-snapclass +deletionPolicy: Delete +driver: disk.csi.azure.com +``` -Thank you for your feedback! 🙏 +![Oracle Cloud Infrastructure](/documentation/cloud/cloud-providers/oracle.jpg) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-access.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Oracle Cloud Infrastructure -On this page: +[Oracle Cloud Infrastructure Container Engine for Kubernetes (OKE)](https://www.oracle.com/in/cloud/cloud-native/container-engine-kubernetes/) is a managed Kubernetes solution that enables you to deploy Kubernetes clusters while ensuring stable operations for both the control plane and the worker nodes through automatic scaling, upgrades, and security patching. Additionally, OKE offers a completely serverless Kubernetes experience with virtual nodes. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-access.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +First, consult OCI's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on OCI**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). 
-× +### More on OCI Container Engine -[Powered by](https://qdrant.tech/) +- [Getting Started with OCI](https://docs.oracle.com/en-us/iaas/Content/ContEng/home.htm) +- [Frequently Asked Questions on OCI](https://www.oracle.com/in/cloud/cloud-native/container-engine-kubernetes/faq/) +- [OCI Product Updates](https://docs.oracle.com/en-us/iaas/releasenotes/services/conteng/) -<|page-87-lllmstxt|> -## gridstore-key-value-storage -- [Articles](https://qdrant.tech/articles/) -- Introducing Gridstore: Qdrant's Custom Key-Value Store +To allow backups and restores, your OCI cluster needs the CSI VolumeSnapshot controller and class: +- [Prerequisites for Creating Volume Snapshots +](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingpersistentvolumeclaim_topic-Provisioning_PVCs_on_BV.htm#contengcreatingpersistentvolumeclaim_topic-Provisioning_PVCs_on_BV-PV_From_Snapshot_CSI__section_volume-snapshot-prerequisites) -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: csi-snapclass +deletionPolicy: Delete +driver: blockvolume.csi.oraclecloud.com +``` -# Introducing Gridstore: Qdrant's Custom Key-Value Store +![OVHcloud](/documentation/cloud/cloud-providers/ovh.jpg) -Luis Cossio, Arnaud Gourlay & David Myriel +## OVHcloud -· +[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/), powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility. -February 05, 2025 +First, consult OVHcloud's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on OVHcloud**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -![Introducing Gridstore: Qdrant's Custom Key-Value Store](https://qdrant.tech/articles_data/gridstore-key-value-storage/preview/title.jpg) +### More on Service Managed Kubernetes by OVHcloud -## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#why-we-built-our-own-storage-engine) Why We Built Our Own Storage Engine +- [Getting Started with OVH Managed Kubernetes](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s-getting-started) +- [OVH Managed Kubernetes Documentation](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s) +- [OVH Managed Kubernetes Tutorials](https://help.ovhcloud.com/csm/en-in-documentation-public-cloud-containers-orchestration-managed-kubernetes-k8s-tutorials) -Databases need a place to store and retrieve data. That’s what Qdrant’s [**key-value storage**](https://en.wikipedia.org/wiki/Key%e2%80%93value_database) does—it links keys to values. +![Red Hat](/documentation/cloud/cloud-providers/redhat.jpg) -When we started building Qdrant, we needed to pick something ready for the task. So we chose [**RocksDB**](https://rocksdb.org/) as our embedded key-value store. +## Red Hat OpenShift -![RocksDB](https://qdrant.tech/articles_data/gridstore-key-value-storage/rocksdb.jpg) +[Red Hat OpenShift Kubernetes Engine](https://www.redhat.com/en/technologies/cloud-computing/openshift/kubernetes-engine) provides you with the basic functionality of Red Hat OpenShift. 
It offers a subset of the features that Red Hat OpenShift Container Platform offers, like full access to an enterprise-ready Kubernetes environment and an extensive compatibility test matrix with many of the software elements that you might use in your data centre. -It is mature, reliable, and well-documented. +First, consult Red Hat's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Red Hat OpenShift**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -Over time, we ran into issues. Its architecture required compaction (uses [LSMT](https://en.wikipedia.org/wiki/Log-structured_merge-tree)), which caused random latency spikes. It handles generic keys, while we only use it for sequential IDs. Having lots of configuration options makes it versatile, but accurately tuning it was a headache. Finally, interoperating with C++ slowed us down (although we will still support it for quite some time 😭). +### More on OpenShift Kubernetes Engine -While there are already some good options written in Rust that we could leverage, we needed something custom. Nothing out there fit our needs in the way we wanted. We didn’t require generic keys. We wanted full control over when and which data was written and flushed. Our system already has crash recovery mechanisms built-in. Online compaction isn’t a priority, we already have optimizers for that. Debugging misconfigurations was not a great use of our time. +- [Getting Started with Red Hat OpenShift Kubernetes](https://docs.openshift.com/container-platform/4.15/getting_started/kubernetes-overview.html) +- [Red Hat OpenShift Kubernetes Documentation](https://docs.openshift.com/container-platform/4.15/welcome/index.html) +- [Installing on Container Platforms](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.5/html/installing/index) -So we built our own storage. As of [**Qdrant Version 1.13**](https://qdrant.tech/blog/qdrant-1.13.x/), we are using Gridstore for **payload and sparse vector storages**. +Qdrant databases need a persistent storage solution. See [Openshift Storage Overview](https://docs.openshift.com/container-platform/4.15/storage/index.html). -![Gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore.png) +To allow vertical scaling, you need a StorageClass with [volume expansion enabled](https://docs.openshift.com/container-platform/4.15/storage/expanding-persistent-volumes.html). -Simple, efficient, and designed just for Qdrant. +To allow backups and restores, your OpenShift cluster needs the [CSI snapshot controller](https://docs.openshift.com/container-platform/4.15/storage/container_storage_interface/persistent-storage-csi-snapshots.html), and you need to create a VolumeSnapshotClass. -#### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#in-this-article-youll-learn-about) In this article, you’ll learn about: +![Scaleway](/documentation/cloud/cloud-providers/scaleway.jpg) -- **How Gridstore works** – a deep dive into its architecture and mechanics. -- **Why we built it this way** – the key design decisions that shaped it. -- **Rigorous testing** – how we ensured the new storage is production-ready. -- **Performance benchmarks** – official metrics that demonstrate its efficiency. +## Scaleway -**Our first challenge?** Figuring out the best way to handle sequential keys and variable-sized data. 
+[Scaleway Kapsule](https://www.scaleway.com/en/kubernetes-kapsule/) and [Kosmos](https://www.scaleway.com/en/kubernetes-kosmos/) are managed Kubernetes services from [Scaleway](https://www.scaleway.com/en/). They abstract away the complexities of managing and operating a Kubernetes cluster. The primary difference being, Kapsule clusters are composed solely of Scaleway Instances. Whereas, a Kosmos cluster is a managed multi-cloud Kubernetes engine that allows you to connect instances from any cloud provider to a single managed Control-Plane. -## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#gridstore-architecture-three-main-components) Gridstore Architecture: Three Main Components +First, consult Scaleway's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Scaleway**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -![gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore-2.png) +### More on Scaleway Kubernetes -Gridstore’s architecture is built around three key components that enable fast lookups and efficient space management: +- [Getting Started with Scaleway Kubernetes](https://www.scaleway.com/en/docs/containers/kubernetes/quickstart/#how-to-add-a-scaleway-pool-to-a-kubernetes-cluster) +- [Scaleway Kubernetes Documentation](https://www.scaleway.com/en/docs/containers/kubernetes/) +- [Frequently Asked Questions on Scaleway Kubernetes](https://www.scaleway.com/en/docs/faq/kubernetes/) -| Component | Description | -| --- | --- | -| The Data Layer | Stores values in fixed-sized blocks and retrieves them using a pointer-based lookup system. | -| The Mask Layer | Uses a bitmask to track which blocks are in use and which are available. | -| The Gaps Layer | Manages block availability at a higher level, allowing for quick space allocation. | +![STACKIT](/documentation/cloud/cloud-providers/stackit.jpg) -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#1-the-data-layer-for-fast-retrieval) 1\. The Data Layer for Fast Retrieval +## STACKIT -At the core of Gridstore is **The Data Layer**, which is designed to store and retrieve values quickly based on their keys. This layer allows us to do efficient reads and lets us store variable-sized data. The main two components of this layer are **The Tracker** and **The Data Grid**. +[STACKIT Kubernetes Engine (SKE)](https://www.stackit.de/en/product/kubernetes/) is a robust, scalable, and managed Kubernetes service. SKE supplies a CNCF-compliant Kubernetes cluster and makes it easy to provide standard Kubernetes applications and containerized workloads. User-defined Kubernetes clusters can be created as self-service without complications using the STACKIT Portal. -Since internal IDs are always sequential integers (0, 1, 2, 3, 4, 
), the tracker is an array of pointers, where each pointer tells the system exactly where a value starts and how long it is. +First, consult STACKIT's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on STACKIT**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -![The Data Layer](https://qdrant.tech/articles_data/gridstore-key-value-storage/data-layer.png) +### More on STACKIT Kubernetes Engine -The Data Layer uses an array of pointers to quickly retrieve data. +- [Getting Started with SKE](https://docs.stackit.cloud/stackit/en/getting-started-ske-10125565.html) +- [SKE Tutorials](https://docs.stackit.cloud/stackit/en/tutorials-ske-66683162.html) +- [Frequently Asked Questions on SKE](https://docs.stackit.cloud/stackit/en/faq-known-issues-of-ske-28476393.html) -This makes lookups incredibly fast. For example, finding key 3 is just a matter of jumping to the third position in the tracker, and following the pointer to find the value in the data grid. +To allow backups and restores, you need to create a VolumeSnapshotClass: -However, because values are of variable size, the data itself is stored separately in a grid of fixed-sized blocks, which are grouped into larger page files. The fixed size of each block is usually 128 bytes. When inserting a value, Gridstore allocates one or more consecutive blocks to store it, ensuring that each block only holds data from a single value. +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: csi-snapclass +deletionPolicy: Delete +driver: cinder.csi.openstack.org +``` -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#2-the-mask-layer-reuses-space) 2\. The Mask Layer Reuses Space +![Vultr](/documentation/cloud/cloud-providers/vultr.jpg) -**The Mask Layer** helps Gridstore handle updates and deletions without the need for expensive data compaction. Instead of maintaining complex metadata for each block, Gridstore tracks usage with a bitmask, where each bit represents a block, with 1 for used, 0 for free. +## Vultr -![The Mask Layer](https://qdrant.tech/articles_data/gridstore-key-value-storage/mask-layer.png) +[Vultr Kubernetes Engine (VKE)](https://www.vultr.com/kubernetes/) is a fully-managed product offering with predictable pricing that makes Kubernetes easy to use. Vultr manages the control plane and worker nodes and provides integration with other managed services such as Load Balancers, Block Storage, and DNS. -The bitmask efficiently tracks block usage. +First, consult Vultr's managed Kubernetes instructions below. Then, **to set up Qdrant Hybrid Cloud on Vultr**, follow our [step-by-step documentation](/documentation/hybrid-cloud/hybrid-cloud-setup/). -This makes it easy to determine where new values can be written. When a value is removed, it gets soft-deleted at its pointer, and the corresponding blocks in the bitmask are marked as available. Similarly, when updating a value, the new version is written elsewhere, and the old blocks are freed at the bitmask. +### More on Vultr Kubernetes Engine -This approach ensures that Gridstore doesn’t waste space. As the storage grows, however, scanning for available blocks in the entire bitmask can become computationally expensive. 
+- [VKE Guide](https://docs.vultr.com/vultr-kubernetes-engine)
+- [VKE Documentation](https://docs.vultr.com/)
+- [Frequently Asked Questions on VKE](https://docs.vultr.com/vultr-kubernetes-engine#frequently-asked-questions)
-### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#3-the-gaps-layer-for-effective-updates) 3\. The Gaps Layer for Effective Updates
+At the time of writing, Vultr does not support CSI Volume Snapshots.
-To further optimize update handling, Gridstore introduces **The Gaps Layer**, which provides a higher-level view of block availability.
+![Kubernetes](/documentation/cloud/cloud-providers/kubernetes.jpg)
-Instead of scanning the entire bitmask, Gridstore splits the bitmask into regions and keeps track of the largest contiguous free space within each region, known as **The Region Gap**. By also storing the leading and trailing gaps of each region, the system can efficiently combine multiple regions when needed for storing large values.
+## Generic Kubernetes Support (on-premises, cloud, edge)
-![The Gaps Layer](https://qdrant.tech/articles_data/gridstore-key-value-storage/architecture.png)
+Qdrant Hybrid Cloud works with any Kubernetes cluster that meets the [standard compliance](https://www.cncf.io/training/certification/software-conformance/) requirements.
-The complete architecture of Gridstore
+This includes, for example:
-This layered approach allows Gridstore to locate available space quickly, scaling down the work required for scans while keeping memory overhead minimal. With this system, finding storage space for new values requires scanning only a tiny fraction of the total metadata, making updates and insertions highly efficient, even in large segments.
+- [VMWare Tanzu](https://tanzu.vmware.com/kubernetes-grid)
+- [Red Hat OpenShift](https://www.openshift.com/)
+- [SUSE Rancher](https://www.rancher.com/)
+- [Canonical Kubernetes](https://ubuntu.com/kubernetes)
+- [RKE](https://rancher.com/docs/rke/latest/en/)
+- [RKE2](https://docs.rke2.io/)
+- [K3s](https://k3s.io/)
-Given the default configuration, the gaps layer is scoped out in a millionth fraction of the actual storage size. This means that for each 1GB of data, the gaps layer only requires scanning 6KB of metadata. With this mechanism, the other operations can be executed in virtually constant-time complexity.
+Qdrant databases need persistent block storage. Most storage solutions provide a CSI driver that can be used with Kubernetes. See [CSI drivers](https://kubernetes-csi.github.io/docs/drivers.html) for more information.
-## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#gridstore-in-production-maintaining-data-integrity) Gridstore in Production: Maintaining Data Integrity
+To allow vertical scaling, you need a StorageClass with volume expansion enabled. See [Volume Expansion](https://kubernetes.io/docs/concepts/storage/storage-classes/#allow-volume-expansion) for more information.
-![gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore-1.png)
+To allow backups and restores, your CSI driver needs to support volume snapshots, and your cluster needs the CSI VolumeSnapshot controller and a VolumeSnapshotClass. See [CSI Volume Snapshots](https://kubernetes-csi.github.io/docs/snapshot-controller.html) for more information.
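+Once a VolumeSnapshotClass such as the `csi-snapclass` examples above exists, a quick way to verify that the snapshot machinery works is to create a VolumeSnapshot against one of the cluster's persistent volume claims. The names below are placeholders; point the snapshot at an actual PVC in your namespace:
+
+```yaml
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: qdrant-data-snapshot                            # example name
+spec:
+  volumeSnapshotClassName: csi-snapclass
+  source:
+    persistentVolumeClaimName: qdrant-storage-qdrant-0  # placeholder PVC name
+```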
-Gridstore’s architecture introduces multiple interdependent structures that must remain in sync to ensure data integrity: +## Next Steps -- **The Data Layer** holds the data and associates each key with its location in storage, including page ID, block offset, and the size of its value. -- **The Mask Layer** keeps track of which blocks are occupied and which are free. -- **The Gaps Layer** provides an indexed view of free blocks for efficient space allocation. +Once you've got a Kubernetes cluster deployed on a platform of your choosing, you can begin setting up Qdrant Hybrid Cloud. Head to our Qdrant Hybrid Cloud [setup guide](/documentation/hybrid-cloud/hybrid-cloud-setup/) for instructions. -Every time a new value is inserted or an existing value is updated, all these components need to be modified in a coordinated way. +<|page-128-lllmstxt|> +# Build a GraphRAG Agent with Neo4j and Qdrant -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#when-things-break-in-real-life) When Things Break in Real Life +![image0](/documentation/examples/graphrag-qdrant-neo4j/image0.png) -Real-world systems don’t operate in a vacuum. Failures happen: software bugs cause unexpected crashes, memory exhaustion forces processes to terminate, disks fail to persist data reliably, and power losses can interrupt operations at any moment. +| Time: 30 min | Level: Intermediate |Output: [GitHub](https://github.com/qdrant/examples/blob/master/graphrag_neo4j/graphrag.py)| +| --- | ----------- | ----------- | -_The critical question is: what happens if a failure occurs while updating these structures?_ +To make Artificial Intelligence (AI) systems more intelligent and reliable, we face a paradox: Large Language Models (LLMs) possess remarkable reasoning capabilities, yet they struggle to connect information in ways humans find intuitive. While groundbreaking, Retrieval-Augmented Generation (RAG) approaches often fall short when tasked with complex information synthesis. When asked to connect disparate pieces of information or understand holistic concepts across large documents, these systems frequently miss crucial connections that would be obvious to human experts. -If one component is updated but another isn’t, the entire system could become inconsistent. Worse, if an operation is only partially written to disk, it could lead to orphaned data, unusable space, or even data corruption. +To solve these problems, Microsoft introduced **GraphRAG,** which uses Knowledge Graphs (KGs) instead of vectors as a context for LLMs. GraphRAG depends mainly on LLMs for creating KGs and querying them. However, this reliance on LLMs can lead to many problems. We will address these challenges by combining vector databases with graph-based databases. -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#stability-through-idempotency-recovering-with-wal) Stability Through Idempotency: Recovering With WAL +This tutorial will demonstrate how to build a GraphRAG system with vector search using Neo4j and Qdrant. -To guard against these risks, Qdrant relies on a [**Write-Ahead Log (WAL)**](https://qdrant.tech/documentation/concepts/storage/). Before committing an operation, Qdrant ensures that it is at least recorded in the WAL. If a crash happens before all updates are flushed, the system can safely replay operations from the log. 
+|Additional Materials| +|-| +|This advanced tutorial is based on our original integration doc: [**Neo4j - Qdrant Integration**](/documentation/frameworks/neo4j-graphrag/)| +|The output for this tutorial is in our GitHub Examples repo: [**Neo4j - Qdrant Agent in Python**](https://github.com/qdrant/examples/blob/master/graphrag_neo4j/graphrag.py) -This recovery mechanism introduces another essential property: [**idempotence**](https://en.wikipedia.org/wiki/Idempotence). +## Watch the Video -The storage system must be designed so that reapplying the same operation after a failure leads to the same final state as if the operation had been applied just once. +

-### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#the-grand-solution-lazy-updates) The Grand Solution: Lazy Updates +# RAG & Its Challenges -To achieve this, **Gridstore completes updates lazily**, prioritizing the most critical part of the write: the data itself. +[RAG](https://qdrant.tech/rag/) combines retrieval-based and generative AI to enhance LLMs with relevant, up-to-date information from a knowledge base, like a vector database. However, RAG faces several challenges: -| | -| --- | -| 👉 Instead of immediately updating all metadata structures, it writes the new value first while keeping lightweight pending changes in a buffer. | -| 👉 The system only finalizes these updates when explicitly requested, ensuring that a crash never results in marking data as deleted before the update has been safely persisted. | -| 👉 In the worst-case scenario, Gridstore may need to write the same data twice, leading to a minor space overhead, but it will never corrupt the storage by overwriting valid data. | +1. **Understanding Context:** Models may misinterpret queries, particularly when the context is complex or ambiguous, leading to incorrect or irrelevant answers. +2. **Balancing Similarity vs. Relevance:** RAG systems can struggle to ensure that retrieved information is similar and contextually relevant. +3. **Answer Completeness:** Traditional RAGs might not be able to capture all relevant details for complex queries that require LLMs to find relationships in the context that are not explicitly present. -## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#how-we-tested-the-final-product) How We Tested the Final Product +# Introduction to GraphRAG -![gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore-3.png) +Unlike RAG, which typically relies on document retrieval, GraphRAG builds knowledge graphs (KGs) to capture entities and their relationships. For datasets or use cases that demand human-level intelligence from an AI system, GraphRAG offers a promising solution: -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#first-model-testing) First
 Model Testing +- It can follow chains of relationships to answer complex queries, making it suitable for better reasoning beyond simple document retrieval. +- The graph structure allows a deeper understanding of the context, leading to more accurate and relevant responses. -Gridstore can be tested efficiently using model testing, which compares its behavior to a simple in-memory hash map. Since Gridstore should function like a persisted hash map, this method quickly detects inconsistencies. +The workflow of GraphRAG is as follows: -The process is straightforward: +1. The LLM analyzes the dataset to identify entities (people, places, organizations) and their relationships, creating a comprehensive knowledge graph where entities are nodes and their connections form edges. +2. A bottom-up clustering algorithm organizes the KG into hierarchical semantic groups. This creates meaningful segments of related information, enabling understanding at different levels of abstraction. +3. GraphRAG uses both the KG and semantic clusters to select a relevant context for the LLM when answering queries. -1. Initialize a Gridstore instance and an empty hash map. -2. Run random operations (put, delete, update) on both. -3. Verify that results match after each operation. -4. Compare all keys and values to ensure consistency. +![image2](/documentation/examples/graphrag-qdrant-neo4j/image2.png) -This approach provides high test coverage, exposing issues like incorrect persistence or faulty deletions. Running large-scale model tests ensures Gridstore remains reliable in real-world use. +[Fig](https://arxiv.org/pdf/2404.16130) 1: A Complete Picture of GraphRAG Ingestion and Retrieval -Here is a naive way to generate operations in Rust. +### Challenges of GraphRAG -```rust +Despite its advantages, the LLM-centric GraphRAG approach faces several challenges: -enum Operation { - Put(PointOffset, Payload), - Delete(PointOffset), - Update(PointOffset, Payload), -} +- **KG Construction with LLMs:** Since the LLM is responsible for constructing the knowledge graph, there are risks such as inconsistencies, propagation of biases or errors, and lack of control over the ontology used. However, we used a LLM to extract the ontology in our implementation. +- **Querying KG with LLMs:** Once the graph is constructed, an LLM translates the human query into Cypher (Neo4j's declarative query language). However, crafting complex queries in Cypher may result in inaccurate outcomes. +- **Scalability & Cost Consideration:** To be practical, applications must be both scalable and cost-effective. Relying on LLMs increases costs and decreases scalability, as they are used every time data is added, queried, or generated. -impl Operation { - fn random(rng: &mut impl Rng, max_point_offset: u32) -> Self { - let point_offset = rng.random_range(0..=max_point_offset); - let operation = rng.gen_range(0..3); - match operation { - 0 => { - let size_factor = rng.random_range(1..10); - let payload = random_payload(rng, size_factor); - Operation::Put(point_offset, payload) - } - 1 => Operation::Delete(point_offset), - 2 => { - let size_factor = rng.random_range(1..10); - let payload = random_payload(rng, size_factor); - Operation::Update(point_offset, payload) - } - _ => unreachable!(), - } - } -} +To address these challenges, a more controlled and structured knowledge representation system may be required for GraphRAG to function optimally at scale. 
-``` +# Architecture Overview -Model testing is a high-value way to catch bugs, especially when your system mimics a well-defined component like a hash map. If your component behaves the same as another one, using model testing brings a lot of value for a bit of effort. +The architecture has two main components: **Ingestion** and **Retrieval & Generation**. Ingestion processes raw data into structured knowledge and vector representations, while Retrieval and Generation enable efficient querying and response generation. -We could have tested against RocksDB, but simplicity matters more. A simple hash map lets us run massive test sequences quickly, exposing issues faster. +This process is divided into two steps: **Ingestion**, where data is prepared and stored, and **Retrieval and Generation**, where the prepared data is queried and utilized. Let’s start with Ingestion. -For even sharper debugging, Property-Based Testing adds automated test generation and shrinking. It pinpoints failures with minimalized test cases, making bug hunting faster and more effective. +## Ingestion -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#crash-testing-can-gridstore-handle-the-pressure) Crash Testing: Can Gridstore Handle the Pressure? +The GraphRAG ingestion pipeline combines a **Graph Database** and a **Vector Database** to improve RAG workflows. -Designing for crash resilience is one thing, and proving it works under stress is another. To push Qdrant’s data integrity to the limit, we built [**Crasher**](https://github.com/qdrant/crasher), a test bench that brutally kills and restarts Qdrant while it handles a heavy update workload. +![image1](/documentation/examples/graphrag-qdrant-neo4j/image1.png) -Crasher runs a loop that continuously writes data, then randomly crashes Qdrant. On each restart, Qdrant replays its [**Write-Ahead Log (WAL)**](https://qdrant.tech/documentation/concepts/storage/), and we verify if data integrity holds. Possible anomalies include: +Fig 2: Overview of Ingestion Pipeline -- Missing data (points, vectors, or payloads) -- Corrupt payload values +Let’s break it down: -This aggressive yet simple approach has uncovered real-world issues when run for extended periods. While we also use chaos testing for distributed setups, Crasher excels at fast, repeatable failure testing in a local environment. +1. **Raw Data:** Serves as the foundation, comprising unstructured or structured content. +2. **Ontology Creation:** An **LLM** processes the raw data into an **ontology**, structuring entities, relationships, and hierarchies. Better approaches exist to extracting more structured information from raw data, like using NER to identify the names of people, organizations, and places. Unlike LLMs, this method creates. +3. **Graph Database:** The ontology is stored in a **Graph database** to capture complex relationships. +4. **Vector Embeddings:** An **Embedding model** converts the raw data into high-dimensional vectors capturing semantic similarities. +5. **Vector Database:** These embeddings are stored in a **Vector database** for similarity-based retrieval. +6. **Database Interlinking:** The **Graph database** (e.g., Neo4j) and **Vector database** (e.g., Qdrant) share unique IDs, enabling cross-referencing between ontology-based and vector-based results. 
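+The shared IDs in step 6 are the glue between the two stores. The following minimal sketch (the connection placeholders, collection name, and `Entity` label are illustrative assumptions, not part of the reference implementation) shows one way to write the same UUID to both databases:
+
+```python
+import uuid
+from neo4j import GraphDatabase
+from qdrant_client import QdrantClient, models
+
+# Placeholder connection details -- replace with your own cluster credentials.
+neo4j_driver = GraphDatabase.driver("neo4j+s://<aura-uri>", auth=("neo4j", "<password>"))
+qdrant = QdrantClient(url="<qdrant-cluster-url>", api_key="<qdrant-api-key>")
+COLLECTION = "graphrag_chunks"  # assumed collection name
+
+def link_chunk(text: str, entity_name: str, vector: list[float]) -> str:
+    """Store a chunk in Qdrant and its entity in Neo4j under the same ID."""
+    shared_id = str(uuid.uuid4())
+    # The Qdrant point ID...
+    qdrant.upsert(
+        collection_name=COLLECTION,
+        points=[models.PointStruct(id=shared_id, vector=vector, payload={"text": text})],
+    )
+    # ...matches the `id` property on the Neo4j node, so a semantic hit in
+    # Qdrant can be expanded into graph context with a single Cypher lookup.
+    with neo4j_driver.session() as session:
+        session.run(
+            "MERGE (e:Entity {id: $id}) SET e.name = $name",
+            id=shared_id,
+            name=entity_name,
+        )
+    return shared_id
+```
+
+At query time the same ID travels in the opposite direction: the top Qdrant hits yield IDs that are passed into a Cypher query to pull the surrounding graph context.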
-## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#testing-gridstore-performance-benchmarks) Testing Gridstore Performance: Benchmarks +## Retrieval & Generation -![gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore-4.png) +The **Retrieval and Generation** process is designed to handle user queries by leveraging both semantic search and graph-based context extraction. -To measure the impact of our new storage engine, we used [**Bustle, a key-value storage benchmarking framework**](https://github.com/jonhoo/bustle), to compare Gridstore against RocksDB. We tested three workloads: +![image3](/documentation/examples/graphrag-qdrant-neo4j/image3.png) -| Workload Type | Operation Distribution | -| --- | --- | -| Read-heavy | 95% reads | -| Insert-heavy | 80% inserts | -| Update-heavy | 50% updates | +Fig 3: Overview of Retrieval and Generation Pipeline -#### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#the-results-speak-for-themselves) The results speak for themselves: +The architecture can be broken down into the following steps: -Average latency for all kinds of workloads is lower across the board, particularly for inserts. +1. **Query Vectorization:** An embedding model converts The user query into a high-dimensional vector. +2. **Semantic Search:** The vector performs a similarity-based search in the **Vector database**, retrieving relevant documents or entries. +3. **ID Extraction:** Extracted IDs from the semantic search results are used to query the **Graph database**. +4. **Graph Context Retrieval:** The **Graph database** provides contextual information, including relationships and entities linked to the extracted IDs. +5. **Response Generation:** The context retrieved from the graph is passed to an LLM to generate a final response. +6. **Results:** The generated response is returned to the user. -![image.png](https://qdrant.tech/articles_data/gridstore-key-value-storage/1.png) +This architecture combines the strengths of both databases: -This shows a clear boost in performance. As we can see, the investment in Gridstore is paying off. +1. **Semantic Search with Vector Database:** The user query is first processed semantically to identify the most relevant data points without needing explicit keyword matches. +2. **Contextual Expansion with Graph Database:** IDs or entities retrieved from the vector database query the graph database for detailed relationships, enriching the retrieved data with structured context. +3. **Enhanced Generation:** The architecture combines semantic relevance (from the vector database) and graph-based context to enable the LLM to generate more informed, accurate, and contextually rich responses. -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#end-to-end-benchmarking) End-to-End Benchmarking +# Implementation -Now, let’s test the impact on a real Qdrant instance. So far, we’ve only integrated Gridstore for [**payloads**](https://qdrant.tech/documentation/concepts/payload/) and [**sparse vectors**](https://qdrant.tech/documentation/concepts/vectors/#sparse-vectors), but even this partial switch should show noticeable improvements. +We'll walk through a complete pipeline that ingests data into Neo4j and Qdrant, retrieves relevant data, and generates responses using an LLM based on the retrieved graph context. -For benchmarking, we used our in-house [**bfb tool**](https://github.com/qdrant/bfb) to generate a workload. 
Our configuration: +The main components of this pipeline include data ingestion (to Neo4j and Qdrant), retrieval, and generation steps. -```json -bfb -n 2000000 --max-id 1000000 \ - --sparse-vectors 0.02 \ - --set-payload \ - --on-disk-payload \ - --dim 1 \ - --sparse-dim 5000 \ - --bool-payloads \ - --keywords 100 \ - --float-payloads true \ - --int-payloads 100000 \ - --text-payloads \ - --text-payload-length 512 \ - --skip-field-indices \ - --jsonl-updates ./rps.jsonl +## Prerequisites -``` +These are the tutorial prerequisites, which are divided into setup, imports, and initialization of the two DBs. -This benchmark upserts 1 million points twice. Each point has: +### Setup -- A medium to large payload -- A tiny dense vector (dense vectors use a different storage type) -- A sparse vector +Let’s start with setting up instances with Qdrant and Neo4j. -* * * +### Qdrant Setup -#### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#additional-configuration) Additional configuration: +To create a Qdrant instance, you can use their **managed service** (Qdrant Cloud) or set up a self-hosted cluster. For simplicity, we will use Qdrant cloud: -1. The test we conducted updated payload data separately in another request. +- Go to [Qdrant Cloud](https://qdrant.tech/) and sign up or log in. +- Once logged in, click on **Create New Cluster**. +- Follow the on-screen instructions to create your cluster. +- Once your cluster is created, you'll be given a **Cluster URL** and **API Key**, which you will use in the client to interact with Qdrant. -2. There were no payload indices, which ensured we measured pure ingestion speed. +### Neo4j Setup -3. Finally, we gathered request latency metrics for analysis. +To set up a Neo4j instance, you can use **Neo4j Aura** (cloud service) or host it yourself. We will use Neo4j Aura: +- Go to Neo4j Aura and sign up/log in. +- After setting up, an instance will be created if it is the first time. +- After the database is set up, you’ll receive a **connection URI**, **username**, and **password**. -* * * +We can add the following in the .env file for security purposes. -We ran this against Qdrant 1.12.6, toggling between the old and new storage backends. +### Imports -### [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#final-result) Final Result +First, we import the required libraries for working with Neo4j, Qdrant, OpenAI, and other utility functions. -Data ingestion is **twice as fast and with a smoother throughput** — a massive win! 😍 +```python +from neo4j import GraphDatabase +from qdrant_client import QdrantClient, models +from dotenv import load_dotenv +from pydantic import BaseModel +from openai import OpenAI +from collections import defaultdict +from neo4j_graphrag.retrievers import QdrantNeo4jRetriever +import uuid +import os +``` -![image.png](https://qdrant.tech/articles_data/gridstore-key-value-storage/2.png) +--- -We optimized for speed, and it paid off—but what about storage size? +- **Neo4j:** Used to store and query the graph database. +- **Qdrant:** A vector database used for semantic similarity search. +- **dotenv:** Loads environment variables for credentials and API keys. +- **Pydantic:** Ensures data is structured properly when interacting with the graph data. +- **OpenAI:** Interfaces with the OpenAI API to generate responses and embeddings. +- **neo4j_graphrag:** A helper package to retrieve data from both Qdrant and Neo4j. 
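If you are following along, these libraries can typically be installed from PyPI. The package names below are inferred from the imports above (for example, the `dotenv` module is provided by `python-dotenv`) and may differ slightly in your environment:

```bash
pip install neo4j qdrant-client python-dotenv pydantic openai neo4j-graphrag
```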
-- Gridstore: 2333MB -- RocksDB: 2319MB +### Setting Up Environment Variables + +Before initializing the clients, we load the necessary credentials from environment variables. + +```python +# Load environment variables +load_dotenv() + +# Get credentials from environment variables +qdrant_key = os.getenv("QDRANT_KEY") +qdrant_url = os.getenv("QDRANT_URL") +neo4j_uri = os.getenv("NEO4J_URI") +neo4j_username = os.getenv("NEO4J_USERNAME") +neo4j_password = os.getenv("NEO4J_PASSWORD") +openai_key = os.getenv("OPENAI_API_KEY") +``` -Strictly speaking, RocksDB is slightly smaller, but the difference is negligible compared to the 2x faster ingestion and more stable throughput. A small trade-off for a big performance gain! +--- -## [Anchor](https://qdrant.tech/articles/gridstore-key-value-storage/\#trying-out-gridstore) Trying Out Gridstore +This ensures that sensitive information (like API keys and database credentials) is securely stored in environment variables. -Gridstore represents a significant advancement in how Qdrant manages its **key-value storage** needs. It offers great performance and streamlined updates tailored specifically for our use case. We have managed to achieve faster, more reliable data ingestion while maintaining data integrity, even under heavy workloads and unexpected failures. It is already used as a storage backend for on-disk payloads and sparse vectors. +### Initializing Neo4j and Qdrant Clients -👉 It’s important to note that Gridstore remains tightly integrated with Qdrant and, as such, has not been released as a standalone crate. +Now, we initialize the Neo4j and Qdrant clients using the credentials. -Its API is still evolving, and we are focused on refining it within our ecosystem to ensure maximum stability and performance. That said, we recognize the value this innovation could bring to the wider Rust community. In the future, once the API stabilizes and we decouple it enough from Qdrant, we will consider publishing it as a contribution to the community ❀. +```python +# Initialize Neo4j driver +neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) -For now, Gridstore continues to drive improvements in Qdrant, demonstrating the benefits of a custom-tailored storage engine designed with modern demands in mind. Stay tuned for further updates and potential community releases as we keep pushing the boundaries of performance and reliability. +# Initialize Qdrant client +qdrant_client = QdrantClient( + url=qdrant_url, + api_key=qdrant_key +) +``` -![Gridstore](https://qdrant.tech/articles_data/gridstore-key-value-storage/gridstore.png) +--- -Simple, efficient, and designed just for Qdrant. +- **Neo4j:** We set up a connection to the Neo4j graph database. +- **Qdrant:** We initialize the connection to the Qdrant vector store. -##### Was this page useful? +This will connect with Neo4j and Qdrant, and we can now start with Ingestion. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Ingestion -Thank you for your feedback! 🙏 +We will follow the workflow of the ingestion pipeline presented in the architecture section. Let’s examine it implementation-wise. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/gridstore-key-value-storage.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+### Defining Output Parser -On this page: +The single and GraphComponents classes structure the LLM's responses into a usable format. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/gridstore-key-value-storage.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python +class single(BaseModel): + node: str + target_node: str + relationship: str -× +class GraphComponents(BaseModel): + graph: list[single] +``` -[Powered by](https://qdrant.tech/) +--- -<|page-88-lllmstxt|> -## vector-search-filtering -- [Articles](https://qdrant.tech/articles/) -- A Complete Guide to Filtering in Vector Search +These classes help ensure that data from the OpenAI LLM is parsed correctly into the graph components (nodes and relationships). -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +### Defining OpenAI Client and LLM Parser Function -# A Complete Guide to Filtering in Vector Search +We now initialize the OpenAI client and define a function to send prompts to the LLM and parse its responses. -Sabrina Aquino, David Myriel +```python +client = OpenAI() -· +def openai_llm_parser(prompt): + completion = client.chat.completions.create( + model="gpt-4o-2024-08-06", + response_format={"type": "json_object"}, + messages=[ + { + "role": "system", + "content": + + """ You are a precise graph relationship extractor. Extract all + relationships from the text and format them as a JSON object + with this exact structure: + { + "graph": [ + {"node": "Person/Entity", + "target_node": "Related Entity", + "relationship": "Type of Relationship"}, + ...more relationships... + ] + } + Include ALL relationships mentioned in the text, including + implicit ones. Be thorough and precise. """ + + }, + { + "role": "user", + "content": prompt + } + ] + ) + + return GraphComponents.model_validate_json(completion.choices[0].message.content) + +``` +--- -September 10, 2024 +This function sends a prompt to the LLM, asking it to extract graph components (nodes and relationships) from the provided text. The response is parsed into structured graph data. -![A Complete Guide to Filtering in Vector Search](https://qdrant.tech/articles_data/vector-search-filtering/preview/title.jpg) +### Extracting Graph Components -Imagine you sell computer hardware. To help shoppers easily find products on your website, you need to have a **user-friendly [search engine](https://qdrant.tech/)**. +The function extract_graph_components processes raw data, extracting the nodes and relationships as graph components. -![vector-search-ecommerce](https://qdrant.tech/articles_data/vector-search-filtering/vector-search-ecommerce.png) +```python +def extract_graph_components(raw_data): + prompt = f"Extract nodes and relationships from the following text:\n{raw_data}" -If you’re selling computers and have extensive data on laptops, desktops, and accessories, your search feature should guide customers to the exact device they want - or at least a **very similar** match. 
+ parsed_response = openai_llm_parser(prompt) # Assuming this returns a list of dictionaries + parsed_response = parsed_response.graph # Assuming the 'graph' structure is a key in the parsed response -When storing data in Qdrant, each product is a point, consisting of an `id`, a `vector` and `payload`: + nodes = {} + relationships = [] -```json -{ - "id": 1, - "vector": [0.1, 0.2, 0.3, 0.4], - "payload": { - "price": 899.99, - "category": "laptop" - } -} + for entry in parsed_response: + node = entry.node + target_node = entry.target_node # Get target node if available + relationship = entry.relationship # Get relationship if available -``` + # Add nodes to the dictionary with a unique ID + if node not in nodes: + nodes[node] = str(uuid.uuid4()) -The `id` is a unique identifier for the point in your collection. The `vector` is a mathematical representation of similarity to other points in the collection. -Finally, the `payload` holds metadata that directly describes the point. + if target_node and target_node not in nodes: + nodes[target_node] = str(uuid.uuid4()) -Though we may not be able to decipher the vector, we are able to derive additional information about the item from its metadata, In this specific case, **we are looking at a data point for a laptop that costs $899.99**. + # Add relationship to the relationships list with node IDs + if target_node and relationship: + relationships.append({ + "source": nodes[node], + "target": nodes[target_node], + "type": relationship + }) -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#what-is-filtering) What is filtering? + return nodes, relationships +``` -When searching for the perfect computer, your customers may end up with results that are mathematically similar to the search entry, but not exact. For example, if they are searching for **laptops under $1000**, a simple [vector search](https://qdrant.tech/advanced-search/) without constraints might still show other laptops over $1000. +--- -This is why [semantic search](https://qdrant.tech/advanced-search/) alone **may not be enough**. In order to get the exact result, you would need to enforce a payload filter on the `price`. Only then can you be sure that the search results abide by the chosen characteristic. +This function takes raw data, uses the LLM to parse it into graph components, and then assigns unique IDs to nodes and relationships. -> This is called **filtering** and it is one of the key features of [vector databases](https://qdrant.tech/). +### Ingesting Data to Neo4j -Here is how a **filtered vector search** looks behind the scenes. We’ll cover its mechanics in the following section. +The function ingest_to_neo4j ingests the extracted graph data (nodes and relationships) into Neo4j. -```http -POST /collections/online_store/points/search -{ - "vector": [ 0.2, 0.1, 0.9, 0.7 ], - "filter": { - "must": [\ - {\ - "key": "category",\ - "match": { "value": "laptop" }\ - },\ - {\ - "key": "price",\ - "range": {\ - "gt": null,\ - "gte": null,\ - "lt": null,\ - "lte": 1000\ - }\ - }\ - ] - }, - "limit": 3, - "with_payload": true, - "with_vector": false -} +```python +def ingest_to_neo4j(nodes, relationships): + """ + Ingest nodes and relationships into Neo4j. 
+ """ -``` + with neo4j_driver.session() as session: + # Create nodes in Neo4j + for name, node_id in nodes.items(): + session.run( + "CREATE (n:Entity {id: $id, name: $name})", + id=node_id, + name=name + ) -The filtered result will be a combination of the semantic search and the filtering conditions imposed upon the query. In the following pages, we will show that **filtering is a key practice in vector search for two reasons:** + # Create relationships in Neo4j + for relationship in relationships: + session.run( + "MATCH (a:Entity {id: $source_id}), (b:Entity {id: $target_id}) " + "CREATE (a)-[:RELATIONSHIP {type: $type}]->(b)", + source_id=relationship["source"], + target_id=relationship["target"], + type=relationship["type"] + ) -1. With filtering in Qdrant, you can **dramatically increase search precision**. More on this in the next section. + return nodes +``` -2. Filtering helps control resources and **reduce compute use**. More on this in [**Payload Indexing**](https://qdrant.tech/articles/vector-search-filtering/#filtering-with-the-payload-index). +--- -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#what-you-will-learn-in-this-guide) What you will learn in this guide: +Here, we create nodes and relationships in the Neo4j graph database. Nodes are entities, and relationships link these entities. -In [vector search](https://qdrant.tech/advanced-search/), filtering and sorting are more interdependent than they are in traditional databases. While databases like SQL use commands such as `WHERE` and `ORDER BY`, the interplay between these processes in vector search is a bit more complex. +This will ingest the data into Neo4j and on a sample dataset it looks something like this: -Most people use default settings and build vector search apps that aren’t properly configured or even setup for precise retrieval. In this guide, we will show you how to **use filtering to get the most out of vector search** with some basic and advanced strategies that are easy to implement. +![image4](/documentation/examples/graphrag-qdrant-neo4j/image4.png) -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#remember-to-run-all-tutorial-code-in-qdrants-dashboard) Remember to run all tutorial code in Qdrant’s Dashboard +Fig 4: Visualization of the Knowledge Graph -The easiest way to reach that “Hello World” moment is to [**try filtering in a live cluster**](https://qdrant.tech/documentation/quickstart-cloud/). Our interactive tutorial will show you how to create a cluster, add data and try some filtering clauses. +Let's explore how to map nodes with their IDs and integrate this information, along with vectors, into Qdrant. First, let’s create a Qdrant collection. -![qdrant-filtering-tutorial](https://qdrant.tech/articles_data/vector-search-filtering/qdrant-filtering-tutorial.png) +### Creating Qdrant Collection -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#qdrants-approach-to-filtering) Qdrant’s approach to filtering +You can create a collection once you have set up your Qdrant instance. A collection in Qdrant holds vectors for search and retrieval. -Qdrant follows a specific method of searching and filtering through dense vectors. +```python +def create_collection(client, collection_name, vector_dimension): +``` -Let’s take a look at this **3-stage diagram**. In this case, we are trying to find the nearest neighbour to the query vector **(green)**. Your search journey starts at the bottom **(orange)**. 
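The `create_collection` listing is split into fragments here (the `def` line above, a stray `try:`, and the body further down). Read together, a consolidated sketch of the same logic, assuming the `qdrant_client` and `models` imports from the setup section, looks like this:

```python
def create_collection(client, collection_name, vector_dimension):
    # Try to fetch the collection; if it already exists, there is nothing to do.
    try:
        client.get_collection(collection_name)
        print(f"Skipping creating collection; '{collection_name}' already exists.")
    except Exception as e:
        # If the collection does not exist, the client raises an error and we create it.
        if 'Not found: Collection' in str(e):
            print(f"Collection '{collection_name}' not found. Creating it now...")
            client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=vector_dimension, distance=models.Distance.COSINE
                ),
            )
            print(f"Collection '{collection_name}' created successfully.")
        else:
            print(f"Error while checking collection: {e}")
```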
+try: -By default, Qdrant connects all your data points within the [**vector index**](https://qdrant.tech/documentation/concepts/indexing/). After you [**introduce filters**](https://qdrant.tech/documentation/concepts/filtering/), some data points become disconnected. Vector search can’t cross the grayed out area and it won’t reach the nearest neighbor. -How can we bridge this gap? +```python +# Try to fetch the collection status +try: + collection_info = client.get_collection(collection_name) + print(f"Skipping creating collection; '{collection_name}' already exists.") +except Exception as e: + # If collection does not exist, an error will be thrown, so we create the collection + if 'Not found: Collection' in str(e): + print(f"Collection '{collection_name}' not found. Creating it now...") -**Figure 1:** How Qdrant maintains a filterable vector index. -![filterable-vector-index](https://qdrant.tech/articles_data/vector-search-filtering/filterable-vector-index.png) + client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams(size=vector_dimension, distance=models.Distance.COSINE) + ) -[**Filterable vector index**](https://qdrant.tech/documentation/concepts/indexing/): This technique builds additional links **(orange)** between leftover data points. The filtered points which stay behind are now traversible once again. Qdrant uses special category-based methods to connect these data points. + print(f"Collection '{collection_name}' created successfully.") + else: + print(f"Error while checking collection: {e}") +``` -### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#qdrants-approach-vs-traditional-filtering-methods) Qdrant’s approach vs traditional filtering methods +--- -![stepping-lens](https://qdrant.tech/articles_data/vector-search-filtering/stepping-lens.png) +- **Qdrant Client:** The QdrantClient is used to connect to the Qdrant instance. +- **Creating Collection:** The create_collection function checks if a collection exists. If not, it creates one with a specified vector dimension and distance metric (cosine similarity in this case). -The filterable vector index is Qdrant’s solves pre and post-filtering problems by adding specialized links to the search graph. It aims to maintain the speed advantages of vector search while allowing for precise filtering, addressing the inefficiencies that can occur when applying filters after the vector search. +### Generating Embeddings -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#pre-filtering) Pre-filtering +Next, we define a function that generates embeddings for text using OpenAI's API. -In pre-filtering, a search engine first narrows down the dataset based on chosen metadata values, and then searches within that filtered subset. This reduces unnecessary computation over a dataset that is potentially much larger. +```python +def openai_embeddings(text): + response = client.embeddings.create( + input=text, + model="text-embedding-3-small" + ) + + return response.data[0].embedding +``` -The choice between pre-filtering and using the filterable HNSW index depends on filter cardinality. When metadata cardinality is too low, the filter becomes restrictive and it can disrupt the connections within the graph. This leads to fragmented search paths (as in **Figure 1**). When the semantic search process begins, it won’t be able to travel to those locations. +--- -However, Qdrant still benefits from pre-filtering **under certain conditions**. 
In cases of low cardinality, Qdrant’s query planner stops using HNSW and switches over to the payload index alone. This makes the search process much cheaper and faster than if using HNSW. +This function uses OpenAI's embedding model to transform input text into vector representations. -**Figure 2:** On the user side, this is how filtering looks. We start with five products with different prices. First, the $1000 price **filter** is applied, narrowing down the selection of laptops. Then, a vector search finds the relevant **results** within this filtered set. +### Ingesting into Qdrant -![pre-filtering-vector-search](https://qdrant.tech/articles_data/vector-search-filtering/pre-filtering.png) +Let’s ingest the data into the vector database. -In conclusion, pre-filtering is efficient in specific cases when you use small datasets with low cardinality metadata. However, pre-filtering should not be used over large datasets as it breaks too many links in the HNSW graph, causing lower accuracy. +```python +def ingest_to_qdrant(collection_name, raw_data, node_id_mapping): + embeddings = [openai_embeddings(paragraph) for paragraph in raw_data.split("\n")] -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#post-filtering) Post-filtering + qdrant_client.upsert( + collection_name=collection_name, + points=[ + { + "id": str(uuid.uuid4()), + "vector": embedding, + "payload": {"id": node_id} + } + for node_id, embedding in zip(node_id_mapping.values(), embeddings) + ] + ) +``` -In post-filtering, a search engine first looks for similar vectors and retrieves a larger set of results. Then, it applies filters to those results based on metadata. The problem with post-filtering becomes apparent when using low-cardinality filters. +--- -> When you apply a low-cardinality filter after performing a vector search, you often end up discarding a large portion of the results that the vector search returned. +The ingest_to_qdrant function generates embeddings for each paragraph in the raw data and stores them in a Qdrant collection. It associates each embedding with a unique ID and its corresponding node ID from the node_id_mapping dictionary, ensuring proper linkage for later retrieval. -**Figure 3:** In the same example, we have five laptops. First, the vector search finds the top two relevant **results**, but they may not meet the price match. When the $1000 price **filter** is applied, other potential results are discarded. -![post-filtering-vector-search](https://qdrant.tech/articles_data/vector-search-filtering/post-filtering.png) +--- -The system will waste computational resources by first finding similar vectors and then discarding many that don’t meet the filter criteria. You’re also limited to filtering only from the initial set of [vector search](https://qdrant.tech/advanced-search/) results. If your desired items aren’t in this initial set, you won’t find them, even if they exist in the database. +## Retrieval & Generation -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#basic-filtering-example-ecommerce-and-laptops) Basic filtering example: ecommerce and laptops +In this section, we will create the retrieval and generation engine for the system. -We know that there are three possible laptops that suit our price point. -Let’s see how Qdrant’s filterable vector index works and why it is the best method of capturing all available results. +### Building a Retriever -First, add five new laptops to your online store. 
Here is a sample input: +The retriever integrates vector search and graph data, enabling semantic similarity searches with Qdrant and fetching relevant graph data from Neo4j. This enriches the RAG process and allows for more informed responses. ```python -laptops = [\ - (1, [0.1, 0.2, 0.3, 0.4], {"price": 899.99, "category": "laptop"}),\ - (2, [0.2, 0.3, 0.4, 0.5], {"price": 1299.99, "category": "laptop"}),\ - (3, [0.3, 0.4, 0.5, 0.6], {"price": 799.99, "category": "laptop"}),\ - (4, [0.4, 0.5, 0.6, 0.7], {"price": 1099.99, "category": "laptop"}),\ - (5, [0.5, 0.6, 0.7, 0.8], {"price": 949.99, "category": "laptop"})\ -] +def retriever_search(neo4j_driver, qdrant_client, collection_name, query): + retriever = QdrantNeo4jRetriever( + driver=neo4j_driver, + client=qdrant_client, + collection_name=collection_name, + id_property_external="id", + id_property_neo4j="id", + ) + results = retriever.search(query_vector=openai_embeddings(query), top_k=5) + + return results ``` -The four-dimensional vector can represent features like laptop CPU, RAM or battery life, but that isn’t specified. The payload, however, specifies the exact price and product category. - -Now, set the filter to “price is less than $1000”: +--- -```json -{ - "key": "price", - "range": { - "gt": null, - "gte": null, - "lt": null, - "lte": 1000 - } -} +The [QdrantNeo4jRetriever](https://qdrant.tech/documentation/frameworks/neo4j-graphrag/) handles both vector search and graph data fetching, combining Qdrant for vector-based retrieval and Neo4j for graph-based queries. -``` +**Vector Search:** -When a price filter of equal/less than $1000 is applied, vector search returns the following results: +- **`qdrant_client`** connects to Qdrant for efficient vector similarity search. +- **`collection_name`** specifies where vectors are stored. +- **`id_property_external="id"`** maps the external entity’s ID for retrieval. -```json -[\ - {\ - "id": 3,\ - "score": 0.9978443564622781,\ - "payload": {\ - "price": 799.99,\ - "category": "laptop"\ - }\ - },\ - {\ - "id": 1,\ - "score": 0.9938079894227599,\ - "payload": {\ - "price": 899.99,\ - "category": "laptop"\ - }\ - },\ - {\ - "id": 5,\ - "score": 0.9903751498208603,\ - "payload": {\ - "price": 949.99,\ - "category": "laptop"\ - }\ - }\ -] +**Graph Fetching:** -``` +- **`neo4j_driver`** connects to Neo4j for querying graph data. +- **`id_property_neo4j="id"`** ensures the entity IDs from Qdrant match the graph nodes in Neo4j. -As you can see, Qdrant’s filtering method has a greater chance of capturing all possible search results. +### Querying Neo4j for Related Graph Data -This specific example uses the `range` condition for filtering. Qdrant, however, offers many other possible ways to structure a filter +We need to fetch subgraph data from a Neo4j database based on specific entity IDs after the retriever has provided the relevant IDs. 
-**For detailed usage examples, [filtering](https://qdrant.tech/documentation/concepts/filtering/) docs are the best resource.** +```python +def fetch_related_graph(neo4j_client, entity_ids): + query = """ + MATCH (e:Entity)-[r1]-(n1)-[r2]-(n2) + WHERE e.id IN $entity_ids + RETURN e, r1 as r, n1 as related, r2, n2 + UNION + MATCH (e:Entity)-[r]-(related) + WHERE e.id IN $entity_ids + RETURN e, r, related, null as r2, null as n2 + """ + with neo4j_client.session() as session: + result = session.run(query, entity_ids=entity_ids) + subgraph = [] + for record in result: + subgraph.append({ + "entity": record["e"], + "relationship": record["r"], + "related_node": record["related"] + }) + if record["r2"] and record["n2"]: + subgraph.append({ + "entity": record["related"], + "relationship": record["r2"], + "related_node": record["n2"] + }) + return subgraph -### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#scrolling-instead-of-searching) Scrolling instead of searching +``` -You don’t need to use our `search` and `query` APIs to filter through data. The `scroll` API is another option that lets you retrieve lists of points which meet the filters. +--- -If you aren’t interested in finding similar points, you can simply list the ones that match a given filter. While search gives you the most similar points based on some query vector, scroll will give you all points matching your filter not considering similarity. +The function fetch_related_graph takes in a Neo4j client and a list of entity_ids. It runs a Cypher query to find related nodes (entities) and their relationships based on the given entity IDs. The query matches entities (e:Entity) and finds related nodes through any relationship [r]. The function returns a list of subgraph data, where each record contains the entity, relationship, and related_node. -In Qdrant, scrolling is used to iteratively **retrieve large sets of points from a collection**. It is particularly useful when you’re dealing with a large number of points and don’t want to load them all at once. Instead, Qdrant provides a way to scroll through the points **one page at a time**. +This subgraph is essential for generating context to answer user queries. -You start by sending a scroll request to Qdrant with specific conditions like filtering by payload, vector search, or other criteria. +### Setting up the Graph Context -Let’s retrieve a list of top 10 laptops ordered by price in the store: +The second part of the implementation involves preparing a graph context. We’ll fetch relevant subgraph data from a Neo4j database and format it for the model. Let’s break it down. -```http -POST /collections/online_store/points/scroll -{ - "filter": { - "must": [\ - {\ - "key": "category",\ - "match": {\ - "value": "laptop"\ - }\ - }\ - ] - }, - "limit": 10, - "with_payload": true, - "with_vector": false, - "order_by": [\ - {\ - "key": "price",\ - }\ - ] -} +```python +def format_graph_context(subgraph): + nodes = set() + edges = [] -``` + for entry in subgraph: + entity = entry["entity"] + related = entry["related_node"] + relationship = entry["relationship"] -The response contains a batch of points that match the criteria and a reference (offset or next page token) to retrieve the next set of points. + nodes.add(entity["name"]) + nodes.add(related["name"]) -> [**Scrolling**](https://qdrant.tech/documentation/concepts/points/#scroll-points) is designed to be efficient. 
It minimizes the load on the server and reduces memory consumption on the client side by returning only manageable chunks of data at a time. + edges.append(f"{entity['name']} {relationship['type']} {related['name']}") -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#available-filtering-conditions) Available filtering conditions + return {"nodes": list(nodes), "edges": edges} +``` -| **Condition** | **Usage** | **Condition** | **Usage** | -| --- | --- | --- | --- | -| **Match** | Exact value match. | **Range** | Filter by value range. | -| **Match Any** | Match multiple values. | **Datetime Range** | Filter by date range. | -| **Match Except** | Exclude specific values. | **UUID Match** | Filter by unique ID. | -| **Nested Key** | Filter by nested data. | **Geo** | Filter by location. | -| **Nested Object** | Filter by nested objects. | **Values Count** | Filter by element count. | -| **Full Text Match** | Search in text fields. | **Is Empty** | Filter empty fields. | -| **Has ID** | Filter by unique ID. | **Is Null** | Filter null values. | +--- -> All clauses and conditions are outlined in Qdrant’s [filtering](https://qdrant.tech/documentation/concepts/filtering/) documentation. +The function format_graph_context processes a subgraph returned by a Neo4j query. It extracts the graph's entities (nodes) and relationships (edges). The nodes set ensures each entity is added only once. The edges list captures the relationships in a readable format: *Entity1 relationship Entity2*. -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#filtering-clauses-to-remember) Filtering clauses to remember +### Integrating with the LLM -| **Clause** | **Description** | **Clause** | **Description** | -| --- | --- | --- | --- | -| **Must** | Includes items that meet the condition
(similar to `AND`). | **Should** | Filters if at least one condition is met (similar to `OR`). |
| **Must Not** | Excludes items that meet the condition (similar to `NOT`). | **Clauses Combination** | Combines multiple clauses to refine filtering
(similar to `AND`). | +Now that we have the graph context, we need to generate a prompt for a language model like GPT-4. This is where the core of the Retrieval-Augmented Generation (RAG) happens — we combine the graph data and the user query into a comprehensive prompt for the model. -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#advanced-filtering-example-dinosaur-diets) Advanced filtering example: dinosaur diets +```python +def graphRAG_run(graph_context, user_query): + nodes_str = ", ".join(graph_context["nodes"]) + edges_str = "; ".join(graph_context["edges"]) + prompt = f""" + You are an intelligent assistant with access to the following knowledge graph: -![advanced-payload-filtering](https://qdrant.tech/articles_data/vector-search-filtering/advanced-payload-filtering.png) + Nodes: {nodes_str} -We can also use nested filtering to query arrays of objects within the payload. In this example, we have two points. They each represent a dinosaur with a list of food preferences (diet) that indicate what type of food they like or dislike: + Edges: {edges_str} -```json -[\ - {\ - "id": 1,\ - "dinosaur": "t-rex",\ - "diet": [\ - { "food": "leaves", "likes": false},\ - { "food": "meat", "likes": true}\ - ]\ - },\ - {\ - "id": 2,\ - "dinosaur": "diplodocus",\ - "diet": [\ - { "food": "leaves", "likes": true},\ - { "food": "meat", "likes": false}\ - ]\ - }\ -] + Using this graph, Answer the following question: + User Query: "{user_query}" + """ + + try: + response = client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": "Provide the answer for the following question:"}, + {"role": "user", "content": prompt} + ] + ) + return response.choices[0].message + + except Exception as e: + return f"Error querying LLM: {str(e)}" ``` -To ensure that both conditions are applied to the same array element (e.g., food = meat and likes = true must refer to the same diet item), you need to use a nested filter. - -Nested filters are used to apply conditions within an array of objects. They ensure that the conditions are evaluated per array element, rather than across all elements. +--- -httppythontypescriptrustjavacsharp +The function graphRAG_run takes the graph context (nodes and edges) and the user query, combining them into a structured prompt for the LLM. The nodes and edges are formatted as readable strings to form part of the LLM input. The LLM is then queried with the generated prompt, asking it to refine the user query using the graph context and provide an answer. If the model successfully generates a response, it returns the answer. -```http -POST /collections/dinosaurs/points/scroll -{ - "filter": { - "must": [\ - {\ - "key": "diet[].food",\ - "match": {\ - "value": "meat"\ - }\ - },\ - {\ - "key": "diet[].likes",\ - "match": {\ - "value": true\ - }\ - }\ - ] - } -} +### End-to-End Pipeline -``` +Finally, let’s integrate everything into an end-to-end pipeline where we ingest some sample data, run the retrieval process, and query the language model. 
```python -client.scroll( - collection_name="dinosaurs", - scroll_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="diet[].food", match=models.MatchValue(value="meat")\ - ),\ - models.FieldCondition(\ - key="diet[].likes", match=models.MatchValue(value=True)\ - ),\ - ], - ), -) +if __name__ == "__main__": + print("Script started") + print("Loading environment variables...") + load_dotenv('.env.local') + print("Environment variables loaded") + + print("Initializing clients...") + neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) + qdrant_client = QdrantClient( + url=qdrant_url, + api_key=qdrant_key + ) + print("Clients initialized") + + print("Creating collection...") + collection_name = "graphRAGstoreds" + vector_dimension = 1536 + create_collection(qdrant_client, collection_name, vector_dimension) + print("Collection created/verified") + + print("Extracting graph components...") + + raw_data = """Alice is a data scientist at TechCorp's Seattle office. + Bob and Carol collaborate on the Alpha project. + Carol transferred to the New York office last year. + Dave mentors both Alice and Bob. + TechCorp's headquarters is in Seattle. + Carol leads the East Coast team. + Dave started his career in Seattle. + The Alpha project is managed from New York. + Alice previously worked with Carol at DataCo. + Bob joined the team after Dave's recommendation. + Eve runs the West Coast operations from Seattle. + Frank works with Carol on client relations. + The New York office expanded under Carol's leadership. + Dave's team spans multiple locations. + Alice visits Seattle monthly for team meetings. + Bob's expertise is crucial for the Alpha project. + Carol implemented new processes in New York. + Eve and Dave collaborated on previous projects. + Frank reports to the New York office. + TechCorp's main AI research is in Seattle. + The Alpha project revolutionized East Coast operations. + Dave oversees projects in both offices. + Bob's contributions are mainly remote. + Carol's team grew significantly after moving to New York. + Seattle remains the technology hub for TechCorp.""" -``` + nodes, relationships = extract_graph_components(raw_data) + print("Nodes:", nodes) + print("Relationships:", relationships) + + print("Ingesting to Neo4j...") + node_id_mapping = ingest_to_neo4j(nodes, relationships) + print("Neo4j ingestion complete") + + print("Ingesting to Qdrant...") + ingest_to_qdrant(collection_name, raw_data, node_id_mapping) + print("Qdrant ingestion complete") -```typescript -client.scroll("dinosaurs", { - filter: { - must: [\ - {\ - key: "diet[].food",\ - match: { value: "meat" },\ - },\ - {\ - key: "diet[].likes",\ - match: { value: true },\ - },\ - ], - }, -}); + query = "How is Bob connected to New York?" 
+ print("Starting retriever search...") + retriever_result = retriever_search(neo4j_driver, qdrant_client, collection_name, query) + print("Retriever results:", retriever_result) + + print("Extracting entity IDs...") + entity_ids = [item.content.split("'id': '")[1].split("'")[0] for item in retriever_result.items] + print("Entity IDs:", entity_ids) + + print("Fetching related graph...") + subgraph = fetch_related_graph(neo4j_driver, entity_ids) + print("Subgraph:", subgraph) + + print("Formatting graph context...") + graph_context = format_graph_context(subgraph) + print("Graph context:", graph_context) + + print("Running GraphRAG...") + answer = graphRAG_run(graph_context, query) + print("Final Answer:", answer) ``` -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +--- -client - .scroll( - ScrollPointsBuilder::new("dinosaurs").filter(Filter::must([\ - Condition::matches("diet[].food", "meat".to_string()),\ - Condition::matches("diet[].likes", true),\ - ])), - ) - .await?; +Here’s what’s happening: -``` +- First, the user query is defined ("How is Bob connected to New York?"). +- The QdrantNeo4jRetriever searches for related entities in the Qdrant vector database based on the user query’s embedding. It retrieves the top 5 results (top_k=5). +- The entity_ids are extracted from the retriever result. +- The fetch_related_graph function retrieves related entities and their relationships from the Neo4j database. +- The format_graph_context function prepares the graph data in a format the LLM can understand. +- Finally, the graphRAG_run function is called to generate and query the language model, producing an answer based on the retrieved graph context. -```java -import java.util.List; +With this, we have successfully created GraphRAG, a system capable of capturing complex relationships and delivering improved performance compared to the baseline RAG approach. -import static io.qdrant.client.ConditionFactory.match; -import static io.qdrant.client.ConditionFactory.matchKeyword; +# Advantages of Qdrant + Neo4j GraphRAG -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Combining Qdrant with Neo4j in a GraphRAG architecture offers several compelling advantages, particularly regarding recall and precision combo, contextual understanding, adaptability to complex queries, and better cost and scalability. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +1. **Improved Recall and Precision:** By leveraging Qdrant, a highly efficient vector search engine, alongside Neo4j’s robust graph database, the system benefits from both semantic search and relationship-based retrieval. Qdrant identifies relevant vectors and captures the similarity between queries and stored data. At the same time, Neo4j adds a layer of connectivity through its graph structure, ensuring that relevant and contextually linked information is retrieved. This combination improves recall (retrieving a broader set of relevant results) and precision (delivering more accurate and contextually relevant results), addressing a common challenge in traditional retrieval-based AI systems. +2. **Enhanced Contextual Understanding:** Neo4j enhances contextual understanding by representing information as a graph, where entities and their relationships are naturally modeled. 
When integrated with Qdrant, the system can retrieve similar items based on vector embeddings and those that fit within the desired relational context, leading to more nuanced and meaningful responses. +3. **Adaptability to Complex Queries:** Combining Qdrant and Neo4j makes the system highly adaptable to complex queries. While Qdrant handles the vector search for relevant data, Neo4j’s graph capabilities enable sophisticated querying through relationships. This allows for multi-hop reasoning and handling complex, structured queries that would be challenging for traditional search engines. +4. **Better Cost & Scalability:** GraphRAG, on its own, demands significant resources, as it relies on LLMs to construct and query knowledge graphs. It also employs clustering algorithms to create semantic clusters for local searches. These can hinder scalability and increase costs. Qdrant addresses the issue of local search through vector search, while Neo4j’s knowledge graph is queried for more precise answers, enhancing both efficiency and accuracy. Furthermore, instead of using an LLM, Named Entity Recognition (NER)-based techniques can reduce the cost further, but it depends mainly on the dataset. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("dinosaurs") - .setFilter( - Filter.newBuilder() - .addAllMust( - List.of(matchKeyword("diet[].food", "meat"), match("diet[].likes", true))) - .build()) - .build()) - .get(); +# Conclusion -``` +GraphRAG with Neo4j and Qdrant marks an important step forward in retrieval-augmented generation. This hybrid approach delivers significant advantages by combining vector search and graph databases. Qdrant's semantic search capabilities enhance recall accuracy, while Neo4j's relationship modeling provides deeper context understanding. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +The implementation template we've explored offers a foundation for your projects. You can adapt and customize it based on your specific needs, whether for document analysis, knowledge management, or other information retrieval tasks. -var client = new QdrantClient("localhost", 6334); +As AI systems evolve, this combination of technologies shows how we can build smarter, more efficient solutions. We encourage you to experiment with this approach and discover how it can enhance your applications. -await client.ScrollAsync( - collectionName: "dinosaurs", - filter: MatchKeyword("diet[].food", "meat") & Match("diet[].likes", true) -); +<|page-129-lllmstxt|> +# Installation requirements -``` +The following sections describe the requirements for deploying Qdrant. -This happens because both points are matching the two conditions: +## CPU and memory -- the “t-rex” matches food=meat on `diet[1].food` and likes=true on `diet[1].likes` -- the “diplodocus” matches food=meat on `diet[1].food` and likes=true on `diet[0].likes` +The preferred size of your CPU and RAM depends on: -To retrieve only the points where the conditions apply to a specific element within an array (such as the point with id 1 in this example), you need to use a nested object filter. +- Number of vectors +- Vector dimensions +- [Payloads](/documentation/concepts/payload/) and their indexes +- Storage +- Replication +- How you configure quantization -Nested object filters enable querying arrays of objects independently, ensuring conditions are checked within individual array elements. 
+Our [Cloud Pricing Calculator](https://cloud.qdrant.io/calculator) can help you estimate required resources without payload or index data. -This is done by using the `nested` condition type, which consists of a payload key that targets an array and a filter to apply. The key should reference an array of objects and can be written with or without bracket notation (e.g., “data” or “data\[\]”). +### Supported CPU architectures: -httppythontypescriptrustjavacsharp +**64-bit system:** +- x86_64/amd64 +- AArch64/arm64 -```http -POST /collections/dinosaurs/points/scroll -{ - "filter": { - "must": [{\ - "nested": {\ - "key": "diet",\ - "filter":{\ - "must": [\ - {\ - "key": "food",\ - "match": {\ - "value": "meat"\ - }\ - },\ - {\ - "key": "likes",\ - "match": {\ - "value": true\ - }\ - }\ - ]\ - }\ - }\ - }] - } -} +**32-bit system:** +- Not supported -``` +### Storage -```python -client.scroll( - collection_name="dinosaurs", - scroll_filter=models.Filter( - must=[\ - models.NestedCondition(\ - nested=models.Nested(\ - key="diet",\ - filter=models.Filter(\ - must=[\ - models.FieldCondition(\ - key="food", match=models.MatchValue(value="meat")\ - ),\ - models.FieldCondition(\ - key="likes", match=models.MatchValue(value=True)\ - ),\ - ]\ - ),\ - )\ - )\ - ], - ), -) +For persistent storage, Qdrant requires block-level access to storage devices with a [POSIX-compatible file system](https://www.quobyte.com/storage-explained/posix-filesystem/). Network systems such as [iSCSI](https://en.wikipedia.org/wiki/ISCSI) that provide block-level access are also acceptable. +Qdrant won't work with [Network file systems](https://en.wikipedia.org/wiki/File_system#Network_file_systems) such as NFS, or [Object storage](https://en.wikipedia.org/wiki/Object_storage) systems such as S3. -``` +If you offload vectors to a local disk, we recommend you use a solid-state (SSD or NVMe) drive. -```typescript -client.scroll("dinosaurs", { - filter: { - must: [\ - {\ - nested: {\ - key: "diet",\ - filter: {\ - must: [\ - {\ - key: "food",\ - match: { value: "meat" },\ - },\ - {\ - key: "likes",\ - match: { value: true },\ - },\ - ],\ - },\ - },\ - },\ - ], - }, -}); +### Networking -``` +Each Qdrant instance requires three open ports: -```rust -use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; +* `6333` - For the HTTP API, for the [Monitoring](/documentation/guides/monitoring/) health and metrics endpoints +* `6334` - For the [gRPC](/documentation/interfaces/#grpc-interface) API +* `6335` - For [Distributed deployment](/documentation/guides/distributed_deployment/) -client - .scroll( - ScrollPointsBuilder::new("dinosaurs").filter(Filter::must([NestedCondition {\ - key: "diet".to_string(),\ - filter: Some(Filter::must([\ - Condition::matches("food", "meat".to_string()),\ - Condition::matches("likes", true),\ - ])),\ - }\ - .into()])), - ) - .await?; +All Qdrant instances in a cluster must be able to: -``` +- Communicate with each other over these ports +- Allow incoming connections to ports `6333` and `6334` from clients that use Qdrant. -```java -import java.util.List; +### Security -import static io.qdrant.client.ConditionFactory.match; -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.ConditionFactory.nested; +The default configuration of Qdrant might not be secure enough for every situation. Please see [our security documentation](/documentation/guides/security/) for more information. 
-import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +## Installation options -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("dinosaurs") - .setFilter( - Filter.newBuilder() - .addMust( - nested( - "diet", - Filter.newBuilder() - .addAllMust( - List.of( - matchKeyword("food", "meat"), match("likes", true))) - .build())) - .build()) - .build()) - .get(); +Qdrant can be installed in different ways depending on your needs: + +For production, you can use our Qdrant Cloud to run Qdrant either fully managed in our infrastructure or with Hybrid Cloud in yours. + +If you want to run Qdrant in your own infrastructure, without any cloud connection, we recommend to install Qdrant in a Kubernetes cluster with our Qdrant Private Cloud Enterprise Operator. -``` +For testing or development setups, you can run the Qdrant container or as a binary executable. We also provide a Helm chart for an easy installation in Kubernetes. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +## Production -var client = new QdrantClient("localhost", 6334); +### Qdrant Cloud -await client.ScrollAsync( - collectionName: "dinosaurs", - filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) -); +You can set up production with the [Qdrant Cloud](https://qdrant.to/cloud), which provides fully managed Qdrant databases. +It provides horizontal and vertical scaling, one click installation and upgrades, monitoring, logging, as well as backup and disaster recovery. For more information, see the [Qdrant Cloud documentation](/documentation/cloud/). -``` +### Qdrant Kubernetes Operator -The matching logic is adjusted to operate at the level of individual elements within an array in the payload, rather than on all array elements together. +We provide a Qdrant Enterprise Operator for Kubernetes installations as part of our [Qdrant Private Cloud](/documentation/private-cloud/) offering. For more information, [use this form](https://qdrant.to/contact-us) to contact us. -Nested filters function as though each element of the array is evaluated separately. The parent document will be considered a match if at least one array element satisfies all the nested filter conditions. +### Kubernetes -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#other-creative-uses-for-filters) Other creative uses for filters +You can use a ready-made [Helm Chart](https://helm.sh/docs/) to run Qdrant in your Kubernetes cluster. While it is possible to deploy Qdrant in a distributed setup with the Helm chart, it does not come with the same level of features for zero-downtime upgrades, up and down-scaling, monitoring, logging, and backup and disaster recovery as the Qdrant Cloud offering or the Qdrant Private Cloud Enterprise Operator. Instead you must manage and set this up [yourself](https://qdrant.tech/documentation/guides/distributed_deployment/). Support for the Helm chart is limited to community support. -You can use filters to retrieve data points without knowing their `id`. You can search through data and manage it, solely by using filters. Let’s take a look at some creative uses for filters: +The following table gives you an overview about the feature differences between the Qdrant Cloud and the Helm chart: -| Action | Description | Action | Description | -| --- | --- | --- | --- | -| [Delete Points](https://qdrant.tech/documentation/concepts/points/#delete-points) | Deletes all points matching the filter. 
| [Set Payload](https://qdrant.tech/documentation/concepts/payload/#set-payload) | Adds payload fields to all points matching the filter. | -| [Scroll Points](https://qdrant.tech/documentation/concepts/points/#scroll-points) | Lists all points matching the filter. | [Update Payload](https://qdrant.tech/documentation/concepts/payload/#overwrite-payload) | Updates payload fields for points matching the filter. | -| [Order Points](https://qdrant.tech/documentation/concepts/points/#order-points-by-payload-key) | Lists all points, sorted by the filter. | [Delete Payload](https://qdrant.tech/documentation/concepts/payload/#delete-payload-keys) | Deletes fields for points matching the filter. | -| [Count Points](https://qdrant.tech/documentation/concepts/points/#counting-points) | Totals the points matching the filter. | | | +| Feature | Qdrant Helm Chart | Qdrant Cloud | +|--------------------------------------------------------|:-----------------:|:-------------:| +| Open-source | ✅ | | +| Community support only | ✅ | | +| Quick to get started | ✅ | ✅ | +| Vertical and horizontal scaling | ✅ | ✅ | +| API keys with granular access control | ✅ | ✅ | +| Qdrant version upgrades | ✅ | ✅ | +| Support for transit and storage encryption | ✅ | ✅ | +| Zero-downtime upgrades with optimized restart strategy | | ✅ | +| Production ready out-of the box | | ✅ | +| Dataloss prevention on downscaling | | ✅ | +| Full cluster backup and disaster recovery | | ✅ | +| Automatic shard rebalancing | | ✅ | +| Re-sharding support | | ✅ | +| Automatic persistent volume scaling | | ✅ | +| Advanced telemetry | | ✅ | +| One-click API key revoking | | ✅ | +| Recreating nodes with new volumes in existing cluster | | ✅ | +| Enterprise support | | ✅ | -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#filtering-with-the-payload-index) Filtering with the payload index +To install the helm chart: -![vector-search-filtering-vector-search](https://qdrant.tech/articles_data/vector-search-filtering/scanning-lens.png) +```bash +helm repo add qdrant https://qdrant.to/helm +helm install qdrant qdrant/qdrant +``` -When you start working with Qdrant, your data is by default organized in a vector index. -In addition to this, we recommend adding a secondary data structure - **the payload index**. +For more information, see the [qdrant-helm](https://github.com/qdrant/qdrant-helm/tree/main/charts/qdrant) README. -Just how the vector index organizes vectors, the payload index will structure your metadata. +### Docker and Docker Compose -**Figure 4:** The payload index is an additional data structure that supports vector search. A payload index (in green) organizes candidate results by cardinality, so that semantic search (in red) can traverse the vector index quickly. +Usually, we recommend to run Qdrant in Kubernetes, or use the Qdrant Cloud for production setups. This makes setting up highly available and scalable Qdrant clusters with backups and disaster recovery a lot easier. -![payload-index-vector-search](https://qdrant.tech/articles_data/vector-search-filtering/payload-index-vector-search.png) +However, you can also use Docker and Docker Compose to run Qdrant in production, by following the setup instructions in the [Docker](#docker) and [Docker Compose](#docker-compose) Development sections. +In addition, you have to make sure: -On its own, semantic searching over terabytes of data can take up lots of RAM. 
[**Filtering**](https://qdrant.tech/documentation/concepts/filtering/) and [**Indexing**](https://qdrant.tech/documentation/concepts/indexing/) are two easy strategies to reduce your compute usage and still get the best results. Remember, this is only a guide. For an exhaustive list of filtering options, you should read the [filtering documentation](https://qdrant.tech/documentation/concepts/filtering/). +* To use a performant [persistent storage](#storage) for your data +* To configure the [security settings](/documentation/guides/security/) for your deployment +* To set up and configure Qdrant on multiple nodes for a highly available [distributed deployment](/documentation/guides/distributed_deployment/) +* To set up a load balancer for your Qdrant cluster +* To create a [backup and disaster recovery strategy](/documentation/concepts/snapshots/) for your data +* To integrate Qdrant with your [monitoring](/documentation/guides/monitoring/) and logging solutions -Here is how you can create a single index for a metadata field “category”: +## Development -httppython +For development and testing, we recommend that you set up Qdrant in Docker. We also have different client libraries. -```http -PUT /collections/computers/index -{ - "field_name": "category", - "field_schema": "keyword" -} +### Docker -``` +The easiest way to start using Qdrant for testing or development is to run the Qdrant container image. +The latest versions are always available on [DockerHub](https://hub.docker.com/r/qdrant/qdrant/tags?page=1&ordering=last_updated). -```python -from qdrant_client import QdrantClient +Make sure that [Docker](https://docs.docker.com/engine/install/), [Podman](https://podman.io/docs/installation) or the container runtime of your choice is installed and running. The following instructions use Docker. -client = QdrantClient(url="http://localhost:6333") +Pull the image: -client.create_payload_index( - collection_name="computers", - field_name="category", - field_schema="keyword", -) +```bash +docker pull qdrant/qdrant +``` +In the following command, revise `$(pwd)/path/to/data` for your Docker configuration. Then use the updated command to run the container: + +```bash +docker run -p 6333:6333 \ + -v $(pwd)/path/to/data:/qdrant/storage \ + qdrant/qdrant ``` -Once you mark a field indexable, **you don’t need to do anything else**. Qdrant will handle all optimizations in the background. +With this command, you start a Qdrant instance with the default configuration. +It stores all data in the `./path/to/data` directory. + +By default, Qdrant uses port 6333, so at [localhost:6333](http://localhost:6333) you should see the welcome message. -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#why-should-you-index-metadata) Why should you index metadata? +To change the Qdrant configuration, you can overwrite the production configuration: -![payload-index-filtering](https://qdrant.tech/articles_data/vector-search-filtering/payload-index-filtering.png) +```bash +docker run -p 6333:6333 \ + -v $(pwd)/path/to/data:/qdrant/storage \ + -v $(pwd)/path/to/custom_config.yaml:/qdrant/config/production.yaml \ + qdrant/qdrant +``` -The payload index acts as a secondary data structure that speeds up retrieval. Whenever you run vector search with a filter, Qdrant will consult a payload index - if there is one. +Alternatively, you can use your own `custom_config.yaml` configuration file: -As your dataset grows in complexity, Qdrant takes up additional resources to go through all data points. 
Without a proper data structure, the search can take longer - or run out of resources. +```bash +docker run -p 6333:6333 \ + -v $(pwd)/path/to/data:/qdrant/storage \ + -v $(pwd)/path/to/custom_config.yaml:/qdrant/config/custom_config.yaml \ + qdrant/qdrant \ + ./qdrant --config-path config/custom_config.yaml +``` -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#payload-indexing-helps-evaluate-the-most-restrictive-filters) Payload indexing helps evaluate the most restrictive filters +For more information, see the [Configuration](/documentation/guides/configuration/) documentation. -The payload index is also used to accurately estimate **filter cardinality**, which helps the query planning choose a search strategy. **Filter cardinality** refers to the number of distinct values that a filter can match within a dataset. Qdrant’s search strategy can switch from **HNSW search** to **payload index-based search** if the cardinality is too low. +### Docker Compose -**How it affects your queries:** Depending on the filter used in the search - there are several possible scenarios for query execution. Qdrant chooses one of the query execution options depending on the available indexes, the complexity of the conditions and the cardinality of the filtering result. +You can also use [Docker Compose](https://docs.docker.com/compose/) to run Qdrant. -- The planner estimates the cardinality of a filtered result before selecting a strategy. -- Qdrant retrieves points using the **payload index** if cardinality is below threshold. -- Qdrant uses the **filterable vector index** if the cardinality is above a threshold +Here is an example customized compose file for a single node Qdrant cluster: -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#what-happens-if-you-dont-use-payload-indexes) What happens if you don’t use payload indexes? +```yaml +services: + qdrant: + image: qdrant/qdrant:latest + restart: always + container_name: qdrant + ports: + - 6333:6333 + - 6334:6334 + expose: + - 6333 + - 6334 + - 6335 + configs: + - source: qdrant_config + target: /qdrant/config/production.yaml + volumes: + - ./qdrant_data:/qdrant/storage -When using filters while querying, Qdrant needs to estimate cardinality of those filters to define a proper query plan. If you don’t create a payload index, Qdrant will not be able to do this. It may end up choosing a sub-optimal way of searching causing extremely slow search times or low accuracy results. +configs: + qdrant_config: + content: | + log_level: INFO +``` -If you only rely on **searching for the nearest vector**, Qdrant will have to go through the entire vector index. It will calculate similarities against each vector in the collection, relevant or not. Alternatively, when you filter with the help of a payload index, the HSNW algorithm won’t have to evaluate every point. Furthermore, the payload index will help HNSW construct the graph with additional links. + -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#how-does-the-payload-index-look) How does the payload index look? +### From source -A payload index is similar to conventional document-oriented databases. It connects metadata fields with their corresponding point id’s for quick retrieval. +Qdrant is written in Rust and can be compiled into a binary executable. +This installation method can be helpful if you want to compile Qdrant for a specific processor architecture or if you do not want to use Docker. 
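As an illustration of the "specific processor architecture" point (a sketch, not part of the original instructions; the canonical build command is shown further below), the standard `RUSTFLAGS` mechanism can be used to tune the release build for the CPU you are building on:

```bash
# Hypothetical example: optimize the release build for the local CPU.
RUSTFLAGS="-C target-cpu=native" cargo build --release --bin qdrant
```

A binary built this way is only portable to machines with the same CPU features, which is usually the point of compiling from source for a specific architecture.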
-In this example, you are indexing all of your computer hardware inside of the `computers` collection. Let’s take a look at a sample payload index for the field `category`. +Before compiling, make sure that the necessary libraries and the [rust toolchain](https://www.rust-lang.org/tools/install) are installed. +The current list of required libraries can be found in the [Dockerfile](https://github.com/qdrant/qdrant/blob/master/Dockerfile). -```json -Payload Index by keyword: -+------------+-------------+ -| category | id | -+------------+-------------+ -| laptop | 1, 4, 7 | -| desktop | 2, 5, 9 | -| speakers | 3, 6, 8 | -| keyboard | 10, 11 | -+------------+-------------+ +Build Qdrant with Cargo: +```bash +cargo build --release --bin qdrant ``` -When fields are properly indexed, the search engine roughly knows where it can start its journey. It can start looking up points that contain relevant metadata, and it doesn’t need to scan the entire dataset. This reduces the engine’s workload by a lot. As a result, query results are faster and the system can easily scale. +After a successful build, you can find the binary in the following subdirectory `./target/release/qdrant`. -> You may create as many payload indexes as you want, and we recommend you do so for each field that you filter by. +## Client libraries -If your users are often filtering by **laptop** when looking up a product **category**, indexing all computer metadata will speed up retrieval and make the results more precise. +In addition to the service, Qdrant provides a variety of client libraries for different programming languages. For a full list, see our [Client libraries](/documentation/interfaces/#client-libraries) documentation. -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#different-types-of-payload-indexes) Different types of payload indexes +<|page-130-lllmstxt|> +# Multilingual & Multimodal Search with LlamaIndex -| Index Type | Description | -| --- | --- | -| [Full-text Index](https://qdrant.tech/documentation/concepts/indexing/#full-text-index) | Enables efficient text search in large datasets. | -| [Tenant Index](https://qdrant.tech/documentation/concepts/indexing/#tenant-index) | For data isolation and retrieval efficiency in multi-tenant architectures. | -| [Principal Index](https://qdrant.tech/documentation/concepts/indexing/#principal-index) | Manages data based on primary entities like users or accounts. | -| [On-Disk Index](https://qdrant.tech/documentation/concepts/indexing/#on-disk-payload-index) | Stores indexes on disk to manage large datasets without memory usage. | -| [Parameterized Index](https://qdrant.tech/documentation/concepts/indexing/#parameterized-index) | Allows for dynamic querying, where the index can adapt based on different parameters or conditions provided by the user. Useful for numeric data like prices or timestamps. 
| +![Snow prints](/documentation/examples/multimodal-search/image-1.png) -### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#indexing-payloads-in-multitenant-setups) Indexing payloads in multitenant setups +| Time: 15 min | Level: Beginner |Output: [GitHub](https://github.com/qdrant/examples/blob/master/multimodal-search/Multimodal_Search_with_LlamaIndex.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/multimodal-search/Multimodal_Search_with_LlamaIndex.ipynb) | +| --- | ----------- | ----------- | ----------- | -Some applications need to have data segregated, whereby different users need to see different data inside of the same program. When setting up storage for such a complex application, many users think they need multiple databases for segregated users. +## Overview -We see this quite often. Users very frequently make the mistake of creating a separate collection for each tenant inside of the same cluster. This can quickly exhaust the cluster’s resources. Running vector search through too many collections can start using up too much RAM. You may start seeing out-of-memory (OOM) errors and degraded performance. +We often understand and share information more effectively when combining different types of data. For example, the taste of comfort food can trigger childhood memories. We might describe a song with just “pam pam clap” sounds. Instead of writing paragraphs. Sometimes, we may use emojis and stickers to express how we feel or to share complex ideas. -To mitigate this, we offer extensive support for multitenant systems, so that you can build an entire global application in one single Qdrant collection. +Modalities of data such as **text**, **images**, **video** and **audio** in various combinations form valuable use cases for Semantic Search applications. -When creating or updating a collection, you can mark a metadata field as indexable. To mark `user_id` as a tenant in a shared collection, do the following: +Vector databases, being **modality-agnostic**, are perfect for building these applications. -```http -PUT /collections/{collection_name}/index -{ - "field_name": "user_id", - "field_schema": { - "type": "keyword", - "is_tenant": true - } -} +In this simple tutorial, we are working with two simple modalities: **image** and **text** data. However, you can create a Semantic Search application with any combination of modalities if you choose the right embedding model to bridge the **semantic gap**. -``` +> The **semantic gap** refers to the difference between low-level features (aka brightness) and high-level concepts (aka cuteness). -Additionally, we offer a way of organizing data efficiently by means of the tenant index. This is another variant of the payload index that makes tenant data more accessible. This time, the request will specify the field as a tenant. This means that you can mark various customer types and user id’s as `is_tenant: true`. +For example, the [vdr-2b-multi-v1 model](https://huggingface.co/llamaindex/vdr-2b-multi-v1) from LlamaIndex is designed for multilingual embedding, particularly effective for visual document retrieval across multiple languages and domains. It allows for searching and querying visually rich multilingual documents without the need for OCR or other data extraction pipelines. 
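As a companion to the HTTP request above, a minimal Python-client sketch could look like this (an illustrative example assuming a recent `qdrant-client`; the collection name is a placeholder):

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Mark the "user_id" keyword field as the tenant key in a shared collection
client.create_payload_index(
    collection_name="shared_collection",  # hypothetical collection name
    field_name="user_id",
    field_schema=models.KeywordIndexParams(
        type=models.KeywordIndexType.KEYWORD,
        is_tenant=True,
    ),
)
```

Keeping all tenants in a single collection and keying the index on `user_id` keeps resource usage predictable, which is exactly the failure mode that the collection-per-tenant approach described above runs into.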
-Read more about setting up [tenant defragmentation](https://qdrant.tech/documentation/concepts/indexing/?q=tenant#tenant-index) in multitenant environments, +## Setup -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#key-takeaways-in-filtering-and-indexing) Key takeaways in filtering and indexing +First, install the required libraries `qdrant-client` and `llama-index-embeddings-huggingface`. -![best-practices](https://qdrant.tech/articles_data/vector-search-filtering/best-practices.png) +```bash +pip install qdrant-client llama-index-embeddings-huggingface +``` -### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#filtering-with-float-point-decimal-numbers) Filtering with float-point (decimal) numbers +## Dataset -If you filter by the float data type, your search precision may be limited and inaccurate. +To make the demonstration simple, we created a tiny dataset of images and their captions for you. -Float Datatype numbers have a decimal point and are 64 bits in size. Here is an example: +Images can be downloaded from [here](https://github.com/qdrant/examples/tree/master/multimodal-search/images). It's **important** to place them in the same folder as your code/notebook, in the folder named `images`. -```json -{ - "price": 11.99 -} +## Vectorize data -``` +`LlamaIndex`'s `vdr-2b-multi-v1` model supports cross-lingual retrieval, allowing for effective searches across languages and domains. It encodes document page screenshots into dense single-vector representations, eliminating the need for OCR and other complex data extraction processes. -When you filter for a specific float number, such as 11.99, you may get a different result, like 11.98 or 12.00. With decimals, numbers are rounded differently, so logically identical values may appear different. Unfortunately, searching for exact matches can be unreliable in this case. +Let's embed the images and their captions in the **shared embedding space**. -To avoid inaccuracies, use a different filtering method. We recommend that you try Range Based Filtering instead of exact matches. This method accounts for minor variations in data, and it boosts performance - especially with large datasets. +```python +from llama_index.embeddings.huggingface import HuggingFaceEmbedding -Here is a sample JSON range filter for values greater than or equal to 11.99 and less than or equal to the same number. This will retrieve any values within the range of 11.99, including those with additional decimal places. 
+model = HuggingFaceEmbedding( + model_name="llamaindex/vdr-2b-multi-v1", + device="cpu", # "mps" for mac, "cuda" for nvidia GPUs + trust_remote_code=True, +) -```json -{ - "key": "price", - "range": { - "gt": null, - "gte": 11.99, - "lt": null, - "lte": 11.99 - } -} +documents = [ + {"caption": "An image about plane emergency safety.", "image": "images/image-1.png"}, + {"caption": "An image about airplane components.", "image": "images/image-2.png"}, + {"caption": "An image about COVID safety restrictions.", "image": "images/image-3.png"}, + {"caption": "An confidential image about UFO sightings.", "image": "images/image-4.png"}, + {"caption": "An image about unusual footprints on Aralar 2011.", "image": "images/image-5.png"}, +] +text_embeddings = model.get_text_embedding_batch([doc["caption"] for doc in documents]) +image_embeddings = model.get_image_embedding_batch([doc["image"] for doc in documents]) ``` -### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#working-with-pagination-in-queries) Working with pagination in queries +## Upload data to Qdrant -When you’re implementing pagination in filtered queries, indexing becomes even more critical. When paginating results, you often need to exclude items you’ve already seen. This is typically managed by applying filters that specify which IDs should not be included in the next set of results. - -However, an interesting aspect of Qdrant’s data model is that a single point can have multiple values for the same field, such as different color options for a product. This means that during filtering, an ID might appear multiple times if it matches on different values of the same field. +1. **Create a client object for Qdrant**. -Proper indexing ensures that these queries are efficient, preventing duplicate results and making pagination smoother. +```python +from qdrant_client import QdrantClient, models -## [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#conclusion-real-life-use-cases-of-filtering) Conclusion: Real-life use cases of filtering +# docker run -p 6333:6333 qdrant/qdrant +client = QdrantClient(url="http://localhost:6333/") +``` -Filtering in a [vector database](https://qdrant.tech/) like Qdrant can significantly enhance search capabilities by enabling more precise and efficient retrieval of data. +2. **Create a new collection for the images with captions**. -As a conclusion to this guide, let’s look at some real-life use cases where filtering is crucial: +```python +COLLECTION_NAME = "llama-multi" -| **Use Case** | **Vector Search** | **Filtering** | -| --- | --- | --- | -| [E-Commerce Product Search](https://qdrant.tech/advanced-search/) | Search for products by style or visual similarity | Filter by price, color, brand, size, ratings | -| [Recommendation Systems](https://qdrant.tech/recommendations/) | Recommend similar content (e.g., movies, songs) | Filter by release date, genre, etc. 
(e.g., movies after 2020) | -| [Geospatial Search in Ride-Sharing](https://qdrant.tech/articles/geo-polygon-filter-gsoc/) | Find similar drivers or delivery partners | Filter by rating, distance radius, vehicle type | -| [Fraud & Anomaly Detection](https://qdrant.tech/data-analysis-anomaly-detection/) | Detect transactions similar to known fraud cases | Filter by amount, time, location | +if not client.collection_exists(COLLECTION_NAME): + client.create_collection( + collection_name=COLLECTION_NAME, + vectors_config={ + "image": models.VectorParams(size=len(image_embeddings[0]), distance=models.Distance.COSINE), + "text": models.VectorParams(size=len(text_embeddings[0]), distance=models.Distance.COSINE), + } + ) +``` -#### [Anchor](https://qdrant.tech/articles/vector-search-filtering/\#before-you-go---all-the-code-is-in-qdrants-dashboard) Before you go - all the code is in Qdrant’s Dashboard +3. **Upload our images with captions to the Collection**. -The easiest way to reach that “Hello World” moment is to [**try filtering in a live cluster**](https://qdrant.tech/documentation/quickstart-cloud/). Our interactive tutorial will show you how to create a cluster, add data and try some filtering clauses. +```python +client.upload_points( + collection_name=COLLECTION_NAME, + points=[ + models.PointStruct( + id=idx, + vector={ + "text": text_embeddings[idx], + "image": image_embeddings[idx], + }, + payload=doc + ) + for idx, doc in enumerate(documents) + ] +) +``` -**It’s all in your free cluster!** +## Search -[![qdrant-hybrid-cloud](https://qdrant.tech/docs/homepage/cloud-cta.png)](https://qdrant.to/cloud) +### Text-to-Image -##### Was this page useful? +Let's see what image we will get to the query "*Adventures on snow hills*". -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +from PIL import Image -Thank you for your feedback! 🙏 +find_image = model.get_query_embedding("Adventures on snow hills") -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-search-filtering.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Image.open(client.query_points( + collection_name=COLLECTION_NAME, + query=find_image, + using="image", + with_payload=["image"], + limit=1 +).points[0].payload['image']) +``` -On this page: +Let's also run the same query in Italian and compare the results. 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-search-filtering.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Multilingual Search -× +Now, let's do a multilingual search using an Italian query: -[Powered by](https://qdrant.tech/) +```python +Image.open(client.query_points( + collection_name=COLLECTION_NAME, + query=model.get_query_embedding("Avventure sulle colline innevate"), + using="image", + with_payload=["image"], + limit=1 +).points[0].payload['image']) +``` -<|page-89-lllmstxt|> -## qdrant-airflow-astronomer -- [Documentation](https://qdrant.tech/documentation/) -- [Send data](https://qdrant.tech/documentation/send-data/) -- Semantic Querying with Airflow and Astronomer +**Response:** -# [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#semantic-querying-with-airflow-and-astronomer) Semantic Querying with Airflow and Astronomer +![Snow prints](/documentation/advanced-tutorials/snow-prints.png) -| Time: 45 min | Level: Intermediate | | | -| --- | --- | --- | --- | +### Image-to-Text -In this tutorial, you will use Qdrant as a [provider](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/stable/index.html) in [Apache Airflow](https://airflow.apache.org/), an open-source tool that lets you setup data-engineering workflows. +Now, let's do a reverse search with the following image: -You will write the pipeline as a DAG (Directed Acyclic Graph) in Python. With this, you can leverage the powerful suite of Python’s capabilities and libraries to achieve almost anything your data pipeline needs. +![Airplane](/documentation/advanced-tutorials/airplane.png) -[Astronomer](https://www.astronomer.io/) is a managed platform that simplifies the process of developing and deploying Airflow projects via its easy-to-use CLI and extensive automation capabilities. +```python +client.query_points( + collection_name=COLLECTION_NAME, + query=model.get_image_embedding("images/image-2.png"), + # Now we are searching only among text vectors with our image query + using="text", + with_payload=["caption"], + limit=1 +).points[0].payload['caption'] +``` -Airflow is useful when running operations in Qdrant based on data events or building parallel tasks for generating vector embeddings. By using Airflow, you can set up monitoring and alerts for your pipelines for full observability. +**Response:** -## [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#prerequisites) Prerequisites +```text +'An image about plane emergency safety.' +``` -Please make sure you have the following ready: +## Next steps -- A running Qdrant instance. We’ll be using a free instance from [https://cloud.qdrant.io](https://cloud.qdrant.io/) -- The Astronomer CLI. Find the installation instructions [here](https://docs.astronomer.io/astro/cli/install-cli). -- A [HuggingFace token](https://huggingface.co/docs/hub/en/security-tokens) to generate embeddings. +Use cases of even just Image & Text Multimodal Search are countless: E-Commerce, Media Management, Content Recommendation, Emotion Recognition Systems, Biomedical Image Retrieval, Spoken Sign Language Transcription, etc. -## [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#implementation) Implementation +Imagine a scenario: a user wants to find a product similar to a picture they have, but they also have specific textual requirements, like "*in beige colour*". 
You can search using just texts or images and combine their embeddings in a **late fusion manner** (summing and weighting might work surprisingly well). -We’ll be building a DAG that generates embeddings in parallel for our data corpus and performs semantic retrieval based on user input. +Moreover, using [Discovery Search](/articles/discovery-search/) with both modalities, you can provide users with information that is impossible to retrieve unimodally! -### [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#set-up-the-project) Set up the project +Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, experiment, and have fun! -The Astronomer CLI makes it very straightforward to set up the Airflow project: +<|page-131-lllmstxt|> +# How to Generate Sparse Vectors with SPLADE -```console -mkdir qdrant-airflow-tutorial && cd qdrant-airflow-tutorial -astro dev init +SPLADE is a novel method for learning sparse text representation vectors, outperforming BM25 in tasks like information retrieval and document classification. Its main advantage is generating efficient and interpretable sparse vectors, making it effective for large-scale text data. -``` +## Setup -This command generates all of the project files you need to run Airflow locally. You can find a directory called `dags`, which is where we can place our Python DAG files. +First, install FastEmbed. -To use Qdrant within Airflow, install the Qdrant Airflow provider by adding the following to the `requirements.txt` file +```python +pip install -q fastembed +``` -```text -apache-airflow-providers-qdrant +Next, import the required modules for sparse embeddings and Python’s typing module. +```python +from fastembed import SparseTextEmbedding, SparseEmbedding ``` -### [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#configure-credentials) Configure credentials +You may always check the list of all supported sparse embedding models. -We can set up provider connections using the Airflow UI, environment variables or the `airflow_settings.yml` file. +```python +SparseTextEmbedding.list_supported_models() +``` +This will return a list of models, each with its details such as model name, vocabulary size, description, and sources. -Add the following to the `.env` file in the project. Replace the values as per your credentials. +```python +[ + { + 'model': 'prithivida/Splade_PP_en_v1', + 'sources': {'hf': 'Qdrant/Splade_PP_en_v1', ...}, + 'model_file': 'model.onnx', + 'description': 'Independent Implementation of SPLADE++ Model for English.', + 'license': 'apache-2.0', + 'size_in_GB': 0.532, + 'vocab_size': 30522, + ... + }, + ... +] # part of the output was omitted +``` -```env -HUGGINGFACE_TOKEN="" -AIRFLOW_CONN_QDRANT_DEFAULT='{ - "conn_type": "qdrant", - "host": "xyz-example.eu-central.aws.cloud.qdrant.io:6333", - "password": "" -}' +Now, load the model. +```python +model_name = "prithivida/Splade_PP_en_v1" +# This triggers the model download +model = SparseTextEmbedding(model_name=model_name) ``` -### [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#add-the-data-corpus) Add the data corpus - -Let’s add some sample data to work with. Paste the following content into a file called `books.txt` file within the `include` directory. 
+## Embed data -```text -1 | To Kill a Mockingbird (1960) | fiction | Harper Lee's Pulitzer Prize-winning novel explores racial injustice and moral growth through the eyes of young Scout Finch in the Deep South. -2 | Harry Potter and the Sorcerer's Stone (1997) | fantasy | J.K. Rowling's magical tale follows Harry Potter as he discovers his wizarding heritage and attends Hogwarts School of Witchcraft and Wizardry. -3 | The Great Gatsby (1925) | fiction | F. Scott Fitzgerald's classic novel delves into the glitz, glamour, and moral decay of the Jazz Age through the eyes of narrator Nick Carraway and his enigmatic neighbour, Jay Gatsby. -4 | 1984 (1949) | dystopian | George Orwell's dystopian masterpiece paints a chilling picture of a totalitarian society where individuality is suppressed and the truth is manipulated by a powerful regime. -5 | The Catcher in the Rye (1951) | fiction | J.D. Salinger's iconic novel follows disillusioned teenager Holden Caulfield as he navigates the complexities of adulthood and society's expectations in post-World War II America. -6 | Pride and Prejudice (1813) | romance | Jane Austen's beloved novel revolves around the lively and independent Elizabeth Bennet as she navigates love, class, and societal expectations in Regency-era England. -7 | The Hobbit (1937) | fantasy | J.R.R. Tolkien's adventure follows Bilbo Baggins, a hobbit who embarks on a quest with a group of dwarves to reclaim their homeland from the dragon Smaug. -8 | The Lord of the Rings (1954-1955) | fantasy | J.R.R. Tolkien's epic fantasy trilogy follows the journey of Frodo Baggins to destroy the One Ring and defeat the Dark Lord Sauron in the land of Middle-earth. -9 | The Alchemist (1988) | fiction | Paulo Coelho's philosophical novel follows Santiago, an Andalusian shepherd boy, on a journey of self-discovery and spiritual awakening as he searches for a hidden treasure. -10 | The Da Vinci Code (2003) | mystery/thriller | Dan Brown's gripping thriller follows symbologist Robert Langdon as he unravels clues hidden in art and history while trying to solve a murder mystery with far-reaching implications. +You need to define a list of documents to be embedded. +```python +documents: list[str] = [ + "Chandrayaan-3 is India's third lunar mission", + "It aimed to land a rover on the Moon's surface - joining the US, China and Russia", + "The mission is a follow-up to Chandrayaan-2, which had partial success", + "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)", + "The estimated cost of the mission is around $35 million", + "It will carry instruments to study the lunar surface and atmosphere", + "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023", + "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.", + "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit", + "The mission used GSLV Mk III rocket for its launch", + "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota", + "Chandrayaan-3 was launched earlier in the year 2023", +] +``` +Then, generate sparse embeddings for each document. +Here,`batch_size` is optional and helps to process documents in batches. +```python +sparse_embeddings_list: list[SparseEmbedding] = list( + model.embed(documents, batch_size=6) +) ``` +## Retrieve embeddings -Now, the hacking part - writing our Airflow DAG! 
+`sparse_embeddings_list` contains sparse embeddings for the documents provided earlier. Each element in this list is a `SparseEmbedding` object that contains the sparse vector representation of a document. + +```python +index = 0 +sparse_embeddings_list[index] +``` -### [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#write-the-dag) Write the dag +This output is a `SparseEmbedding` object for the first document in our list. It contains two arrays: `values` and `indices`. - The `values` array represents the weights of the features (tokens) in the document. - The `indices` array represents the indices of these features in the model's vocabulary. -We’ll add the following content to a `books_recommend.py` file within the `dags` directory. Let’s go over what it does for each task. +Each pair of corresponding `values` and `indices` represents a token and its weight in the document. ```python -import os -import requests +SparseEmbedding(values=array([0.05297208, 0.01963477, 0.36459631, 1.38508618, 0.71776593, + 0.12667948, 0.46230844, 0.446771 , 0.26897505, 1.01519883, + 1.5655334 , 0.29412213, 1.53102326, 0.59785569, 1.1001817 , + 0.02079751, 0.09955651, 0.44249091, 0.09747757, 1.53519952, + 1.36765671, 0.15740395, 0.49882549, 0.38629025, 0.76612782, + 1.25805044, 0.39058095, 0.27236196, 0.45152301, 0.48262018, + 0.26085234, 1.35912788, 0.70710695, 1.71639752]), indices=array([ 1010, 1011, 1016, 1017, 2001, 2018, 2034, 2093, 2117, + 2319, 2353, 2509, 2634, 2686, 2796, 2817, 2922, 2959, + 3003, 3148, 3260, 3390, 3462, 3523, 3822, 4231, 4316, + 4774, 5590, 5871, 6416, 11926, 12076, 16469])) +``` -from airflow.decorators import dag, task -from airflow.models.baseoperator import chain -from airflow.models.param import Param -from airflow.providers.qdrant.hooks.qdrant import QdrantHook -from airflow.providers.qdrant.operators.qdrant import QdrantIngestOperator -from pendulum import datetime -from qdrant_client import models +## Examine weights -QDRANT_CONNECTION_ID = "qdrant_default" -DATA_FILE_PATH = "include/books.txt" -COLLECTION_NAME = "airflow_tutorial_collection" +Now, print the first 5 features and their weights for better understanding. -EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" -EMBEDDING_DIMENSION = 384 -SIMILARITY_METRIC = models.Distance.COSINE +```python +for i in range(5): + print(f"Token at index {sparse_embeddings_list[0].indices[i]} has weight {sparse_embeddings_list[0].values[i]}") +``` +The output will display the token indices and their corresponding weights for the first document. 
-def embed(text: str) -> list: - HUGGINFACE_URL = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMBEDDING_MODEL_ID}" - response = requests.post( - HUGGINFACE_URL, - headers={"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}, - json={"inputs": [text], "options": {"wait_for_model": True}}, - ) - return response.json()[0] +```python +Token at index 1010 has weight 0.05297207832336426 +Token at index 1011 has weight 0.01963476650416851 +Token at index 1016 has weight 0.36459630727767944 +Token at index 1017 has weight 1.385086178779602 +Token at index 2001 has weight 0.7177659273147583 +``` +## Analyze results -@dag( - dag_id="books_recommend", - start_date=datetime(2023, 10, 18), - schedule=None, - catchup=False, - params={"preference": Param("Something suspenseful and thrilling.", type="string")}, -) -def recommend_book(): - @task - def import_books(text_file_path: str) -> list: - data = [] - with open(text_file_path, "r") as f: - for line in f: - _, title, genre, description = line.split("|") - data.append( - { - "title": title.strip(), - "genre": genre.strip(), - "description": description.strip(), - } - ) +Let's use the tokenizer vocab to make sense of these indices. - return data +```python +import json +from tokenizers import Tokenizer - @task - def init_collection(): - hook = QdrantHook(conn_id=QDRANT_CONNECTION_ID) - if not hook.conn..collection_exists(COLLECTION_NAME): - hook.conn.create_collection( - COLLECTION_NAME, - vectors_config=models.VectorParams( - size=EMBEDDING_DIMENSION, distance=SIMILARITY_METRIC - ), - ) +tokenizer = Tokenizer.from_pretrained("Qdrant/Splade_PP_en_v1") +``` - @task - def embed_description(data: dict) -> list: - return embed(data["description"]) +The `get_tokens_and_weights` function takes a `SparseEmbedding` object and a `tokenizer` as input. It will construct a dictionary where the keys are the decoded tokens, and the values are their corresponding weights. - books = import_books(text_file_path=DATA_FILE_PATH) - embeddings = embed_description.expand(data=books) +```python +def get_tokens_and_weights(sparse_embedding, tokenizer): + token_weight_dict = {} + for i in range(len(sparse_embedding.indices)): + token = tokenizer.decode([sparse_embedding.indices[i]]) + weight = sparse_embedding.values[i] + token_weight_dict[token] = weight - qdrant_vector_ingest = QdrantIngestOperator( - conn_id=QDRANT_CONNECTION_ID, - task_id="qdrant_vector_ingest", - collection_name=COLLECTION_NAME, - payload=books, - vectors=embeddings, - ) + # Sort the dictionary by weights + token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)) + return token_weight_dict - @task - def embed_preference(**context) -> list: - user_mood = context["params"]["preference"] - response = embed(text=user_mood) +# Test the function with the first SparseEmbedding +print(json.dumps(get_tokens_and_weights(sparse_embeddings_list[index], tokenizer), indent=4)) +``` +## Dictionary output - return response +The dictionary is then sorted by weights in descending order. 
+```python +{ + "chandra": 1.7163975238800049, + "third": 1.5655333995819092, + "##ya": 1.535199522972107, + "india": 1.5310232639312744, + "3": 1.385086178779602, + "mission": 1.3676567077636719, + "lunar": 1.3591278791427612, + "moon": 1.2580504417419434, + "indian": 1.1001816987991333, + "##an": 1.015198826789856, + "3rd": 0.7661278247833252, + "was": 0.7177659273147583, + "spacecraft": 0.7071069478988647, + "space": 0.5978556871414185, + "flight": 0.4988254904747009, + "satellite": 0.4826201796531677, + "first": 0.46230843663215637, + "expedition": 0.4515230059623718, + "three": 0.4467709958553314, + "fourth": 0.44249090552330017, + "vehicle": 0.390580952167511, + "iii": 0.3862902522087097, + "2": 0.36459630727767944, + "##3": 0.2941221296787262, + "planet": 0.27236196398735046, + "second": 0.26897504925727844, + "missions": 0.2608523368835449, + "launched": 0.15740394592285156, + "had": 0.12667948007583618, + "largest": 0.09955651313066483, + "leader": 0.09747757017612457, + ",": 0.05297207832336426, + "study": 0.02079751156270504, + "-": 0.01963476650416851 +} +``` - @task - def search_qdrant( - preference_embedding: list, - ) -> None: - hook = QdrantHook(conn_id=QDRANT_CONNECTION_ID) +## Observations - result = hook.conn.query_points( - collection_name=COLLECTION_NAME, - query=preference_embedding, - limit=1, - with_payload=True, - ).points +- The relative order of importance is quite useful. The most important tokens in the sentence have the highest weights. +- **Term Expansion:** The model can expand the terms in the document. This means that the model can generate weights for tokens that are not present in the document but are related to the tokens in the document. This is a powerful feature that allows the model to capture the context of the document. Here, you'll see that the model has added the tokens '3' from 'third' and 'moon' from 'lunar' to the sparse vector. - print("Book recommendation: " + result[0].payload["title"]) - print("Description: " + result[0].payload["description"]) +## Design choices - chain( - init_collection(), - qdrant_vector_ingest, - search_qdrant(embed_preference()), - ) +- The weights are not normalized. This means that the sum of the weights is not 1 or 100. This is a common practice in sparse embeddings, as it allows the model to capture the importance of each token in the document. +- Tokens are included in the sparse vector only if they are present in the model's vocabulary. This means that the model will not generate a weight for tokens that it has not seen during training. +- Tokens do not map to words directly -- allowing you to gracefully handle typo errors and out-of-vocabulary tokens. -recommend_book() +<|page-132-lllmstxt|> +![deepseek-rag-qdrant](/documentation/examples/rag-deepseek/deepseek.png) -``` +# 5 Minute RAG with Qdrant and DeepSeek -`import_books`: This task reads a text file containing information about the books (like title, genre, and description), and then returns the data as a list of dictionaries. +| Time: 5 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/examples/blob/master/rag-with-qdrant-deepseek/deepseek-qdrant.ipynb) | +| --- | ----------- | ----------- |----------- | -`init_collection`: This task initializes a collection in the Qdrant database, where we will store the vector representations of the book descriptions. +This tutorial demonstrates how to build a **Retrieval-Augmented Generation (RAG)** pipeline using Qdrant as a vector storage solution and DeepSeek for semantic query enrichment. 
RAG pipelines enhance Large Language Model (LLM) responses by providing contextually relevant data. -`embed_description`: This is a dynamic task that creates one mapped task instance for each book in the list. The task uses the `embed` function to generate vector embeddings for each description. To use a different embedding model, you can adjust the `EMBEDDING_MODEL_ID`, `EMBEDDING_DIMENSION` values. +## Overview +In this tutorial, we will: +1. Take sample text and turn it into vectors with FastEmbed. +2. Send the vectors to a Qdrant collection. +3. Connect Qdrant and DeepSeek into a minimal RAG pipeline. +4. Ask DeepSeek different questions and test answer accuracy. +5. Enrich DeepSeek prompts with content retrieved from Qdrant. +6. Evaluate answer accuracy before and after. -`embed_user_preference`: Here, we take a user’s input and convert it into a vector using the same pre-trained model used for the book descriptions. +#### Architecture: -`qdrant_vector_ingest`: This task ingests the book data into the Qdrant collection using the [QdrantIngestOperator](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/1.0.0/), associating each book description with its corresponding vector embeddings. +![deepseek-rag-architecture](/documentation/examples/rag-deepseek/architecture.png) -`search_qdrant`: Finally, this task performs a search in the Qdrant database using the vectorized user preference. It finds the most relevant book in the collection based on vector similarity. +--- -### [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#run-the-dag) Run the DAG +## Prerequisites -Head over to your terminal and run -`astro dev start` +Ensure you have the following: +- Python environment (3.9+) +- Access to [Qdrant Cloud](https://qdrant.tech) +- A DeepSeek API key from [DeepSeek Platform](https://platform.deepseek.com/api_keys) + +## Setup Qdrant -A local Airflow container should spawn. You can now access the Airflow UI at [http://localhost:8080](http://localhost:8080/). Visit our DAG by clicking on `books_recommend`. -![DAG](https://qdrant.tech/documentation/examples/airflow/demo-dag.png) +```python +pip install "qdrant-client[fastembed]>=1.14.1" +``` -Hit the PLAY button on the right to run the DAG. You’ll be asked for input about your preference, with the default value already filled in. +[Qdrant](https://qdrant.tech) will act as a knowledge base providing the context information for the prompts we'll be sending to the LLM. -![Preference](https://qdrant.tech/documentation/examples/airflow/preference-input.png) +You can get a free-forever Qdrant cloud instance at http://cloud.qdrant.io. Learn about setting up your instance from the [Quickstart](https://qdrant.tech/documentation/quickstart-cloud/). -After your DAG run completes, you should be able to see the output of your search in the logs of the `search_qdrant` task. -![Output](https://qdrant.tech/documentation/examples/airflow/output.png) +```python +QDRANT_URL = "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333" +QDRANT_API_KEY = "" +``` -There you have it, an Airflow pipeline that interfaces with Qdrant! Feel free to fiddle around and explore Airflow. There are references below that might come in handy. 
+### Instantiating Qdrant Client -## [Anchor](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/\#further-reading) Further reading -- [Introduction to Airflow](https://docs.astronomer.io/learn/intro-to-airflow) -- [Airflow Concepts](https://docs.astronomer.io/learn/category/airflow-concepts) -- [Airflow Reference](https://airflow.apache.org/docs/) -- [Astronomer Documentation](https://docs.astronomer.io/) +```python +from qdrant_client import QdrantClient, models -##### Was this page useful? +client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Building the knowledge base -Thank you for your feedback! 🙏 +Qdrant will use vector embeddings of our facts to enrich the original prompt with some context. Thus, we need to store the vector embeddings and the facts used to generate them. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/qdrant-airflow-astronomer.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +We'll be using the [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model via [FastEmbed](https://github.com/qdrant/fastembed/) - A lightweight, fast, Python library for embeddings generation. -On this page: +The Qdrant client provides a handy integration with FastEmbed that makes building a knowledge base very straighforward. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/qdrant-airflow-astronomer.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +First, we need to create a collection, so Qdrant would know what vectors it will be dealing with, and then, we just pass our raw documents +wrapped into `models.Document` to compute and upload the embeddings. -× +```python +collection_name = "knowledge_base" +model_name = "BAAI/bge-small-en-v1.5" +client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE) +) +``` -[Powered by](https://qdrant.tech/) +```python +documents = [ + "Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!", + "Docker helps developers build, share, and run applications anywhere — without tedious environment configuration or management.", + "PyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing.", + "MySQL is an open-source relational database management system (RDBMS). A relational database organizes data into one or more data tables in which data may be related to each other; these relations help structure the data. SQL is a language that programmers use to create, modify and extract data from the relational database, as well as control user access to the database.", + "NGINX is a free, open-source, high-performance HTTP server and reverse proxy, as well as an IMAP/POP3 proxy server. 
NGINX is known for its high performance, stability, rich feature set, simple configuration, and low resource consumption.", + "FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.", + "SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining.", + "The cron command-line utility is a job scheduler on Unix-like operating systems. Users who set up and maintain software environments use cron to schedule jobs (commands or shell scripts), also known as cron jobs, to run periodically at fixed times, dates, or intervals.", +] +client.upsert( + collection_name=collection_name, + points=[ + models.PointStruct( + id=idx, + vector=models.Document(text=document, model=model_name), + payload={"document": document}, + ) + for idx, document in enumerate(documents) + ], +) +``` -<|page-90-lllmstxt|> -## datasets -- [Documentation](https://qdrant.tech/documentation/) -- Practice Datasets +## Setup DeepSeek -# [Anchor](https://qdrant.tech/documentation/datasets/\#common-datasets-in-snapshot-format) Common Datasets in Snapshot Format +RAG changes the way we interact with Large Language Models. We're converting a knowledge-oriented task, in which the model may create a counterfactual answer, into a language-oriented task. The latter expects the model to extract meaningful information and generate an answer. LLMs, when implemented correctly, are supposed to be carrying out language-oriented tasks. -You may find that creating embeddings from datasets is a very resource-intensive task. -If you need a practice dataset, feel free to pick one of the ready-made snapshots on this page. -These snapshots contain pre-computed vectors that you can easily import into your Qdrant instance. +The task starts with the original prompt sent by the user. The same prompt is then vectorized and used as a search query for the most relevant facts. Those facts are combined with the original prompt to build a longer prompt containing more information. -## [Anchor](https://qdrant.tech/documentation/datasets/\#available-datasets) Available datasets +But let's start simply by asking our question directly. -Our snapshots are usually generated from publicly available datasets, which are often used for -non-commercial or academic purposes. The following datasets are currently available. Please click -on a dataset name to see its detailed description. 
-| Dataset | Model | Vector size | Documents | Size | Qdrant snapshot | HF Hub | -| --- | --- | --- | --- | --- | --- | --- | -| [Arxiv.org titles](https://qdrant.tech/documentation/datasets/#arxivorg-titles) | [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) | 768 | 2.3M | 7.1 GB | [Download](https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) | -| [Arxiv.org abstracts](https://qdrant.tech/documentation/datasets/#arxivorg-abstracts) | [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) | 768 | 2.3M | 8.4 GB | [Download](https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/arxiv-abstracts-instructorxl-embeddings) | -| [Wolt food](https://qdrant.tech/documentation/datasets/#wolt-food) | [clip-ViT-B-32](https://huggingface.co/sentence-transformers/clip-ViT-B-32) | 512 | 1.7M | 7.9 GB | [Download](https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/wolt-food-clip-ViT-B-32-embeddings) | +```python +prompt = """ +What tools should I need to use to build a web service using vector embeddings for search? +""" +``` -Once you download a snapshot, you need to [restore it](https://qdrant.tech/documentation/concepts/snapshots/#restore-snapshot) -using the Qdrant CLI upon startup or through the API. +Using the Deepseek API requires providing the API key. You can obtain it from the [DeepSeek platform](https://platform.deepseek.com/api_keys). -## [Anchor](https://qdrant.tech/documentation/datasets/\#qdrant-on-hugging-face) Qdrant on Hugging Face +Now we can finally call the completion API. -[![HuggingFace](https://qdrant.tech/content/images/hf-logo-with-title.svg)](https://huggingface.co/Qdrant) -[Hugging Face](https://huggingface.co/) provides a platform for sharing and using ML models and -datasets. [Qdrant](https://huggingface.co/Qdrant) is one of the organizations there! We aim to -provide you with datasets containing neural embeddings that you can use to practice with Qdrant -and build your applications based on semantic search. **Please let us know if you’d like to see** -**a specific dataset!** +```python +import requests +import json -If you are not familiar with [Hugging Face datasets](https://huggingface.co/docs/datasets/index), -or would like to know how to combine it with Qdrant, please refer to the [tutorial](https://qdrant.tech/documentation/tutorials/huggingface-datasets/). +# Fill the environmental variable with your own Deepseek API key +# See: https://platform.deepseek.com/api_keys +API_KEY = "" -## [Anchor](https://qdrant.tech/documentation/datasets/\#arxivorg) Arxiv.org +HEADERS = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", +} -[Arxiv.org](https://arxiv.org/) is a highly-regarded open-access repository of electronic preprints in multiple -fields. Operated by Cornell University, arXiv allows researchers to share their findings with -the scientific community and receive feedback before they undergo peer review for formal -publication. Its archives host millions of scholarly articles, making it an invaluable resource -for those looking to explore the cutting edge of scientific research. 
With a high frequency of -daily submissions from scientists around the world, arXiv forms a comprehensive, evolving dataset -that is ripe for mining, analysis, and the development of future innovations. -### [Anchor](https://qdrant.tech/documentation/datasets/\#arxivorg-titles) Arxiv.org titles +def query_deepseek(prompt): + data = { + "model": "deepseek-chat", + "messages": [{"role": "user", "content": prompt}], + "stream": False, + } -This dataset contains embeddings generated from the paper titles only. Each vector has a -payload with the title used to create it, along with the DOI (Digital Object Identifier). + response = requests.post( + "https://api.deepseek.com/chat/completions", headers=HEADERS, data=json.dumps(data) + ) -```json -{ - "title": "Nash Social Welfare for Indivisible Items under Separable, Piecewise-Linear Concave Utilities", - "DOI": "1612.05191" -} + if response.ok: + result = response.json() + return result["choices"][0]["message"]["content"] + else: + raise Exception(f"Error {response.status_code}: {response.text}") ``` -The embeddings generated with InstructorXL model have been generated using the following -instruction: - -> Represent the Research Paper title for retrieval; Input: - -The following code snippet shows how to generate embeddings using the InstructorXL model: +and also the query ```python -from InstructorEmbedding import INSTRUCTOR +query_deepseek(prompt) +``` -model = INSTRUCTOR("hkunlp/instructor-xl") -sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments" -instruction = "Represent the Research Paper title for retrieval; Input:" -embeddings = model.encode([[instruction, sentence]]) +The response is: +```bash +"Building a web service that uses vector embeddings for search involves several components, including data processing, embedding generation, storage, search, and serving the service via an API. Below is a list of tools and technologies you can use for each step:\n\n---\n\n### 1. **Data Processing**\n - **Python**: For general data preprocessing and scripting.\n - **Pandas**: For handling tabular data.\n - **NumPy**: For numerical operations.\n - **NLTK/Spacy**: For text preprocessing (tokenization, stemming, etc.).\n - **LLM models**: For generating embeddings if you're using pre-trained models.\n\n---\n\n### 2. **Embedding Generation**\n - **Pre-trained Models**:\n - Embeddings (e.g., `text-embedding-ada-002`).\n - Hugging Face Transformers (e.g., `Sentence-BERT`, `all-MiniLM-L6-v2`).\n - Google's Universal Sentence Encoder.\n - **Custom Models**:\n - TensorFlow/PyTorch: For training custom embedding models.\n - **Libraries**:\n - `sentence-transformers`: For generating sentence embeddings.\n - `transformers`: For using Hugging Face models.\n\n---\n\n### 3. **Vector Storage**\n - **Vector Databases**:\n - Pinecone: Managed vector database for similarity search.\n - Weaviate: Open-source vector search engine.\n - Milvus: Open-source vector database.\n - FAISS (Facebook AI Similarity Search): Library for efficient similarity search.\n - Qdrant: Open-source vector search engine.\n - Redis with RedisAI: For storing and querying vectors.\n - **Traditional Databases with Vector Support**:\n - PostgreSQL with pgvector extension.\n - Elasticsearch with dense vector support.\n\n---\n\n### 4. 
**Search and Retrieval**\n - **Similarity Search Algorithms**:\n - Cosine similarity, Euclidean distance, or dot product for comparing vectors.\n - **Libraries**:\n - FAISS: For fast nearest-neighbor search.\n - Annoy (Approximate Nearest Neighbors Oh Yeah): For approximate nearest neighbor search.\n - **Vector Databases**: Most vector databases (e.g., Pinecone, Weaviate) come with built-in search capabilities.\n\n---\n\n### 5. **Web Service Framework**\n - **Backend Frameworks**:\n - Flask/Django/FastAPI (Python): For building RESTful APIs.\n - Node.js/Express: If you prefer JavaScript.\n - **API Documentation**:\n - Swagger/OpenAPI: For documenting your API.\n - **Authentication**:\n - OAuth2, JWT: For securing your API.\n\n---\n\n### 6. **Deployment**\n - **Containerization**:\n - Docker: For packaging your application.\n - **Orchestration**:\n - Kubernetes: For managing containers at scale.\n - **Cloud Platforms**:\n - AWS (EC2, Lambda, S3).\n - Google Cloud (Compute Engine, Cloud Functions).\n - Azure (App Service, Functions).\n - **Serverless**:\n - AWS Lambda, Google Cloud Functions, or Vercel for serverless deployment.\n\n---\n\n### 7. **Monitoring and Logging**\n - **Monitoring**:\n - Prometheus + Grafana: For monitoring performance.\n - **Logging**:\n - ELK Stack (Elasticsearch, Logstash, Kibana).\n - Fluentd.\n - **Error Tracking**:\n - Sentry.\n\n---\n\n### 8. **Frontend (Optional)**\n - **Frontend Frameworks**:\n - React, Vue.js, or Angular: For building a user interface.\n - **Libraries**:\n - Axios: For making API calls from the frontend.\n\n---\n\n### Example Workflow\n1. Preprocess your data (e.g., clean text, tokenize).\n2. Generate embeddings using a pre-trained model (e.g., Hugging Face).\n3. Store embeddings in a vector database (e.g., Pinecone or FAISS).\n4. Build a REST API using FastAPI or Flask to handle search queries.\n5. Deploy the service using Docker and Kubernetes or a serverless platform.\n6. Monitor and scale the service as needed.\n\n---\n\n### Example Tools Stack\n- **Embedding Generation**: Hugging Face `sentence-transformers`.\n- **Vector Storage**: Pinecone or FAISS.\n- **Web Framework**: FastAPI.\n- **Deployment**: Docker + AWS/GCP.\n\nBy combining these tools, you can build a scalable and efficient web service for vector embedding-based search." ``` -The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot). - -#### [Anchor](https://qdrant.tech/documentation/datasets/\#importing-the-dataset) Importing the dataset -The easiest way to use the provided dataset is to recover it via the API by passing the -URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following -code snippet shows how to create a new collection and fill it with the snapshot data: +### Extending the prompt -```http -PUT /collections/{collection_name}/snapshots/recover -{ - "location": "https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot" -} +Even though the original answer sounds credible, it didn't answer our question correctly. Instead, it gave us a generic description of an application stack. To improve the results, enriching the original prompt with the descriptions of the tools available seems like one of the possibilities. Let's use a semantic knowledge base to augment the prompt with the descriptions of different technologies! 
+```python +results = client.query_points( + collection_name=collection_name, + query=models.Document(text=prompt, model=model_name), + limit=3, +) +results ``` -### [Anchor](https://qdrant.tech/documentation/datasets/\#arxivorg-abstracts) Arxiv.org abstracts - -This dataset contains embeddings generated from the paper abstracts. Each vector has a -payload with the abstract used to create it, along with the DOI (Digital Object Identifier). - -```json -{ - "abstract": "Recently Cole and Gkatzelis gave the first constant factor approximation\nalgorithm for the problem of allocating indivisible items to agents, under\nadditive valuations, so as to maximize the Nash Social Welfare. We give\nconstant factor algorithms for a substantial generalization of their problem --\nto the case of separable, piecewise-linear concave utility functions. We give\ntwo such algorithms, the first using market equilibria and the second using the\ntheory of stable polynomials.\n In AGT, there is a paucity of methods for the design of mechanisms for the\nallocation of indivisible goods and the result of Cole and Gkatzelis seemed to\nbe taking a major step towards filling this gap. Our result can be seen as\nanother step in this direction.\n", - "DOI": "1612.05191" -} +Here is the response: +```bash +QueryResponse(points=[ + ScoredPoint(id=0, version=0, score=0.67437416, payload={'document': 'Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!'}, vector=None, shard_key=None, order_value=None), + ScoredPoint(id=6, version=0, score=0.63144326, payload={'document': 'SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining.'}, vector=None, shard_key=None, order_value=None), + ScoredPoint(id=5, version=0, score=0.6064749, payload={'document': 'FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.'}, vector=None, shard_key=None, order_value=None) +]) ``` -The embeddings generated with InstructorXL model have been generated using the following -instruction: -> Represent the Research Paper abstract for retrieval; Input: +We used the original prompt to perform a semantic search over the set of tool descriptions. Now we can use these descriptions to augment the prompt and create more context. -The following code snippet shows how to generate embeddings using the InstructorXL model: ```python -from InstructorEmbedding import INSTRUCTOR +context = "\n".join(r.payload['document'] for r in results.points) +context +``` -model = INSTRUCTOR("hkunlp/instructor-xl") -sentence = "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. 
Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train." -instruction = "Represent the Research Paper abstract for retrieval; Input:" -embeddings = model.encode([[instruction, sentence]]) +The response is: +```bash +'Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!\nFastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.\nPyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing.' ``` -The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot). -#### [Anchor](https://qdrant.tech/documentation/datasets/\#importing-the-dataset-1) Importing the dataset +Finally, let's build a metaprompt, the combination of the assumed role of the LLM, the original question, and the results from our semantic search that will force our LLM to use the provided context. -The easiest way to use the provided dataset is to recover it via the API by passing the -URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following -code snippet shows how to create a new collection and fill it with the snapshot data: +By doing this, we effectively convert the knowledge-oriented task into a language task and hopefully reduce the chances of hallucinations. It also should make the response sound more relevant. -```http -PUT /collections/{collection_name}/snapshots/recover -{ - "location": "https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot" -} -``` +```python +metaprompt = f""" +You are a software architect. +Answer the following question using the provided context. +If you can't find the answer, do not pretend you know it, but answer "I don't know". -## [Anchor](https://qdrant.tech/documentation/datasets/\#wolt-food) Wolt food +Question: {prompt.strip()} -Our [Food Discovery demo](https://food-discovery.qdrant.tech/) relies on the dataset of -food images from the Wolt app. Each point in the collection represents a dish with a single -image. The image is represented as a vector of 512 float numbers. 
There is also a JSON -payload attached to each point, which looks similar to this: +Context: +{context.strip()} -```json -{ - "cafe": { - "address": "VGX7+6R2 Vecchia Napoli, Valletta", - "categories": ["italian", "pasta", "pizza", "burgers", "mediterranean"], - "location": {"lat": 35.8980154, "lon": 14.5145106}, - "menu_id": "610936a4ee8ea7a56f4a372a", - "name": "Vecchia Napoli Is-Suq Tal-Belt", - "rating": 9, - "slug": "vecchia-napoli-skyparks-suq-tal-belt" - }, - "description": "Tomato sauce, mozzarella fior di latte, crispy guanciale, Pecorino Romano cheese and a hint of chilli", - "image": "https://wolt-menu-images-cdn.wolt.com/menu-images/610936a4ee8ea7a56f4a372a/005dfeb2-e734-11ec-b667-ced7a78a5abd_l_amatriciana_pizza_joel_gueller1.jpeg", - "name": "L'Amatriciana" -} +Answer: +""" +# Look at the full metaprompt +print(metaprompt) ``` -The embeddings generated with clip-ViT-B-32 model have been generated using the following -code snippet: +**Response:** -```python -from PIL import Image -from sentence_transformers import SentenceTransformer +```bash +You are a software architect. +Answer the following question using the provided context. +If you can't find the answer, do not pretend you know it, but answer "I don't know". + +Question: What tools should I need to use to build a web service using vector embeddings for search? + +Context: +Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more! +FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints. +PyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing. + +Answer: +``` -image_path = "5dbfd216-5cce-11eb-8122-de94874ad1c8_ns_takeaway_seelachs_ei_baguette.jpeg" +Our current prompt is much longer, and we also used a couple of strategies to make the responses even better: -model = SentenceTransformer("clip-ViT-B-32") -embedding = model.encode(Image.open(image_path)) +1. The LLM has the role of software architect. +2. We provide more context to answer the question. +3. If the context contains no meaningful information, the model shouldn't make up an answer. +Let's find out if that works as expected. + +**Question:** + +```python +query_deepseek(metaprompt) ``` +**Answer:** -The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot). +```bash +'To build a web service using vector embeddings for search, you can use the following tools:\n\n1. **Qdrant**: As a vector database and similarity search engine, Qdrant will handle the storage and retrieval of high-dimensional vectors. It provides an API service for searching and matching vectors, making it ideal for applications that require vector-based search functionality.\n\n2. **FastAPI**: This web framework is perfect for building the API layer of your web service. It is fast, easy to use, and based on Python type hints, which makes it a great choice for developing the backend of your service. FastAPI will allow you to expose endpoints that interact with Qdrant for vector search operations.\n\n3. 
**PyTorch**: If you need to generate vector embeddings from your data (e.g., text, images), PyTorch can be used to create and train neural network models that produce these embeddings. PyTorch is a powerful machine learning framework that supports a wide range of applications, including natural language processing and computer vision.\n\n### Summary:\n- **Qdrant** for vector storage and search.\n- **FastAPI** for building the web service API.\n- **PyTorch** for generating vector embeddings (if needed).\n\nThese tools together provide a robust stack for building a web service that leverages vector embeddings for search functionality.' +``` -#### [Anchor](https://qdrant.tech/documentation/datasets/\#importing-the-dataset-2) Importing the dataset +### Testing out the RAG pipeline -The easiest way to use the provided dataset is to recover it via the API by passing the -URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following -code snippet shows how to create a new collection and fill it with the snapshot data: +By leveraging the semantic context we provided our model is doing a better job answering the question. Let's enclose the RAG as a function, so we can call it more easily for different prompts. -```http -PUT /collections/{collection_name}/snapshots/recover -{ - "location": "https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot" -} -``` +```python +def rag(question: str, n_points: int = 3) -> str: + results = client.query_points( + collection_name=collection_name, + query=models.Document(text=question, model=model_name), + limit=n_points, + ) -##### Was this page useful? + context = "\n".join(r.payload["document"] for r in results.points) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + metaprompt = f""" + You are a software architect. + Answer the following question using the provided context. + If you can't find the answer, do not pretend you know it, but only answer "I don't know". -Thank you for your feedback! 🙏 + Question: {question.strip()} -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/datasets.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + Context: + {context.strip()} -On this page: + Answer: + """ -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/datasets.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + return query_deepseek(metaprompt) +``` -× +Now it's easier to ask a broad range of questions. -[Powered by](https://qdrant.tech/) +**Question:** -<|page-91-lllmstxt|> -## rapid-rag-optimization-with-qdrant-and-quotient -- [Articles](https://qdrant.tech/articles/) -- Optimizing RAG Through an Evaluation-Based Methodology +```python +rag("What can the stack for a web api look like?") +``` +**Answer:** -[Back to RAG & GenAI](https://qdrant.tech/articles/rag-and-genai/) +```bash +'The stack for a web API can include the following components based on the provided context:\n\n1. **Web Framework**: FastAPI can be used as the web framework for building the API. It is modern, fast, and leverages Python type hints for better development and performance.\n\n2. 
**Reverse Proxy/Web Server**: NGINX can be used as a reverse proxy or web server to handle incoming HTTP requests, load balancing, and serving static content. It is known for its high performance and low resource consumption.\n\n3. **Containerization**: Docker can be used to containerize the application, making it easier to build, share, and run the API consistently across different environments without worrying about configuration issues.\n\nThis stack provides a robust, scalable, and efficient setup for building and deploying a web API.' +``` -# Optimizing RAG Through an Evaluation-Based Methodology +**Question:** -Atita Arora +```python +rag("Where is the nearest grocery store?") +``` -· +**Answer:** -June 12, 2024 +```bash +"I don't know. The provided context does not contain any information about the location of the nearest grocery store." +``` -![Optimizing RAG Through an Evaluation-Based Methodology](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/preview/title.jpg) +Our model can now: -In today’s fast-paced, information-rich world, AI is revolutionizing knowledge management. The systematic process of capturing, distributing, and effectively using knowledge within an organization is one of the fields in which AI provides exceptional value today. +1. Take advantage of the knowledge in our vector datastore. +2. Answer, based on the provided context, that it can not provide an answer. -> The potential for AI-powered knowledge management increases when leveraging [Retrieval Augmented Generation (RAG)](https://qdrant.tech/rag/rag-evaluation-guide/), a methodology that enables LLMs to access a vast, diverse repository of factual information from knowledge stores, such as vector databases. +We have just shown a useful mechanism to mitigate the risks of hallucinations in Large Language Models. -This process enhances the accuracy, relevance, and reliability of generated text, thereby mitigating the risk of faulty, incorrect, or nonsensical results sometimes associated with traditional LLMs. This method not only ensures that the answers are contextually relevant but also up-to-date, reflecting the latest insights and data available. +<|page-133-lllmstxt|> +# Qdrant Web UI -While RAG enhances the accuracy, relevance, and reliability of traditional LLM solutions, **an evaluation strategy can further help teams ensure their AI products meet these benchmarks of success.** +You can manage both local and cloud Qdrant deployments through the Web UI. -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#relevant-tools-for-this-experiment) Relevant tools for this experiment +If you've set up a deployment locally with the Qdrant [Quickstart](/documentation/quick-start/), +navigate to http://localhost:6333/dashboard. -In this article, we’ll break down a RAG Optimization workflow experiment that demonstrates that evaluation is essential to build a successful RAG strategy. We will use Qdrant and Quotient for this experiment. +If you've set up a deployment in a cloud cluster, find your Cluster URL in your +cloud dashboard, at https://cloud.qdrant.io. Add `:6333/dashboard` to the end +of the URL. -[Qdrant](https://qdrant.tech/) is a vector database and vector similarity search engine designed for efficient storage and retrieval of high-dimensional vectors. 
Because Qdrant offers efficient indexing and searching capabilities, it is ideal for implementing RAG solutions, where quickly and accurately retrieving relevant information from extremely large datasets is crucial. Qdrant also offers a wealth of additional features, such as quantization, multivector support and multi-tenancy. +## Access the Web UI -Alongside Qdrant we will use Quotient, which provides a seamless way to evaluate your RAG implementation, accelerating and improving the experimentation process. +Qdrant's Web UI is an intuitive and efficient graphic interface for your Qdrant Collections, REST API and data points. -[Quotient](https://www.quotientai.co/) is a platform that provides tooling for AI developers to build [evaluation frameworks](https://qdrant.tech/rag/rag-evaluation-guide/) and conduct experiments on their products. Evaluation is how teams surface the shortcomings of their applications and improve performance in key benchmarks such as faithfulness, and semantic similarity. Iteration is key to building innovative AI products that will deliver value to end users. +In the **Console**, you may use the REST API to interact with Qdrant, while in **Collections**, you can manage all the collections and upload Snapshots. -> 💡 The [accompanying notebook](https://github.com/qdrant/qdrant-rag-eval/tree/master/workshop-rag-eval-qdrant-quotient) for this exercise can be found on GitHub for future reference. +![Qdrant Web UI](/articles_data/qdrant-1.3.x/web-ui.png) -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#summary-of-key-findings) Summary of key findings +### Qdrant Web UI features -1. **Irrelevance and Hallucinations**: When the documents retrieved are irrelevant, evidenced by low scores in both Chunk Relevance and Context Relevance, the model is prone to generating inaccurate or fabricated information. -2. **Optimizing Document Retrieval**: By retrieving a greater number of documents and reducing the chunk size, we observed improved outcomes in the model’s performance. -3. **Adaptive Retrieval Needs**: Certain queries may benefit from accessing more documents. Implementing a dynamic retrieval strategy that adjusts based on the query could enhance accuracy. -4. **Influence of Model and Prompt Variations**: Alterations in language models or the prompts used can significantly impact the quality of the generated responses, suggesting that fine-tuning these elements could optimize performance. +In the Qdrant Web UI, you can: -Let us walk you through how we arrived at these findings! +- Run HTTP-based calls from the console +- List and search existing [collections](/documentation/concepts/collections/) +- Learn from our interactive tutorial -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#building-a-rag-pipeline) Building a RAG pipeline +You can navigate to these options directly. For example, if you used our +[quick start](/documentation/quick-start/) to set up a cluster on localhost, +you can review our tutorial at http://localhost:6333/dashboard#/tutorial. -To evaluate a RAG pipeline, we will have to build a RAG Pipeline first. In the interest of simplicity, we are building a Naive RAG in this article. 
There are certainly other versions of RAG : +<|page-134-lllmstxt|> +# How to Generate ColBERT Multivectors with FastEmbed -![shades_of_rag.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/shades_of_rag.png) +## ColBERT -The illustration below depicts how we can leverage a [RAG Evaluation framework](https://qdrant.tech/rag/rag-evaluation-guide/) to assess the quality of RAG Application. +ColBERT is an embedding model that produces a matrix (multivector) representation of input text, +generating one vector per token (a token being a meaningful text unit for a machine learning model). +This approach allows ColBERT to capture more nuanced input semantics than many dense embedding models, +which represent an entire input with a single vector. By producing more granular input representations, +ColBERT becomes a strong retriever. However, this advantage comes at the cost of increased resource consumption compared to +traditional dense embedding models, both in terms of speed and memory. -![qdrant_and_quotient.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/qdrant_and_quotient.png) +Despite ColBERT being a powerful retriever, its speed limitation might make it less suitable for large-scale retrieval. +Therefore, we generally recommend using ColBERT for reranking a small set of already retrieved examples, rather than for first-stage retrieval. +A simple dense retriever can initially retrieve around 100-500 candidates, which can then be reranked with ColBERT to bring the most relevant results +to the top. -We are going to build a RAG application using Qdrant’s Documentation and the premeditated [hugging face dataset](https://huggingface.co/datasets/atitaarora/qdrant_doc). -We will then assess our RAG application’s ability to answer questions about Qdrant. +ColBERT is a considerable alternative of a reranking model to [cross-encoders](https://sbert.net/examples/applications/cross-encoder/README.html), since +it tends to be faster on inference time due to its `late interaction` mechanism. -To prepare our knowledge store we will use Qdrant, which can be leveraged in 3 different ways as below : +How does `late interaction` work? Cross-encoders ingest a query and a document glued together as one input. +A cross-encoder model divides this input into meaningful (for the model) parts and checks how these parts relate. +So, all interactions between the query and the document happen "early" inside the model. +Late interaction models, such as ColBERT, only do the first part, generating document and query parts suitable for comparison. +All interactions between these parts are expected to be done "later" outside the model. -```python -client = qdrant_client.QdrantClient( - os.environ.get("QDRANT_URL"), - api_key=os.environ.get("QDRANT_API_KEY"), -) +## Using ColBERT in Qdrant -``` +Qdrant supports [multivector representations](https://qdrant.tech/documentation/concepts/vectors/#multivectors) out of the box so that you can use any late interaction model as `ColBERT` or `ColPali` in Qdrant without any additional pre/post-processing. -We will be using [Qdrant Cloud](https://cloud.qdrant.io/login) so it is a good idea to provide the `QDRANT_URL` and `QDRANT_API_KEY` as environment variables for easier access. +This tutorial uses ColBERT as a first-stage retriever on a toy dataset. 
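For comparison, when ColBERT is used as a reranker rather than a retriever, the query becomes a two-stage call: a dense prefetch selects candidates and the ColBERT multivector rescores them. The sketch below is illustrative only and assumes a hypothetical collection (`movies_reranked`) configured with a dense vector named `dense` and a ColBERT multivector named `colbert`; this is not the single-vector layout built in this tutorial.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")  # stand-in; reuse your own client instance

query_text = "A movie for kids with fantasy elements and wonders"

# Two-stage query: dense prefetch for candidates, ColBERT (MaxSim) for reranking.
reranked = client.query_points(
    collection_name="movies_reranked",  # hypothetical collection with both vector types
    prefetch=models.Prefetch(
        query=models.Document(text=query_text, model="sentence-transformers/all-MiniLM-L6-v2"),
        using="dense",
        limit=100,  # first-stage candidates to rerank
    ),
    query=models.Document(text=query_text, model="colbert-ir/colbertv2.0"),
    using="colbert",  # second stage: multivector rescoring
    limit=10,
)
```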
+You can see how to use ColBERT as a reranker in our [multi-stage queries documentation](https://qdrant.tech/documentation/concepts/hybrid-queries/#multi-stage-queries). +## Setup -Moving on, we will need to define the collection name as : +Install `fastembed`. ```python -COLLECTION_NAME = "qdrant-docs-quotient" - +pip install fastembed ``` -In this case , we may need to create different collections based on the experiments we conduct. - -To help us provide seamless embedding creations throughout the experiment, we will use Qdrant’s own embeddings library [Fastembed](https://qdrant.github.io/fastembed/) which supports [many different models](https://qdrant.github.io/fastembed/examples/Supported_Models/) including dense as well as sparse vector models. - -Before implementing RAG, we need to prepare and index our data in Qdrant. +Imports late interaction models for text embedding. -This involves converting textual data into vectors using a suitable encoder (e.g., sentence transformers), and storing these vectors in Qdrant for retrieval. +```python +from fastembed import LateInteractionTextEmbedding +``` +You can list which late interaction models are supported in FastEmbed. ```python -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.docstore.document import Document as LangchainDocument +LateInteractionTextEmbedding.list_supported_models() +``` +This command displays the available models. The output shows details about the model, including output embedding dimensions, model description, model size, model sources, and model file. -## Load the dataset with qdrant documentation -dataset = load_dataset("atitaarora/qdrant_doc", split="train") +```python +[{'model': 'colbert-ir/colbertv2.0', + 'dim': 128, + 'description': 'Late interaction model', + 'size_in_GB': 0.44, + 'sources': {'hf': 'colbert-ir/colbertv2.0'}, + 'model_file': 'model.onnx'}, + {'model': 'answerdotai/answerai-colbert-small-v1', + 'dim': 96, + 'description': 'Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, 2024 year', + 'size_in_GB': 0.13, + 'sources': {'hf': 'answerdotai/answerai-colbert-small-v1'}, + 'model_file': 'vespa_colbert.onnx'}] +``` +Now, load the model. -## Dataset to langchain document -langchain_docs = [\ - LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})\ - for doc in dataset\ -] +```python +model_name = "colbert-ir/colbertv2.0" +embedding_model = LateInteractionTextEmbedding(model_name) +``` +The model files will be fetched and downloaded, with progress showing. -len(langchain_docs) +## Embed data -#Outputs -#240 +We will vectorize a toy movie description dataset with ColBERT: +
+ Movie description dataset + +```python +descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.", + "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.", + "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.", + "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.", + "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.", + "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.", + "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.", + "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.", + "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.", + "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.", + "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).", + "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.", + "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.", + "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.", + "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.", + "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.", + "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.", + "Story of 40-man Turkish task force who must defend a relay station.", + "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.", + "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] ``` +
-You can preview documents in the dataset as below : +The vectorization is done with an `embed` generator function. ```python -## Here's an example of what a document in our dataset looks like -print(dataset[100]['text']) - +descriptions_embeddings = list( + embedding_model.embed(descriptions) +) ``` +Let's check the size of one of the produced embeddings. -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#evaluation-dataset) Evaluation dataset - -To measure the quality of our RAG setup, we will need a representative evaluation dataset. This dataset should contain realistic questions and the expected answers. +```python +descriptions_embeddings[0].shape +``` -Additionally, including the expected contexts for which your RAG pipeline is designed to retrieve information would be beneficial. +We get the following result -We will be using a [prebuilt evaluation dataset](https://huggingface.co/datasets/atitaarora/qdrant_doc_qna). +```bash +(48, 128) +``` +That means that for the first description, we have **48** vectors of lengths **128** representing it. -If you are struggling to make an evaluation dataset for your use case , you can use your documents and some techniques described in this [notebook](https://github.com/qdrant/qdrant-rag-eval/blob/master/synthetic_qna/notebook/Synthetic_question_generation.ipynb) +## Upload embeddings to Qdrant -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#building-the-rag-pipeline) Building the RAG pipeline +Install `qdrant-client` -We establish the data preprocessing parameters essential for the RAG pipeline and configure the Qdrant vector database according to the specified criteria. +```python +pip install "qdrant-client>=1.14.2" +``` -Key parameters under consideration are: +Qdrant Client has a simple in-memory mode that allows you to experiment locally on small data volumes. +Alternatively, you could use for experiments [a free cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) in Qdrant Cloud. -- **Chunk size** -- **Chunk overlap** -- **Embedding model** -- **Number of documents retrieved (retrieval window)** +```python +from qdrant_client import QdrantClient, models -Following the ingestion of data in Qdrant, we proceed to retrieve pertinent documents corresponding to each query. These documents are then seamlessly integrated into our evaluation dataset, enriching the contextual information within the designated **`context`** column to fulfil the evaluation aspect. +qdrant_client = QdrantClient(":memory:") # Qdrant is running from RAM. +``` -Next we define methods to take care of logistics with respect to adding documents to Qdrant +Now, let's create a small [collection](https://qdrant.tech/documentation/concepts/collections/) with our movie data. +For that, we will use the [multivectors](https://qdrant.tech/documentation/concepts/vectors/#multivectors) functionality supported in Qdrant. +To configure multivector collection, we need to specify: +- similarity metric between vectors; +- the size of each vector (for ColBERT, it's **128**); +- similarity metric between multivectors (matrices), for example, `maximum`, so for vector from matrix A, we find the most similar vector from matrix B, and their similarity score will be out matrix similarity. 
```python -import uuid +qdrant_client.create_collection( + collection_name="movies", + vectors_config=models.VectorParams( + size=128, #size of each vector produced by ColBERT + distance=models.Distance.COSINE, #similarity metric between each vector + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM #similarity metric between multivectors (matrices) + ), + ), +) +``` +To make this collection human-readable, let's save movie metadata (name, description in text form and movie's length) together with an embedded description. + +
+ Movie metadata + +```python +metadata = [{"movie_name": "The Passion of Joan of Arc", "movie_watch_time_min": 114, "movie_description": "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions."}, +{"movie_name": "Sherlock Jr.", "movie_watch_time_min": 45, "movie_description": "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch."}, +{"movie_name": "Heat", "movie_watch_time_min": 170, "movie_description": "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist."}, +{"movie_name": "Kagemusha", "movie_watch_time_min": 162, "movie_description": "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place."}, +{"movie_name": "Kubo and the Two Strings", "movie_watch_time_min": 101, "movie_description": "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past."}, +{"movie_name": "Sardar Udham", "movie_watch_time_min": 164, "movie_description": "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre."}, +{"movie_name": "Paprika", "movie_watch_time_min": 90, "movie_description": "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it."}, +{"movie_name": "After Hours", "movie_watch_time_min": 97, "movie_description": "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop."}, +{"movie_name": "Udta Punjab", "movie_watch_time_min": 148, "movie_description": "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline."}, +{"movie_name": "Philomena", "movie_watch_time_min": 98, "movie_description": "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent."}, +{"movie_name": "Neon Genesis Evangelion: The End of Evangelion", "movie_watch_time_min": 87, "movie_description": "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995)."}, +{"movie_name": "The Dirty Dozen", "movie_watch_time_min": 150, "movie_description": "During World War II, a rebellious U.S. 
Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers."}, +{"movie_name": "Toy Story 3", "movie_watch_time_min": 103, "movie_description": "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home."}, +{"movie_name": "Edge of Tomorrow", "movie_watch_time_min": 113, "movie_description": "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies."}, +{"movie_name": "Some Like It Hot", "movie_watch_time_min": 121, "movie_description": "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in."}, +{"movie_name": "Snow White and the Seven Dwarfs", "movie_watch_time_min": 83, "movie_description": "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household."}, +{"movie_name": "It Happened One Night", "movie_watch_time_min": 105, "movie_description": "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops."}, +{"movie_name": "Nefes: Vatan Sagolsun", "movie_watch_time_min": 128, "movie_description": "Story of 40-man Turkish task force who must defend a relay station."}, +{"movie_name": "This Is Spinal Tap", "movie_watch_time_min": 82, "movie_description": "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour."}, +{"movie_name": "Let the Right One In", "movie_watch_time_min": 114, "movie_description": "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."}] +``` +
-from qdrant_client import models +```python +qdrant_client.upload_points( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload=metadata[idx], + vector=vector + ) + for idx, vector in enumerate(descriptions_embeddings) + ], +) +``` -def add_documents(client, collection_name, chunk_size, chunk_overlap, embedding_model_name): - """ - This function adds documents to the desired Qdrant collection given the specified RAG parameters. - """ + - ## Processing each document with desired TEXT_SPLITTER_ALGO, CHUNK_SIZE, CHUNK_OVERLAP - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - add_start_index=True, - separators=["\n\n", "\n", ".", " ", ""], - ) +
+ Upload with implicit embeddings computation - docs_processed = [] - for doc in langchain_docs: - docs_processed += text_splitter.split_documents([doc]) - ## Processing documents to be encoded by Fastembed - docs_contents = [] - docs_metadatas = [] +```python +description_documents = [models.Document(text=description, model=model_name) for description in descriptions] +qdrant_client.upload_points( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload=metadata[idx], + vector=description_document + ) + for idx, description_document in enumerate(description_documents) + ], +) +``` +
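Before querying, it can be useful to sanity-check the upload. A minimal check, assuming the in-memory client and the `movies` collection created above, is to count the stored points and peek at one payload:

```python
# The number of stored points should match the number of movie descriptions (20).
print(qdrant_client.count(collection_name="movies").count)

# Retrieve a single point (vectors omitted) to verify its metadata payload.
point = qdrant_client.retrieve(collection_name="movies", ids=[0], with_payload=True)[0]
print(point.payload["movie_name"])
```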
- for doc in docs_processed: - if hasattr(doc, 'page_content') and hasattr(doc, 'metadata'): - docs_contents.append(doc.page_content) - docs_metadatas.append(doc.metadata) - else: - # Handle the case where attributes are missing - print("Warning: Some documents do not have 'page_content' or 'metadata' attributes.") +## Querying - print("processed: ", len(docs_processed)) - print("content: ", len(docs_contents)) - print("metadata: ", len(docs_metadatas)) +ColBERT uses two distinct methods for embedding documents and queries, as do we in Fastembed. However, we altered query pre-processing used in ColBERT, so we don't have to cut all queries after 32-token length but ingest longer queries directly. - if not client.collection_exists(collection_name): - client.create_collection( - collection_name=collection_name, - vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE), - ) +```python +qdrant_client.query_points( + collection_name="movies", + query=list(embedding_model.query_embed("A movie for kids with fantasy elements and wonders"))[0], #converting generator object into numpy.ndarray + limit=1, #How many closest to the query movies we would like to get + #with_vectors=True, #If this option is used, vectors will also be returned + with_payload=True #So metadata is provided in the output +) +``` - client.upsert( - collection_name=collection_name, - points=[\ - models.PointStruct(\ - id=uuid.uuid4().hex,\ - vector=models.Document(text=content, model=embedding_model_name),\ - payload={"metadata": metadata, "document": content},\ - )\ - for metadata, content in zip(docs_metadatas, docs_contents)\ - ], - ) + -``` +
+ Query points with implicit embeddings computation -and retrieving documents from Qdrant during our RAG Pipeline assessment. ```python -def get_documents(collection_name, query, num_documents=3): - """ - This function retrieves the desired number of documents from the Qdrant collection given a query. - It returns a list of the retrieved documents. - """ - search_results = client.query_points( - collection_name=collection_name, - query=models.Document(text=query, model=embedding_model_name), - limit=num_documents, - ).points +query_document = models.Document(text="A movie for kids with fantasy elements and wonders", model=model_name) +qdrant_client.query_points( + collection_name="movies", + query=query_document, + limit=1, +) +``` +
- results = [r.payload["document"] for r in search_results] - return results +The result is the following: +```bash +QueryResponse(points=[ScoredPoint(id=4, version=0, score=12.063469, +payload={'movie_name': 'Kubo and the Two Strings', 'movie_watch_time_min': 101, +'movie_description': 'A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.'}, +vector=None, shard_key=None, order_value=None)]) ``` -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#setting-up-quotient) Setting up Quotient +<|page-135-lllmstxt|> +![n8n-qdrant](/documentation/examples/qdrant-n8n-2/cover.png) -You will need an account log in, which you can get by requesting access on [Quotient’s website](https://www.quotientai.co/). Once you have an account, you can create an API key by running the `quotient authenticate` CLI command. +# Automating Processes with Qdrant and n8n beyond simple RAG -**Once you have your API key, make sure to set it as an environment variable called `QUOTIENT_API_KEY`** +| Time: 45 min | Level: Intermediate | +| --- | ----------- | -```python -# Import QuotientAI client and connect to QuotientAI -from quotientai.client import QuotientClient -from quotientai.utils import show_job_progress +This tutorial shows how to combine Qdrant with [n8n](https://n8n.io/) low-code automation platform to cover **use cases beyond basic Retrieval-Augmented Generation (RAG)**. You'll learn how to use vector search for **recommendations** and **unstructured big data analysis**. -# IMPORTANT: be sure to set your API key as an environment variable called QUOTIENT_API_KEY -# You will need this set before running the code below. You may also uncomment the following line and insert your API key: -# os.environ['QUOTIENT_API_KEY'] = "YOUR_API_KEY" + -quotient = QuotientClient() +## Setting Up Qdrant in n8n -``` +To start using Qdrant with n8n, you need to provide your Qdrant instance credentials in the [credentials](https://docs.n8n.io/integrations/builtin/credentials/qdrant/#using-api-key) tab. Select `QdrantApi` from the list. -**QuotientAI** provides a seamless way to integrate _RAG evaluation_ into your applications. Here, we’ll see how to use it to evaluate text generated from an LLM, based on retrieved knowledge from the Qdrant vector database. +### Qdrant Cloud -After retrieving the top similar documents and populating the `context` column, we can submit the evaluation dataset to Quotient and execute an evaluation job. To run a job, all you need is your evaluation dataset and a `recipe`. +To connect [Qdrant Cloud](https://qdrant.tech/documentation/cloud/) to n8n: +1. Open the [Cloud Dashboard](https://qdrant.to/cloud) and select a cluster. +2. From the **Cluster Details**, copy the `Endpoint` address—this will be used as the `Qdrant URL` in n8n. +3. Navigate to the **API Keys** tab and copy your API key—this will be the `API Key` in n8n. -_**A recipe is a combination of a prompt template and a specified LLM.**_ +For a walkthrough, see this [step-by-step video guide](https://youtu.be/fYMGpXyAsfQ?feature=shared&t=177). -**Quotient** orchestrates the evaluation run and handles version control and asset management throughout the experimentation process. 
+### Local Mode -_**Prior to assessing our RAG solution, it’s crucial to outline our optimization goals.**_ +For a fully local experimnets-driven setup, a valuable option is n8n's [Self-hosted AI Starter Kit](https://github.com/n8n-io/self-hosted-ai-starter-kit). This is an open-source Docker Compose template for local AI & low-code development environment. -In the context of _question-answering on Qdrant documentation_, our focus extends beyond merely providing helpful responses. Ensuring the absence of any _inaccurate or misleading information_ is paramount. +This kit includes a [local instance of Qdrant](https://qdrant.tech/documentation/quickstart/). To get started: -In other words, **we want to minimize hallucinations** in the LLM outputs. +1. Follow the instructions in the repository to install the AI Starter Kit. +2. Use the values from the `docker-compose.yml` file to fill in the connection details. -For our evaluation, we will be considering the following metrics, with a focus on **Faithfulness**: + -- **Context Relevance** -- **Chunk Relevance** -- **Faithfulness** -- **ROUGE-L** -- **BERT Sentence Similarity** -- **BERTScore** +The default Qdrant configuration in AI Starter Kit's `docker-compose.yml` looks like this: -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#evaluation-in-action) Evaluation in action +```yaml +qdrant: + image: qdrant/qdrant + hostname: qdrant + container_name: qdrant + networks: ['demo'] + restart: unless-stopped + ports: + - 6333:6333 + volumes: + - qdrant_storage:/qdrant/storage +``` -The function below takes an evaluation dataset as input, which in this case contains questions and their corresponding answers. It retrieves relevant documents based on the questions in the dataset and populates the context field with this information from Qdrant. The prepared dataset is then submitted to QuotientAI for evaluation for the chosen metrics. After the evaluation is complete, the function displays aggregated statistics on the evaluation metrics followed by the summarized evaluation results. +From this configuration, the `Qdrant URL` in n8n Qdrant credentials is `http://qdrant:6333/`. +To set up a local Qdrant API key, add the following lines to the YAML file: -```python -def run_eval(eval_df, collection_name, recipe_id, num_docs=3, path="eval_dataset_qdrant_questions.csv"): - """ - This function evaluates the performance of a complete RAG pipeline on a given evaluation dataset. +```yaml +qdrant: + ... + volumes: + - qdrant_storage:/qdrant/storage + environment: + - QDRANT_API_KEY=test +``` - Given an evaluation dataset (containing questions and ground truth answers), - this function retrieves relevant documents, populates the context field, and submits the dataset to QuotientAI for evaluation. - Once the evaluation is complete, aggregated statistics on the evaluation metrics are displayed. +After saving the configuration and running the Starter Kit, use `QDRANT_API_KEY` value (e.g., `test`) as the `API Key` and `http://qdrant:6333/` as the `Qdrant URL`. - The evaluation results are returned as a pandas dataframe. 
- """ +## Qdrant + n8n Beyond Simple Similarity Search - # Add context to each question by retrieving relevant documents - eval_df['documents'] = eval_df.apply(lambda x: get_documents(collection_name=collection_name, - query=x['input_text'], - num_documents=num_docs), axis=1) - eval_df['context'] = eval_df.apply(lambda x: "\n".join(x['documents']), axis=1) +Vector search's ability to determine semantic similarity between objects is often used to address models' hallucinations, powering the memory of Retrieval-Augmented Generation-based applications. Yet there's more to vector search than just a "knowledge base" role. - # Now we'll save the eval_df to a CSV - eval_df.to_csv(path, index=False) +The combination of similarity and dissimilarity metrics in vector space expands vector search to recommendations, discovery search, and large-scale unstructured data analysis. - # Upload the eval dataset to QuotientAI - dataset = quotient.create_dataset( - file_path=path, - name="qdrant-questions-eval-v1", - ) +![overview](/documentation/examples/qdrant-n8n-2/overview.png) - # Create a new task for the dataset - task = quotient.create_task( - dataset_id=dataset['id'], - name='qdrant-questions-qa-v1', - task_type='question_answering' - ) +### Recommendations - # Run a job to evaluate the model - job = quotient.create_job( - task_id=task['id'], - recipe_id=recipe_id, - num_fewshot_examples=0, - limit=500, - metric_ids=[5, 7, 8, 11, 12, 13, 50], - ) +When searching for new music, films, books, or food, it can be difficult to articulate exactly what we want. Instead, we often rely on discovering new content through comparison to examples of what we like or dislike. - # Show the progress of the job - show_job_progress(quotient, job['id']) +The [Qdrant Recommendation API](https://qdrant.tech/articles/new-recommendation-api/) is built to make these discovery searches possible by using positive and negative examples as anchors. It helps find new relevant results based on your preferences. - # Once the job is complete, we can get our results - data = quotient.get_eval_results(job_id=job['id']) +![recommendations](/documentation/examples/qdrant-n8n-2/recommendations.png) - # Add the results to a pandas dataframe to get statistics on performance - df = pd.json_normalize(data, "results") - df_stats = df[df.columns[df.columns.str.contains("metric|completion_time")]] +#### Movie Recommendations +Imagine a home cinema night—you've already watched Harry Potter 666 times and crave a new series featuring young wizards. Your favorite streaming service repetitively recommends all seven parts of the millennial saga. Frustrated, you turn to n8n to create an **Agentic Movie Recommendation tool**. - df.columns = df.columns.str.replace("metric.", "") - df_stats.columns = df_stats.columns.str.replace("metric.", "") +**Setup:** +1. **Dataset**: We use movie descriptions from the [IMDB Top 1000 Kaggle dataset](https://www.kaggle.com/datasets/omarhanyy/imdb-top-1000). +2. **Embedding Model**: We'll use OpenAI `text-embedding-3-small`, but you can opt for any other suitable embedding model. 
- metrics = { - 'completion_time_ms':'Completion Time (ms)', - 'chunk_relevance': 'Chunk Relevance', - 'selfcheckgpt_nli_relevance':"Context Relevance", - 'selfcheckgpt_nli':"Faithfulness", - 'rougeL_fmeasure':"ROUGE-L", - 'bert_score_f1':"BERTScore", - 'bert_sentence_similarity': "BERT Sentence Similarity", - 'completion_verbosity':"Completion Verbosity", - 'verbosity_ratio':"Verbosity Ratio",} +**Workflow:** - df = df.rename(columns=metrics) - df_stats = df_stats.rename(columns=metrics) +A [Template Agentic Movie Recommendation Workflow](https://n8n.io/workflows/2440-building-rag-chatbot-for-movie-recommendations-with-qdrant-and-open-ai/) consists of three parts: - display(df_stats[metrics.values()].describe()) +1. **Movie Data Uploader**: Embeds movie descriptions and uploads them to Qdrant using the [Qdrant Vector Store Node](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.vectorstoreqdrant) (now this can also be done using the [official Qdrant Node for n8n](https://github.com/qdrant/n8n-nodes-qdrant)). In the template workflow, the dataset is fetched from GitHub, but you can use any supported storage, for example [Google Cloud Storage node](https://docs.n8n.io/integrations/builtin/app-nodes/n8n-nodes-base.googlecloudstorage). +2. **AI Agent**: Uses the [AI Agent Node](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.agent) to formulate Recommendation API calls based on your natural language requests. Choose an LLM as a "brain" and define a [JSON schema](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.toolworkflow/#specify-input-schema) for the recommendations tool powered by Qdrant. This schema lets the LLM map your requests to the tool input format. +3. **Recommendations Tool**: A [subworkflow](https://docs.n8n.io/flow-logic/subworkflows/) that calls the Qdrant Recommendation API using the [HTTP Request Node](https://docs.n8n.io/integrations/builtin/core-nodes/n8n-nodes-base.httprequest) (now this can also be done using the [official Qdrant Node for n8n](https://github.com/qdrant/n8n-nodes-qdrant)). The agent extracts relevant and irrelevant movie descriptions from your chat message and passes them to the tool. The tool embeds them with `text-embedding-3-small` and uses the Qdrant Recommendation API to get movie recommendations, which are passed back to the agent. - return df +Set it up, run a chat and ask for "*something about wizards but not Harry Potter*." +What results do you get? -main_metrics = [\ - 'Context Relevance',\ - 'Chunk Relevance',\ - 'Faithfulness',\ - 'ROUGE-L',\ - 'BERT Sentence Similarity',\ - 'BERTScore',\ - ] +--- -``` +If you'd like a detailed walkthrough of building this workflow step-by-step, watch the video below: -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experimentation) Experimentation + -Our approach is rooted in the belief that improvement thrives in an environment of exploration and discovery. By systematically testing and tweaking various components of the RAG pipeline, we aim to incrementally enhance its capabilities and performance. +This recommendation scenario is easily adaptable to any language or data type (images, audio, video). -In the following section, we dive into the details of our experimentation process, outlining the specific experiments conducted and the insights gained. 
+### Big Data Analysis -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experiment-1---baseline) Experiment 1 - Baseline +The ability to map data to a vector space that reflects items' similarity and dissimilarity relationships provides a range of mathematical tools for data analysis. -Parameters +Vector search dedicated solutions are built to handle billions of data points and quickly compute distances between them, simplifying **clustering, classification, dissimilarity sampling, deduplication, interpolation**, and **anomaly detection at scale**. -- **Embedding Model: `bge-small-en`** -- **Chunk size: `512`** -- **Chunk overlap: `64`** -- **Number of docs retrieved (Retireval Window): `3`** -- **LLM: `Mistral-7B-Instruct`** +The combination of this vector search feature with automation tools like n8n creates production-level solutions capable of monitoring data temporal shifts, managing data drift, and discovering patterns in seemingly unstructured data. -We’ll process our documents based on configuration above and ingest them into Qdrant using `add_documents` method introduced earlier +A practical example is worth a thousand words. Let's look at **Qdrant-based anomaly detection and classification tools**, which are designed to be used by the [n8n AI Agent node](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.agent) for data analysis automation. -```python -#experiment1 - base config -chunk_size = 512 -chunk_overlap = 64 -embedding_model_name = "BAAI/bge-small-en" -num_docs = 3 +To make it more interesting, this time we'll focus on image data. -COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}" +#### Anomaly Detection Tool -add_documents(client, - collection_name=COLLECTION_NAME, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - embedding_model_name=embedding_model_name) +One definition of "anomaly" comes intuitively after projecting vector representations of data points into a 2D space—[Qdrant webUI](https://qdrant.tech/documentation/web-ui/) provides this functionality. -#Outputs -#processed: 4504 -#content: 4504 -#metadata: 4504 +Points that don't belong to any clusters are more likely to be anomalous. -``` +![anomalies-on-2D](/documentation/examples/qdrant-n8n-2/anomalies-2D.png) -Notice the `COLLECTION_NAME` which helps us segregate and identify our collections based on the experiments conducted. +With that intuition comes the recipe for building an anomaly detection tool. We will demonstrate it on anomaly detection in agricultural crops. Qdrant will be used to: +1. Store vectorized images. +2. Identify a "center" (representative) for each crop cluster. +3. Define the borders of each cluster. +4. Check if new images fall within these boundaries. If an image does not fit within any cluster, it is flagged as anomalous. Alternatively, you can check if an image is anomalous to a specific cluster. -To proceed with the evaluation, let’s create the `evaluation recipe` up next +![anomaly-detection](/documentation/examples/qdrant-n8n-2/anomaly-detection.png) -```python -# Create a recipe for the generator model and prompt template -recipe_mistral = quotient.create_recipe( - model_id=10, - prompt_template_id=1, - name='mistral-7b-instruct-qa-with-rag', - description='Mistral-7b-instruct using a prompt template that includes context.' -) -recipe_mistral +**Setup:** +1. 
**Dataset**: We use the [Agricultural Crops Image Classification dataset](https://www.kaggle.com/datasets/mdwaquarazam/agricultural-crops-image-classification). +2. **Embedding Model**: The [Voyage AI multimodal embedding model](https://docs.voyageai.com/docs/multimodal-embeddings). It can project images and text data into a shared vector space. -#Outputs recipe JSON with the used prompt template -#'prompt_template': {'id': 1, -# 'name': 'Default Question Answering Template', -# 'variables': '["input_text","context"]', -# 'created_at': '2023-12-21T22:01:54.632367', -# 'template_string': 'Question: {input_text}\\n\\nContext: {context}\\n\\nAnswer:', -# 'owner_profile_id': None} +**1. Uploading Images to Qdrant** -``` +Since the [Qdrant Vector Store node](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.vectorstoreqdrant/) does not support embedding models outside the predefined list (which doesn't include Voyage AI), we embed and upload data to Qdrant via direct API calls in [HTTP Request nodes](https://docs.n8n.io/integrations/builtin/core-nodes/n8n-nodes-base.httprequest/). -To get a list of your existing recipes, you can simply run: +With the release of the [official Qdrant node](https://github.com/qdrant/n8n-nodes-qdrant), which supports arbitrary vectorized input, the HTTP Request node can now be replaced with this native integration. -```python -quotient.list_recipes() +**Workflow:** -``` +*There are three workflows: (1) Uploading images to Qdrant (2) Setting up cluster centers and thresholds (3) Anomaly detection tool itself.* -Notice the recipe template is a simplest prompt using `Question` from evaluation template `Context` from document chunks retrieved from Qdrant and `Answer` generated by the pipeline. +An [1/3 Uploading Images to Qdrant Template Workflow](https://n8n.io/workflows/2654-vector-database-as-a-big-data-analysis-tool-for-ai-agents-13-anomaly12-knn/) consists of the following blocks: -To kick off the evaluation +1. **Check Collection**: Verifies if a collection with the specified name exists in Qdrant. If not, it creates one. +2. **Payload Index**: Adds a [payload index](https://qdrant.tech/documentation/concepts/indexing/#payload-index) on the `crop_name` payload (metadata) field. This field stores crop class labels, and indexing it improves the speed of filterable searches in Qdrant. It changes the way a vector index is constructed, adapting it for fast vector search under filtering constraints. For more details, refer to this [guide on filtering in Qdrant](https://qdrant.tech/articles/vector-search-filtering/). +3. **Fetch Images**: Fetches images from Google Cloud Storage using the [Google Cloud Storage node](https://docs.n8n.io/integrations/builtin/app-nodes/n8n-nodes-base.googlecloudstorage). +4. **Generate IDs**: Assigns UUIDs to each data point. +5. **Embed Images**: Embeds the images using the Voyage API. +6. **Batch Upload**: Uploads the embeddings to Qdrant in batches. -```python -# Kick off an evaluation job -experiment_1 = run_eval(eval_df, - collection_name=COLLECTION_NAME, - recipe_id=recipe_mistral['id'], - num_docs=num_docs, - path=f"{COLLECTION_NAME}_{num_docs}_mistral.csv") +**2. Defining a Cluster Representative** -``` +We used two approaches (it's not an exhaustive list) to defining a cluster representative, depending on the availability of labeled data: -This may take few minutes (depending on the size of evaluation dataset!) 
+| Method | Description | +|----------------------|-----------------------------------------------------------------------------| +| **Medoids** | A point within the cluster that has the smallest total distance to all other cluster points. This approach needs labeled data for each cluster. | +| **Perfect Representative** | A representative defined by a textual description of the ideal cluster member—the multimodality of Voyage AI embeddings allows for this trick. For example, for cherries: *"Small, glossy red fruits on a medium-sized tree with slender branches and serrated leaves."* The closest image to this description in the vector space is selected as the representative. This method requires experimentation to align descriptions with real data. | -We can look at the results from our first (baseline) experiment as below : +**Workflow:** -![experiment1_eval.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment1_eval.png) +Both methods are demonstrated in the [2/3 Template Workflow for Anomaly Detection](https://n8n.io/workflows/2655-vector-database-as-a-big-data-analysis-tool-for-ai-agents-23-anomaly/). -Notice that we have a pretty **low average Chunk Relevance** and **very large standard deviations for both Chunk Relevance and Context Relevance**. +| **Method** | **Steps** | +|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Medoids** | 1. Sample labeled cluster points from Qdrant.
2. Compute a **pairwise distance matrix** for the cluster using Qdrant's [Distance Matrix API](https://qdrant.tech/documentation/concepts/explore/?q=distance+#distance-matrix). This API enables scalable cluster analysis and exploration of the relationships between data points. Learn more in [this article](https://qdrant.tech/articles/distance-based-exploration/).
3. For each point, calculate the sum of its distances to all other points. The point with the smallest total distance (or the highest total similarity when using the COSINE metric) is the medoid.
4. Mark this point as the cluster representative. | +| **Perfect Representative** | 1. Define textual descriptions for each cluster (e.g., AI-generated).
2. Embed these descriptions using Voyage.
3. Find the image embedding closest to the description embedding.
4. Mark this image as the cluster representative. | -Let’s take a look at some of the lower performing datapoints with **poor Faithfulness**: +**3. Defining the Cluster Border** -```python -with pd.option_context('display.max_colwidth', 0): - display(experiment_1[['content.input_text', 'content.answer','content.documents','Chunk Relevance','Context Relevance','Faithfulness']\ - ].sort_values(by='Faithfulness').head(2)) +**Workflow:** -``` +The approach demonstrated in [2/3 Template Workflow for Anomaly Detection](https://n8n.io/workflows/2655-vector-database-as-a-big-data-analysis-tool-for-ai-agents-23-anomaly/) works similarly for both types of cluster representatives. -![experiment1_bad_examples.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment1_bad_examples.png) +1. Within a cluster, identify the furthest data point from the cluster representative (it can also be the 2nd or Xth furthest point; the best way to define it is through experimentation—for us, the 5th furthest point worked well). Since we use COSINE similarity, this is equivalent to the most similar point to the [opposite](https://mathinsight.org/image/vector_opposite) of the cluster representative (its vector multiplied by -1). +2. Save the distance between the representative and respective furthest point as the cluster border (threshold). -In instances where the retrieved documents are **irrelevant (where both Chunk Relevance and Context Relevance are low)**, the model also shows **tendencies to hallucinate** and **produce poor quality responses**. +**4. Anomaly Detection Tool** -The quality of the retrieved text directly impacts the quality of the LLM-generated answer. Therefore, our focus will be on enhancing the RAG setup by **adjusting the chunking parameters**. +**Workflow:** -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experiment-2---adjusting-the-chunk-parameter) Experiment 2 - Adjusting the chunk parameter +With the preparatory steps complete, you can set up the anomaly detection tool, demonstrated in the [3/3 Template Workflow for Anomaly Detection](https://n8n.io/workflows/2656-vector-database-as-a-big-data-analysis-tool-for-ai-agents-33-anomaly/). -Keeping all other parameters constant, we changed the `chunk size` and `chunk overlap` to see if we can improve our results. +Steps: +1. Choose the method of the cluster representative definition. +2. Fetch all the clusters to compare the candidate image against. +3. Using Voyage AI, embed the candidate image in the same vector space. +4. Calculate the candidate's similarity to each cluster representative. The image is flagged as anomalous if the similarity is below the threshold for all clusters (outside the cluster borders). Alternatively, you can check if it's anomalous to a particular cluster, for example, the cherries one. -Parameters : +--- -- **Embedding Model : `bge-small-en`** -- **Chunk size: `1024`** -- **Chunk overlap: `128`** -- **Number of docs retrieved (Retireval Window): `3`** -- **LLM: `Mistral-7B-Instruct`** +Anomaly detection in image data has diverse applications, including: +- Moderation of advertisements. +- Anomaly detection in vertical farming. +- Quality control in the food industry, such as [detecting anomalies in coffee beans](https://qdrant.tech/articles/detecting-coffee-anomalies/). +- Identifying anomalies in map tiles for tasks like automated map updates or ecological monitoring. 
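+
+Whatever the domain, the core of the check stays the same. Below is a condensed Python sketch of step 4 of the anomaly detection tool, assuming the cluster representatives and per-cluster thresholds have already been computed and stored; the 3-dimensional vectors are purely illustrative stand-ins for real Voyage embeddings.
+
+```python
+import numpy as np
+
+def cosine_similarity(a, b):
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+def is_anomalous(candidate, clusters):
+    """Flag the candidate as anomalous if it falls outside every cluster border."""
+    for name, cluster in clusters.items():
+        similarity = cosine_similarity(candidate, cluster["representative"])
+        if similarity >= cluster["threshold"]:
+            return False  # inside at least one cluster border -> not anomalous
+    return True
+
+# Representative embeddings and borders, as produced by the preparatory workflows.
+clusters = {
+    "cherry": {"representative": np.array([0.9, 0.1, 0.0]), "threshold": 0.80},
+    "maize": {"representative": np.array([0.1, 0.9, 0.1]), "threshold": 0.75},
+}
+candidate = np.array([0.0, 0.1, 0.99])  # embedding of the image to be checked
+
+print(is_anomalous(candidate, clusters))  # True: the image is outside both borders
+```
+
+In the n8n template, the same comparison is expressed with workflow nodes rather than custom code, but the logic does not change.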
-We will reprocess the data with the updated parameters above: +This tool is easily adaptable to these use cases. -```python -## for iteration 2 - lets modify chunk configuration -## We will start with creating seperate collection to store vectors +#### Classification Tool -chunk_size = 1024 -chunk_overlap = 128 -embedding_model_name = "BAAI/bge-small-en" -num_docs = 3 +The anomaly detection tool can also be used for classification, but there's a simpler approach: K-Nearest Neighbors (KNN) classification. -COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}" +> "Show me your friends, and I will tell you who you are." -add_documents(client, - collection_name=COLLECTION_NAME, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - embedding_model_name=embedding_model_name) +![KNN-2D](/documentation/examples/qdrant-n8n-2/classification.png) -#Outputs -#processed: 2152 -#content: 2152 -#metadata: 2152 +The KNN method labels a data point by analyzing its classified neighbors and assigning this point the majority class in the neighborhood. This approach doesn't require all data points to be labeled—a subset of labeled examples can serve as anchors to propagate labels across the dataset. -``` +Let's build a KNN-based image classification tool. -Followed by running evaluation : +**Setup** +1. **Dataset**: We'll use the [Land-Use Scene Classification dataset](https://www.kaggle.com/datasets/apollo2506/landuse-scene-classification). Satellite imagery analysis has applications in ecology, rescue operations, and map updates. +2. **Embedding Model**: As for anomaly detection, we'll use the [Voyage AI multimodal embedding model](https://docs.voyageai.com/docs/multimodal-embeddings). -![experiment2_eval.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment2_eval.png) +Additionally, it's good to have test and validation data to determine the optimal value of K for your dataset. -and **comparing it with the results from Experiment 1:** +**Workflow:** -![graph_exp1_vs_exp2.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_vs_exp2.png) +Uploading images to Qdrant can be done using the same workflow—[1/3 Uploading Images to Qdrant Template Workflow](https://n8n.io/workflows/2654-vector-database-as-a-big-data-analysis-tool-for-ai-agents-13-anomaly12-knn/), just by swapping the dataset. -We observed slight enhancements in our LLM completion metrics (including BERT Sentence Similarity, BERTScore, ROUGE-L, and Knowledge F1) with the increase in _chunk size_. However, it’s noteworthy that there was a significant decrease in _Faithfulness_, which is the primary metric we are aiming to optimize. +The [KNN-Classification Tool Template](https://n8n.io/workflows/2657-vector-database-as-a-big-data-analysis-tool-for-ai-agents-22-knn/) has the following steps: -Moreover, _Context Relevance_ demonstrated an increase, indicating that the RAG pipeline retrieved more relevant information required to address the query. Nonetheless, there was a considerable drop in _Chunk Relevance_, implying that a smaller portion of the retrieved documents contained pertinent information for answering the question. +1. **Embed Image**: Embeds the candidate for classification using Voyage. +2. **Fetch neighbors**: Retrieves the K closest labeled neighbors from Qdrant. +3. **Majority Voting**: Determines the prevailing class in the neighborhood by simple majority voting. +4. 
**Optional: Ties Resolving**: In case of ties, expands the neighborhood radius. -**The correlation between the rise in Context Relevance and the decline in Chunk Relevance suggests that retrieving more documents using the smaller chunk size might yield improved results.** +Of course, this is a simple solution, and there exist more advanced approaches with higher precision & no need for labeled data—for example, you could try [metric learning with Qdrant](https://qdrant.tech/articles/metric-learning-tips/). -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experiment-3---increasing-the-number-of-documents-retrieved-retrieval-window) Experiment 3 - Increasing the number of documents retrieved (retrieval window) +Though classification seems like a task that was solved in machine learning decades ago, it's not so trivial to deal with in production. Issues like data drift, shifting class definitions, mislabeled data, and fuzzy differences between classes create unexpected problems, which require continuous adjustments of classifiers, and vector search can be an unusual but effective solution, due to its scalability. -This time, we are using the same RAG setup as `Experiment 1`, but increasing the number of retrieved documents from **3** to **5**. +#### Live Walkthrough -Parameters : +To see how n8n agents use these tools in practice, and to revisit the main ideas of the "*Big Data Analysis*" section, watch our integration webinar: -- **Embedding Model : `bge-small-en`** -- **Chunk size: `512`** -- **Chunk overlap: `64`** -- **Number of docs retrieved (Retrieval Window): `5`** -- **LLM: : `Mistral-7B-Instruct`** + -We can use the collection from Experiment 1 and run evaluation with modified `num_docs` parameter as : +## Conclusion -```python -#collection name from Experiment 1 -COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}" +Vector search is not limited to similarity search or basic RAG. When combined with automation platforms like n8n, it becomes a powerful tool for building smarter systems. Think dynamic routing in customer support, content moderation based on user behavior, or AI-driven alerts in data monitoring dashboards. -#running eval for experiment 3 -experiment_3 = run_eval(eval_df, - collection_name=COLLECTION_NAME, - recipe_id=recipe_mistral['id'], - num_docs=num_docs, - path=f"{COLLECTION_NAME}_{num_docs}_mistral.csv") +This tutorial showed how to use Qdrant and n8n for AI-backed recommendations, classification, and anomaly detection. But that's just the start—try vector search for: +- **Deduplication** +- **Dissimilarity search** +- **Diverse sampling** -``` +With Qdrant and n8n, there's plenty of room to create something unique! -Observe the results as below : +<|page-136-lllmstxt|> +# How to use rerankers with FastEmbed -![experiment_3_eval.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment_3_eval.png) +## Rerankers -Comparing the results with Experiment 1 and 2 : +A reranker is a model that improves the ordering of search results. A subset of documents is initially retrieved using a fast, simple method (e.g., BM25 or dense embeddings). Then, a reranker -- a more powerful, precise, but slower and heavier model -- re-evaluates this subset to refine document relevance to the query. 
-![graph_exp1_exp2_exp3.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3.png) +Rerankers analyze token-level interactions between the query and each document in depth, making them expensive to use but precise in defining relevance. They trade speed for accuracy, so they are best used on a limited candidate set rather than the entire corpus. -As anticipated, employing the smaller chunk size while retrieving a larger number of documents resulted in achieving the highest levels of both _Context Relevance_ and _Chunk Relevance._ Additionally, it yielded the **best** (albeit marginal) _Faithfulness_ score, indicating a _reduced occurrence of inaccuracies or hallucinations_. +## Goal of this Tutorial -Looks like we have achieved a good hold on our chunking parameters but it is worth testing another embedding model to see if we can get better results. +It's common to use [cross-encoder](https://sbert.net/examples/applications/cross-encoder/README.html) models as rerankers. This tutorial uses [Jina Reranker v2 Base Multilingual](https://jina.ai/news/jina-reranker-v2-for-agentic-rag-ultra-fast-multilingual-function-calling-and-code-search/) (licensed under CC-BY-NC-4.0) -- a cross-encoder reranker supported in FastEmbed. -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experiment-4---changing-the-embedding-model) Experiment 4 - Changing the embedding model +We use the `all-MiniLM-L6-v2` dense embedding model (also supported in FastEmbed) as a first-stage retriever and then refine results with `Jina Reranker v2`. -Let us try using **MiniLM** for this experiment -\*\*\*\*Parameters : -- **Embedding Model : `MiniLM-L6-v2`** -- **Chunk size: `512`** -- **Chunk overlap: `64`** -- **Number of docs retrieved (Retrieval Window): `5`** -- **LLM: : `Mistral-7B-Instruct`** +## Setup -We will have to create another collection for this experiment : +Install `qdrant-client` with `fastembed`. ```python -#experiment-4 -chunk_size=512 -chunk_overlap=64 -embedding_model_name="sentence-transformers/all-MiniLM-L6-v2" -num_docs=5 +pip install "qdrant-client[fastembed]>=1.14.1" +``` -COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}" +Import cross-encoders and text embeddings for the first-stage retrieval. -add_documents(client, - collection_name=COLLECTION_NAME, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - embedding_model_name=embedding_model_name) +```python +from fastembed import TextEmbedding +from fastembed.rerank.cross_encoder import TextCrossEncoder +``` +You can list the cross-encoder rerankers supported in FastEmbed using the following command. -#Outputs -#processed: 4504 -#content: 4504 -#metadata: 4504 +```python +TextCrossEncoder.list_supported_models() +``` + +This command displays the available models, including details such as output embedding dimensions, model description, model size, model sources, and model file. +
+ Avaliable models + + +```python +[{'model': 'Xenova/ms-marco-MiniLM-L-6-v2', + 'size_in_GB': 0.08, + 'sources': {'hf': 'Xenova/ms-marco-MiniLM-L-6-v2'}, + 'model_file': 'onnx/model.onnx', + 'description': 'MiniLM-L-6-v2 model optimized for re-ranking tasks.', + 'license': 'apache-2.0'}, + {'model': 'Xenova/ms-marco-MiniLM-L-12-v2', + 'size_in_GB': 0.12, + 'sources': {'hf': 'Xenova/ms-marco-MiniLM-L-12-v2'}, + 'model_file': 'onnx/model.onnx', + 'description': 'MiniLM-L-12-v2 model optimized for re-ranking tasks.', + 'license': 'apache-2.0'}, + {'model': 'BAAI/bge-reranker-base', + 'size_in_GB': 1.04, + 'sources': {'hf': 'BAAI/bge-reranker-base'}, + 'model_file': 'onnx/model.onnx', + 'description': 'BGE reranker base model for cross-encoder re-ranking.', + 'license': 'mit'}, + {'model': 'jinaai/jina-reranker-v1-tiny-en', + 'size_in_GB': 0.13, + 'sources': {'hf': 'jinaai/jina-reranker-v1-tiny-en'}, + 'model_file': 'onnx/model.onnx', + 'description': 'Designed for blazing-fast re-ranking with 8K context length and fewer parameters than jina-reranker-v1-turbo-en.', + 'license': 'apache-2.0'}, + {'model': 'jinaai/jina-reranker-v1-turbo-en', + 'size_in_GB': 0.15, + 'sources': {'hf': 'jinaai/jina-reranker-v1-turbo-en'}, + 'model_file': 'onnx/model.onnx', + 'description': 'Designed for blazing-fast re-ranking with 8K context length.', + 'license': 'apache-2.0'}, + {'model': 'jinaai/jina-reranker-v2-base-multilingual', + 'size_in_GB': 1.11, + 'sources': {'hf': 'jinaai/jina-reranker-v2-base-multilingual'}, + 'model_file': 'onnx/model.onnx', + 'description': 'A multi-lingual reranker model for cross-encoder re-ranking with 1K context length and sliding window', + 'license': 'cc-by-nc-4.0'}] # some of the fields are omitted for brevity ``` +
-We will observe our evaluations as : -![experiment4_eval.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment4_eval.png) +Now, load the first-stage retriever and reranker. -Comparing these with our previous experiments : +```python +encoder_name = "sentence-transformers/all-MiniLM-L6-v2" +dense_embedding_model = TextEmbedding(model_name=encoder_name) +reranker = TextCrossEncoder(model_name='jinaai/jina-reranker-v2-base-multilingual') +``` -![graph_exp1_exp2_exp3_exp4.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3_exp4.png) +The model files will be fetched and downloaded, with progress displayed. -It appears that `bge-small` was more proficient in capturing the semantic nuances of the Qdrant Documentation. +## Embed & index data for the first-stage retrieval -Up to this point, our experimentation has focused solely on the _retrieval aspect_ of our RAG pipeline. Now, let’s explore altering the _generation aspect_ or LLM while retaining the optimal parameters identified in Experiment 3. +We will vectorize a toy movie description dataset using the `all-MiniLM-L6-v2` model and save the embeddings in Qdrant for first-stage retrieval. -### [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#experiment-5---changing-the-llm) Experiment 5 - Changing the LLM +Then, we will use a cross-encoder reranking model to rerank a small subset of data retrieved in the first stage. -Parameters : +
+ Movie description dataset + +```python +descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.", + "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.", + "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.", + "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.", + "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.", + "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.", + "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.", + "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.", + "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.", + "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.", + "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).", + "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.", + "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.", + "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.", + "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.", + "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.", + "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.", + "Story of 40-man Turkish task force who must defend a relay station.", + "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.", + "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] +``` +
-- **Embedding Model : `bge-small-en`** -- **Chunk size: `512`** -- **Chunk overlap: `64`** -- **Number of docs retrieved (Retrieval Window): `5`** -- **LLM: : `GPT-3.5-turbo`** +```python +descriptions_embeddings = list( + dense_embedding_model.embed(descriptions) +) +``` -For this we can repurpose our collection from Experiment 3 while the evaluations to use a new recipe with **GPT-3.5-turbo** model. +Let's upload the embeddings to Qdrant. + +Qdrant Client offers a simple in-memory mode, allowing you to experiment locally with small data volumes. +Alternatively, you can use [a free cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) in Qdrant Cloud for experiments. ```python -#collection name from Experiment 3 -COLLECTION_NAME = f"experiment_{chunk_size}_{chunk_overlap}_{embedding_model_name.split('/')[1]}" +from qdrant_client import QdrantClient, models -# We have to create a recipe using the same prompt template and GPT-3.5-turbo -recipe_gpt = quotient.create_recipe( - model_id=5, - prompt_template_id=1, - name='gpt3.5-qa-with-rag-recipe-v1', - description='GPT-3.5 using a prompt template that includes context.' +client = QdrantClient(":memory:") # Qdrant is running from RAM. +``` + +Let's create a [collection](https://qdrant.tech/documentation/concepts/collections/) with our movie data. + +```python +client.create_collection( + collection_name="movies", + vectors_config={ + "embedding": models.VectorParams( + size=client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2"), + distance=models.Distance.COSINE + ) + } ) +``` -recipe_gpt +And upload the embeddings to it. -#Outputs -#{'id': 495, -# 'name': 'gpt3.5-qa-with-rag-recipe-v1', -# 'description': 'GPT-3.5 using a prompt template that includes context.', -# 'model_id': 5, -# 'prompt_template_id': 1, -# 'created_at': '2024-05-03T12:14:58.779585', -# 'owner_profile_id': 34, -# 'system_prompt_id': None, -# 'prompt_template': {'id': 1, -# 'name': 'Default Question Answering Template', -# 'variables': '["input_text","context"]', -# 'created_at': '2023-12-21T22:01:54.632367', -# 'template_string': 'Question: {input_text}\\n\\nContext: {context}\\n\\nAnswer:', -# 'owner_profile_id': None}, -# 'model': {'id': 5, -# 'name': 'gpt-3.5-turbo', -# 'endpoint': 'https://api.openai.com/v1/chat/completions', -# 'revision': 'placeholder', -# 'created_at': '2024-02-06T17:01:21.408454', -# 'model_type': 'OpenAI', -# 'description': 'Returns a maximum of 4K output tokens.', -# 'owner_profile_id': None, -# 'external_model_config_id': None, -# 'instruction_template_cls': 'NoneType'}} +```python +client.upload_points( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload={"description": description}, + vector={"embedding": vector} + ) + for idx, (description, vector) in enumerate( + zip(descriptions, descriptions_embeddings) + ) + ], +) ``` -Running the evaluations as : + + +
+ Upload with implicit embeddings computation -```python -experiment_5 = run_eval(eval_df, - collection_name=COLLECTION_NAME, - recipe_id=recipe_gpt['id'], - num_docs=num_docs, - path=f"{COLLECTION_NAME}_{num_docs}_gpt.csv") +```python +client.upload_points( + collection_name="movies", + points=[ + models.PointStruct( + id=idx, + payload={"description": description}, + vector={"embedding": models.Document(text=description, model=encoder_name)}, + ) + for idx, description in enumerate(descriptions) + ], +) ``` +
-We observe : +## First-stage retrieval -![experiment5_eval.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/experiment5_eval.png) +Let's see how relevant the results will be using only an `all-MiniLM-L6-v2`-based dense retriever. -and comparing all the 5 experiments as below : +```python +query = "A story about a strong historically significant female figure." +query_embedded = list(dense_embedding_model.query_embed(query))[0] -![graph_exp1_exp2_exp3_exp4_exp5.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/graph_exp1_exp2_exp3_exp4_exp5.png) +initial_retrieval = client.query_points( + collection_name="movies", + using="embedding", + query=query_embedded, + with_payload=True, + limit=10 +) -**GPT-3.5 surpassed Mistral-7B in all metrics**! Notably, Experiment 5 exhibited the **lowest occurrence of hallucination**. +description_hits = [] +for i, hit in enumerate(initial_retrieval.points): + print(f'Result number {i+1} is \"{hit.payload["description"]}\"') + description_hits.append(hit.payload["description"]) +``` -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#conclusions) Conclusions + -Let’s take a look at our results from all 5 experiments above +
+ Query points with implicit embeddings computation -![overall_eval_results.png](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/overall_eval_results.png) -We still have a long way to go in improving the retrieval performance of RAG, as indicated by our generally poor results thus far. It might be beneficial to **explore alternative embedding models** or **different retrieval strategies** to address this issue. +```python +query = "A story about a strong historically significant female figure." -The significant variations in _Context Relevance_ suggest that **certain questions may necessitate retrieving more documents than others**. Therefore, investigating a **dynamic retrieval strategy** could be worthwhile. +initial_retrieval = client.query_points( + collection_name="movies", + using="embedding", + query=models.Document(text=query, model=encoder_name), + with_payload=True, + limit=10 +) +``` +
-Furthermore, there’s ongoing **exploration required on the generative aspect** of RAG. -Modifying LLMs or prompts can substantially impact the overall quality of responses. +The result is as follows: -This iterative process demonstrates how, starting from scratch, continual evaluation and adjustments throughout experimentation can lead to the development of an enhanced RAG system. +```bash +Result number 1 is "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent." +Result number 2 is "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household." +... +Result number 9 is "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre." +Result number 10 is "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions." +``` -## [Anchor](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/\#watch-this-workshop-on-youtube) Watch this workshop on YouTube +We can see that the description of *"The Messenger: The Story of Joan of Arc"*, which is the most fitting, appears 10th in the results. -> A workshop version of this article is [available on YouTube](https://www.youtube.com/watch?v=3MEMPZR1aZA). Follow along using our [GitHub notebook](https://github.com/qdrant/qdrant-rag-eval/tree/master/workshop-rag-eval-qdrant-quotient). +Let's try refining the order of the retrieved subset with `Jina Reranker v2`. It takes a query and a set of documents (movie descriptions) as input and calculates a relevance score based on token-level interactions between the query and each document. + +```python +new_scores = list( + reranker.rerank(query, description_hits) +) # returns scores between query and each document + +ranking = [ + (i, score) for i, score in enumerate(new_scores) +] # saving document indices +ranking.sort( + key=lambda x: x[1], reverse=True +) # sorting them in order of relevance defined by reranker -Rapid RAG Optimization with Qdrant and Quotient - YouTube +for i, rank in enumerate(ranking): + print(f'''Reranked result number {i+1} is \"{description_hits[rank[0]]}\"''') +``` -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +The reranker moves the desired movie to the first position based on relevance. -Qdrant - Vector Database & Search Engine +```bash +Reranked result number 1 is "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions." +Reranked result number 2 is "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household." +... +Reranked result number 9 is "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop." +Reranked result number 10 is "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre." 
+``` -8.12K subscribers -[Rapid RAG Optimization with Qdrant and Quotient](https://www.youtube.com/watch?v=3MEMPZR1aZA) +## Conclusion -Qdrant - Vector Database & Search Engine +Rerankers refine search results by reordering retrieved candidates through deeper semantic analysis. For efficiency, they should be applied **only to a small subset of retrieved results**. -Search +Balance speed and accuracy in search by leveraging the power of rerankers! -Watch later +<|page-137-lllmstxt|> +# Administration -Share +Qdrant exposes administration tools which enable to modify at runtime the behavior of a qdrant instance without changing its configuration manually. -Copy link +## Locking -Info +A locking API enables users to restrict the possible operations on a qdrant process. +It is important to mention that: -Shopping +- The configuration is not persistent therefore it is necessary to lock again following a restart. +- Locking applies to a single node only. It is necessary to call lock on all the desired nodes in a distributed deployment setup. -Tap to unmute +Lock request sample: -If playback doesn't begin shortly, try restarting your device. +```http +POST /locks +{ + "error_message": "write is forbidden", + "write": true +} +``` -More videos +Write flags enables/disables write lock. +If the write lock is set to true, qdrant doesn't allow creating new collections or adding new data to the existing storage. +However, deletion operations or updates are not forbidden under the write lock. +This feature enables administrators to prevent a qdrant process from using more disk space while permitting users to search and delete unnecessary data. -## More videos +You can optionally provide the error message that should be used for error responses to users. -You're signed out +## Recovery mode -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +*Available as of v1.2.0* -CancelConfirm +Recovery mode can help in situations where Qdrant fails to start repeatedly. +When starting in recovery mode, Qdrant only loads collection metadata to prevent +going out of memory. This allows you to resolve out of memory situations, for +example, by deleting a collection. After resolving Qdrant can be restarted +normally to continue operation. -Share +In recovery mode, collection operations are limited to +[deleting](/documentation/concepts/collections/#delete-collection) a +collection. That is because only collection metadata is loaded during recovery. -Include playlist +To enable recovery mode with the Qdrant Docker image you must set the +environment variable `QDRANT_ALLOW_RECOVERY_MODE=true`. The container will try +to start normally first, and restarts in recovery mode if initialisation fails +due to an out of memory error. This behavior is disabled by default. -An error occurred while retrieving sharing information. Please try again later. +If using a Qdrant binary, recovery mode can be enabled by setting a recovery +message in an environment variable, such as +`QDRANT__STORAGE__RECOVERY_MODE="My recovery message"`. -[Watch on](https://www.youtube.com/watch?v=3MEMPZR1aZA&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) -0:00 +## Strict mode -0:00 / 51:40 -‱Live +*Available as of v1.13.0* -‱ +Strict mode is a feature to restrict certain type of operations on the collection in order to protect it. 
-[Watch on YouTube](https://www.youtube.com/watch?v=3MEMPZR1aZA "Watch on YouTube") +The goal is to prevent inefficient usage patterns that could overload the collections. -##### Was this page useful? +This configuration ensures a more predictible and responsive service when you do not have control over the queries that are being executed. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Here is a non exhaustive list of operations that can be restricted using strict mode: -Thank you for your feedback! 🙏 +- Preventing querying non indexed payload which can be very slow +- Maximum number of filtering conditions in a query +- Maximum batch size when inserting vectors +- Maximum collection size (in terms of vectors or payload size) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/rapid-rag-optimization-with-qdrant-and-quotient.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +See [schema definitions](https://api.qdrant.tech/api-reference/collections/create-collection#request.body.strict_mode_config) for all the `strict_mode_config` parameters. -On this page: +Upon crossing a limit, the server will return a client side error with the information about the limit that was crossed. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/rapid-rag-optimization-with-qdrant-and-quotient.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +As part of the config, the `enabled` field act as a toggle to enable or disable the strict mode dynamically. -× +The `strict_mode_config` can be enabled when [creating](#create-a-collection) a collection, for instance below to activate the `unindexed_filtering_retrieve` limit. -[Powered by](https://qdrant.tech/) +Setting `unindexed_filtering_retrieve` to false prevents the usage of filtering on a non indexed payload key. -<|page-92-lllmstxt|> -## hybrid-queries -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Hybrid Queries +```http +PUT /collections/{collection_name} +{ + "strict_mode_config": { + "enabled": true, + "unindexed_filtering_retrieve": false + } +} +``` -# [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#hybrid-and-multi-stage-queries) Hybrid and Multi-Stage Queries +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "strict_mode_config": { + "enabled":" true, + "unindexed_filtering_retrieve": false + } + }' +``` -_Available as of v1.10.0_ +```python +from qdrant_client import QdrantClient, models -With the introduction of [many named vectors per point](https://qdrant.tech/documentation/concepts/vectors/#named-vectors), there are use-cases when the best search is obtained by combining multiple queries, -or by performing the search in more than one stage. +client = QdrantClient(url="http://localhost:6333") -Qdrant has a flexible and universal interface to make this possible, called `Query API` ( [API reference](https://api.qdrant.tech/api-reference/search/query-points)). 
+client.create_collection( + collection_name="{collection_name}", + strict_mode_config=models.StrictModeConfig(enabled=True, unindexed_filtering_retrieve=false), +) +``` -The main component for making the combinations of queries possible is the `prefetch` parameter, which enables making sub-requests. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Specifically, whenever a query has at least one prefetch, Qdrant will: +const client = new QdrantClient({ host: "localhost", port: 6333 }); -1. Perform the prefetch query (or queries), -2. Apply the main query over the results of its prefetch(es). +client.createCollection("{collection_name}", { + strict_mode_config: { + enabled: true, + unindexed_filtering_retrieve: false, + }, +}); +``` -Additionally, prefetches can have prefetches themselves, so you can have nested prefetches. +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, StrictModeConfigBuilder}; -## [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#hybrid-search) Hybrid Search +let client = Qdrant::from_url("http://localhost:6334").build()?; -One of the most common problems when you have different representations of the same data is to combine the queried points for each representation into a single result. +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .strict_config_mode(StrictModeConfigBuilder::default().enabled(true).unindexed_filtering_retrieve(false)), + ) + .await?; +``` -![Fusing results from multiple queries](https://qdrant.tech/docs/fusion-idea.png) +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.StrictModeCOnfig; -Fusing results from multiple queries +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -For example, in text search, it is often useful to combine dense and sparse vectors get the best of semantics, -plus the best of matching specific words. +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setStrictModeConfig( + StrictModeConfig.newBuilder().setEnabled(true).setUnindexedFilteringRetrieve(false).build()) + .build()) + .get(); +``` -Qdrant currently has two ways of combining the results from different queries: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -- `rrf` - -[Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) +var client = new QdrantClient("localhost", 6334); -Considers the positions of results within each query, and boosts the ones that appear closer to the top in multiple of them. +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + strictModeConfig: new StrictModeConfig { enabled = true, unindexed_filtering_retrieve = false } +); +``` -- `dbsf` - -[Distribution-Based Score Fusion](https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18) _(available as of v1.11.0)_ +```go +import ( + "context" -Normalizes the scores of the points in each query, using the mean +/- the 3rd standard deviation as limits, and then sums the scores of the same point across different queries. 
+ "github.com/qdrant/go-client/qdrant" +) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Here is an example of Reciprocal Rank Fusion for a query containing two prefetches against different named vectors configured to respectively hold sparse and dense vectors. +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + StrictModeConfig: &qdrant.StrictModeConfig{ + Enabled: qdrant.PtrOf(true), + IndexingThreshold: qdrant.PtrOf(false), + }, +}) +``` -httppythontypescriptrustjavacsharpgo +Or activate it later on an existing collection through the [collection update](#update-collection-parameters) API: ```http -POST /collections/{collection_name}/points/query +PATCH /collections/{collection_name} { - "prefetch": [\ - {\ - "query": {\ - "indices": [1, 42], // <┐\ - "values": [0.22, 0.8] // <┮─sparse vector\ - },\ - "using": "sparse",\ - "limit": 20\ - },\ - {\ - "query": [0.01, 0.45, 0.67, ...], // <-- dense vector\ - "using": "dense",\ - "limit": 20\ - }\ - ], - "query": { "fusion": "rrf" }, // <--- reciprocal rank fusion - "limit": 10 + "strict_mode_config": { + "enabled": true, + "unindexed_filtering_retrieve": false + } } +``` +```bash +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "strict_mode_config": { + "enabled": true, + "unindexed_filtering_retrieve": false + } + }' ``` ```python @@ -33735,23 +50767,10 @@ from qdrant_client import QdrantClient, models client = QdrantClient(url="http://localhost:6333") -client.query_points( +client.update_collection( collection_name="{collection_name}", - prefetch=[\ - models.Prefetch(\ - query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]),\ - using="sparse",\ - limit=20,\ - ),\ - models.Prefetch(\ - query=[0.01, 0.45, 0.67], # <-- dense vector\ - using="dense",\ - limit=20,\ - ),\ - ], - query=models.FusionQuery(fusion=models.Fusion.RRF), + strict_mode_config=models.StrictModeConfig(enabled=True, unindexed_filtering_retrieve=False), ) - ``` ```typescript @@ -33759,84 +50778,36 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.query("{collection_name}", { - prefetch: [\ - {\ - query: {\ - values: [0.22, 0.8],\ - indices: [1, 42],\ - },\ - using: 'sparse',\ - limit: 20,\ - },\ - {\ - query: [0.01, 0.45, 0.67],\ - using: 'dense',\ - limit: 20,\ - },\ - ], - query: { - fusion: 'rrf', - }, +client.updateCollection("{collection_name}", { + strict_mode_config: { + enabled: true, + unindexed_filtering_retrieve: false, + }, }); - ``` ```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{Fusion, PrefetchQueryBuilder, Query, QueryPointsBuilder}; - -let client = Qdrant::from_url("http://localhost:6334").build()?; - -client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch(PrefetchQueryBuilder::default() - .query(Query::new_nearest([(1, 0.22), (42, 0.8)].as_slice())) - .using("sparse") - .limit(20u64) - ) - .add_prefetch(PrefetchQueryBuilder::default() - .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) - .using("dense") - .limit(20u64) - ) - .query(Query::new_fusion(Fusion::Rrf)) -).await?; +use qdrant_client::qdrant::{StrictModeConfigBuilder, UpdateCollectionBuilder}; +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}").strict_mode_config( + StrictModeConfigBuilder::default().enabled(true).unindexed_filtering_retrieve(false), + ), + ) 
+ .await?; ``` ```java -import static io.qdrant.client.QueryFactory.nearest; - -import java.util.List; - -import static io.qdrant.client.QueryFactory.fusion; - -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Fusion; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; - -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); - -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch(PrefetchQuery.newBuilder() - .setQuery(nearest(List.of(0.22f, 0.8f), List.of(1, 42))) - .setUsing("sparse") - .setLimit(20) - .build()) - .addPrefetch(PrefetchQuery.newBuilder() - .setQuery(nearest(List.of(0.01f, 0.45f, 0.67f))) - .setUsing("dense") - .setLimit(20) - .build()) - .setQuery(fusion(Fusion.RRF)) - .build()) - .get(); +import io.qdrant.client.grpc.Collections.StrictModeConfigBuilder; +import io.qdrant.client.grpc.Collections.UpdateCollection; +client.updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setStrictModeConfig( + StrictModeConfig.newBuilder().setEnabled(true).setUnindexedFilteringRetrieve(false).build()) + .build()); ``` ```csharp @@ -33845,95 +50816,52 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( +await client.UpdateCollectionAsync( collectionName: "{collection_name}", - prefetch: new List < PrefetchQuery > { - new() { - Query = new(float, uint)[] { - (0.22f, 1), (0.8f, 42), - }, - Using = "sparse", - Limit = 20 - }, - new() { - Query = new float[] { - 0.01f, 0.45f, 0.67f - }, - Using = "dense", - Limit = 20 - } - }, - query: Fusion.Rrf + strictModeConfig: new StrictModeConfig { Enabled = true, UnindexedFilteringRetrieve = false } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQuerySparse([]uint32{1, 42}, []float32{0.22, 0.8}), - Using: qdrant.PtrOf("sparse"), - }, - { - Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), - Using: qdrant.PtrOf("dense"), - }, - }, - Query: qdrant.NewQueryFusion(qdrant.Fusion_RRF), +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + StrictModeConfig: &qdrant.StrictModeConfig{ + Enabled: qdrant.PtrOf(true), + UnindexedFilteringRetrieve: qdrant.PtrOf(false), + }, }) - ``` -## [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#multi-stage-queries) Multi-stage queries - -In many cases, the usage of a larger vector representation gives more accurate search results, but it is also more expensive to compute. - -Splitting the search into two stages is a known technique: - -- First, use a smaller and cheaper representation to get a large list of candidates. -- Then, re-score the candidates using the larger and more accurate representation. - -There are a few ways to build search architectures around this idea: - -- The quantized vectors as a first stage, and the full-precision vectors as a second stage. 
-- Leverage Matryoshka Representation Learning ( [MRL](https://arxiv.org/abs/2205.13147)) to generate candidate vectors with a shorter vector, and then refine them with a longer one. -- Use regular dense vectors to pre-fetch the candidates, and then re-score them with a multi-vector model like [ColBERT](https://arxiv.org/abs/2112.01488). - -To get the best of all worlds, Qdrant has a convenient interface to perform the queries in stages, -such that the coarse results are fetched first, and then they are refined later with larger vectors. - -### [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#re-scoring-examples) Re-scoring examples - -Fetch 1000 results using a shorter MRL byte vector, then re-score them using the full vector and get the top 10. - -httppythontypescriptrustjavacsharpgo +To disable completely strict mode on an existing collection use: ```http -POST /collections/{collection_name}/points/query +PATCH /collections/{collection_name} { - "prefetch": { - "query": [1, 23, 45, 67], // <------------- small byte vector - "using": "mrl_byte" - "limit": 1000 - }, - "query": [0.01, 0.299, 0.45, 0.67, ...], // <-- full vector - "using": "full", - "limit": 10 + "strict_mode_config": { + "enabled": false + } } +``` +```bash +curl -X PATCH http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "strict_mode_config": { + "enabled": false, + } + }' ``` ```python @@ -33941,18 +50869,10 @@ from qdrant_client import QdrantClient, models client = QdrantClient(url="http://localhost:6333") -client.query_points( +client.update_collection( collection_name="{collection_name}", - prefetch=models.Prefetch( - query=[1, 23, 45, 67], # <------------- small byte vector - using="mrl_byte", - limit=1000, - ), - query=[0.01, 0.299, 0.45, 0.67], # <-- full vector - using="full", - limit=10, + strict_mode_config=models.StrictModeConfig(enabled=False), ) - ``` ```typescript @@ -33960,66 +50880,35 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.query("{collection_name}", { - prefetch: { - query: [1, 23, 45, 67], // <------------- small byte vector - using: 'mrl_byte', - limit: 1000, +client.updateCollection("{collection_name}", { + strict_mode_config: { + enabled: false, }, - query: [0.01, 0.299, 0.45, 0.67], // <-- full vector, - using: 'full', - limit: 10, }); - ``` ```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; - -let client = Qdrant::from_url("http://localhost:6334").build()?; - -client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch(PrefetchQueryBuilder::default() - .query(Query::new_nearest(vec![1.0, 23.0, 45.0, 67.0])) - .using("mlr_byte") - .limit(1000u64) - ) - .query(Query::new_nearest(vec![0.01, 0.299, 0.45, 0.67])) - .using("full") - .limit(10u64) -).await?; +use qdrant_client::qdrant::{StrictModeConfigBuilder, UpdateCollectionBuilder}; +client + .update_collection( + UpdateCollectionBuilder::new("{collection_name}").strict_mode_config( + StrictModeConfigBuilder::default().enabled(false), + ), + ) + .await?; ``` ```java -import static io.qdrant.client.QueryFactory.nearest; - -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; - -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, 
false).build()); - -client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch( - PrefetchQuery.newBuilder() - .setQuery(nearest(1, 23, 45, 67)) // <------------- small byte vector - .setLimit(1000) - .setUsing("mrl_byte") - .build()) - .setQuery(nearest(0.01f, 0.299f, 0.45f, 0.67f)) // <-- full vector - .setUsing("full") - .setLimit(10) - .build()) - .get(); +import io.qdrant.client.grpc.Collections.StrictModeConfigBuilder; +import io.qdrant.client.grpc.Collections.UpdateCollection; +client.updateCollectionAsync( + UpdateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setStrictModeConfig( + StrictModeConfig.newBuilder().setEnabled(false).build()) + .build()); ``` ```csharp @@ -34028,829 +50917,465 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( +await client.UpdateCollectionAsync( collectionName: "{collection_name}", - prefetch: new List { - new() { - Query = new float[] { 1,23, 45, 67 }, // <------------- small byte vector - Using = "mrl_byte", - Limit = 1000 - } - }, - query: new float[] { 0.01f, 0.299f, 0.45f, 0.67f }, // <-- full vector - usingVector: "full", - limit: 10 + strictModeConfig: new StrictModeConfig { Enabled = false } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQueryDense([]float32{1, 23, 45, 67}), - Using: qdrant.PtrOf("mrl_byte"), - Limit: qdrant.PtrOf(uint64(1000)), - }, - }, - Query: qdrant.NewQueryDense([]float32{0.01, 0.299, 0.45, 0.67}), - Using: qdrant.PtrOf("full"), +client.UpdateCollection(context.Background(), &qdrant.UpdateCollection{ + CollectionName: "{collection_name}", + StrictModeConfig: &qdrant.StrictModeConfig{ + Enabled: qdrant.PtrOf(false), + }, }) - ``` -Fetch 100 results using the default vector, then re-score them using a multi-vector to get the top 10. +<|page-138-lllmstxt|> +# Running Qdrant with GPU Support -httppythontypescriptrustjavacsharpgo +Starting from version v1.13.0, Qdrant offers support for GPU acceleration. -```http -POST /collections/{collection_name}/points/query -{ - "prefetch": { - "query": [0.01, 0.45, 0.67, ...], // <-- dense vector - "limit": 100 - }, - "query": [ // <─┐\ - [0.1, 0.2, ...], // < │\ - [0.2, 0.1, ...], // < ├─ multi-vector\ - [0.8, 0.9, ...] // < │\ - ], // <─┘ - "using": "colbert", - "limit": 10 -} +However, GPU support is not included in the default Qdrant binary due to additional dependencies and libraries. Instead, you will need to use dedicated Docker images with GPU support ([NVIDIA](#nvidia-gpus), [AMD](#amd-gpus)). + + +## Configuration + +Qdrant includes a number of configuration options to control GPU usage. The following options are available: +```yaml +gpu: + # Enable GPU indexing. + indexing: false + # Force half precision for `f32` values while indexing. + # `f16` conversion will take place + # only inside GPU memory and won't affect storage type. + force_half_precision: false + # Used vulkan "groups" of GPU. + # In other words, how many parallel points can be indexed by GPU. + # Optimal value might depend on the GPU model. + # Proportional, but doesn't necessary equal + # to the physical number of warps. 
+ # Do not change this value unless you know what you are doing. + # Default: 512 + groups_count: 512 + # Filter for GPU devices by hardware name. Case insensitive. + # Comma-separated list of substrings to match + # against the gpu device name. + # Example: "nvidia" + # Default: "" - all devices are accepted. + device_filter: "" + # List of explicit GPU devices to use. + # If host has multiple GPUs, this option allows to select specific devices + # by their index in the list of found devices. + # If `device_filter` is set, indexes are applied after filtering. + # By default, all devices are accepted. + devices: null + # How many parallel indexing processes are allowed to run. + # Default: 1 + parallel_indexes: 1 + # Allow to use integrated GPUs. + # Default: false + allow_integrated: false + # Allow to use emulated GPUs like LLVMpipe. Useful for CI. + # Default: false + allow_emulated: false ``` -```python -from qdrant_client import QdrantClient, models +It is not recommended to change these options unless you are familiar with the Qdrant internals and the Vulkan API. -client = QdrantClient(url="http://localhost:6333") -client.query_points( - collection_name="{collection_name}", - prefetch=models.Prefetch( - query=[0.01, 0.45, 0.67, 0.53], # <-- dense vector - limit=100, - ), - query=[\ - [0.1, 0.2, 0.32], # <─┐\ - [0.2, 0.1, 0.52], # < ├─ multi-vector\ - [0.8, 0.9, 0.93], # < ┘\ - ], - using="colbert", - limit=10, -) +## Standalone GPU Support + +For standalone usage, you can build Qdrant with GPU support by running the following command: +```bash +cargo build --release --features gpu ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Ensure your device supports Vulkan API v1.3. This includes compatibility with Apple Silicon, Intel GPUs, and CPU emulators. Note that `gpu.indexing: true` must be set in your configuration to use GPUs at runtime. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +## NVIDIA GPUs -client.query("{collection_name}", { - prefetch: { - query: [1, 23, 45, 67], // <------------- small byte vector - limit: 100, - }, - query: [\ - [0.1, 0.2], // <─┐\ - [0.2, 0.1], // < ├─ multi-vector\ - [0.8, 0.9], // < ┘\ - ], - using: 'colbert', - limit: 10, -}); +### Prerequisites + +To use Docker with NVIDIA GPU support, ensure the following are installed on your host: +- Latest NVIDIA drivers +- [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + +Most AI or CUDA images on Amazon/GCP come pre-configured with the NVIDIA container toolkit. +### Docker images with NVIDIA GPU support + +Docker images with NVIDIA GPU support use the tag suffix `gpu-nvidia`, e.g., `qdrant/qdrant:v1.13.0-gpu-nvidia`. These images include all necessary dependencies. + +To enable GPU support, use the `--gpus=all` flag with Docker settings. Example: + +```bash +# `--gpus=all` flag says to Docker that we want to use GPUs. +# `-e QDRANT__GPU__INDEXING=1` flag says to Qdrant that we want to use GPUs for indexing. +docker run \ + --rm \ + --gpus=all \ + -p 6333:6333 \ + -p 6334:6334 \ + -e QDRANT__GPU__INDEXING=1 \ + qdrant/qdrant:gpu-nvidia-latest ``` -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; +To ensure that the GPU was initialized correctly, you may check it in logs. 
First Qdrant prints all found GPU devices without filtering and then prints list of all created devices: -let client = Qdrant::from_url("http://localhost:6334").build()?; +```text +2025-01-13T11:58:29.124087Z INFO gpu::instance: Found GPU device: NVIDIA GeForce RTX 3090 +2025-01-13T11:58:29.124118Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 15.0.7, 256 bits) +2025-01-13T11:58:29.124138Z INFO gpu::device: Create GPU device NVIDIA GeForce RTX 3090 +``` -client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch(PrefetchQueryBuilder::default() - .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) - .limit(100u64) - ) - .query(Query::new_nearest(vec![\ - vec![0.1, 0.2],\ - vec![0.2, 0.1],\ - vec![0.8, 0.9],\ - ])) - .using("colbert") - .limit(10u64) -).await?; +Here you can see that two devices were found: RTX 3090 and llvmpipe (a CPU-emulated GPU which is included in the Docker image). Later, you will see that only RTX was initialized. + +This concludes the setup. Now, you can start using this Qdrant instance. + +### Troubleshooting NVIDIA GPUs + +If your GPU is not detected in Docker, make sure your driver and `nvidia-container-toolkit` are up-to-date. +If needed, you can install latest version of `nvidia-container-toolkit` from it's GitHub Releases [page](https://github.com/NVIDIA/nvidia-container-toolkit/releases) + +Verify Vulkan API visibility in the Docker container using: + +```bash +docker run --rm --gpus=all qdrant/qdrant:gpu-nvidia-latest vulkaninfo --summary ``` -```java -import static io.qdrant.client.QueryFactory.nearest; +The system may show you an error message explaining why the NVIDIA device is not visible. +Note that if your NVIDIA GPU is not visible in Docker, the Docker image cannot use libGLX_nvidia.so.0 on your host. Here is what an error message could look like: -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; +```text +ERROR: [Loader Message] Code 0 : loader_scanned_icd_add: Could not get `vkCreateInstance` via `vk_icdGetInstanceProcAddr` for ICD libGLX_nvidia.so.0 +WARNING: [Loader Message] Code 0 : terminator_CreateInstance: Failed to CreateInstance in ICD 0. Skipping ICD. 
+``` -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +To resolve errors, update your NVIDIA container runtime configuration: -client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch( - PrefetchQuery.newBuilder() - .setQuery(nearest(0.01f, 0.45f, 0.67f)) // <-- dense vector - .setLimit(100) - .build()) - .setQuery( - nearest( - new float[][] { - {0.1f, 0.2f}, // <─┐ - {0.2f, 0.1f}, // < ├─ multi-vector - {0.8f, 0.9f} // < ┘ - })) - .setUsing("colbert") - .setLimit(10) - .build()) - .get(); +```bash +sudo nano /etc/nvidia-container-runtime/config.toml +``` +Set `no-cgroups=false`, save the configuration, and restart Docker: + +```bash +sudo systemctl restart docker ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +## AMD GPUs -var client = new QdrantClient("localhost", 6334); +### Prerequisites -await client.QueryAsync( - collectionName: "{collection_name}", - prefetch: new List { - new() { - Query = new float[] { 0.01f, 0.45f, 0.67f }, // <-- dense vector**** - Limit = 100 - } - }, - query: new float[][] { - [0.1f, 0.2f], // <─┐ - [0.2f, 0.1f], // < ├─ multi-vector - [0.8f, 0.9f] // < ┘ - }, - usingVector: "colbert", - limit: 10 -); +Running Qdrant with AMD GPUs requires [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/detailed-install.html) to be installed on your host. + +### Docker images with AMD GPU support + +Docker images for AMD GPUs use the tag suffix `gpu-amd`, e.g., `qdrant/qdrant:v1.13.0-gpu-amd`. These images include all required dependencies. + +To enable GPU for Docker, you need additional `--device /dev/kfd --device /dev/dri` flags. To enable GPU for Qdrant you need to set the enable flag. Here is an example: + +```bash +# `--device /dev/kfd --device /dev/dri` flags say to Docker that we want to use GPUs. +# `-e QDRANT__GPU__INDEXING=1` flag says to Qdrant that we want to use GPUs for indexing. +docker run \ + --rm \ + --device /dev/kfd --device /dev/dri \ + -p 6333:6333 \ + -p 6334:6334 \ + -e QDRANT__LOG_LEVEL=debug \ + -e QDRANT__GPU__INDEXING=1 \ + qdrant/qdrant:gpu-amd-latest +``` + +Check logs to confirm GPU initialization. Example log output: +```text +2025-01-10T11:56:55.926466Z INFO gpu::instance: Found GPU device: AMD Radeon Graphics (RADV GFX1103_R1) +2025-01-10T11:56:55.926485Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 17.0.6, 256 bits) +2025-01-10T11:56:55.926504Z INFO gpu::device: Create GPU device AMD Radeon Graphics (RADV GFX1103_R1) ``` -```go -import ( - "context" +This concludes the setup. In a basic scenario, you won't need to configure anything else. - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## Known limitations -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), - Limit: qdrant.PtrOf(uint64(100)), - }, - }, - Query: qdrant.NewQueryMulti([][]float32{ - {0.1, 0.2}, - {0.2, 0.1}, - {0.8, 0.9}, - }), - Using: qdrant.PtrOf("colbert"), -}) +* **Platform Support:** Docker images are only available for Linux x86_64. Windows, macOS, ARM, and other platforms are not supported. +* **Memory Limits:** Each GPU can process up to 16GB of vector data per indexing iteration. 
+ 
+Due to this limitation, you should not create segments where either the original vectors or the quantized vectors are larger than 16GB.
+
+For example, a collection with 1536d vectors and scalar quantization (1 byte per dimension) can have at most:
+
+```text
+16GB / 1536 bytes ~= 11 million vectors per segment
```

And without quantization (4 bytes per dimension):

```text
16GB / (1536 * 4 bytes) ~= 2.7 million vectors per segment
```

The maximum size of each segment can be configured in the collection settings.
Use the following operation to [change](/documentation/concepts/collections/#update-collection-parameters) it on your existing collection:

```http
PATCH collections/{collection_name}
{
  "optimizers_config": {
    "max_segment_size": 1000000
  }
}
```

Note that `max_segment_size` is specified in kilobytes.

<|page-139-lllmstxt|>
# Capacity Planning

When setting up your cluster, you'll need to figure out the right balance of **RAM** and **disk storage**. The best setup depends on a few things:

- How many vectors you have and their dimensions.
- The amount of payload data you're storing and its indexes.
- What data you want to store in memory versus on disk.
- Your cluster's replication settings.
- Whether you're using quantization and how you've set it up.

## Calculating RAM size

You should store frequently accessed data in RAM for faster retrieval. If you want to keep all vectors in memory for optimal performance, you can use this rough formula for estimation:

```text
memory_size = number_of_vectors * vector_dimension * 4 bytes * 1.5
```

At the end, we multiply everything by 1.5. This extra 50% accounts for metadata (such as indexes and point versions) and temporary segments created during optimization. 
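If you prefer to script this estimate while sizing a cluster, a few lines of Python are enough. The helper below is only an illustrative sketch of the formula above (the function name and the int8 shortcut are ours, not part of any Qdrant tooling); the worked example that follows applies the same arithmetic.

```python
def estimate_ram_bytes(
    num_vectors: int,
    dim: int,
    bytes_per_dim: int = 4,
    overhead: float = 1.5,
) -> float:
    """Rough RAM estimate for keeping all vectors in memory.

    bytes_per_dim is 4 for full-precision float32 vectors, or 1 if only
    int8-quantized copies are kept in RAM. The overhead factor covers
    metadata such as indexes, point versions, and temporary segments
    created during optimization.
    """
    return num_vectors * dim * bytes_per_dim * overhead


# 1M vectors with 1024 dimensions, full-precision float32:
print(estimate_ram_bytes(1_000_000, 1024))  # ~6.1e9 bytes, about 5.7 GiB

# The same vectors when int8 scalar quantization is held in RAM instead:
print(estimate_ram_bytes(1_000_000, 1024, bytes_per_dim=1))  # ~1.5e9 bytes
```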
-client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch(PrefetchQueryBuilder::default() - .add_prefetch(PrefetchQueryBuilder::default() - .query(Query::new_nearest(vec![1.0, 23.0, 45.0, 67.0])) - .using("mlr_byte") - .limit(1000u64) - ) - .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) - .using("full") - .limit(100u64) - ) - .query(Query::new_nearest(vec![\ - vec![0.1, 0.2],\ - vec![0.2, 0.1],\ - vec![0.8, 0.9],\ - ])) - .using("colbert") - .limit(10u64) -).await?; +Let's say you want to store 1 million vectors with 1024 dimensions: +```text +memory_size = 1,000,000 * 1024 * 4 bytes * 1.5 ``` +The memory_size is approximately 6,144,000,000 bytes, or about 5.72 GB. -```java -import static io.qdrant.client.QueryFactory.nearest; +Depending on the use case, large datasets can benefit from reduced memory requirements via [quantization](/documentation/guides/quantization/). -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; +## Calculating payload size -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +This is always different. The size of the payload depends on the [structure and content of your data](/documentation/concepts/payload/#payload-types). For instance: -client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch( - PrefetchQuery.newBuilder() - .addPrefetch( - PrefetchQuery.newBuilder() - .setQuery(nearest(1, 23, 45, 67)) // <------------- small byte vector - .setUsing("mrl_byte") - .setLimit(1000) - .build()) - .setQuery(nearest(0.01f, 0.45f, 0.67f)) // <-- dense vector - .setUsing("full") - .setLimit(100) - .build()) - .setQuery( - nearest( - new float[][] { - {0.1f, 0.2f}, // <─┐ - {0.2f, 0.1f}, // < ├─ multi-vector - {0.8f, 0.9f} // < ┘ - })) - .setUsing("colbert") - .setLimit(10) - .build()) - .get(); +- **Text fields** consume space based on length and encoding (e.g. a large chunk of text vs a few words). +- **Floats** have fixed sizes of 8 bytes for `int64` or `float64`. +- **Boolean fields** typically consume 1 byte. -``` + -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Calculating total payload size is similar to vectors. We have to multiply it by 1.5 for back-end indexing processes. -var client = new QdrantClient("localhost", 6334); +```text +total_payload_size = number_of_points * payload_size * 1.5 +``` -await client.QueryAsync( - collectionName: "{collection_name}", - prefetch: new List { - new() { - Prefetch = { - new List { - new() { - Query = new float[] { 1, 23, 45, 67 }, // <------------- small byte vector - Using = "mrl_byte", - Limit = 1000 - }, - } - }, - Query = new float[] {0.01f, 0.45f, 0.67f}, // <-- dense vector - Using = "full", - Limit = 100 - } - }, - query: new float[][] { - [0.1f, 0.2f], // <─┐ - [0.2f, 0.1f], // < ├─ multi-vector - [0.8f, 0.9f] // < ┘ - }, - usingVector: "colbert", - limit: 10 -); +Let's say you want to store 1 million points with JSON payloads of 5KB: +```text +total_payload_size = 1,000,000 * 5KB * 1.5 ``` +The total_payload_size is approximately 5,000,000 bytes, or about 4.77 GB. -```go -import ( - "context" +## Choosing disk over RAM - "github.com/qdrant/go-client/qdrant" -) +For optimal performance, you should store only frequently accessed data in RAM. The rest should be offloaded to the disk. 
For example, extra payload fields that you don't use for filtering can be stored on disk. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Only [indexed fields](/documentation/concepts/indexing/#payload-index) should be stored in RAM. You can read more about payload storage in the [Storage](/documentation/concepts/storage/#payload-storage) section. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQueryDense([]float32{1, 23, 45, 67}), - Using: qdrant.PtrOf("mrl_byte"), - Limit: qdrant.PtrOf(uint64(1000)), - }, - }, - Query: qdrant.NewQueryDense([]float32{0.01, 0.45, 0.67}), - Limit: qdrant.PtrOf(uint64(100)), - Using: qdrant.PtrOf("full"), - }, - }, - Query: qdrant.NewQueryMulti([][]float32{ - {0.1, 0.2}, - {0.2, 0.1}, - {0.8, 0.9}, - }), - Using: qdrant.PtrOf("colbert"), -}) +### Storage-focused configuration -``` +If your priority is to handle large volumes of vectors with average search latency, it's recommended to configure [memory-mapped (mmap) storage](/documentation/concepts/storage/#configuring-memmap-storage). In this setup, vectors are stored on disk in memory-mapped files, while only the most frequently accessed vectors are cached in RAM. -## [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#score-boosting) Score boosting +The amount of available RAM greatly impacts search performance. As a general rule, if you store half as many vectors in RAM, search latency will roughly double. -_Available as of v1.14.0_ +Disk speed is also crucial. [Contact us](/documentation/support/) if you have specific requirements for high-volume searches in our Cloud. -When introducing vector search to specific applications, sometimes business logic needs to be considered for ranking the final list of results. +### Subgroup-oriented configuration -A quick example is [our own documentation search bar](https://github.com/qdrant/page-search). -It has vectors for every part of the documentation site. If one were to perform a search by “just” using the vectors, all kinds of elements would be equally considered good results. -However, when searching for documentation, we can establish a hierarchy of importance: +If your use case involves splitting vectors into multiple collections or subgroups based on payload values (e.g., serving searches for multiple users, each with their own subset of vectors), memory-mapped storage is recommended. -`title > content > snippets` +In this scenario, only the active subset of vectors will be cached in RAM, allowing for fast searches for the most recent and active users. You can estimate the required memory size as: -One way to solve this is to weight the results based on the kind of element. -For example, we can assign a higher weight to titles and content, and keep snippets unboosted. +```text +memory_size = number_of_active_vectors * vector_dimension * 4 bytes * 1.5 +``` -Pseudocode would be something like: +Please refer to our [multitenancy](/documentation/guides/multiple-partitions/) documentation for more details on partitioning data in a Qdrant. -`score = score + (is_title * 0.5) + (is_content * 0.25)` +## Scaling disk space in Qdrant Cloud -Query API can rescore points with custom formulas. They can be based on: +Clusters supporting vector search require substantial disk space compared to other search systems. 
If you're running low on disk space, you can use the UI at [cloud.qdrant.io](https://cloud.qdrant.io/) to **Scale Up** your cluster. -- Dynamic payload values -- Conditions -- Scores of prefetches + -To express the formula, the syntax uses objects to identify each element. -Taking the documentation example, the request would look like this: +When running low on disk space, consider the following benefits of scaling up: -httppythontypescriptrustjavacsharpgo +- **Larger Datasets**: Supports larger datasets, which can improve the relevance and quality of search results. +- **Improved Indexing**: Enables the use of advanced indexing strategies like HNSW. +- **Caching**: Enhances speed by having more RAM, allowing more frequently accessed data to be cached. +- **Backups and Redundancy**: Facilitates more frequent backups, which is a key advantage for data safety. -```http -POST /collections/{collection_name}/points/query -{ - "prefetch": { - "query": [0.2, 0.8, ...], // <-- dense vector - "limit": 50 - } - "query": { - "formula": { - "sum": [\ - "$score,\ - {\ - "mult": [\ - 0.5,\ - {\ - "key": "tag",\ - "match": { "any": ["h1", "h2", "h3", "h4"] } }\ - ]\ - },\ - {\ - "mult": [\ - 0.25,\ - {\ - "key": "tag",\ - "match": { "any": ["p", "li"] }\ - }\ - ]\ - }\ - ] - } - } -} +Always remember to add 50% of the vector size. This would account for things like indexes and auxiliary data used during operations such as vector insertion, deletion, and search. Thus, the estimated memory size including metadata is: +```text +total_vector_size = number_of_dimensions * 4 bytes * 1.5 ``` -```python -from qdrant_client import models +**Disclaimer** -tag_boosted = client.query_points( - collection_name="{collection_name}", - prefetch=models.Prefetch( - query=[0.2, 0.8, ...], # <-- dense vector - limit=50 - ), - query=models.FormulaQuery( - formula=models.SumExpression(sum=[\ - "$score",\ - models.MultExpression(mult=[0.5, models.FieldCondition(key="tag", match=models.MatchAny(any=["h1", "h2", "h3", "h4"]))]),\ - models.MultExpression(mult=[0.25, models.FieldCondition(key="tag", match=models.MatchAny(any=["p", "li"]))])\ - ] - )) -) +The above calculations are estimates at best. If you're looking for more accurate numbers, you should always test your data set in practice. -``` +<|page-140-lllmstxt|> +# What is FastEmbed? +FastEmbed is a lightweight Python library built for embedding generation. It supports popular embedding models and offers a user-friendly experience for embedding data into vector space. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +By using FastEmbed, you can ensure that your embedding generation process is not only fast and efficient but also highly accurate, meeting the needs of various machine learning and natural language processing applications. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +FastEmbed easily integrates with Qdrant for a variety of multimodal search purposes. 
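Before diving into the guides below, here is a minimal sketch of what embedding generation looks like with FastEmbed. It assumes a recent FastEmbed release where `TextEmbedding` is the dense text-embedding class; the model name shown is the library's usual default and is only an example.

```python
from fastembed import TextEmbedding

# Any supported model name can be passed; BAAI/bge-small-en-v1.5 is the usual default.
model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

documents = [
    "Qdrant is a vector database and similarity search engine.",
    "FastEmbed generates embeddings with the ONNX runtime.",
]

# embed() returns a generator of numpy arrays, one vector per input document.
embeddings = list(model.embed(documents))
print(len(embeddings), embeddings[0].shape)  # 2 vectors, 384-dimensional for this model
```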
-const tag_boosted = await client.query(collectionName, { - prefetch: { - query: [0.2, 0.8, 0.1, 0.9], - limit: 50 - }, - query: { - formula: { - sum: [\ - "$score",\ - {\ - mult: [ 0.5, { key: "tag", match: { any: ["h1", "h2", "h3", "h4"] }} ]\ - },\ - {\ - mult: [ 0.25, { key: "tag", match: { any: ["p", "li"] }} ]\ - }\ - ] - } - } -}); +## Using FastEmbed -``` +| Type | Guide | What you'll learn | +|---|-------|--------------------| +| **Beginner** | [Generate Text Embeddings](/documentation/fastembed/fastembed-quickstart/) | Install FastEmbed and generate dense text embeddings | +| | [Dense Embeddings + Qdrant](/documentation/fastembed/fastembed-semantic-search/) | Generate and index dense embeddings for semantic similarity search | +| **Advanced** | [miniCOIL Sparse Embeddings + Qdrant](/documentation/fastembed/fastembed-minicoil/) | Use Qdrant's sparse neural retriever for exact text search | +| | [SPLADE Sparse Embeddings + Qdrant](/documentation/fastembed/fastembed-splade/) | Generate sparse neural embeddings for exact text search | +| | [ColBERT Multivector Embeddings + Qdrant](/documentation/fastembed/fastembed-colbert/) | Generate and index multi-vector representations; **ideal for rescoring, or small-scale retrieval** | +| | [Reranking with FastEmbed](/documentation/fastembed/fastembed-rerankers/) | Re-rank top-K results using FastEmbed cross-encoders | -```rust -use qdrant_client::qdrant::{ - Condition, Expression, FormulaBuilder, PrefetchQueryBuilder, QueryPointsBuilder, -}; -use qdrant_client::Qdrant; +## Why is FastEmbed useful? -let client = Qdrant::from_url("http://localhost:6334").build()?; +- Light: Unlike other inference frameworks, such as PyTorch, FastEmbed requires very little external dependencies. Because it uses the ONNX runtime, it is perfect for serverless environments like AWS Lambda. +- Fast: By using ONNX, FastEmbed ensures high-performance inference across various hardware platforms. +- Accurate: FastEmbed aims for better accuracy and recall than models like OpenAI's `Ada-002`. It always uses model which demonstrate strong results on the MTEB leaderboard. +- Support: FastEmbed supports a wide range of models, including multilingual ones, to meet diverse use case needs. -let _tag_boosted = client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch(PrefetchQueryBuilder::default() - .query(vec![0.01, 0.45, 0.67]) - .limit(100u64) - ) - .query(FormulaBuilder::new(Expression::sum_with([\ - Expression::score(),\ - Expression::mult_with([\ - Expression::constant(0.5),\ - Expression::condition(Condition::matches("tag", ["h1", "h2", "h3", "h4"])),\ - ]),\ - Expression::mult_with([\ - Expression::constant(0.25),\ - Expression::condition(Condition::matches("tag", ["p", "li"])),\ - ]),\ - ]))) - .limit(10) - ).await?; +<|page-141-lllmstxt|> +# Optimizing Qdrant Performance: Three Scenarios -``` +Different use cases require different balances between memory usage, search speed, and precision. Qdrant is designed to be flexible and customizable so you can tune it to your specific needs. 
-```java -import java.util.List; +This guide will walk you three main optimization strategies: -import static io.qdrant.client.ConditionFactory.matchKeywords; -import static io.qdrant.client.ExpressionFactory.condition; -import static io.qdrant.client.ExpressionFactory.constant; -import static io.qdrant.client.ExpressionFactory.mult; -import static io.qdrant.client.ExpressionFactory.sum; -import static io.qdrant.client.ExpressionFactory.variable; -import static io.qdrant.client.QueryFactory.formula; -import static io.qdrant.client.QueryFactory.nearest; +- High Speed Search & Low Memory Usage +- High Precision & Low Memory Usage +- High Precision & High Speed Search -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Formula; -import io.qdrant.client.grpc.Points.MultExpression; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SumExpression; +![qdrant resource tradeoffs](/docs/tradeoff.png) -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +## 1. High-Speed Search with Low Memory Usage -client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch( - PrefetchQuery.newBuilder() - .setQuery(nearest(0.01f, 0.45f, 0.67f)) - .setLimit(100) - .build()) - .setQuery( - formula( - Formula.newBuilder() - .setExpression( - sum( - SumExpression.newBuilder() - .addSum(variable("$score")) - .addSum( - mult( - MultExpression.newBuilder() - .addMult(constant(0.5f)) - .addMult( - condition( - matchKeywords( - "tag", - List.of("h1", "h2", "h3", "h4")))) - .build())) - .addSum(mult(MultExpression.newBuilder() - .addMult(constant(0.25f)) - .addMult( - condition( - matchKeywords( - "tag", - List.of("p", "li")))) - .build())) - .build())) - .build())) - .build()) - .get(); +To achieve high search speed with minimal memory usage, you can store vectors on disk while minimizing the number of disk reads. Vector quantization is a technique that compresses vectors, allowing more of them to be stored in memory, thus reducing the need to read from disk. + +To configure in-memory quantization, with on-disk original vectors, you need to create a collection with the following parameters: + +- `on_disk`: Stores original vectors on disk. +- `quantization_config`: Compresses quantized vectors to `int8` using the `scalar` method. +- `always_ram`: Keeps quantized vectors in RAM. 
+```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + always_ram=True, + ), + ), +) ``` ```csharp using Qdrant.Client; using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - prefetch: - [\ - new PrefetchQuery { Query = new float[] { 0.01f, 0.45f, 0.67f }, Limit = 100 },\ - ], - query: new Formula - { - Expression = new SumExpression - { - Sum = - { - "$score", - new MultExpression - { - Mult = { 0.5f, Match("tag", ["h1", "h2", "h3", "h4"]) }, - }, - new MultExpression { Mult = { 0.25f, Match("tag", ["p", "li"]) } }, - }, - }, - }, - limit: 10 +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, + quantizationConfig: new QuantizationConfig + { + Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } + } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) - -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQuery(0.01, 0.45, 0.67), - }, - }, - Query: qdrant.NewQueryFormula(&qdrant.Formula{ - Expression: qdrant.NewExpressionSum(&qdrant.SumExpression{ - Sum: []*qdrant.Expression{ - qdrant.NewExpressionVariable("$score"), - qdrant.NewExpressionMult(&qdrant.MultExpression{ - Mult: []*qdrant.Expression{ - qdrant.NewExpressionConstant(0.5), - qdrant.NewExpressionCondition(qdrant.NewMatchKeywords("tag", "h1", "h2", "h3", "h4")), - }, - }), - qdrant.NewExpressionMult(&qdrant.MultExpression{ - Mult: []*qdrant.Expression{ - qdrant.NewExpressionConstant(0.25), - qdrant.NewExpressionCondition(qdrant.NewMatchKeywords("tag", "p", "li")), - }, - }), - }, - }), - }), + Host: "localhost", + Port: 6334, }) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), + QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + AlwaysRam: qdrant.PtrOf(true), + }), +}) ``` -There are multiple expressions available, check the [API docs for specific details](https://api.qdrant.tech/v-1-14-x/api-reference/search/query-points#request.body.query.Query%20Interface.Query.Formula%20Query.formula). - -- **constant** \- A floating point number. e.g. `0.5`. -- `"$score"` \- Reference to the score of the point in the prefetch. This is the same as `"$score[0]"`. -- `"$score[0]"`, `"$score[1]"`, `"$score[2]"`, 
 \- When using multiple prefetches, you can reference specific prefetch with the index within the array of prefetches. -- **payload key** \- Any plain string will refer to a payload key. This uses the jsonpath format used in every other place, e.g. `key` or `key.subkey`. It will try to extract a number from the given key. -- **condition** \- A filtering condition. If the condition is met, it becomes `1.0`, otherwise `0.0`. -- **mult** \- Multiply an array of expressions. -- **sum** \- Sum an array of expressions. -- **div** \- Divide an expression by another expression. -- **abs** \- Absolute value of an expression. -- **pow** \- Raise an expression to the power of another expression. -- **sqrt** \- Square root of an expression. -- **log10** \- Base 10 logarithm of an expression. -- **ln** \- Natural logarithm of an expression. -- **exp** \- Exponential function of an expression ( `e^x`). -- **geo distance** \- Haversine distance between two geographic points. Values need to be `{ "lat": 0.0, "lon": 0.0 }` objects. -- **decay** \- Apply a decay function to an expression, which clamps the output between 0 and 1. Available decay functions are **linear**, **exponential**, and **gaussian**. [See more](https://qdrant.tech/documentation/concepts/hybrid-queries/#boost-points-closer-to-user). -- **datetime** \- Parse a datetime string (see formats [here](https://qdrant.tech/documentation/concepts/payload/#datetime)), and use it as a POSIX timestamp, in seconds. -- **datetime key** \- Specify that a payload key contains a datetime string to be parsed into POSIX seconds. - -It is possible to define a default for when the variable (either from payload or prefetch score) is not found. This is given in the form of a mapping from variable to value. -If there is no variable, and no defined default, a default value of `0.0` is used. - -### [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#boost-points-closer-to-user) Boost points closer to user - -Another example. Combine the score with how close the result is to a user. - -Considering each point has an associated geo location, we can calculate the distance between the point and the request’s location. - -Assuming we have cosine scores in the prefetch, we can use a helper function to clamp the geographical distance between 0 and 1, by using a decay function. Once clamped, we can sum the score and the distance together. Pseudocode: - -`score = score + gauss_decay(distance)` - -In this case we use a **gauss\_decay** function. 
- -httppythontypescriptrustjavacsharpgo - -```http -POST /collections/{collection_name}/points/query -{ - "prefetch": { "query": [0.2, 0.8, ...], "limit": 50 }, - "query": { - "formula": { - "sum": [\ - "$score",\ - {\ - "gauss_decay": {\ - "x": {\ - "geo_distance": {\ - "origin": { "lat": 52.504043, "lon": 13.393236 }\ - "to": "geo.location"\ - }\ - },\ - "scale": 5000 // 5km\ - }\ - }\ - ] - }, - "defaults": { "geo.location": {"lat": 48.137154, "lon": 11.576124} } - } -} - -``` - -```python -from qdrant_client import models - -geo_boosted = client.query_points( - collection_name="{collection_name}", - prefetch=models.Prefetch( - query=[0.2, 0.8, ...], # <-- dense vector - limit=50 - ), - query=models.FormulaQuery( - formula=models.SumExpression(sum=[\ - "$score",\ - models.GaussDecayExpression(\ - gauss_decay=models.DecayParamsExpression(\ - x=models.GeoDistance(\ - geo_distance=models.GeoDistanceParams(\ - origin=models.GeoPoint(\ - lat=52.504043,\ - lon=13.393236\ - ), # Berlin\ - to="geo.location"\ - )\ - ),\ - scale=5000 # 5km\ - )\ - )\ - ]), - defaults={"geo.location": models.GeoPoint(lat=48.137154, lon=11.576124)} # Munich - ) -) - +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + }, + "quantization_config": { + "scalar": { + "type": "int8", + "always_ram": true + } + } +} ``` ```typescript @@ -34858,419 +51383,405 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -const distance_boosted = await client.query(collectionName, { - prefetch: { - query: [0.2, 0.8, ...], - limit: 50 +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + on_disk: true, }, - query: { - formula: { - sum: [\ - "$score",\ - {\ - gauss_decay: {\ - x: {\ - geo_distance: {\ - origin: { lat: 52.504043, lon: 13.393236 }, // Berlin\ - to: "geo.location"\ - }\ - },\ - scale: 5000 // 5km\ - }\ - }\ - ] + quantization_config: { + scalar: { + type: "int8", + always_ram: true, }, - defaults: { "geo.location": { lat: 48.137154, lon: 11.576124 } } // Munich - } + }, }); - ``` ```rust use qdrant_client::qdrant::{ - GeoPoint, DecayParamsExpressionBuilder, Expression, FormulaBuilder, PrefetchQueryBuilder, QueryPointsBuilder, + CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, + VectorParamsBuilder, }; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; -let _geo_boosted = client.query( - QueryPointsBuilder::new("{collection_name}") - .add_prefetch( - PrefetchQueryBuilder::default() - .query(vec![0.01, 0.45, 0.67]) - .limit(100u64), - ) - .query( - FormulaBuilder::new(Expression::sum_with([\ - Expression::score(),\ - Expression::exp_decay(\ - DecayParamsExpressionBuilder::new(Expression::geo_distance_with(\ - // Berlin\ - GeoPoint { lat: 52.504043, lon: 13.393236 },\ - "geo.location",\ - ))\ - .scale(5_000.0),\ - ),\ - ])) - // Munich - .add_default("geo.location", GeoPoint { lat: 48.137154, lon: 11.576124 }), - ) - .limit(10), +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .quantization_config( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .always_ram(true), + ), ) .await?; - ``` ```java -import static io.qdrant.client.ExpressionFactory.expDecay; -import static io.qdrant.client.ExpressionFactory.geoDistance; -import static 
io.qdrant.client.ExpressionFactory.sum; -import static io.qdrant.client.ExpressionFactory.variable; -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.QueryFactory.formula; -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.ValueFactory.value; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.DecayParamsExpression; -import io.qdrant.client.grpc.Points.Formula; -import io.qdrant.client.grpc.Points.GeoDistance; -import io.qdrant.client.grpc.Points.GeoPoint; -import io.qdrant.client.grpc.Points.PrefetchQuery; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SumExpression; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .addPrefetch( - PrefetchQuery.newBuilder() - .setQuery(nearest(0.01f, 0.45f, 0.67f)) - .setLimit(100) - .build()) - .setQuery( - formula( - Formula.newBuilder() - .setExpression( - sum( - SumExpression.newBuilder() - .addSum(variable("$score")) - .addSum( - expDecay( - DecayParamsExpression.newBuilder() - .setX( - geoDistance( - GeoDistance.newBuilder() - .setOrigin( - GeoPoint.newBuilder() - .setLat(52.504043) - .setLon(13.393236) - .build()) - .setTo("geo.location") - .build())) - .setScale(5000) - .build())) - .build())) - .putDefaults( - "geo.location", - value( - Map.of( - "lat", value(48.137154), - "lon", value(11.576124)))) - .build())) - .build()) - .get(); + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` + +### Disable Rescoring for Faster Search (optional) +This is completely optional. Disabling rescoring with search `params` can further reduce the number of disk reads. Note that this might slightly decrease precision. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams(rescore=False) + ), +) ``` ```csharp using Qdrant.Client; using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Expression; var client = new QdrantClient("localhost", 6334); await client.QueryAsync( - collectionName: "{collection_name}", - prefetch: - [\ - new PrefetchQuery { Query = new float[] { 0.01f, 0.45f, 0.67f }, Limit = 100 },\ - ], - query: new Formula - { - Expression = new SumExpression - { - Sum = - { - "$score", - FromExpDecay( - new() - { - X = new GeoDistance - { - Origin = new GeoPoint { Lat = 52.504043, Lon = 13.393236 }, - To = "geo.location", - }, - Scale = 5000, - } - ), - }, - }, - Defaults = - { - ["geo.location"] = new Dictionary - { - ["lat"] = 48.137154, - ["lon"] = 11.576124, - }, - }, - } + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + searchParams: new SearchParams + { + Quantization = new QuantizationSearchParams { Rescore = false } + }, + limit: 3 ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Prefetch: []*qdrant.PrefetchQuery{ - { - Query: qdrant.NewQuery(0.2, 0.8), - }, - }, - Query: qdrant.NewQueryFormula(&qdrant.Formula{ - Expression: qdrant.NewExpressionSum(&qdrant.SumExpression{ - Sum: []*qdrant.Expression{ - qdrant.NewExpressionVariable("$score"), - qdrant.NewExpressionExpDecay(&qdrant.DecayParamsExpression{ - X: qdrant.NewExpressionGeoDistance(&qdrant.GeoDistance{ - Origin: &qdrant.GeoPoint{ - Lat: 52.504043, - Lon: 13.393236, - }, - To: "geo.location", - }), - }), - }, - }), - Defaults: qdrant.NewValueMap(map[string]any{ - "geo.location": map[string]any{ - "lat": 48.137154, - "lon": 11.576124, - }, - }), - }), + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Params: &qdrant.SearchParams{ + Quantization: &qdrant.QuantizationSearchParams{ + Rescore: qdrant.PtrOf(true), + }, + }, }) - ``` -For all decay functions, there are these parameters available - -| Parameter | Default | Description | -| --- | --- | --- | -| `x` | N/A | The value to decay | -| `target` | 0.0 | The value at which the decay will be at its peak. For distances it is usually set at 0.0, but can be set to any value. | -| `scale` | 1.0 | The value at which the decay function will be equal to `midpoint`. This is in terms of `x` units, for example, if `x` is in meters, `scale` of 5000 means 5km. Must be a non-zero positive number | -| `midpoint` | 0.5 | Output is `midpoint` when `x` equals `scale`. Must be in the range (0.0, 1.0), exclusive | - -The formulas for each decay function are as follows: - -Loading... 
- -[edit graph on](https://www.desmos.com/calculator/idv5hknwb1) - -scale - -target - -midpoint - -"x"x - -"y"y - -"a" squareda2 - -"a" Superscript, "b" , Baselineab - -77 - -88 - -99 - -overĂ· - -functions - -(( - -)) - -less than< - -greater than> - -44 - -55 - -66 - -times× - -\| "a" \|\|a\| - -,, - -less than or equal to≀ - -greater than or equal to≄ +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "params": { + "quantization": { + "rescore": false + } + }, + "limit": 10 +} +``` -11 +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -22 +const client = new QdrantClient({ host: "localhost", port: 6333 }); -33 +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + params: { + quantization: { + rescore: false, + }, + }, +}); +``` -negative− +```rust +use qdrant_client::qdrant::{ + QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, +}; +use qdrant_client::Qdrant; -ABC +let client = Qdrant::from_url("http://localhost:6334").build()?; -StartRoot, , EndRoot +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .params( + SearchParamsBuilder::default() + .quantization(QuantizationSearchParamsBuilder::default().rescore(false)), + ), + ) + .await?; +``` -piπ +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QuantizationSearchParams; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; -00 +import static io.qdrant.client.QueryFactory.nearest; -.. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -equals= +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setParams( + SearchParams.newBuilder() + .setQuantization( + QuantizationSearchParams.newBuilder().setRescore(false).build()) + .build()) + .setLimit(3) + .build()) + .get(); +``` -positive+ +## 2. High Precision with Low Memory Usage -#### [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#decay-functions) Decay functions +If you require high precision but have limited RAM, you can store both vectors and the HNSW index on disk. This setup reduces memory usage while maintaining search precision. 
-**`lin_decay`** (green), range: `[0, 1]` +To store the vectors `on_disk`, you need to configure both the vectors and the HNSW index: -lin\_decay(x)=max(0,−(1−midpoint)scale⋅abs(x−target)+1) +```python +from qdrant_client import QdrantClient, models -**`exp_decay`** (red), range: `(0, 1]` +client = QdrantClient(url="http://localhost:6333") -exp\_decay(x)=exp⁥(ln⁥(midpoint)scale⋅abs(x−target)) +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), + hnsw_config=models.HnswConfigDiff(on_disk=True), +) +``` -**`gauss_decay`** (purple), range: `(0, 1]` +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -gauss\_decay(x)=exp⁥(ln⁥(midpoint)scale2⋅(x−target)2) +var client = new QdrantClient("localhost", 6334); -## [Anchor](https://qdrant.tech/documentation/concepts/hybrid-queries/\#grouping) Grouping +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, + hnswConfig: new HnswConfigDiff { OnDisk = true } +); +``` -_Available as of v1.11.0_ +```go +import ( + "context" -It is possible to group results by a certain field. This is useful when you have multiple points for the same item, and you want to avoid redundancy of the same item in the results. + "github.com/qdrant/go-client/qdrant" +) -REST API ( [Schema](https://api.qdrant.tech/master/api-reference/search/query-points-groups)): +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -httppythontypescriptrustjavacsharpgo +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), + HnswConfig: &qdrant.HnswConfigDiff{ + OnDisk: qdrant.PtrOf(true), + }, +}) +``` ```http -POST /collections/{collection_name}/points/query/groups +PUT /collections/{collection_name} { - // Same as in the regular query API - "query": [1.1], - // Grouping parameters - "group_by": "document_id", // Path of the field to group by - "limit": 4, // Max amount of groups - "group_size": 2 // Max amount of points per group + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + }, + "hnsw_config": { + "on_disk": true + } } - ``` -```python -client.query_points_groups( - collection_name="{collection_name}", - # Same as in the regular query_points() API - query=[1.1], - # Grouping parameters - group_by="document_id", # Path of the field to group by - limit=4, # Max amount of groups - group_size=2, # Max amount of points per group -) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -``` +const client = new QdrantClient({ host: "localhost", port: 6333 }); -```typescript -client.queryGroups("{collection_name}", { - query: [1.1], - group_by: "document_id", - limit: 4, - group_size: 2, +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + on_disk: true, + }, + hnsw_config: { + on_disk: true, + }, }); - ``` ```rust -use qdrant_client::qdrant::QueryPointGroupsBuilder; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, HnswConfigDiffBuilder, + VectorParamsBuilder, +}; +use qdrant_client::Qdrant; + +let client = Qdrant::from_url("http://localhost:6334").build()?; client - .query_groups( - QueryPointGroupsBuilder::new("{collection_name}", 
"document_id") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .group_size(2u64) - .with_payload(true) - .with_vectors(true) - .limit(4u64), + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)) + .hnsw_config(HnswConfigDiffBuilder::default().on_disk(true)), ) .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.HnswConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .build()) + .setHnswConfig(HnswConfigDiff.newBuilder().setOnDisk(true).build()) + .build()) + .get(); +``` + +### Improving Precision +Increase the `ef` and `m` parameters of the HNSW index to improve precision, even with limited RAM: + +```json +... +"hnsw_config": { + "m": 64, + "ef_construct": 512, + "on_disk": true +} +... ``` -```java -import java.util.List; +**Note:** The speed of this setup depends on the disk’s IOPS (Input/Output Operations Per Second).
+You can use [fio](https://gist.github.com/superboum/aaa45d305700a7873a8ebbab1abddf2b) to measure disk IOPS. -import io.qdrant.client.grpc.Points.SearchPointGroups; +## 3. High Precision with High-Speed Search -client.queryGroupsAsync( - QueryPointGroups.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setGroupBy("document_id") - .setLimit(4) - .setGroupSize(2) - .build()) - .get(); +For scenarios requiring both high speed and high precision, keep as much data in RAM as possible. Apply quantization with re-scoring for tunable accuracy. +Here is how you can configure scalar quantization for a collection: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + always_ram=True, + ), + ), +) ``` ```csharp using Qdrant.Client; +using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryGroupsAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - groupBy: "document_id", - limit: 4, - groupSize: 2 +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine}, + quantizationConfig: new QuantizationConfig + { + Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } + } ); - ``` ```go @@ -35285,7707 +51796,9257 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - GroupBy: "document_id", - GroupSize: qdrant.PtrOf(uint64(2)), + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + AlwaysRam: qdrant.PtrOf(true), + }), }) - ``` -For more information on the `grouping` capabilities refer to the reference documentation for search with [grouping](https://qdrant.tech/documentation/concepts/search/#search-groups) and [lookup](https://qdrant.tech/documentation/concepts/search/#lookup-in-groups). - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/hybrid-queries.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
- -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/hybrid-queries.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) - -<|page-93-lllmstxt|> -## hybrid-search -- [Articles](https://qdrant.tech/articles/) -- Hybrid Search Revamped - Building with Qdrant's Query API - -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" + }, + "quantization_config": { + "scalar": { + "type": "int8", + "always_ram": true + } + } +} +``` -# Hybrid Search Revamped - Building with Qdrant's Query API +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Kacper Ɓukawski +const client = new QdrantClient({ host: "localhost", port: 6333 }); -· +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + quantization_config: { + scalar: { + type: "int8", + always_ram: true, + }, + }, +}); +``` -July 25, 2024 +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, + VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -![Hybrid Search Revamped - Building with Qdrant's Query API](https://qdrant.tech/articles_data/hybrid-search/preview/title.jpg) +let client = Qdrant::from_url("http://localhost:6334").build()?; -It’s been over a year since we published the original article on how to build a hybrid -search system with Qdrant. The idea was straightforward: combine the results from different search methods to improve -retrieval quality. Back in 2023, you still needed to use an additional service to bring lexical search -capabilities and combine all the intermediate results. Things have changed since then. Once we introduced support for -sparse vectors, [the additional search service became obsolete](https://qdrant.tech/articles/sparse-vectors/), but you were still -required to combine the results from different methods on your end. +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .quantization_config( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .always_ram(true), + ), + ) + .await?; +``` -**Qdrant 1.10 introduces a new Query API that lets you build a search system by combining different search methods** -**to improve retrieval quality**. Everything is now done on the server side, and you can focus on building the best search -experience for your users. In this article, we will show you how to utilize the new [Query\\ -API](https://qdrant.tech/documentation/concepts/search/#query-api) to build a hybrid search system. 
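To give a taste of what that looks like before diving into the details, here is a minimal sketch (the collection name, vector names, and values are placeholders) of a dense and a sparse search fused server-side with Reciprocal Rank Fusion:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

result = client.query_points(
    collection_name="my-collection",
    prefetch=[
        # Lexical-style candidates from a named sparse vector
        models.Prefetch(
            query=models.SparseVector(indices=[125, 9325, 58214], values=[0.164, 0.229, 0.731]),
            using="sparse",
            limit=20,
        ),
        # Semantic candidates from a named dense vector (values truncated for brevity)
        models.Prefetch(
            query=[0.01, 0.45, 0.67],
            using="dense",
            limit=20,
        ),
    ],
    # Fuse both candidate lists on the server side with Reciprocal Rank Fusion
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=10,
)
```

Everything that follows in this article is a generalization of this pattern: more prefetch branches and more reranking steps, but still a single call to Qdrant.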
+```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#introducing-the-new-query-api) Introducing the new Query API +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -At Qdrant, we believe that vector search capabilities go well beyond a simple search for nearest neighbors. -That’s why we provided separate methods for different search use cases, such as `search`, `recommend`, or `discover`. -With the latest release, we are happy to introduce the new Query API, which combines all of these methods into a single -endpoint and also supports creating nested multistage queries that can be used to build complex search pipelines. +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` -If you are an existing Qdrant user, you probably have a running search mechanism that you want to improve, whether sparse -or dense. Doing any changes should be preceded by a proper evaluation of its effectiveness. +### Fine-Tuning Search Parameters -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#how-effective-is-your-search-system) How effective is your search system? +You can adjust search parameters like `hnsw_ef` and `exact` to balance between speed and precision: -None of the experiments makes sense if you don’t measure the quality. How else would you compare which method works -better for your use case? The most common way of doing that is by using the standard metrics, such as `precision@k`, -`MRR`, or `NDCG`. There are existing libraries, such as [ranx](https://amenra.github.io/ranx/), that can help you with -that. We need to have the ground truth dataset to calculate any of these, but curating it is a separate task. +**Key Parameters:** +- `hnsw_ef`: Number of neighbors to visit during search (higher value = better accuracy, slower speed). +- `exact`: Set to `true` for exact search, which is slower but more accurate. You can use it to compare results of the search with different `hnsw_ef` values versus the ground truth. 
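For illustration, here is a minimal Python sketch (the collection name, query vector, and `limit` are placeholders) that estimates precision by checking how many approximate results are confirmed by an exact search:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

query_vector = [0.2, 0.1, 0.9, 0.7]
limit = 10

# Approximate HNSW search with a chosen `hnsw_ef`
approx = client.query_points(
    collection_name="{collection_name}",
    query=query_vector,
    search_params=models.SearchParams(hnsw_ef=128),
    limit=limit,
).points

# Exact (full-scan) search used as the ground truth
exact = client.query_points(
    collection_name="{collection_name}",
    query=query_vector,
    search_params=models.SearchParams(exact=True),
    limit=limit,
).points

# precision@limit: share of approximate results confirmed by the exact search
exact_ids = {point.id for point in exact}
precision = sum(point.id in exact_ids for point in approx) / limit
print(f"precision@{limit} = {precision:.2f}")
```

Running this over a sample of queries gives a quick feel for how a given `hnsw_ef` trades accuracy for speed.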
```python -from ranx import Qrels, Run, evaluate +from qdrant_client import QdrantClient, models -# Qrels, or query relevance judgments, keep the ground truth data -qrels_dict = { "q_1": { "d_12": 5, "d_25": 3 }, - "q_2": { "d_11": 6, "d_22": 1 } } +client = QdrantClient(url="http://localhost:6333") -# Runs are built from the search results -run_dict = { "q_1": { "d_12": 0.9, "d_23": 0.8, "d_25": 0.7, - "d_36": 0.6, "d_32": 0.5, "d_35": 0.4 }, - "q_2": { "d_12": 0.9, "d_11": 0.8, "d_25": 0.7, - "d_36": 0.6, "d_22": 0.5, "d_35": 0.4 } } +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams(hnsw_ef=128, exact=False), + limit=3, +) +``` -# We need to create both objects, and then we can evaluate the run against the qrels -qrels = Qrels(qrels_dict) -run = Run(run_dict) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -# Calculating the NDCG@5 metric is as simple as that -evaluate(qrels, run, "ndcg@5") +var client = new QdrantClient("localhost", 6334); +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + searchParams: new SearchParams { HnswEf = 128, Exact = false }, + limit: 3 +); ``` -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#available-embedding-options-with-query-api) Available embedding options with Query API - -Support for multiple vectors per point is nothing new in Qdrant, but introducing the Query API makes it even -more powerful. The 1.10 release supports the multivectors, allowing you to treat embedding lists -as a single entity. There are many possible ways of utilizing this feature, and the most prominent one is the support -for late interaction models, such as [ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/). Instead of having a single embedding for each document or query, this -family of models creates a separate one for each token of text. In the search process, the final score is calculated -based on the interaction between the tokens of the query and the document. Contrary to cross-encoders, document -embedding might be precomputed and stored in the database, which makes the search process much faster. If you are -curious about the details, please check out [the article about ColBERT, written by our friends from Jina\\ -AI](https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/). - -![Late interaction](https://qdrant.tech/articles_data/hybrid-search/late-interaction.png) +```go +import ( + "context" -Besides multivectors, you can use regular dense and sparse vectors, and experiment with smaller data types to reduce -memory use. Named vectors can help you store different dimensionalities of the embeddings, which is useful if you -use multiple models to represent your data, or want to utilize the Matryoshka embeddings. + "github.com/qdrant/go-client/qdrant" +) -![Multiple vectors per point](https://qdrant.tech/articles_data/hybrid-search/multiple-vectors.png) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -There is no single way of building a hybrid search. The process of designing it is an exploratory exercise, where you -need to test various setups and measure their effectiveness. Building a proper search experience is a -complex task, and it’s better to keep it data-driven, not just rely on the intuition. 
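As a purely illustrative sketch of such a setup (the vector names and dimensionalities below are an assumption, not a recommendation), a single collection can hold several named vectors of different sizes side by side:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# One collection, several named dense vectors of different sizes plus a sparse one.
client.create_collection(
    collection_name="my-collection",
    vectors_config={
        "matryoshka-64dim": models.VectorParams(size=64, distance=models.Distance.COSINE),
        "matryoshka-128dim": models.VectorParams(size=128, distance=models.Distance.COSINE),
        "matryoshka-256dim": models.VectorParams(size=256, distance=models.Distance.COSINE),
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams(),
    },
)
```

Each point can then carry all of these representations at once, and any of them can be used for prefetching or reranking within a single query.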
+client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Params: &qdrant.SearchParams{ + HnswEf: qdrant.PtrOf(uint64(128)), + Exact: qdrant.PtrOf(false), + }, +}) +``` -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#fusion-vs-reranking) Fusion vs reranking +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "params": { + "hnsw_ef": 128, + "exact": false + }, + "limit": 3 +} +``` -We can, distinguish two main approaches to building a hybrid search system: fusion and reranking. The former is about -combining the results from different search methods, based solely on the scores returned by each method. That usually -involves some normalization, as the scores returned by different methods might be in different ranges. After that, there -is a formula that takes the relevancy measures and calculates the final score that we use later on to reorder the -documents. Qdrant has built-in support for the Reciprocal Rank Fusion method, which is the de facto standard in the -field. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -![Fusion](https://qdrant.tech/articles_data/hybrid-search/fusion.png) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Reranking, on the other hand, is about taking the results from different search methods and reordering them based on -some additional processing using the content of the documents, not just the scores. This processing may rely on an -additional neural model, such as a cross-encoder which would be inefficient enough to be used on the whole dataset. -These methods are practically applicable only when used on a smaller subset of candidates returned by the faster search -methods. Late interaction models, such as ColBERT, are way more efficient in this case, as they can be used to rerank -the candidates without the need to access all the documents in the collection. +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + params: { + hnsw_ef: 128, + exact: false, + }, + limit: 3, +}); +``` -![Reranking](https://qdrant.tech/articles_data/hybrid-search/reranking.png) +```rust +use qdrant_client::qdrant::{QueryPointsBuilder, SearchParamsBuilder}; +use qdrant_client::Qdrant; -### [Anchor](https://qdrant.tech/articles/hybrid-search/\#why-not-a-linear-combination) Why not a linear combination? +let client = Qdrant::from_url("http://localhost:6334").build()?; -It’s often proposed to use full-text and vector search scores to form a linear combination formula to rerank -the results. So it goes like this: +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)), + ) + .await?; +``` -`final_score = 0.7 * vector_score + 0.3 * full_text_score` +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; -However, we didn’t even consider such a setup. Why? Those scores don’t make the problem linearly separable. We used -the BM25 score along with cosine vector similarity to use both of them as points coordinates in 2-dimensional space. 
The -chart shows how those points are distributed: +import static io.qdrant.client.QueryFactory.nearest; -![A distribution of both Qdrant and BM25 scores mapped into 2D space.](https://qdrant.tech/articles_data/hybrid-search/linear-combination.png) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -_A distribution of both Qdrant and BM25 scores mapped into 2D space. It clearly shows relevant and non-relevant_ -_objects are not linearly separable in that space, so using a linear combination of both scores won’t give us_ -_a proper hybrid search._ +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setParams(SearchParams.newBuilder().setHnswEf(128).setExact(false).build()) + .setLimit(3) + .build()) + .get(); +``` -Both relevant and non-relevant items are mixed. **None of the linear formulas would be able to distinguish** -**between them.** Thus, that’s not the way to solve it. +## Balancing Latency and Throughput -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#building-a-hybrid-search-system-in-qdrant) Building a hybrid search system in Qdrant +When optimizing search performance, latency and throughput are two main metrics to consider: +- **Latency:** Time taken for a single request. +- **Throughput:** Number of requests handled per second. -Ultimately, **any search mechanism might also be a reranking mechanism**. You can prefetch results with sparse vectors -and then rerank them with the dense ones, or the other way around. Or, if you have Matryoshka embeddings, you can start -with oversampling the candidates with the dense vectors of the lowest dimensionality and then gradually reduce the -number of candidates by reranking them with the higher-dimensional embeddings. Nothing stops you from -combining both fusion and reranking. +The following optimization approaches are not mutually exclusive, but in some cases it might be preferable to optimize for one or another. -Let’s go a step further and build a hybrid search mechanism that combines the results from the -Matryoshka embeddings, dense vectors, and sparse vectors and then reranks them with the late interaction model. In the -meantime, we will introduce additional reranking and fusion steps. +### Minimizing Latency -![Complex search pipeline](https://qdrant.tech/articles_data/hybrid-search/complex-search-pipeline.png) +To minimize latency, you can set up Qdrant to use as many cores as possible for a single request. +You can do this by setting the number of segments in the collection to be equal to the number of cores in the system. -Our search pipeline consists of two branches, each of them responsible for retrieving a subset of documents that -we eventually want to rerank with the late interaction model. Let’s connect to Qdrant first and then build the search -pipeline. +In this case, each segment will be processed in parallel, and the final result will be obtained faster. ```python from qdrant_client import QdrantClient, models -client = QdrantClient("http://localhost:6333") - -``` - -All the steps utilizing Matryoshka embeddings might be specified in the Query API as a nested structure: +client = QdrantClient(url="http://localhost:6333") -```python -# The first branch of our search pipeline retrieves 25 documents -# using the Matryoshka embeddings with multistep retrieval. 
-matryoshka_prefetch = models.Prefetch( - prefetch=[\ - models.Prefetch(\ - prefetch=[\ - # The first prefetch operation retrieves 100 documents\ - # using the Matryoshka embeddings with the lowest\ - # dimensionality of 64.\ - models.Prefetch(\ - query=[0.456, -0.789, ..., 0.239],\ - using="matryoshka-64dim",\ - limit=100,\ - ),\ - ],\ - # Then, the retrieved documents are re-ranked using the\ - # Matryoshka embeddings with the dimensionality of 128.\ - query=[0.456, -0.789, ..., -0.789],\ - using="matryoshka-128dim",\ - limit=50,\ - )\ - ], - # Finally, the results are re-ranked using the Matryoshka - # embeddings with the dimensionality of 256. - query=[0.456, -0.789, ..., 0.123], - using="matryoshka-256dim", - limit=25, +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + optimizers_config=models.OptimizersConfigDiff(default_segment_number=16), ) - ``` -Similarly, we can build the second branch of our search pipeline, which retrieves the documents using the dense and -sparse vectors and performs the fusion of them using the Reciprocal Rank Fusion method: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```python -# The second branch of our search pipeline also retrieves 25 documents, -# but uses the dense and sparse vectors, with their results combined -# using the Reciprocal Rank Fusion. -sparse_dense_rrf_prefetch = models.Prefetch( - prefetch=[\ - models.Prefetch(\ - prefetch=[\ - # The first prefetch operation retrieves 100 documents\ - # using dense vectors using integer data type. Retrieval\ - # is faster, but quality is lower.\ - models.Prefetch(\ - query=[7, 63, ..., 92],\ - using="dense-uint8",\ - limit=100,\ - )\ - ],\ - # Integer-based embeddings are then re-ranked using the\ - # float-based embeddings. Here we just want to retrieve\ - # 25 documents.\ - query=[-1.234, 0.762, ..., 1.532],\ - using="dense",\ - limit=25,\ - ),\ - # Here we just add another 25 documents using the sparse\ - # vectors only.\ - models.Prefetch(\ - query=models.SparseVector(\ - indices=[125, 9325, 58214],\ - values=[-0.164, 0.229, 0.731],\ - ),\ - using="sparse",\ - limit=25,\ - ),\ - ], - # RRF is activated below, so there is no need to specify the - # query vector here, as fusion is done on the scores of the - # retrieved documents. - query=models.FusionQuery( - fusion=models.Fusion.RRF, - ), -) +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + optimizersConfig: new OptimizersConfigDiff { DefaultSegmentNumber = 16 } +); ``` -The second branch could have already been called hybrid, as it combines the results from the dense and sparse vectors -with fusion. However, nothing stops us from building even more complex search pipelines. - -Here is how the target call to the Query API would look like in Python: +```go +import ( + "context" -```python -client.query_points( - "my-collection", - prefetch=[\ - matryoshka_prefetch,\ - sparse_dense_rrf_prefetch,\ - ], - # Finally rerank the results with the late interaction model. It only - # considers the documents retrieved by all the prefetch operations above. - # Return 10 final results. 
- query=[\ - [1.928, -0.654, ..., 0.213],\ - [-1.197, 0.583, ..., 1.901],\ - ...,\ - [0.112, -1.473, ..., 1.786],\ - ], - using="late-interaction", - with_payload=False, - limit=10, + "github.com/qdrant/go-client/qdrant" ) -``` - -The options are endless, the new Query API gives you the flexibility to experiment with different setups. **You** -**rarely need to build such a complex search pipeline**, but it’s good to know that you can do that if needed. - -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#lessons-learned-multi-vector-representations) Lessons learned: multi-vector representations - -Many of you have already started building hybrid search systems and reached out to us with questions and feedback. -We’ve seen many different approaches, however one recurring idea was to utilize **multi-vector representations with** -**ColBERT-style models as a reranking step**, after retrieving candidates with single-vector dense and/or sparse methods. -This reflects the latest trends in the field, as single-vector methods are still the most efficient, but multivectors -capture the nuances of the text better. - -![Reranking with late interaction models](https://qdrant.tech/articles_data/hybrid-search/late-interaction-reranking.png) - -Assuming you never use late interaction models for retrieval alone, but only for reranking, this setup comes with a -hidden cost. By default, each configured dense vector of the collection will have a corresponding HNSW graph created. -Even, if it is a multi-vector. - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(...) -client.create_collection( - collection_name="my-collection", - vectors_config={ - "dense": models.VectorParams(...), - "late-interaction": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - ) - }, - sparse_vectors_config={ - "sparse": models.SparseVectorParams(...) - }, -) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + DefaultSegmentNumber: qdrant.PtrOf(uint64(16)), + }, +}) ``` -Reranking will never use the created graph, as all the candidates are already retrieved. Multi-vector ranking will only -be applied to the candidates retrieved by the previous steps, so no search operation is needed. HNSW becomes redundant -while still the indexing process has to be performed, and in that case, it will be quite heavy. ColBERT-like models -create hundreds of embeddings for each document, so the overhead is significant. **To avoid it, you can disable the HNSW** -**graph creation for this kind of model**: - -```python -client.create_collection( - collection_name="my-collection", - vectors_config={ - "dense": models.VectorParams(...), - "late-interaction": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - hnsw_config=models.HnswConfigDiff( - m=0, # Disable HNSW graph creation - ), - ) - }, - sparse_vectors_config={ - "sparse": models.SparseVectorParams(...) 
+```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" }, -) - + "optimizers_config": { + "default_segment_number": 16 + } +} ``` -You won’t notice any difference in the search performance, but the use of resources will be significantly lower when you -upload the embeddings to the collection. - -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#some-anecdotal-observations) Some anecdotal observations - -Neither of the algorithms performs best in all cases. In some cases, keyword-based search -will be the winner and vice-versa. The following table shows some interesting examples we could find in the -[WANDS](https://github.com/wayfair/WANDS) dataset during experimentation: - -| Query | BM25 Search | Vector Search | -| --- | --- | --- | -| cybersport desk | desk ❌ | gaming desk ✅ | -| plates for icecream | "eat" plates on wood wall dĂ©cor ❌ | alicyn 8.5 '' melamine dessert plate ✅ | -| kitchen table with a thick board | craft kitchen acacia wood cutting board ❌ | industrial solid wood dining table ✅ | -| wooden bedside table | 30 '' bedside table lamp ❌ | portable bedside end table ✅ | - -Also examples where keyword-based search did better: - -| Query | BM25 Search | Vector Search | -| --- | --- | --- | -| computer chair | vibrant computer task chair ✅ | office chair ❌ | -| 64.2 inch console table | cervantez 64.2 '' console table ✅ | 69.5 '' console table ❌ | - -## [Anchor](https://qdrant.tech/articles/hybrid-search/\#try-the-new-query-api-in-qdrant-110) Try the New Query API in Qdrant 1.10 - -The new Query API introduced in Qdrant 1.10 is a game-changer for building hybrid search systems. You don’t need any -additional services to combine the results from different search methods, and you can even create more complex pipelines -and serve them directly from Qdrant. - -Our webinar on _Building the Ultimate Hybrid Search_ takes you through the process of building a hybrid search system -with Qdrant Query API. If you missed it, you can [watch the recording](https://www.youtube.com/watch?v=LAZOxqzceEU), or -[check the notebooks](https://github.com/qdrant/workshop-ultimate-hybrid-search). - -How to Build the Ultimate Hybrid Search with Qdrant - YouTube - -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) - -Qdrant - Vector Database & Search Engine - -8.12K subscribers - -[How to Build the Ultimate Hybrid Search with Qdrant](https://www.youtube.com/watch?v=LAZOxqzceEU) - -Qdrant - Vector Database & Search Engine - -Search - -Watch later - -Share - -Copy link - -Info - -Shopping - -Tap to unmute - -If playback doesn't begin shortly, try restarting your device. - -More videos - -## More videos - -You're signed out - -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. - -CancelConfirm - -Share - -Include playlist - -An error occurred while retrieving sharing information. Please try again later. 
- -[Watch on](https://www.youtube.com/watch?v=LAZOxqzceEU&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -0:00 +const client = new QdrantClient({ host: "localhost", port: 6333 }); -0:00 / 1:01:18 -‱Live +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + optimizers_config: { + default_segment_number: 16, + }, +}); +``` -‱ +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -[Watch on YouTube](https://www.youtube.com/watch?v=LAZOxqzceEU "Watch on YouTube") +let client = Qdrant::from_url("http://localhost:6334").build()?; -If you have any questions or need help with building your hybrid search system, don’t hesitate to reach out to us on -[Discord](https://qdrant.to/discord). +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .optimizers_config( + OptimizersConfigDiffBuilder::default().default_segment_number(16), + ), + ) + .await?; +``` -##### Was this page useful? +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -Thank you for your feedback! 🙏 +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder().setDefaultSegmentNumber(16).build()) + .build()) + .get(); +``` +### Maximizing Throughput -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/hybrid-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +To maximize throughput, configure Qdrant to use as many cores as possible to process multiple requests in parallel. -On this page: +To do that, use fewer segments (usually 2) of larger size (default 200Mb per segment) to handle more requests in parallel. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/hybrid-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Large segments benefit from the size of the index and overall smaller number of vector comparisons required to find the nearest neighbors. However, they will require more time to build the HNSW index. -× +```python +from qdrant_client import QdrantClient, models -[Powered by](https://qdrant.tech/) +client = QdrantClient(url="http://localhost:6333") -<|page-94-lllmstxt|> -## why-rust -- [Articles](https://qdrant.tech/articles/) -- Why Rust? 
+client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + optimizers_config=models.OptimizersConfigDiff(default_segment_number=2, max_segment_size=5000000), +) +``` -[Back to Qdrant Articles](https://qdrant.tech/articles/) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -# Why Rust? +var client = new QdrantClient("localhost", 6334); -Andre Bogus +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + optimizersConfig: new OptimizersConfigDiff { DefaultSegmentNumber = 2, MaxSegmentSize = 5000000 } +); +``` -· +```go +import ( + "context" -May 11, 2023 + "github.com/qdrant/go-client/qdrant" +) -![Why Rust?](https://qdrant.tech/articles_data/why-rust/preview/title.jpg) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -# [Anchor](https://qdrant.tech/articles/why-rust/\#building-qdrant-in-rust) Building Qdrant in Rust +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + DefaultSegmentNumber: qdrant.PtrOf(uint64(2)), + MaxSegmentSize: qdrant.PtrOf(uint64(5000000)), + }, +}) +``` -Looking at the [github repository](https://github.com/qdrant/qdrant), you can see that Qdrant is built in [Rust](https://rust-lang.org/). Other offerings may be written in C++, Go, Java or even Python. So why does Qdrant chose Rust? Our founder Andrey had built the first prototype in C++, but didn’t trust his command of the language to scale to a production system (to be frank, he likened it to cutting his leg off). He was well versed in Java and Scala and also knew some Python. However, he considered neither a good fit: +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" + }, + "optimizers_config": { + "default_segment_number": 2, + "max_segment_size": 5000000 + } +} +``` -**Java** is also more than 30 years old now. With a throughput-optimized VM it can often at least play in the same ball park as native services, and the tooling is phenomenal. Also portability is surprisingly good, although the GC is not suited for low-memory applications and will generally take good amount of RAM to deliver good performance. That said, the focus on throughput led to the dreaded GC pauses that cause latency spikes. Also the fat runtime incurs high start-up delays, which need to be worked around. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -**Scala** also builds on the JVM, although there is a native compiler, there was the question of compatibility. So Scala shared the limitations of Java, and although it has some nice high-level amenities (of which Java only recently copied a subset), it still doesn’t offer the same level of control over memory layout as, say, C++, so it is similarly disqualified. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -**Python**, being just a bit younger than Java, is ubiquitous in ML projects, mostly owing to its tooling (notably jupyter notebooks), being easy to learn and integration in most ML stacks. It doesn’t have a traditional garbage collector, opting for ubiquitous reference counting instead, which somewhat helps memory consumption. 
With that said, unless you only use it as glue code over high-perf modules, you may find yourself waiting for results. Also getting complex python services to perform stably under load is a serious technical challenge. +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + optimizers_config: { + default_segment_number: 2, + max_segment_size: 5000000, + }, +}); +``` -## [Anchor](https://qdrant.tech/articles/why-rust/\#into-the-unknown) Into the Unknown +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -So Andrey looked around at what younger languages would fit the challenge. After some searching, two contenders emerged: Go and Rust. Knowing neither, Andrey consulted the docs, and found hinself intrigued by Rust with its promise of Systems Programming without pervasive memory unsafety. +let client = Qdrant::from_url("http://localhost:6334").build()?; -This early decision has been validated time and again. When first learning Rust, the compiler’s error messages are very helpful (and have only improved in the meantime). It’s easy to keep memory profile low when one doesn’t have to wrestle a garbage collector and has complete control over stack and heap. Apart from the much advertised memory safety, many footguns one can run into when writing C++ have been meticulously designed out. And it’s much easier to parallelize a task if one doesn’t have to fear data races. +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .optimizers_config( + OptimizersConfigDiffBuilder::default().default_segment_number(2).max_segment_size(5000000), + ), + ) + .await?; +``` -With Qdrant written in Rust, we can offer cloud services that don’t keep us awake at night, thanks to Rust’s famed robustness. A current qdrant docker container comes in at just a bit over 50MB — try that for size. As for performance
 have some [benchmarks](https://qdrant.tech/benchmarks/). +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -And we don’t have to compromise on ergonomics either, not for us nor for our users. Of course, there are downsides: Rust compile times are usually similar to C++’s, and though the learning curve has been considerably softened in the last years, it’s still no match for easy-entry languages like Python or Go. But learning it is a one-time cost. Contrast this with Go, where you may find [the apparent simplicity is only skin-deep](https://fasterthanli.me/articles/i-want-off-mr-golangs-wild-ride). +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -## [Anchor](https://qdrant.tech/articles/why-rust/\#smooth-is-fast) Smooth is Fast +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setOptimizersConfig( + OptimizersConfigDiff.newBuilder() + .setDefaultSegmentNumber(2) + .setMaxSegmentSize(5000000) + .build() + ) + .build()) + .get(); +``` -The complexity of the type system pays large dividends in bugs that didn’t even make it to a commit. The ecosystem for web services is also already quite advanced, perhaps not at the same point as Java, but certainly matching or outcompeting Go. +## Summary -Some people may think that the strict nature of Rust will slow down development, which is true only insofar as it won’t let you cut any corners. However, experience has conclusively shown that this is a net win. In fact, Rust lets us [ride the wall](https://the-race.com/nascar/bizarre-wall-riding-move-puts-chastain-into-nascar-folklore/), which makes us faster, not slower. +By adjusting configurations like vector storage, quantization, and search parameters, you can optimize Qdrant for different use cases: +- **Low Memory + High Speed:** Use vector quantization. +- **High Precision + Low Memory:** Store vectors and HNSW index on disk. +- **High Precision + High Speed:** Keep data in RAM, use quantization with re-scoring. +- **Latency vs. Throughput:** Adjust segment numbers based on the priority. -The job market for Rust programmers is certainly not as big as that for Java or Python programmers, but the language has finally reached the mainstream, and we don’t have any problems getting and retaining top talent. And being an open source project, when we get contributions, we don’t have to check for a wide variety of errors that Rust already rules out. +Choose the strategy that best fits your use case to get the most out of Qdrant’s performance capabilities. -## [Anchor](https://qdrant.tech/articles/why-rust/\#in-rust-we-trust) In Rust We Trust +<|page-142-lllmstxt|> +# Getting Started with Qdrant Managed Cloud -Finally, the Rust community is a very friendly bunch, and we are delighted to be part of that. And we don’t seem to be alone. Most large IT companies (notably Amazon, Google, Huawei, Meta and Microsoft) have already started investing in Rust. 
It’s in the Windows font system already and in the process of coming to the Linux kernel (build support has already been included). In machine learning applications, Rust has been tried and proven by the likes of Aleph Alpha and Huggingface, among many others. +Welcome to Qdrant Managed Cloud! This document contains all the information you need to get started. -To sum up, choosing Rust was a lucky guess that has brought huge benefits to Qdrant. Rust continues to be our not-so-secret weapon. +## Prerequisites -### [Anchor](https://qdrant.tech/articles/why-rust/\#key-takeaways) Key Takeaways: +Before creating a cluster, make sure you have a Qdrant Cloud account. Detailed instructions for signing up can be found in the [Qdrant Cloud Setup](/documentation/cloud/qdrant-cloud-setup/) guide. Qdrant Cloud supports granular [role-based access control](/documentation/cloud-rbac/). -- **Rust’s Advantages for Qdrant:** Rust provides memory safety and control without a garbage collector, which is crucial for Qdrant’s high-performance cloud services. +You also need to provide [payment details](/documentation/cloud/pricing-payments/). If you have a custom payment agreement, first create your account, then [contact our Support Team](https://support.qdrant.io/) to finalize the setup. -- **Low Overhead:** Qdrant’s Rust-based system offers efficiency, with small Docker container sizes and robust performance benchmarks. +Premium Plan subscribers can enable single sign-on (SSO) for their organizations. To activate SSO, please reach out to the Support Team at [https://support.qdrant.io/](https://support.qdrant.io/) for guidance. -- **Complexity vs. Simplicity:** Rust’s strict type system reduces bugs early in development, making it faster in the long run despite initial learning curves. +## Cluster Sizing -- **Adoption by Major Players:** Large tech companies like Amazon, Google, and Microsoft are embracing Rust, further validating Qdrant’s choice. +Before deploying any cluster, consider the resources needed for your specific workload. Our [Capacity Planning guide](/documentation/guides/capacity-planning/) describes how to assess the required CPU, memory, and storage. Additionally, the [Pricing Calculator](https://cloud.qdrant.io/calculator) helps you estimate associated costs based on your projected usage. -- **Community and Talent:** The supportive Rust community and increasing availability of Rust developers make it easier for Qdrant to grow and innovate. +## Creating and Managing Clusters +After setting up your account, you can create a Qdrant Cluster by following the steps in [Create a Cluster](/documentation/cloud/create-cluster/). -##### Was this page useful? +## Preparing for Production -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +For a production-ready environment, consider deploying a multi-node Qdrant cluster (at least three nodes) with replication enabled. More details are available in the [Distributed Deployment](/documentation/guides/distributed_deployment/) guide. For more information on how to create a production-ready cluster, see our [Vector Search in Production](/articles/vector-search-production/) article. -Thank you for your feedback! 🙏 +If you are looking to optimize costs, you can reduce memory usage through [Quantization](/documentation/guides/quantization/) or by [offloading vectors to disk](/documentation/concepts/storage/#configuring-memmap-storage). -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/why-rust.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Infrastructure as Code Automation -On this page: +Qdrant Cloud can be fully automated using the [Qdrant Cloud API](/documentation/cloud-api/). This allows you to create, manage, and scale clusters programmatically. You can also use our [Terraform Provider](https://registry.terraform.io/providers/qdrant/qdrant-cloud) to automate your Qdrant Cloud infrastructure. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/why-rust.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +<|page-143-lllmstxt|> +# Configure Multitenancy -× +**How many collections should you create?** In most cases, you should only use a single collection with payload-based partitioning. This approach is called multitenancy. It is efficient for most of users, but it requires additional configuration. This document will show you how to set it up. -[Powered by](https://qdrant.tech/) +**When should you create multiple collections?** When you have a limited number of users and you need isolation. This approach is flexible, but it may be more costly, since creating numerous collections may result in resource overhead. Also, you need to ensure that they do not affect each other in any way, including performance-wise. -<|page-95-lllmstxt|> -## cloud-premium -- [Documentation](https://qdrant.tech/documentation/) -- Premium Tier +## Partition by payload -# [Anchor](https://qdrant.tech/documentation/cloud-premium/\#qdrant-cloud-premium-tier) Qdrant Cloud Premium Tier +When an instance is shared between multiple users, you may need to partition vectors by user. This is done so that each user can only access their own vectors and can't see the vectors of other users. -Qdrant Cloud offers an optional premium tier for customers who require additional features and better SLA support levels. The premium tier includes: -- **24/7 Support**: Our support team is available around the clock to help you with any issues you may encounter (compared to 10x5 in standard). -- **Shorter Response Times**: Premium customers receive priority support and can expect faster response times, with shorter SLAs. -- **99.9% Uptime SLA**: We guarantee 99.9% uptime for your Qdrant Cloud clusters (compared to 99.5% in standard). -- **Single Sign-On (SSO)**: Premium customers can use their existing SSO provider to manage access to Qdrant Cloud. -- **VPC Private Links**: Premium customers can connect their Qdrant Cloud clusters to their VPCs using private links (AWS only). -- **Storage encryption with shared keys**: Premium customers can encrypt their data at rest using their own keys (AWS only). + -Please refer to the [Qdrant Cloud SLA](https://qdrant.to/sla/) for a detailed definition on uptime and support SLAs. +```python +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1, + payload={"group_id": "user_1"}, + vector=[0.9, 0.1, 0.1], + ), + models.PointStruct( + id=2, + payload={"group_id": "user_1"}, + vector=[0.1, 0.9, 0.1], + ), + models.PointStruct( + id=3, + payload={"group_id": "user_2"}, + vector=[0.1, 0.1, 0.9], + ), + ], +) +``` -If you are interested in switching to Qdrant Cloud Premium, please [contact us](https://qdrant.tech/contact-us/) for more information. 
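To complement the capacity-planning guidance referenced above, here is a rough back-of-envelope estimate in Python, using the commonly cited rule of thumb of roughly 1.5× overhead on top of raw 4-byte float vectors (an approximation, not an exact sizing formula):

```python
# Rough memory estimate for a planned workload (illustrative numbers)
num_vectors = 1_000_000
dimension = 768
bytes_per_float = 4
overhead = 1.5  # rule-of-thumb multiplier for index and metadata

estimated_ram = num_vectors * dimension * bytes_per_float * overhead
print(f"~{estimated_ram / 1024**3:.1f} GiB of RAM")  # ~4.3 GiB for this example
```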
+```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -##### Was this page useful? +var client = new QdrantClient("localhost", 6334); -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new[] { 0.9f, 0.1f, 0.1f }, + Payload = { ["group_id"] = "user_1" } + }, + new() + { + Id = 2, + Vectors = new[] { 0.1f, 0.9f, 0.1f }, + Payload = { ["group_id"] = "user_1" } + }, + new() + { + Id = 3, + Vectors = new[] { 0.1f, 0.1f, 0.9f }, + Payload = { ["group_id"] = "user_2" } + } + } +); +``` -Thank you for your feedback! 🙏 +```go +import ( + "context" -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-premium.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + "github.com/qdrant/go-client/qdrant" +) -On this page: +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-premium.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_1"}), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_1"}), + }, + { + Id: qdrant.NewIDNum(3), + Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), + Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_2"}), + }, + }, +}) +``` -× +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "payload": {"group_id": "user_1"}, + "vector": [0.9, 0.1, 0.1] + }, + { + "id": 2, + "payload": {"group_id": "user_1"}, + "vector": [0.1, 0.9, 0.1] + }, + { + "id": 3, + "payload": {"group_id": "user_2"}, + "vector": [0.1, 0.1, 0.9] + }, + ] +} +``` -[Powered by](https://qdrant.tech/) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -<|page-96-lllmstxt|> -## graphrag-qdrant-neo4j -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- GraphRAG with Qdrant and Neo4j +const client = new QdrantClient({ host: "localhost", port: 6333 }); -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#build-a-graphrag-agent-with-neo4j-and-qdrant) Build a GraphRAG Agent with Neo4j and Qdrant +client.upsert("{collection_name}", { + points: [ + { + id: 1, + payload: { group_id: "user_1" }, + vector: [0.9, 0.1, 0.1], + }, + { + id: 2, + payload: { group_id: "user_1" }, + vector: [0.1, 0.9, 0.1], + }, + { + id: 3, + payload: { group_id: "user_2" }, + vector: [0.1, 0.1, 0.9], + }, + ], +}); +``` -![image0](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/image0.png) +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::Qdrant; -| Time: 30 min | Level: Intermediate | Output: [GitHub](https://github.com/qdrant/examples/blob/master/graphrag_neo4j/graphrag.py) | -| --- | --- | --- | +let client = 
Qdrant::from_url("http://localhost:6334").build()?; -To make Artificial Intelligence (AI) systems more intelligent and reliable, we face a paradox: Large Language Models (LLMs) possess remarkable reasoning capabilities, yet they struggle to connect information in ways humans find intuitive. While groundbreaking, Retrieval-Augmented Generation (RAG) approaches often fall short when tasked with complex information synthesis. When asked to connect disparate pieces of information or understand holistic concepts across large documents, these systems frequently miss crucial connections that would be obvious to human experts. +client + .upsert_points(UpsertPointsBuilder::new( + "{collection_name}", + vec![ + PointStruct::new(1, vec![0.9, 0.1, 0.1], [("group_id", "user_1".into())]), + PointStruct::new(2, vec![0.1, 0.9, 0.1], [("group_id", "user_1".into())]), + PointStruct::new(3, vec![0.1, 0.1, 0.9], [("group_id", "user_2".into())]), + ], + )) + .await?; +``` -To solve these problems, Microsoft introduced **GraphRAG,** which uses Knowledge Graphs (KGs) instead of vectors as a context for LLMs. GraphRAG depends mainly on LLMs for creating KGs and querying them. However, this reliance on LLMs can lead to many problems. We will address these challenges by combining vector databases with graph-based databases. +```java +import java.util.List; +import java.util.Map; -This tutorial will demonstrate how to build a GraphRAG system with vector search using Neo4j and Qdrant. +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; -| Additional Materials | -| --- | -| This advanced tutorial is based on our original integration doc: [**Neo4j - Qdrant Integration**](https://qdrant.tech/documentation/frameworks/neo4j-graphrag/) | -| The output for this tutorial is in our GitHub Examples repo: [**Neo4j - Qdrant Agent in Python**](https://github.com/qdrant/examples/blob/master/graphrag_neo4j/graphrag.py) | +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#watch-the-video) Watch the Video +client + .upsertAsync( + "{collection_name}", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.9f, 0.1f, 0.1f)) + .putAllPayload(Map.of("group_id", value("user_1"))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors(vectors(0.1f, 0.9f, 0.1f)) + .putAllPayload(Map.of("group_id", value("user_1"))) + .build(), + PointStruct.newBuilder() + .setId(id(3)) + .setVectors(vectors(0.1f, 0.1f, 0.9f)) + .putAllPayload(Map.of("group_id", value("user_2"))) + .build())) + .get(); +``` -GraphRAG with Qdrant & Neo4j: Combining Vector Search and Knowledge Graphs - YouTube +2. Use a filter along with `group_id` to filter vectors for each user. 
-[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +```python +from qdrant_client import QdrantClient, models -Qdrant - Vector Database & Search Engine +client = QdrantClient(url="http://localhost:6333") -8.12K subscribers +client.query_points( + collection_name="{collection_name}", + query=[0.1, 0.1, 0.9], + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="group_id", + match=models.MatchValue( + value="user_1", + ), + ) + ] + ), + limit=10, +) +``` -[GraphRAG with Qdrant & Neo4j: Combining Vector Search and Knowledge Graphs](https://www.youtube.com/watch?v=o9pszzRuyjo) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; -Qdrant - Vector Database & Search Engine +var client = new QdrantClient("localhost", 6334); -Search +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.1f, 0.1f, 0.9f }, + filter: MatchKeyword("group_id", "user_1"), + limit: 10 +); +``` -Watch later +```go +import ( + "context" -Share + "github.com/qdrant/go-client/qdrant" +) -Copy link +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Info +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.1, 0.1, 0.9), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("group_id", "user_1"), + }, + }, +}) +``` -Shopping +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.1, 0.1, 0.9], + "filter": { + "must": [ + { + "key": "group_id", + "match": { + "value": "user_1" + } + } + ] + }, + "limit": 10 +} +``` -Tap to unmute +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -If playback doesn't begin shortly, try restarting your device. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -More videos +client.query("{collection_name}", { + query: [0.1, 0.1, 0.9], + filter: { + must: [{ key: "group_id", match: { value: "user_1" } }], + }, + limit: 10, +}); +``` -## More videos +```rust +use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder}; +use qdrant_client::Qdrant; -You're signed out +let client = Qdrant::from_url("http://localhost:6334").build()?; -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.1, 0.1, 0.9]) + .limit(10) + .filter(Filter::must([Condition::matches( + "group_id", + "user_1".to_string(), + )])), + ) + .await?; +``` -CancelConfirm +```java +import java.util.List; -Share +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.QueryPoints; -Include playlist +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ConditionFactory.matchKeyword; -An error occurred while retrieving sharing information. Please try again later. 
+QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -[Watch on](https://www.youtube.com/watch?v=o9pszzRuyjo&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder().addMust(matchKeyword("group_id", "user_1")).build()) + .setQuery(nearest(0.1f, 0.1f, 0.9f)) + .setLimit(10) + .build()) + .get(); +``` -0:00 +## Calibrate performance -0:00 / 11:11 -‱Live +The speed of indexation may become a bottleneck in this case, as each user's vector will be indexed into the same collection. To avoid this bottleneck, consider _bypassing the construction of a global vector index_ for the entire collection and building it only for individual groups instead. -‱ +By adopting this strategy, Qdrant will index vectors for each user independently, significantly accelerating the process. -[Watch on YouTube](https://www.youtube.com/watch?v=o9pszzRuyjo "Watch on YouTube") +To implement this approach, you should: -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#rag--its-challenges) RAG & Its Challenges +1. Set `payload_m` in the HNSW configuration to a non-zero value, such as 16. +2. Set `m` in hnsw config to 0. This will disable building global index for the whole collection. -[RAG](https://qdrant.tech/rag/) combines retrieval-based and generative AI to enhance LLMs with relevant, up-to-date information from a knowledge base, like a vector database. However, RAG faces several challenges: +```python +from qdrant_client import QdrantClient, models -1. **Understanding Context:** Models may misinterpret queries, particularly when the context is complex or ambiguous, leading to incorrect or irrelevant answers. -2. **Balancing Similarity vs. Relevance:** RAG systems can struggle to ensure that retrieved information is similar and contextually relevant. -3. **Answer Completeness:** Traditional RAGs might not be able to capture all relevant details for complex queries that require LLMs to find relationships in the context that are not explicitly present. +client = QdrantClient(url="http://localhost:6333") -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#introduction-to-graphrag) Introduction to GraphRAG +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + hnsw_config=models.HnswConfigDiff( + payload_m=16, + m=0, + ), +) +``` -Unlike RAG, which typically relies on document retrieval, GraphRAG builds knowledge graphs (KGs) to capture entities and their relationships. For datasets or use cases that demand human-level intelligence from an AI system, GraphRAG offers a promising solution: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -- It can follow chains of relationships to answer complex queries, making it suitable for better reasoning beyond simple document retrieval. -- The graph structure allows a deeper understanding of the context, leading to more accurate and relevant responses. +var client = new QdrantClient("localhost", 6334); -The workflow of GraphRAG is as follows: +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + hnswConfig: new HnswConfigDiff { PayloadM = 16, M = 0 } +); +``` -1. 
The LLM analyzes the dataset to identify entities (people, places, organizations) and their relationships, creating a comprehensive knowledge graph where entities are nodes and their connections form edges. -2. A bottom-up clustering algorithm organizes the KG into hierarchical semantic groups. This creates meaningful segments of related information, enabling understanding at different levels of abstraction. -3. GraphRAG uses both the KG and semantic clusters to select a relevant context for the LLM when answering queries. +```go +import ( + "context" -![image2](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/image2.png) + "github.com/qdrant/go-client/qdrant" +) -[Fig](https://arxiv.org/pdf/2404.16130) 1: A Complete Picture of GraphRAG Ingestion and Retrieval +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#challenges-of-graphrag) Challenges of GraphRAG +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + HnswConfig: &qdrant.HnswConfigDiff{ + PayloadM: qdrant.PtrOf(uint64(16)), + M: qdrant.PtrOf(uint64(0)), + }, +}) +``` -Despite its advantages, the LLM-centric GraphRAG approach faces several challenges: +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine" + }, + "hnsw_config": { + "payload_m": 16, + "m": 0 + } +} +``` -- **KG Construction with LLMs:** Since the LLM is responsible for constructing the knowledge graph, there are risks such as inconsistencies, propagation of biases or errors, and lack of control over the ontology used. However, we used a LLM to extract the ontology in our implementation. -- **Querying KG with LLMs:** Once the graph is constructed, an LLM translates the human query into Cypher (Neo4j’s declarative query language). However, crafting complex queries in Cypher may result in inaccurate outcomes. -- **Scalability & Cost Consideration:** To be practical, applications must be both scalable and cost-effective. Relying on LLMs increases costs and decreases scalability, as they are used every time data is added, queried, or generated. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -To address these challenges, a more controlled and structured knowledge representation system may be required for GraphRAG to function optimally at scale. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#architecture-overview) Architecture Overview +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + hnsw_config: { + payload_m: 16, + m: 0, + }, +}); +``` -The architecture has two main components: **Ingestion** and **Retrieval & Generation**. Ingestion processes raw data into structured knowledge and vector representations, while Retrieval and Generation enable efficient querying and response generation. +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, HnswConfigDiffBuilder, VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -This process is divided into two steps: **Ingestion**, where data is prepared and stored, and **Retrieval and Generation**, where the prepared data is queried and utilized. Let’s start with Ingestion. 
+let client = Qdrant::from_url("http://localhost:6334").build()?; -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#ingestion) Ingestion +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .hnsw_config(HnswConfigDiffBuilder::default().payload_m(16).m(0)), + ) + .await?; +``` -The GraphRAG ingestion pipeline combines a **Graph Database** and a **Vector Database** to improve RAG workflows. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.HnswConfigDiff; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -![image1](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/image1.png) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -Fig 2: Overview of Ingestion Pipeline +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setHnswConfig(HnswConfigDiff.newBuilder().setPayloadM(16).setM(0).build()) + .build()) + .get(); +``` -Let’s break it down: +3. Create keyword payload index for `group_id` field. -1. **Raw Data:** Serves as the foundation, comprising unstructured or structured content. -2. **Ontology Creation:** An **LLM** processes the raw data into an **ontology**, structuring entities, relationships, and hierarchies. Better approaches exist to extracting more structured information from raw data, like using NER to identify the names of people, organizations, and places. Unlike LLMs, this method creates. -3. **Graph Database:** The ontology is stored in a **Graph database** to capture complex relationships. -4. **Vector Embeddings:** An **Embedding model** converts the raw data into high-dimensional vectors capturing semantic similarities. -5. **Vector Database:** These embeddings are stored in a **Vector database** for similarity-based retrieval. -6. **Database Interlinking:** The **Graph database** (e.g., Neo4j) and **Vector database** (e.g., Qdrant) share unique IDs, enabling cross-referencing between ontology-based and vector-based results. + -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#retrieval--generation) Retrieval & Generation -The **Retrieval and Generation** process is designed to handle user queries by leveraging both semantic search and graph-based context extraction. 
+```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="group_id", + field_schema=models.KeywordIndexParams( + type="keyword", + is_tenant=True, + ), +) +``` -![image3](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/image3.png) +```csharp +using Qdrant.Client; -Fig 3: Overview of Retrieval and Generation Pipeline +var client = new QdrantClient("localhost", 6334); -The architecture can be broken down into the following steps: +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "group_id", + schemaType: PayloadSchemaType.Keyword, + indexParams: new PayloadIndexParams + { + KeywordIndexParams = new KeywordIndexParams + { + IsTenant = true + } + } +); +``` -1. **Query Vectorization:** An embedding model converts The user query into a high-dimensional vector. -2. **Semantic Search:** The vector performs a similarity-based search in the **Vector database**, retrieving relevant documents or entries. -3. **ID Extraction:** Extracted IDs from the semantic search results are used to query the **Graph database**. -4. **Graph Context Retrieval:** The **Graph database** provides contextual information, including relationships and entities linked to the extracted IDs. -5. **Response Generation:** The context retrieved from the graph is passed to an LLM to generate a final response. -6. **Results:** The generated response is returned to the user. +```go +import ( + "context" -This architecture combines the strengths of both databases: + "github.com/qdrant/go-client/qdrant" +) -1. **Semantic Search with Vector Database:** The user query is first processed semantically to identify the most relevant data points without needing explicit keyword matches. -2. **Contextual Expansion with Graph Database:** IDs or entities retrieved from the vector database query the graph database for detailed relationships, enriching the retrieved data with structured context. -3. **Enhanced Generation:** The architecture combines semantic relevance (from the vector database) and graph-based context to enable the LLM to generate more informed, accurate, and contextually rich responses. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#implementation) Implementation +client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ + CollectionName: "{collection_name}", + FieldName: "group_id", + FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), + FieldIndexParams: qdrant.NewPayloadIndexParams( + &qdrant.KeywordIndexParams{ + IsTenant: qdrant.PtrOf(true), + }), +}) +``` -We’ll walk through a complete pipeline that ingests data into Neo4j and Qdrant, retrieves relevant data, and generates responses using an LLM based on the retrieved graph context. +```http +PUT /collections/{collection_name}/index +{ + "field_name": "group_id", + "field_schema": { + "type": "keyword", + "is_tenant": true + } +} +``` -The main components of this pipeline include data ingestion (to Neo4j and Qdrant), retrieval, and generation steps. 
+```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "group_id", + field_schema: { + type: "keyword", + is_tenant: true, + }, +}); +``` -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#prerequisites) Prerequisites +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + KeywordIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; -These are the tutorial prerequisites, which are divided into setup, imports, and initialization of the two DBs. +let client = Qdrant::from_url("http://localhost:6334").build()?; -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#setup) Setup +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "group_id", + FieldType::Keyword, + ).field_index_params( + KeywordIndexParamsBuilder::default() + .is_tenant(true) + ) + ).await?; +``` -Let’s start with setting up instances with Qdrant and Neo4j. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.KeywordIndexParams; -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#qdrant-setup) Qdrant Setup +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -To create a Qdrant instance, you can use their **managed service** (Qdrant Cloud) or set up a self-hosted cluster. For simplicity, we will use Qdrant cloud: +client + .createPayloadIndexAsync( + "{collection_name}", + "group_id", + PayloadSchemaType.Keyword, + PayloadIndexParams.newBuilder() + .setKeywordIndexParams( + KeywordIndexParams.newBuilder() + .setIsTenant(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -- Go to [Qdrant Cloud](https://qdrant.tech/) and sign up or log in. -- Once logged in, click on **Create New Cluster**. -- Follow the on-screen instructions to create your cluster. -- Once your cluster is created, you’ll be given a **Cluster URL** and **API Key**, which you will use in the client to interact with Qdrant. +`is_tenant=true` parameter is optional, but specifying it provides storage with additional information about the usage patterns the collection is going to use. +When specified, storage structure will be organized in a way to co-locate vectors of the same tenant together, which can significantly improve performance in some cases. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#neo4j-setup) Neo4j Setup -To set up a Neo4j instance, you can use **Neo4j Aura** (cloud service) or host it yourself. We will use Neo4j Aura: +## Limitations -- Go to Neo4j Aura and sign up/log in. -- After setting up, an instance will be created if it is the first time. -- After the database is set up, you’ll receive a **connection URI**, **username**, and **password**. +One downside to this approach is that global requests (without the `group_id` filter) will be slower since they will necessitate scanning all groups to identify the nearest neighbors. -We can add the following in the .env file for security purposes. 
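+For contrast, a global request is simply the same query issued without the tenant filter. A minimal sketch, reusing the Python client and the `{collection_name}` placeholder from the examples above:
+
+```python
+# No group_id condition: every group's vectors have to be scanned,
+# which is why global requests are slower with per-group HNSW indexing.
+client.query_points(
+    collection_name="{collection_name}",
+    query=[0.1, 0.1, 0.9],
+    limit=10,
+)
+```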
+<|page-144-lllmstxt|> +# Setting up a Qdrant Cloud Account -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#imports) Imports +## Registration -First, we import the required libraries for working with Neo4j, Qdrant, OpenAI, and other utility functions. +There are different ways to register for a Qdrant Cloud account: -```python -from neo4j import GraphDatabase -from qdrant_client import QdrantClient, models -from dotenv import load_dotenv -from pydantic import BaseModel -from openai import OpenAI -from collections import defaultdict -from neo4j_graphrag.retrievers import QdrantNeo4jRetriever -import uuid -import os +* With an email address and passwordless login via email +* With a Google account +* With a GitHub account +* By connection an enterprise SSO solution -``` +Every account is tied to an email address. You can invite additional users to your account and manage their permissions. -* * * +### Email Registration -- **Neo4j:** Used to store and query the graph database. -- **Qdrant:** A vector database used for semantic similarity search. -- **dotenv:** Loads environment variables for credentials and API keys. -- **Pydantic:** Ensures data is structured properly when interacting with the graph data. -- **OpenAI:** Interfaces with the OpenAI API to generate responses and embeddings. -- **neo4j\_graphrag:** A helper package to retrieve data from both Qdrant and Neo4j. +1. Register for a [Cloud account](https://cloud.qdrant.io/signup) with your email, Google or GitHub credentials. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#setting-up-environment-variables) Setting Up Environment Variables +## Inviting Additional Users to an Account -Before initializing the clients, we load the necessary credentials from environment variables. +You can invite additional users to your account, and manage their permissions on the **Account -> Access Management** page in the Qdrant Cloud Console. -```python -# Load environment variables -load_dotenv() +![Invitations](/documentation/cloud/invitations.png) -# Get credentials from environment variables -qdrant_key = os.getenv("QDRANT_KEY") -qdrant_url = os.getenv("QDRANT_URL") -neo4j_uri = os.getenv("NEO4J_URI") -neo4j_username = os.getenv("NEO4J_USERNAME") -neo4j_password = os.getenv("NEO4J_PASSWORD") -openai_key = os.getenv("OPENAI_API_KEY") +Invited users will receive an email with an invitation link to join Qdrant Cloud. Once they signed up, they can accept the invitation from the Overview page. -``` +![Accepting invitation](/documentation/cloud/accept-invitation.png) -* * * +## Switching Between Accounts -This ensures that sensitive information (like API keys and database credentials) is securely stored in environment variables. +If you have access to multiple accounts, you can switch between accounts with the account switcher on the top menu bar of the Qdrant Cloud Console. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#initializing-neo4j-and-qdrant-clients) Initializing Neo4j and Qdrant Clients +![Switching between accounts](/documentation/cloud/account-switcher.png) -Now, we initialize the Neo4j and Qdrant clients using the credentials. +## Creating Additional Accounts -```python -# Initialize Neo4j driver -neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) +You can create additional accounts from the account switcher in the top menu bar. Every account has its own set of clusters, permissions, and payment methods. 
-# Initialize Qdrant client -qdrant_client = QdrantClient( - url=qdrant_url, - api_key=qdrant_key -) +Besides the account owner, users are not shared across accounts, and must be specifically invited to an account to access it. -``` +Multiple accounts are useful if you want to manage clusters across different teams or environments, and also if you want to apply different payment methods to different resources. -* * * +![Create Account](/documentation/cloud/create-new-account.png) -- **Neo4j:** We set up a connection to the Neo4j graph database. -- **Qdrant:** We initialize the connection to the Qdrant vector store. +## Light & Dark Mode -This will connect with Neo4j and Qdrant, and we can now start with Ingestion. +The Qdrant Cloud Console supports light and dark mode. You can switch between the two modes in the *Settings* menu, by clicking on your account picture in the top right corner. -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#ingestion-1) Ingestion +![Light & Dark Mode](/documentation/cloud/light-dark-mode.png) -We will follow the workflow of the ingestion pipeline presented in the architecture section. Let’s examine it implementation-wise. +## Account Settings -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#defining-output-parser) Defining Output Parser +You can configure your account settings in the Qdrant Cloud Console on the **Account -> Settings** page. -The single and GraphComponents classes structure the LLM’s responses into a usable format. +The following functionality is available. -```python -class single(BaseModel): - node: str - target_node: str - relationship: str +### Renaming an Account -class GraphComponents(BaseModel): - graph: list[single] +If you use multiple accounts for different purposes, it is a good idea to give them descriptive names, for example *Development*, *Production*, *Testing*. You can also choose which account should be the default one, when you log in. -``` +![Account management](/documentation/cloud/account-management.png) -* * * +### Deleting an Account -These classes help ensure that data from the OpenAI LLM is parsed correctly into the graph components (nodes and relationships). +When you delete an account, all database clusters and associated data will be deleted. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#defining-openai-client-and-llm-parser-function) Defining OpenAI Client and LLM Parser Function +![Delete Account](/documentation/cloud/account-delete.png) -We now initialize the OpenAI client and define a function to send prompts to the LLM and parse its responses. -```python -client = OpenAI() +## Enterprise Single-Sign-On (SSO) -def openai_llm_parser(prompt): - completion = client.chat.completions.create( - model="gpt-4o-2024-08-06", - response_format={"type": "json_object"}, - messages=[\ - {\ - "role": "system",\ - "content":\ -\ - """ You are a precise graph relationship extractor. Extract all\ - relationships from the text and format them as a JSON object\ - with this exact structure:\ - {\ - "graph": [\ - {"node": "Person/Entity",\ - "target_node": "Related Entity",\ - "relationship": "Type of Relationship"},\ - ...more relationships...\ - ]\ - }\ - Include ALL relationships mentioned in the text, including\ - implicit ones. Be thorough and precise. """\ -\ - },\ - {\ - "role": "user",\ - "content": prompt\ - }\ - ] - ) +Qdrant Cloud supports Enterprise Single-Sign-On for Premium Tier customers. 
The following providers are supported: - return GraphComponents.model_validate_json(completion.choices[0].message.content) +* Active Directory/LDAP +* ADFS +* Azure Active Directory Native +* Google Workspace +* OpenID Connect +* Okta +* PingFederate +* SAML +* Azure Active Directory +Enterprise Sign-On is available as an add-on for [Premium Tier](/documentation/cloud/premium/) customers. If you are interested in using SSO, please [contact us](/contact-us/). -``` + -* * * +<|page-145-lllmstxt|> +# Cloud RBAC -This function sends a prompt to the LLM, asking it to extract graph components (nodes and relationships) from the provided text. The response is parsed into structured graph data. +## About Cloud RBAC -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#extracting-graph-components) Extracting Graph Components +Qdrant Cloud enables you to manage permissions for your cloud resources with greater precision within the Qdrant Cloud console. This feature ensures that only authorized users have access to sensitive data and capabilities, covering the following areas: -The function extract\_graph\_components processes raw data, extracting the nodes and relationships as graph components. +- Billing +- Identity and Access Management +- Clusters* +- Hybrid Cloud +- Account Configuration -```python -def extract_graph_components(raw_data): - prompt = f"Extract nodes and relationships from the following text:\n{raw_data}" +*Note: Current permissions control access to ALL clusters. Per Cluster permissions will be in a future release.* - parsed_response = openai_llm_parser(prompt) # Assuming this returns a list of dictionaries - parsed_response = parsed_response.graph # Assuming the 'graph' structure is a key in the parsed response +> 💡 You can access this in **Access Management > User & Role Management** *if enabled.* - nodes = {} - relationships = [] +## Guides - for entry in parsed_response: - node = entry.node - target_node = entry.target_node # Get target node if available - relationship = entry.relationship # Get relationship if available +- [Role Management](/documentation/cloud-rbac/role-management/) +- [User Management](/documentation/cloud-rbac/user-management/) - # Add nodes to the dictionary with a unique ID - if node not in nodes: - nodes[node] = str(uuid.uuid4()) +## Reference - if target_node and target_node not in nodes: - nodes[target_node] = str(uuid.uuid4()) +- [Permission List](/documentation/cloud-rbac/permission-reference/) - # Add relationship to the relationships list with node IDs - if target_node and relationship: - relationships.append({ - "source": nodes[node], - "target": nodes[target_node], - "type": relationship - }) +<|page-146-lllmstxt|> +# About Qdrant Managed Cloud - return nodes, relationships +Qdrant Managed Cloud is our SaaS (software-as-a-service) solution, providing managed Qdrant database clusters on the cloud. We provide you the same fast and reliable similarity search engine, but without the need to maintain your own infrastructure. -``` +Transitioning to the Managed Cloud version of Qdrant does not change how you interact with the service. All you need is a [Qdrant Cloud account](https://qdrant.to/cloud/) and an [API key](/documentation/cloud/authentication/) for each request. -* * * +You can also attach your own infrastructure as a Hybrid Cloud Environment. For details, see our [Hybrid Cloud](/documentation/hybrid-cloud/) documentation. 
-This function takes raw data, uses the LLM to parse it into graph components, and then assigns unique IDs to nodes and relationships. +## Cluster Configuration -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#ingesting-data-to-neo4j) Ingesting Data to Neo4j +Each database cluster comes with the following features: -The function ingest\_to\_neo4j ingests the extracted graph data (nodes and relationships) into Neo4j. +- Allows the creation of highly available clusters with automatic failover +- Easy version upgrades, zero-downtime on highly available clusters +- Monitoring, logging and alerting to observe the health of each cluster +- Horizontal and vertical up and down scaling +- Automatic shard rebalancing +- Support for resharding +- Backups and disaster recovery +- Available natively on AWS and GCP, and Azure. +- Available on your own infrastructure and other providers if you use the Hybrid Cloud -```python -def ingest_to_neo4j(nodes, relationships): - """ - Ingest nodes and relationships into Neo4j. - """ +<|page-147-lllmstxt|> +# Qdrant Hybrid Cloud - with neo4j_driver.session() as session: - # Create nodes in Neo4j - for name, node_id in nodes.items(): - session.run( - "CREATE (n:Entity {id: $id, name: $name})", - id=node_id, - name=name - ) +Seamlessly deploy and manage your vector database across diverse environments, ensuring performance, security, and cost efficiency for AI-driven applications. - # Create relationships in Neo4j - for relationship in relationships: - session.run( - "MATCH (a:Entity {id: $source_id}), (b:Entity {id: $target_id}) " - "CREATE (a)-[:RELATIONSHIP {type: $type}]->(b)", - source_id=relationship["source"], - target_id=relationship["target"], - type=relationship["type"] - ) +[Qdrant Hybrid Cloud](/hybrid-cloud/) integrates Kubernetes clusters from any setting - cloud, on-premises, or edge - into a unified, enterprise-grade managed service. - return nodes +You can use [Qdrant Cloud's UI](/documentation/cloud/create-cluster/) to create and manage your database clusters, while they still remain within your infrastructure. **All Qdrant databases will operate solely within your network, using your storage and compute resources. All user data will stay securely within your environment and won't be accessible by the Qdrant Cloud platform, or anyone else outside your organization.** -``` +Qdrant Hybrid Cloud ensures data privacy, deployment flexibility, low latency, and delivers cost savings, elevating standards for vector search and AI applications. -* * * +**How it works:** Qdrant Hybrid Cloud relies on Kubernetes and works with any standard compliant Kubernetes distribution. When you onboard a Kubernetes cluster as a Hybrid Cloud Environment, you can deploy the Qdrant Kubernetes Operator and Cloud Agent into this cluster. These will manage Qdrant databases within your Kubernetes cluster and establish an outgoing connection to Qdrant Cloud to transport telemetry and receive management instructions. You can then benefit from the same cloud management features and transport telemetry that is available with any managed Qdrant Cloud cluster. -Here, we create nodes and relationships in the Neo4j graph database. Nodes are entities, and relationships link these entities. + -This will ingest the data into Neo4j and on a sample dataset it looks something like this: +**Setup instructions:** To begin using Qdrant Hybrid Cloud, [read our installation guide](/documentation/hybrid-cloud/hybrid-cloud-setup/). 
-![image4](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/image4.png) +## Hybrid Cloud architecture -Fig 4: Visualization of the Knowledge Graph +The Hybrid Cloud onboarding will install a Kubernetes Operator and Cloud Agent into your Kubernetes cluster. -Let’s explore how to map nodes with their IDs and integrate this information, along with vectors, into Qdrant. First, let’s create a Qdrant collection. +The Cloud Agent will establish an outgoing connection to `cloud.qdrant.io` on port `443` to transport telemetry and receive management instructions. It will also interact with the Kubernetes API through a ServiceAccount to create, read, update and delete the necessary Qdrant CRs (Custom Resources) based on the configuration setup in the Qdrant Cloud Console. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#creating-qdrant-collection) Creating Qdrant Collection +The Qdrant Kubernetes Operator will manage the Qdrant databases within your Kubernetes cluster. Based on the Qdrant CRs, it will interact with the Kubernetes API through a ServiceAccount to create and manage the necessary resources to deploy and run Qdrant databases, such as Pods, Services, ConfigMaps, and Secrets. -You can create a collection once you have set up your Qdrant instance. A collection in Qdrant holds vectors for search and retrieval. +Both component's access is limited to the Kubernetes namespace that you chose during the onboarding process. -```python -def create_collection(client, collection_name, vector_dimension): +The Cloud Agent only sends telemetry data and status information to the Qdrant Cloud platform. It does not send any user data or sensitive information. The telemetry data includes: -``` +* The health status and resource (CPU, memory, disk and network) usage of the Qdrant databases and Qdrant control plane components. +* Information about the Qdrant databases, such as the number, name and configuration of collections, the number of vectors, the number of queries, and the number of indexing operations. +* Telemetry and notification data from the Qdrant databases. +* Kubernetes operations and scheduling events reported for the Qdrant databases and Qdrant control plane components. -try: +After the initial onboarding, the lifecycle of these components will be controlled by the Qdrant Cloud platform via the built-in Helm controller. -```python -# Try to fetch the collection status -try: - collection_info = client.get_collection(collection_name) - print(f"Skipping creating collection; '{collection_name}' already exists.") -except Exception as e: - # If collection does not exist, an error will be thrown, so we create the collection - if 'Not found: Collection' in str(e): - print(f"Collection '{collection_name}' not found. Creating it now...") +You don't need to expose your Kubernetes Cluster to the Qdrant Cloud platform, you don't need to open any ports for incoming traffic, and you don't need to provide any Kubernetes or cloud provider credentials to the Qdrant Cloud platform. 
- client.create_collection( - collection_name=collection_name, - vectors_config=models.VectorParams(size=vector_dimension, distance=models.Distance.COSINE) - ) +![hybrid-cloud-architecture](/blog/hybrid-cloud/hybrid-cloud-architecture.png) - print(f"Collection '{collection_name}' created successfully.") - else: - print(f"Error while checking collection: {e}") +<|page-148-lllmstxt|> +# Beginner Tutorials -``` +| | +|----------------------------------------------------| +| [Build Your First Semantic Search Engine in 5 Minutes](/documentation/beginner-tutorials/search-beginners/) | +| [Build a Neural Search Service with Sentence Transformers and Qdrant](/documentation/beginner-tutorials/neural-search/) | +| [Build a Hybrid Search Service with FastEmbed and Qdrant](/documentation/beginner-tutorials/hybrid-search-fastembed/) | +| [Measure and Improve Retrieval Quality in Semantic Search](/documentation/beginner-tutorials/retrieval-quality/) | -* * * +<|page-149-lllmstxt|> +# Advanced Tutorials -- **Qdrant Client:** The QdrantClient is used to connect to the Qdrant instance. -- **Creating Collection:** The create\_collection function checks if a collection exists. If not, it creates one with a specified vector dimension and distance metric (cosine similarity in this case). +| | +|----------------------------------------------------------| +| [Use Collaborative Filtering to Build a Movie Recommendation System with Qdrant](/documentation/advanced-tutorials/collaborative-filtering/) | +| [Build a Text/Image Multimodal Search System with Qdrant and FastEmbed](/documentation/advanced-tutorials/multimodal-search-fastembed/) | +| [Navigate Your Codebase with Semantic Search and Qdrant](/documentation/advanced-tutorials/code-search/) | +| [Ensure optimal large-scale PDF Retrieval with Qdrant and ColPali/ColQwen](/documentation/advanced-tutorials/pdf-retrieval-at-scale/) | -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#generating-embeddings) Generating Embeddings +<|page-150-lllmstxt|> +# Qdrant Private Cloud -Next, we define a function that generates embeddings for text using OpenAI’s API. +Qdrant Private Cloud allows you to manage Qdrant database clusters in any Kubernetes cluster on any infrastructure. It uses the same Qdrant Operator that powers Qdrant Managed Cloud and Qdrant Hybrid Cloud, but without any connection to the Qdrant Cloud Management Console. -```python -def openai_embeddings(text): - response = client.embeddings.create( - input=text, - model="text-embedding-3-small" - ) +On top of the open source Qdrant database, it allows - return response.data[0].embedding +* Easy deployment and management of Qdrant database clusters in your own Kubernetes infrastructure +* Zero-downtime upgrades of the Qdrant database with replication +* Vertical and horizontal up and downscaling of the Qdrant database with auto rebalancing and shard splitting +* Full control over scheduling, including Multi-AZ deployments +* Backup & Disaster Recovery +* Extended telemetry +* Qdrant Enterprise Support Services -``` +If you are interested in using Qdrant Private Cloud, please [contact us](/contact-us/) for more information. -* * * +<|page-151-lllmstxt|> +# Qdrant Cloud Billing & Payments -This function uses OpenAI’s embedding model to transform input text into vector representations. +Qdrant database clusters in Qdrant Cloud are priced based on CPU, memory, and disk storage usage. 
To get a clearer idea for the pricing structure, based on the amounts of vectors you want to store, please use our [Pricing Calculator](https://cloud.qdrant.io/calculator). -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#ingesting-into-qdrant) Ingesting into Qdrant +## Billing -Let’s ingest the data into the vector database. +You can pay for your Qdrant Cloud database clusters either with a credit card or through an AWS, GCP, or Azure Marketplace subscription. -```python -def ingest_to_qdrant(collection_name, raw_data, node_id_mapping): - embeddings = [openai_embeddings(paragraph) for paragraph in raw_data.split("\n")] +Your payment method is charged at the beginning of each month for the previous month's usage. There is no difference in pricing between the different payment methods. - qdrant_client.upsert( - collection_name=collection_name, - points=[\ - {\ - "id": str(uuid.uuid4()),\ - "vector": embedding,\ - "payload": {"id": node_id}\ - }\ - for node_id, embedding in zip(node_id_mapping.values(), embeddings)\ - ] - ) +If you choose to pay through a marketplace, the Qdrant Cloud usage costs are added as usage units to your existing billing for your cloud provider services. A detailed breakdown of your usage is available in the Qdrant Cloud Console. -``` +Note: Even if you pay using a marketplace subscription, your database clusters will still be deployed into Qdrant-owned infrastructure. The setup and management of Qdrant database clusters will also still be done via the Qdrant Cloud Console UI. -* * * +If you wish to deploy Qdrant database clusters into your own environment from Qdrant Cloud then we recommend our [Hybrid Cloud](/documentation/hybrid-cloud/) solution. -The ingest\_to\_qdrant function generates embeddings for each paragraph in the raw data and stores them in a Qdrant collection. It associates each embedding with a unique ID and its corresponding node ID from the node\_id\_mapping dictionary, ensuring proper linkage for later retrieval. +![Payment Options](/documentation/cloud/payment-options.png) -* * * +### Credit Card -## [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#retrieval--generation-1) Retrieval & Generation +Credit card payments are processed through Stripe. To set up a credit card, go to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/), select **Stripe** as the payment method, and enter your credit card details. -In this section, we will create the retrieval and generation engine for the system. +### AWS Marketplace -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#building-a-retriever) Building a Retriever +Our [AWS Marketplace](https://aws.amazon.com/marketplace/pp/prodview-rtphb42tydtzg) listing streamlines access to Qdrant for users who rely on Amazon Web Services for hosting and application development. -The retriever integrates vector search and graph data, enabling semantic similarity searches with Qdrant and fetching relevant graph data from Neo4j. This enriches the RAG process and allows for more informed responses. +To subscribe: -```python -def retriever_search(neo4j_driver, qdrant_client, collection_name, query): - retriever = QdrantNeo4jRetriever( - driver=neo4j_driver, - client=qdrant_client, - collection_name=collection_name, - id_property_external="id", - id_property_neo4j="id", - ) +1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/) +2. 
Select **AWS Marketplace** as the payment method. You will be redirected to the AWS Marketplace listing for Qdrant. +3. Click the bright orange button - **View purchase options**. +4. On the next screen, under Purchase, click **Subscribe**. +5. Up top, on the green banner, click **Set up your account**. - results = retriever.search(query_vector=openai_embeddings(query), top_k=5) +You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters. - return results +### GCP Marketplace -``` +Our [GCP Marketplace](https://console.cloud.google.com/marketplace/product/qdrant-public/qdrant) listing streamlines access to Qdrant for users who rely on the Google Cloud Platform for hosting and application development. -* * * +To subscribe: -The [QdrantNeo4jRetriever](https://qdrant.tech/documentation/frameworks/neo4j-graphrag/) handles both vector search and graph data fetching, combining Qdrant for vector-based retrieval and Neo4j for graph-based queries. +1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/) +2. Select **GCP Marketplace** as the payment method. You will be redirected to the GCP Marketplace listing for Qdrant. +3. Select **Subscribe**. (If you have already subscribed, select **Manage on Provider**.) +4. On the next screen, choose options as required, and select **Subscribe**. +5. On the pop-up window that appers, select **Sign up with Qdrant**. -**Vector Search:** +You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters. -- **`qdrant_client`** connects to Qdrant for efficient vector similarity search. -- **`collection_name`** specifies where vectors are stored. -- **`id_property_external="id"`** maps the external entity’s ID for retrieval. +### Azure Marketplace -**Graph Fetching:** +Our [Azure Marketplace](https://portal.azure.com/#view/Microsoft_Azure_Marketplace/GalleryItemDetailsBladeNopdl/id/qdrantsolutionsgmbh1698769709989.qdrant-db/selectionMode~/false/resourceGroupId//resourceGroupLocation//dontDiscardJourney~/false/selectedMenuId/home/launchingContext~/%7B%22galleryItemId%22%3A%22qdrantsolutionsgmbh1698769709989.qdrant-dbqdrant_cloud_unit%22%2C%22source%22%3A%5B%22GalleryFeaturedMenuItemPart%22%2C%22VirtualizedTileDetails%22%5D%2C%22menuItemId%22%3A%22home%22%2C%22subMenuItemId%22%3A%22Search%20results%22%2C%22telemetryId%22%3A%221df5537b-8b29-4200-80ce-0cd38c7e0e56%22%7D/searchTelemetryId/6b44fb90-7b9c-4286-aad8-59f88f3cc2ff) listing streamlines access to Qdrant for users who rely on Microsoft Azure for hosting and application development. -- **`neo4j_driver`** connects to Neo4j for querying graph data. -- **`id_property_neo4j="id"`** ensures the entity IDs from Qdrant match the graph nodes in Neo4j. +To subscribe: -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#querying-neo4j-for-related-graph-data) Querying Neo4j for Related Graph Data +1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/) +2. Select **Azure Marketplace** as the payment method. You will be redirected to the Azure Marketplace listing for Qdrant. +3. Select **Subscribe**. +4. On the next screen, choose options as required, and select **Review + Subscribe**. +5. After reviewing all settings, select **Subscribe**. +6. Once the SaaS subscription is created, select **Configure account now**. 
+ +You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters. -We need to fetch subgraph data from a Neo4j database based on specific entity IDs after the retriever has provided the relevant IDs. +<|page-152-lllmstxt|> +# Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy -```python -def fetch_related_graph(neo4j_client, entity_ids): - query = """ - MATCH (e:Entity)-[r1]-(n1)-[r2]-(n2) - WHERE e.id IN $entity_ids - RETURN e, r1 as r, n1 as related, r2, n2 - UNION - MATCH (e:Entity)-[r]-(related) - WHERE e.id IN $entity_ids - RETURN e, r, related, null as r2, null as n2 - """ - with neo4j_client.session() as session: - result = session.run(query, entity_ids=entity_ids) - subgraph = [] - for record in result: - subgraph.append({ - "entity": record["e"], - "relationship": record["r"], - "related_node": record["related"] - }) - if record["r2"] and record["n2"]: - subgraph.append({ - "entity": record["related"], - "relationship": record["r2"], - "related_node": record["n2"] - }) - return subgraph +Accessing medical information from LLMs can lead to hallucinations or outdated information. Relying on this type of information can result in serious medical consequences. Building a trustworthy and context-aware medical chatbot can solve this. -``` +In this article, we will look at how to tackle these challenges using: -* * * +* **Retrieval-Augmented Generation (RAG)**: Instead of answering the questions from scratch, the bot retrieves the information from medical literature before answering questions. +* **Filtering**: Users can filter the results by specialty and publication year, ensuring the information is accurate and up-to-date. -The function fetch\_related\_graph takes in a Neo4j client and a list of entity\_ids. It runs a Cypher query to find related nodes (entities) and their relationships based on the given entity IDs. The query matches entities (e:Entity) and finds related nodes through any relationship \[r\]. The function returns a list of subgraph data, where each record contains the entity, relationship, and related\_node. +Let’s discover the technologies needed to build the medical bot. -This subgraph is essential for generating context to answer user queries. +## Tech Stack Overview -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#setting-up-the-graph-context) Setting up the Graph Context +To build a robust and trustworthy medical chatbot, we will combine the following technologies: -The second part of the implementation involves preparing a graph context. We’ll fetch relevant subgraph data from a Neo4j database and format it for the model. Let’s break it down. +* [**Qdrant Cloud**](https://qdrant.tech/cloud/): Qdrant is a high-performance vector search engine for storing and retrieving large collections of embeddings. In this project, we will use it to enable fast and accurate search across millions of medical documents, supporting dense and multi-vector (ColBERT) retrieval for context-aware answers. +* [**Stanford DSPy**](https://qdrant.tech/documentation/frameworks/dspy/)**:** DSPy is the AI framework we will use to obtain the final answer. It allows the medical bot to retrieve the relevant information and reason step-by-step to produce accurate and explainable answers. 
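+The indexing code in the next section assumes that the MIRIAD dataset has already been loaded and that a Qdrant client has been created. A minimal sketch of that setup (the split slice and the credential placeholders are illustrative, not the notebook's exact values):
+
+```python
+from datasets import load_dataset
+from qdrant_client import QdrantClient, models
+
+# Load a small slice of MIRIAD for experimentation; the split name and slice
+# are illustrative, and the full dataset contains roughly 5.8M passages.
+ds = load_dataset("miriad/miriad-5.8M", split="train[:1000]")
+
+# Connect to a Qdrant Cloud cluster with the URL and API key from your account.
+client = QdrantClient(url="https://YOUR-CLUSTER-URL", api_key="YOUR-API-KEY")
+```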
+ +![medicalbot flow chart](/articles_data/Qdrant-DSPy-medicalbot/medicalbot.png) -```python -def format_graph_context(subgraph): - nodes = set() - edges = [] +## Dataset Preparation and Indexing - for entry in subgraph: - entity = entry["entity"] - related = entry["related_node"] - relationship = entry["relationship"] +A medical chatbot is only as good as the knowledge it has access to. For this project, we will leverage the [MIRIAD medical dataset](https://huggingface.co/datasets/miriad/miriad-5.8M), a large-scale collection of medical passages enriched with metadata such as publication year and specialty. - nodes.add(entity["name"]) - nodes.add(related["name"]) +### Indexing with Dense and ColBERT Multivectors - edges.append(f"{entity['name']} {relationship['type']} {related['name']}") +To enable high-quality retrieval, we will embed each medical passage with two models: - return {"nodes": list(nodes), "edges": edges} +* **Dense Embeddings**: These are generated using the `BAAI/bge-small-en` model and capture the passages' general semantic meaning. +* **ColBERT Multivectors**: These provide more fine-grained representations, enabling precise ranking of results. -``` +```python +dense_documents = [ + models.Document(text=doc, model="BAAI/bge-small-en") for doc in ds["passage_text"] +] -* * * +colbert_documents = [ + models.Document(text=doc, model="colbert-ir/colbertv2.0") + for doc in ds["passage_text"] +] -The function format\_graph\_context processes a subgraph returned by a Neo4j query. It extracts the graph’s entities (nodes) and relationships (edges). The nodes set ensures each entity is added only once. The edges list captures the relationships in a readable format: _Entity1 relationship Entity2_. +collection_name = "miriad" -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#integrating-with-the-llm) Integrating with the LLM +# Create collection +if not client.collection_exists(collection_name): + client.create_collection( + collection_name=collection_name, + vectors_config={ + "dense": models.VectorParams(size=384, distance=models.Distance.COSINE), + "colbert": models.VectorParams( + size=128, + distance=models.Distance.COSINE, + multivector_config=models.MultiVectorConfig( + comparator=models.MultiVectorComparator.MAX_SIM + ), + hnsw_config=models.HnswConfigDiff(m=0), # reranker: no indexing + ), + }, + ) -Now that we have the graph context, we need to generate a prompt for a language model like GPT-4. This is where the core of the Retrieval-Augmented Generation (RAG) happens — we combine the graph data and the user query into a comprehensive prompt for the model. +``` +We disable indexing for the ColBERT multivector since it will only be used for reranking. To learn more about this, check out the [How to Effectively Use Multivector Representations in Qdrant for Reranking](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/) article. -```python -def graphRAG_run(graph_context, user_query): - nodes_str = ", ".join(graph_context["nodes"]) - edges_str = "; ".join(graph_context["edges"]) - prompt = f""" - You are an intelligent assistant with access to the following knowledge graph: +### Batch Uploading to Qdrant - Nodes: {nodes_str} +To avoid hitting API limits, we upload the data in batches, each batch containing: - Edges: {edges_str} +* The passage text +* ColBERT and dense embeddings. +* `year` and `specialty` metadata fields. 
- Using this graph, Answer the following question: +```python +BATCH_SIZE = 3 +points_batch = [] - User Query: "{user_query}" - """ +for i in range(len(ds["passage_text"])): + point = models.PointStruct( + id=i, + vector={"dense": dense_documents[i], "colbert": colbert_documents[i]}, + payload={ + "passage_text": ds["passage_text"][i], + "year": ds["year"][i], + "specialty": ds["specialty"][i], + }, + ) + points_batch.append(point) - try: - response = client.chat.completions.create( - model="gpt-4", - messages=[\ - {"role": "system", "content": "Provide the answer for the following question:"},\ - {"role": "user", "content": prompt}\ - ] - ) - return response.choices[0].message + if len(points_batch) == BATCH_SIZE: + client.upsert(collection_name=collection_name, points=points_batch) + print(f"Uploaded batch ending at index {i}") + points_batch = [] - except Exception as e: - return f"Error querying LLM: {str(e)}" +# Final flush +if points_batch: + client.upsert(collection_name=collection_name, points=points_batch) + print("Uploaded final batch.") ``` -* * * +## Retrieval-Augmented Generation (RAG) Pipeline -The function graphRAG\_run takes the graph context (nodes and edges) and the user query, combining them into a structured prompt for the LLM. The nodes and edges are formatted as readable strings to form part of the LLM input. The LLM is then queried with the generated prompt, asking it to refine the user query using the graph context and provide an answer. If the model successfully generates a response, it returns the answer. +Our chatbot will use a Retrieval-Augmented Generation (RAG) pipeline to ensure its answers are grounded in medical literature. -### [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#end-to-end-pipeline) End-to-End Pipeline +### Integration of DSPy and Qdrant -Finally, let’s integrate everything into an end-to-end pipeline where we ingest some sample data, run the retrieval process, and query the language model. +At the heart of the application is the Qdrant vector database that provides the information sent to DSPy to generate the final answer. This is what happens when a user submits a query: -```python -if __name__ == "__main__": - print("Script started") - print("Loading environment variables...") - load_dotenv('.env.local') - print("Environment variables loaded") +* DSPy searches against the Qdrant vector database to retrieve the top documents and answers the query. The results are also filtered with a particular year range for a specific specialty. +* The retrieved passages are then reranked using ColBERT multivector embeddings, leading to the most relevant and contextually appropriate answers. +* DSPy uses these passages to guide the language model through a chain-of-thought reasoning to generate the most accurate answer. 
- print("Initializing clients...") - neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) - qdrant_client = QdrantClient( - url=qdrant_url, - api_key=qdrant_key - ) - print("Clients initialized") +```python +def rerank_with_colbert(query_text, min_year, max_year, specialty): + from fastembed import TextEmbedding, LateInteractionTextEmbedding - print("Creating collection...") - collection_name = "graphRAGstoreds" - vector_dimension = 1536 - create_collection(qdrant_client, collection_name, vector_dimension) - print("Collection created/verified") + # Encode query once with both models + dense_model = TextEmbedding("BAAI/bge-small-en") + colbert_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0") - print("Extracting graph components...") + dense_query = list(dense_model.embed(query_text))[0] + colbert_query = list(colbert_model.embed(query_text))[0] - raw_data = """Alice is a data scientist at TechCorp's Seattle office. - Bob and Carol collaborate on the Alpha project. - Carol transferred to the New York office last year. - Dave mentors both Alice and Bob. - TechCorp's headquarters is in Seattle. - Carol leads the East Coast team. - Dave started his career in Seattle. - The Alpha project is managed from New York. - Alice previously worked with Carol at DataCo. - Bob joined the team after Dave's recommendation. - Eve runs the West Coast operations from Seattle. - Frank works with Carol on client relations. - The New York office expanded under Carol's leadership. - Dave's team spans multiple locations. - Alice visits Seattle monthly for team meetings. - Bob's expertise is crucial for the Alpha project. - Carol implemented new processes in New York. - Eve and Dave collaborated on previous projects. - Frank reports to the New York office. - TechCorp's main AI research is in Seattle. - The Alpha project revolutionized East Coast operations. - Dave oversees projects in both offices. - Bob's contributions are mainly remote. - Carol's team grew significantly after moving to New York. - Seattle remains the technology hub for TechCorp.""" + # Combined query: retrieve with dense, + # rerank with ColBERT + results = client.query_points( + collection_name=collection_name, + prefetch=models.Prefetch(query=dense_query, using="dense"), + query=colbert_query, + using="colbert", + limit=5, + with_payload=True, + query_filter=Filter( + must=[ + FieldCondition(key="specialty", match=MatchValue(value=specialty)), + FieldCondition( + key="year", + range=models.Range(gt=None, gte=min_year, lt=None, lte=max_year), + ), + ] + ), + ) - nodes, relationships = extract_graph_components(raw_data) - print("Nodes:", nodes) - print("Relationships:", relationships) + points = results.points + docs = [] - print("Ingesting to Neo4j...") - node_id_mapping = ingest_to_neo4j(nodes, relationships) - print("Neo4j ingestion complete") + for point in points: + docs.append(point.payload["passage_text"]) - print("Ingesting to Qdrant...") - ingest_to_qdrant(collection_name, raw_data, node_id_mapping) - print("Qdrant ingestion complete") + return docs + +``` - query = "How is Bob connected to New York?" - print("Starting retriever search...") - retriever_result = retriever_search(neo4j_driver, qdrant_client, collection_name, query) - print("Retriever results:", retriever_result) +The pipeline ensures that each response is grounded in real and recent medical literature and is aligned with the user's needs. 
- print("Extracting entity IDs...") - entity_ids = [item.content.split("'id': '")[1].split("'")[0] for item in retriever_result.items] - print("Entity IDs:", entity_ids) +## Guardrails and Medical Question Detection - print("Fetching related graph...") - subgraph = fetch_related_graph(neo4j_driver, entity_ids) - print("Subgraph:", subgraph) +Since this is a medical chatbot, we can introduce a simple guardrail to ensure it doesn’t respond to unrelated questions like the weather. This can be implemented using a DSPy module. - print("Formatting graph context...") - graph_context = format_graph_context(subgraph) - print("Graph context:", graph_context) +The chatbot checks if every question is medical-related before attempting to answer it. This is achieved by a DSPy module that classifies each incoming query as medical or not. If the question is not medical-related, the chatbot declines to answer, reducing the risk of misinformation or inappropriate responses. - print("Running GraphRAG...") - answer = graphRAG_run(graph_context, query) - print("Final Answer:", answer) +```python +class MedicalGuardrail(dspy.Module): + def forward(self, question): + prompt = ( + """ + Is the following question a medical question? + Answer with 'Yes' or 'No'.n" + f"Question: {question}n" + "Answer: + """ + ) + response = dspy.settings.lm(prompt) + answer = response[0].strip().lower() + return answer.startswith("yes") -``` -* * * +if not self.guardrail.forward(question): -Here’s what’s happening: + class DummyResult: + final_answer = """ + Sorry, I can only answer medical questions. + Please ask a question related to medicine or healthcare + """ -- First, the user query is defined (“How is Bob connected to New York?”). -- The QdrantNeo4jRetriever searches for related entities in the Qdrant vector database based on the user query’s embedding. It retrieves the top 5 results (top\_k=5). -- The entity\_ids are extracted from the retriever result. -- The fetch\_related\_graph function retrieves related entities and their relationships from the Neo4j database. -- The format\_graph\_context function prepares the graph data in a format the LLM can understand. -- Finally, the graphRAG\_run function is called to generate and query the language model, producing an answer based on the retrieved graph context. + return DummyResult() -With this, we have successfully created GraphRAG, a system capable of capturing complex relationships and delivering improved performance compared to the baseline RAG approach. +``` +By combining this guardrail with specialty and year filtering, we ensure that the chatbot: -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#advantages-of-qdrant--neo4j-graphrag) Advantages of Qdrant + Neo4j GraphRAG +* Only answers medical questions. +* Answers questions from recent medical literature. +* Doesn’t make up answers by grounding its answers in the provided literature. -Combining Qdrant with Neo4j in a GraphRAG architecture offers several compelling advantages, particularly regarding recall and precision combo, contextual understanding, adaptability to complex queries, and better cost and scalability. +![medicalbot demo](/articles_data/Qdrant-DSPy-medicalbot/medicaldemo.png) -1. **Improved Recall and Precision:** By leveraging Qdrant, a highly efficient vector search engine, alongside Neo4j’s robust graph database, the system benefits from both semantic search and relationship-based retrieval. 
Qdrant identifies relevant vectors and captures the similarity between queries and stored data. At the same time, Neo4j adds a layer of connectivity through its graph structure, ensuring that relevant and contextually linked information is retrieved. This combination improves recall (retrieving a broader set of relevant results) and precision (delivering more accurate and contextually relevant results), addressing a common challenge in traditional retrieval-based AI systems. -2. **Enhanced Contextual Understanding:** Neo4j enhances contextual understanding by representing information as a graph, where entities and their relationships are naturally modeled. When integrated with Qdrant, the system can retrieve similar items based on vector embeddings and those that fit within the desired relational context, leading to more nuanced and meaningful responses. -3. **Adaptability to Complex Queries:** Combining Qdrant and Neo4j makes the system highly adaptable to complex queries. While Qdrant handles the vector search for relevant data, Neo4j’s graph capabilities enable sophisticated querying through relationships. This allows for multi-hop reasoning and handling complex, structured queries that would be challenging for traditional search engines. -4. **Better Cost & Scalability:** GraphRAG, on its own, demands significant resources, as it relies on LLMs to construct and query knowledge graphs. It also employs clustering algorithms to create semantic clusters for local searches. These can hinder scalability and increase costs. Qdrant addresses the issue of local search through vector search, while Neo4j’s knowledge graph is queried for more precise answers, enhancing both efficiency and accuracy. Furthermore, instead of using an LLM, Named Entity Recognition (NER)-based techniques can reduce the cost further, but it depends mainly on the dataset. +## Conclusion -# [Anchor](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/\#conclusion) Conclusion +By leveraging Qdrant and DSPy, you can build a medical chatbot that generates accurate and up-to-date medical responses. Qdrant provides the technology and enables fast and scalable retrieval, while DSPy synthesizes this information to provide correct answers grounded in the medical literature. As a result, you can achieve a medical system that is truthful, safe, and provides relevant responses. Check out the entire project from this [notebook](https://github.com/qdrant/examples/blob/master/DSPy-medical-bot/medical_bot_DSPy_Qdrant.ipynb). You’ll need a free [Qdrant Cloud](https://qdrant.tech/cloud/) account to run the notebook. -GraphRAG with Neo4j and Qdrant marks an important step forward in retrieval-augmented generation. This hybrid approach delivers significant advantages by combining vector search and graph databases. Qdrant’s semantic search capabilities enhance recall accuracy, while Neo4j’s relationship modeling provides deeper context understanding. +<|page-153-lllmstxt|> +## Data Management Integrations + +| Integration | Description | +| ------------------------------- | -------------------------------------------------------------------------------------------------- | +| [Airbyte](/documentation/data-management/airbyte/) | Data integration platform specialising in ELT pipelines. | +| [Airflow](/documentation/data-management/airflow/) | Platform designed for developing, scheduling, and monitoring batch-oriented workflows. 
| +| [CocoIndex](/documentation/data-management/cocoindex/) | High performance ETL framework to transform data for AI, with real-time incremental processing | +| [Cognee](/documentation/data-management/cognee/) | AI memory frameworks that allows loading from 30+ data sources to graph and vector stores | +| [Connect](/documentation/data-management/redpanda/) | Declarative data-agnostic streaming service for efficient, stateless processing. | +| [Confluent](/documentation/data-management/confluent/) | Fully-managed data streaming platform with a cloud-native Apache Kafka engine. | +| [DLT](/documentation/data-management/dlt/) | Python library to simplify data loading processes between several sources and destinations. | +| [Fluvio](/documentation/data-management/fluvio/) | Rust-based platform for high speed, real-time data processing. | +| [Spark](/documentation/data-management/spark/) | A unified analytics engine for large-scale data processing. | +| [Unstructured](/documentation/data-management/unstructured/) | Python library with components for ingesting and pre-processing data from numerous sources. | -The implementation template we’ve explored offers a foundation for your projects. You can adapt and customize it based on your specific needs, whether for document analysis, knowledge management, or other information retrieval tasks. +<|page-154-lllmstxt|> +# Multitenancy with LlamaIndex -As AI systems evolve, this combination of technologies shows how we can build smarter, more efficient solutions. We encourage you to experiment with this approach and discover how it can enhance your applications. +If you are building a service that serves vectors for many independent users, and you want to isolate their +data, the best practice is to use a single collection with payload-based partitioning. This approach is +called **multitenancy**. Our guide on the [Separate Partitions](/documentation/guides/multiple-partitions/) describes +how to set it up in general, but if you use [LlamaIndex](/documentation/integrations/llama-index/) as a +backend, you may prefer reading a more specific instruction. So here it is! -##### Was this page useful? +## Prerequisites -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +This tutorial assumes that you have already installed Qdrant and LlamaIndex. If you haven't, please run the +following commands: -Thank you for your feedback! 🙏 +```bash +pip install llama-index llama-index-vector-stores-qdrant +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/graphrag-qdrant-neo4j.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +We are going to use a local Docker-based instance of Qdrant. If you want to use a remote instance, please +adjust the code accordingly. Here is how we can start a local instance: -On this page: +```bash +docker run -d --name qdrant -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/graphrag-qdrant-neo4j.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Setting up LlamaIndex pipeline -× +We are going to implement an end-to-end example of multitenant application using LlamaIndex. 
We'll be +indexing the documentation of different Python libraries, and we definitely don't want any users to see the +results coming from a library they are not interested in. In real case scenarios, this is even more dangerous, +as the documents may contain sensitive information. -[Powered by](https://qdrant.tech/) +### Creating vector store -<|page-97-lllmstxt|> -## payload -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Payload +[QdrantVectorStore](https://docs.llamaindex.ai/en/stable/examples/vector_stores/QdrantIndexDemo.html) is a +wrapper around Qdrant that provides all the necessary methods to work with your vector database in LlamaIndex. +Let's create a vector store for our collection. It requires setting a collection name and passing an instance +of `QdrantClient`. -# [Anchor](https://qdrant.tech/documentation/concepts/payload/\#payload) Payload +```python +from qdrant_client import QdrantClient +from llama_index.vector_stores.qdrant import QdrantVectorStore -One of the significant features of Qdrant is the ability to store additional information along with vectors. -This information is called `payload` in Qdrant terminology. -Qdrant allows you to store any information that can be represented using JSON. +client = QdrantClient("http://localhost:6333") -Here is an example of a typical payload: +vector_store = QdrantVectorStore( + collection_name="my_collection", + client=client, +) +``` -```json -{ - "name": "jacket", - "colors": ["red", "blue"], - "count": 10, - "price": 11.99, - "locations": [\ - {\ - "lon": 52.5200,\ - "lat": 13.4050\ - }\ - ], - "reviews": [\ - {\ - "user": "alice",\ - "score": 4\ - },\ - {\ - "user": "bob",\ - "score": 5\ - }\ - ] -} +### Defining chunking strategy and embedding model -``` +Any semantic search application requires a way to convert text queries into vectors - an embedding model. +`ServiceContext` is a bundle of commonly used resources used during the indexing and querying stage in any +LlamaIndex application. We can also use it to set up an embedding model - in our case, a local +[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5). +set up -## [Anchor](https://qdrant.tech/documentation/concepts/payload/\#payload-types) Payload types +```python +from llama_index.core import ServiceContext -In addition to storing payloads, Qdrant also allows you search based on certain kinds of values. -This feature is implemented as additional filters during the search and will enable you to incorporate custom logic on top of semantic similarity. +service_context = ServiceContext.from_defaults( + embed_model="local:BAAI/bge-small-en-v1.5", +) +``` +*Note*, in case you are using Large Language Model different from OpenAI's ChatGPT, you should specify +`llm` parameter for `ServiceContext`. -During the filtering, Qdrant will check the conditions over those values that match the type of the filtering condition. If the stored value type does not fit the filtering condition - it will be considered not satisfied. +We can also control how our documents are split into chunks, or nodes using LLamaIndex's terminology. +The `SimpleNodeParser` splits documents into fixed length chunks with an overlap. The defaults are +reasonable, but we can also adjust them if we want to. Both values are defined in tokens. -For example, you will get an empty output if you apply the [range condition](https://qdrant.tech/documentation/concepts/filtering/#range) on the string data. 
+```python +from llama_index.core.node_parser import SimpleNodeParser -However, arrays (multiple values of the same type) are treated a little bit different. When we apply a filter to an array, it will succeed if at least one of the values inside the array meets the condition. +node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=32) +``` -The filtering process is discussed in detail in the section [Filtering](https://qdrant.tech/documentation/concepts/filtering/). +Now we also need to inform the `ServiceContext` about our choices: -Let’s look at the data types that Qdrant supports for searching: +```python +service_context = ServiceContext.from_defaults( + embed_model="local:BAAI/bge-large-en-v1.5", + node_parser=node_parser, +) +``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#integer) Integer +Both embedding model and selected node parser will be implicitly used during the indexing and querying. -`integer` \- 64-bit integer in the range from `-9223372036854775808` to `9223372036854775807`. +### Combining everything together -Example of single and multiple `integer` values: +The last missing piece, before we can start indexing, is the `VectorStoreIndex`. It is a wrapper around +`VectorStore` that provides a convenient interface for indexing and querying. It also requires a +`ServiceContext` to be initialized. -```json -{ - "count": 10, - "sizes": [35, 36, 38] -} +```python +from llama_index.core import VectorStoreIndex +index = VectorStoreIndex.from_vector_store( + vector_store=vector_store, service_context=service_context +) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#float) Float +## Indexing documents -`float` \- 64-bit floating point number. +No matter how our documents are generated, LlamaIndex will automatically split them into nodes, if +required, encode using selected embedding model, and then store in the vector store. Let's define +some documents manually and insert them into Qdrant collection. Our documents are going to have +a single metadata attribute - a library name they belong to. -Example of single and multiple `float` values: +```python +from llama_index.core.schema import Document -```json -{ - "price": 11.99, - "ratings": [9.1, 9.2, 9.4] -} +documents = [ + Document( + text="LlamaIndex is a simple, flexible data framework for connecting custom data sources to large language models.", + metadata={ + "library": "llama-index", + }, + ), + Document( + text="Qdrant is a vector database & vector similarity search engine.", + metadata={ + "library": "qdrant", + }, + ), +] +``` +Now we can index them using our `VectorStoreIndex`: + +```python +for document in documents: + index.insert(document) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#bool) Bool +### Performance considerations -Bool - binary value. Equals to `true` or `false`. +Our documents have been split into nodes, encoded using the embedding model, and stored in the vector +store. However, we don't want to allow our users to search for all the documents in the collection, +but only for the documents that belong to a library they are interested in. For that reason, we need +to set up the Qdrant [payload index](/documentation/concepts/indexing/#payload-index), so the search +is more efficient. 
-Example of single and multiple `bool` values: +```python +from qdrant_client import models -```json -{ - "is_delivered": true, - "responses": [false, false, true, false] -} +client.create_payload_index( + collection_name="my_collection", + field_name="metadata.library", + field_type=models.PayloadSchemaType.KEYWORD, +) +``` + +The payload index is not the only thing we want to change. Since none of the search +queries will be executed on the whole collection, we can also change its configuration, so the HNSW +graph is not built globally. This is also done due to [performance reasons](/documentation/guides/multiple-partitions/#calibrate-performance). +**You should not be changing these parameters, if you know there will be some global search operations +done on the collection.** +```python +client.update_collection( + collection_name="my_collection", + hnsw_config=models.HnswConfigDiff(payload_m=16, m=0), +) ``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#keyword) Keyword +Once both operations are completed, we can start searching for our documents. -`keyword` \- string value. + -Example of single and multiple `keyword` values: +## Querying documents with constraints -```json -{ - "name": "Alice", - "friends": [\ - "bob",\ - "eva",\ - "jack"\ - ] -} +Let's assume we are searching for some information about large language models, but are only allowed to +use Qdrant documentation. LlamaIndex has a concept of retrievers, responsible for finding the most +relevant nodes for a given query. Our `VectorStoreIndex` can be used as a retriever, with some additional +constraints - in our case value of the `library` metadata attribute. -``` +```python +from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#geo) Geo +qdrant_retriever = index.as_retriever( + filters=MetadataFilters( + filters=[ + ExactMatchFilter( + key="library", + value="qdrant", + ) + ] + ) +) -`geo` is used to represent geographical coordinates. +nodes_with_scores = qdrant_retriever.retrieve("large language models") +for node in nodes_with_scores: + print(node.text, node.score) +# Output: Qdrant is a vector database & vector similarity search engine. 0.60551536 +``` -Example of single and multiple `geo` values: +The description of Qdrant was the best match, even though it didn't mention large language models +at all. However, it was the only document that belonged to the `qdrant` library, so there was no +other choice. Let's try to search for something that is not present in the collection. -```json -{ - "location": { - "lon": 52.5200, - "lat": 13.4050 - }, - "cities": [\ - {\ - "lon": 51.5072,\ - "lat": 0.1276\ - },\ - {\ - "lon": 40.7128,\ - "lat": 74.0060\ - }\ - ] -} +Let's define another retrieve, this time for the `llama-index` library: + +```python +llama_index_retriever = index.as_retriever( + filters=MetadataFilters( + filters=[ + ExactMatchFilter( + key="library", + value="llama-index", + ) + ] + ) +) +nodes_with_scores = llama_index_retriever.retrieve("large language models") +for node in nodes_with_scores: + print(node.text, node.score) +# Output: LlamaIndex is a simple, flexible data framework for connecting custom data sources to large language models. 0.63576734 ``` -Coordinate should be described as an object containing two fields: `lon` \- for longitude, and `lat` \- for latitude. 
+The results returned by both retrievers are different, due to the different constraints, so we implemented +a real multitenant search application! -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#datetime) Datetime +<|page-155-lllmstxt|> +# Database Tutorials + +| | +|--------------------------------------------| +| [Bulk Upload Vectors to a Qdrant Collection](/documentation/database-tutorials/bulk-upload/) | +| [Large Scale Search](/documentation/database-tutorials/large-scale-search/) | +| [Backup and Restore Qdrant Collections Using Snapshots](/documentation/database-tutorials/create-snapshot/) | +| [Load and Search Hugging Face Datasets with Qdrant](/documentation/database-tutorials/huggingface-datasets/) | +| [Using Qdrant’s Async API for Efficient Python Applications](/documentation/database-tutorials/async-api/) | +| [Qdrant Migration Guide](/documentation/database-tutorials/migration/) | +| [Static Embeddings. Should you pay attention?](/documentation/database-tutorials/static-embeddings/) | -_Available as of v1.8.0_ +<|page-156-lllmstxt|> +# Supported Embedding Providers & Models -`datetime` \- date and time in [RFC 3339](https://datatracker.ietf.org/doc/html/rfc3339#section-5.6) format. +Qdrant supports all available text and multimodal dense vector embedding models as well as vector embedding services without any limitations. -See the following examples of single and multiple `datetime` values: +## Some of the Embeddings you can use with Qdrant -```json -{ - "created_at": "2023-02-08T10:49:00Z", - "updated_at": [\ - "2023-02-08T13:52:00Z",\ - "2023-02-21T21:23:00Z"\ - ] -} +SentenceTransformers, BERT, SBERT, Clip, OpenClip, Open AI, Vertex AI, Azure AI, AWS Bedrock, Jina AI, Upstage AI, Mistral AI, Cohere AI, Voyage AI, Aleph Alpha, Baidu Qianfan, BGE, Instruct, Watsonx Embeddings, Snowflake Embeddings, NVIDIA NeMo, Nomic, OCI Embeddings, Ollama Embeddings, MixedBread, Together AI, Clarifai, Databricks Embeddings, GPT4All Embeddings, John Snow Labs Embeddings. -``` +Additionally, [any open-source embeddings from HuggingFace](https://huggingface.co/spaces/mteb/leaderboard) can be used with Qdrant. -The following formats are supported: +## Code samples + +| Embeddings Providers | Description | +| ----------------------------------------------------- | ---------------------------------------------------------------- | +| [Aleph Alpha](/documentation/embeddings/aleph-alpha/) | Multilingual embeddings focused on European languages. | +| [Bedrock](/documentation/embeddings/bedrock/) | AWS managed service for foundation models and embeddings. | +| [Cohere](/documentation/embeddings/cohere/) | Language model embeddings for NLP tasks. | +| [Gemini](/documentation/embeddings/gemini/) | Google’s multimodal embeddings for text and vision. | +| [Jina AI](/documentation/embeddings/jina-embeddings/) | Customizable embeddings for neural search. | +| [Mistral](/documentation/embeddings/mistral/) | Open-source, efficient language model embeddings. | +| [MixedBread](/documentation/embeddings/mixedbread/) | Lightweight embeddings for constrained environments. | +| [Mixpeek](/documentation/embeddings/mixpeek/) | Managed SDK for video chunking, embedding, and post-processing. ​ | +| [Nomic](/documentation/embeddings/nomic/) | Embeddings for data visualization. | +| [Nvidia](/documentation/embeddings/nvidia/) | GPU-optimized embeddings from Nvidia. | +| [Ollama](/documentation/embeddings/ollama/) | Embeddings for conversational AI. 
| +| [OpenAI](/documentation/embeddings/openai/) | Industry-leading embeddings for NLP. | +| [Prem AI](/documentation/embeddings/premai/) | Precise language embeddings. | +| [Twelve Labs](/documentation/embeddings/twelvelabs/) | Multimodal embeddings from Twelve labs. | +| [Snowflake](/documentation/embeddings/snowflake/) | Scalable embeddings for big data. | +| [Upstage](/documentation/embeddings/upstage/) | Embeddings for speech and language tasks. | +| [Voyage AI](/documentation/embeddings/voyage/) | Navigation and spatial understanding embeddings. | -- `"2023-02-08T10:49:00Z"` ( [RFC 3339](https://datatracker.ietf.org/doc/html/rfc3339#section-5.6), UTC) -- `"2023-02-08T11:49:00+01:00"` ( [RFC 3339](https://datatracker.ietf.org/doc/html/rfc3339#section-5.6), with timezone) -- `"2023-02-08T10:49:00"` (without timezone, UTC is assumed) -- `"2023-02-08T10:49"` (without timezone and seconds) -- `"2023-02-08"` (only date, midnight is assumed) +<|page-157-lllmstxt|> +# Qdrant Cloud Premium Tier -Notes about the format: +Qdrant Cloud offers an optional premium tier for customers who require additional features and better SLA support levels. The premium tier includes: -- `T` can be replaced with a space. -- The `T` and `Z` symbols are case-insensitive. -- UTC is always assumed when the timezone is not specified. -- Timezone can have the following formats: `±HH:MM`, `±HHMM`, `±HH`, or `Z`. -- Seconds can have up to 6 decimals, so the finest granularity for `datetime` is microseconds. +* **24/7 Support**: Our support team is available around the clock to help you with any issues you may encounter (compared to 10x5 in standard). +* **Shorter Response Times**: Premium customers receive priority support and can expect faster response times, with shorter SLAs. +* **99.9% Uptime SLA**: We guarantee 99.9% uptime for your Qdrant Cloud clusters (compared to 99.5% in standard). +* **Single Sign-On (SSO)**: Premium customers can use their existing SSO provider to manage access to Qdrant Cloud. +* **VPC Private Links**: Premium customers can connect their Qdrant Cloud clusters to their VPCs using private links (AWS only). +* **Storage encryption with shared keys**: Premium customers can encrypt their data at rest using their own keys (AWS only). -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#uuid) UUID +Please refer to the [Qdrant Cloud SLA](https://qdrant.to/sla/) for a detailed definition on uptime and support SLAs. -_Available as of v1.11.0_ +If you are interested in switching to Qdrant Cloud Premium, please [contact us](/contact-us/) for more information. -In addition to the basic `keyword` type, Qdrant supports `uuid` type for storing UUID values. -Functionally, it works the same as `keyword`, internally stores parsed UUID values. +<|page-158-lllmstxt|> +# Creating a Qdrant Cloud Cluster -```json -{ - "uuid": "550e8400-e29b-41d4-a716-446655440000", - "uuids": [\ - "550e8400-e29b-41d4-a716-446655440000",\ - "550e8400-e29b-41d4-a716-446655440001"\ - ] -} +Qdrant Cloud offers two types of clusters: **Free** and **Standard**. -``` +## Free Clusters -String representation of UUID (e.g. `550e8400-e29b-41d4-a716-446655440000`) occupies 36 bytes. -But when numeric representation is used, it is only 128 bits (16 bytes). +Free tier clusters are perfect for prototyping and testing. You don't need a credit card to join. -Usage of `uuid` index type is recommended in payload-heavy collections to save RAM and improve search performance. 
+A free tier cluster only includes 1 single node with the following resources: -## [Anchor](https://qdrant.tech/documentation/concepts/payload/\#create-point-with-payload) Create point with payload +| Resource | Value | +|------------|-------| +| RAM | 1 GB | +| vCPU | 0.5 | +| Disk space | 4 GB | +| Nodes | 1 | -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/upsert-points)) +This configuration supports serving about 1 M vectors of 768 dimensions. To calculate your needs, refer to our documentation on [Capacity Planning](/documentation/guides/capacity-planning/). -httppythontypescriptrustjavacsharpgo +The choice of cloud providers and regions is limited. -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "vector": [0.05, 0.61, 0.76, 0.74],\ - "payload": {"city": "Berlin", "price": 1.99}\ - },\ - {\ - "id": 2,\ - "vector": [0.19, 0.81, 0.75, 0.11],\ - "payload": {"city": ["Berlin", "London"], "price": 1.99}\ - },\ - {\ - "id": 3,\ - "vector": [0.36, 0.55, 0.47, 0.94],\ - "payload": {"city": ["Berlin", "Moscow"], "price": [1.99, 2.99]}\ - }\ - ] -} +It includes: -``` +- Standard Support +- Basic monitoring +- Basic log access +- Basic alerting +- Version upgrades with downtime +- Only manual snapshots and restores via API +- No dedicated resources -```python -from qdrant_client import QdrantClient, models +If unused, free tier clusters are automatically suspended after 1 week, and deleted after 4 weeks of inactivity if not reactivated. -client = QdrantClient(url="http://localhost:6333") +You can always upgrade to a standard cluster with more resources and features. -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - vector=[0.05, 0.61, 0.76, 0.74],\ - payload={\ - "city": "Berlin",\ - "price": 1.99,\ - },\ - ),\ - models.PointStruct(\ - id=2,\ - vector=[0.19, 0.81, 0.75, 0.11],\ - payload={\ - "city": ["Berlin", "London"],\ - "price": 1.99,\ - },\ - ),\ - models.PointStruct(\ - id=3,\ - vector=[0.36, 0.55, 0.47, 0.94],\ - payload={\ - "city": ["Berlin", "Moscow"],\ - "price": [1.99, 2.99],\ - },\ - ),\ - ], -) +## Standard Clusters -``` +On top of the Free cluster features, Standard clusters offer: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +- Response time and uptime SLAs +- Dedicated resources +- Backup and disaster recovery +- Multi-node clusters for high availability +- Horizontal and vertical scaling +- Monitoring and log management +- Zero-downtime upgrades for multi-node clusters with replication -const client = new QdrantClient({ host: "localhost", port: 6333 }); +You have a broad choice of regions on AWS, Azure and Google Cloud. -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: [0.05, 0.61, 0.76, 0.74],\ - payload: {\ - city: "Berlin",\ - price: 1.99,\ - },\ - },\ - {\ - id: 2,\ - vector: [0.19, 0.81, 0.75, 0.11],\ - payload: {\ - city: ["Berlin", "London"],\ - price: 1.99,\ - },\ - },\ - {\ - id: 3,\ - vector: [0.36, 0.55, 0.47, 0.94],\ - payload: {\ - city: ["Berlin", "Moscow"],\ - price: [1.99, 2.99],\ - },\ - },\ - ], -}); +For payment information see [**Pricing and Payments**](/documentation/cloud/pricing-payments/). 
-``` +## Create a Cluster -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::{Payload, Qdrant, QdrantError}; -use serde_json::json; +![Create Cluster Page](/documentation/cloud/create-cluster.png) -let client = Qdrant::from_url("http://localhost:6334").build()?; +This page shows you how to use the Qdrant Cloud Console to create a custom Qdrant Cloud cluster. -let points = vec![\ - PointStruct::new(\ - 1,\ - vec![0.05, 0.61, 0.76, 0.74],\ - Payload::try_from(json!({"city": "Berlin", "price": 1.99})).unwrap(),\ - ),\ - PointStruct::new(\ - 2,\ - vec![0.19, 0.81, 0.75, 0.11],\ - Payload::try_from(json!({"city": ["Berlin", "London"]})).unwrap(),\ - ),\ - PointStruct::new(\ - 3,\ - vec![0.36, 0.55, 0.47, 0.94],\ - Payload::try_from(json!({"city": ["Berlin", "Moscow"], "price": [1.99, 2.99]}))\ - .unwrap(),\ - ),\ -]; +> **Prerequisite:** Please make sure you have provided billing information before creating a custom cluster. + +1. Start in the **Clusters** section of the [Cloud Dashboard](https://cloud.qdrant.io/). +1. Select **Clusters** and then click **+ Create**. +1. In the **Create a cluster** screen select **Free** or **Standard** + Most of the remaining configuration options are only available for standard clusters. +1. Select a provider. Currently, you can deploy to: + + - Amazon Web Services (AWS) + - Google Cloud Platform (GCP) + - Microsoft Azure + - Your own [Hybrid Cloud](/documentation/hybrid-cloud/) Infrastructure + +1. Choose your data center region or Hybrid Cloud environment. +1. Configure RAM for each node. + > For more information, see our [Capacity Planning](/documentation/guides/capacity-planning/) guidance. +1. Choose the number of vCPUs per node. If you add more + RAM, the menu provides different options for vCPUs. +1. Select the number of nodes you want the cluster to be deployed on. + > Each node is automatically attached with a disk, that has enough space to store data with Qdrant's default collection configuration. +1. Select additional disk space for your deployment. + > Depending on your collection configuration, you may need more disk space per RAM. For example, if you configure `on_disk: true` and only use RAM for caching. +1. Review your cluster configuration and pricing. +1. When you're ready, select **Create**. It takes some time to provision your cluster. -client - .upsert_points(UpsertPointsBuilder::new("{collection_name}", points).wait(true)) - .await?; +Once provisioned, you can access your cluster on ports 443 and 6333 (REST) and 6334 (gRPC). -``` +![Cluster configured in the UI](/documentation/cloud/cluster-detail.png) -```java -import java.util.List; -import java.util.Map; +You should now see the new cluster in the **Clusters** menu. 
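+As a quick sanity check, you can connect to the new cluster with the Python client. The URL and API key below are placeholders; use your cluster endpoint and an API key created as described in the Authentication guide:

+```python
+from qdrant_client import QdrantClient
+
+# Placeholder values - replace with your cluster URL and API key
+client = QdrantClient(
+    url="https://xyz-example.eu-central.aws.cloud.qdrant.io:6333",
+    api_key="<your-api-key>",
+)
+
+print(client.get_collections())
+```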
-import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +## Creating a Production-Ready Cluster -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +To create a production-ready cluster, you need to ensure the following: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**High Availability** -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) - .putAllPayload(Map.of("city", value("Berlin"), "price", value(1.99))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors(vectors(0.19f, 0.81f, 0.75f, 0.11f)) - .putAllPayload( - Map.of("city", list(List.of(value("Berlin"), value("London"))))) - .build(), - PointStruct.newBuilder() - .setId(id(3)) - .setVectors(vectors(0.36f, 0.55f, 0.47f, 0.94f)) - .putAllPayload( - Map.of( - "city", - list(List.of(value("Berlin"), value("London"))), - "price", - list(List.of(value(1.99), value(2.99))))) - .build())) - .get(); +Your cluster should have at least 3 nodes, and each collection should have a replication factor of at least 2. This ensures that is one node fails, or is restarted due to maintenance, a version upgrade, or a scaling operation, that the cluster remains fully operational. You can ensure this by checking the **High Availability** checkbox when creating a cluster. -``` +**Backup and Disaster Recovery** -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +You should create a backup schedule for your cluster. This ensures that you can restore your data in case of a disaster. You can configure backups in the **Backups** section of the cluster detail page. See [**Backups**](/documentation/cloud/backups/) for more information. -var client = new QdrantClient("localhost", 6334); +**Collection Sharding** -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new PointStruct - { - Id = 1, - Vectors = new[] { 0.05f, 0.61f, 0.76f, 0.74f }, - Payload = { ["city"] = "Berlin", ["price"] = 1.99 } - }, - new PointStruct - { - Id = 2, - Vectors = new[] { 0.19f, 0.81f, 0.75f, 0.11f }, - Payload = { ["city"] = new[] { "Berlin", "London" } } - }, - new PointStruct - { - Id = 3, - Vectors = new[] { 0.36f, 0.55f, 0.47f, 0.94f }, - Payload = - { - ["city"] = new[] { "Berlin", "Moscow" }, - ["price"] = new Value - { - ListValue = new ListValue { Values = { new Value[] { 1.99, 2.99 } } } - } - } - } - } -); +To allow your cluster to easily scale horizontally, you should configure at least twice as many shards per collection than the number of nodes in your cluster. You can configure the number of shards when creating a collection. See [**Sharding**](/documentation/guides/distributed_deployment/#sharding) for more information. -``` +If you did not configure enough shards in a collection, you can use the [**Resharding**](/documentation/cloud/cluster-scaling/#resharding) feature to change the number of shards in an existing collection. -```go -import ( - "context" +For more information on how to create a production-ready cluster, see our [**Vector Search in Production**](/articles/vector-search-production/) article. 
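+Putting the sharding and replication advice above into practice, a collection for a 3-node cluster could be created roughly as follows. The collection name, vector size, and distance metric are example values:

+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(
+    url="https://<your-cluster-url>:6333",
+    api_key="<your-api-key>",
+)
+
+client.create_collection(
+    collection_name="production_collection",  # example name
+    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
+    shard_number=6,        # at least twice the number of nodes (3 nodes here)
+    replication_factor=2,  # keep each shard on two nodes for high availability
+)
+```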
- "github.com/qdrant/go-client/qdrant" -) +## Deleting a Cluster -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +You can delete a Qdrant database cluster from the cluster's detail page. -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), - Payload: qdrant.NewValueMap(map[string]any{ - "city": "Berlin", "price": 1.99}), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectors(0.19, 0.81, 0.75, 0.11), - Payload: qdrant.NewValueMap(map[string]any{ - "city": []any{"Berlin", "London"}}), - }, - { - Id: qdrant.NewIDNum(3), - Vectors: qdrant.NewVectors(0.36, 0.55, 0.47, 0.94), - Payload: qdrant.NewValueMap(map[string]any{ - "city": []any{"Berlin", "London"}, - "price": []any{1.99, 2.99}}), - }, - }, -}) +![Delete Cluster](/documentation/cloud/delete-cluster.png) + +## Next Steps -``` +You will need to connect to your new Qdrant Cloud cluster. Follow [**Authentication**](/documentation/cloud/authentication/) to create one or more API keys. -## [Anchor](https://qdrant.tech/documentation/concepts/payload/\#update-payload) Update payload +You can also scale your cluster both horizontally and vertically. Read more in [**Cluster Scaling**](/documentation/cloud/cluster-scaling/). -Updating payloads in Qdrant offers flexible methods to manage vector metadata. The **set payload** method updates specific fields while keeping others unchanged, while the **overwrite** method replaces the entire payload. Developers can also use **clear payload** to remove all metadata or delete fields to remove specific keys without affecting the rest. These options provide precise control for adapting to dynamic datasets. +If a new Qdrant version becomes available, you can upgrade your cluster. See [**Cluster Upgrades**](/documentation/cloud/cluster-upgrades/). -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#set-payload) Set payload +For more information on creating and restoring backups of a cluster, see [**Backups**](/documentation/cloud/backups/). -Set only the given payload values on a point. +<|page-159-lllmstxt|> +## Framework Integrations + +| Framework | Description | +| ------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| [AutoGen](/documentation/frameworks/autogen/) | Framework from Microsoft building LLM applications using multiple conversational agents. | +| [Camel](/documentation/frameworks/camel/) | Framework to build and use LLM-based agents for real-world task solving | +| [Cheshire Cat](/documentation/frameworks/cheshire-cat/) | Framework to create personalized AI assistants using custom data. | +| [CrewAI](/documentation/frameworks/crewai/) | CrewAI is a framework to build automated workflows using multiple AI agents that perform complex tasks. | +| [Dagster](/documentation/frameworks/dagster/) | Python framework for data orchestration with integrated lineage, observability. | +| [DeepEval](/documentation/frameworks/deepeval/) | Python framework for testing large language model systems. | +| [DSPy](/documentation/frameworks/dspy/) | Framework for algorithmically optimizing LM prompts and weights. 
| +| [Dynamiq](/documentation/frameworks/dynamiq/) | Dynamiq is all-in-one Gen AI framework, designed to streamline the development of AI-powered applications. | +| [Feast](/documentation/frameworks/feast/) | Open-source feature store to operate production ML systems at scale as a set of features. | +| [Fifty-One](/documentation/frameworks/fifty-one/) | Toolkit for building high-quality datasets and computer vision models. | +| [Genkit](/documentation/frameworks/genkit/) | Framework to build, deploy, and monitor production-ready AI-powered apps. | +| [Haystack](/documentation/frameworks/haystack/) | LLM orchestration framework to build customizable, production-ready LLM applications. | +| [HoneyHive](/documentation/frameworks/honeyhive/) | AI observability and evaluation platform that provides tracing and monitoring tools for GenAI pipelines. | +| [Lakechain](/documentation/frameworks/lakechain/) | Python framework for deploying document processing pipelines on AWS using infrastructure-as-code. | +| [Langchain](/documentation/frameworks/langchain/) | Python framework for building context-aware, reasoning applications using LLMs. | +| [Langchain4j](/documentation/frameworks/langchain4j/) | Java framework for building context-aware, reasoning applications using LLMs. | +| [LangGraph](/documentation/frameworks/langgraph/) | Python, Javascript libraries for building stateful, multi-actor applications. | +| [LlamaIndex](/documentation/frameworks/llama-index/) | A data framework for building LLM applications with modular integrations. | +| [Mastra](/documentation/frameworks/mastra/) | Typescript framework to build AI applications and features quickly. | +| [Mirror Security](/documentation/frameworks/mirror-security/) | Python framework for vector encryption and access control. | +| [Mem0](/documentation/frameworks/mem0/) | Self-improving memory layer for LLM applications, enabling personalized AI experiences. | +| [Neo4j GraphRAG](/documentation/frameworks/neo4j-graphrag/) | Package to build graph retrieval augmented generation (GraphRAG) applications using Neo4j and Python. | +| [NLWeb](/documentation/frameworks/nlweb/) | A framework to turn websites into chat-ready data using schema.org and associated data formats. | +| [Rig-rs](/documentation/frameworks/rig-rs/) | Rust library for building scalable, modular, and ergonomic LLM-powered applications. | +| [Semantic Router](/documentation/frameworks/semantic-router/) | Python library to build a decision-making layer for AI applications using vector search. | +| [SmolAgents](/documentation/frameworks/smolagents/) | Barebones library for agents. Agents write python code to call tools and orchestrate other agent. | +| [Spring AI](/documentation/frameworks/spring-ai/) | Java AI framework for building with Spring design principles such as portability and modular design. | +| [Sycamore](/documentation/frameworks/sycamore/) | Document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data. | +| [Testcontainers](/documentation/frameworks/testcontainers/) | Framework for providing throwaway, lightweight instances of systems for testing | +| [txtai](/documentation/frameworks/txtai/) | Python library for semantic search, LLM orchestration and language model workflows. | +| [Vanna AI](/documentation/frameworks/vanna-ai/) | Python RAG framework for SQL generation and querying. 
| +| [VoltAgent](/documentation/frameworks/voltagent/) | TypeScript framework for building AI agents with modular tools, LLM coordination, and visual monitoring dashboard. | -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/set-payload)): +<|page-160-lllmstxt|> +## Observability Integrations -httppythontypescriptrustjavacsharpgo +| Tool | Description | +| ----------------------------- | -------------------------------------------------------------------------------------- | +| [OpenLIT](/documentation/observability/openlit/) | Platform for OpenTelemetry-native Observability & Evals for LLMs and Vector Databases. | +| [OpenLLMetry](/documentation/observability/openllmetry/) | Set of OpenTelemetry extensions to add Observability for your LLM application. | +| [Datadog](/documentation/observability/datadog/) | Cloud-based monitoring and analytics platform. | -```http -POST /collections/{collection_name}/points/payload -{ - "payload": { - "property1": "string", - "property2": "string" - }, - "points": [\ - 0, 3, 100\ - ] -} +<|page-161-lllmstxt|> +## Platform Integrations + +| Platform | Description | +| ------------------------------------------------------- | ---------------------------------------------------------------------------------------- | +| [Apify](/documentation/platforms/apify/) | Platform to build web scrapers and automate web browser tasks. | +| [BuildShip](/documentation/platforms/buildship/) | Low-code visual builder to create APIs, scheduled jobs, and backend workflows. | +| [Keboola](/documentation/platforms/keboola/) | Data operations platform that unifies data sources, transformations, and ML deployments. | +| [Kotaemon](/documentation/platforms/kotaemon/) | Open-source & customizable RAG UI for chatting with your documents. | +| [Make](/documentation/platforms/make/) | Cloud platform to build low-code workflows by integrating various software applications. | +| [Mulesoft Anypoint](/documentation/platforms/mulesoft/) | Integration platform to connect applications, data, and devices across environments. | +| [N8N](/documentation/platforms/n8n/) | Platform for node-based, low-code workflow automation. | +| [Pipedream](/documentation/platforms/pipedream/) | Platform for connecting apps and developing event-driven automation. | +| [PrivateGPT](/documentation/platforms/privategpt/) | Tool to ask questions about your documents using local LLMs emphasising privacy. | +| [ToolJet](/documentation/platforms/tooljet/) | A low-code platform for business apps that connect to DBs, cloud storages and more. | +| [Vectorize](/documentation/platforms/vectorize/) | Platform to automate data extraction, RAG evaluation, deploy RAG pipelines. | + +<|page-162-lllmstxt|> +# Private Chatbot for Interactive Learning + +| Time: 120 min | Level: Advanced | | +| --- | ----------- | ----------- |----------- | + +With chatbots, companies can scale their training programs to accommodate a large workforce, delivering consistent and standardized learning experiences across departments, locations, and time zones. Furthermore, having already completed their online training, corporate employees might want to refer back old course materials. Most of this information is proprietary to the company, and manually searching through an entire library of materials takes time. However, a chatbot built on this knowledge can respond in the blink of an eye. + +With a simple RAG pipeline, you can build a private chatbot. 
In this tutorial, you will combine open source tools inside of a closed infrastructure and tie them together with a reliable framework. This custom solution lets you run a chatbot without public internet access. You will be able to keep sensitive data secure without compromising privacy. + +![OpenShift](/documentation/examples/student-rag-haystack-red-hat-openshift-hc/openshift-diagram.png) +**Figure 1:** The LLM and Qdrant Hybrid Cloud are containerized as separate services. Haystack combines them into a RAG pipeline and exposes the API via Hayhooks. + +## Components +To maintain complete data isolation, we need to limit ourselves to open-source tools and use them in a private environment, such as [Red Hat OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift). The pipeline will run internally and will be inaccessible from the internet. + +- **Dataset:** [Red Hat Interactive Learning Portal](https://developers.redhat.com/learn), an online library of Red Hat course materials. +- **LLM:** `mistralai/Mistral-7B-Instruct-v0.1`, deployed as a standalone service on OpenShift. +- **Embedding Model:** `BAAI/bge-base-en-v1.5`, lightweight embedding model deployed from within the Haystack pipeline + with [FastEmbed](https://github.com/qdrant/fastembed) +- **Vector DB:** [Qdrant Hybrid Cloud](https://hybrid-cloud.qdrant.tech) running on OpenShift. +- **Framework:** [Haystack 2.x](https://haystack.deepset.ai/) to connect all and [Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks) to serve the app through HTTP endpoints. + +### Procedure +The [Haystack](https://haystack.deepset.ai/) framework leverages two pipelines, which combine our components sequentially to process data. + +1. The **Indexing Pipeline** will run offline in batches, when new data is added or updated. +2. The **Search Pipeline** will retrieve information from Qdrant and use an LLM to produce an answer. +> **Note:** We will define the pipelines in Python and then export them to YAML format, so that [Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks) can run them as a web service. + +## Prerequisites + +### Deploy the LLM to OpenShift + +Follow the steps in [Chapter 6. Serving large language models](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.5/html/working_on_data_science_projects/serving-large-language-models_serving-large-language-models#doc-wrapper). This will download the LLM from the [HuggingFace](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1), and deploy it to OpenShift using a *single model serving platform*. + +Your LLM service will have a URL, which you need to store as an environment variable. + +```shell +export INFERENCE_ENDPOINT_URL="http://mistral-service.default.svc.cluster.local" ``` ```python -client.set_payload( - collection_name="{collection_name}", - payload={ - "property1": "string", - "property2": "string", - }, - points=[0, 3, 10], -) +import os +os.environ["INFERENCE_ENDPOINT_URL"] = "http://mistral-service.default.svc.cluster.local" ``` -```typescript -client.setPayload("{collection_name}", { - payload: { - property1: "string", - property2: "string", - }, - points: [0, 3, 10], -}); +### Launch Qdrant Hybrid Cloud -``` +Complete **How to Set Up Qdrant on Red Hat OpenShift**. When in Hybrid Cloud, your Qdrant instance is private and and its nodes run on the same OpenShift infrastructure as your other components. 
-```rust -use qdrant_client::qdrant::{ - PointsIdsList, SetPayloadPointsBuilder, -}; -use qdrant_client::Payload,; -use serde_json::json; +Retrieve your Qdrant URL and API key and store them as environment variables: -client - .set_payload( - SetPayloadPointsBuilder::new( - "{collection_name}", - Payload::try_from(json!({ - "property1": "string", - "property2": "string", - })) - .unwrap(), - ) - .points_selector(PointsIdsList { - ids: vec![0.into(), 3.into(), 10.into()], - }) - .wait(true), - ) - .await?; +```shell +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" +``` +```python +os.environ["QDRANT_URL"] = "https://qdrant.example.com" +os.environ["QDRANT_API_KEY"] = "your-api-key" ``` +## Implementation -```java -import java.util.List; -import java.util.Map; +We will first create an indexing pipeline to add documents to the system. +Then, the search pipeline will retrieve relevant data from our documents. +After the pipelines are tested, we will export them to YAML files. -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; +### Indexing pipeline -client - .setPayloadAsync( - "{collection_name}", - Map.of("property1", value("string"), "property2", value("string")), - List.of(id(0), id(3), id(10)), - true, - null, - null) - .get(); +[Haystack 2.x](https://haystack.deepset.ai/) comes packed with a lot of useful components, from data fetching, through +HTML parsing, up to the vector storage. Before we start, there are a few Python packages that we need to install: +```shell +pip install haystack-ai \ + qdrant-client \ + qdrant-haystack \ + fastembed-haystack ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; + -var client = new QdrantClient("localhost", 6334); +Our environment is now ready, so we can jump right into the code. Let's define an empty pipeline and gradually add +components to it: -await client.SetPayloadAsync( - collectionName: "{collection_name}", - payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, - ids: new ulong[] { 0, 3, 10 } -); +```python +from haystack import Pipeline +indexing_pipeline = Pipeline() ``` -```go -import ( - "context" +#### Data fetching and conversion - "github.com/qdrant/go-client/qdrant" -) +In this step, we will use Haystack's `LinkContentFetcher` to download course content from a list of URLs and store it in Qdrant for retrieval. +As we don't want to store raw HTML, this tool will extract text content from each webpage. Then, the fetcher will divide them into digestible chunks, since the documents might be pretty long. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Let's start with data fetching and text conversion: -client.SetPayload(context.Background(), &qdrant.SetPayloadPoints{ - CollectionName: "{collection_name}", - Payload: qdrant.NewValueMap( - map[string]any{"property1": "string", "property2": "string"}), - PointsSelector: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), - qdrant.NewIDNum(3)), -}) +```python +from haystack.components.fetchers import LinkContentFetcher +from haystack.components.converters import HTMLToDocument + +fetcher = LinkContentFetcher() +converter = HTMLToDocument() +indexing_pipeline.add_component("fetcher", fetcher) +indexing_pipeline.add_component("converter", converter) ``` -You don’t need to know the ids of the points you want to modify. The alternative -is to use filters. 
+Our pipeline knows there are two components, but they are not connected yet. We need to define the flow between them: -httppythontypescriptrustjavacsharpgo +```python +indexing_pipeline.connect("fetcher.streams", "converter.sources") +``` -```http -POST /collections/{collection_name}/points/payload -{ - "payload": { - "property1": "string", - "property2": "string" - }, - "filter": { - "must": [\ - {\ - "key": "color",\ - "match": {\ - "value": "red"\ - }\ - }\ - ] - } -} +Each component has a set of inputs and outputs which might be combined in a directed graph. The definitions of the +inputs and outputs are usually provided in the documentation of the component. The `LinkContentFetcher` has the +following parameters: + +![Parameters of the `LinkContentFetcher`](/documentation/examples/student-rag-haystack-red-hat-openshift-hc/haystack-link-content-fetcher.png) + +*Source: https://docs.haystack.deepset.ai/docs/linkcontentfetcher* + +#### Chunking and creating the embeddings + +We used `HTMLToDocument` to convert the HTML sources into `Document` instances of Haystack, which is a +base class containing some data to be queried. However, a single document might be too long to be processed by the +embedding model, and it also carries way too much information to make the search relevant. + +Therefore, we need to split the document into smaller parts and convert them into embeddings. For this, we will use the +`DocumentSplitter` and `FastembedDocumentEmbedder` pointed to our `BAAI/bge-base-en-v1.5` model: + +```python +from haystack.components.preprocessors import DocumentSplitter +from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder + +splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=2) +embedder = FastembedDocumentEmbedder(model="BAAI/bge-base-en-v1.5") +embedder.warm_up() + +indexing_pipeline.add_component("splitter", splitter) +indexing_pipeline.add_component("embedder", embedder) +indexing_pipeline.connect("converter.documents", "splitter.documents") +indexing_pipeline.connect("splitter.documents", "embedder.documents") ``` +#### Writing data to Qdrant + +The splitter will be producing chunks with a maximum length of 5 sentences, with an overlap of 2 sentences. Then, these +smaller portions will be converted into embeddings. + +Finally, we need to store our embeddings in Qdrant. + ```python -client.set_payload( - collection_name="{collection_name}", - payload={ - "property1": "string", - "property2": "string", - }, - points=models.Filter( - must=[\ - models.FieldCondition(\ - key="color",\ - match=models.MatchValue(value="red"),\ - ),\ - ], - ), +from haystack.utils import Secret +from haystack_integrations.document_stores.qdrant import QdrantDocumentStore +from haystack.components.writers import DocumentWriter + +document_store = QdrantDocumentStore( + os.environ["QDRANT_URL"], + api_key=Secret.from_env_var("QDRANT_API_KEY"), + index="red-hat-learning", + return_embedding=True, + embedding_dim=768, ) +writer = DocumentWriter(document_store=document_store) + +indexing_pipeline.add_component("writer", writer) +indexing_pipeline.connect("embedder.documents", "writer.documents") ``` -```typescript -client.setPayload("{collection_name}", { - payload: { - property1: "string", - property2: "string", - }, - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, -}); +Our pipeline is now complete. 
Haystack comes with a handy visualization of the pipeline, so you can see and verify the +connections between the components. It is displayed in the Jupyter notebook, but you can also export it to a file: +```python +indexing_pipeline.draw("indexing_pipeline.png") ``` -```rust -use qdrant_client::qdrant::{Condition, Filter, SetPayloadPointsBuilder}; -use qdrant_client::Payload; -use serde_json::json; +![Structure of the indexing pipeline](/documentation/examples/student-rag-haystack-red-hat-openshift-hc/indexing_pipeline.png) -client - .set_payload( - SetPayloadPointsBuilder::new( - "{collection_name}", - Payload::try_from(json!({ - "property1": "string", - "property2": "string", - })) - .unwrap(), - ) - .points_selector(Filter::must([Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .wait(true), - ) - .await?; +#### Test the entire pipeline + +We can finally run it on a list of URLs to index the content in Qdrant. We have a bunch of URLs to all the Red Hat +OpenShift Foundations course lessons, so let's use them: + +```python +course_urls = [ + "https://developers.redhat.com/learn/openshift/foundations-openshift", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:openshift-and-developer-sandbox", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:overview-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:use-terminal-window-within-red-hat-openshift-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-github-repository-using-openshift-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-repository-using-openshift-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-using-oc-cli-tool", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-using-oc-cli-tool", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:scale-applications-using-openshift-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:scale-applications-using-oc-cli-tool", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-using-oc-cli-tool", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-web-console", + "https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:view-performance-information-using-openshift-web-console", +] +indexing_pipeline.run(data={ + "fetcher": { + "urls": course_urls, + } +}) ``` -```java -import java.util.Map; +The execution might take a while, as the model needs to process all the documents. After the process is finished, we +should have all the documents stored in Qdrant, ready for search. 
You should see a short summary of processed documents: -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.ValueFactory.value; +```shell +{'writer': {'documents_written': 381}} +``` -client - .setPayloadAsync( - "{collection_name}", - Map.of("property1", value("string"), "property2", value("string")), - Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), - true, - null, - null) - .get(); +### Search pipeline + +Our documents are now indexed and ready for search. The next pipeline is a bit simpler, but we still need to define a +few components. Let's start again with an empty pipeline: +```python +search_pipeline = Pipeline() ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Our second process takes user input, converts it into embeddings and then searches for the most relevant documents +using the query embedding. This might look familiar, but we arent working with `Document` instances +anymore, since the query only accepts raw text. Thus, some of the components will be different, especially the embedder, +as it has to accept a single string as an input and produce a single embedding as an output: -var client = new QdrantClient("localhost", 6334); +```python +from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder +from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever -await client.SetPayloadAsync( - collectionName: "{collection_name}", - payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, - filter: MatchKeyword("color", "red") -); +query_embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") +query_embedder.warm_up() + +retriever = QdrantEmbeddingRetriever( + document_store=document_store, # The same document store as the one used for indexing + top_k=3, # Number of documents to return +) +search_pipeline.add_component("query_embedder", query_embedder) +search_pipeline.add_component("retriever", retriever) + +search_pipeline.connect("query_embedder.embedding", "retriever.query_embedding") ``` -```go -import ( - "context" +#### Run a test query - "github.com/qdrant/go-client/qdrant" -) +If our goal was to just retrieve the relevant documents, we could stop here. Let's try the current pipeline on a simple +query: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +```python +query = "How to install an application using the OpenShift web console?" -client.SetPayload(context.Background(), &qdrant.SetPayloadPoints{ - CollectionName: "{collection_name}", - Payload: qdrant.NewValueMap( - map[string]any{"property1": "string", "property2": "string"}), - PointsSelector: qdrant.NewPointsSelectorFilter(&qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }), +search_pipeline.run(data={ + "query_embedder": { + "text": query + } }) +``` + +We set the `top_k` parameter to 3, so the retriever should return the three most relevant documents. 
Your output should look like this: +```text +{ + 'retriever': { + 'documents': [ + Document(id=867b4aa4c37a91e72dc7ff452c47972c1a46a279a7531cd6af14169bcef1441b, content: 'Install a Node.js application from GitHub using the web console The following describes the steps r...', meta: {'content_type': 'text/html', 'source_id': 'f56e8f827dda86abe67c0ba3b4b11331d896e2d4f7b2b43c74d3ce973d07be0c', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:work-databases-openshift-web-console'}, score: 0.9209432), + Document(id=0c74381c178597dd91335ebfde790d13bf5989b682d73bf5573c7734e6765af7, content: 'How to remove an application from OpenShift using the web console. In addition to providing the cap...', meta: {'content_type': 'text/html', 'source_id': '2a0759f3ce4a37d9f5c2af9c0ffcc80879077c102fb8e41e576e04833c9d24ce', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-linux-container-image-repository-using-openshift-web-console'}, score: 0.9132109500000001), + Document(id=3e5f8923a34ab05611ef20783211e5543e880c709fd6534d9c1f63576edc4061, content: 'Path resource: Install an application from source code in a GitHub repository using the OpenShift w...', meta: {'content_type': 'text/html', 'source_id': 'a4c4cd62d07c0d9d240e3289d2a1cc0a3d1127ae70704529967f715601559089', 'url': 'https://developers.redhat.com/learning/learn:openshift:foundations-openshift/resource/resources:install-application-source-code-github-repository-using-openshift-web-console'}, score: 0.912748935) + ] + } +} ``` -_Available as of v1.8.0_ +#### Generating the answer -It is possible to modify only a specific key of the payload by using the `key` parameter. +Retrieval should serve more than just documents. Therefore, we will need to use an LLM to generate exact answers to our question. +This is the final component of our second pipeline. -For instance, given the following payload JSON object on a point: +Haystack will create a prompt which adds your documents to the model's context. -```json -{ - "property1": { - "nested_property": "foo", +```python +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceTGIGenerator + +prompt_builder = PromptBuilder(""" +Given the following information, answer the question. + +Context: +{% for document in documents %} + {{ document.content }} +{% endfor %} + +Question: {{ query }} +""") +llm = HuggingFaceTGIGenerator( + model="mistralai/Mistral-7B-Instruct-v0.1", + url=os.environ["INFERENCE_ENDPOINT_URL"], + generation_kwargs={ + "max_new_tokens": 1000, # Allow longer responses }, - "property2": { - "nested_property": "bar", - } -} +) +search_pipeline.add_component("prompt_builder", prompt_builder) +search_pipeline.add_component("llm", llm) + +search_pipeline.connect("retriever.documents", "prompt_builder.documents") +search_pipeline.connect("prompt_builder.prompt", "llm.prompt") ``` -You can modify the `nested_property` of `property1` with the following request: +The `PromptBuilder` is a Jinja2 template that will be filled with the documents and the query. The +`HuggingFaceTGIGenerator` connects to the LLM service and generates the answer. Let's run the pipeline again: -```http -POST /collections/{collection_name}/points/payload -{ - "payload": { - "nested_property": "qux", +```python +query = "How to install an application using the OpenShift web console?" 
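+# Note: the same question is passed to both the query embedder and the prompt builder below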
+ +response = search_pipeline.run(data={ + "query_embedder": { + "text": query }, - "key": "property1", - "points": [1] -} + "prompt_builder": { + "query": query + }, +}) +``` + +The LLM may provide multiple replies, if asked to do so, so let's iterate over and print them out: +```python +for reply in response["llm"]["replies"]: + print(reply.strip()) ``` -Resulting in the following payload: +In our case there is a single response, which should be the answer to the question: -```json -{ - "property1": { - "nested_property": "qux", - }, - "property2": { - "nested_property": "bar", - } -} +```text +Answer: To install an application using the OpenShift web console, follow these steps: +1. Select +Add on the left side of the web console. +2. Identify the container image to install. +3. Using your web browser, navigate to the Developer Sandbox for Red Hat OpenShift and select Start your Sandbox for free. +4. Install an application from source code stored in a GitHub repository using the OpenShift web console. ``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#overwrite-payload) Overwrite payload +Our final search pipeline might also be visualized, so we can see how the components are glued together: -Fully replace any existing payload with the given one. +```python +search_pipeline.draw("search_pipeline.png") +``` -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/overwrite-payload)): +![Structure of the search pipeline](/documentation/examples/student-rag-haystack-red-hat-openshift-hc/search_pipeline.png) -httppythontypescriptrustjavacsharpgo +## Deployment -```http -PUT /collections/{collection_name}/points/payload -{ - "payload": { - "property1": "string", - "property2": "string" - }, - "points": [\ - 0, 3, 100\ - ] -} +The pipelines are now ready, and we can export them to YAML. Hayhooks will use these files to run the +pipelines as HTTP endpoints. To do this, specify both file paths and your environment variables. + +> Note: The indexing pipeline might be run inside your ETL tool, but search should be definitely exposed as an HTTP endpoint. +Let's run it on the local machine: + +```shell +pip install hayhooks ``` +First of all, we need to save the pipelines to the YAML file: + ```python -client.overwrite_payload( - collection_name="{collection_name}", - payload={ - "property1": "string", - "property2": "string", - }, - points=[0, 3, 10], -) +with open("search-pipeline.yaml", "w") as fp: + search_pipeline.dump(fp) +``` + +And now we are able to run the Hayhooks service: +```shell +hayhooks run ``` -```typescript -client.overwritePayload("{collection_name}", { - payload: { - property1: "string", - property2: "string", - }, - points: [0, 3, 10], -}); +The command should start the service on the default port, so you can access it at `http://localhost:1416`. The pipeline +is not deployed yet, but we can do it with just another command: +```shell +hayhooks deploy search-pipeline.yaml ``` -```rust -use qdrant_client::qdrant::{PointsIdsList, SetPayloadPointsBuilder}; -use qdrant_client::Payload; -use serde_json::json; +Once it's finished, you should be able to see the OpenAPI documentation at +[http://localhost:1416/docs](http://localhost:1416/docs), and test the newly created endpoint. 
-client - .overwrite_payload( - SetPayloadPointsBuilder::new( - "{collection_name}", - Payload::try_from(json!({ - "property1": "string", - "property2": "string", - })) - .unwrap(), - ) - .points_selector(PointsIdsList { - ids: vec![0.into(), 3.into(), 10.into()], - }) - .wait(true), - ) - .await?; +![Search pipeline in the OpenAPI documentation](/documentation/examples/student-rag-haystack-red-hat-openshift-hc/hayhooks-openapi.png) + +Our search is now accessible through the HTTP endpoint, so we can integrate it with any other service. We can even +control the other parameters, like the number of documents to return: +```shell +curl -X 'POST' \ + 'http://localhost:1416/search-pipeline' \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "llm": { + }, + "prompt_builder": { + "query": "How can I remove an application?" + }, + "query_embedder": { + "text": "How can I remove an application?" + }, + "retriever": { + "top_k": 5 + } +}' ``` -```java -import java.util.List; +The response should be similar to the one we got in the Python before: + +```json +{ + "llm": { + "replies": [ + "\n\nAnswer: You can remove an application running in OpenShift by right-clicking on the circular graphic representing the application in Topology view and selecting the Delete Application text from the dialog that appears when you click the graphic’s outer ring. Alternatively, you can use the oc CLI tool to delete an installed application using the oc delete all command." + ], + "meta": [ + { + "model": "mistralai/Mistral-7B-Instruct-v0.1", + "index": 0, + "finish_reason": "eos_token", + "usage": { + "completion_tokens": 75, + "prompt_tokens": 642, + "total_tokens": 717 + } + } + ] + } +} +``` -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; +## Next steps -client - .overwritePayloadAsync( - "{collection_name}", - Map.of("property1", value("string"), "property2", value("string")), - List.of(id(0), id(3), id(10)), - true, - null, - null) - .get(); +- In this example, [Red Hat OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift) is the infrastructure of choice for proprietary chatbots. [Read more](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.8) about how to host AI projects in their [extensive documentation](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.8). -``` +- [Haystack's documentation](https://docs.haystack.deepset.ai/docs/kubernetes) describes [how to deploy the Hayhooks service in a Kubernetes +environment](https://docs.haystack.deepset.ai/docs/kubernetes), so you can easily move it to your own OpenShift infrastructure. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +- If you are just getting started and need more guidance on Qdrant, read the [quickstart](/documentation/quick-start/) or try out our [beginner tutorial](/documentation/tutorials/neural-search/). 
-var client = new QdrantClient("localhost", 6334); +<|page-163-lllmstxt|> +# Implement custom connector for Cohere RAG -await client.OverwritePayloadAsync( - collectionName: "{collection_name}", - payload: new Dictionary { { "property1", "string" }, { "property2", "string" } }, - ids: new ulong[] { 0, 3, 10 } -); +| Time: 45 min | Level: Intermediate | | | +|--------------|---------------------|-|----| -``` +The usual approach to implementing Retrieval Augmented Generation requires users to build their prompts with the +relevant context the LLM may rely on, and manually sending them to the model. Cohere is quite unique here, as their +models can now speak to the external tools and extract meaningful data on their own. You can virtually connect any data +source and let the Cohere LLM know how to access it. Obviously, vector search goes well with LLMs, and enabling semantic +search over your data is a typical case. -```go -import ( - "context" +Cohere RAG has lots of interesting features, such as inline citations, which help you to refer to the specific parts of +the documents used to generate the response. - "github.com/qdrant/go-client/qdrant" -) +![Cohere RAG citations](/documentation/tutorials/cohere-rag-connector/cohere-rag-citations.png) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +*Source: https://docs.cohere.com/docs/retrieval-augmented-generation-rag* -client.OverwritePayload(context.Background(), &qdrant.SetPayloadPoints{ - CollectionName: "{collection_name}", - Payload: qdrant.NewValueMap( - map[string]any{"property1": "string", "property2": "string"}), - PointsSelector: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), - qdrant.NewIDNum(3)), -}) +The connectors have to implement a specific interface and expose the data source as HTTP REST API. Cohere documentation +[describes a general process of creating a connector](https://docs.cohere.com/v1/docs/creating-and-deploying-a-connector). +This tutorial guides you step by step on building such a service around Qdrant. -``` +## Qdrant connector -Like [set payload](https://qdrant.tech/documentation/concepts/payload/#set-payload), you don’t need to know the ids of the points -you want to modify. The alternative is to use filters. +You probably already have some collections you would like to bring to the LLM. Maybe your pipeline was set up using some +of the popular libraries such as Langchain, Llama Index, or Haystack. Cohere connectors may implement even more complex +logic, e.g. hybrid search. In our case, we are going to start with a fresh Qdrant collection, index data using Cohere +Embed v3, build the connector, and finally connect it with the [Command-R model](https://txt.cohere.com/command-r/). -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#clear-payload) Clear payload +### Building the collection -This method removes all payload keys from specified points +First things first, let's build a collection and configure it for the Cohere `embed-multilingual-v3.0` model. It +produces 1024-dimensional embeddings, and we can choose any of the distance metrics available in Qdrant. Our connector +will act as a personal assistant of a software engineer, and it will expose our notes to suggest the priorities or +actions to perform. 
-REST API ( [Schema](https://api.qdrant.tech/api-reference/points/clear-payload)): +```python +from qdrant_client import QdrantClient, models -httppythontypescriptrustjavacsharpgo +client = QdrantClient( + "https://my-cluster.cloud.qdrant.io:6333", + api_key="my-api-key", +) +client.create_collection( + collection_name="personal-notes", + vectors_config=models.VectorParams( + size=1024, + distance=models.Distance.DOT, + ), +) +``` -```http -POST /collections/{collection_name}/points/payload/clear -{ - "points": [0, 3, 100] -} +Our notes will be represented as simple JSON objects with a `title` and `text` of the specific note. The embeddings will +be created from the `text` field only. +```python +notes = [ + { + "title": "Project Alpha Review", + "text": "Review the current progress of Project Alpha, focusing on the integration of the new API. Check for any compatibility issues with the existing system and document the steps needed to resolve them. Schedule a meeting with the development team to discuss the timeline and any potential roadblocks." + }, + { + "title": "Learning Path Update", + "text": "Update the learning path document with the latest courses on React and Node.js from Pluralsight. Schedule at least 2 hours weekly to dedicate to these courses. Aim to complete the React course by the end of the month and the Node.js course by mid-next month." + }, + { + "title": "Weekly Team Meeting Agenda", + "text": "Prepare the agenda for the weekly team meeting. Include the following topics: project updates, review of the sprint backlog, discussion on the new feature requests, and a brainstorming session for improving remote work practices. Send out the agenda and the Zoom link by Thursday afternoon." + }, + { + "title": "Code Review Process Improvement", + "text": "Analyze the current code review process to identify inefficiencies. Consider adopting a new tool that integrates with our version control system. Explore options such as GitHub Actions for automating parts of the process. Draft a proposal with recommendations and share it with the team for feedback." + }, + { + "title": "Cloud Migration Strategy", + "text": "Draft a plan for migrating our current on-premise infrastructure to the cloud. The plan should cover the selection of a cloud provider, cost analysis, and a phased migration approach. Identify critical applications for the first phase and any potential risks or challenges. Schedule a meeting with the IT department to discuss the plan." + }, + { + "title": "Quarterly Goals Review", + "text": "Review the progress towards the quarterly goals. Update the documentation to reflect any completed objectives and outline steps for any remaining goals. Schedule individual meetings with team members to discuss their contributions and any support they might need to achieve their targets." + }, + { + "title": "Personal Development Plan", + "text": "Reflect on the past quarter's achievements and areas for improvement. Update the personal development plan to include new technical skills to learn, certifications to pursue, and networking events to attend. Set realistic timelines and check-in points to monitor progress." + }, + { + "title": "End-of-Year Performance Reviews", + "text": "Start preparing for the end-of-year performance reviews. Collect feedback from peers and managers, review project contributions, and document achievements. Consider areas for improvement and set goals for the next year. Schedule preliminary discussions with each team member to gather their self-assessments." 
+ }, + { + "title": "Technology Stack Evaluation", + "text": "Conduct an evaluation of our current technology stack to identify any outdated technologies or tools that could be replaced for better performance and productivity. Research emerging technologies that might benefit our projects. Prepare a report with findings and recommendations to present to the management team." + }, + { + "title": "Team Building Event Planning", + "text": "Plan a team-building event for the next quarter. Consider activities that can be done remotely, such as virtual escape rooms or online game nights. Survey the team for their preferences and availability. Draft a budget proposal for the event and submit it for approval." + } +] ``` +Storing the embeddings along with the metadata is fairly simple. + ```python -client.clear_payload( - collection_name="{collection_name}", - points_selector=[0, 3, 100], +import cohere +import uuid + +cohere_client = cohere.Client(api_key="my-cohere-api-key") + +response = cohere_client.embed( + texts=[ + note.get("text") + for note in notes + ], + model="embed-multilingual-v3.0", + input_type="search_document", ) +client.upload_points( + collection_name="personal-notes", + points=[ + models.PointStruct( + id=uuid.uuid4().hex, + vector=embedding, + payload=note, + ) + for note, embedding in zip(notes, response.embeddings) + ] +) ``` -```typescript -client.clearPayload("{collection_name}", { - points: [0, 3, 100], -}); +Our collection is now ready to be searched over. In the real world, the set of notes would be changing over time, so the +ingestion process won't be as straightforward. This data is not yet exposed to the LLM, but we will build the connector +in the next step. -``` +### Connector web service -```rust -use qdrant_client::qdrant::{ClearPayloadPointsBuilder, PointsIdsList}; +[FastAPI](https://fastapi.tiangolo.com/) is a modern web framework and perfect a choice for a simple HTTP API. We are +going to use it for the purposes of our connector. There will be just one endpoint, as required by the model. It will +accept POST requests at the `/search` path. There is a single `query` parameter required. Let's define a corresponding +model. -client - .clear_payload( - ClearPayloadPointsBuilder::new("{collection_name}") - .points(PointsIdsList { - ids: vec![0.into(), 3.into(), 10.into()], - }) - .wait(true), - ) - .await?; +```python +from pydantic import BaseModel +class SearchQuery(BaseModel): + query: str ``` -```java -import java.util.List; +RAG connector does not have to return the documents in any specific format. There are [some good practices to follow](https://docs.cohere.com/v1/docs/creating-and-deploying-a-connector#configure-the-connection-between-the-connector-and-the-chat-api), +but Cohere models are quite flexible here. Results just have to be returned as JSON, with a list of objects in a +`results` property of the output. We will use the same document structure as we did for the Qdrant payloads, so there +is no conversion required. That requires two additional models to be created. -import static io.qdrant.client.PointIdFactory.id; +```python +from typing import List -client - .clearPayloadAsync("{collection_name}", List.of(id(0), id(3), id(100)), true, null, null) - .get(); +class Document(BaseModel): + title: str + text: str +class SearchResults(BaseModel): + results: List[Document] ``` -```csharp -using Qdrant.Client; +Once our model classes are ready, we can implement the logic that will get the query and provide the notes that are +relevant to it. 
Please note the LLM is not going to define the number of documents to be returned. That's completely +up to you how many of them you want to bring to the context. -var client = new QdrantClient("localhost", 6334); +There are two services we need to interact with - Qdrant server and Cohere API. FastAPI has a concept of a [dependency +injection](https://fastapi.tiangolo.com/tutorial/dependencies/#dependencies), and we will use it to provide both +clients into the implementation. -await client.ClearPayloadAsync(collectionName: "{collection_name}", ids: new ulong[] { 0, 3, 100 }); +In case of queries, we need to set the `input_type` to `search_query` in the calls to Cohere API. -``` +```python +from fastapi import FastAPI, Depends +from typing import Annotated -```go -import ( - "context" +app = FastAPI() - "github.com/qdrant/go-client/qdrant" -) +def client() -> QdrantClient: + return QdrantClient(config.QDRANT_URL, api_key=config.QDRANT_API_KEY) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +def cohere_client() -> cohere.Client: + return cohere.Client(api_key=config.COHERE_API_KEY) -client.ClearPayload(context.Background(), &qdrant.ClearPayloadPoints{ - CollectionName: "{collection_name}", - Points: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), - qdrant.NewIDNum(3)), -}) +@app.post("/search") +def search( + query: SearchQuery, + client: Annotated[QdrantClient, Depends(client)], + cohere_client: Annotated[cohere.Client, Depends(cohere_client)], +) -> SearchResults: + response = cohere_client.embed( + texts=[query.query], + model="embed-multilingual-v3.0", + input_type="search_query", + ) + results = client.query_points( + collection_name="personal-notes", + query=response.embeddings[0], + limit=2, + ).points + return SearchResults( + results=[ + Document(**point.payload) + for point in results + ] + ) +``` + +Our app might be launched locally for the development purposes, given we have the `uvicorn` server installed: +```shell +uvicorn main:app ``` -### [Anchor](https://qdrant.tech/documentation/concepts/payload/\#delete-payload-keys) Delete payload keys +FastAPI exposes an interactive documentation at `http://localhost:8000/docs`, where we can test our endpoint. The +`/search` endpoint is available there. + +![FastAPI documentation](/documentation/tutorials/cohere-rag-connector/fastapi-openapi.png) -Delete specific payload keys from points. +We can interact with it and check the documents that will be returned for a specific query. For example, we want to know +recall what we are supposed to do regarding the infrastructure for your projects. -REST API ( [Schema](https://api.qdrant.tech/api-reference/points/delete-payload)): +```shell +curl -X "POST" \ + -H "Content-type: application/json" \ + -d '{"query": "Is there anything I have to do regarding the project infrastructure?"}' \ + "http://localhost:8000/search" +``` -httppythontypescriptrustjavacsharpgo +The output should look like following: -```http -POST /collections/{collection_name}/points/payload/delete +```json { - "keys": ["color", "price"], - "points": [0, 3, 100] + "results": [ + { + "title": "Cloud Migration Strategy", + "text": "Draft a plan for migrating our current on-premise infrastructure to the cloud. The plan should cover the selection of a cloud provider, cost analysis, and a phased migration approach. Identify critical applications for the first phase and any potential risks or challenges. Schedule a meeting with the IT department to discuss the plan." 
+ }, + { + "title": "Project Alpha Review", + "text": "Review the current progress of Project Alpha, focusing on the integration of the new API. Check for any compatibility issues with the existing system and document the steps needed to resolve them. Schedule a meeting with the development team to discuss the timeline and any potential roadblocks." + } + ] } - ``` +### Connecting to Command-R + +Our web service is implemented, yet running only on our local machine. It has to be exposed to the public before +Command-R can interact with it. For a quick experiment, it might be enough to set up tunneling using services such as +[ngrok](https://ngrok.com/). We won't cover all the details in the tutorial, but their +[Quickstart](https://ngrok.com/docs/guides/getting-started/) is a great resource describing the process step-by-step. +Alternatively, you can also deploy the service with a public URL. + +Once it's done, we can create the connector first, and then tell the model to use it, while interacting through the chat +API. Creating a connector is a single call to Cohere client: + ```python -client.delete_payload( - collection_name="{collection_name}", - keys=["color", "price"], - points=[0, 3, 100], +connector_response = cohere_client.connectors.create( + name="personal-notes", + url="https:/this-is-my-domain.app/search", ) +``` + +The `connector_response.connector` will be a descriptor, with `id` being one of the attributes. We'll use this +identifier for our interactions like this: +```python +response = cohere_client.chat( + message=( + "Is there anything I have to do regarding the project infrastructure? " + "Please mention the tasks briefly." + ), + connectors=[ + cohere.ChatConnector(id=connector_response.connector.id) + ], + model="command-r", +) ``` -```typescript -client.deletePayload("{collection_name}", { - keys: ["color", "price"], - points: [0, 3, 100], -}); +We changed the `model` to `command-r`, as this is currently the best Cohere model available to public. The +`response.text` is the output of the model: +```text +Here are some of the tasks related to project infrastructure that you might have to perform: +- You need to draft a plan for migrating your on-premise infrastructure to the cloud and come up with a plan for the selection of a cloud provider, cost analysis, and a gradual migration approach. +- It's important to evaluate your current technology stack to identify any outdated technologies. You should also research emerging technologies and the benefits they could bring to your projects. ``` -```rust -use qdrant_client::qdrant::{DeletePayloadPointsBuilder, PointsIdsList}; +You only need to create a specific connector once! Please do not call `cohere_client.connectors.create` for every single +message you send to the `chat` method. -client - .delete_payload( - DeletePayloadPointsBuilder::new( - "{collection_name}", - vec!["color".to_string(), "price".to_string()], - ) - .points_selector(PointsIdsList { - ids: vec![0.into(), 3.into(), 10.into()], - }) - .wait(true), - ) - .await?; +## Wrapping up -``` +We have built a Cohere RAG connector that integrates with your existing knowledge base stored in Qdrant. We covered just +the basic flow, but in real world scenarios, you should also consider e.g. [building the authentication +system](https://docs.cohere.com/docs/connector-authentication) to prevent unauthorized access. 
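+
+As a starting point, here is a minimal sketch of such a check for the FastAPI service built above. It assumes a
+hypothetical `CONNECTOR_API_KEY` environment variable that holds the bearer token you register with Cohere, so adapt it
+to the authentication scheme you actually use:
+
+```python
+import os
+
+from fastapi import Header, HTTPException
+
+
+def verify_bearer_token(authorization: str = Header(None)):
+    # CONNECTOR_API_KEY is an example name for wherever you keep the shared secret
+    expected = f"Bearer {os.environ['CONNECTOR_API_KEY']}"
+    if authorization != expected:
+        raise HTTPException(status_code=401, detail="Unauthorized")
+```
+
+The check can then be attached to the existing endpoint by adding `dependencies=[Depends(verify_bearer_token)]` to the
+`@app.post("/search")` decorator.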
-```java -import java.util.List; +<|page-164-lllmstxt|> +## How to Send Your Data to a Qdrant Cluster -import static io.qdrant.client.PointIdFactory.id; +The following examples show you some of the many ways you can send data to a Qdrant cluster from different sources. -client - .deletePayloadAsync( - "{collection_name}", - List.of("color", "price"), - List.of(id(0), id(3), id(100)), - true, - null, - null) - .get(); +If you want to migrate data from another Qdrant instance or vector database like Pinecone, Weaviate or Milvus see our [Migration Guide](/documentation/database-tutorials/migration/) for more information. -``` +| Example | Description | Stack | +|---------------------------------------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------| +| [Stream Data to Qdrant with Kafka](/documentation/send-data/data-streaming-kafka-qdrant/) | Use Confluent to Stream Data to Qdrant via Managed Kafka. | Qdrant, Kafka | +| [Qdrant on Databricks](/documentation/send-data/databricks/) | Learn how to use Qdrant on Databricks using the Spark connector | Qdrant, Databricks, Apache Spark | +| [Qdrant with Airflow and Astronomer](/documentation/send-data/qdrant-airflow-astronomer/) | Build a semantic querying system using Airflow and Astronomer | Qdrant, Airflow, Astronomer | -```csharp -using Qdrant.Client; +<|page-165-lllmstxt|> +# Examples -var client = new QdrantClient("localhost", 6334); +| End-to-End Code Samples | Description | Stack | +|---------------------------------------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------| +| [Multitenancy with LlamaIndex](/documentation/examples/llama-index-multitenancy/) | Handle data coming from multiple users in LlamaIndex. 
| Qdrant, Python, LlamaIndex | +| [Implement custom connector for Cohere RAG](/documentation/examples/cohere-rag-connector/) | Bring data stored in Qdrant to Cohere RAG | Qdrant, Cohere, FastAPI | +| [Chatbot for Interactive Learning](/documentation/examples/rag-chatbot-red-hat-openshift-haystack/) | Build a Private RAG Chatbot for Interactive Learning | Qdrant, Haystack, OpenShift | +| [Information Extraction Engine](/documentation/examples/rag-chatbot-vultr-dspy-ollama/) | Build a Private RAG Information Extraction Engine | Qdrant, Vultr, DSPy, Ollama | +| [System for Employee Onboarding](/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/) | Build a RAG System for Employee Onboarding | Qdrant, Cohere, LangChain | +| [System for Contract Management](/documentation/examples/rag-contract-management-stackit-aleph-alpha/) | Build a Region-Specific RAG System for Contract Management | Qdrant, Aleph Alpha, STACKIT | +| [Question-Answering System for Customer Support](/documentation/examples/rag-customer-support-cohere-airbyte-aws/) | Build a RAG System for AI Customer Support | Qdrant, Cohere, Airbyte, AWS | +| [Hybrid Search on PDF Documents](/documentation/examples/hybrid-search-llamaindex-jinaai/) | Develop a Hybrid Search System for Product PDF Manuals | Qdrant, LlamaIndex, Jina AI +| [Blog-Reading RAG Chatbot](/documentation/examples/rag-chatbot-scaleway/) | Develop a RAG-based Chatbot on Scaleway and with LangChain | Qdrant, LangChain, GPT-4o +| [Movie Recommendation System](/documentation/examples/recommendation-system-ovhcloud/) | Build a Movie Recommendation System with LlamaIndex and With JinaAI | Qdrant | +| [GraphRAG Agent](/documentation/examples/graphrag-qdrant-neo4j/) | Build a GraphRAG Agent with Neo4J and Qdrant | Qdrant, Neo4j | +| [Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy](/documentation/examples/Qdrant-DSPy-medicalbot/) | How to build a medical chatbot grounded in medical literature with Qdrant and DSPy. | Qdrant, DSPy | -await client.DeletePayloadAsync( - collectionName: "{collection_name}", - keys: ["color", "price"], - ids: new ulong[] { 0, 3, 100 } -); -``` -```go -import ( - "context" +## Notebooks - "github.com/qdrant/go-client/qdrant" -) +Our Notebooks offer complex instructions that are supported with a throrough explanation. Follow along by trying out the code and get the most out of each example. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +| Example | Description | Stack | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|----------------------------| +| [Intro to Semantic Search and Recommendations Systems](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_getting_started/getting_started.ipynb) | Learn how to get started building semantic search and recommendation systems. | Qdrant | +| [Search and Recommend Newspaper Articles](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_text_data/qdrant_and_text_data.ipynb) | Work with text data to develop a semantic search and a recommendation engine for news articles. 
| Qdrant | +| [Recommendation System for Songs](https://githubtocolab.com/qdrant/examples/blob/master/qdrant_101_audio_data/03_qdrant_101_audio.ipynb) | Use Qdrant to develop a music recommendation engine based on audio embeddings. | Qdrant | +| [Image Comparison System for Skin Conditions](https://colab.research.google.com/github/qdrant/examples/blob/master/qdrant_101_image_data/04_qdrant_101_cv.ipynb) | Use Qdrant to compare challenging images with labels representing different skin diseases. | Qdrant | +| [Question and Answer System with LlamaIndex](https://github.com/qdrant/examples/blob/949669f001a03131afebf2ecd1e0ce63cab01c81/llama_index_recency/Qdrant%20and%20LlamaIndex%20%E2%80%94%20A%20new%20way%20to%20keep%20your%20Q%26A%20systems%20up-to-date.ipynb) | Combine Qdrant and LlamaIndex to create a self-updating Q&A system. | Qdrant, LlamaIndex, Cohere | +| [Extractive QA System](https://githubtocolab.com/qdrant/examples/blob/master/extractive_qa/extractive-question-answering.ipynb) | Extract answers directly from context to generate highly relevant answers. | Qdrant | +| [Ecommerce Reverse Image Search](https://githubtocolab.com/qdrant/examples/blob/master/ecommerce_reverse_image_search/ecommerce-reverse-image-search.ipynb) | Accept images as search queries to receive semantically appropriate answers. | Qdrant | +| [Basic RAG](https://githubtocolab.com/qdrant/examples/blob/master/rag-openai-qdrant/rag-openai-qdrant.ipynb) | Basic RAG pipeline with Qdrant and OpenAI SDKs. | OpenAI, Qdrant, FastEmbed | -client.DeletePayload(context.Background(), &qdrant.DeletePayloadPoints{ - CollectionName: "{collection_name}", - Keys: []string{"color", "price"}, - PointsSelector: qdrant.NewPointsSelector( - qdrant.NewIDNum(0), - qdrant.NewIDNum(3)), -}) +<|page-166-lllmstxt|> +# Question-Answering System for AI Customer Support -``` +| Time: 120 min | Level: Advanced | | +| --- | ----------- | ----------- |----------- | -Alternatively, you can use filters to delete payload keys from the points. +Maintaining top-notch customer service is vital to business success. As your operation expands, so does the influx of customer queries. Many of these queries are repetitive, making automation a time-saving solution. +Your support team's expertise is typically kept private, but you can still use AI to automate responses securely. -httppythontypescriptrustjavacsharpgo +In this tutorial we will setup a private AI service that answers customer support queries with high accuracy and effectiveness. By leveraging Cohere's powerful models (deployed to [AWS](https://cohere.com/deployment-options/aws)) with Qdrant Hybrid Cloud, you can create a fully private customer support system. Data synchronization, facilitated by [Airbyte](https://airbyte.com/), will complete the setup. -```http -POST /collections/{collection_name}/points/payload/delete -{ - "keys": ["color", "price"], - "filter": { - "must": [\ - {\ - "key": "color",\ - "match": {\ - "value": "red"\ - }\ - }\ - ] - } -} +![Architecture diagram](/documentation/examples/customer-support-cohere-airbyte/architecture-diagram.png) -``` +## System design -```python -client.delete_payload( - collection_name="{collection_name}", - keys=["color", "price"], - points=models.Filter( - must=[\ - models.FieldCondition(\ - key="color",\ - match=models.MatchValue(value="red"),\ - ),\ - ], - ), -) +The history of past interactions with your customers is not a static dataset. It is constantly evolving, as new +questions are coming in. 
You probably have a ticketing system that stores all the interactions, or use a different way
+to communicate with your customers. No matter what the communication channel is, you need to bring the correct answers
+to the selected Large Language Model, and have an established way to do it in a continuous manner. Thus, we will build
+an ingestion pipeline and then a Retrieval Augmented Generation application that will use the data.
+
+- **Dataset:** a [set of Frequently Asked Questions from Qdrant
+  users](/documentation/faq/qdrant-fundamentals/) as an incrementally updated Excel sheet
+- **Embedding model:** Cohere `embed-multilingual-v3.0`, to support different languages with the same pipeline
+- **Knowledge base:** Qdrant, running in Hybrid Cloud mode
+- **Ingestion pipeline:** [Airbyte](https://airbyte.com/), loading the data into Qdrant
+- **Large Language Model:** Cohere [Command-R](https://docs.cohere.com/docs/command-r)
+- **RAG:** Cohere [RAG](https://docs.cohere.com/docs/retrieval-augmented-generation-rag) using our knowledge base
+  through a custom connector
+
+All the selected components are compatible with the [AWS](https://aws.amazon.com/) infrastructure. Thanks to Cohere models' availability, you can build a fully private customer support system that completely isolates data within your infrastructure. Also, if you have AWS credits, you can now use them without spending additional money on the models or
+semantic search layer.
+
+### Data ingestion
+
+Building a RAG starts with a well-curated dataset. In your specific case you may prefer loading the data directly from
+a ticketing system, such as [Zendesk Support](https://airbyte.com/connectors/zendesk-support),
+[Freshdesk](https://airbyte.com/connectors/freshdesk), or maybe integrate it with a shared inbox. However, when it comes to
+customer questions, quality over quantity is key. There should be a conscious decision on what data to include in the
+knowledge base, so we do not confuse the model with possibly irrelevant information. We'll assume there is an [Excel
+sheet](https://docs.airbyte.com/integrations/sources/file) available over HTTP/FTP that Airbyte can access and load into
+Qdrant in an incremental manner.
+
+### Cohere <> Qdrant Connector for RAG
+
+Cohere RAG relies on [connectors](https://docs.cohere.com/docs/connectors) which bring additional context to the model.
+The connector is a web service that implements a specific interface, and exposes its data through an HTTP API. With that
+setup, the Large Language Model becomes responsible for communicating with the connectors, so building a prompt with the
+context is not needed anymore.
+
+### Answering bot
+
+Finally, we want to automate the responses and send them out when we are sure that the model is confident
+enough. 
Again, the way such an application should be created strongly depends on the system you are using within the +customer support team. If it exposes a way to set up a webhook whenever a new question is coming in, you can create a +web service and use it to automate the responses. In general, our bot should be created specifically for the platform +you use, so we'll just cover the general idea here and build a simple CLI tool. -client - .deletePayloadAsync( - "{collection_name}", - List.of("color", "price"), - Filter.newBuilder().addMust(matchKeyword("color", "red")).build(), - true, - null, - null) - .get(); +## Prerequisites -``` +### Cohere models on AWS -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +One of the possible ways to deploy Cohere models on AWS is to use AWS SageMaker. Cohere's website has [a detailed +guide on how to deploy the models in that way](https://docs.cohere.com/docs/amazon-sagemaker-setup-guide), so you can +follow the steps described there to set up your own instance. -var client = new QdrantClient("localhost", 6334); +### Qdrant Hybrid Cloud on AWS -await client.DeletePayloadAsync( - collectionName: "{collection_name}", - keys: ["color", "price"], - filter: MatchKeyword("color", "red") -); +Our documentation covers the deployment of Qdrant on AWS as a Hybrid Cloud Environment, so you can follow the steps described +there to set up your own instance. The deployment process is quite straightforward, and you can have your Qdrant cluster +up and running in a few minutes. + +[//]: # (TODO: refer to the documentation on how to deploy Qdrant on AWS) +Once you perform all the steps, your Qdrant cluster should be running on a specific URL. You will need this URL and the +API key to interact with Qdrant, so let's store them both in the environment variables: + +```shell +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" ``` -```go -import ( - "context" +```python +import os - "github.com/qdrant/go-client/qdrant" -) +os.environ["QDRANT_URL"] = "https://qdrant.example.com" +os.environ["QDRANT_API_KEY"] = "your-api-key" +``` -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +### Airbyte Open Source -client.DeletePayload(context.Background(), &qdrant.DeletePayloadPoints{ - CollectionName: "{collection_name}", - Keys: []string{"color", "price"}, - PointsSelector: qdrant.NewPointsSelectorFilter( - &qdrant.Filter{ - Must: []*qdrant.Condition{qdrant.NewMatch("color", "red")}, - }, - ), -}) +Airbyte is an open-source data integration platform that helps you replicate your data in your warehouses, lakes, and +databases. You can install it on your infrastructure and use it to load the data into Qdrant. The installation process is described in the [official documentation](https://docs.airbyte.com/deploying-airbyte/). +Please follow the instructions to set up your own instance. -``` +#### Setting up the connection -## [Anchor](https://qdrant.tech/documentation/concepts/payload/\#payload-indexing) Payload indexing +Once you have an Airbyte up and running, you can configure the connection to load the data from the respective source +into Qdrant. The configuration will require setting up the source and destination connectors. In this tutorial we will +use the following connectors: -To search more efficiently with filters, Qdrant allows you to create indexes for payload fields by specifying the name and type of field it is intended to be. 
+- **Source:** [File](https://docs.airbyte.com/integrations/sources/file) to load the data from an Excel sheet +- **Destination:** [Qdrant](https://docs.airbyte.com/integrations/destinations/qdrant) to load the data into Qdrant -The indexed fields also affect the vector index. See [Indexing](https://qdrant.tech/documentation/concepts/indexing/) for details. +Airbyte UI will guide you through the process of setting up the source and destination and connecting them. Here is how +the configuration of the source might look like: -In practice, we recommend creating an index on those fields that could potentially constrain the results the most. -For example, using an index for the object ID will be much more efficient, being unique for each record, than an index by its color, which has only a few possible values. +![Airbyte source configuration](/documentation/examples/customer-support-cohere-airbyte/airbyte-excel-source.png) -In compound queries involving multiple fields, Qdrant will attempt to use the most restrictive index first. +Qdrant is our target destination, so we need to set up the connection to it. We need to specify which fields should be +included to generate the embeddings. In our case it makes complete sense to embed just the questions, as we are going +to look for similar questions asked in the past and provide the answers. -To create index for the field, you can use the following: +![Airbyte destination configuration](/documentation/examples/customer-support-cohere-airbyte/airbyte-qdrant-destination.png) -REST API ( [Schema](https://api.qdrant.tech/api-reference/indexes/create-field-index)) +Once we have the destination set up, we can finally configure a connection. The connection will define the schedule +of the data synchronization. -httppythontypescriptrustjavacsharpgo +![Airbyte connection configuration](/documentation/examples/customer-support-cohere-airbyte/airbyte-connection.png) -```http -PUT /collections/{collection_name}/index -{ - "field_name": "name_of_the_field_to_index", - "field_schema": "keyword" -} +Airbyte should now be ready to accept any data updates from the source and load them into Qdrant. You can monitor the +progress of the synchronization in the UI. -``` +## RAG connector -```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="name_of_the_field_to_index", - field_schema="keyword", -) +One of our previous tutorials, guides you step-by-step on [implementing custom connector for Cohere +RAG](documentation/examples/cohere-rag-connector/) with Cohere Embed v3 and Qdrant. You can just point it to use your Hybrid Cloud +Qdrant instance running on AWS. Created connector might be deployed to Amazon Web Services in various ways, even in a +[Serverless](https://aws.amazon.com/serverless/) manner using [AWS +Lambda](https://aws.amazon.com/lambda/?c=ser&sec=srv). -``` +In general, RAG connector has to expose a single endpoint that will accept POST requests with `query` parameter and +return the matching documents as JSON document with a specific structure. Our FastAPI implementation created [in the +related tutorial](documentation/examples/cohere-rag-connector/) is a perfect fit for this task. The only difference is that you +should point it to the Cohere models and Qdrant running on AWS infrastructure. 
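+
+For reference, the contract is small: the connector accepts a POST request whose JSON body contains a single `query`
+field, and it replies with a JSON object holding a `results` list of documents. A quick smoke test against the deployed
+service might look like this (the URL and the question are just placeholders):
+
+```shell
+curl -X POST "https://rag-connector.example.com/search" \
+    -H "Content-Type: application/json" \
+    -d '{"query": "How do we back up the Qdrant collections?"}'
+```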
-```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "name_of_the_field_to_index", - field_schema: "keyword", -}); +> Our connector is a lightweight web service that exposes a single endpoint and glues the Cohere embedding model with +> our Qdrant Hybrid Cloud instance. Thus, it perfectly fits the serverless architecture, requiring no additional +> infrastructure to run. -``` +You can also run the connector as another service within your [Kubernetes cluster running on AWS +(EKS)](https://aws.amazon.com/eks/), or by launching an [EC2](https://aws.amazon.com/ec2/) compute instance. This step +is dependent on the way you deploy your other services, so we'll leave it to you to decide how to run the connector. -```rust -use qdrant_client::qdrant::{CreateFieldIndexCollectionBuilder, FieldType}; +Eventually, the web service should be available under a specific URL, and it's a good practice to store it in the +environment variable, so the other services can easily access it. -client - .create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "name_of_the_field_to_index", - FieldType::Keyword, - ) - .wait(true), - ) - .await?; +```shell +export RAG_CONNECTOR_URL="https://rag-connector.example.com/search" +``` +```python +os.environ["RAG_CONNECTOR_URL"] = "https://rag-connector.example.com/search" ``` -```java -import io.qdrant.client.grpc.Collections.PayloadSchemaType; +## Customer interface -client.createPayloadIndexAsync( - "{collection_name}", - "name_of_the_field_to_index", - PayloadSchemaType.Keyword, - null, - true, - null, - null); +At this part we have all the data loaded into Qdrant, and the RAG connector is ready to serve the relevant context. The +last missing piece is the customer interface, that will call the Command model to create the answer. Such a system +should be built specifically for the platform you use and integrated into its workflow, but we will build the strong +foundation for it and show how to use it in a simple CLI tool. + +> Our application does not have to connect to Qdrant anymore, as the model will connect to the RAG connector directly. + +First of all, we have to create a connection to Cohere services through the Cohere SDK. + +```python +import cohere +# Create a Cohere client pointing to the AWS instance +cohere_client = cohere.Client(...) ``` -```csharp -using Qdrant.Client; +Next, our connector should be registered. **Please make sure to do it once, and store the id of the connector in the +environment variable or in any other way that will be accessible to the application.** -var client = new QdrantClient("localhost", 6334); +```python +import os -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "name_of_the_field_to_index" -); +connector_response = cohere_client.connectors.create( + name="customer-support", + url=os.environ["RAG_CONNECTOR_URL"], +) +# The id returned by the API should be stored for future use +connector_id = connector_response.connector.id ``` -```go -import ( - "context" +Finally, we can create a prompt and get the answer from the model. Additionally, we define which of the connectors +should be used to provide the context, as we may have multiple connectors and want to use specific ones, depending on +some conditions. Let's start with asking a question. - "github.com/qdrant/go-client/qdrant" -) +```python +query = "Why Qdrant does not return my vectors?" 
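+# An FAQ-style question, similar to the entries loaded into Qdrant through Airbyte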
+``` -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Now we can send the query to the model, get the response, and possibly send it back to the customer. -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "name_of_the_field_to_index", - FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), -}) +```python +response = cohere_client.chat( + message=query, + connectors=[ + cohere.ChatConnector(id=connector_id), + ], + model="command-r", +) +print(response.text) ``` -The index usage flag is displayed in the payload schema with the [collection info API](https://api.qdrant.tech/api-reference/collections/get-collection). +The output should be the answer to the question, generated by the model, for example: -Payload schema example: +> Qdrant is set up by default to minimize network traffic and therefore doesn't return vectors in search results. However, you can make Qdrant return your vectors by setting the 'with_vector' parameter of the Search/Scroll function to true. -```json -{ - "payload_schema": { - "property1": { - "data_type": "keyword" - }, - "property2": { - "data_type": "integer" - } - } -} +Customer support should not be fully automated, as some completely new issues might require human intervention. We +should play with prompt engineering and expect the model to provide the answer with a certain confidence level. If the +confidence is too low, we should not send the answer automatically but present it to the support team for review. -``` +## Wrapping up -## [Anchor](https://qdrant.tech/documentation/concepts/payload/\#facet-counts) Facet counts +This tutorial shows how to build a fully private customer support system using Cohere models, Qdrant Hybrid Cloud, and +Airbyte, which runs on AWS infrastructure. You can ensure your data does not leave your premises and focus on providing +the best customer support experience without bothering your team with repetitive tasks. -_Available as of v1.12.0_ +<|page-167-lllmstxt|> +# Chat With Product PDF Manuals Using Hybrid Search -Faceting is a special counting technique that can be used for various purposes: +| Time: 120 min | Level: Advanced | Output: [GitHub](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-DO-LlamaIndex-Jina-v2.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/infoslack/qdrant-example/blob/main/HC-demo/HC-DO-LlamaIndex-Jina-v2.ipynb) | +| --- | ----------- | ----------- |----------- | -- Know which unique values exist for a payload key. -- Know the number of points that contain each unique value. -- Know how restrictive a filter would become by matching a specific value. +With the proliferation of digital manuals and the increasing demand for quick and accurate customer support, having a chatbot capable of efficiently parsing through complex PDF documents and delivering precise information can be a game-changer for any business. -Specifically, it is a counting aggregation for the values in a field, akin to a `GROUP BY` with `COUNT(*)` commands in SQL. +In this tutorial, we'll walk you through the process of building a RAG-based chatbot, designed specifically to assist users with understanding the operation of various household appliances. +We'll cover the essential steps required to build your system, including data ingestion, natural language understanding, and response generation for customer support use cases. 
-These results for a specific field is called a “facet”. For example, when you look at an e-commerce search results page, you might see a list of brands on the sidebar, showing the number of products for each brand. This would be a facet for a `"brand"` field. +## Components -To get the facet counts for a field, you can use the following: +- **Embeddings:** Jina Embeddings, served via the [Jina Embeddings API](https://jina.ai/embeddings/#apiform) +- **Database:** [Qdrant Hybrid Cloud](/documentation/hybrid-cloud/), deployed in a managed Kubernetes cluster on [DigitalOcean + (DOKS)](https://www.digitalocean.com/products/kubernetes) +- **LLM:** [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) language model on HuggingFace +- **Framework:** [LlamaIndex](https://www.llamaindex.ai/) for extended RAG functionality and [Hybrid Search support](https://docs.llamaindex.ai/en/stable/examples/vector_stores/qdrant_hybrid/). +- **Parser:** [LlamaParse](https://github.com/run-llama/llama_parse) as a way to parse complex documents with embedded objects such as tables and figures. -REST API ( [Facet](https://api.qdrant.tech/v-1-13-x/api-reference/points/facet)) +![Architecture diagram](/documentation/examples/hybrid-search-llamaindex-jinaai/architecture-diagram.png) -httppythontypescriptrustjavacsharpgo +### Procedure -```http -POST /collections/{collection_name}/facet -{ - "key": "size", - "filter": { - "must": { - "key": "color", - "match": { "value": "red" } - } - } -} +Retrieval Augmented Generation (RAG) combines search with language generation. An external information retrieval system is used to identify documents likely to provide information relevant to the user's query. These documents, along with the user's request, are then passed on to a text-generating language model, producing a natural response. -``` +This method enables a language model to respond to questions and access information from a much larger set of documents than it could see otherwise. The language model only looks at a few relevant sections of the documents when generating responses, which also helps to reduce inexplicable errors. -```python -from qdrant_client import QdrantClient, models +## -client = QdrantClient(url="http://localhost:6333") +[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/), powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility. -client.facet( - collection_name="{collection_name}", - key="size", - facet_filter=models.Filter(must=[models.Match("color", "red")]), -) -``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +## Prerequisites -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### Deploying Qdrant Hybrid Cloud on DigitalOcean -client.facet("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, - key: "size", -}); +[DigitalOcean Kubernetes (DOKS)](https://www.digitalocean.com/products/kubernetes) is a managed Kubernetes service that lets you deploy Kubernetes clusters without the complexities of handling the control plane and containerized infrastructure. Clusters are compatible with standard Kubernetes toolchains and integrate natively with DigitalOcean Load Balancers and volumes. 
-``` +1. To start using managed Kubernetes on DigitalOcean, follow the [platform-specific documentation](/documentation/hybrid-cloud/platform-deployment-options/#digital-ocean). +2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](/documentation/hybrid-cloud/). +3. Once it's deployed, you should have a running Qdrant cluster with an API key. -```rust -use qdrant_client::qdrant::{Condition, FacetCountsBuilder, Filter}; -use qdrant_client::Qdrant; +### Development environment -let client = Qdrant::from_url("http://localhost:6334").build()?; +Then, install all dependencies: -client - .facet( - FacetCountsBuilder::new("{collection_name}", "size") - .limit(10) - .filter(Filter::must(vec![Condition::matches(\ - "color",\ - "red".to_string(),\ - )])), - ) - .await?; +```python +!pip install -U \ + llama-index \ + llama-parse \ + python-dotenv \ + llama-index-embeddings-jinaai \ + llama-index-llms-huggingface \ + llama-index-vector-stores-qdrant \ + "huggingface_hub[inference]" \ + datasets +``` +Set up secret key values on `.env` file: + +```bash +JINAAI_API_KEY +HF_INFERENCE_API_KEY +LLAMA_CLOUD_API_KEY +QDRANT_HOST +QDRANT_API_KEY ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +Load all environment variables: -import static io.qdrant.client.ConditionFactory.matchKeyword; -import io.qdrant.client.grpc.Points; -import io.qdrant.client.grpc.Filter; +```python +import os +from dotenv import load_dotenv +load_dotenv('./.env') +``` +## Implementation -QdrantClient client = new QdrantClient( - QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +### Connect Jina Embeddings and Mixtral LLM -client - .facetAsync( - Points.FacetCounts.newBuilder() - .setCollectionName(collection_name) - .setKey("size") - .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) - .build()) - .get(); +LlamaIndex provides built-in support for the [Jina Embeddings API](https://jina.ai/embeddings/#apiform). To use it, you need to initialize the `JinaEmbedding` object with your API Key and model name. -``` +For the LLM, you need wrap it in a subclass of `llama_index.llms.CustomLLM` to make it compatible with LlamaIndex. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +```python +# connect embeddings +from llama_index.embeddings.jinaai import JinaEmbedding -var client = new QdrantClient("localhost", 6334); +jina_embedding_model = JinaEmbedding( + model="jina-embeddings-v2-base-en", + api_key=os.getenv("JINAAI_API_KEY"), +) -await client.FacetAsync( - "{collection_name}", - key: "size", - filter: MatchKeyword("color", "red") -); +# connect LLM +from llama_index.llms.huggingface import HuggingFaceInferenceAPI +mixtral_llm = HuggingFaceInferenceAPI( + model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1", + token=os.getenv("HF_INFERENCE_API_KEY"), +) ``` -```go -import ( - "context" +### Prepare data for RAG - "github.com/qdrant/go-client/qdrant" -) +This example will use household appliance manuals, which are generally available as PDF documents. +LlamaPar +In the `data` folder, we have three documents, and we will use it to extract the textual content from the PDF and use it as a knowledge base in a simple RAG. 
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +The free LlamaIndex Cloud plan is sufficient for our example: -res, err := client.Facet(ctx, &qdrant.FacetCounts{ - CollectionName: "{collection_name}", - Key: "size", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, -}) +```python +import nest_asyncio +nest_asyncio.apply() +from llama_parse import LlamaParse +llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY") + +llama_parse_documents = LlamaParse(api_key=llamaparse_api_key, result_type="markdown").load_data([ + "data/DJ68-00682F_0.0.pdf", + "data/F500E_WF80F5E_03445F_EN.pdf", + "data/O_ME4000R_ME19R7041FS_AA_EN.pdf" +]) ``` -The response will contain the counts for each unique value in the field: +### Store data into Qdrant +The code below does the following: -```json -{ - "response": { - "hits": [\ - {"value": "L", "count": 19},\ - {"value": "S", "count": 10},\ - {"value": "M", "count": 5},\ - {"value": "XL", "count": 1},\ - {"value": "XXL", "count": 1}\ - ] - }, - "time": 0.0001 -} +- create a vector store with Qdrant client; +- get an embedding for each chunk using Jina Embeddings API; +- combines `sparse` and `dense` vectors for hybrid search; +- stores all data into Qdrant; -``` +Hybrid search with Qdrant must be enabled from the beginning - we can simply set `enable_hybrid=True`. -The results are sorted by the count in descending order, then by the value in ascending order. -Only values with non-zero counts will be returned. +```python +# By default llamaindex uses OpenAI models +# setting embed_model to Jina and llm model to Mixtral +from llama_index.core import Settings +Settings.embed_model = jina_embedding_model +Settings.llm = mixtral_llm -By default, the way Qdrant the counts for each value is approximate to achieve fast results. This should accurate enough for most cases, but if you need to debug your storage, you can use the `exact` parameter to get exact counts. +from llama_index.core import VectorStoreIndex, StorageContext +from llama_index.vector_stores.qdrant import QdrantVectorStore +import qdrant_client -httppythontypescriptrustjavacsharpgo +client = qdrant_client.QdrantClient( + url=os.getenv("QDRANT_HOST"), + api_key=os.getenv("QDRANT_API_KEY") +) -```http -POST /collections/{collection_name}/facet -{ - "key": "size", - "exact": true -} +vector_store = QdrantVectorStore( + client=client, collection_name="demo", enable_hybrid=True, batch_size=20 +) +Settings.chunk_size = 512 +storage_context = StorageContext.from_defaults(vector_store=vector_store) +index = VectorStoreIndex.from_documents( + documents=llama_parse_documents, + storage_context=storage_context +) ``` +### Prepare a prompt +Here we will create a custom prompt template. This prompt asks the LLM to use only the context information retrieved from Qdrant. When querying with hybrid mode, we can set `similarity_top_k` and `sparse_top_k` separately: + +- `sparse_top_k` represents how many nodes will be retrieved from each dense and sparse query. +- `similarity_top_k` controls the final number of returned nodes. In the above setting, we end up with 10 nodes. + +Then, we assemble the query engine using the prompt. 
+ ```python -client.facet( - collection_name="{collection_name}", - key="size", - exact=True, +from llama_index.core import PromptTemplate + +qa_prompt_tmpl = ( + "Context information is below.\n" + "-------------------------------" + "{context_str}\n" + "-------------------------------" + "Given the context information and not prior knowledge," + "answer the query. Please be concise, and complete.\n" + "If the context does not contain an answer to the query," + "respond with \"I don't know!\"." + "Query: {query_str}\n" + "Answer: " ) +qa_prompt = PromptTemplate(qa_prompt_tmpl) -``` +from llama_index.core.retrievers import VectorIndexRetriever +from llama_index.core.query_engine import RetrieverQueryEngine +from llama_index.core import get_response_synthesizer +from llama_index.core import Settings +Settings.embed_model = jina_embedding_model +Settings.llm = mixtral_llm -```typescript -client.facet("{collection_name}", { - key: "size", - exact: true, -}); +# retriever +retriever = VectorIndexRetriever( + index=index, + similarity_top_k=2, + sparse_top_k=12, + vector_store_query_mode="hybrid" +) + +# response synthesizer +response_synthesizer = get_response_synthesizer( + llm=mixtral_llm, + text_qa_template=qa_prompt, + response_mode="compact", +) +# query engine +query_engine = RetrieverQueryEngine( + retriever=retriever, + response_synthesizer=response_synthesizer, +) ``` -```rust -use qdrant_client::qdrant::FacetCountsBuilder; +## Run a test query +Now you can ask questions and receive answers based on the data: -client - .facet( - FacetCountsBuilder::new("{collection_name}", "size") - .limit(10) - .exact(true), - ) - .await?; +**Question** +```python +result = query_engine.query("What temperature should I use for my laundry?") +print(result.response) ``` -```java - client - .facetAsync( - Points.FacetCounts.newBuilder() - .setCollectionName(collection_name) - .setKey("foo") - .setExact(true) - .build()) - .get(); +**Answer** +```text +The water temperature is set to 70 ˚C during the Eco Drum Clean cycle. You cannot change the water temperature. However, the temperature for other cycles is not specified in the context. ``` -```csharp -using Qdrant.Client; +And that's it! Feel free to scale this up to as many documents and complex PDFs as you like. -await client.FacetAsync( - "{collection_name}", - key: "size", - exact: true, -); +<|page-168-lllmstxt|> +# Qdrant Cloud API: Powerful gRPC and Flexible REST/JSON Interfaces -``` +**Note:** This is not the Qdrant REST or gPRC API of the database itself. For database APIs & SDKs, see our list of [interfaces](/documentation/interfaces/) -```go -res, err := client.Facet(ctx, &qdrant.FacetCounts{ - CollectionName: "{collection_name}", - Key: "key", - Exact: true, -}) +## Introduction -``` +The Qdrant Cloud API lets you automate the Qdrant Cloud platform. You can use this API to manage your accounts, clusters, backup schedules, authentication methods, hybrid cloud environments, and more. -##### Was this page useful? +To cater to diverse integration needs, the Qdrant Cloud API offers two primary interaction models: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +* **gRPC API**: For high-performance, low-latency, and type-safe communication. This is the recommended way for backend services and applications requiring maximum efficiency. The API is defined using Protocol Buffers. 
+* **REST/JSON API**: A conventional HTTP/1.1 (and HTTP/2) interface with JSON payloads. This API is provided via a gRPC Gateway, translating RESTful calls into gRPC messages, offering ease of use for web clients, scripts, and broader tool compatibility. -Thank you for your feedback! 🙏 +You can find the API definitions and generated client libraries in our Qdrant Cloud Public API [GitHub repository](https://github.com/qdrant/qdrant-cloud-public-api). +**Note:** The API is splitted into multiple services to make it easier to use. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/payload.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Qdrant Cloud API Endpoints -On this page: +* **gRPC Endpoint**: grpc.cloud.qdrant.io:443 +* **REST/JSON Endpoint**: https://api.cloud.qdrant.io -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/payload.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Authentication -× +Most of the Qdrant Cloud API requests must be authenticated. Authentication is handled via API keys (so called management keys), which should be passed in the Authorization header. +**Management Keys**: `Authorization: apikey ` -[Powered by](https://qdrant.tech/) +Replace with the actual API key obtained from your Qdrant Cloud dashboard or generated programmatically. -<|page-98-lllmstxt|> -## private-cloud-setup -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- Setup Private Cloud +You can create a management key in the Cloud Console UI. Go to **Access Management** > **Cloud Management Keys**. +![Authentication](/documentation/cloud/authentication.png) -# [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#qdrant-private-cloud-setup) Qdrant Private Cloud Setup +**Note:** Ensure that the API key is kept secure and not exposed in public repositories or logs. Once authenticated, the API allows you to manage clusters, backup schedules, and perform other operations available to your account. -## [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#requirements) Requirements +### Samples -- **Kubernetes cluster:** To install Qdrant Private Cloud, you need a [standard compliant](https://www.cncf.io/training/certification/software-conformance/) Kubernetes cluster. You can run this cluster in any cloud, on-premise or edge environment, with distributions that range from AWS EKS to VMWare vSphere. See [Deployment Platforms](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/) for more information. -- **Storage:** For storage, you need to set up the Kubernetes cluster with a Container Storage Interface (CSI) driver that provides block storage. For vertical scaling, the CSI driver needs to support volume expansion. For backups and restores, the driver needs to support CSI snapshots and restores. +For samples on how to use the API, with a tool like grpcurl, curl or any of the provided SDKs, please see the [Qdrant Cloud Public API](https://github.com/qdrant/qdrant-cloud-public-api) repository. -- **Permissions:** To install the Qdrant Kubernetes Operator you need to have `cluster-admin` access in your Kubernetes cluster. 
-- **Locations:** By default, the Qdrant Operator Helm charts and container images are served from `registry.cloud.qdrant.io`. +## Terraform Provider -> **Note:** You can also mirror these images and charts into your own registry and pull them from there. +Qdrant Cloud also provides a Terraform provider to manage your Qdrant Cloud resources. [Learn more](/documentation/infrastructure/terraform/). -### [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#cli-tools) CLI tools +## Deprecated OpenAPI specification -During the onboarding, you will need to deploy the Qdrant Kubernetes Operator using Helm. Make sure you have the following tools installed: +We still support our deprecated OpenAPI endpoint, but this is scheduled to be removed later this year (November 1st, 2025). +We do _NOT_ recommend to use this endpoint anymore and use the replacement as described above. -- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) -- [helm](https://helm.sh/docs/intro/install/) +| REST API | Documentation | +| -------- | ------------------------------------------------------------------------------------ | +| v.0.1.0 | [OpenAPI Specification](https://cloud.qdrant.io/pa/v1/docs) | -You will need to have access to the Kubernetes cluster with `kubectl` and `helm` configured to connect to it. Please refer the documentation of your Kubernetes distribution for more information. +<|page-169-lllmstxt|> +## Cloud Tools -### [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#required-artifacts) Required artifacts +| Integration | Description | +| ----------------------------------- | ------------------------------------------------------------------------------------------- | +| [Pulumi](/documentation/cloud-tools/pulumi/) | Infrastructure as code tool for creating, deploying, and managing cloud infrastructure | +| [Terraform](/documentation/cloud-tools/terraform/) | infrastructure as code tool to define resources in human-readable configuration files. | -Container images: +<|page-170-lllmstxt|> +# Region-Specific Contract Management System -- `registry.cloud.qdrant.io/qdrant/qdrant` -- `registry.cloud.qdrant.io/qdrant/operator` -- `registry.cloud.qdrant.io/qdrant/cluster-manager` +| Time: 90 min | Level: Advanced | | +| --- | ----------- | ----------- |----------- | -Open Containers Initiative (OCI) Helm charts: +Contract management benefits greatly from Retrieval Augmented Generation (RAG), streamlining the handling of lengthy business contract texts. With AI assistance, complex questions can be asked and well-informed answers generated, facilitating efficient document management. This proves invaluable for businesses with extensive relationships, like shipping companies, construction firms, and consulting practices. Access to such contracts is often restricted to authorized team members due to security and regulatory requirements, such as GDPR in Europe, necessitating secure storage practices. -- `registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud` -- `registry.cloud.qdrant.io/library/qdrant-kubernetes-api` +Companies want their data to be kept and processed within specific geographical boundaries. For that reason, this RAG-centric tutorial focuses on dealing with a region-specific cloud provider. You will set up a contract management system using [Aleph Alpha's](https://aleph-alpha.com/) embeddings and LLM. You will host everything on [STACKIT](https://www.stackit.de/), a German business cloud provider. 
On this platform, you will run Qdrant Hybrid Cloud as well as the rest of your RAG application. This setup will ensure that your data is stored and processed in Germany. -### [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#mirroring-images-and-charts) Mirroring images and charts +![Architecture diagram](/documentation/examples/contract-management-stackit-aleph-alpha/architecture-diagram.png) -To mirror all necessary container images and Helm charts into your own registry, you can either use a replication feature that your registry provides, or you can manually sync the images with [Skopeo](https://github.com/containers/skopeo): +## Components -First login to the source registry: +A contract management platform is not a simple CLI tool, but an application that should be available to all team +members. It needs an interface to upload, search, and manage the documents. Ideally, the system should be +integrated with org's existing stack, and the permissions/access controls inherited from LDAP or Active +Directory. + +> **Note:** In this tutorial, we are going to build a solid foundation for such a system. However, it is up to your organization's setup to implement the entire solution. + +- **Dataset** - a collection of documents, using different formats, such as PDF or DOCx, scraped from internet +- **Asymmetric semantic embeddings** - [Aleph Alpha embedding](https://docs.aleph-alpha.com/api/pharia-inference/semantic-embed/) to + convert the queries and the documents into vectors +- **Large Language Model** - the [Luminous-extended-control + model](https://docs.aleph-alpha.com/api/pharia-inference/available-models/), but you can play with a different one from the + Luminous family +- **Qdrant Hybrid Cloud** - a knowledge base to store the vectors and search over the documents +- **STACKIT** - a [German business cloud](https://www.stackit.de) to run the Qdrant Hybrid Cloud and the application + processes + +We will implement the process of uploading the documents, converting them into vectors, and storing them in Qdrant. +Then, we will build a search interface to query the documents and get the answers. All that, assuming the user +interacts with the system with some set of permissions, and can only access the documents they are allowed to. -```shell -skopeo login registry.cloud.qdrant.io +## Prerequisites -``` +### Aleph Alpha account -Then login to your own registry: +Since you will be using Aleph Alpha's models, [sign up](https://aleph-alpha.com) with their managed service and obtain an API token. Once you have it ready, store it as an environment variable: ```shell -skopeo login your-registry.example.com +export ALEPH_ALPHA_API_KEY="" +``` +```python +import os + +os.environ["ALEPH_ALPHA_API_KEY"] = "" ``` -To sync all container images: +### Qdrant Hybrid Cloud on STACKIT + +Please refer to our documentation to see [how to deploy Qdrant Hybrid Cloud on +STACKIT](/documentation/hybrid-cloud/platform-deployment-options/#stackit). Once you finish the deployment, you will +have the API endpoint to interact with the Qdrant server. 
Let's store it in the environment variable as well: ```shell -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/qdrant your-registry.example.com/qdrant/qdrant -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/cluster-manager your-registry.example.com/qdrant/cluster-manager -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant/operator your-registry.example.com/qdrant/operator +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" +``` +```python +os.environ["QDRANT_URL"] = "https://qdrant.example.com" +os.environ["QDRANT_API_KEY"] = "your-api-key" ``` -To sync all helm charts: +Qdrant will be running on a specific URL and access will be restricted by the API key. Make sure to store them both as environment variables as well: -```shell -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud your-registry.example.com/qdrant-charts/qdrant-private-cloud -skopeo sync --all --src docker --dest docker registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api your-registry.example.com/qdrant-charts/qdrant-kubernetes-api +*Optional:* Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). +```shell +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY="your-api-key" +export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" ``` -During the installation or upgrade, you will need to adapt the repository information in the Helm chart values. See [Private Cloud Configuration](https://qdrant.tech/documentation/private-cloud/configuration/) for details. +## Implementation -## [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#installation-and-upgrades) Installation and Upgrades +To build the application, we can use the official SDKs of Aleph Alpha and Qdrant. However, to streamline the process +let's use [LangChain](https://python.langchain.com/docs/get_started/introduction). This framework is already integrated with both services, so we can focus our efforts on +developing business logic. -Once you are onboarded to Qdrant Private Cloud, you will receive credentials to access the Qdrant Cloud Registry. You can use these credentials to install the Qdrant Private Cloud solution using the following commands. You can choose the Kubernetes namespace freely. +### Qdrant collection -```bash -kubectl create namespace qdrant-private-cloud -kubectl create secret docker-registry qdrant-registry-creds --docker-server=registry.cloud.qdrant.io --docker-username='your-username' --docker-password='your-password' --namespace qdrant-private-cloud -helm registry login 'registry.cloud.qdrant.io' --username 'your-username' --password 'your-password' -helm upgrade --install qdrant-private-cloud-crds oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-kubernetes-api --namespace qdrant-private-cloud --version v1.16.6 --wait -helm upgrade --install qdrant-private-cloud oci://registry.cloud.qdrant.io/qdrant-charts/qdrant-private-cloud --namespace qdrant-private-cloud --version 1.7.1 +Aleph Alpha embeddings are high dimensional vectors by default, with a dimensionality of `5120`. 
However, a pretty +unique feature of that model is that they might be compressed to a size of `128`, with a small drop in accuracy +performance (4-6%, according to the docs). Qdrant can store even the original vectors easily, and this sounds like a +good idea to enable [Binary Quantization](/documentation/guides/quantization/#binary-quantization) to save space and +make the retrieval faster. Let's create a collection with such settings: +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient( + location=os.environ["QDRANT_URL"], + api_key=os.environ["QDRANT_API_KEY"], +) +client.create_collection( + collection_name="contracts", + vectors_config=models.VectorParams( + size=5120, + distance=models.Distance.COSINE, + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, + ) + ) + ), +) ``` -For a list of available versions consult the [Private Cloud Changelog](https://qdrant.tech/documentation/private-cloud/changelog/). +We are going to use the `contracts` collection to store the vectors of the documents. The `always_ram` flag is set to +`True` to keep the quantized vectors in RAM, which will speed up the search process. We also wanted to restrict access +to the individual documents, so only users with the proper permissions can see them. In Qdrant that should be solved by +adding a payload field that defines who can access the document. We'll call this field `roles` and set it to an array +of strings with the roles that can access the document. -Current default versions are: +```python +client.create_payload_index( + collection_name="contracts", + field_name="metadata.roles", + field_schema=models.PayloadSchemaType.KEYWORD, +) +``` -- qdrant-kubernetes-api v1.16.6 -- qdrant-private-cloud 1.7.1 +Since we use Langchain, the `roles` field is a nested field of the `metadata`, so we have to define it as +`metadata.roles`. The schema says that the field is a keyword, which means it is a string or an array of strings. We are +going to use the name of the customers as the roles, so the access control will be based on the customer name. -Especially ensure, that the default values to reference `StorageClasses` and the corresponding `VolumeSnapshotClass` are set correctly in your environment. +### Ingestion pipeline -### [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#scope-of-the-operator) Scope of the operator +Semantic search systems rely on high-quality data as their foundation. With the [unstructured integration of Langchain](https://python.langchain.com/docs/integrations/providers/unstructured), ingestion of various document formats like PDFs, Microsoft Word files, and PowerPoint presentations becomes effortless. However, it's crucial to split the text intelligently to avoid converting entire documents into vectors; instead, they should be divided into meaningful chunks. Subsequently, the extracted documents are converted into vectors using Aleph Alpha embeddings and stored in the Qdrant collection. -By default, the Qdrant Operator will only manage Qdrant clusters in the same Kubernetes namespace, where it is already deployed. The RoleBindings are also limited to this specific namespace. This default is chosen to limit the operator to the least amount of permissions necessary within a Kubernetes cluster. 
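+Note: the LangChain snippets in this section assume the Aleph Alpha embeddings and Qdrant vector store integrations are already imported. One possible set of imports (module paths vary between LangChain versions, so treat this as a sketch):

+```python
+# Assumed imports for the snippets below; adjust to your LangChain version
+from langchain_community.embeddings import AlephAlphaAsymmetricSemanticEmbedding
+from langchain_community.vectorstores import Qdrant
+```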
+Let's start by defining the components and connecting them together: -If you want to manage Qdrant clusters in multiple namespaces with the same operator, you can either configure a list of namespaces that the operator should watch: +```python +embeddings = AlephAlphaAsymmetricSemanticEmbedding( + model="luminous-base", + aleph_alpha_api_key=os.environ["ALEPH_ALPHA_API_KEY"], + normalize=True, +) -```yaml -operator: - watch: - # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces - onlyReleaseNamespace: false - # an empty list watches all namespaces. - namespaces: - - qdrant-private-cloud - - some-other-namespase - limitRBAC: true +qdrant = Qdrant( + client=client, + collection_name="contracts", + embeddings=embeddings, +) +``` + +Now it's high time to index our documents. Each of the documents is a separate file, and we also have to know the +customer name to set the access control properly. There might be several roles for a single document, so let's keep them +in a list. + +```python +documents = { + "data/Data-Processing-Agreement_STACKIT_Cloud_version-1.2.pdf": ["stackit"], + "data/langchain-terms-of-service.pdf": ["langchain"], +} +``` + +This is how the documents might look like: + +![Example of the indexed document](/documentation/examples/contract-management-stackit-aleph-alpha/indexed-document.png) + +Each has to be split into chunks first; there is no silver bullet. Our chunking algorithm will be simple and based on +recursive splitting, with the maximum chunk size of 500 characters and the overlap of 100 characters. +```python +from langchain_text_splitters import RecursiveCharacterTextSplitter + +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=100, +) ``` -Or you can configure the operator to watch all namespaces: +Now we can iterate over the documents, split them into chunks, convert them into vectors with Aleph Alpha embedding +model, and store them in the Qdrant. -```yaml -operator: - watch: - # If true, watches only the namespace where the Qdrant operator is deployed, otherwise watches the namespaces in watch.namespaces - onlyReleaseNamespace: false - # an empty list watches all namespaces. - namespaces: [] - limitRBAC: false +```python +from langchain_community.document_loaders.unstructured import UnstructuredFileLoader + +for document_path, roles in documents.items(): + document_loader = UnstructuredFileLoader(file_path=document_path) + + # Unstructured loads each file into a single Document object + loaded_documents = document_loader.load() + for doc in loaded_documents: + doc.metadata["roles"] = roles + + # Chunks will have the same metadata as the original document + document_chunks = text_splitter.split_documents(loaded_documents) + # Add the documents to the Qdrant collection + qdrant.add_documents(document_chunks, batch_size=20) ``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/\#uninstallation) Uninstallation +Our collection is filled with data, and we can start searching over it. In a real-world scenario, the ingestion process +should be automated and triggered by the new documents uploaded to the system. Since we already use Qdrant Hybrid Cloud +running on Kubernetes, we can easily deploy the ingestion pipeline as a job to the same environment. On STACKIT, you +probably use the [STACKIT Kubernetes Engine (SKE)](https://www.stackit.de/en/product/kubernetes/) and launch it in a +container. 
The [Compute Engine](https://www.stackit.de/en/product/stackit-compute-engine/) is also an option, but +everything depends on the specifics of your organization. -To uninstall the Qdrant Private Cloud solution, you can use the following command: +### Search application -```bash -helm uninstall qdrant-private-cloud --namespace qdrant-private-cloud -helm uninstall qdrant-private-cloud-crds --namespace qdrant-private-cloud -kubectl delete namespace qdrant-private-cloud +Specialized Document Management Systems have a lot of features, but semantic search is not yet a standard. We are going +to build a simple search mechanism which could be possibly integrated with the existing system. The search process is +quite simple: we convert the query into a vector using the same Aleph Alpha model, and then search for the most similar +documents in the Qdrant collection. The access control is also applied, so the user can only see the documents they are +allowed to. -``` +We start with creating an instance of the LLM of our choice, and set the maximum number of tokens to 200, as the default +value is 64, which might be too low for our purposes. -Note that uninstalling the `qdrant-private-cloud-crds` Helm chart will remove all Custom Resource Definitions (CRDs) will also remove all Qdrant clusters that were managed by the operator. +```python +from langchain.llms.aleph_alpha import AlephAlpha -##### Was this page useful? +llm = AlephAlpha( + model="luminous-extended-control", + aleph_alpha_api_key=os.environ["ALEPH_ALPHA_API_KEY"], + maximum_tokens=200, +) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Then, we can glue the components together and build the search process. `RetrievalQA` is a class that takes implements +the Question Retrieval process, with a specified retriever and Large Language Model. The instance of `Qdrant` might be +converted into a retriever, with additional filter that will be passed to the `similarity_search` method. The filter +is created as [in a regular Qdrant query](/documentation/concepts/filtering/), with the `roles` field set to the +user's roles. -Thank you for your feedback! 🙏 +```python +user_roles = ["stackit", "aleph-alpha"] -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/private-cloud-setup.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +qdrant_retriever = qdrant.as_retriever( + search_kwargs={ + "filter": models.Filter( + must=[ + models.FieldCondition( + key="metadata.roles", + match=models.MatchAny(any=user_roles) + ) + ] + ) + } +) +``` -On this page: +We set the user roles to `stackit` and `aleph-alpha`, so the user can see the documents that are accessible to these +customers, but not to the others. The final step is to create the `RetrievalQA` instance and use it to search over the +documents, with the custom prompt. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/private-cloud-setup.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python +from langchain.prompts import PromptTemplate +from langchain.chains.retrieval_qa.base import RetrievalQA -× +prompt_template = """ +Question: {question} +Answer the question using the Source. If there's no answer, say "NO ANSWER IN TEXT". 
-[Powered by](https://qdrant.tech/) +Source: {context} -<|page-99-lllmstxt|> -## user-management -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud rbac](https://qdrant.tech/documentation/cloud-rbac/) -- User Management +### Response: +""" +prompt = PromptTemplate( + template=prompt_template, input_variables=["context", "question"] +) -# [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#user-management) User Management +retrieval_qa = RetrievalQA.from_chain_type( + llm=llm, + chain_type="stuff", + retriever=qdrant_retriever, + return_source_documents=True, + chain_type_kwargs={"prompt": prompt}, +) -> 💡 You can access this in **Access Management > User & Role Management** _if available see [this page for details](https://qdrant.tech/documentation/cloud-rbac/)._ +response = retrieval_qa.invoke({"query": "What are the rules of performing the audit?"}) +print(response["result"]) +``` -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#inviting-users-to-an-account) Inviting Users to an Account +Output: -Users can be invited via the **User Management** section, where they are assigned the **Base role** by default. Additionally, users have the option to select a specific role when inviting another user. The **Base role** is a predefined role with minimal permissions, granting users access to the platform while restricting them to viewing only their own profile. +```text +The rules for performing the audit are as follows: -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/user-invitation.png) +1. The Customer must inform the Contractor in good time (usually at least two weeks in advance) about any and all circumstances related to the performance of the audit. +2. The Customer is entitled to perform one audit per calendar year. Any additional audits may be performed if agreed with the Contractor and are subject to reimbursement of expenses. +3. If the Customer engages a third party to perform the audit, the Customer must obtain the Contractor's consent and ensure that the confidentiality agreements with the third party are observed. +4. The Contractor may object to any third party deemed unsuitable. +``` -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#inviting-users-from-a-role) Inviting Users from a Role +There are some other parameters that might be tuned to optimize the search process. The `k` parameter defines how many +documents should be returned, but Langchain allows us also to control the retrieval process by choosing the type of the +search operation. The default is `similarity`, which is just vector search, but we can also use `mmr` which stands for +Maximal Marginal Relevance. It is a technique to diversify the search results, so the user gets the most relevant +documents, but also the most diverse ones. The `mmr` search is slower, but might be more user-friendly. -Users can be invited attached to a specific role by inviting them through the **Role Details** page - just click on the Users tab and follow the prompts. +Our search application is ready, and we can deploy it to the same environment as the ingestion pipeline on STACKIT. The +same rules apply here, so you can use the SKE or the Compute Engine, depending on the specifics of your organization. -Once accepted, they’ll be assigned that role’s permissions, along with the base role. 
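+As a reference, the retriever described above could be switched to MMR with an explicit number of returned documents. The values below are illustrative, and the role-based access filter stays unchanged:

+```python
+# Illustrative sketch: MMR retrieval with an explicit result count,
+# reusing the same role-based access filter
+mmr_retriever = qdrant.as_retriever(
+    search_type="mmr",
+    search_kwargs={
+        "k": 5,  # number of documents handed to the LLM
+        "filter": models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.roles",
+                    match=models.MatchAny(any=user_roles),
+                )
+            ]
+        ),
+    },
+)
+```

+Passing `retriever=mmr_retriever` to `RetrievalQA.from_chain_type` is then enough to switch the chain to this behaviour.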
+## Next steps -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/invite-user.png) +We built a solid foundation for the contract management system, but there is still a lot to do. If you want to make the +system production-ready, you should consider implementing the mechanism into your existing stack. If you have any +questions, feel free to ask on our [Discord community](https://qdrant.to/discord). -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#revoking-an-invitation) Revoking an Invitation +<|page-171-lllmstxt|> +# Common Datasets in Snapshot Format -Before being accepted, an Admin/Owner can cancel a pending invite directly on either the **User Management** or **Role Details** page. +You may find that creating embeddings from datasets is a very resource-intensive task. +If you need a practice dataset, feel free to pick one of the ready-made snapshots on this page. +These snapshots contain pre-computed vectors that you can easily import into your Qdrant instance. -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/revoke-invite.png) +## Available datasets -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#updating-a-users-roles) Updating a User’s Roles +Our snapshots are usually generated from publicly available datasets, which are often used for +non-commercial or academic purposes. The following datasets are currently available. Please click +on a dataset name to see its detailed description. -Authorized users can give or take away roles from users in **User Management**. +| Dataset | Model | Vector size | Documents | Size | Qdrant snapshot | HF Hub | +|--------------------------------------------|-----------------------------------------------------------------------------|-------------|-----------|--------|----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------| +| [Arxiv.org titles](#arxivorg-titles) | [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) | 768 | 2.3M | 7.1 GB | [Download](https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) | +| [Arxiv.org abstracts](#arxivorg-abstracts) | [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) | 768 | 2.3M | 8.4 GB | [Download](https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/arxiv-abstracts-instructorxl-embeddings) | +| [Wolt food](#wolt-food) | [clip-ViT-B-32](https://huggingface.co/sentence-transformers/clip-ViT-B-32) | 512 | 1.7M | 7.9 GB | [Download](https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot) | [Open](https://huggingface.co/datasets/Qdrant/wolt-food-clip-ViT-B-32-embeddings) | -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/update-user-role.png) +Once you download a snapshot, you need to [restore it](/documentation/concepts/snapshots/#restore-snapshot) +using the Qdrant CLI upon startup or through the API. 
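+For example, recovering a collection from one of the snapshots above through the Python client might look like this sketch; the collection name is arbitrary, and the URL is the Arxiv.org titles snapshot from the table:

+```python
+from qdrant_client import QdrantClient
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Recover a collection from a remote snapshot; a path local to the server
+# can be passed as a file:///... location instead
+client.recover_snapshot(
+    collection_name="arxiv-titles",
+    location="https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot",
+)
+```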
-![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/update-user-role-edit-dialog.png) +## Qdrant on Hugging Face

-## [Anchor](https://qdrant.tech/documentation/cloud-rbac/user-management/\#removing-a-user-from-an-account) Removing a User from an Account

+[HuggingFace](https://huggingface.co/Qdrant)
-Users can be removed from an account by clicking on their name in either **User Management** (via Actions). This option is only available after they’ve accepted the invitation to join, ensuring that only active users can be removed. +[Hugging Face](https://huggingface.co/) provides a platform for sharing and using ML models and +datasets. [Qdrant](https://huggingface.co/Qdrant) is one of the organizations there! We aim to +provide you with datasets containing neural embeddings that you can use to practice with Qdrant +and build your applications based on semantic search. **Please let us know if you'd like to see +a specific dataset!** -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/remove-user.png) +If you are not familiar with [Hugging Face datasets](https://huggingface.co/docs/datasets/index), +or would like to know how to combine it with Qdrant, please refer to the [tutorial](/documentation/tutorials/huggingface-datasets/). -##### Was this page useful? +## Arxiv.org -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +[Arxiv.org](https://arxiv.org) is a highly-regarded open-access repository of electronic preprints in multiple +fields. Operated by Cornell University, arXiv allows researchers to share their findings with +the scientific community and receive feedback before they undergo peer review for formal +publication. Its archives host millions of scholarly articles, making it an invaluable resource +for those looking to explore the cutting edge of scientific research. With a high frequency of +daily submissions from scientists around the world, arXiv forms a comprehensive, evolving dataset +that is ripe for mining, analysis, and the development of future innovations. -Thank you for your feedback! 🙏 + -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/user-management.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Arxiv.org titles -On this page: +This dataset contains embeddings generated from the paper titles only. Each vector has a +payload with the title used to create it, along with the DOI (Digital Object Identifier). 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/user-management.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```json +{ + "title": "Nash Social Welfare for Indivisible Items under Separable, Piecewise-Linear Concave Utilities", + "DOI": "1612.05191" +} +``` -× +The embeddings generated with InstructorXL model have been generated using the following +instruction: -[Powered by](https://qdrant.tech/) +> Represent the Research Paper title for retrieval; Input: -<|page-100-lllmstxt|> -## reranking-semantic-search -- [Documentation](https://qdrant.tech/documentation/) -- [Search precision](https://qdrant.tech/documentation/search-precision/) -- Reranking in Semantic Search +The following code snippet shows how to generate embeddings using the InstructorXL model: -# [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#reranking-in-rag-with-qdrant-vector-database) Reranking in RAG with Qdrant Vector Database +```python +from InstructorEmbedding import INSTRUCTOR -In Retrieval-Augmented Generation (RAG) systems, irrelevant or missing information can throw off your model’s ability to produce accurate, meaningful outputs. One of the best ways to ensure you’re feeding your language model the most relevant, context-rich documents is through reranking. It’s a game-changer. +model = INSTRUCTOR("hkunlp/instructor-xl") +sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments" +instruction = "Represent the Research Paper title for retrieval; Input:" +embeddings = model.encode([[instruction, sentence]]) +``` -In this guide, we’ll dive into using reranking to boost the relevance of search results in Qdrant. We’ll start with an easy use case that leverages the Cohere Rerank model. Then, we’ll take it up a notch by exploring ColBERT for a more advanced approach. By the time you’re done, you’ll know how to implement [hybrid search](https://qdrant.tech/articles/hybrid-search/), fine-tune reranking models, and significantly improve your accuracy. +The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot). -Ready? Let’s jump in. +#### Importing the dataset -# [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#understanding-reranking) Understanding Reranking +The easiest way to use the provided dataset is to recover it via the API by passing the +URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following +code snippet shows how to create a new collection and fill it with the snapshot data: -This section is broken down into key parts to help you easily grasp the background, mechanics, and significance of reranking. +```http request +PUT /collections/{collection_name}/snapshots/recover +{ + "location": "https://snapshots.qdrant.io/arxiv_titles-3083016565637815127-2023-05-29-13-56-22.snapshot" +} +``` -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#background) Background +### Arxiv.org abstracts -In search systems, two metrics—precision and recall—are the backbone of success. But what do they mean? Precision tells us how many of the retrieved results are actually relevant, while recall measures how well we’ve captured all the relevant results out there. Simply put: +This dataset contains embeddings generated from the paper abstracts. 
Each vector has a +payload with the abstract used to create it, along with the DOI (Digital Object Identifier). -![image5.png](https://qdrant.tech/documentation/examples/reranking-semantic-search/image5.png) +```json +{ + "abstract": "Recently Cole and Gkatzelis gave the first constant factor approximation\nalgorithm for the problem of allocating indivisible items to agents, under\nadditive valuations, so as to maximize the Nash Social Welfare. We give\nconstant factor algorithms for a substantial generalization of their problem --\nto the case of separable, piecewise-linear concave utility functions. We give\ntwo such algorithms, the first using market equilibria and the second using the\ntheory of stable polynomials.\n In AGT, there is a paucity of methods for the design of mechanisms for the\nallocation of indivisible goods and the result of Cole and Gkatzelis seemed to\nbe taking a major step towards filling this gap. Our result can be seen as\nanother step in this direction.\n", + "DOI": "1612.05191" +} +``` -Sparse vector searches usually give you high precision because they’re great at finding exact matches. But, here’s the catch—your recall can suffer when relevant documents don’t contain those exact keywords. On the flip side, dense vector searches are fantastic for recall since they grasp the broader, semantic meaning of your query. However, this can lead to lower precision, where you might see results that are only loosely related. +The embeddings generated with InstructorXL model have been generated using the following +instruction: -This is exactly where reranking comes to the rescue. It takes a wide net of documents (giving you high recall) and then refines them by reordering the top candidates based on their relevance scores—boosting precision without losing that broad understanding. Typically, we retain only the top K candidates after reordering to focus on the most relevant results. +> Represent the Research Paper abstract for retrieval; Input: -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#working) Working +The following code snippet shows how to generate embeddings using the InstructorXL model: -Picture this: You walk into a massive library and ask for a book on “climate change.” The librarian pulls out a dozen books for you—some are scientific papers, others are personal essays, and one’s even a novel. Sure, they’re all relevant, but the first one you get handed is the novel. Not exactly what you were hoping for, right? +```python +from InstructorEmbedding import INSTRUCTOR -Now, imagine a smarter, more intuitive librarian who really gets what you’re after. This one knows exactly which books are most impactful, the most current, and perfectly aligned with what you need. That’s what reranking does for your search results—it doesn’t just grab any relevant document; it smartly reorders them so the best ones land at the top of your list. It’s like having a librarian who knows exactly what you’re looking for before you do! +model = INSTRUCTOR("hkunlp/instructor-xl") +sentence = "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. 
Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train." +instruction = "Represent the Research Paper abstract for retrieval; Input:" +embeddings = model.encode([[instruction, sentence]]) +``` -![image6.png](https://qdrant.tech/documentation/examples/reranking-semantic-search/image6.png) +The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot). -An illustration of the rerank model prioritizing better results +#### Importing the dataset -To become that smart, intuitive librarian, your algorithm needs to learn how to understand both your queries and the documents it retrieves. It has to evaluate the relationship between them effectively, so it can give you exactly what you’re looking for. +The easiest way to use the provided dataset is to recover it via the API by passing the +URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following +code snippet shows how to create a new collection and fill it with the snapshot data: -The way reranker models operate varies based on their type, which will be discussed later, but in general, they calculate a relevance score for each document-query pair.Unlike embedding models, which squash everything into a single vector upfront, rerankers keep all the important details intact by using the full transformer output to calculate a similarity score. The result? Precision. But, there’s a trade-off—reranking can be slow. Processing millions of documents can take hours, which is why rerankers focus on refining results, not searching through the entire document collection. +```http request +PUT /collections/{collection_name}/snapshots/recover +{ + "location": "https://snapshots.qdrant.io/arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot" +} +``` -Rerankers come in different types, each with its own strengths. Let’s break them down: +## Wolt food -1. **Cross Encoder Models**: These boost reranking by using a classification system to evaluate pairs of data—like sentences or documents. They spit out a similarity score from 0 to 1, showing how closely the document matches your query. The catch? Cross-encoders need both query and document, so they can’t handle standalone documents or queries by themselves. -2. **Multi-Vector Rerankers (e.g., ColBERT)**: These models take a more efficient route. They encode your query and the documents separately and only compare them later, reducing the computational load. This means document representations can be precomputed, speeding up retrieval times -3. **Large Language Models (LLMs) as Rerankers**: This is a newer, smarter way to rerank. LLMs, like GPT, are getting better by the day. With the right instructions, they can prioritize the most relevant documents for you, leveraging their massive understanding of language to deliver even more accurate results. +Our [Food Discovery demo](https://food-discovery.qdrant.tech/) relies on the dataset of +food images from the Wolt app. Each point in the collection represents a dish with a single +image. The image is represented as a vector of 512 float numbers. There is also a JSON +payload attached to each point, which looks similar to this: -Each of these rerankers has its own special way of making sure you get the best search results, fast and relevant to what you need. 
+```json +{ + "cafe": { + "address": "VGX7+6R2 Vecchia Napoli, Valletta", + "categories": ["italian", "pasta", "pizza", "burgers", "mediterranean"], + "location": {"lat": 35.8980154, "lon": 14.5145106}, + "menu_id": "610936a4ee8ea7a56f4a372a", + "name": "Vecchia Napoli Is-Suq Tal-Belt", + "rating": 9, + "slug": "vecchia-napoli-skyparks-suq-tal-belt" + }, + "description": "Tomato sauce, mozzarella fior di latte, crispy guanciale, Pecorino Romano cheese and a hint of chilli", + "image": "https://wolt-menu-images-cdn.wolt.com/menu-images/610936a4ee8ea7a56f4a372a/005dfeb2-e734-11ec-b667-ced7a78a5abd_l_amatriciana_pizza_joel_gueller1.jpeg", + "name": "L'Amatriciana" +} +``` -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#importance) Importance +The embeddings generated with clip-ViT-B-32 model have been generated using the following +code snippet: -In the previous section, we explored the background and mechanics of reranking, but now let’s talk about the three big wins you get from using it: +```python +from PIL import Image +from sentence_transformers import SentenceTransformer -- **Enhancing Search Accuracy:** Reranking is all about making your search results sharper and more relevant. After the initial ranking, rerankers step in, reshuffling the results based on deeper analysis to ensure that the most crucial information is front and center. [Research shows that rerankers](https://cohere.com/blog/rerank) can pull off a serious boost—improving the top results for about 72% of search queries. That’s a huge leap in precision. -- **Reducing Information Overload:** If you feel like you’re drowning in a sea of search results, rerankers can come to your rescue. They filter and fine-tune the flood of information so you get exactly what you need, without the overwhelm. It makes your search experience more focused and way less chaotic. -- **Balancing Speed and Relevance:** First stage retrieval and second stage reranking strike the perfect balance between speed and accuracy. Sure, the second stage may add a bit of latency due to their processing power, but the trade-off is worth it. You get highly relevant results, and in the end, that’s what matters most. +image_path = "5dbfd216-5cce-11eb-8122-de94874ad1c8_ns_takeaway_seelachs_ei_baguette.jpeg" -Now that you know why reranking is such a game-changer, let’s dive into the practical side of things. +model = SentenceTransformer("clip-ViT-B-32") +embedding = model.encode(Image.open(image_path)) +``` -# [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#implementing-vector-search-with-reranking) Implementing Vector Search with Reranking +The snapshot of the dataset might be downloaded [here](https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot). -In this section, you’re going to see how to implement vector search with reranking using Cohere. But first, let’s break it down. +#### Importing the dataset -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#overview) Overview +The easiest way to use the provided dataset is to recover it via the API by passing the +URL as a location. It works also in [Qdrant Cloud](https://cloud.qdrant.io/). The following +code snippet shows how to create a new collection and fill it with the snapshot data: -A typical search system works in two main stages: Ingestion and Retrieval. 
Think of ingestion as the process where your data gets prepped and loaded into the system, and retrieval as the part where the magic happens—where your queries pull out the most relevant documents. +```http request +PUT /collections/{collection_name}/snapshots/recover +{ + "location": "https://snapshots.qdrant.io/wolt-clip-ViT-B-32-2446808438011867-2023-12-14-15-55-26.snapshot" +} +``` -Check out the architectural diagram below to visualize how these stages work together. +<|page-172-lllmstxt|> +# Database Authentication in Qdrant Managed Cloud -![image1.png](https://qdrant.tech/documentation/examples/reranking-semantic-search/image1.png) +This page describes what Database API keys are and shows you how to use the Qdrant Cloud Console to create a Database API key for a cluster. You will learn how to connect to your cluster using the new API key. -The two essential stages of a search system: Ingestion and Retrieval Process +Database API keys can be configured with granular access control. Database API keys with granular access control can be recognized by starting with `eyJhb`. Please refer to the [Table of access](/documentation/guides/security/#table-of-access) to understand what permissions you can configure. -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#ingestion-stage) Ingestion Stage +Database API keys with granular access control are available for clusters using version **v1.11.0** and above. -- **Documents:** This is where it all starts. The system takes in raw data or documents that need to be prepped for search—this is your initial input. -- **Embeddings:** Next, these documents are transformed into sparse or dense [embeddings](https://qdrant.tech/documentation/embeddings/), which are basically vector representations. These vectors capture the deep, underlying meaning of the text, allowing your system to perform smart, efficient searches and comparisons based on semantic meaning -- **Vector Database:** Once your documents are converted into these embeddings, they get stored in a vector database—essentially the powerhouse behind fast, accurate similarity searches. Here, we’ll see the capabilities of the Qdrant vector database. +## Create Database API Keys -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#retrieval-stage) Retrieval Stage + -- **User’s Query:** Now we enter the retrieval phase. The user submits a query, and it’s time to match that query against the stored documents. -- **Embeddings:** Just like with the documents, the user’s query is converted into a sparse or dense embedding. This enables the system to compare the query’s meaning with the meanings of the stored documents. -- **Vector Search:** The system searches for the most relevant documents by comparing the query’s embedding to those in the vector database, and it pulls up the closest matches. -- **Rerank:** Once the initial results are in, the reranking process kicks in to ensure you get the best results on top. We’ll be using **Cohere’s** rerank-english-v3.0 model, which excels at reordering English language documents to prioritize relevance. It can handle up to 4096 tokens, giving it plenty of context to work with. And if you’re dealing with multi-lingual data, don’t worry—Cohere’s got reranking models for other languages too. +![API Key](/documentation/cloud/create-api-key.png) -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#implementation) Implementation +1. 
Go to the [Cloud Dashboard](https://qdrant.to/cloud).
+2. Go to the **API Keys** section of the **Cluster Detail Page**.
+3. Click **Create**.
+4. Choose a name and an optional expiration in days (the default is 90 days) for your API key. Leave the expiration empty if you never want the key to expire.
+5. By default, tokens are granted cluster-wide access, with a choice between manage/write (default) and read-only permissions.

To restrict a token to a subset of collections, you can select the Collections tab and choose from the collections available in your cluster. +6. Click **Create** and retrieve your API key. -Now it’s time to dive into the actual implementation. +![API Key](/documentation/cloud/api-key.png) -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#setup) Setup +We recommend configuring an expiration and rotating your API keys regularly as a security best practice. -To follow along with this tutorial, you’ll need a few key tools:: + -- Python Client for Qdrant -- Cohere +## Admin Database API Keys -Let’s install everything you need in one go using the Python package manager:: +The previous iteration of Database API keys, called Admin Database API keys, do not have granular access control. Clusters created before January 27, 2025 will still see the option to create Admin Database API keys. Older Admin Database API keys will continue to work, but we do recommend switching to Database API keys with granular access control to take advantage of better security controls. -```jsx -pip install qdrant-client cohere + -``` +To enable Database API keys with granular access control, click **Enable** on the **API Keys** section of the Cluster detail page. -* * * +After enabling Database API keys with granular access control for a cluster, existing Admin Database API keys will continue to work, but you will not be able to create new Admin Database API Keys. -Now, let’s bring in all the necessary components in one tidy block: +## Test Cluster Access -```jsx -from qdrant_client import QdrantClient -from qdrant_client.models import Distance, VectorParams, PointStruct -import cohere +After creation, you will receive a code snippet to access your cluster. Your generated request should look very similar to this one: +```bash +curl \ + -X GET 'https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333' \ + --header 'api-key: ' ``` +Open Terminal and run the request. You should get a response that looks like this: -* * * - -Qdrant is a powerful vector similarity search engine that gives you a production-ready service with an easy-to-use API for storing, searching, and managing data. You can interact with Qdrant through a local or cloud setup, but since we’re working in Colab, let’s go with the cloud setup. +```bash +{"title":"qdrant - vector search engine","version":"1.13.0","commit":"ffda0b90c8c44fc43c99adab518b9787fe57bde6"} +``` -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#steps-to-set-up-qdrant-cloud)**Steps to Set Up Qdrant Cloud:** +> **Note:** You need to include the API key in the request header for every +> request over REST or gRPC. -1. **Sign Up**: Head to Qdrant’s website and sign up for a cloud account using your email, Google, or GitHub credentials. -2. **Create Your First Cluster**: Once you’re in, navigate to the Overview section and follow the onboarding steps under Create First Cluster. -3. **Get Your API Key**: After creating your cluster, an API key will be generated. This key will let you interact with the cluster using the Python client. -4. **Check Your Cluster**: Your new cluster will appear under the Clusters section. From here, you’re all set to start interacting with your data. 
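If you would rather script this check than run curl, the following minimal Python sketch sends the same request with the `requests` library. The cluster URL and key are placeholders, and the expected response mirrors the JSON shown above:

```python
import requests

# Placeholder values -- substitute your own cluster URL and Database API key
CLUSTER_URL = "https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333"
API_KEY = "<paste-your-database-api-key-here>"

# Same health check as the curl request above, sent from Python;
# the key travels in the `api-key` header on every REST call.
response = requests.get(CLUSTER_URL, headers={"api-key": API_KEY})
response.raise_for_status()
print(response.json())  # e.g. {"title": "qdrant - vector search engine", "version": ...}
```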
+## Authenticate via SDK -Finally, under the Overview section, you’ll see the following code snippet: +Now that you have created your first cluster and key, you might want to access your database from within your application. +Our [official Qdrant clients](/documentation/interfaces/) for Python, TypeScript, Go, Rust, .NET and Java all support the API key parameter. -![image7.png](https://qdrant.tech/documentation/examples/reranking-semantic-search/image7.png) +```bash +curl \ + -X GET https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333 \ + --header 'api-key: ' -Qdrant Overview Section +# Alternatively, you can use the `Authorization` header with the `Bearer` prefix +curl \ + -X GET https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333 \ + --header 'Authorization: Bearer ' +``` -Add your API keys. This will let your Python client connect to Qdrant and Cohere. +```python +from qdrant_client import QdrantClient -```jsx -client = QdrantClient( - url="", - api_key="", +qdrant_client = QdrantClient( + "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", + api_key="", ) - -print(client.get_collections()) - ``` -* * * +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Next, we’ll set up Cohere for reranking. Log in to your Cohere account, generate an API key, and add it like this:: +const client = new QdrantClient({ + host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", + apiKey: "", +}); +``` -```jsx -co = cohere.Client("") +```rust +use qdrant_client::Qdrant; +let client = Qdrant::from_url("https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6334") + .api_key("") + .build()?; ``` -* * * - -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#ingestion) Ingestion +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#there-are-three-key-parts-to-ingestion-creating-a-collection-converting-documents-to-embeddings-and-upserting-the-data-lets-break-it-down) There are three key parts to ingestion: Creating a Collection, Converting Documents to Embeddings, and Upserting the Data. Let’s break it down. +QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder( + "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", + 6334, + true) + .withApiKey("") + .build()); +``` -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#creating-a-collection) Creating a Collection +```csharp +using Qdrant.Client; -A collection is basically a named group of points (vectors with data) that you can search through. All the vectors in a collection need to have the same size and be compared using one distance metric. 
Here’s how to create one: +var client = new QdrantClient( + host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", + https: true, + apiKey: "" +); +``` -```jsx -client.create_collection( - collection_name="basic-search-rerank", - vectors_config=VectorParams(size=1024, distance=Distance.DOT), -) +```go +import "github.com/qdrant/go-client/qdrant" +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.cloud-region.cloud-provider.cloud.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, +}) ``` -* * * - -Here, the vector size is set to 1024 to match our dense embeddings, and we’re using dot product as the distance metric—perfect for capturing the similarity between vectors, especially when they’re normalized. +<|page-173-lllmstxt|> +# RAG System for Employee Onboarding -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#documents-to-embeddings) Documents to Embeddings +Public websites are a great way to share information with a wide audience. However, finding the right information can be +challenging, if you are not familiar with the website's structure or the terminology used. That's what the search bar is +for, but it is not always easy to formulate a query that will return the desired results, if you are not yet familiar +with the content. This is even more important in a corporate environment, and for the new employees, who are just +starting to learn the ropes, and don't even know how to ask the right questions yet. You may have even the best intranet +pages, but onboarding is more than just reading the documentation, it is about understanding the processes. Semantic +search can help with finding right resources easier, but wouldn't it be easier to just chat with the website, like you +would with a colleague? -Let’s set up some example data. Here’s a query and a few documents for demonstration: +Technological advancements have made it possible to interact with websites using natural language. This tutorial will +guide you through the process of integrating [Cohere](https://cohere.com/)'s language models with Qdrant to enable +natural language search on your documentation. We are going to use [LangChain](https://langchain.com/) as an +orchestrator. Everything will be hosted on [Oracle Cloud Infrastructure (OCI)](https://www.oracle.com/cloud/), so you +can scale your application as needed, and do not send your data to third parties. That is especially important when you +are working with confidential or sensitive data. -```jsx -query = "What is the purpose of feature scaling in machine learning?" +## Building up the application -documents = [\ - "In machine learning, feature scaling is the process of normalizing the range of independent variables or features. The goal is to ensure that all features contribute equally to the model, especially in algorithms like SVM or k-nearest neighbors where distance calculations matter.",\ -\ - "Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. This is particularly important for gradient descent-based algorithms where features with larger scales could disproportionately impact the cost function.",\ -\ - "In data science, feature extraction is the process of transforming raw data into a set of engineered features that can be used in predictive models. 
Feature scaling is related but focuses on adjusting the values of these features.",\ -\ - "Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling as it ensures that features with larger numerical ranges don't dominate the learning process.",\ -\ - "One common data preprocessing technique in data science is feature selection. Unlike feature scaling, feature selection aims to reduce the number of input variables used in a model to avoid overfitting.",\ -\ - "Principal component analysis (PCA) is a dimensionality reduction technique used in data science to reduce the number of variables. PCA works best when data is scaled, as it relies on variance which can be skewed by features on different scales.",\ -\ - "Min-max scaling is a common feature scaling technique that usually transforms features to a fixed range [0, 1]. This method is useful when the distribution of data is not Gaussian.",\ -\ - "Standardization, or z-score normalization, is another technique that transforms features into a mean of 0 and a standard deviation of 1. This method is effective for data that follows a normal distribution.",\ -\ - "Feature scaling is critical when using algorithms that rely on distances, such as k-means clustering, as unscaled features can lead to misleading results.",\ -\ - "Scaling can improve the convergence speed of gradient descent algorithms by preventing issues with different feature scales affecting the cost function's landscape.",\ -\ - "In deep learning, feature scaling helps in stabilizing the learning process, allowing for better performance and faster convergence during training.",\ -\ - "Robust scaling is another method that uses the median and the interquartile range to scale features, making it less sensitive to outliers.",\ -\ - "When working with time series data, feature scaling can help in standardizing the input data, improving model performance across different periods.",\ -\ - "Normalization is often used in image processing to scale pixel values to a range that enhances model performance in computer vision tasks.",\ -\ - "Feature scaling is significant when features have different units of measurement, such as height in centimeters and weight in kilograms.",\ -\ - "In recommendation systems, scaling features such as user ratings can improve the model's ability to find similar users or items.",\ -\ - "Dimensionality reduction techniques, like t-SNE and UMAP, often require feature scaling to visualize high-dimensional data in lower dimensions effectively.",\ -\ - "Outlier detection techniques can also benefit from feature scaling, as they can be influenced by unscaled features that have extreme values.",\ -\ - "Data preprocessing steps, including feature scaling, can significantly impact the performance of machine learning models, making it a crucial part of the modeling pipeline.",\ -\ - "In ensemble methods, like random forests, feature scaling is not strictly necessary, but it can still enhance interpretability and comparison of feature importance.",\ -\ - "Feature scaling should be applied consistently across training and test datasets to avoid data leakage and ensure reliable model evaluation.",\ -\ - "In natural language processing (NLP), scaling can be useful when working with numerical features derived from text data, such as word counts or term frequencies.",\ -\ - "Log transformation is a technique that can be applied to skewed data to stabilize variance and make the data more suitable for scaling.",\ -\ - "Data augmentation 
techniques in machine learning may also include scaling to ensure consistency across training datasets, especially in computer vision tasks."\ -] +Our application will consist of two main processes: indexing and searching. Langchain will glue everything together, +as we will use a few components, including Cohere and Qdrant, as well as some OCI services. Here is a high-level +overview of the architecture: -``` +![Architecture diagram of the target system](/documentation/examples/faq-oci-cohere-langchain/architecture-diagram.png) -* * * +### Prerequisites -We’ll generate embeddings for these documents using Cohere’s embed-english-v3.0 model, which produces 1024-dimensional vectors: +Before we dive into the implementation, make sure to set up all the necessary accounts and tools. -```python -model="embed-english-v3.0" +#### Libraries -doc_embeddings = co.embed(texts=documents, - model=model, - input_type="search_document", - embedding_types=['float']) +We are going to use a few Python libraries. Of course, Langchain will be our main framework, but the Cohere models on +OCI are accessible via the [OCI SDK](https://docs.oracle.com/en-us/iaas/tools/python/2.125.1/). Let's install all the +necessary libraries: +```shell +pip install langchain oci qdrant-client langchainhub ``` -* * * +#### Oracle Cloud -This code taps into the power of the Cohere API to generate embeddings for your list of documents. It uses the embed-english-v3.0 model, sets the input type to “search\_document,” and asks for the embeddings in float format. The result? A set of dense embeddings, each one representing the deep semantic meaning of your documents. These embeddings will be stored in doc\_embeddings, ready for action. +Our application will be fully running on Oracle Cloud Infrastructure (OCI). It's up to you to choose how you want to +deploy your application. Qdrant Hybrid Cloud will be running in your [Kubernetes cluster running on Oracle Cloud +(OKE)](https://www.oracle.com/cloud/cloud-native/container-engine-kubernetes/), so all the processes might be also +deployed there. You can get started with signing up for an account on [Oracle Cloud](https://signup.cloud.oracle.com/). -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#upsert-data) Upsert Data +Cohere models are available on OCI as a part of the [Generative AI +Service](https://www.oracle.com/artificial-intelligence/generative-ai/generative-ai-service/). We need both the +[Generation models](https://docs.oracle.com/en-us/iaas/Content/generative-ai/use-playground-generate.htm) and the +[Embedding models](https://docs.oracle.com/en-us/iaas/Content/generative-ai/use-playground-embed.htm). Please follow the +linked tutorials to grasp the basics of using Cohere models there. -We need to transform those dense embeddings into a format Qdrant can work with, and that’s where Points come in. Points are the building blocks of Qdrant—they’re records made up of a vector (the embedding) and an optional payload (like your document text). +Accessing the models programmatically requires knowing the compartment OCID. Please refer to the [documentation that +describes how to find it](https://docs.oracle.com/en-us/iaas/Content/GSG/Tasks/contactingsupport_topic-Locating_Oracle_Cloud_Infrastructure_IDs.htm#Finding_the_OCID_of_a_Compartment). 
+For the further reference, we will assume that the compartment OCID is stored in the environment variable: -Here’s how we convert those embeddings into Points: +```shell +export COMPARTMENT_OCID="" +``` ```python -points = [] -for idx, (embedding, doc) in enumerate(zip(doc_embeddings.embeddings.float_, documents)): - point = PointStruct( - id=idx, - vector=embedding, - payload={"document": doc} - ) - points.append(point) +import os +os.environ["COMPARTMENT_OCID"] = "" ``` -* * * +#### Qdrant Hybrid Cloud -What’s happening here? We’re building a list of Points from the embeddings: +Qdrant Hybrid Cloud running on Oracle Cloud helps you build a solution without sending your data to external services. Our documentation provides a step-by-step guide on how to [deploy Qdrant Hybrid Cloud on Oracle +Cloud](/documentation/hybrid-cloud/platform-deployment-options/#oracle-cloud-infrastructure). -- First, we start with an empty list. -- Then, we loop through both **doc\_embeddings** and **documents** at the same time using enumerate() to grab the index (idx) along the way. -- For each pair (an embedding and its corresponding document), we create a PointStruct. Each point gets: - - An id (from idx). - - A vector (the embedding). - - A payload (the actual document text). -- Each Point is added to our list. +Qdrant will be running on a specific URL and access will be restricted by the API key. Make sure to store them both as environment variables as well: -Once that’s done, it’s time to send these Points into your Qdrant collection with the upsert() function: +```shell +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" +``` -```python -operation_info = client.upsert( - collection_name="basic-search-rerank", - points=points -) +*Optional:* Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). +```shell +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY="your-api-key" +export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" ``` -* * * +Now you can get started: -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#now-your-embeddings-are-all-set-in-qdrant-ready-to-power-your-search) Now your embeddings are all set in Qdrant, ready to power your search. +```python +import os -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#retrieval) Retrieval +os.environ["QDRANT_URL"] = "https://qdrant.example.com" +os.environ["QDRANT_API_KEY"] = "your-api-key" +``` -The first few steps here mirror what we did during ingestion—just like before, we need to convert the query into an embedding: +Let's create the collection that will store the indexed documents. We will use the `qdrant-client` library, and our +collection will be named `oracle-cloud-website`. Our embedding model, `cohere.embed-english-v3.0`, produces embeddings +of size 1024, and we have to specify that when creating the collection. 
```python -query_embeddings = co.embed(texts=[query], - model=model, - input_type="search_query", - embedding_types=['float']) +from qdrant_client import QdrantClient, models +client = QdrantClient( + location=os.environ.get("QDRANT_URL"), + api_key=os.environ.get("QDRANT_API_KEY"), +) +client.create_collection( + collection_name="oracle-cloud-website", + vectors_config=models.VectorParams( + size=1024, + distance=models.Distance.COSINE, + ), +) ``` -* * * +### Indexing process -After that, we’ll move on to retrieve results using vector search and apply reranking on the results. This two-stage process is super efficient because we’re grabbing a small set of the most relevant documents first, which is much faster than reranking a huge dataset. +We have all the necessary tools set up, so let's start with the indexing process. We will use the Cohere Embedding +models to convert the text into vectors, and then store them in Qdrant. Langchain is integrated with OCI Generative AI +Service, so we can easily access the models. -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#vector-search) Vector Search +Our dataset will be fairly simple, as it will consist of the questions and answers from the [Oracle Cloud Free Tier +FAQ page](https://www.oracle.com/cloud/free/faq/). -This snippet grabs the top 10 most relevant points from your Qdrant collection using the query embedding. +![Some examples of the Oracle Cloud FAQ](/documentation/examples/faq-oci-cohere-langchain/oracle-faq.png) + +Questions and answers are presented in an HTML format, but we don't want to manually extract the text and adapt it for +each subpage. Instead, we will use the `WebBaseLoader` that just loads the HTML content from given URL and converts it +to text. ```python -search_result = client.query_points( - collection_name="basic-search-rerank", query=query_embeddings.embeddings.float_[0], limit=10 -).points +from langchain_community.document_loaders.web_base import WebBaseLoader +loader = WebBaseLoader("https://www.oracle.com/cloud/free/faq/") +documents = loader.load() ``` -* * * - -Here’s how it works: we use the query\_points method to search within the “basic-search-rerank” collection. It compares the query embedding (the first embedding in query\_embeddings) against all the document embeddings, pulling up the 10 closest matches. The matching points get stored in search\_result. +Our `documents` is a list with just a single element, which is the text of the whole page. We need to split it into +meaningful parts, so we will use the `RecursiveCharacterTextSplitter` component. It will try to keep all paragraphs (and +then sentences, and then words) together as long as possible, as those would generically seem to be the strongest +semantically related pieces of text. The chunk size and overlap are both parameters that can be adjusted to fit the +specific use case. -And here’s a sneak peek at what you’ll get from the vector search: +```python +from langchain_text_splitters import RecursiveCharacterTextSplitter -| **ID** | **Document** | **Score** | -| --- | --- | --- | -| 0 | In machine learning, feature scaling is the process of normalizing the range of independent
… | 0.71 |
-| 10 | In deep learning, feature scaling helps stabilize the learning process, allowing for… | 0.69 |
-| 1 | Feature scaling is commonly used in data preprocessing to ensure that features are on the… | 0.68 |
-| 23 | Data augmentation techniques in machine learning may also include scaling to ensure… | 0.64 |
-| 3 | Unsupervised learning algorithms, such as clustering methods, may benefit from feature… | 0.64 |
-| 12 | When working with time series data, feature scaling can help standardize the input… | 0.62 |
-| 19 | In ensemble methods, like random forests, feature scaling is not strictly necessary… | 0.61 |
-| 21 | In natural language processing (NLP), scaling can be useful when working with numerical… | 0.61 |
-| 20 | Feature scaling should be applied consistently across training and test datasets… | 0.61 |
-| 18 | Data preprocessing steps, including feature scaling, can significantly impact the performance…
 | 0.61 | +splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100) +split_documents = splitter.split_documents(documents) +``` -From the looks of it, the data pulled up is highly relevant to your query. Now, with this solid base of results, it’s time to refine them further with reranking. +Our documents might be now indexed, but we need to convert them into vectors. Let's configure the embeddings so the +`cohere.embed-english-v3.0` is used. Not all the regions support the Generative AI Service, so we need to specify the +region where the models are stored. We will use the `us-chicago-1`, but please check the +[documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions) for the most up-to-date +list of supported regions. -### [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#rerank) Rerank +```python +from langchain_community.embeddings.oci_generative_ai import OCIGenAIEmbeddings -This code takes the documents from the search results and reranks them based on your query, making sure you get the most relevant ones right at the top. +embeddings = OCIGenAIEmbeddings( + model_id="cohere.embed-english-v3.0", + service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", + compartment_id=os.environ.get("COMPARTMENT_OCID"), +) +``` -First, we pull out the documents from the search results. Then we use Cohere’s rerank model to refine these results: +Now we can embed the documents and store them in Qdrant. We will create an instance of `Qdrant` and add the split +documents to the collection. ```python -document_list = [point.payload['document'] for point in search_result] +from langchain.vectorstores.qdrant import Qdrant -rerank_results = co.rerank( - model="rerank-english-v3.0", - query=query, - documents=document_list, - top_n=5, +qdrant = Qdrant( + client=client, + collection_name="oracle-cloud-website", + embeddings=embeddings, ) +qdrant.add_documents(split_documents, batch_size=20) ``` -* * * +Our documents should be now indexed and ready for searching. Let's move to the next step. -What’s happening here? In the first line, we’re building a list of documents by grabbing the ‘document’ field from each search result point. Then, we pass this list, along with the original query, to Cohere’s rerank method. Using the **rerank-english-v3.0** model, it reshuffles the documents and gives you back the top 5, ranked by their relevance to the query. +### Speaking to the website -Here’s the reranked result table, with the new order and their relevance scores: +The intended method of interaction with the website is through the chatbot. Large Language Model, in our case [Cohere +Command](https://cohere.com/command), will be answering user's questions based on the relevant documents that Qdrant +will return using the question as a query. Our LLM is also hosted on OCI, so we can access it similarly to the embedding +model: -| **Index** | **Document** | **Relevance Score** | -| --- | --- | --- | -| 0 | In machine learning, feature scaling is the process of normalizing the range of independent variables or features. | 0.99995166 | -| 1 | Feature scaling is commonly used in data preprocessing to ensure that features are on the same scale. | 0.99929035 | -| 10 | In deep learning, feature scaling helps stabilize the learning process, allowing for better performance and faster convergence. 
| 0.998675 | -| 23 | Data augmentation techniques in machine learning may also include scaling to ensure consistency across training datasets. | 0.998043 | -| 3 | Unsupervised learning algorithms, such as clustering methods, may benefit from feature scaling. | 0.9979967 | +```python +from langchain_community.llms.oci_generative_ai import OCIGenAI -As you can see, the reranking did its job. Positions for documents 10 and 1 got swapped, showing that the reranker has fine-tuned the results to give you the most relevant content at the top. +llm = OCIGenAI( + model_id="cohere.command", + service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", + compartment_id=os.environ.get("COMPARTMENT_OCID"), +) +``` -## [Anchor](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/\#conclusion) Conclusion +Connection to Qdrant might be established in the same way as we did during the indexing process. We can use it to create +a retrieval chain, which implements the question-answering process. The retrieval chain also requires an additional +chain that will combine retrieved documents before sending them to an LLM. -Reranking is a powerful way to boost the relevance and precision of search results in RAG systems. By combining Qdrant’s vector search capabilities with tools like Cohere’s Rerank model or ColBERT, you can refine search outputs, ensuring the most relevant information rises to the top. +```python +from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain.chains.retrieval import create_retrieval_chain +from langchain import hub -This guide demonstrated how reranking enhances precision without sacrificing recall, delivering sharper, context-rich results. With these tools, you’re equipped to create search systems that provide meaningful and impactful user experiences. Start implementing reranking to take your applications to the next level! +retriever = qdrant.as_retriever() +combine_docs_chain = create_stuff_documents_chain( + llm=llm, + # Default prompt is loaded from the hub, but we can also modify it + prompt=hub.pull("langchain-ai/retrieval-qa-chat"), +) +retrieval_qa_chain = create_retrieval_chain( + retriever=retriever, + combine_docs_chain=combine_docs_chain, +) +response = retrieval_qa_chain.invoke({"input": "What is the Oracle Cloud Free Tier?"}) +``` -##### Was this page useful? +The output of the `.invoke` method is a dictionary-like structure with the query and answer, but we can also access the +source documents used to generate the response. This might be useful for debugging or for further processing. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +{ + 'input': 'What is the Oracle Cloud Free Tier?', + 'context': [ + Document( + page_content='* Free Tier is generally available in regions where commercial Oracle Cloud Infrastructure service is available. See the data regions page for detailed service availability (the exact regions available for Free Tier may differ during the sign-up process). 
The US$300 cloud credit is available in', + metadata={ + 'language': 'en-US', + 'source': 'https://www.oracle.com/cloud/free/faq/', + 'title': "FAQ on Oracle's Cloud Free Tier", + '_id': 'c8cf98e0-4b88-4750-be42-4157495fed2c', + '_collection_name': 'oracle-cloud-website' + } + ), + Document( + page_content='Oracle Cloud Free Tier allows you to sign up for an Oracle Cloud account which provides a number of Always Free services and a Free Trial with US$300 of free credit to use on all eligible Oracle Cloud Infrastructure services for up to 30 days. The Always Free services are available for an unlimited', + metadata={ + 'language': 'en-US', + 'source': 'https://www.oracle.com/cloud/free/faq/', + 'title': "FAQ on Oracle's Cloud Free Tier", + '_id': 'dc291430-ff7b-4181-944a-39f6e7a0de69', + '_collection_name': 'oracle-cloud-website' + } + ), + Document( + page_content='Oracle Cloud Free Tier does not include SLAs. Community support through our forums is available to all customers. Customers using only Always Free resources are not eligible for Oracle Support. Limited support is available for Oracle Cloud Free Tier with Free Trial credits. After you use all of', + metadata={ + 'language': 'en-US', + 'source': 'https://www.oracle.com/cloud/free/faq/', + 'title': "FAQ on Oracle's Cloud Free Tier", + '_id': '9e831039-7ccc-47f7-9301-20dbddd2fc07', + '_collection_name': 'oracle-cloud-website' + } + ), + Document( + page_content='looking to test things before moving to cloud, a student wanting to learn, or an academic developing curriculum in the cloud, Oracle Cloud Free Tier enables you to learn, explore, build and test for free.', + metadata={ + 'language': 'en-US', + 'source': 'https://www.oracle.com/cloud/free/faq/', + 'title': "FAQ on Oracle's Cloud Free Tier", + '_id': 'e2dc43e1-50ee-4678-8284-6df60a835cf5', + '_collection_name': 'oracle-cloud-website' + } + ) + ], + 'answer': ' Oracle Cloud Free Tier is a subscription that gives you access to Always Free services and a Free Trial with $300 of credit that can be used on all eligible Oracle Cloud Infrastructure services for up to 30 days. \n\nThrough this Free Tier, you can learn, explore, build, and test for free. It is aimed at those who want to experiment with cloud services before making a commitment, as wellTheir use cases range from testing prior to cloud migration to learning and academic curriculum development. ' +} +``` -Thank you for your feedback! 🙏 +#### Other experiments -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/search-precision/reranking-semantic-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Asking the basic questions is just the beginning. What you want to avoid is a hallucination, where the model generates +an answer that is not based on the actual content. The default prompt of Langchain should already prevent this, but you +might still want to check it. Let's ask a question that is not directly answered on the FAQ page: -On this page: +```python +response = retrieval_qa.invoke({ + "input": "Is Oracle Generative AI Service included in the free tier?" 
+}) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/search-precision/reranking-semantic-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Output: -× +> Oracle Generative AI Services are not specifically mentioned as being available in the free tier. As per the text, the +> $300 free credit can be used on all eligible services for up to 30 days. To confirm if Oracle Generative AI Services +> are included in the free credit offer, it is best to check the official Oracle Cloud website or contact their support. -[Powered by](https://qdrant.tech/) +It seems that Cohere Command model could not find the exact answer in the provided documents, but it tried to interpret +the context and provide a reasonable answer, without making up the information. This is a good sign that the model is +not hallucinating in that case. -<|page-101-lllmstxt|> -## search-beginners -- [Documentation](https://qdrant.tech/documentation/) -- [Beginner tutorials](https://qdrant.tech/documentation/beginner-tutorials/) -- Semantic Search 101 +## Wrapping up -# [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#build-your-first-semantic-search-engine-in-5-minutes) Build Your First Semantic Search Engine in 5 Minutes +This tutorial has shown how to integrate Cohere's language models with Qdrant to enable natural language search on your +website. We have used Langchain as an orchestrator, and everything was hosted on Oracle Cloud Infrastructure (OCI). +Real world would require integrating this mechanism into your organization's systems, but we built a solid foundation +that can be further developed. -| Time: 5 - 15 min | Level: Beginner | | | -| --- | --- | --- | --- | +<|page-174-lllmstxt|> +# Private RAG Information Extraction Engine -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#overview) Overview +| Time: 90 min | Level: Advanced | | | +|--------------|-----------------|--|----| -If you are new to vector databases, this tutorial is for you. In 5 minutes you will build a semantic search engine for science fiction books. After you set it up, you will ask the engine about an impending alien threat. Your creation will recommend books as preparation for a potential space attack. +Handling private documents is a common task in many industries. Various businesses possess a large amount of +unstructured data stored as huge files that must be processed and analyzed. Industry reports, financial analysis, legal +documents, and many other documents are stored in PDF, Word, and other formats. Conversational chatbots built on top of +RAG pipelines are one of the viable solutions for finding the relevant answers in such documents. However, if we want to +extract structured information from these documents, and pass them to downstream systems, we need to use a different +approach. -Before you begin, you need to have a [recent version of Python](https://www.python.org/downloads/) installed. If you don’t know how to run this code in a virtual environment, follow Python documentation for [Creating Virtual Environments](https://docs.python.org/3/tutorial/venv.html#creating-virtual-environments) first. +Information extraction is a process of structuring unstructured data into a format that can be easily processed by +machines. In this tutorial, we will show you how to use [DSPy](https://dspy-docs.vercel.app/) to perform that process on +a set of documents. 
Assuming we cannot send our data to an external service, we will use [Ollama](https://ollama.com/) +to run our own LLM model on our premises, using [Vultr](https://www.vultr.com/) as a cloud provider. Qdrant, acting in +this setup as a knowledge base providing the relevant pieces of documents for a given query, will also be hosted in the +Hybrid Cloud mode on Vultr. The last missing piece, the DSPy application will be also running in the same environment. +If you work in a regulated industry, or just need to keep your data private, this tutorial is for you. -This tutorial assumes you’re in the bash shell. Use the Python documentation to activate a virtual environment, with commands such as: +![Architecture diagram](/documentation/examples/information-extraction-ollama-vultr/architecture-diagram.png) -```bash -source tutorial-env/bin/activate +## Deploying Qdrant Hybrid Cloud on Vultr -``` +All the services we are going to use in this tutorial will be running on [Vultr Kubernetes +Engine](https://www.vultr.com/kubernetes/). That gives us a lot of flexibility in terms of scaling and managing the resources. Vultr manages the control plane and worker nodes and provides integration with other managed services such as Load Balancers, Block Storage, and DNS. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#1-installation) 1\. Installation +1. To start using managed Kubernetes on Vultr, follow the [platform-specific documentation](/documentation/hybrid-cloud/platform-deployment-options/#vultr). +2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](/documentation/hybrid-cloud/). -You need to process your data so that the search engine can work with it. The [Sentence Transformers](https://www.sbert.net/) framework gives you access to common Large Language Models that turn raw data into embeddings. +### Installing the necessary packages -```bash -pip install -U sentence-transformers +We are going to need a couple of Python packages to run our application. They might be installed together with the +`dspy-ai` package and `qdrant` extra: +```shell +pip install dspy-ai dspy-qdrant ``` -Once encoded, this data needs to be kept somewhere. Qdrant lets you store data as embeddings. You can also use Qdrant to run search queries against this data. This means that you can ask the engine to give you relevant answers that go way beyond keyword matching. +### Qdrant Hybrid Cloud -```bash -pip install -U qdrant-client +Our [documentation](/documentation/hybrid-cloud/) contains a comprehensive guide on how to set up Qdrant in the Hybrid Cloud mode on Vultr. Please follow it carefully to get your Qdrant instance up and running. Once it's done, we need to store the Qdrant URL and the API key in the environment variables. You can do it by running the following commands: +```shell +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" ``` -### [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#import-the-models) Import the models - -Once the two main frameworks are defined, you need to specify the exact models this engine will use. - ```python -from qdrant_client import models, QdrantClient -from sentence_transformers import SentenceTransformer +import os +os.environ["QDRANT_URL"] = "https://qdrant.example.com" +os.environ["QDRANT_API_KEY"] = "your-api-key" ``` -The [Sentence Transformers](https://www.sbert.net/) framework contains many embedding models. 
We’ll take [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) as it has a good balance between speed and embedding quality for this tutorial. - -```python -encoder = SentenceTransformer("all-MiniLM-L6-v2") +DSPy is framework we are going to use. It's integrated with Qdrant already, but it assumes you use +[FastEmbed](https://qdrant.github.io/fastembed/) to create the embeddings. DSPy does not provide a way to index the +data, but leaves this task to the user. We are going to create a collection on our own, and fill it with the embeddings +of our document chunks. -``` +#### Data indexing -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#2-add-the-dataset) 2\. Add the dataset - -[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) will encode the data you provide. Here you will list all the science fiction books in your library. Each book has metadata, a name, author, publication year and a short description. - -```python -documents = [\ - {\ - "name": "The Time Machine",\ - "description": "A man travels through time and witnesses the evolution of humanity.",\ - "author": "H.G. Wells",\ - "year": 1895,\ - },\ - {\ - "name": "Ender's Game",\ - "description": "A young boy is trained to become a military leader in a war against an alien race.",\ - "author": "Orson Scott Card",\ - "year": 1985,\ - },\ - {\ - "name": "Brave New World",\ - "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",\ - "author": "Aldous Huxley",\ - "year": 1932,\ - },\ - {\ - "name": "The Hitchhiker's Guide to the Galaxy",\ - "description": "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",\ - "author": "Douglas Adams",\ - "year": 1979,\ - },\ - {\ - "name": "Dune",\ - "description": "A desert planet is the site of political intrigue and power struggles.",\ - "author": "Frank Herbert",\ - "year": 1965,\ - },\ - {\ - "name": "Foundation",\ - "description": "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",\ - "author": "Isaac Asimov",\ - "year": 1951,\ - },\ - {\ - "name": "Snow Crash",\ - "description": "A futuristic world where the internet has evolved into a virtual reality metaverse.",\ - "author": "Neal Stephenson",\ - "year": 1992,\ - },\ - {\ - "name": "Neuromancer",\ - "description": "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",\ - "author": "William Gibson",\ - "year": 1984,\ - },\ - {\ - "name": "The War of the Worlds",\ - "description": "A Martian invasion of Earth throws humanity into chaos.",\ - "author": "H.G. Wells",\ - "year": 1898,\ - },\ - {\ - "name": "The Hunger Games",\ - "description": "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",\ - "author": "Suzanne Collins",\ - "year": 2008,\ - },\ - {\ - "name": "The Andromeda Strain",\ - "description": "A deadly virus from outer space threatens to wipe out humanity.",\ - "author": "Michael Crichton",\ - "year": 1969,\ - },\ - {\ - "name": "The Left Hand of Darkness",\ - "description": "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",\ - "author": "Ursula K. 
Le Guin",\ - "year": 1969,\ - },\ - {\ - "name": "The Three-Body Problem",\ - "description": "Humans encounter an alien civilization that lives in a dying system.",\ - "author": "Liu Cixin",\ - "year": 2008,\ - },\ +FastEmbed uses the `BAAI/bge-small-en` as the default embedding model. We are going to use it as well. Our collection +will be created automatically if we call the `.add` method on an existing `QdrantClient` instance. In this tutorial we +are not going to focus much on the document parsing, as there are plenty of tools that can help with that. The +[`unstructured`](https://github.com/Unstructured-IO/unstructured) library is one of the options you can launch on your +infrastructure. In our simplified example, we are going to use a list of strings as our documents. These are the +descriptions of the made up technical events. Each of them should contain the name of the event along with the location +and start and end dates. + +```python +documents = [ + "Taking place in San Francisco, USA, from the 10th to the 12th of June, 2024, the Global Developers Conference is the annual gathering spot for developers worldwide, offering insights into software engineering, web development, and mobile applications.", + "The AI Innovations Summit, scheduled for 15-17 September 2024 in London, UK, aims at professionals and researchers advancing artificial intelligence and machine learning.", + "Berlin, Germany will host the CyberSecurity World Conference between November 5th and 7th, 2024, serving as a key forum for cybersecurity professionals to exchange strategies and research on threat detection and mitigation.", + "Data Science Connect in New York City, USA, occurring from August 22nd to 24th, 2024, connects data scientists, analysts, and engineers to discuss data science's innovative methodologies, tools, and applications.", + "Set for July 14-16, 2024, in Tokyo, Japan, the Frontend Developers Fest invites developers to delve into the future of UI/UX design, web performance, and modern JavaScript frameworks.", + "The Blockchain Expo Global, happening May 20-22, 2024, in Dubai, UAE, focuses on blockchain technology's applications, opportunities, and challenges for entrepreneurs, developers, and investors.", + "Singapore's Cloud Computing Summit, scheduled for October 3-5, 2024, is where IT professionals and cloud experts will convene to discuss strategies, architectures, and cloud solutions.", + "The IoT World Forum, taking place in Barcelona, Spain from December 1st to 3rd, 2024, is the premier conference for those focused on the Internet of Things, from smart cities to IoT security.", + "Los Angeles, USA, will become the hub for game developers, designers, and enthusiasts at the Game Developers Arcade, running from April 18th to 20th, 2024, to showcase new games and discuss development tools.", + "The TechWomen Summit in Sydney, Australia, from March 8-10, 2024, aims to empower women in tech with workshops, keynotes, and networking opportunities.", + "Seoul, South Korea's Mobile Tech Conference, happening from September 29th to October 1st, 2024, will explore the future of mobile technology, including 5G networks and app development trends.", + "The Open Source Summit, to be held in Helsinki, Finland from August 11th to 13th, 2024, celebrates open source technologies and communities, offering insights into the latest software and collaboration techniques.", + "Vancouver, Canada will play host to the VR/AR Innovation Conference from June 20th to 22nd, 2024, focusing on the latest in virtual 
and augmented reality technologies.", + "Scheduled for May 5-7, 2024, in London, UK, the Fintech Leaders Forum brings together experts to discuss the future of finance, including innovations in blockchain, digital currencies, and payment technologies.", + "The Digital Marketing Summit, set for April 25-27, 2024, in New York City, USA, is designed for marketing professionals and strategists to discuss digital marketing and social media trends.", + "EcoTech Symposium in Paris, France, unfolds over 2024-10-09 to 2024-10-11, spotlighting sustainable technologies and green innovations for environmental scientists, tech entrepreneurs, and policy makers.", + "Set in Tokyo, Japan, from 16th to 18th May '24, the Robotic Innovations Conference showcases automation, robotics, and AI-driven solutions, appealing to enthusiasts and engineers.", + "The Software Architecture World Forum in Dublin, Ireland, occurring 22-24 Sept 2024, gathers software architects and IT managers to discuss modern architecture patterns.", + "Quantum Computing Summit, convening in Silicon Valley, USA from 2024/11/12 to 2024/11/14, is a rendezvous for exploring quantum computing advancements with physicists and technologists.", + "From March 3 to 5, 2024, the Global EdTech Conference in London, UK, discusses the intersection of education and technology, featuring e-learning and digital classrooms.", + "Bangalore, India's NextGen DevOps Days, from 28 to 30 August 2024, is a hotspot for IT professionals keen on the latest DevOps tools and innovations.", + "The UX/UI Design Conference, slated for April 21-23, 2024, in New York City, USA, invites discussions on the latest in user experience and interface design among designers and developers.", + "Big Data Analytics Summit, taking place 2024 July 10-12 in Amsterdam, Netherlands, brings together data professionals to delve into big data analysis and insights.", + "Toronto, Canada, will see the HealthTech Innovation Forum from June 8 to 10, '24, focusing on technology's impact on healthcare with professionals and innovators.", + "Blockchain for Business Summit, happening in Singapore from 2024-05-02 to 2024-05-04, focuses on blockchain's business applications, from finance to supply chain.", + "Las Vegas, USA hosts the Global Gaming Expo from October 18th to 20th, 2024, a premiere event for game developers, publishers, and enthusiasts.", + "The Renewable Energy Tech Conference in Copenhagen, Denmark, from 2024/09/05 to 2024/09/07, discusses renewable energy innovations and policies.", + "Set for 2024 Apr 9-11 in Boston, USA, the Artificial Intelligence in Healthcare Summit gathers healthcare professionals to discuss AI's healthcare applications.", + "Nordic Software Engineers Conference, happening in Stockholm, Sweden from June 15 to 17, 2024, focuses on software development in the Nordic region.", + "The International Space Exploration Symposium, scheduled in Houston, USA from 2024-08-05 to 2024-08-07, invites discussions on space exploration technologies and missions." ] - ``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#3-define-storage-location) 3\. Define storage location - -You need to tell Qdrant where to store embeddings. This is a basic demo, so your local computer will use its memory as temporary storage. 
- -```python -client = QdrantClient(":memory:") - -``` +We'll be able to ask general questions, for example, about topics we are interested in or events happening in a specific +location, but expect the results to be returned in a structured format. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#4-create-a-collection) 4\. Create a collection +![An example of extracted information](/documentation/examples/information-extraction-ollama-vultr/extracted-information.png) -All data in Qdrant is organized by collections. In this case, you are storing books, so we are calling it `my_books`. +Indexing in Qdrant is a single call if we have the documents defined: ```python -client.create_collection( - collection_name="my_books", - vectors_config=models.VectorParams( - size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model - distance=models.Distance.COSINE, - ), +client.add( + collection_name="document-parts", + documents=documents, + metadata=[{"document": document} for document in documents], ) - ``` -- The `vector_size` parameter defines the size of the vectors for a specific collection. If their size is different, it is impossible to calculate the distance between them. 384 is the encoder output dimensionality. You can also use model.get\_sentence\_embedding\_dimension() to get the dimensionality of the model you are using. - -- The `distance` parameter lets you specify the function used to measure the distance between two points. +Our collection is ready to be queried. We can now move to the next step, which is setting up the Ollama model. +### Ollama on Vultr -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#5-upload-data-to-collection) 5\. Upload data to collection +Ollama is a great tool for running the LLM models on your own infrastructure. It's designed to be lightweight and easy +to use, and [an official Docker image](https://hub.docker.com/r/ollama/ollama) is available. We can use it to run Ollama +on our Vultr Kubernetes cluster. In case of LLMs we may have some special requirements, like a GPU, and Vultr provides +the [Vultr Kubernetes Engine for Cloud GPU](https://www.vultr.com/products/cloud-gpu/) so the model can be run on a +specialized machine. Please refer to the official documentation to get Ollama up and running within your environment. +Once it's done, we need to store the Ollama URL in the environment variable: -Tell the database to upload `documents` to the `my_books` collection. This will give each record an id and a payload. The payload is just the metadata from the dataset. +```shell +export OLLAMA_URL="https://ollama.example.com" +``` ```python -client.upload_points( - collection_name="my_books", - points=[\ - models.PointStruct(\ - id=idx, vector=encoder.encode(doc["description"]).tolist(), payload=doc\ - )\ - for idx, doc in enumerate(documents)\ - ], -) - +os.environ["OLLAMA_URL"] = "https://ollama.example.com" ``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#6--ask-the-engine-a-question) 6\. Ask the engine a question - -Now that the data is stored in Qdrant, you can ask it questions and receive semantically relevant results. +We will refer to this URL later on when configuring the Ollama model in our application. 
-```python -hits = client.query_points( - collection_name="my_books", - query=encoder.encode("alien invasion").tolist(), - limit=3, -).points +#### Setting up the Large Language Model -for hit in hits: - print(hit.payload, "score:", hit.score) +We are going to use one of the lightweight LLMs available in Ollama, a `gemma:2b` model. It was developed by Google +DeepMind team and has 3B parameters. The [Ollama version](https://ollama.com/library/gemma:2b) uses 4-bit quantization. +Installing the model is as simple as running the following command on the machine where Ollama is running: +```shell +ollama run gemma:2b ``` -**Response:** - -The search engine shows three of the most likely responses that have to do with the alien invasion. Each of the responses is assigned a score to show how close the response is to the original inquiry. - -```text -{'name': 'The War of the Worlds', 'description': 'A Martian invasion of Earth throws humanity into chaos.', 'author': 'H.G. Wells', 'year': 1898} score: 0.570093257022374 -{'name': "The Hitchhiker's Guide to the Galaxy", 'description': 'A comedic science fiction series following the misadventures of an unwitting human and his alien friend.', 'author': 'Douglas Adams', 'year': 1979} score: 0.5040468703143637 -{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.45902943411768216 +Ollama models are also integrated with DSPy, so we can use them directly in our application. -``` +## Implementing the information extraction pipeline -### [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#narrow-down-the-query) Narrow down the query +DSPy is a bit different from the other LLM frameworks. It's designed to optimize the prompts and weights of LMs in a +pipeline. It's a bit like a compiler for LMs: you write a pipeline in a high-level language, and DSPy generates the +prompts and weights for you. This means you can build complex systems without having to worry about the details of how +to prompt your LMs, as DSPy will do that for you. It is somehow similar to PyTorch but for LLMs. -How about the most recent book from the early 2000s? +First of all, we will define the Language Model we are going to use: ```python -hits = client.query_points( - collection_name="my_books", - query=encoder.encode("alien invasion").tolist(), - query_filter=models.Filter( - must=[models.FieldCondition(key="year", range=models.Range(gte=2000))] - ), - limit=1, -).points - -for hit in hits: - print(hit.payload, "score:", hit.score) +import dspy +gemma_model = dspy.OllamaLocal( + model="gemma:2b", + base_url=os.environ.get("OLLAMA_URL"), + max_tokens=500, +) ``` -**Response:** - -The query has been narrowed down to one result from 2008. 
+Similarly, we have to define connection to our Qdrant Hybrid Cloud cluster: -```text -{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.45902943411768216 +```python +from dspy_qdrant import QdrantRM +from qdrant_client import QdrantClient, models +client = QdrantClient( + os.environ.get("QDRANT_URL"), + api_key=os.environ.get("QDRANT_API_KEY"), +) +qdrant_retriever = QdrantRM( + qdrant_collection_name="document-parts", + qdrant_client=client, +) ``` -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/\#next-steps) Next Steps - -Congratulations, you have just created your very first search engine! Trust us, the rest of Qdrant is not that complicated, either. For your next tutorial you should try building an actual [Neural Search Service with a complete API and a dataset](https://qdrant.tech/documentation/tutorials/neural-search/). - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/search-beginners.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: +Finally, both components have to be configured in DSPy with a simple call to one of the functions: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/search-beginners.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python +dspy.configure(lm=gemma_model, rm=qdrant_retriever) +``` -× +### Application logic -[Powered by](https://qdrant.tech/) +There is a concept of signatures which defines input and output formats of the pipeline. We are going to define a simple +signature for the event: -<|page-102-lllmstxt|> -## quickstart -- [Documentation](https://qdrant.tech/documentation/) -- Local Quickstart +```python +class Event(dspy.Signature): + description = dspy.InputField( + desc="Textual description of the event, including name, location and dates" + ) + event_name = dspy.OutputField(desc="Name of the event") + location = dspy.OutputField(desc="Location of the event") + start_date = dspy.OutputField(desc="Start date of the event, YYYY-MM-DD") + end_date = dspy.OutputField(desc="End date of the event, YYYY-MM-DD") +``` -# [Anchor](https://qdrant.tech/documentation/quickstart/\#how-to-get-started-with-qdrant-locally) How to Get Started with Qdrant Locally +It is designed to derive the structured information from the textual description of the event. Now, we can build our +module that will use it, along with Qdrant and Ollama model. Let's call it `EventExtractor`: -In this short example, you will use the Python Client to create a Collection, load data into it and run a basic search query. 
+```python +class EventExtractor(dspy.Module): -## [Anchor](https://qdrant.tech/documentation/quickstart/\#download-and-run) Download and run + def __init__(self): + super().__init__() + # Retrieve module to get relevant documents + self.retriever = dspy.Retrieve(k=3) + # Predict module for the created signature + self.predict = dspy.Predict(Event) -First, download the latest Qdrant image from Dockerhub: + def forward(self, query: str): + # Retrieve the most relevant documents + results = self.retriever.forward(query) -```bash -docker pull qdrant/qdrant + # Try to extract events from the retrieved documents + events = [] + for document in results.passages: + event = self.predict(description=document) + events.append(event) + return events ``` -Then, run the service: - -```bash -docker run -p 6333:6333 -p 6334:6334 \ - -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \ - qdrant/qdrant +The logic is simple: we retrieve the most relevant documents from Qdrant, and then try to extract the structured +information from them using the `Event` signature. We can simply call it and see the results: +```python +extractor = EventExtractor() +extractor.forward("Blockchain events close to Europe") ``` -Under the default configuration all data will be stored in the `./qdrant_storage` directory. This will also be the only directory that both the Container and the host machine can both see. +Output: -Qdrant is now accessible: +```python +[ + Prediction( + event_name='Event Name: Blockchain Expo Global', + location='Dubai, UAE', + start_date='2024-05-20', + end_date='2024-05-22' + ), + Prediction( + event_name='Event Name: Blockchain for Business Summit', + location='Singapore', + start_date='2024-05-02', + end_date='2024-05-04' + ), + Prediction( + event_name='Event Name: Open Source Summit', + location='Helsinki, Finland', + start_date='2024-08-11', + end_date='2024-08-13' + ) +] +``` -- REST API: [localhost:6333](http://localhost:6333/) -- Web UI: [localhost:6333/dashboard](http://localhost:6333/dashboard) -- GRPC API: [localhost:6334](http://localhost:6334/) +The task was solved successfully, even without any optimization. However, each of the events has the "Event Name: " +prefix that we might want to remove. DSPy allows optimizing the module, so we can improve the results. Optimization +might be done in different ways, and it's [well covered in the DSPy +documentation](https://dspy.ai/learn/optimization/optimizers/). -## [Anchor](https://qdrant.tech/documentation/quickstart/\#initialize-the-client) Initialize the client +We are not going to go through the optimization process in this tutorial. However, we encourage you to experiment with +it, as it might significantly improve the performance of your pipeline. -pythontypescriptrustjavacsharpgo +Created module might be easily stored on a specific path, and loaded later on: ```python -from qdrant_client import QdrantClient - -client = QdrantClient(url="http://localhost:6333") - +extractor.save("event_extractor") ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); +To load, just create an instance of the module and call the `load` method: +```python +second_extractor = EventExtractor() +second_extractor.load("event_extractor") ``` -```rust -use qdrant_client::Qdrant; +This is especially useful when you optimize the module, as the optimized version might be stored and loaded later on +without redoing the optimization process each time you run the application. 
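As a small illustration of that workflow, the application can load the previously saved module when it exists and only build a fresh one otherwise. This is only a sketch; the `event_extractor` path simply reuses the name from the example above:

```python
import os

MODULE_PATH = "event_extractor"  # same path used with extractor.save() above

extractor = EventExtractor()
if os.path.exists(MODULE_PATH):
    # Reuse the stored (possibly optimized) module instead of rebuilding it
    extractor.load(MODULE_PATH)

events = extractor.forward("Blockchain events close to Europe")
```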
-// The Rust client uses Qdrant's gRPC interface -let client = Qdrant::from_url("http://localhost:6334").build()?; +### Deploying the extraction pipeline -``` +Vultr gives us a lot of flexibility in terms of deploying the applications. Perfectly, we would use the Kubernetes +cluster we set up earlier to run it. The deployment is as simple as running any other Python application. This time we +don't need a GPU, as Ollama is already running on a separate machine, and DSPy just interacts with it. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +## Wrapping up -// The Java client uses Qdrant's gRPC interface -QdrantClient client = new QdrantClient( - QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +In this tutorial, we showed you how to set up a private environment for information extraction using DSPy, Ollama, and +Qdrant. All the components might be securely hosted on the Vultr cloud, giving you full control over your data. -``` +<|page-175-lllmstxt|> +# Movie Recommendation System -```csharp -using Qdrant.Client; +| Time: 120 min | Level: Advanced | Output: [GitHub](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-OVH.ipynb) | +| --- | ----------- | ----------- |----------- | -// The C# client uses Qdrant's gRPC interface -var client = new QdrantClient("localhost", 6334); +In this tutorial, you will build a mechanism that recommends movies based on defined preferences. Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings. They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques. In this specific case, we will use [sparse vectors](/articles/sparse-vectors/) to create an efficient and accurate recommendation system. -``` +**Privacy and Sovereignty:** Since preference data is proprietary, it should be stored in a secure and controlled environment. Our vector database can easily be hosted on [OVHcloud](https://ovhcloud.com/), our trusted [Qdrant Hybrid Cloud](/documentation/hybrid-cloud/) partner. This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud's interface. Both products have been tested for compatibility and scalability, and we recommend their [managed Kubernetes](https://www.ovhcloud.com/en/public-cloud/kubernetes/) service. -```go -import "github.com/qdrant/go-client/qdrant" +> To see the entire output, use our [notebook with complete instructions](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-OVH.ipynb). -// The Go client uses Qdrant's gRPC interface -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## Components -``` +- **Dataset:** The [MovieLens dataset](https://grouplens.org/datasets/movielens/) contains a list of movies and ratings given by users. +- **Cloud:** [OVHcloud](https://ovhcloud.com/), with managed Kubernetes. +- **Vector DB:** [Qdrant Hybrid Cloud](https://hybrid-cloud.qdrant.tech) running on [OVHcloud](https://ovhcloud.com/). -## [Anchor](https://qdrant.tech/documentation/quickstart/\#create-a-collection) Create a collection +**Methodology:** We're adopting a collaborative filtering approach to construct a recommendation system from the dataset provided. Collaborative filtering works on the premise that if two users share similar tastes, they're likely to enjoy similar movies. 
Leveraging this concept, we'll identify users whose ratings align closely with ours, and explore the movies they liked but we haven't seen yet. To do this, we'll represent each user's ratings as a vector in a high-dimensional, sparse space. Using Qdrant, we'll index these vectors and search for users whose ratings vectors closely match ours. Ultimately, we will see which movies were enjoyed by users similar to us.

![Architecture diagram](/documentation/examples/recommendation-system-ovhcloud/architecture-diagram.png)

## Deploying Qdrant Hybrid Cloud on OVHcloud

[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/) is powered by OVH Public Cloud Instances from a leading European cloud provider, with OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility.

1. To start using managed Kubernetes on OVHcloud, follow the [platform-specific documentation](/documentation/hybrid-cloud/platform-deployment-options/#ovhcloud).
2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](/documentation/hybrid-cloud/).

## Prerequisites

Download and unzip the MovieLens dataset:

```shell
mkdir -p data
wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
unzip ml-1m.zip -d data
```

The necessary Python libraries are installed using `pip`, including `pandas` for data manipulation, `qdrant-client` for interfacing with Qdrant, and `python-dotenv` for managing environment variables.

```python
!pip install -U \
    pandas \
    qdrant-client \
    python-dotenv
```

The `.env` file is used to store sensitive information like the Qdrant host URL and API key securely.

```shell
QDRANT_HOST
QDRANT_API_KEY
```

Load all environment variables into the setup:

```python
import os
from dotenv import load_dotenv
load_dotenv('./.env')
```

## Implementation

Load the data from the MovieLens dataset into pandas DataFrames to facilitate data manipulation and analysis.
```python
from qdrant_client import QdrantClient, models
import pandas as pd
```

Load user data:

```python
users = pd.read_csv(
    'data/ml-1m/users.dat',
    sep='::',
    names=['user_id', 'gender', 'age', 'occupation', 'zip'],
    engine='python'
)
users.head()
```

Add movies:

```python
movies = pd.read_csv(
    'data/ml-1m/movies.dat',
    sep='::',
    names=['movie_id', 'title', 'genres'],
    engine='python',
    encoding='latin-1'
)
movies.head()
```

Finally, add the ratings:

```python
ratings = pd.read_csv(
    'data/ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python'
)
ratings.head()
```

### Normalize the ratings

Sparse vectors can take advantage of negative values, so we can normalize ratings to have a mean of 0 and a standard deviation of 1. This normalization ensures that ratings are consistent and centered around zero, enabling accurate similarity calculations. In this scenario we can also take into account movies that we don't like.
+```python +ratings.rating = (ratings.rating - ratings.rating.mean()) / ratings.rating.std() ``` +To get the results: -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +```python +ratings.head() +``` -let points = vec![\ - PointStruct::new(1, vec![0.05, 0.61, 0.76, 0.74], [("city", "Berlin".into())]),\ - PointStruct::new(2, vec![0.19, 0.81, 0.75, 0.11], [("city", "London".into())]),\ - PointStruct::new(3, vec![0.36, 0.55, 0.47, 0.94], [("city", "Moscow".into())]),\ - // ..truncated\ -]; +### Data preparation -let response = client - .upsert_points(UpsertPointsBuilder::new("test_collection", points).wait(true)) - .await?; +Now you will transform user ratings into sparse vectors, where each vector represents ratings for different movies. This step prepares the data for indexing in Qdrant. -dbg!(response); +First, create a collection with configured sparse vectors. For sparse vectors, you don't need to specify the dimension, because it's extracted from the data automatically. -``` +```python +from collections import defaultdict -```java -import java.util.List; -import java.util.Map; +user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []}) -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.ValueFactory.value; -import static io.qdrant.client.VectorsFactory.vectors; +for row in ratings.itertuples(): + user_sparse_vectors[row.user_id]["values"].append(row.rating) + user_sparse_vectors[row.user_id]["indices"].append(row.movie_id) +``` +Connect to Qdrant and create a collection called **movielens**: -import io.qdrant.client.grpc.Points.PointStruct; -import io.qdrant.client.grpc.Points.UpdateResult; +```python +client = QdrantClient( + url = os.getenv("QDRANT_HOST"), + api_key = os.getenv("QDRANT_API_KEY") +) -UpdateResult operationInfo = - client - .upsertAsync( - "test_collection", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.05f, 0.61f, 0.76f, 0.74f)) - .putAllPayload(Map.of("city", value("Berlin"))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors(vectors(0.19f, 0.81f, 0.75f, 0.11f)) - .putAllPayload(Map.of("city", value("London"))) - .build(), - PointStruct.newBuilder() - .setId(id(3)) - .setVectors(vectors(0.36f, 0.55f, 0.47f, 0.94f)) - .putAllPayload(Map.of("city", value("Moscow"))) - .build())) - // Truncated - .get(); +client.create_collection( + "movielens", + vectors_config={}, + sparse_vectors_config={ + "ratings": models.SparseVectorParams() + } +) +``` -System.out.println(operationInfo); +Upload user ratings to the **movielens** collection in Qdrant as sparse vectors, along with user metadata. This step populates the database with the necessary data for recommendation generation. 
+ +```python +def data_generator(): + for user in users.itertuples(): + yield models.PointStruct( + id=user.user_id, + vector={ + "ratings": user_sparse_vectors[user.user_id] + }, + payload=user._asdict() + ) +client.upload_points( + "movielens", + data_generator() +) ``` -```csharp -using Qdrant.Client.Grpc; +## Recommendations -var operationInfo = await client.UpsertAsync(collectionName: "test_collection", points: new List -{ - new() - { - Id = 1, - Vectors = new float[] - { - 0.05f, 0.61f, 0.76f, 0.74f - }, - Payload = { - ["city"] = "Berlin" - } - }, - new() - { - Id = 2, - Vectors = new float[] - { - 0.19f, 0.81f, 0.75f, 0.11f - }, - Payload = { - ["city"] = "London" - } - }, - new() - { - Id = 3, - Vectors = new float[] - { - 0.36f, 0.55f, 0.47f, 0.94f - }, - Payload = { - ["city"] = "Moscow" - } - }, - // Truncated -}); +Personal movie ratings are specified, where positive ratings indicate likes and negative ratings indicate dislikes. These ratings serve as the basis for finding similar users with comparable tastes. -Console.WriteLine(operationInfo); +Personal ratings are converted into a sparse vector representation suitable for querying Qdrant. This vector represents the user's preferences across different movies. -``` +Let's try to recommend something for ourselves: -```go -import ( - "context" - "fmt" +``` +1 = Like +-1 = dislike +``` - "github.com/qdrant/go-client/qdrant" -) +```python +# Search with movies[movies.title.str.contains("Matrix", case=False)]. -operationInfo, err := client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "test_collection", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.05, 0.61, 0.76, 0.74), - Payload: qdrant.NewValueMap(map[string]any{"city": "Berlin"}), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectors(0.19, 0.81, 0.75, 0.11), - Payload: qdrant.NewValueMap(map[string]any{"city": "London"}), - }, - { - Id: qdrant.NewIDNum(3), - Vectors: qdrant.NewVectors(0.36, 0.55, 0.47, 0.94), - Payload: qdrant.NewValueMap(map[string]any{"city": "Moscow"}), - }, - // Truncated - }, -}) -if err != nil { - panic(err) +my_ratings = { + 2571: 1, # Matrix + 329: 1, # Star Trek + 260: 1, # Star Wars + 2288: -1, # The Thing + 1: 1, # Toy Story + 1721: -1, # Titanic + 296: -1, # Pulp Fiction + 356: 1, # Forrest Gump + 2116: 1, # Lord of the Rings + 1291: -1, # Indiana Jones + 1036: -1 # Die Hard } -fmt.Println(operationInfo) -``` +inverse_ratings = {k: -v for k, v in my_ratings.items()} -**Response:** +def to_vector(ratings): + vector = models.SparseVector( + values=[], + indices=[] + ) + for movie_id, rating in ratings.items(): + vector.values.append(rating) + vector.indices.append(movie_id) + return vector +``` -pythontypescriptrustjavacsharpgo +Query Qdrant to find users with similar tastes based on the provided personal ratings. The search returns a list of similar users along with their ratings, facilitating collaborative filtering. ```python -operation_id=0 status= - +results = client.query_points( + "movielens", + query=to_vector(my_ratings), + using="ratings", + with_vectors=True, # We will use those to find new movies + limit=20 +).points ``` -```typescript -{ operation_id: 0, status: 'completed' } +Movie scores are computed based on how frequently each movie appears in the ratings of similar users, weighted by their ratings. This step identifies popular movies among users with similar tastes. 
Calculate how frequently each movie is found in similar users' ratings -``` +```python +def results_to_scores(results): + movie_scores = defaultdict(lambda: 0) -```rust -PointsOperationResponse { - result: Some( - UpdateResult { - operation_id: Some( - 0, - ), - status: Completed, - }, - ), - time: 0.00094027, -} + for user in results: + user_scores = user.vector['ratings'] + for idx, rating in zip(user_scores.indices, user_scores.values): + if idx in my_ratings: + continue + movie_scores[idx] += rating + return movie_scores ``` -```java -operation_id: 0 -status: Completed +The top-rated movies are sorted based on their scores and printed as recommendations for the user. These recommendations are tailored to the user's preferences and aligned with their tastes. Sort movies by score and print top five: + +```python +movie_scores = results_to_scores(results) +top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True) +for movie_id, score in top_movies[:5]: + print(movies[movies.movie_id == movie_id].title.values[0], score) ``` -```csharp -{ "operationId": "0", "status": "Completed" } +Result: +```text +Star Wars: Episode V - The Empire Strikes Back (1980) 20.02387858 +Star Wars: Episode VI - Return of the Jedi (1983) 16.443184379999998 +Princess Bride, The (1987) 15.840068229999996 +Raiders of the Lost Ark (1981) 14.94489462 +Sixth Sense, The (1999) 14.570322149999999 ``` -```go -operation_id:0 status:Acknowledged +<|page-176-lllmstxt|> +# Blog-Reading Chatbot with GPT-4o -``` +| Time: 90 min | Level: Advanced |[GitHub](https://github.com/qdrant/examples/blob/langchain-lcel-rag/langchain-lcel-rag/Langchain-LCEL-RAG-Demo.ipynb)| | +|--------------|-----------------|--|----| -## [Anchor](https://qdrant.tech/documentation/quickstart/\#run-a-query) Run a query +In this tutorial, you will build a RAG system that combines blog content ingestion with the capabilities of semantic search. **OpenAI's GPT-4o LLM** is powerful, but scaling its use requires us to supply context systematically. -Let’s ask a basic question - Which of our stored vectors are most similar to the query vector `[0.2, 0.1, 0.9, 0.7]`? +RAG enhances the LLM's generation of answers by retrieving relevant documents to aid the question-answering process. This setup showcases the integration of advanced search and AI language processing to improve information retrieval and generation tasks. -pythontypescriptrustjavacsharpgo +A notebook for this tutorial is available on [GitHub](https://github.com/qdrant/examples/blob/langchain-lcel-rag/langchain-lcel-rag/Langchain-LCEL-RAG-Demo.ipynb). -```python -search_result = client.query_points( - collection_name="test_collection", - query=[0.2, 0.1, 0.9, 0.7], - with_payload=False, - limit=3 -).points +**Data Privacy and Sovereignty:** RAG applications often rely on sensitive or proprietary internal data. Running the entire stack within your own environment becomes crucial for maintaining control over this data. Qdrant Hybrid Cloud deployed on [Scaleway](https://www.scaleway.com/) addresses this need perfectly, offering a secure, scalable platform that still leverages the full potential of RAG. Scaleway offers serverless [Functions](https://www.scaleway.com/en/serverless-functions/) and serverless [Jobs](https://www.scaleway.com/en/serverless-jobs/), both of which are ideal for embedding creation in large-scale RAG cases. 
-print(search_result) +## Components -``` +- **Cloud Host:** [Scaleway on managed Kubernetes](https://www.scaleway.com/en/kubernetes-kapsule/) for compatibility with Qdrant Hybrid Cloud. +- **Vector Database:** Qdrant Hybrid Cloud as the vector search engine for retrieval. +- **LLM:** GPT-4o, developed by OpenAI is utilized as the generator for producing answers. +- **Framework:** [LangChain](https://www.langchain.com/) for extensive RAG capabilities. -```typescript -let searchResult = await client.query( - "test_collection", { - query: [0.2, 0.1, 0.9, 0.7], - limit: 3 -}); +![Architecture diagram](/documentation/examples/rag-chatbot-scaleway/architecture-diagram.png) -console.debug(searchResult.points); +> Langchain [supports a wide range of LLMs](https://python.langchain.com/docs/integrations/chat/), and GPT-4o is used as the main generator in this tutorial. You can easily swap it out for your preferred model that might be launched on your premises to complete the fully private setup. For the sake of simplicity, we used the OpenAI APIs, but LangChain makes the transition seamless. -``` +## Deploying Qdrant Hybrid Cloud on Scaleway -```rust -use qdrant_client::qdrant::QueryPointsBuilder; +[Scaleway Kapsule](https://www.scaleway.com/en/kubernetes-kapsule/) and [Kosmos](https://www.scaleway.com/en/kubernetes-kosmos/) are managed Kubernetes services from [Scaleway](https://www.scaleway.com/en/). They abstract away the complexities of managing and operating a Kubernetes cluster. The primary difference being, Kapsule clusters are composed solely of Scaleway Instances. Whereas, a Kosmos cluster is a managed multi-cloud Kubernetes engine that allows you to connect instances from any cloud provider to a single managed Control-Plane. -let search_result = client - .query( - QueryPointsBuilder::new("test_collection") - .query(vec![0.2, 0.1, 0.9, 0.7]) - ) - .await?; +1. To start using managed Kubernetes on Scaleway, follow the [platform-specific documentation](/documentation/hybrid-cloud/platform-deployment-options/#scaleway). +2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](/documentation/hybrid-cloud/). -dbg!(search_result); +## Prerequisites -``` +To prepare the environment for working with Qdrant and related libraries, it's necessary to install all required Python packages. This can be done using Poetry, a tool for dependency management and packaging in Python. The code snippet imports various libraries essential for the tasks ahead, including `bs4` for parsing HTML and XML documents, `langchain` and its community extensions for working with language models and document loaders, and `Qdrant` for vector storage and retrieval. These imports lay the groundwork for utilizing Qdrant alongside other tools for natural language processing and machine learning tasks. -```java -import java.util.List; +Qdrant will be running on a specific URL and access will be restricted by the API key. Make sure to store them both as environment variables as well: -import io.qdrant.client.grpc.Points.ScoredPoint; -import io.qdrant.client.grpc.Points.QueryPoints; +```shell +export QDRANT_URL="https://qdrant.example.com" +export QDRANT_API_KEY="your-api-key" +``` -import static io.qdrant.client.QueryFactory.nearest; +*Optional:* Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). 
-List searchResult = - client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("test_collection") - .setLimit(3) - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .build()).get(); +```shell +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY="your-api-key" +export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" +``` -System.out.println(searchResult); +Now you can get started: + +```python +import getpass +import os +import bs4 +from langchain import hub +from langchain_community.document_loaders import WebBaseLoader +from langchain_qdrant import Qdrant +from langchain_core.output_parsers import StrOutputParser +from langchain_core.runnables import RunnablePassthrough +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from langchain_text_splitters import RecursiveCharacterTextSplitter ``` -```csharp -var searchResult = await client.QueryAsync( - collectionName: "test_collection", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - limit: 3, -); +Set up the OpenAI API key: -Console.WriteLine(searchResult); +```python +os.environ["OPENAI_API_KEY"] = getpass.getpass() +``` +Initialize the language model: + +```python +llm = ChatOpenAI(model="gpt-4o") ``` -```go -import ( - "context" - "fmt" +It is here that we configure both the Embeddings and LLM. You can replace this with your own models using Ollama or other services. Scaleway has some great [L4 GPU Instances](https://www.scaleway.com/en/l4-gpu-instance/) you can use for compute here. - "github.com/qdrant/go-client/qdrant" -) +## Download and parse data -searchResult, err := client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "test_collection", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), -}) -if err != nil { - panic(err) -} +To begin working with blog post contents, the process involves loading and parsing the HTML content. This is achieved using `urllib` and `BeautifulSoup`, which are tools designed for such tasks. After the content is loaded and parsed, it is indexed using Qdrant, a powerful tool for managing and querying vector data. The code snippet demonstrates how to load, chunk, and index the contents of a blog post by specifying the URL of the blog and the specific HTML elements to parse. This step is crucial for preparing the data for further processing and analysis with Qdrant. -fmt.Println(searchResult) +```python +# Load, chunk and index the contents of the blog. +loader = WebBaseLoader( + web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), + bs_kwargs=dict( + parse_only=bs4.SoupStrainer( + class_=("post-content", "post-title", "post-header") + ) + ), +) +docs = loader.load() ``` -**Response:** +### Chunking data -```json -[\ - {\ - "id": 4,\ - "version": 0,\ - "score": 1.362,\ - "payload": null,\ - "vector": null\ - },\ - {\ - "id": 1,\ - "version": 0,\ - "score": 1.273,\ - "payload": null,\ - "vector": null\ - },\ - {\ - "id": 3,\ - "version": 0,\ - "score": 1.208,\ - "payload": null,\ - "vector": null\ - }\ -] +When dealing with large documents, such as a blog post exceeding 42,000 characters, it's crucial to manage the data efficiently for processing. Many models have a limited context window and struggle with long inputs, making it difficult to extract or find relevant information. To overcome this, the document is divided into smaller chunks. This approach enhances the model's ability to process and retrieve the most pertinent sections of the document effectively. 
In this scenario, the document is split into chunks using the `RecursiveCharacterTextSplitter` with a specified chunk size and overlap. This method ensures that no critical information is lost between chunks. Following the splitting, these chunks are then indexed into Qdrant, a vector database for efficient similarity search and storage of embeddings. The `Qdrant.from_documents` function is utilized for indexing, with documents being the split chunks and embeddings generated through `OpenAIEmbeddings`. The indexed chunks are stored in the Qdrant Hybrid Cloud cluster configured earlier, in a collection named "lilianweng" for reference.

This chunking and indexing strategy significantly improves the management and retrieval of information from large documents, making it a practical solution for handling extensive texts in data processing workflows.

```python
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Qdrant.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    collection_name="lilianweng",
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)
```

## Retrieve and generate content

The `vectorstore` is used as a retriever to fetch relevant documents based on vector similarity. The `hub.pull("rlm/rag-prompt")` function is used to pull a specific prompt from a repository, which is designed to work with retrieved documents and a question to generate a response.

The `format_docs` function formats the retrieved documents into a single string, preparing them for further processing. This formatted string, along with a question, is passed through a chain of operations. Firstly, the context (formatted documents) and the question are processed by the retriever and the prompt. Then, the result is fed into a large language model (`llm`) for content generation. Finally, the output is parsed into a string format using `StrOutputParser()`.

This chain of operations demonstrates a sophisticated approach to information retrieval and content generation, leveraging both the semantic understanding capabilities of vector search and the generative prowess of large language models.
-pythontypescriptrustjavacsharpgo +Now, retrieve and generate data using relevant snippets from the blogL ```python -from qdrant_client.models import Filter, FieldCondition, MatchValue +retriever = vectorstore.as_retriever() +prompt = hub.pull("rlm/rag-prompt") -search_result = client.query_points( - collection_name="test_collection", - query=[0.2, 0.1, 0.9, 0.7], - query_filter=Filter( - must=[FieldCondition(key="city", match=MatchValue(value="London"))] - ), - with_payload=True, - limit=3, -).points -print(search_result) +def format_docs(docs): + return "\n\n".join(doc.page_content for doc in docs) -``` -```typescript -searchResult = await client.query("test_collection", { - query: [0.2, 0.1, 0.9, 0.7], - filter: { - must: [{ key: "city", match: { value: "London" } }], - }, - with_payload: true, - limit: 3, -}); +rag_chain = ( + {"context": retriever | format_docs, "question": RunnablePassthrough()} + | prompt + | llm + | StrOutputParser() +) +``` -console.debug(searchResult); +### Invoking the RAG Chain +```python +rag_chain.invoke("What is Task Decomposition?") ``` -```rust -use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder}; +## Next steps: +We built a solid foundation for a simple chatbot, but there is still a lot to do. If you want to make the +system production-ready, you should consider implementing the mechanism into your existing stack. We recommend -let search_result = client - .query( - QueryPointsBuilder::new("test_collection") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .filter(Filter::must([Condition::matches(\ - "city",\ - "London".to_string(),\ - )])) - .with_payload(true), - ) - .await?; +Our vector database can easily be hosted on [Scaleway](https://www.scaleway.com/), our trusted [Qdrant Hybrid Cloud](/documentation/hybrid-cloud/) partner. This means that Qdrant can be run from your Scaleway region, but the database itself can still be managed from within Qdrant Cloud's interface. Both products have been tested for compatibility and scalability, and we recommend their [managed Kubernetes](https://www.scaleway.com/en/kubernetes-kapsule/) service. +Their French deployment regions e.g. France are excellent for network latency and data sovereignty. For hosted GPUs, try [rendering with L4 GPU instances](https://www.scaleway.com/en/l4-gpu-instance/). -dbg!(search_result); +If you have any questions, feel free to ask on our [Discord community](https://qdrant.to/discord). -``` +<|page-177-lllmstxt|> +# Accessing Qdrant Cloud Clusters -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +Once you [created](/documentation/cloud/create-cluster/) a cluster, and set up an [API key](/documentation/cloud/authentication/), you can access your cluster through the integrated Cluster UI, the REST API and the GRPC API. -List searchResult = - client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("test_collection") - .setLimit(3) - .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London"))) - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setWithPayload(enable(true)) - .build()).get(); +## Cluster UI -System.out.println(searchResult); +You can access your [Cluster UI](/documentation/web-ui/) via the Cluster Details page in the Qdrant Cloud Console. Authentication to a cluster is automatic if your cloud user has the [`read:cluster_data` or `write:cluster_data` permission](/documentation/cloud-rbac/permission-reference/). 
Without the correct permissions you will be prompted to enter an [API Key](/documentation/cloud/authentication/) to access the cluster. -``` +![Cluster Cluster UI](/documentation/cloud/cloud-db-dashboard.png) -```csharp -using static Qdrant.Client.Grpc.Conditions; +The Overview tab also contains direct links to explore Qdrant tutorials and sample datasets. -var searchResult = await client.QueryAsync( - collectionName: "test_collection", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - filter: MatchKeyword("city", "London"), - limit: 3, - payloadSelector: true -); +![Cluster Cluster UI Tutorials](/documentation/cloud/cloud-db-deeplinks.png) -Console.WriteLine(searchResult); +## API -``` +The REST API is exposed on your cluster endpoint at port `6333`. The GRPC API is exposed on your cluster endpoint at port `6334`. When accessing the cluster endpoint, traffic is automatically load balanced across all healthy Qdrant nodes in the cluster. For all operations, but the few mentioned at [Node specific endpoints](#node-specific-endpoints), you should use the cluster endpoint. It does not matter which node in the cluster you land on. All nodes can handle all search and write requests. -```go -import ( - "context" - "fmt" +![Cluster cluster endpoint](/documentation/cloud/cloud-endpoint.png) - "github.com/qdrant/go-client/qdrant" -) +Have a look at the [API reference](/documentation/interfaces/#api-reference) and the official [client libraries](/documentation/interfaces/#client-libraries) for more information on how to interact with the Qdrant Cloud API. -searchResult, err := client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "test_collection", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, - }, - WithPayload: qdrant.NewWithPayload(true), -}) -if err != nil { - panic(err) -} +## Node Specific Endpoints -fmt.Println(searchResult) +Next to the cluster endpoint which loadbalances requests across all healthy Qdrant nodes, each node in the cluster has its own endpoint as well. This is mainly usefull for monitoring or manual shard management purpuses. -``` +You can finde the node specific endpoints on the cluster detail page in the Qdrant Cloud Console. -**Response:** +![Cluster node endpoints](/documentation/cloud/cloud-node-endpoints.png) -```json -[\ - {\ - "id": 2,\ - "version": 0,\ - "score": 0.871,\ - "payload": {\ - "city": "London"\ - },\ - "vector": null\ - }\ -] +<|page-178-lllmstxt|> +# Qdrant Cloud Support and Troubleshooting -``` +## Community Support -You have just conducted vector search. You loaded vectors into a database and queried the database with a vector of your own. Qdrant found the closest results and presented you with a similarity score. +All Qdrant Cloud users are welcome to join our [Discord community](https://qdrant.to/discord/). -## [Anchor](https://qdrant.tech/documentation/quickstart/\#next-steps) Next steps +![Discord](/documentation/cloud/discord.png) -Now you know how Qdrant works. Getting started with [Qdrant Cloud](https://qdrant.tech/documentation/cloud/quickstart-cloud/) is just as easy. [Create an account](https://qdrant.to/cloud) and use our SaaS completely free. We will take care of infrastructure maintenance and software updates. 
+## Qdrant Cloud Support -To move onto some more complex examples of vector search, read our [Tutorials](https://qdrant.tech/documentation/tutorials/) and create your own app with the help of our [Examples](https://qdrant.tech/documentation/examples/). +Paying customers have access to our Support team. Links to the support portal are available in the Qdrant Cloud Console. -**Note:** There is another way of running Qdrant locally. If you are a Python developer, we recommend that you try Local Mode in [Qdrant Client](https://github.com/qdrant/qdrant-client), as it only takes a few moments to get setup. +![Support Portal](/documentation/cloud/support-portal.png) -##### Was this page useful? +Support is handled via **Jira Service Management (JSM)**. When creating a support ticket, you will be asked to select a request type and provide information to help us understand and prioritize your issue. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Request Type -Thank you for your feedback! 🙏 +The form allows you to specify what your ticket is about: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/quickstart.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +- **Information** – Questions, analysis, or troubleshooting +- **Incidents** – Reporting bugs or disruptions in service +- **Billing** – Issues or questions related to charges, invoices, or payment +- **Features** – Suggestions or requests for product enhancements -On this page: +### Ticket Information -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/quickstart.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Please provide as much detail as possible when submitting your request. This helps us help you faster. -× +This includes but is not limited to: -[Powered by](https://qdrant.tech/) +- The ID of your Qdrant Cloud cluster (auto-filled if possible) +- Which collection(s) are affected +- Code examples showing how you're interacting with the Qdrant API +- Logs or error messages from your application +- Relevant telemetry or traces from your system -<|page-103-lllmstxt|> -## metric-learning-tips -- [Articles](https://qdrant.tech/articles/) -- Metric Learning Tips & Tricks +If you're submitting a ticket for a **Hybrid Cloud** or **Private Cloud** environment, we may also ask for: -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +- Logs from Qdrant components (database, operator) +- Kubernetes environment state (e.g., node/resource usage, pod status) -# Metric Learning Tips & Tricks +To streamline this, we recommend using our [support bundle script](https://github.com/qdrant/qdrant-cloud-support-tools/tree/main/support-bundle). It collects environment metadata (but **no user data or API keys**) and helps us troubleshoot more efficiently. The bundle includes collection names and configuration details. -Andrei Vasnetsov +Please attach it to your ticket if applicable. -· +### Priority & SLA -May 15, 2021 +You will also be asked to select a **severity level**, which determines how your ticket is prioritized. The severity should reflect the impact on your system or customers. 
-![Metric Learning Tips & Tricks](https://qdrant.tech/articles_data/metric-learning-tips/preview/title.jpg) +- **Severity 1** – Critical impact: full service outage or data loss +- **Severity 2** – Major impact: degraded performance or partial outage +- **Severity 3** – Moderate impact: bugs with workarounds or degraded UX +- **Severity 4** – Minor issues: cosmetic bugs, general questions -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#how-to-train-object-matching-model-with-no-labeled-data-and-use-it-in-production) How to train object matching model with no labeled data and use it in production +> Please refer to the [Qdrant Cloud SLA](https://qdrant.to/sla/) for full definitions of severity levels and guaranteed response times per your [support tier](/documentation/cloud/premium/). -Currently, most machine-learning-related business cases are solved as a classification problems. -Classification algorithms are so well studied in practice that even if the original problem is not directly a classification task, it is usually decomposed or approximately converted into one. +<|page-179-lllmstxt|> +# Using Cloud Inference with Qdrant for Vector Search +In this tutorial, we'll walkthrough building a **hybrid semantic search engine** using Qdrant Cloud's built-in [inference](/documentation/cloud/inference/) capabilities. You'll learn how to: +- Automatically embed your data using [cloud Inference](/documentation/cloud/inference/) without needing to run local models, +- Combine dense semantic embeddings with [sparse BM25 keywords](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/), and +- Perform hybrid search using [Reciprocal Rank Fusion (RRF)](https://qdrant.tech/documentation/concepts/hybrid-queries/) to retrieve the most relevant results. +## Install Qdrant Client +```bash +pip install qdrant-client datasets +``` +## Initialize the Client +Initialize the Qdrant client after creating a [Qdrant Cloud account](/documentation/cloud/) and a [dedicated paid cluster](/documentation/cloud/create-cluster/). Set `cloud_inference` to `True` to enable [cloud inference](/documentation/cloud/inference/). -However, despite its simplicity, the classification task has requirements that could complicate its production integration and scaling. -E.g. it requires a fixed number of classes, where each class should have a sufficient number of training samples. +```python +client = QdrantClient( + url="https://YOUR_URL.eastus-0.azure.cloud.qdrant.io:6333/", + api_key="YOUR_API_KEY", + cloud_inference=True, + timeout=30.0 +) +``` -In this article, I will describe how we overcome these limitations by switching to metric learning. -By the example of matching job positions and candidates, I will show how to train metric learning model with no manually labeled data, how to estimate prediction confidence, and how to serve metric learning in production. +## Create a Collection +Qdrant stores vectors and associated metadata in collections. A collection requires vector parameters to be set during creation. In this case, let's set up a collection using `BM25` for sparse vectors and `all-minilm-l6-v2` for dense vectors. BM25 uses the Inverse Document Frequency to reduce the weight of common terms that appear in many documents while boosting the importance of rare terms that are more discriminative for retrieval. Qdrant will handle the calculations of the IDF term if we enable that in the configuration of the `bm25_sparse_vector` named sparse vector. 
-## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#what-is-metric-learning-and-why-using-it) What is metric learning and why using it? +```python +from qdrant_client import models -According to Wikipedia, metric learning is the task of learning a distance function over objects. -In practice, it means that we can train a model that tells a number for any pair of given objects. -And this number should represent a degree or score of similarity between those given objects. -For example, objects with a score of 0.9 could be more similar than objects with a score of 0.5 -Actual scores and their direction could vary among different implementations. +collection_name = "my_collection_name" -In practice, there are two main approaches to metric learning and two corresponding types of NN architectures. -The first is the interaction-based approach, which first builds local interactions (i.e., local matching signals) between two objects. Deep neural networks learn hierarchical interaction patterns for matching. -Examples of neural network architectures include MV-LSTM, ARC-II, and MatchPyramid. +if not client.collection_exists(collection_name=collection_name): + client.create_collection( + collection_name=collection_name, + vectors_config={ + "dense_vector": models.VectorParams( + size=384, + distance=models.Distance.COSINE + ) + }, + sparse_vectors_config={ + "bm25_sparse_vector": models.SparseVectorParams( + modifier=models.Modifier.IDF # Enable Inverse Document Frequency + ) + } + ) +``` -![MV-LSTM, example of interaction-based model](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/mv_lstm.png) +## Add Data +Now you can add sample documents, their associated metadata, and a point id for each. Here's a sample of the [miriad/miriad-4.4M](https://huggingface.co/datasets/miriad/miriad-4.4M) dataset: -> MV-LSTM, example of interaction-based model, [Shengxian Wan et al.](https://www.researchgate.net/figure/Illustration-of-MV-LSTM-S-X-and-S-Y-are-the-in_fig1_285271115) via Researchgate +| qa_id | paper_id | question | year | venue | specialty | passage_text | +|--------------------|----------|-------------------------------------------------------|------|--------------------------------------|--------------|--------------------------------------------------------| +| 38_77498699_0_1 | 77498699 | What are the clinical features of relapsing polychondritis? | 2006 | Internet Journal of Otorhinolaryngology | Rheumatology | A 45-year-old man presented with painful swelling... | +| 38_77498699_0_2 | 77498699 | What treatments are available for relapsing polychondritis? | 2006 | Internet Journal of Otorhinolaryngology | Rheumatology | Patient showed improvement after treatment with... | +| 38_88124321_0_3 | 88124321 | How is Takayasu arteritis diagnosed? | 2015 | Journal of Autoimmune Diseases | Rheumatology | A 32-year-old woman with fatigue and limb pain... | -The second is the representation-based approach. -In this case distance function is composed of 2 components: -the Encoder transforms an object into embedded representation - usually a large float point vector, and the Comparator takes embeddings of a pair of objects from the Encoder and calculates their similarity. -The most well-known example of this embedding representation is Word2Vec. 
+We won't ingest all the entries from the dataset, but for demo purposes, just take the first hundred ones: -Examples of neural network architectures also include DSSM, C-DSSM, and ARC-I. +```python +from qdrant_client.http.models import PointStruct, Document +from datasets import load_dataset +import uuid -The Comparator is usually a very simple function that could be calculated very quickly. -It might be cosine similarity or even a dot production. -Two-stage schema allows performing complex calculations only once per object. -Once transformed, the Comparator can calculate object similarity independent of the Encoder much more quickly. -For more convenience, embeddings can be placed into specialized storages or vector search engines. -These search engines allow to manage embeddings using API, perform searches and other operations with vectors. +dense_model = "sentence-transformers/all-minilm-l6-v2" -![C-DSSM, example of representation-based model](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/cdssm.png) +bm25_model = "qdrant/bm25" -> C-DSSM, example of representation-based model, [Xue Li et al.](https://arxiv.org/abs/1901.10710v2) via arXiv +ds = load_dataset("miriad/miriad-4.4M", split="train[0:100]") -Pre-trained NNs can also be used. The output of the second-to-last layer could work as an embedded representation. -Further in this article, I would focus on the representation-based approach, as it proved to be more flexible and fast. +points = [] -So what are the advantages of using metric learning comparing to classification? -Object Encoder does not assume the number of classes. -So if you can’t split your object into classes, -if the number of classes is too high, or you suspect that it could grow in the future - consider using metric learning. +for idx, item in enumerate(ds): + passage = item["passage_text"] + + point = PointStruct( + id=uuid.uuid4().hex, # use unique string ID + payload=item, + vector={ + "dense_vector": Document( + text=passage, + model=dense_model + ), + "bm25_sparse_vector": Document( + text=passage, + model=bm25_model + ) + } + ) + points.append(point) -In our case, business goal was to find suitable vacancies for candidates who specify the title of the desired position. -To solve this, we used to apply a classifier to determine the job category of the vacancy and the candidate. -But this solution was limited to only a few hundred categories. -Candidates were complaining that they couldn’t find the right category for them. -Training the classifier for new categories would be too long and require new training data for each new category. -Switching to metric learning allowed us to overcome these limitations, the resulting solution could compare any pair position descriptions, even if we don’t have this category reference yet. +client.upload_points( + collection_name=collection_name, + points=points, + batch_size=8 +) +``` +## Set Up Input Query +Create a sample query: -![T-SNE with job samples](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/embeddings.png) +```python +query_text = "What is relapsing polychondritis?" +``` -> T-SNE with job samples, Image by Author. Play with [Embedding Projector](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/generall/7e712425e3b340c2c4dbc1a29f515d91/raw/b45b2b6f6c1d5ab3d3363c50805f3834a85c8879/config.json) yourself. 
+## Run Vector Search +Here, you will ask a question that will allow you to retrieve semantically relevant results. The final results are obtained by reranking using [Reciprocal Rank Fusion](https://qdrant.tech/documentation/concepts/hybrid-queries/#hybrid-search). -With metric learning, we learn not a concrete job type but how to match job descriptions from a candidate’s CV and a vacancy. -Secondly, with metric learning, it is easy to add more reference occupations without model retraining. -We can then add the reference to a vector search engine. -Next time we will match occupations - this new reference vector will be searchable. +```python +results = client.query_points( + collection_name=collection_name, + prefetch=[ + models.Prefetch( + query=Document( + text=query_text, + model=dense_model + ), + using="dense_vector", + limit=5 + ), + models.Prefetch( + query=Document( + text=query_text, + model=bm25_model + ), + using="bm25_sparse_vector", + limit=5 + ) + ], + query=models.FusionQuery(fusion=models.Fusion.RRF), + limit=5, + with_payload=True +) -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#data-for-metric-learning) Data for metric learning +print(results.points) +``` -Unlike classifiers, a metric learning training does not require specific class labels. -All that is required are examples of similar and dissimilar objects. -We would call them positive and negative samples. +The semantic search engine will retrieve the most similar result in order of relevance. +```markdown +[ScoredPoint(id='9968a760-fbb5-4d91-8549-ffbaeb3ebdba', +version=0, score=14.545895, +payload={'text': "Relapsing Polychondritis is a rare..."}, +vector=None, shard_key=None, order_value=None)] +``` -At the same time, it could be a relative similarity between a pair of objects. -For example, twins look more alike to each other than a pair of random people. -And random people are more similar to each other than a man and a cat. -A model can use such relative examples for learning. +<|page-180-lllmstxt|> +# Monitoring Hybrid/Private Cloud with Prometheus and Grafana -The good news is that the division into classes is only a special case of determining similarity. -To use such datasets, it is enough to declare samples from one class as positive and samples from another class as negative. -In this way, it is possible to combine several datasets with mismatched classes into one generalized dataset for metric learning. +This tutorial will guide you through the process of setting up Prometheus and Grafana to monitor Qdrant databases in Kubernetes cluster used for Hybrid or Private Cloud. -But not only datasets with division into classes are suitable for extracting positive and negative examples. -If, for example, there are additional features in the description of the object, the value of these features can also be used as a similarity factor. -It may not be as explicit as class membership, but the relative similarity is also suitable for learning. +## Prerequisites -In the case of job descriptions, there are many ontologies of occupations, which were able to be combined into a single dataset thanks to this approach. -We even went a step further and used identical job titles to find similar descriptions. +This tutorial assumes that you already have a Kubernetes cluster running and a Qdrant database deployed in it, using either a Hybrid Cloud or Private Cloud deployment. You should also have `kubectl` and `helm` configured to interact with your cluster. 
-As a result, we got a self-supervised universal dataset that did not require any manual labeling. +## Step 1: Install Prometheus and Grafana -Unfortunately, universality does not allow some techniques to be applied in training. -Next, I will describe how to overcome this disadvantage. +If you haven't installed Prometheus and Grafana yet, you can use the [kube-prometheus-stack](https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack) Helm chart to deploy them in your Kubernetes cluster. -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#training-the-model) Training the model +A minimal example of installing the stack: + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -There are several ways to train a metric learning model. -Among the most popular is the use of Triplet or Contrastive loss functions, but I will not go deep into them in this article. -However, I will tell you about one interesting trick that helped us work with unified training examples. +helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace +``` -One of the most important practices to efficiently train the metric learning model is hard negative mining. -This technique aims to include negative samples on which model gave worse predictions during the last training epoch. -Most articles that describe this technique assume that training data consists of many small classes (in most cases it is people’s faces). -With data like this, it is easy to find bad samples - if two samples from different classes have a high similarity score, we can use it as a negative sample. -But we had no such classes in our data, the only thing we have is occupation pairs assumed to be similar in some way. -We cannot guarantee that there is no better match for each job occupation among this pair. -That is why we can’t use hard negative mining for our model. +This command will install Prometheus, Grafana, and all necessary components into a new `monitoring` namespace. -![Loss variations](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/losses.png) +## Step 2: Configure Prometheus to Scrape Qdrant Metrics -> [Alfonso Medela et al.](https://arxiv.org/abs/1905.10675) via arXiv +To monitor Qdrant, you need to configure Prometheus to scrape metrics from the Qdrant database(s). You can do this by creating a `ServiceMonitor` resource in the host Kubernetes cluster. -To compensate for this limitation we can try to increase the number of random (weak) negative samples. -One way to achieve this is to train the model longer, so it will see more samples by the end of the training. -But we found a better solution in adjusting our loss function. -In a regular implementation of Triplet or Contractive loss, each positive pair is compared with some or a few negative samples. -What we did is we allow pair comparison amongst the whole batch. -That means that loss-function penalizes all pairs of random objects if its score exceeds any of the positive scores in a batch. -This extension gives `~ N * B^2` comparisons where `B` is a size of batch and `N` is a number of batches. -Much bigger than `~ N * B` in regular triplet loss. -This means that increasing the size of the batch significantly increases the number of negative comparisons, and therefore should improve the model performance. -We were able to observe this dependence in our experiments. 
-Similar idea we also found in the article [Supervised Contrastive Learning](https://arxiv.org/abs/2004.11362).
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: qdrant-cluster-exporter
+  namespace: qdrant
+  labels:
+    release: prometheus
+spec:
+  endpoints:
+  - honorLabels: true
+    interval: 60s
+    port: metrics
+    scheme: http
+    scrapeTimeout: 55s
+  jobLabel: app.kubernetes.io/name
+  namespaceSelector:
+    matchNames:
+      - qdrant
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: qdrant-cluster-exporter
+      app.kubernetes.io/name: qdrant-cluster-exporter
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: qdrant-operator
+  namespace: qdrant
+  labels:
+    release: prometheus
+spec:
+  endpoints:
+  - honorLabels: true
+    interval: 60s
+    port: metrics
+    scheme: http
+    scrapeTimeout: 55s
+  jobLabel: app.kubernetes.io/name
+  namespaceSelector:
+    matchNames:
+      - qdrant
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: operator
+```
-## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#model-confidence) Model confidence
+The example above assumes that your Qdrant database and the cloud platform exporter are deployed in the `qdrant` namespace. Adjust the `namespaceSelector` and `namespace` fields according to your deployment.
-In real life it is often needed to know how confident the model was in the prediction.
-Whether manual adjustment or validation of the result is required.
+## Step 3: Access Grafana
-With conventional classification, it is easy to understand by scores how confident the model is in the result.
-If the probability values of different classes are close to each other, the model is not confident.
-If, on the contrary, the most probable class differs greatly, then the model is confident.
+Once Prometheus is configured to scrape metrics from Qdrant, you can access Grafana to visualize the metrics.
-At first glance, this cannot be applied to metric learning.
-Even if the predicted object similarity score is small it might only mean that the reference set has no proper objects to compare with.
-Conversely, the model can group garbage objects with a large score.
+Get the Grafana 'admin' user password by running:
+
+```bash
+kubectl --namespace monitoring get secrets prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 -d ; echo
+```
+
+Access the Grafana dashboard by port-forwarding:
+
+```bash
+export POD_NAME=$(kubectl --namespace monitoring get pod -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=prometheus" -oname)
+kubectl --namespace monitoring port-forward $POD_NAME 3000
+```
-Fortunately, we found a small modification to the embedding generator, which allows us to define confidence in the same way as it is done in conventional classifiers with a Softmax activation function.
-The modification consists in building an embedding as a combination of feature groups.
-Each feature group is presented as a one-hot encoded sub-vector in the embedding.
-If the model can confidently predict the feature value - the corresponding sub-vector will have a high absolute value in some of its elements.
-For a more intuitive understanding, I recommend thinking about embeddings not as points in space, but as a set of binary features.
+Now you can open your web browser and go to `http://localhost:3000`. Log in with the username `admin` and the password you retrieved earlier.
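+To confirm that Prometheus actually picked up the `ServiceMonitor` resources, you can also list its active scrape targets. A minimal sketch, assuming you have port-forwarded the Prometheus server to `localhost:9090`; it queries the standard Prometheus HTTP API:
+
+```python
+import requests
+
+# Assumes the Prometheus server is reachable on localhost:9090 via port-forward
+targets = requests.get("http://localhost:9090/api/v1/targets").json()
+
+for target in targets["data"]["activeTargets"]:
+    print(target["labels"].get("job"), target["health"], target["scrapeUrl"])
+```
+
+Look for the exporter and operator jobs defined above and check that their health is `up`.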
-To implement this modification and form proper feature groups we would need to change a regular linear output layer to a concatenation of several Softmax layers. -Each softmax component would represent an independent feature and force the neural network to learn them. +## Step 4: Import Qdrant Dashboard -Let’s take for example that we have 4 softmax components with 128 elements each. -Every such component could be roughly imagined as a one-hot-encoded number in the range of 0 to 127. -Thus, the resulting vector will represent one of `128^4` possible combinations. -If the trained model is good enough, you can even try to interpret the values of singular features individually. +Qdrant Cloud offers an example Grafana Dashboard on the [Qdrant GitHub repository](https://github.com/qdrant/qdrant-cloud-grafana-dashboard). This comes with built in views and graphs to get you started with monitoring your Qdrant Clusters. -![Softmax feature embeddings](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/feature_embedding.png) +To import the dashboard: -> Softmax feature embeddings, Image by Author. +1. In Grafana, go to "Dashboards" and click on "New" -> "Import". +2. Copy and paste the dashboard JSON from the Qdrant GitHub repository. +3. Click "Load" and then "Import". -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#neural-rules) Neural rules +<|page-181-lllmstxt|> +# Qdrant on Databricks -Machine learning models rarely train to 100% accuracy. -In a conventional classifier, errors can only be eliminated by modifying and repeating the training process. -Metric training, however, is more flexible in this matter and allows you to introduce additional steps that allow you to correct the errors of an already trained model. +| Time: 30 min | Level: Intermediate | [Complete Notebook](https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/4750876096379825/93425612168199/6949977306828869/latest.html) | +| ------------ | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -A common error of the metric learning model is erroneously declaring objects close although in reality they are not. -To correct this kind of error, we introduce exclusion rules. +[Databricks](https://www.databricks.com/) is a unified analytics platform for working with big data and AI. It's built around Apache Spark, a powerful open-source distributed computing system well-suited for processing large-scale datasets and performing complex analytics tasks. -Rules consist of 2 object anchors encoded into vector space. -If the target object falls into one of the anchors’ effects area - it triggers the rule. It will exclude all objects in the second anchor area from the prediction result. +Apache Spark is designed to scale horizontally, meaning it can handle expensive operations like generating vector embeddings by distributing computation across a cluster of machines. This scalability is crucial when dealing with large datasets. -![Exclusion rules](https://gist.githubusercontent.com/generall/4821e3c6b5eee603d56729e7a156e461/raw/b0eb4ea5d088fe1095e529eb12708ac69f304ce3/exclusion_rule.png) +In this example, we will demonstrate how to vectorize a dataset with dense and sparse embeddings using Qdrant's [FastEmbed](https://qdrant.github.io/fastembed/) library. 
We will then load this vectorized data into a Qdrant cluster using the [Qdrant Spark connector](/documentation/frameworks/spark/) on Databricks. -> Neural exclusion rules, Image by Author. +### Setting up a Databricks project -The convenience of working with embeddings is that regardless of the number of rules, -you only need to perform the encoding once per object. -Then to find a suitable rule, it is enough to compare the target object’s embedding and the pre-calculated embeddings of the rule’s anchors. -Which, when implemented, translates into just one additional query to the vector search engine. +- Set up a **[Databricks cluster](https://docs.databricks.com/en/compute/configure.html)** following the official documentation guidelines. -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#vector-search-in-production) Vector search in production +- Install the **[Qdrant Spark connector](/documentation/frameworks/spark/)** as a library: + - Navigate to the `Libraries` section in your cluster dashboard. + - Click on `Install New` at the top-right to open the library installation modal. + - Search for `io.qdrant:spark:VERSION` in the Maven packages and click on `Install`. -When implementing a metric learning model in production, the question arises about the storage and management of vectors. -It should be easy to add new vectors if new job descriptions appear in the service. + ![Install the library](/documentation/examples/databricks/library-install.png) -In our case, we also needed to apply additional conditions to the search. -We needed to filter, for example, the location of candidates and the level of language proficiency. +- Create a new **[Databricks notebook](https://docs.databricks.com/en/notebooks/index.html)** on your cluster to begin working with your data and libraries. -We did not find a ready-made tool for such vector management, so we created [Qdrant](https://github.com/qdrant/qdrant) \- open-source vector search engine. +### Download a dataset -It allows you to add and delete vectors with a simple API, independent of a programming language you are using. -You can also assign the payload to vectors. -This payload allows additional filtering during the search request. +- **Install the required dependencies:** -Qdrant has a pre-built docker image and start working with it is just as simple as running +```python +%pip install fastembed datasets +``` -```bash -docker run -p 6333:6333 qdrant/qdrant +- **Download the dataset:** + +```python +from datasets import load_dataset +dataset_name = "tasksource/med" +dataset = load_dataset(dataset_name, split="train") +# We'll use the first 100 entries from this dataset and exclude some unused columns. +dataset = dataset.select(range(100)).remove_columns(["gold_label", "genre"]) ``` -Documentation with examples could be found [here](https://api.qdrant.tech/api-reference). +- **Convert the dataset into a Spark dataframe:** + +```python +dataset.to_parquet("/dbfs/pq.pq") +dataset_df = spark.read.parquet("file:/dbfs/pq.pq") +``` -## [Anchor](https://qdrant.tech/articles/metric-learning-tips/\#conclusion) Conclusion +### Vectorizing the data -In this article, I have shown how metric learning can be more scalable and flexible than the classification models. -I suggest trying similar approaches in your tasks - it might be matching similar texts, images, or audio data. -With the existing variety of pre-trained neural networks and a vector search engine, it is easy to build your metric learning-based application. 
+In this section, we'll be generating both dense and sparse vectors for our rows using [FastEmbed](https://qdrant.github.io/fastembed/). We'll create a user-defined function (UDF) to handle this step. -##### Was this page useful? +#### Creating the vectorization function -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +from fastembed import TextEmbedding, SparseTextEmbedding -Thank you for your feedback! 🙏 +def vectorize(partition_data): + # Initialize dense and sparse models + dense_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5") + sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25") -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/metric-learning-tips.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + for row in partition_data: + # Generate dense and sparse vectors + dense_vector = next(dense_model.embed(row.sentence1)) + sparse_vector = next(sparse_model.embed(row.sentence2)) -On this page: + yield [ + row.sentence1, # 1st column: original text + row.sentence2, # 2nd column: original text + dense_vector.tolist(), # 3rd column: dense vector + sparse_vector.indices.tolist(), # 4th column: sparse vector indices + sparse_vector.values.tolist(), # 5th column: sparse vector values + ] +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/metric-learning-tips.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +We're using the [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model for dense embeddings and [BM25](https://huggingface.co/Qdrant/bm25) for sparse embeddings. -× +#### Applying the UDF on our dataframe -[Powered by](https://qdrant.tech/) +Next, let's apply our `vectorize` UDF on our Spark dataframe to generate embeddings. -<|page-104-lllmstxt|> -## filtrable-hnsw -- [Articles](https://qdrant.tech/articles/) -- Filtrable HNSW +```python +embeddings = dataset_df.rdd.mapPartitions(vectorize) +``` -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +The `mapPartitions()` method returns a [Resilient Distributed Dataset (RDD)](https://www.databricks.com/glossary/what-is-rdd) which should then be converted back to a Spark dataframe. -# Filtrable HNSW +#### Building the new Spark dataframe with the vectorized data -Andrei Vasnetsov +We'll now create a new Spark dataframe (`embeddings_df`) with the vectorized data using the specified schema. -· +```python +from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType -November 24, 2019 +# Define the schema for the new dataframe +schema = StructType([ + StructField("sentence1", StringType()), + StructField("sentence2", StringType()), + StructField("dense_vector", ArrayType(FloatType())), + StructField("sparse_vector_indices", ArrayType(IntegerType())), + StructField("sparse_vector_values", ArrayType(FloatType())) +]) -![Filtrable HNSW](https://qdrant.tech/articles_data/filtrable-hnsw/preview/title.jpg) +# Create the new dataframe with the vectorized data +embeddings_df = spark.createDataFrame(data=embeddings, schema=schema) +``` -If you need to find some similar objects in vector space, provided e.g. by embeddings or matching NN, you can choose among a variety of libraries: Annoy, FAISS or NMSLib. 
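+Before moving on to the upload, it can help to inspect the vectorized dataframe and confirm that the columns match the schema defined above. A quick check in the same notebook, assuming the `embeddings_df` created in the previous cell:
+
+```python
+# Print the column types and peek at a single vectorized row
+embeddings_df.printSchema()
+embeddings_df.show(1, truncate=80)
+```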
-All of them will give you a fast approximate neighbors search within almost any space. +### Uploading the data to Qdrant -But what if you need to introduce some constraints in your search? -For example, you want search only for products in some category or select the most similar customer of a particular brand. -I did not find any simple solutions for this. -There are several discussions like [this](https://github.com/spotify/annoy/issues/263), but they only suggest to iterate over top search results and apply conditions consequently after the search. +- **Create a Qdrant collection:** + - [Follow the documentation](/documentation/concepts/collections/#create-a-collection) to create a collection with the appropriate configurations. Here's an example request to support both dense and sparse vectors: -Let’s see if we could somehow modify any of ANN algorithms to be able to apply constrains during the search itself. + ```json + PUT /collections/{collection_name} + { + "vectors": { + "dense": { + "size": 384, + "distance": "Cosine" + } + }, + "sparse_vectors": { + "sparse": {} + } + } + ``` -Annoy builds tree index over random projections. -Tree index implies that we will meet same problem that appears in relational databases: -if field indexes were built independently, then it is possible to use only one of them at a time. -Since nobody solved this problem before, it seems that there is no easy approach. +- **Upload the dataframe to Qdrant:** -There is another algorithm which shows top results on the [benchmark](https://github.com/erikbern/ann-benchmarks). -It is called HNSW which stands for Hierarchical Navigable Small World. +```python +options = { + "qdrant_url": "", + "api_key": "", + "collection_name": "", + "vector_fields": "dense_vector", + "vector_names": "dense", + "sparse_vector_value_fields": "sparse_vector_values", + "sparse_vector_index_fields": "sparse_vector_indices", + "sparse_vector_names": "sparse", + "schema": embeddings_df.schema.json(), +} -The [original paper](https://arxiv.org/abs/1603.09320) is well written and very easy to read, so I will only give the main idea here. -We need to build a navigation graph among all indexed points so that the greedy search on this graph will lead us to the nearest point. -This graph is constructed by sequentially adding points that are connected by a fixed number of edges to previously added points. -In the resulting graph, the number of edges at each point does not exceed a given threshold m and always contains the nearest considered points. +embeddings_df.write.format("io.qdrant.spark.Qdrant").options(**options).mode( + "append" +).save() +``` -![NSW](https://qdrant.tech/articles_data/filtrable-hnsw/NSW.png) + -### [Anchor](https://qdrant.tech/articles/filtrable-hnsw/\#how-can-we-modify-it) How can we modify it? +Ensure to replace the placeholder values (``, ``, ``) with your actual values. If the `id_field` option is not specified, Qdrant Spark connector generates random UUIDs for each point. -What if we simply apply the filter criteria to the nodes of this graph and use in the greedy search only those that meet these criteria? -It turns out that even with this naive modification algorithm can cover some use cases. +The command output you should see is similar to: -One such case is if your criteria do not correlate with vector semantics. -For example, you use a vector search for clothing names and want to filter out some sizes. -In this case, the nodes will be uniformly filtered out from the entire cluster structure. 
-Therefore, the theoretical conclusions obtained in the [Percolation theory](https://en.wikipedia.org/wiki/Percolation_theory) become applicable: +```console +Command took 40.37 seconds -- by xxxxx90@xxxxxx.com at 4/17/2024, 12:13:28 PM on fastembed +``` -> Percolation is related to the robustness of the graph (called also network). Given a random graph of n nodes and an average degree ⟹k⟩ . Next we remove randomly a fraction 1−p of nodes and leave only a fraction p. There exists a critical percolation threshold pc=1⟹k⟩ below which the network becomes fragmented while above pc a giant connected component exists. +### Conclusion -This statement also confirmed by experiments: +That wraps up our tutorial! Feel free to explore more functionalities and experiments with different models, parameters, and features available in Databricks, Spark, and Qdrant. -![Dependency of connectivity to the number of edges](https://qdrant.tech/articles_data/filtrable-hnsw/exp_connectivity_glove_m0.png) +Happy data engineering! -Dependency of connectivity to the number of edges +<|page-182-lllmstxt|> +# Semantic Querying with Airflow and Astronomer -![Dependency of connectivity to the number of point (no dependency).](https://qdrant.tech/articles_data/filtrable-hnsw/exp_connectivity_glove_num_elements.png) +| Time: 45 min | Level: Intermediate | | | +| ------------ | ------------------- | --- | --- | -Dependency of connectivity to the number of point (no dependency). +In this tutorial, you will use Qdrant as a [provider](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/stable/index.html) in [Apache Airflow](https://airflow.apache.org/), an open-source tool that lets you setup data-engineering workflows. -There is a clear threshold when the search begins to fail. -This threshold is due to the decomposition of the graph into small connected components. -The graphs also show that this threshold can be shifted by increasing the m parameter of the algorithm, which is responsible for the degree of nodes. +You will write the pipeline as a DAG (Directed Acyclic Graph) in Python. With this, you can leverage the powerful suite of Python's capabilities and libraries to achieve almost anything your data pipeline needs. -Let’s consider some other filtering conditions we might want to apply in the search: +[Astronomer](https://www.astronomer.io/) is a managed platform that simplifies the process of developing and deploying Airflow projects via its easy-to-use CLI and extensive automation capabilities. -- Categorical filtering - - Select only points in a specific category - - Select points which belong to a specific subset of categories - - Select points with a specific set of labels -- Numerical range -- Selection within some geographical region +Airflow is useful when running operations in Qdrant based on data events or building parallel tasks for generating vector embeddings. By using Airflow, you can set up monitoring and alerts for your pipelines for full observability. -In the first case, we can guarantee that the HNSW graph will be connected simply by creating additional edges -inside each category separately, using the same graph construction algorithm, and then combining them into the original graph. -In this case, the total number of edges will increase by no more than 2 times, regardless of the number of categories. +## Prerequisites -Second case is a little harder. A connection may be lost between two categories if they lie in different clusters. 
+Please make sure you have the following ready: -![category clusters](https://qdrant.tech/articles_data/filtrable-hnsw/hnsw_graph_category.png) +- A running Qdrant instance. We'll be using a free instance from +- The Astronomer CLI. Find the installation instructions [here](https://docs.astronomer.io/astro/cli/install-cli). +- A [HuggingFace token](https://huggingface.co/docs/hub/en/security-tokens) to generate embeddings. -The idea here is to build same navigation graph but not between nodes, but between categories. -Distance between two categories might be defined as distance between category entry points (or, for precision, as the average distance between a random sample). Now we can estimate expected graph connectivity by number of excluded categories, not nodes. -It still does not guarantee that two random categories will be connected, but allows us to switch to multiple searches in each category if connectivity threshold passed. In some cases, multiple searches can be even faster if you take advantage of parallel processing. +## Implementation -![Dependency of connectivity to the random categories included in search](https://qdrant.tech/articles_data/filtrable-hnsw/exp_random_groups.png) +We'll be building a DAG that generates embeddings in parallel for our data corpus and performs semantic retrieval based on user input. -Dependency of connectivity to the random categories included in search +### Set up the project -Third case might be resolved in a same way it is resolved in classical databases. -Depending on labeled subsets size ration we can go for one of the following scenarios: +The Astronomer CLI makes it very straightforward to set up the Airflow project: -- if at least one subset is small: perform search over the label containing smallest subset and then filter points consequently. -- if large subsets give large intersection: perform regular search with constraints expecting that intersection size fits connectivity threshold. -- if large subsets give small intersection: perform linear search over intersection expecting that it is small enough to fit a time frame. +```console +mkdir qdrant-airflow-tutorial && cd qdrant-airflow-tutorial +astro dev init +``` -Numerical range case can be reduces to the previous one if we split numerical range into a buckets containing equal amount of points. -Next we also connect neighboring buckets to achieve graph connectivity. We still need to filter some results which presence in border buckets but do not fulfill actual constraints, but their amount might be regulated by the size of buckets. +This command generates all of the project files you need to run Airflow locally. You can find a directory called `dags`, which is where we can place our Python DAG files. -Geographical case is a lot like a numerical one. -Usual geographical search involves [geohash](https://en.wikipedia.org/wiki/Geohash), which matches any geo-point to a fixes length identifier. +To use Qdrant within Airflow, install the Qdrant Airflow provider by adding the following to the `requirements.txt` file -![Geohash example](https://qdrant.tech/articles_data/filtrable-hnsw/geohash.png) +```text +apache-airflow-providers-qdrant +``` -We can use this identifiers as categories and additionally make connections between neighboring geohashes. -It will ensure that any selected geographical region will also contain connected HNSW graph. 
+### Configure credentials -## [Anchor](https://qdrant.tech/articles/filtrable-hnsw/\#conclusion) Conclusion +We can set up provider connections using the Airflow UI, environment variables or the `airflow_settings.yml` file. -It is possible to enchant HNSW algorithm so that it will support filtering points in a first search phase. -Filtering can be carried out on the basis of belonging to categories, -which in turn is generalized to such popular cases as numerical ranges and geo. +Add the following to the `.env` file in the project. Replace the values as per your credentials. -Experiments were carried by modification [python implementation](https://github.com/generall/hnsw-python) of the algorithm, -but real production systems require much faster version, like [NMSLib](https://github.com/nmslib/nmslib). +```env +HUGGINGFACE_TOKEN="" +AIRFLOW_CONN_QDRANT_DEFAULT='{ + "conn_type": "qdrant", + "host": "xyz-example.eu-central.aws.cloud.qdrant.io:6333", + "password": "" +}' +``` -##### Was this page useful? +### Add the data corpus -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Let's add some sample data to work with. Paste the following content into a file called `books.txt` file within the `include` directory. -Thank you for your feedback! 🙏 +```text +1 | To Kill a Mockingbird (1960) | fiction | Harper Lee's Pulitzer Prize-winning novel explores racial injustice and moral growth through the eyes of young Scout Finch in the Deep South. +2 | Harry Potter and the Sorcerer's Stone (1997) | fantasy | J.K. Rowling's magical tale follows Harry Potter as he discovers his wizarding heritage and attends Hogwarts School of Witchcraft and Wizardry. +3 | The Great Gatsby (1925) | fiction | F. Scott Fitzgerald's classic novel delves into the glitz, glamour, and moral decay of the Jazz Age through the eyes of narrator Nick Carraway and his enigmatic neighbour, Jay Gatsby. +4 | 1984 (1949) | dystopian | George Orwell's dystopian masterpiece paints a chilling picture of a totalitarian society where individuality is suppressed and the truth is manipulated by a powerful regime. +5 | The Catcher in the Rye (1951) | fiction | J.D. Salinger's iconic novel follows disillusioned teenager Holden Caulfield as he navigates the complexities of adulthood and society's expectations in post-World War II America. +6 | Pride and Prejudice (1813) | romance | Jane Austen's beloved novel revolves around the lively and independent Elizabeth Bennet as she navigates love, class, and societal expectations in Regency-era England. +7 | The Hobbit (1937) | fantasy | J.R.R. Tolkien's adventure follows Bilbo Baggins, a hobbit who embarks on a quest with a group of dwarves to reclaim their homeland from the dragon Smaug. +8 | The Lord of the Rings (1954-1955) | fantasy | J.R.R. Tolkien's epic fantasy trilogy follows the journey of Frodo Baggins to destroy the One Ring and defeat the Dark Lord Sauron in the land of Middle-earth. +9 | The Alchemist (1988) | fiction | Paulo Coelho's philosophical novel follows Santiago, an Andalusian shepherd boy, on a journey of self-discovery and spiritual awakening as he searches for a hidden treasure. +10 | The Da Vinci Code (2003) | mystery/thriller | Dan Brown's gripping thriller follows symbologist Robert Langdon as he unravels clues hidden in art and history while trying to solve a murder mystery with far-reaching implications. +``` -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/filtrable-hnsw.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Now, the hacking part - writing our Airflow DAG! -On this page: +### Write the dag -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/filtrable-hnsw.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +We'll add the following content to a `books_recommend.py` file within the `dags` directory. Let's go over what it does for each task. -× +```python +import os +import requests -[Powered by](https://qdrant.tech/) +from airflow.decorators import dag, task +from airflow.models.baseoperator import chain +from airflow.models.param import Param +from airflow.providers.qdrant.hooks.qdrant import QdrantHook +from airflow.providers.qdrant.operators.qdrant import QdrantIngestOperator +from pendulum import datetime +from qdrant_client import models -<|page-105-lllmstxt|> -## cluster-monitoring -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud](https://qdrant.tech/documentation/cloud/) -- Monitor Clusters -# [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#monitoring-qdrant-cloud-clusters) Monitoring Qdrant Cloud Clusters +QDRANT_CONNECTION_ID = "qdrant_default" +DATA_FILE_PATH = "include/books.txt" +COLLECTION_NAME = "airflow_tutorial_collection" -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#telemetry) Telemetry +EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" +EMBEDDING_DIMENSION = 384 +SIMILARITY_METRIC = models.Distance.COSINE -![Cluster Metrics](https://qdrant.tech/documentation/cloud/cluster-metrics.png) -Qdrant Cloud provides you with a set of metrics to monitor the health of your database cluster. You can access these metrics in the Qdrant Cloud Console in the **Metrics** and **Request** sections of the cluster details page. +def embed(text: str) -> list: + HUGGINFACE_URL = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMBEDDING_MODEL_ID}" + response = requests.post( + HUGGINFACE_URL, + headers={"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}, + json={"inputs": [text], "options": {"wait_for_model": True}}, + ) + return response.json()[0] -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#logs) Logs -![Cluster Logs](https://qdrant.tech/documentation/cloud/cluster-logs.png) +@dag( + dag_id="books_recommend", + start_date=datetime(2023, 10, 18), + schedule=None, + catchup=False, + params={"preference": Param("Something suspenseful and thrilling.", type="string")}, +) +def recommend_book(): + @task + def import_books(text_file_path: str) -> list: + data = [] + with open(text_file_path, "r") as f: + for line in f: + _, title, genre, description = line.split("|") + data.append( + { + "title": title.strip(), + "genre": genre.strip(), + "description": description.strip(), + } + ) -Logs of the database cluster are available in the Qdrant Cloud Console in the **Logs** section of the cluster details page. 
+        return data
-## [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#alerts) Alerts
+    @task
+    def init_collection():
+        hook = QdrantHook(conn_id=QDRANT_CONNECTION_ID)
+        if not hook.conn.collection_exists(COLLECTION_NAME):
+            hook.conn.create_collection(
+                COLLECTION_NAME,
+                vectors_config=models.VectorParams(
+                    size=EMBEDDING_DIMENSION, distance=SIMILARITY_METRIC
+                ),
+            )
-You will receive automatic alerts via email before your cluster reaches the currently configured memory or storage limits, including recommendations for scaling your cluster.
+    @task
+    def embed_description(data: dict) -> list:
+        return embed(data["description"])
-## [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#qdrant-database-metrics-and-telemetry) Qdrant Database Metrics and Telemetry
+    books = import_books(text_file_path=DATA_FILE_PATH)
+    embeddings = embed_description.expand(data=books)
-You can also directly access the metrics and telemetry that the Qdrant database nodes provide.
+    qdrant_vector_ingest = QdrantIngestOperator(
+        conn_id=QDRANT_CONNECTION_ID,
+        task_id="qdrant_vector_ingest",
+        collection_name=COLLECTION_NAME,
+        payload=books,
+        vectors=embeddings,
+    )
-To scrape metrics from a Qdrant cluster running in Qdrant Cloud, an [API key](https://qdrant.tech/documentation/cloud/authentication/) is required to access `/metrics` and `/sys_metrics`. Qdrant Cloud also supports supplying the API key as a [Bearer token](https://www.rfc-editor.org/rfc/rfc6750.html), which may be required by some providers.
+    @task
+    def embed_preference(**context) -> list:
+        user_mood = context["params"]["preference"]
+        response = embed(text=user_mood)
-### [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#qdrant-node-metrics) Qdrant Node Metrics
+        return response
-Metrics in a Prometheus compatible format are available at the `/metrics` endpoint of each Qdrant database node. When scraping, you should use the [node specific URLs](https://qdrant.tech/documentation/cloud/cluster-access/#node-specific-endpoints) to ensure that you are scraping metrics from all nodes in each cluster. For more information see [Qdrant monitoring](https://qdrant.tech/documentation/guides/monitoring/).
+    @task
+    def search_qdrant(
+        preference_embedding: list,
+    ) -> None:
+        hook = QdrantHook(conn_id=QDRANT_CONNECTION_ID)
-You can also access the `/telemetry` [endpoint](https://api.qdrant.tech/api-reference/service/telemetry) of your database. This endpoint is available on the cluster endpoint and provides information about the current state of the database, including the number of vectors, shards, and other useful information.
+        result = hook.conn.query_points(
+            collection_name=COLLECTION_NAME,
+            query=preference_embedding,
+            limit=1,
+            with_payload=True,
+        ).points
+        print("Book recommendation: " + result[0].payload["title"])
+        print("Description: " + result[0].payload["description"])
-For more information, see [Qdrant monitoring](https://qdrant.tech/documentation/guides/monitoring/).
+ print("Book recommendation: " + result[0].payload["title"]) + print("Description: " + result[0].payload["description"]) -### [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#cluster-system-metrics) Cluster System Metrics + chain( + init_collection(), + qdrant_vector_ingest, + search_qdrant(embed_preference()), + ) -Cluster system metrics is a cloud-only endpoint that not only shares all the information about the database from `/metrics` but also provides additional operational data from our infrastructure about your cluster, including information from our load balancers, ingresses, and cluster workloads themselves. -Metrics in a Prometheus-compatible format are available at the `/sys_metrics` cluster endpoint. Database API Keys are used to authenticate access to cluster system metrics. `/sys_metrics` only need to be queried once per cluster on the main load-balanced cluster endpoint. You don’t need to scrape each cluster node individually, instead it will always provide metrics about all nodes. +recommend_book() +``` -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#grafana-dashboard) Grafana Dashboard +`import_books`: This task reads a text file containing information about the books (like title, genre, and description), and then returns the data as a list of dictionaries. -If you scrape your Qdrant Cluster system metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. +`init_collection`: This task initializes a collection in the Qdrant database, where we will store the vector representations of the book descriptions. -![Grafa dashboard](https://qdrant.tech/documentation/cloud/cloud-grafana-dashboard.png) +`embed_description`: This is a dynamic task that creates one mapped task instance for each book in the list. The task uses the `embed` function to generate vector embeddings for each description. To use a different embedding model, you can adjust the `EMBEDDING_MODEL_ID`, `EMBEDDING_DIMENSION` values. -Qdrant's Full Observability with Monitoring - YouTube +`embed_user_preference`: Here, we take a user's input and convert it into a vector using the same pre-trained model used for the book descriptions. -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +`qdrant_vector_ingest`: This task ingests the book data into the Qdrant collection using the [QdrantIngestOperator](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/1.0.0/), associating each book description with its corresponding vector embeddings. -Qdrant - Vector Database & Search Engine +`search_qdrant`: Finally, this task performs a search in the Qdrant database using the vectorized user preference. It finds the most relevant book in the collection based on vector similarity. -8.12K subscribers +### Run the DAG -[Qdrant's Full Observability with Monitoring](https://www.youtube.com/watch?v=pKPP-tL5_6w) +Head over to your terminal and run +```astro dev start``` -Qdrant - Vector Database & Search Engine +A local Airflow container should spawn. You can now access the Airflow UI at . Visit our DAG by clicking on `books_recommend`. -Search +![DAG](/documentation/examples/airflow/demo-dag.png) -Watch later +Hit the PLAY button on the right to run the DAG. 
You'll be asked for input about your preference, with the default value already filled in. -Share +![Preference](/documentation/examples/airflow/preference-input.png) -Copy link +After your DAG run completes, you should be able to see the output of your search in the logs of the `search_qdrant` task. -Info +![Output](/documentation/examples/airflow/output.png) -Shopping +There you have it, an Airflow pipeline that interfaces with Qdrant! Feel free to fiddle around and explore Airflow. There are references below that might come in handy. -Tap to unmute +## Further reading -If playback doesn't begin shortly, try restarting your device. +- [Introduction to Airflow](https://docs.astronomer.io/learn/intro-to-airflow) +- [Airflow Concepts](https://docs.astronomer.io/learn/category/airflow-concepts) +- [Airflow Reference](https://airflow.apache.org/docs/) +- [Astronomer Documentation](https://docs.astronomer.io/) -More videos +<|page-183-lllmstxt|> +## Cloud Tutorials & Examples -## More videos +| Example | Description | +| ----------------------------------- | ------------------------------------------------------------------------------------------- | +| [Using Cloud Inference to Build Hybrid Search](/documentation/tutorials-and-examples/cloud-inference-hybrid-search/) | Cloud inference hybrid example | -You're signed out +<|page-184-lllmstxt|> +# Setup Data Streaming with Kafka via Confluent -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +**Author:** [M K Pavan Kumar](https://www.linkedin.com/in/kameshwara-pavan-kumar-mantha-91678b21/) , research scholar at [IIITDM, Kurnool](https://iiitk.ac.in). Specialist in hallucination mitigation techniques and RAG methodologies. +‱ [GitHub](https://github.com/pavanjava) ‱ [Medium](https://medium.com/@manthapavankumar11) -CancelConfirm +## Introduction -Share +This guide will walk you through the detailed steps of installing and setting up the [Qdrant Sink Connector](https://github.com/qdrant/qdrant-kafka), building the necessary infrastructure, and creating a practical playground application. By the end of this article, you will have a deep understanding of how to leverage this powerful integration to streamline your data workflows, ultimately enhancing the performance and capabilities of your data-driven real-time semantic search and RAG applications. -Include playlist +In this example, original data will be sourced from Azure Blob Storage and MongoDB. -An error occurred while retrieving sharing information. Please try again later. +![1.webp](/documentation/examples/data-streaming-kafka-qdrant/1.webp) -[Watch on](https://www.youtube.com/watch?v=pKPP-tL5_6w&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +Figure 1: [Real time Change Data Capture (CDC)](https://www.confluent.io/learn/change-data-capture/) with Kafka and Qdrant. -0:00 +## The Architecture: -0:00 / 2:31 -‱Live +## Source Systems -‱ +The architecture begins with the **source systems**, represented by MongoDB and Azure Blob Storage. These systems are vital for storing and managing raw data. MongoDB, a popular NoSQL database, is known for its flexibility in handling various data formats and its capability to scale horizontally. It is widely used for applications that require high performance and scalability. Azure Blob Storage, on the other hand, is Microsoft’s object storage solution for the cloud. 
It is designed for storing massive amounts of unstructured data, such as text or binary data. The data from these sources is extracted using **source connectors**, which are responsible for capturing changes in real-time and streaming them into Kafka. -[Watch on YouTube](https://www.youtube.com/watch?v=pKPP-tL5_6w "Watch on YouTube") +## Kafka -### [Anchor](https://qdrant.tech/documentation/cloud/cluster-monitoring/\#cluster-system-mtrics-sys_metrics) Cluster System Mtrics `/sys_metrics` +At the heart of this architecture lies **Kafka**, a distributed event streaming platform capable of handling trillions of events a day. Kafka acts as a central hub where data from various sources can be ingested, processed, and distributed to various downstream systems. Its fault-tolerant and scalable design ensures that data can be reliably transmitted and processed in real-time. Kafka’s capability to handle high-throughput, low-latency data streams makes it an ideal choice for real-time data processing and analytics. The use of **Confluent** enhances Kafka’s functionalities, providing additional tools and services for managing Kafka clusters and stream processing. -In Qdrant Cloud, each Qdrant cluster will expose the following metrics. This endpoint is not available when running Qdrant open-source. +## Qdrant -**List of metrics** +The processed data is then routed to **Qdrant**, a highly scalable vector search engine designed for similarity searches. Qdrant excels at managing and searching through high-dimensional vector data, which is essential for applications involving machine learning and AI, such as recommendation systems, image recognition, and natural language processing. The **Qdrant Sink Connector** for Kafka plays a pivotal role here, enabling seamless integration between Kafka and Qdrant. This connector allows for the real-time ingestion of vector data into Qdrant, ensuring that the data is always up-to-date and ready for high-performance similarity searches. 
-| Name | Type | Meaning | -| --- | --- | --- | -| app\_info | gauge | Information about the Qdrant server | -| app\_status\_recovery\_mode | gauge | If Qdrant is currently started in recovery mode | -| cluster\_commit | | | -| cluster\_enabled | | Indicates wether multi-node clustering is enabled | -| cluster\_peers\_total | counter | Total number of cluster peers | -| cluster\_pending\_operations\_total | counter | Total number of pending operations in the cluster | -| cluster\_term | | | -| cluster\_voter | | | -| collection\_hardware\_metric\_cpu | | | -| collection\_hardware\_metric\_io\_read | | | -| collection\_hardware\_metric\_io\_write | | | -| collections\_total | counter | Number of collections | -| collections\_vector\_total | counter | Total number of vectors in all collections | -| container\_cpu\_cfs\_periods\_total | | | -| container\_cpu\_cfs\_throttled\_periods\_total | counter | Indicating that your CPU demand was higher than what your instance offers | -| container\_cpu\_usage\_seconds\_total | counter | Total CPU usage in seconds | -| container\_file\_descriptors | | | -| container\_fs\_reads\_bytes\_total | counter | Total number of bytes read by the container file system (disk) | -| container\_fs\_reads\_total | counter | Total number of read operations on the container file system (disk) | -| container\_fs\_writes\_bytes\_total | counter | Total number of bytes written by the container file system (disk) | -| container\_fs\_writes\_total | counter | Total number of write operations on the container file system (disk) | -| container\_memory\_cache | gauge | Memory used for cache in the container | -| container\_memory\_mapped\_file | gauge | Memory used for memory-mapped files in the container | -| container\_memory\_rss | gauge | Resident Set Size (RSS) - Memory used by the container excluding swap space used for caching | -| container\_memory\_working\_set\_bytes | gauge | Total memory used by the container, including both anonymous and file-backed memory | -| container\_network\_receive\_bytes\_total | counter | Total bytes received over the container’s network interface | -| container\_network\_receive\_errors\_total | | | -| container\_network\_receive\_packets\_dropped\_total | | | -| container\_network\_receive\_packets\_total | | | -| container\_network\_transmit\_bytes\_total | counter | Total bytes transmitted over the container’s network interface | -| container\_network\_transmit\_errors\_total | | | -| container\_network\_transmit\_packets\_dropped\_total | | | -| container\_network\_transmit\_packets\_total | | | -| kube\_persistentvolumeclaim\_info | | | -| kube\_pod\_container\_info | | | -| kube\_pod\_container\_resource\_limits | gauge | Response contains limits for CPU and memory of DB. | -| kube\_pod\_container\_resource\_requests | gauge | Response contains requests for CPU and memory of DB. 
| -| kube\_pod\_container\_status\_last\_terminated\_exitcode | | | -| kube\_pod\_container\_status\_last\_terminated\_reason | | | -| kube\_pod\_container\_status\_last\_terminated\_timestamp | | | -| kube\_pod\_container\_status\_ready | | | -| kube\_pod\_container\_status\_restarts\_total | | | -| kube\_pod\_container\_status\_running | | | -| kube\_pod\_container\_status\_terminated | | | -| kube\_pod\_container\_status\_terminated\_reason | | | -| kube\_pod\_created | | | -| kube\_pod\_info | | | -| kube\_pod\_start\_time | | | -| kube\_pod\_status\_container\_ready\_time | | | -| kube\_pod\_status\_initialized\_time | | | -| kube\_pod\_status\_phase | gauge | Pod status in terms of different phases (Failed/Running/Succeeded/Unknown) | -| kube\_pod\_status\_ready | gauge | Pod readiness state (unknown/false/true) | -| kube\_pod\_status\_ready\_time | | | -| kube\_pod\_status\_reason | | | -| kubelet\_volume\_stats\_capacity\_bytes | gauge | Amount of disk available | -| kubelet\_volume\_stats\_inodes | gauge | Amount of inodes available | -| kubelet\_volume\_stats\_inodes\_used | gauge | Amount of inodes used | -| kubelet\_volume\_stats\_used\_bytes | gauge | Amount of disk used | -| memory\_active\_bytes | | | -| memory\_allocated\_bytes | | | -| memory\_metadata\_bytes | | | -| memory\_resident\_bytes | | | -| memory\_retained\_bytes | | | -| qdrant\_cluster\_state | | | -| qdrant\_collection\_commit | | | -| qdrant\_collection\_config\_hnsw\_full\_ef\_construct | | | -| qdrant\_collection\_config\_hnsw\_full\_scan\_threshold | | | -| qdrant\_collection\_config\_hnsw\_m | | | -| qdrant\_collection\_config\_hnsw\_max\_indexing\_threads | | | -| qdrant\_collection\_config\_hnsw\_on\_disk | | | -| qdrant\_collection\_config\_hnsw\_payload\_m | | | -| qdrant\_collection\_config\_optimizer\_default\_segment\_number | | | -| qdrant\_collection\_config\_optimizer\_deleted\_threshold | | | -| qdrant\_collection\_config\_optimizer\_flush\_interval\_sec | | | -| qdrant\_collection\_config\_optimizer\_indexing\_threshold | | | -| qdrant\_collection\_config\_optimizer\_max\_optimization\_threads | | | -| qdrant\_collection\_config\_optimizer\_max\_segment\_size | | | -| qdrant\_collection\_config\_optimizer\_memmap\_threshold | | | -| qdrant\_collection\_config\_optimizer\_vacuum\_min\_vector\_number | | | -| qdrant\_collection\_config\_params\_always\_ram | | | -| qdrant\_collection\_config\_params\_on\_disk\_payload | | | -| qdrant\_collection\_config\_params\_product\_compression | | | -| qdrant\_collection\_config\_params\_read\_fanout\_factor | | | -| qdrant\_collection\_config\_params\_replication\_factor | | | -| qdrant\_collection\_config\_params\_scalar\_quantile | | | -| qdrant\_collection\_config\_params\_scalar\_type | | | -| qdrant\_collection\_config\_params\_shard\_number | | | -| qdrant\_collection\_config\_params\_vector\_size | | | -| qdrant\_collection\_config\_params\_write\_consistency\_factor | | | -| qdrant\_collection\_config\_quantization\_always\_ram | | | -| qdrant\_collection\_config\_quantization\_product\_compression | | | -| qdrant\_collection\_config\_quantization\_scalar\_quantile | | | -| qdrant\_collection\_config\_quantization\_scalar\_type | | | -| qdrant\_collection\_config\_wal\_capacity\_mb | | | -| qdrant\_collection\_config\_wal\_segments\_ahead | | | -| qdrant\_collection\_consensus\_thread\_status | | | -| qdrant\_collection\_is\_voter | | | -| qdrant\_collection\_number\_of\_collections | counter | Total number of collections in Qdrant | -| 
qdrant\_collection\_number\_of\_grpc\_requests | counter | Total number of gRPC requests on a collection | -| qdrant\_collection\_number\_of\_rest\_requests | counter | Total number of REST requests on a collection | -| qdrant\_collection\_pending\_operations | counter | Total number of pending operations on a collection | -| qdrant\_collection\_role | | | -| qdrant\_collection\_shard\_segment\_num\_indexed\_vectors | | | -| qdrant\_collection\_shard\_segment\_num\_points | | | -| qdrant\_collection\_shard\_segment\_num\_vectors | | | -| qdrant\_collection\_shard\_segment\_type | | | -| qdrant\_collection\_term | | | -| qdrant\_collection\_transfer | | | -| qdrant\_operator\_cluster\_info\_total | | | -| qdrant\_operator\_cluster\_phase | gauge | Information about the status of Qdrant clusters | -| qdrant\_operator\_cluster\_pod\_up\_to\_date | | | -| qdrant\_operator\_cluster\_restore\_info\_total | | | -| qdrant\_operator\_cluster\_restore\_phase | | | -| qdrant\_operator\_cluster\_scheduled\_snapshot\_info\_total | | | -| qdrant\_operator\_cluster\_scheduled\_snapshot\_phase | | | -| qdrant\_operator\_cluster\_snapshot\_duration\_sconds | | | -| qdrant\_operator\_cluster\_snapshot\_phase | gauge | Information about the status of Qdrant cluster backups | -| qdrant\_operator\_cluster\_status\_nodes | | | -| qdrant\_operator\_cluster\_status\_nodes\_ready | | | -| qdrant\_node\_rssanon\_bytes | gauge | Allocated memory without memory-mapped files. This is the hard metric on memory which will lead to an OOM if it goes over the limit | -| rest\_responses\_avg\_duration\_seconds | | | -| rest\_responses\_duration\_seconds\_bucket | | | -| rest\_responses\_duration\_seconds\_count | | | -| rest\_responses\_duration\_seconds\_sum | | | -| rest\_responses\_fail\_total | | | -| rest\_responses\_max\_duration\_seconds | | | -| rest\_responses\_min\_duration\_seconds | | | -| rest\_responses\_total | | | -| traefik\_service\_open\_connections | | | -| traefik\_service\_request\_duration\_seconds\_bucket | | | -| traefik\_service\_request\_duration\_seconds\_count | | | -| traefik\_service\_request\_duration\_seconds\_sum | gauge | Response contains list of metrics for each Traefik service. | -| traefik\_service\_requests\_bytes\_total | | | -| traefik\_service\_requests\_total | counter | Response contains list of metrics for each Traefik service. | -| traefik\_service\_responses\_bytes\_total | | | - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-monitoring.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-monitoring.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) +## Integration and Pipeline Importance -<|page-106-lllmstxt|> -## private-cloud -- [Documentation](https://qdrant.tech/documentation/) -- Private Cloud +The integration of these components forms a powerful and efficient data streaming pipeline. 
The **Qdrant Sink Connector** ensures that the data flowing through Kafka is continuously ingested into Qdrant without any manual intervention. This real-time integration is crucial for applications that rely on the most current data for decision-making and analysis. By combining the strengths of MongoDB and Azure Blob Storage for data storage, Kafka for data streaming, and Qdrant for vector search, this pipeline provides a robust solution for managing and processing large volumes of data in real-time. The architecture’s scalability, fault-tolerance, and real-time processing capabilities are key to its effectiveness, making it a versatile solution for modern data-driven applications. -# [Anchor](https://qdrant.tech/documentation/private-cloud/\#qdrant-private-cloud) Qdrant Private Cloud +## Installation of Confluent Kafka Platform -Qdrant Private Cloud allows you to manage Qdrant database clusters in any Kubernetes cluster on any infrastructure. It uses the same Qdrant Operator that powers Qdrant Managed Cloud and Qdrant Hybrid Cloud, but without any connection to the Qdrant Cloud Management Console. +To install the Confluent Kafka Platform (self-managed locally), follow these 3 simple steps: -On top of the open source Qdrant database, it allows +**Download and Extract the Distribution Files:** -- Easy deployment and management of Qdrant database clusters in your own Kubernetes infrastructure -- Zero-downtime upgrades of the Qdrant database with replication -- Vertical and horizontal up and downscaling of the Qdrant database with auto rebalancing and shard splitting -- Full control over scheduling, including Multi-AZ deployments -- Backup & Disaster Recovery -- Extended telemetry -- Qdrant Enterprise Support Services +- Visit [Confluent Installation Page](https://www.confluent.io/installation/). +- Download the distribution files (tar, zip, etc.). +- Extract the downloaded file using: -If you are interested in using Qdrant Private Cloud, please [contact us](https://qdrant.tech/contact-us/) for more information. +```bash +tar -xvf confluent-.tar.gz +``` +or +```bash +unzip confluent-.zip +``` -##### Was this page useful? +**Configure Environment Variables:** -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```bash +# Set CONFLUENT_HOME to the installation directory: +export CONFLUENT_HOME=/path/to/confluent- -Thank you for your feedback! 🙏 +# Add Confluent binaries to your PATH +export PATH=$CONFLUENT_HOME/bin:$PATH +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +**Run Confluent Platform Locally:** -On this page: +```bash +# Start the Confluent Platform services: +confluent local start +# Stop the Confluent Platform services: +confluent local stop +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Installation of Qdrant: -× +To install and run Qdrant (self-managed locally), you can use Docker, which simplifies the process. First, ensure you have Docker installed on your system. 
Then, you can pull the Qdrant image from Docker Hub and run it with the following commands: -[Powered by](https://qdrant.tech/) +```bash +docker pull qdrant/qdrant +docker run -p 6334:6334 -p 6333:6333 qdrant/qdrant +``` -<|page-107-lllmstxt|> -## tags -# Qdrant Blog - -## Features and News - -What are you Looking for? - -[![GraphRAG: How Lettria Unlocked 20% Accuracy Gains with Qdrant and Neo4j](https://qdrant.tech/blog/case-study-lettria-v2/preview/title.jpg)\\ -**GraphRAG: How Lettria Unlocked 20% Accuracy Gains with Qdrant and Neo4j** \\ -\\ -Daniel Azoulai\\ -\\ -June 17, 2025](https://qdrant.tech/blog/case-study-lettria-v2/) - -[**How Lawme Scaled AI Legal Assistants and Significantly Cut Costs with Qdrant** \\ -\\ -Daniel Azoulai\\ -\\ -June 11, 2025](https://qdrant.tech/blog/case-study-lawme/)[**How ConvoSearch Boosted Revenue for D2C Brands with Qdrant** \\ -\\ -Daniel Azoulai\\ -\\ -June 10, 2025](https://qdrant.tech/blog/case-study-convosearch/)[**​​Introducing the Official Qdrant Node for n8n** \\ -\\ -Maddie Duhon & Evgeniya Sukhodolskaya\\ -\\ -June 09, 2025](https://qdrant.tech/blog/n8n-node/) - -[![Vector Data Migration Tool](https://qdrant.tech/blog/beta-database-migration-tool/preview/preview.jpg)\\ -**Vector Data Migration Tool** \\ -Migrate data across clusters, regions, from open source to cloud, and more with just one command.\\ -\\ -Qdrant\\ -\\ -June 16, 2025](https://qdrant.tech/blog/beta-database-migration-tool/)[![LegalTech Builder's Guide: Navigating Strategic Decisions with Vector Search](https://qdrant.tech/blog/legal-tech-builders-guide/preview/preview.jpg)\\ -**LegalTech Builder's Guide: Navigating Strategic Decisions with Vector Search** \\ -This guide explores critical architectural decisions for LegalTech builders using Qdrant, covering accuracy, hybrid search, reranking, score boosting, quantization, and enterprise scaling needs.\\ -\\ -Daniel Azoulai\\ -\\ -June 10, 2025](https://qdrant.tech/blog/legal-tech-builders-guide/)[![Qdrant Achieves SOC 2 Type II and HIPAA Certifications](https://qdrant.tech/blog/soc-2-type-II-hipaa/preview/preview.jpg)\\ -**Qdrant Achieves SOC 2 Type II and HIPAA Certifications** \\ -Qdrant achieves SOC 2 Type II and HIPAA certifications.\\ -\\ -Daniel Azoulai\\ -\\ -June 10, 2025](https://qdrant.tech/blog/soc-2-type-ii-hipaa/)[![Qdrant + DataTalks.Club: Free 10-Week Course on LLM Applications](https://qdrant.tech/blog/datatalks-course/preview/preview.jpg)\\ -**Qdrant + DataTalks.Club: Free 10-Week Course on LLM Applications** \\ -Gain hands-on experience with RAG, vector search, evaluation, monitoring, and more.\\ -\\ -Qdrant\\ -\\ -June 05, 2025](https://qdrant.tech/blog/datatalks-course/)[![How Qovery Accelerated Developer Autonomy with Qdrant](https://qdrant.tech/blog/case-study-qovery/preview/preview.jpg)\\ -**How Qovery Accelerated Developer Autonomy with Qdrant** \\ -Discover how Qovery empowered developers and drastically reduced infrastructure management latency using Qdrant.\\ -\\ -Daniel Azoulai\\ -\\ -May 27, 2025](https://qdrant.tech/blog/case-study-qovery/)[![How Tripadvisor Drives 2 to 3x More Revenue with Qdrant-Powered AI](https://qdrant.tech/blog/case-study-tripadvisor/preview/preview.jpg)\\ -**How Tripadvisor Drives 2 to 3x More Revenue with Qdrant-Powered AI** \\ -Tripadvisor transformed trip planning and search by using Qdrant to index over a billion user-generated reviews and images. 
Learn how this powered AI features that boost revenue 2 to 3x for engaged users.\\ -\\ -Daniel Azoulai\\ -\\ -May 13, 2025](https://qdrant.tech/blog/case-study-tripadvisor/)[![Precision at Scale: How Aracor Accelerated Legal Due Diligence with Hybrid Vector Search](https://qdrant.tech/blog/case-study-aracor/preview/preview.jpg)\\ -**Precision at Scale: How Aracor Accelerated Legal Due Diligence with Hybrid Vector Search** \\ -Explore how Aracor transformed manual, error-prone legal document processing into an accurate, scalable, and rapid workflow, leveraging hybrid, filtered, and multitenant vector search technology.\\ -\\ -Daniel Azoulai\\ -\\ -May 13, 2025](https://qdrant.tech/blog/case-study-aracor/)[![How Garden Scaled Patent Intelligence with Qdrant](https://qdrant.tech/blog/case-study-garden-intel/preview/preview.jpg)\\ -**How Garden Scaled Patent Intelligence with Qdrant** \\ -Discover how Garden ingests 200 M+ patents and product documents, achieves sub-100 ms query latency, and launched a new infringement-analysis business line with Qdrant.\\ -\\ -Daniel Azoulai\\ -\\ -May 09, 2025](https://qdrant.tech/blog/case-study-garden-intel/)[![Exploring Qdrant Cloud Just Got Easier](https://qdrant.tech/blog/product-ui-changes/preview/preview.jpg)\\ -**Exploring Qdrant Cloud Just Got Easier** \\ -Read about recent improvements designed to simplify your journey from login, creating your first cluster, prototyping, and going to production.\\ -\\ -Qdrant\\ -\\ -May 06, 2025](https://qdrant.tech/blog/product-ui-changes/) - -- [1](https://qdrant.tech/blog/) -- [2](https://qdrant.tech/blog/page/2/) -- [3](https://qdrant.tech/blog/page/3/) -- [4](https://qdrant.tech/blog/page/4/) -- [5](https://qdrant.tech/blog/page/5/) -- [6](https://qdrant.tech/blog/page/6/) -- [7](https://qdrant.tech/blog/page/7/) -- [8](https://qdrant.tech/blog/page/8/) -- [9](https://qdrant.tech/blog/page/9/) -- [10](https://qdrant.tech/blog/page/10/) -- [11](https://qdrant.tech/blog/page/11/) -- [12](https://qdrant.tech/blog/page/12/) -- [13](https://qdrant.tech/blog/page/13/) -- [Newest](https://qdrant.tech/blog/) - -### Get Started with Qdrant Free - -[Get Started](https://cloud.qdrant.io/signup) - -![](https://qdrant.tech/img/rocket.svg) - -###### Sign up for Qdrant updates - -We'll occasionally send you best practices for using vector data and similarity search, as well as product news. - -Email\* - -utm\_campaign - -utm\_content - -utm\_medium - -utm\_source - -last\_form\_fill\_url - -referrer\_url - -Last Conversion Campaign Type - -Last Conversion Campaign Name - -explicit opt in - -By submitting, you agree to subscribe to Qdrant's updates. You can withdraw your consent anytime. More details are in the [Privacy Policy](https://qdrant.tech/legal/privacy-policy/). - -× - -[Powered by](https://qdrant.tech/) +This will download the Qdrant image and start a Qdrant instance accessible at `http://localhost:6333`. For more detailed instructions and alternative installation methods, refer to the [Qdrant installation documentation](https://qdrant.tech/documentation/quick-start/). -<|page-108-lllmstxt|> -## embedding-recycler -- [Articles](https://qdrant.tech/articles/) -- Layer Recycling and Fine-tuning Efficiency +## Installation of Qdrant-Kafka Sink Connector: -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +To install the Qdrant Kafka connector using [Confluent Hub](https://www.confluent.io/hub/), you can utilize the straightforward `confluent-hub install` command. 
This command simplifies the process by eliminating the need for manual configuration file manipulations. To install the Qdrant Kafka connector version 1.1.0, execute the following command in your terminal: -# Layer Recycling and Fine-tuning Efficiency +```bash + confluent-hub install qdrant/qdrant-kafka:1.1.0 +``` -Yusuf Sarıgöz +This command downloads and installs the specified connector directly from Confluent Hub into your Confluent Platform or Kafka Connect environment. The installation process ensures that all necessary dependencies are handled automatically, allowing for a seamless integration of the Qdrant Kafka connector with your existing setup. Once installed, the connector can be configured and managed using the Confluent Control Center or the Kafka Connect REST API, enabling efficient data streaming between Kafka and Qdrant without the need for intricate manual setup. -· +![2.webp](/documentation/examples/data-streaming-kafka-qdrant/2.webp) -August 23, 2022 +*Figure 2: Local Confluent platform showing the Source and Sink connectors after installation.* -![Layer Recycling and Fine-tuning Efficiency](https://qdrant.tech/articles_data/embedding-recycling/preview/title.jpg) +Ensure the configuration of the connector once it's installed as below. keep in mind that your `key.converter` and `value.converter` are very important for kafka to safely deliver the messages from topic to qdrant. -A recent [paper](https://arxiv.org/abs/2207.04993) -by Allen AI has attracted attention in the NLP community as they cache the output of a certain intermediate layer -in the training and inference phases to achieve a speedup of ~83% -with a negligible loss in model performance. -This technique is quite similar to [the caching mechanism in Quaterion](https://quaterion.qdrant.tech/tutorials/cache_tutorial.html), -but the latter is intended for any data modalities while the former focuses only on language models -despite presenting important insights from their experiments. -In this post, I will share our findings combined with those, -hoping to provide the community with a wider perspective on layer recycling. +```bash +{ + "name": "QdrantSinkConnectorConnector_0", + "config": { + "value.converter.schemas.enable": "false", + "name": "QdrantSinkConnectorConnector_0", + "connector.class": "io.qdrant.kafka.QdrantSinkConnector", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "topics": "topic_62,qdrant_kafka.docs", + "errors.deadletterqueue.topic.name": "dead_queue", + "errors.deadletterqueue.topic.replication.factor": "1", + "qdrant.grpc.url": "http://localhost:6334", + "qdrant.api.key": "************" + } +} +``` -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#how-layer-recycling-works) How layer recycling works +## Installation of MongoDB -The main idea of layer recycling is to accelerate the training (and inference) -by avoiding repeated passes of the same data object through the frozen layers. -Instead, it is possible to pass objects through those layers only once, -cache the output -and use them as inputs to the unfrozen layers in future epochs. +For the Kafka to connect MongoDB as source, your MongoDB instance should be running in a `replicaSet` mode. below is the `docker compose` file which will spin a single node `replicaSet` instance of MongoDB. -In the paper, they usually cache 50% of the layers, e.g., the output of the 6th multi-head self-attention block in a 12-block encoder. 
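+Once the sink connector is configured, any JSON message produced to one of the configured topics is written into Qdrant. Below is a minimal sketch of producing such a message from Python, assuming a local broker at `localhost:9092` and the `kafka-python` package (any Kafka client works); the message mirrors the document structure used in the playground application later in this guide.

+```python
+import json
+from kafka import KafkaProducer  # pip install kafka-python (assumption: any Kafka client can be used)
+
+producer = KafkaProducer(
+    bootstrap_servers="localhost:9092",  # assumption: default local broker address
+    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+)
+
+# The topic must be one of those listed in the sink connector's "topics" setting.
+message = {
+    "collection_name": "test",            # target Qdrant collection
+    "id": 1,                              # point id
+    "vector": [0.1, 0.2, 0.3, 0.4],       # length must match the collection's vector size
+    "payload": {"name": "qdrant", "description": "example payload"},
+}
+
+producer.send("qdrant_kafka.docs", message)
+producer.flush()
+```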
-However, they find out that it does not work equally for all the tasks. -For example, the question answering task suffers from a more significant degradation in performance with 50% of the layers recycled, -and they choose to lower it down to 25% for this task, -so they suggest determining the level of caching based on the task at hand. -they also note that caching provides a more considerable speedup for larger models and on lower-end machines. +```bash +version: "3.8" -In layer recycling, the cache is hit for exactly the same object. -It is easy to achieve this in textual data as it is easily hashable, -but you may need more advanced tricks to generate keys for the cache -when you want to generalize this technique to diverse data types. -For instance, hashing PyTorch tensors [does not work as you may expect](https://github.com/joblib/joblib/issues/1282). -Quaterion comes with an intelligent key extractor that may be applied to any data type, -but it is also allowed to customize it with a callable passed as an argument. -Thanks to this flexibility, we were able to run a variety of experiments in different setups, -and I believe that these findings will be helpful for your future projects. +services: + mongo1: + image: mongo:7.0 + command: ["--replSet", "rs0", "--bind_ip_all", "--port", "27017"] + ports: + - 27017:27017 + healthcheck: + test: echo "try { rs.status() } catch (err) { rs.initiate({_id:'rs0',members:[{_id:0,host:'host.docker.internal:27017'}]}) }" | mongosh --port 27017 --quiet + interval: 5s + timeout: 30s + start_period: 0s + start_interval: 1s + retries: 30 + volumes: + - "mongo1_data:/data/db" + - "mongo1_config:/data/configdb" -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#experiments) Experiments +volumes: + mongo1_data: + mongo1_config: +``` -We conducted different experiments to test the performance with: +Similarly, install and configure source connector as below. -1. Different numbers of layers recycled in [the similar cars search example](https://quaterion.qdrant.tech/tutorials/cars-tutorial.html). -2. Different numbers of samples in the dataset for training and fine-tuning for similar cars search. -3. Different numbers of layers recycled in [the question answerring example](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html). +```bash +confluent-hub install mongodb/kafka-connect-mongodb:latest +``` -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#easy-layer-recycling-with-quaterion) Easy layer recycling with Quaterion +After installing the `MongoDB` connector, connector configuration should look like this: -The easiest way of caching layers in Quaterion is to compose a [TrainableModel](https://quaterion.qdrant.tech/quaterion.train.trainable_model.html#quaterion.train.trainable_model.TrainableModel) -with a frozen [Encoder](https://quaterion-models.qdrant.tech/quaterion_models.encoders.encoder.html#quaterion_models.encoders.encoder.Encoder) -and an unfrozen [EncoderHead](https://quaterion-models.qdrant.tech/quaterion_models.heads.encoder_head.html#quaterion_models.heads.encoder_head.EncoderHead). 
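+Before wiring up the source connector, it is worth verifying that the single-node replica set is actually initialized, since the connector reads the MongoDB change stream and therefore requires a `replicaSet` deployment. A minimal sketch with `pymongo`, using the same connection string as the rest of this guide:

+```python
+from pymongo import MongoClient
+
+# Same connection string as used by the source connector and the playground application below
+client = MongoClient("mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true")
+
+client.admin.command("ping")                        # raises if the node is unreachable
+status = client.admin.command("replSetGetStatus")   # raises if the replica set is not initialized
+print(status["set"])                                # expected output: rs0
+```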
-Therefore, we modified the `TrainableModel` in the [example](https://github.com/qdrant/quaterion/blob/master/examples/cars/models.py) -as in the following: +```bash +{ + "name": "MongoSourceConnectorConnector_0", + "config": { + "connector.class": "com.mongodb.kafka.connect.MongoSourceConnector", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.storage.StringConverter", + "connection.uri": "mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true", + "database": "qdrant_kafka", + "collection": "docs", + "publish.full.document.only": "true", + "topic.namespace.map": "{\"*\":\"qdrant_kafka.docs\"}", + "copy.existing": "true" + } +} +``` -```python -class Model(TrainableModel): - # ... +## Playground Application - def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]: - pre_trained_encoder = torchvision.models.resnet34(pretrained=True) - self.avgpool = copy.deepcopy(pre_trained_encoder.avgpool) - self.finetuned_block = copy.deepcopy(pre_trained_encoder.layer4) - modules = [] +As the infrastructure set is completely done, now it's time for us to create a simple application and check our setup. the objective of our application is the data is inserted to Mongodb and eventually it will get ingested into Qdrant also using [Change Data Capture (CDC)](https://www.confluent.io/learn/change-data-capture/). - for name, child in pre_trained_encoder.named_children(): - modules.append(child) - if name == "layer3": - break +`requirements.txt` - pre_trained_encoder = nn.Sequential(*modules) +```bash +fastembed==0.3.1 +pymongo==4.8.0 +qdrant_client==1.10.1 +``` - return CarsEncoder(pre_trained_encoder) +`project_root_folder/main.py` - def configure_head(self, input_embedding_size) -> EncoderHead: - return SequentialHead(self.finetuned_block, - self.avgpool, - nn.Flatten(), - SkipConnectionHead(512, dropout=0.3, skip_dropout=0.2), - output_size=512) +This is just sample code. Nevertheless it can be extended to millions of operations based on your use case. - # ... +```python +from pymongo import MongoClient +from utils.app_utils import create_qdrant_collection +from fastembed import TextEmbedding +collection_name: str = 'test' +embed_model_name: str = 'snowflake/snowflake-arctic-embed-s' ``` +```python +# Step 0: create qdrant_collection +create_qdrant_collection(collection_name=collection_name, embed_model=embed_model_name) + +# Step 1: Connect to MongoDB +client = MongoClient('mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true') + +# Step 2: Select Database +db = client['qdrant_kafka'] + +# Step 3: Select Collection +collection = db['docs'] + +# Step 4: Create a Document to Insert + +description = "qdrant is a high available vector search engine" +embedding_model = TextEmbedding(model_name=embed_model_name) +vector = next(embedding_model.embed(documents=description)).tolist() +document = { + "collection_name": collection_name, + "id": 1, + "vector": vector, + "payload": { + "name": "qdrant", + "description": description, + "url": "https://qdrant.tech/documentation" + } +} -This trick lets us finetune one more layer from the base model as a part of the `EncoderHead` -while still benefiting from the speedup in the frozen `Encoder` provided by the cache. 
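+If you prefer scripting over the Control Center UI, the same source connector configuration can be registered through the Kafka Connect REST API. The sketch below assumes Connect's REST interface is listening on its default port 8083 and reuses the configuration shown above.

+```python
+import json
+import requests
+
+CONNECT_URL = "http://localhost:8083/connectors"  # assumption: default Kafka Connect REST port
+
+connector = {
+    "name": "MongoSourceConnectorConnector_0",
+    "config": {
+        "connector.class": "com.mongodb.kafka.connect.MongoSourceConnector",
+        "connection.uri": "mongodb://127.0.0.1:27017/?replicaSet=rs0&directConnection=true",
+        "database": "qdrant_kafka",
+        "collection": "docs",
+        "publish.full.document.only": "true",
+        "topic.namespace.map": "{\"*\":\"qdrant_kafka.docs\"}",
+        "copy.existing": "true",
+    },
+}
+
+resp = requests.post(CONNECT_URL, data=json.dumps(connector), headers={"Content-Type": "application/json"})
+resp.raise_for_status()
+
+# Verify that the connector and its tasks are RUNNING
+print(requests.get(f"{CONNECT_URL}/MongoSourceConnectorConnector_0/status").json())
+```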
+# Step 5: Insert the Document into the Collection +result = collection.insert_one(document) -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#experiment-1-percentage-of-layers-recycled) Experiment 1: Percentage of layers recycled +# Step 6: Print the Inserted Document's ID +print("Inserted document ID:", result.inserted_id) +``` -The paper states that recycling 50% of the layers yields little to no loss in performance when compared to full fine-tuning. -In this setup, we compared performances of four methods: +`project_root_folder/utils/app_utils.py` -1. Freeze the whole base model and train only `EncoderHead`. -2. Move one of the four residual blocks `EncoderHead` and train it together with the head layer while freezing the rest (75% layer recycling). -3. Move two of the four residual blocks to `EncoderHead` while freezing the rest (50% layer recycling). -4. Train the whole base model together with `EncoderHead`. +```python +from qdrant_client import QdrantClient, models -**Note**: During these experiments, we used ResNet34 instead of ResNet152 as the pretrained model -in order to be able to use a reasonable batch size in full training. -The baseline score with ResNet34 is 0.106. +client = QdrantClient(url="http://localhost:6333", api_key="") +dimension_dict = {"snowflake/snowflake-arctic-embed-s": 384} -| Model | RRP | -| --- | --- | -| Full training | 0.32 | -| 50% recycling | 0.31 | -| 75% recycling | 0.28 | -| Head only | 0.22 | -| Baseline | 0.11 | +def create_qdrant_collection(collection_name: str, embed_model: str): -As is seen in the table, the performance in 50% layer recycling is very close to that in full training. -Additionally, we can still have a considerable speedup in 50% layer recycling with only a small drop in performance. -Although 75% layer recycling is better than training only `EncoderHead`, -its performance drops quickly when compared to 50% layer recycling and full training. + if not client.collection_exists(collection_name=collection_name): + client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams(size=dimension_dict.get(embed_model), distance=models.Distance.COSINE) + ) +``` -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#experiment-2-amount-of-available-data) Experiment 2: Amount of available data +Before we run the application, below is the state of MongoDB and Qdrant databases. -In the second experiment setup, we compared performances of fine-tuning strategies with different dataset sizes. -We sampled 50% of the training set randomly while still evaluating models on the whole validation set. +![3.webp](/documentation/examples/data-streaming-kafka-qdrant/3.webp) -| Model | RRP | -| --- | --- | -| Full training | 0.27 | -| 50% recycling | 0.26 | -| 75% recycling | 0.25 | -| Head only | 0.21 | -| Baseline | 0.11 | +Figure 3: Initial state: no collection named `test` & `no data` in the `docs` collection of MongodDB. -This experiment shows that, the smaller the available dataset is, -the bigger drop in performance we observe in full training, 50% and 75% layer recycling. -On the other hand, the level of degradation in training only `EncoderHead` is really small when compared to others. -When we further reduce the dataset size, full training becomes untrainable at some point, -while we can still improve over the baseline by training only `EncoderHead`. +Once you run the code the data goes into Mongodb and the CDC gets triggered and eventually Qdrant will receive this data. 
-## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#experiment-3-layer-recycling-in-question-answering) Experiment 3: Layer recycling in question answering +![4.webp](/documentation/examples/data-streaming-kafka-qdrant/4.webp) -We also wanted to test layer recycling in a different domain -as one of the most important takeaways of the paper is that -the performance of layer recycling is task-dependent. -To this end, we set up an experiment with the code from the [Question Answering with Similarity Learning tutorial](https://quaterion.qdrant.tech/tutorials/nlp_tutorial.html). +Figure 4: The test Qdrant collection is created automatically. -| Model | RP@1 | RRK | -| --- | --- | --- | -| Full training | 0.76 | 0.65 | -| 50% recycling | 0.75 | 0.63 | -| 75% recycling | 0.69 | 0.59 | -| Head only | 0.67 | 0.58 | -| Baseline | 0.64 | 0.55 | +![5.webp](/documentation/examples/data-streaming-kafka-qdrant/5.webp) -In this task, 50% layer recycling can still do a good job with only a small drop in performance when compared to full training. -However, the level of degradation is smaller than that in the similar cars search example. -This can be attributed to several factors such as the pretrained model quality, dataset size and task definition, -and it can be the subject of a more elaborate and comprehensive research project. -Another observation is that the performance of 75% layer recycling is closer to that of training only `EncoderHead` -than 50% layer recycling. +Figure 5: Data is inserted into both MongoDB and Qdrant. -## [Anchor](https://qdrant.tech/articles/embedding-recycler/\#conclusion) Conclusion +## Conclusion: -We set up several experiments to test layer recycling under different constraints -and confirmed that layer recycling yields varying performances with different tasks and domains. -One of the most important observations is the fact that the level of degradation in layer recycling -is sublinear with a comparison to full training, i.e., we lose a smaller percentage of performance than -the percentage we recycle. Additionally, training only `EncoderHead` -is more resistant to small dataset sizes. -There is even a critical size under which full training does not work at all. -The issue of performance differences shows that there is still room for further research on layer recycling, -and luckily Quaterion is flexible enough to run such experiments quickly. -We will continue to report our findings on fine-tuning efficiency. +In conclusion, the integration of **Kafka** with **Qdrant** using the **Qdrant Sink Connector** provides a seamless and efficient solution for real-time data streaming and processing. This setup not only enhances the capabilities of your data pipeline but also ensures that high-dimensional vector data is continuously indexed and readily available for similarity searches. By following the installation and setup guide, you can easily establish a robust data flow from your **source systems** like **MongoDB** and **Azure Blob Storage**, through **Kafka**, and into **Qdrant**. This architecture empowers modern applications to leverage real-time data insights and advanced search capabilities, paving the way for innovative data-driven solutions. -**Fun fact**: The preview image for this article was created with Dall.e with the following prompt: “Photo-realistic robot using a tuning fork to adjust a piano.” -[Click here](https://qdrant.tech/articles_data/embedding-recycling/full.png) -to see it in full size! 
+<|page-185-lllmstxt|> +# Scaling Qdrant Cloud Clusters -##### Was this page useful? +The amount of data is always growing and at some point you might need to change the capacity of your cluster. You can easily scale your Qdrant cluster up or down from the Cluster detail page in the Qdrant Cloud console. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +![Cluster Scaling](/documentation/cloud/cluster-scaling.png) -Thank you for your feedback! 🙏 +## Vertical Scaling -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/embedding-recycler.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Vertical scaling is the process of increasing the capacity of a cluster by adding or removing CPU, storage and memory resources on each database node. -On this page: +You can start with a minimal cluster configuration and scale it up over time to accomodate the growing amount of data in your application. If your cluster consists of several nodes each node will need to be scaled to the same size. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/embedding-recycler.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Note that vertical cluster scaling will require a short downtime, if the collections in your cluster are not replicated. This is because each node of the cluster needs to be restarted to apply the CPU, memory and disk size. -× +If you want to scale your cluster down, the new, smaller memory size must be still sufficient to store all the data in the cluster. Otherwise, the database cluster could run out of memory and crash. Therefore, the new memory size must be at least as large as the current memory usage of the database cluster including a bit of buffer. Qdrant Cloud will automatically prevent you from scaling down the Qdrant database cluster with a too small memory size. -[Powered by](https://qdrant.tech/) +Note, that it is not possible to scale down the disk space of the cluster due to technical limitations of the underlying cloud providers. -<|page-109-lllmstxt|> -## neural-search-tutorial -- [Articles](https://qdrant.tech/articles/) -- Neural Search 101: A Complete Guide and Step-by-Step Tutorial +## Horizontal Scaling -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +Vertical scaling can be an effective way to improve the performance of a cluster and extend the capacity, but it has some limitations. The main disadvantage of vertical scaling is that there are limits to how much a cluster can be expanded. At some point, adding more resources to a cluster can become impractical or cost-prohibitive. -# Neural Search 101: A Complete Guide and Step-by-Step Tutorial +In such cases, horizontal scaling may be a more effective solution. -Andrey Vasnetsov +Horizontal scaling is the process of increasing the capacity of a cluster by adding more nodes and distributing the load and data among them. The horizontal scaling at Qdrant starts on the collection level. You have to choose the number of shards you want to distribute your collection around while creating the collection. Please refer to the [sharding documentation](/documentation/guides/distributed_deployment/#sharding) section for details. 
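+For example, a collection that you expect to distribute across several nodes later can be created with more than one shard from the start. This is a minimal sketch using the Python client; the cluster URL, API key and collection name are placeholders, and the shard and replication numbers are only illustrative.

+```python
+from qdrant_client import QdrantClient, models
+
+# Placeholders: use your own cluster endpoint and database API key
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR_API_KEY")
+
+client.create_collection(
+    collection_name="documents",  # illustrative collection name
+    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
+    shard_number=6,         # chosen at creation time; see the sharding documentation
+    replication_factor=2,   # keeps the collection available during rolling restarts
+)
+```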
-· +When scaling up horizontally, the cloud platform will automatically rebalance all available shards across nodes to ensure that the data is evenly distributed. See [Configuring Clusters](/documentation/cloud/configure-cluster/#shard-rebalancing) for more details. -June 10, 2021 +When scaling down horizontally, the cloud platform will automatically ensure that any shards that are present on the nodes to be deleted, are moved to the remaining nodes. -![Neural Search 101: A Complete Guide and Step-by-Step Tutorial](https://qdrant.tech/articles_data/neural-search-tutorial/preview/title.jpg) +Important: One shard can not be split across nodes. So, if you configure 2 shards for a collection, but then scale your cluster from 1 to 3 nodes, your cluster nodes can't be fully utilized. The cloud platform will automatically rebalance your shards, so that two nodes will have one shard each, but the third node will not have any shards at all. You can use the [resharding feature](/documentation/cloud/cluster-scaling/#resharding) to change the number of shards in an existing collection. Once resharding is complete, the cloud platform will rebalance the shards across all nodes, ensuring that all nodes are utilized. -# [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#neural-search-101-a-comprehensive-guide-and-step-by-step-tutorial) Neural Search 101: A Comprehensive Guide and Step-by-Step Tutorial +We will be glad to consult you on an optimal strategy for scaling. -Information retrieval technology is one of the main technologies that enabled the modern Internet to exist. -These days, search technology is the heart of a variety of applications. -From web-pages search to product recommendations. -For many years, this technology didn’t get much change until neural networks came into play. +[Let us know](/documentation/support/) your needs and decide together on a proper solution. -In this guide we are going to find answers to these questions: +## Resharding -- What is the difference between regular and neural search? -- What neural networks could be used for search? -- In what tasks is neural network search useful? -- How to build and deploy own neural search service step-by-step? +*Available as of Qdrant v1.13.0* -## [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#what-is-neural-search) What is neural search? + -A regular full-text search, such as Google’s, consists of searching for keywords inside a document. -For this reason, the algorithm can not take into account the real meaning of the query and documents. -Many documents that might be of interest to the user are not found because they use different wording. +When creating a collection, it has a specific number of shards. The ideal number of shards might change as your cluster evolves. -Neural search tries to solve exactly this problem - it attempts to enable searches not by keywords but by meaning. -To achieve this, the search works in 2 steps. -In the first step, a specially trained neural network encoder converts the query and the searched objects into a vector representation called embeddings. -The encoder must be trained so that similar objects, such as texts with the same meaning or alike pictures get a close vector representation. +Resharding allows you to change the number of shards in your existing collections, both up and down, without having to recreate the collection from scratch. 
-![Encoders and embedding space](https://gist.githubusercontent.com/generall/c229cc94be8c15095286b0c55a3f19d7/raw/e52e3f1a320cd985ebc96f48955d7f355de8876c/encoders.png) +Resharding is a transparent process, meaning that the collection is still available while resharding is going on without having downtime. This allows you to scale from one node to any number of nodes and back, keeping your data perfectly distributed without compromise. -Having this vector representation, it is easy to understand what the second step should be. -To find documents similar to the query you now just need to find the nearest vectors. -The most convenient way to determine the distance between two vectors is to calculate the cosine distance. -The usual Euclidean distance can also be used, but it is not so efficient due to [the curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality). +To increase the number of shards (reshard up), use the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to initiate the resharding process: -## [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#which-model-could-be-used) Which model could be used? +```http +POST /collections/{collection_name}/cluster +{ + "start_resharding": { + "direction": "up", + "shard_key": null + } +} +``` -It is ideal to use a model specially trained to determine the closeness of meanings. -For example, models trained on Semantic Textual Similarity (STS) datasets. -Current state-of-the-art models can be found on this [leaderboard](https://paperswithcode.com/sota/semantic-textual-similarity-on-sts-benchmark?p=roberta-a-robustly-optimized-bert-pretraining). +To decrease the number of shards (reshard down), you may specify the `"down"` direction. -However, not only specially trained models can be used. -If the model is trained on a large enough dataset, its internal features can work as embeddings too. -So, for instance, you can take any pre-trained on ImageNet model and cut off the last layer from it. -In the penultimate layer of the neural network, as a rule, the highest-level features are formed, which, however, do not correspond to specific classes. -The output of this layer can be used as an embedding. +The current status of resharding is listed in the [collection cluster info](https://api.qdrant.tech/v-1-12-x/api-reference/distributed/collection-cluster-info) which can be fetched with: -## [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#what-tasks-is-neural-search-good-for) What tasks is neural search good for? +```http +GET /collections/{collection_name}/cluster +``` -Neural search has the greatest advantage in areas where the query cannot be formulated precisely. -Querying a table in an SQL database is not the best place for neural search. +We always recommend to run an ongoing resharding operation till the end. But, if at any point the resharding operation needs to be aborted, you can use: -On the contrary, if the query itself is fuzzy, or it cannot be formulated as a set of conditions - neural search can help you. -If the search query is a picture, sound file or long text, neural network search is almost the only option. +```http +POST /collections/{collection_name}/cluster +{ + "abort_resharding": {} +} +``` -If you want to build a recommendation system, the neural approach can also be useful. -The user’s actions can be encoded in vector space in the same way as a picture or text. 
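+The collection cluster info endpoint can also be polled from a script while a resharding operation is running. A sketch with `requests`, assuming a placeholder cluster URL and database API key:

+```python
+import requests
+
+CLUSTER_URL = "https://YOUR-CLUSTER-URL:6333"  # placeholder cluster endpoint
+HEADERS = {"api-key": "YOUR_API_KEY"}          # placeholder database API key
+
+resp = requests.get(f"{CLUSTER_URL}/collections/documents/cluster", headers=HEADERS)
+resp.raise_for_status()
+
+# The result describes shard placement and any ongoing resharding operation
+print(resp.json()["result"])
+```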
-And having those vectors, it is possible to find semantically similar users and determine the next probable user actions. +A few things to be aware of with regards to resharding: -## [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-by-step-neural-search-tutorial-using-qdrant) Step-by-step neural search tutorial using Qdrant +- during resharding, performance of your cluster may be slightly reduced +- during resharding, reported point counts will not be accurate +- resharding may be a long running operation on huge collections +- you can only run one resharding operation per collection at a time -With all that said, let’s make our neural network search. -As an example, I decided to make a search for startups by their description. -In this demo, we will see the cases when text search works better and the cases when neural network search works better. +<|page-186-lllmstxt|> +# Configure Qdrant Cloud Clusters -I will use data from [startups-list.com](https://www.startups-list.com/). -Each record contains the name, a paragraph describing the company, the location and a picture. -Raw parsed data can be found at [this link](https://storage.googleapis.com/generall-shared-data/startups_demo.json). +Qdrant Cloud offers several advanced configuration options to optimize clusters for your specific needs. You can access these options from the Cluster Details page in the Qdrant Cloud console. -### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-1-prepare-data-for-neural-search) Step 1: Prepare data for neural search +## Collection Defaults -To be able to search for our descriptions in vector space, we must get vectors first. -We need to encode the descriptions into a vector representation. -As the descriptions are textual data, we can use a pre-trained language model. -As mentioned above, for the task of text search there is a whole set of pre-trained models specifically tuned for semantic similarity. +You can set default values for the configuration of new collections in your cluster. These defaults will be used when creating a new collection, unless you override them in the collection creation request. -One of the easiest libraries to work with pre-trained language models, in my opinion, is the [sentence-transformers](https://github.com/UKPLab/sentence-transformers) by UKPLab. -It provides a way to conveniently download and use many pre-trained models, mostly based on transformer architecture. -Transformers is not the only architecture suitable for neural search, but for our task, it is quite enough. +You can configure the default *Replication Factor*, the default *Write Consistency Factor*, and if vectors should be stored on disk only, instead of being cached in RAM. -We will use a model called `all-MiniLM-L6-v2`. -This model is an all-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. -It is optimized for low memory consumption and fast inference. +Refer to [Qdrant Configuration](/documentation/guides/configuration/#configuration-options) for more details. -The complete code for data preparation with detailed comments can be found and run in [Colab Notebook](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing). 
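+These defaults only apply when a collection creation request does not set the corresponding values itself. A minimal sketch of overriding them explicitly with the Python client (cluster URL, API key and collection name are placeholders):

+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR_API_KEY")  # placeholders
+
+client.create_collection(
+    collection_name="reviews",  # illustrative collection name
+    vectors_config=models.VectorParams(
+        size=384,
+        distance=models.Distance.COSINE,
+        on_disk=True,  # store vectors on disk instead of keeping them cached in RAM
+    ),
+    replication_factor=2,        # overrides the configured collection default
+    write_consistency_factor=2,  # overrides the configured collection default
+)
+```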
+## Advanced Optimizations -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kPktoudAP8Tu8n8l-iVMOQhVmHkWV_L9?usp=sharing) +You can change the *Optimzer CPU Budget* and the *Async Scorer* configurations for your cluster. These advanced settings will have an impact on performance and reliability. We recommend using the default values unless you are confident they are required for your use case. -### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-2-incorporate-a-vector-search-engine) Step 2: Incorporate a Vector search engine +See [Qdrant under the hood: io_uring](/articles/io_uring/#and-what-about-qdrant) and [Large Scale Search](/documentation/database-tutorials/large-scale-search/) for more details. -Now as we have a vector representation for all our records, we need to store them somewhere. -In addition to storing, we may also need to add or delete a vector, save additional information with the vector. -And most importantly, we need a way to search for the nearest vectors. +## Client IP Restrictions -The vector search engine can take care of all these tasks. -It provides a convenient API for searching and managing vectors. -In our tutorial, we will use [Qdrant vector search engine](https://github.com/qdrant/qdrant) vector search engine. -It not only supports all necessary operations with vectors but also allows you to store additional payload along with vectors and use it to perform filtering of the search result. -Qdrant has a client for Python and also defines the API schema if you need to use it from other languages. +If configured, only the chosen IP ranges will be allowed to access the cluster. This is useful for securing your cluster and ensuring that only clients coming from trusted networks can connect to it. -The easiest way to use Qdrant is to run a pre-built image. -So make sure you have Docker installed on your system. +## Restart Mode -To start Qdrant, use the instructions on its [homepage](https://github.com/qdrant/qdrant). +The cloud platform will automatically choose the optimal restart mode during version upgrades or maintenance for your cluster. If you have a multi-node cluster and one or more collections with a replication factor of at least 2, the cloud platform will use the rolling restart mode. This means that nodes in the cluster will be restarted one at a time, ensuring that the cluster remains available during the restart process. -Download image from [DockerHub](https://hub.docker.com/r/qdrant/qdrant): +If you have a multi-node cluster, but all collections have a replication factor of 1, the cloud platform will use the parallel restart mode. This means that nodes in the cluster will be restarted simultaneously, which will result in a short downtime period, but will be faster than a rolling restart. -```bash -docker pull qdrant/qdrant +It is possible to override your cluster's default restart mode in the advanced configuration section of the Cluster Details page. -``` +## Shard Rebalancing -And run the service inside the docker: +When you scale your cluster horizontally, the cloud platform will automatically rebalance shards across all nodes in the cluster, ensuring that data is evenly distributed. This is done to ensure that all nodes are utilized and that the performance of the cluster is optimal. 
-```bash -docker run -p 6333:6333 \ - -v $(pwd)/qdrant_storage:/qdrant/storage \ - qdrant/qdrant +Qdrant Cloud offers three strategies for shard rebalancing: -``` +* `by_count_and_size` (default): This strategy will rebalance the shards based on the number of shards and their size. It will ensure that all nodes have the same number of shards and that shard sizes are evenly distributed across nodes. +* `by_count`: This strategy will rebalance the shards based on the number of shards only. It will ensure that all nodes have the same number of shards, but shard sizes may not be balanced evenly across nodes. +* `by_size`: This strategy will rebalance the shards based on their size only. It will ensure that shards are evenly distributed across nodes by size, but the number of shards may not be even across all nodes. -You should see output like this +You can deactivate automatic shard rebalancing by deselecting the `rebalancing_strategy` option. This is useful if you want to manually control the shard distribution across nodes. -```text -... -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting 12 workers -[2021-02-05T00:08:51Z INFO actix_server::builder] Starting "actix-web-service-0.0.0.0:6333" service on 0.0.0.0:6333 +<|page-187-lllmstxt|> +# Monitoring Qdrant Cloud Clusters -``` +## Telemetry -This means that the service is successfully launched and listening port 6333. -To make sure you can test [http://localhost:6333/](http://localhost:6333/) in your browser and get qdrant version info. +![Cluster Metrics](/documentation/cloud/cluster-metrics.png) -All uploaded to Qdrant data is saved into the `./qdrant_storage` directory and will be persisted even if you recreate the container. +Qdrant Cloud provides you with a set of metrics to monitor the health of your database cluster. You can access these metrics in the Qdrant Cloud Console in the **Metrics** and **Request** sections of the Cluster Details page. -### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-3-upload-data-to-qdrant) Step 3: Upload data to Qdrant +## Logs -Now once we have the vectors prepared and the search engine running, we can start uploading the data. -To interact with Qdrant from python, I recommend using an out-of-the-box client library. +![Cluster Logs](/documentation/cloud/cluster-logs.png) -To install it, use the following command +Logs of the database cluster are available in the Qdrant Cloud Console in the **Logs** section of the Cluster Details page. -```bash -pip install qdrant-client +## Alerts -``` +You will receive automatic alerts via email before your cluster reaches the currently configured memory or storage limits, including recommendations for scaling your cluster. -At this point, we should have startup records in file `startups.json`, encoded vectors in file `startup_vectors.npy`, and running Qdrant on a local machine. -Let’s write a script to upload all startup data and vectors into the search engine. +## Qdrant Database Metrics and Telemetry -First, let’s create a client object for Qdrant. +You can also directly access the metrics and telemetry that the Qdrant database nodes provide. -```python -# Import client library -from qdrant_client import QdrantClient -from qdrant_client.models import VectorParams, Distance +To scrape metrics from a Qdrant cluster running in Qdrant Cloud, an [API key](/documentation/cloud/authentication/) is required to access `/metrics` and `/sys_metrics`. 
Qdrant Cloud also supports supplying the API key as a [Bearer token](https://www.rfc-editor.org/rfc/rfc6750.html), which may be required by some providers. -qdrant_client = QdrantClient(host='localhost', port=6333) +### Qdrant Node Metrics -``` +Metrics in a Prometheus compatible format are available at the `/metrics` endpoint of each Qdrant database node. When scraping, you should use the [node specific URLs](/documentation/cloud/cluster-access/#node-specific-endpoints) to ensure that you are scraping metrics from all nodes in each cluster. For more information see [Qdrant monitoring](/documentation/guides/monitoring/). -Qdrant allows you to combine vectors of the same purpose into collections. -Many independent vector collections can exist on one service at the same time. +You can also access the `/telemetry` [endpoint](https://api.qdrant.tech/api-reference/service/telemetry) of your database. This endpoint is available on the cluster endpoint and provides information about the current state of the database, including the number of vectors, shards, and other useful information. -Let’s create a new collection for our startup vectors. +For more information, see [Qdrant monitoring](/documentation/guides/monitoring/). -```python -if not qdrant_client.collection_exists('startups'): - qdrant_client.create_collection( - collection_name='startups', - vectors_config=VectorParams(size=384, distance=Distance.COSINE), - ) +### Cluster System Metrics -``` +Cluster system metrics is a cloud-only endpoint that not only shares all the information about the database from `/metrics` but also provides additional operational data from our infrastructure about your cluster, including information from our load balancers, ingresses, and cluster workloads themselves. -The `vector_size` parameter is very important. -It tells the service the size of the vectors in that collection. -All vectors in a collection must have the same size, otherwise, it is impossible to calculate the distance between them. -`384` is the output dimensionality of the encoder we are using. +Metrics in a Prometheus-compatible format are available at the `/sys_metrics` cluster endpoint. Database API Keys are used to authenticate access to cluster system metrics. `/sys_metrics` only need to be queried once per cluster on the main load-balanced cluster endpoint. You don't need to scrape each cluster node individually, instead it will always provide metrics about all nodes. -The `distance` parameter allows specifying the function used to measure the distance between two points. +## Grafana Dashboard -The Qdrant client library defines a special function that allows you to load datasets into the service. -However, since there may be too much data to fit a single computer memory, the function takes an iterator over the data as input. +If you scrape your Qdrant Cluster system metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. -Let’s create an iterator over the startup data and vectors. +![Grafa dashboard](/documentation/cloud/cloud-grafana-dashboard.png) -```python -import numpy as np -import json + -fd = open('./startups.json') +### Cluster System Mtrics `/sys_metrics` -# payload is now an iterator over startup data -payload = map(json.loads, fd) +In Qdrant Cloud, each Qdrant cluster will expose the following metrics. This endpoint is not available when running Qdrant open-source. 
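+For ad-hoc checks, both endpoints can be queried directly with a database API key. A sketch using `requests`; the node and cluster URLs are placeholders:

+```python
+import requests
+
+NODE_URL = "https://node-0-YOUR-CLUSTER-URL:6333"  # placeholder node-specific endpoint for /metrics
+CLUSTER_URL = "https://YOUR-CLUSTER-URL:6333"      # placeholder load-balanced cluster endpoint
+API_KEY = "YOUR_API_KEY"
+
+# Per-node Qdrant metrics, authenticated with Qdrant's api-key header
+node_metrics = requests.get(f"{NODE_URL}/metrics", headers={"api-key": API_KEY})
+
+# Cluster system metrics only need to be queried once, on the main endpoint;
+# a Bearer token works as well if your monitoring agent only supports that scheme
+sys_metrics = requests.get(f"{CLUSTER_URL}/sys_metrics", headers={"Authorization": f"Bearer {API_KEY}"})
+
+print(node_metrics.text[:200])
+print(sys_metrics.text[:200])
+```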
-# Here we load all vectors into memory, numpy array works as iterable for itself. -# Other option would be to use Mmap, if we don't want to load all data into RAM -vectors = np.load('./startup_vectors.npy') +**List of metrics** -``` +| Name | Type | Meaning | +|-------------------------------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------| +| app_info | gauge | Information about the Qdrant server | +| app_status_recovery_mode | gauge | If Qdrant is currently started in recovery mode | +| cluster_commit | | | +| cluster_enabled | | Indicates wether multi-node clustering is enabled | +| cluster_peers_total | counter | Total number of cluster peers | +| cluster_pending_operations_total | counter | Total number of pending operations in the cluster | +| cluster_term | | | +| cluster_voter | | | +| collection_hardware_metric_cpu | | | +| collection_hardware_metric_io_read | | | +| collection_hardware_metric_io_write | | | +| collections_total | counter | Number of collections | +| collections_vector_total | counter | Total number of vectors in all collections | +| container_cpu_cfs_periods_total | | | +| container_cpu_cfs_throttled_periods_total | counter | Indicating that your CPU demand was higher than what your instance offers | +| container_cpu_usage_seconds_total | counter | Total CPU usage in seconds | +| container_file_descriptors | | | +| container_fs_reads_bytes_total | counter | Total number of bytes read by the container file system (disk) | +| container_fs_reads_total | counter | Total number of read operations on the container file system (disk) | +| container_fs_writes_bytes_total | counter | Total number of bytes written by the container file system (disk) | +| container_fs_writes_total | counter | Total number of write operations on the container file system (disk) | +| container_memory_cache | gauge | Memory used for cache in the container | +| container_memory_mapped_file | gauge | Memory used for memory-mapped files in the container | +| container_memory_rss | gauge | Resident Set Size (RSS) - Memory used by the container excluding swap space used for caching | +| container_memory_working_set_bytes | gauge | Total memory used by the container, including both anonymous and file-backed memory | +| container_network_receive_bytes_total | counter | Total bytes received over the container's network interface | +| container_network_receive_errors_total | | | +| container_network_receive_packets_dropped_total | | | +| container_network_receive_packets_total | | | +| container_network_transmit_bytes_total | counter | Total bytes transmitted over the container's network interface | +| container_network_transmit_errors_total | | | +| container_network_transmit_packets_dropped_total | | | +| container_network_transmit_packets_total | | | +| kube_persistentvolumeclaim_info | | | +| kube_pod_container_info | | | +| kube_pod_container_resource_limits | gauge | Response contains limits for CPU and memory of DB. | +| kube_pod_container_resource_requests | gauge | Response contains requests for CPU and memory of DB. 
| +| kube_pod_container_status_last_terminated_exitcode | | | +| kube_pod_container_status_last_terminated_reason | | | +| kube_pod_container_status_last_terminated_timestamp | | | +| kube_pod_container_status_ready | | | +| kube_pod_container_status_restarts_total | | | +| kube_pod_container_status_running | | | +| kube_pod_container_status_terminated | | | +| kube_pod_container_status_terminated_reason | | | +| kube_pod_created | | | +| kube_pod_info | | | +| kube_pod_start_time | | | +| kube_pod_status_container_ready_time | | | +| kube_pod_status_initialized_time | | | +| kube_pod_status_phase | gauge | Pod status in terms of different phases (Failed/Running/Succeeded/Unknown) | +| kube_pod_status_ready | gauge | Pod readiness state (unknown/false/true) | +| kube_pod_status_ready_time | | | +| kube_pod_status_reason | | | +| kubelet_volume_stats_capacity_bytes | gauge | Amount of disk available | +| kubelet_volume_stats_inodes | gauge | Amount of inodes available | +| kubelet_volume_stats_inodes_used | gauge | Amount of inodes used | +| kubelet_volume_stats_used_bytes | gauge | Amount of disk used | +| memory_active_bytes | | | +| memory_allocated_bytes | | | +| memory_metadata_bytes | | | +| memory_resident_bytes | | | +| memory_retained_bytes | | | +| qdrant_cluster_state | | | +| qdrant_collection_commit | | | +| qdrant_collection_config_hnsw_full_ef_construct | | | +| qdrant_collection_config_hnsw_full_scan_threshold | | | +| qdrant_collection_config_hnsw_m | | | +| qdrant_collection_config_hnsw_max_indexing_threads | | | +| qdrant_collection_config_hnsw_on_disk | | | +| qdrant_collection_config_hnsw_payload_m | | | +| qdrant_collection_config_optimizer_default_segment_number | | | +| qdrant_collection_config_optimizer_deleted_threshold | | | +| qdrant_collection_config_optimizer_flush_interval_sec | | | +| qdrant_collection_config_optimizer_indexing_threshold | | | +| qdrant_collection_config_optimizer_max_optimization_threads | | | +| qdrant_collection_config_optimizer_max_segment_size | | | +| qdrant_collection_config_optimizer_memmap_threshold | | | +| qdrant_collection_config_optimizer_vacuum_min_vector_number | | | +| qdrant_collection_config_params_always_ram | | | +| qdrant_collection_config_params_on_disk_payload | | | +| qdrant_collection_config_params_product_compression | | | +| qdrant_collection_config_params_read_fanout_factor | | | +| qdrant_collection_config_params_replication_factor | | | +| qdrant_collection_config_params_scalar_quantile | | | +| qdrant_collection_config_params_scalar_type | | | +| qdrant_collection_config_params_shard_number | | | +| qdrant_collection_config_params_vector_size | | | +| qdrant_collection_config_params_write_consistency_factor | | | +| qdrant_collection_config_quantization_always_ram | | | +| qdrant_collection_config_quantization_product_compression | | | +| qdrant_collection_config_quantization_scalar_quantile | | | +| qdrant_collection_config_quantization_scalar_type | | | +| qdrant_collection_config_wal_capacity_mb | | | +| qdrant_collection_config_wal_segments_ahead | | | +| qdrant_collection_consensus_thread_status | | | +| qdrant_collection_is_voter | | | +| qdrant_collection_number_of_collections | counter | Total number of collections in Qdrant | +| qdrant_collection_number_of_grpc_requests | counter | Total number of gRPC requests on a collection | +| qdrant_collection_number_of_rest_requests | counter | Total number of REST requests on a collection | +| qdrant_collection_pending_operations | counter | Total number of 
pending operations on a collection | +| qdrant_collection_role | | | +| qdrant_collection_shard_segment_num_indexed_vectors | | | +| qdrant_collection_shard_segment_num_points | | | +| qdrant_collection_shard_segment_num_vectors | | | +| qdrant_collection_shard_segment_type | | | +| qdrant_collection_term | | | +| qdrant_collection_transfer | | | +| qdrant_operator_cluster_info_total | | | +| qdrant_operator_cluster_phase | gauge | Information about the status of Qdrant clusters | +| qdrant_operator_cluster_pod_up_to_date | | | +| qdrant_operator_cluster_restore_info_total | | | +| qdrant_operator_cluster_restore_phase | | | +| qdrant_operator_cluster_scheduled_snapshot_info_total | | | +| qdrant_operator_cluster_scheduled_snapshot_phase | | | +| qdrant_operator_cluster_snapshot_duration_sconds | | | +| qdrant_operator_cluster_snapshot_phase | gauge | Information about the status of Qdrant cluster backups | +| qdrant_operator_cluster_status_nodes | | | +| qdrant_operator_cluster_status_nodes_ready | | | +| qdrant_node_rssanon_bytes | gauge | Allocated memory without memory-mapped files. This is the hard metric on memory which will lead to an OOM if it goes over the limit | +| rest_responses_avg_duration_seconds | | | +| rest_responses_duration_seconds_bucket | | | +| rest_responses_duration_seconds_count | | | +| rest_responses_duration_seconds_sum | | | +| rest_responses_fail_total | | | +| rest_responses_max_duration_seconds | | | +| rest_responses_min_duration_seconds | | | +| rest_responses_total | | | +| traefik_service_open_connections | | | +| traefik_service_request_duration_seconds_bucket | | | +| traefik_service_request_duration_seconds_count | | | +| traefik_service_request_duration_seconds_sum | gauge | Response contains list of metrics for each Traefik service. | +| traefik_service_requests_bytes_total | | | +| traefik_service_requests_total | counter | Response contains list of metrics for each Traefik service. | +| traefik_service_responses_bytes_total | | | -And the final step - data uploading +<|page-188-lllmstxt|> +# Updating Qdrant Cloud Clusters -```python -qdrant_client.upload_collection( - collection_name='startups', - vectors=vectors, - payload=payload, - ids=None, # Vector ids will be assigned automatically - batch_size=256 # How many vectors will be uploaded in a single request? -) +As soon as a new Qdrant version is available. Qdrant Cloud will show you an update notification in the Cluster list and on the Cluster details page. -``` +To update to a new version, go to the Cluster Details page, choose the new version from the version dropdown and click **Update**. -Now we have vectors uploaded to the vector search engine. -In the next step, we will learn how to actually search for the closest vectors. +![Cluster Updates](/documentation/cloud/cluster-upgrades.png) -The full code for this step can be found [here](https://github.com/qdrant/qdrant_demo/blob/master/qdrant_demo/init_collection_startups.py). +If you have a multi-node cluster and if your collections have a replication factor of at least **2**, the update process will be zero-downtime and done in a rolling fashion. You will be able to use your database cluster normally. -### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-4-make-a-search-api) Step 4: Make a search API +If you have a single-node cluster or a collection with a replication factor of **1**, the update process will require a short downtime period to restart your cluster with the new version. 
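+
+For zero-downtime updates, a collection therefore needs a replication factor of at least 2. As a rough sketch (the endpoint, collection name, and vector parameters below are placeholders, not values from this page), such a collection could be created with the Python client along these lines:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(
+    url="https://xyz-example.qdrant.io:6333",
+    api_key="<your-api-key>",
+)
+
+# Two replicas of every shard let one node restart during a rolling
+# update while the other replica keeps serving reads and writes.
+client.create_collection(
+    collection_name="<your-collection>",
+    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
+    replication_factor=2,
+)
+```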
-Now that all the preparations are complete, let’s start building a neural search class. +See also [Restart Mode](/documentation/cloud/configure-cluster/#restart-mode) for more details. -First, install all the requirements: +<|page-189-lllmstxt|> +# Backing up Qdrant Cloud Clusters -```bash -pip install sentence-transformers numpy +Qdrant organizes cloud instances as clusters. On occasion, you may need to +restore your cluster because of application or system failure. -``` +You may already have a source of truth for your data in a regular database. If you +have a problem, you could reindex the data into your Qdrant vector search cluster. +However, this process can take time. For high availability critical projects we +recommend replication. It guarantees the proper cluster functionality as long as +at least one replica is running. -In order to process incoming requests neural search will need 2 things. -A model to convert the query into a vector and Qdrant client, to perform a search queries. +For other use-cases such as disaster recovery, you can set up automatic or +self-service backups. -```python -# File: neural_searcher.py +## Prerequisites -from qdrant_client import QdrantClient -from sentence_transformers import SentenceTransformer +You can back up your Qdrant clusters though the Qdrant Cloud +Dashboard at https://cloud.qdrant.io. This section assumes that you've already +set up your cluster, as described in the following sections: -class NeuralSearcher: +- [Create a cluster](/documentation/cloud/create-cluster/) +- Set up [Authentication](/documentation/cloud/authentication/) +- Configure one or more [Collections](/documentation/concepts/collections/) - def __init__(self, collection_name): - self.collection_name = collection_name - # Initialize encoder model - self.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') - # initialize Qdrant client - self.qdrant_client = QdrantClient(host='localhost', port=6333) +## Automatic Backups -``` +You can set up automatic backups of your clusters with our Cloud UI. With the +procedures listed in this page, you can set up +snapshots on a daily/weekly/monthly basis. You can keep as many snapshots as you +need. You can restore a cluster from the snapshot of your choice. -The search function looks as simple as possible: +> Note: When you restore a snapshot, consider the following: +> - The affected cluster is not available while a snapshot is being restored. +> - If you changed the cluster setup after the copy was created, the cluster + resets to the previous configuration. +> - The previous configuration includes: +> - CPU +> - Memory +> - Node count +> - Qdrant version -```python - def search(self, text: str): - # Convert text query into vector - vector = self.model.encode(text).tolist() +### Configure a Backup - # Use `vector` for search for closest vectors in the collection - search_result = self.qdrant_client.search( - collection_name=self.collection_name, - query_vector=vector, - query_filter=None, # We don't want any filters for now - top=5 # 5 the most closest results is enough - ) - # `search_result` contains found vector ids with similarity scores along with the stored payload - # In this function we are interested in payload only - payloads = [hit.payload for hit in search_result] - return payloads +After you have taken the prerequisite steps, you can configure a backup with the +[Qdrant Cloud Dashboard](https://cloud.qdrant.io). To do so, take these steps: -``` +1. 
On the **Cluster Detail Page** and select the **Backups** tab. +1. Now you can set up a backup schedule. + The **Days of Retention** is the number of days after a backup snapshot is + deleted. +1. Alternatively, you can select **Backup now** to take an immediate snapshot. -With Qdrant it is also feasible to add some conditions to the search. -For example, if we wanted to search for startups in a certain city, the search query could look like this: +![Configure a cluster backup](/documentation/cloud/backup-schedule.png) -```python -from qdrant_client.models import Filter +### Restore a Backup - ... +If you have a backup, it appears in the list of **Available Backups**. You can +choose to restore or delete the backups of your choice. - city_of_interest = "Berlin" +![Restore or delete a cluster backup](/documentation/cloud/restore-delete.png) - # Define a filter for cities - city_filter = Filter(**{ - "must": [{\ - "key": "city", # We store city information in a field of the same name\ - "match": { # This condition checks if payload field have requested value\ - "keyword": city_of_interest\ - }\ - }] - }) + - search_result = self.qdrant_client.search( - collection_name=self.collection_name, - query_vector=vector, - query_filter=city_filter, - top=5 - ) - ... +## Backups With a Snapshot -``` +Qdrant also offers a snapshot API which allows you to create a snapshot +of a specific collection or your entire cluster. For more information, see our +[snapshot documentation](/documentation/concepts/snapshots/). -We now have a class for making neural search queries. Let’s wrap it up into a service. +Here is how you can take a snapshot and recover a collection: -### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#step-5-deploy-as-a-service) Step 5: Deploy as a service +1. Take a snapshot: + - For a single node cluster, call the snapshot endpoint on the exposed URL. + - For a multi node cluster call a snapshot on each node of the collection. + Specifically, prepend `node-{num}-` to your cluster URL. + Then call the [snapshot endpoint](/documentation/concepts/snapshots/#create-snapshot) on the individual hosts. Start with node 0. + - In the response, you'll see the name of the snapshot. +2. Delete and recreate the collection. +3. Recover the snapshot: + - Call the [recover endpoint](/documentation/concepts/snapshots/#recover-in-cluster-deployment). Set a location which points to the snapshot file (`file:///qdrant/snapshots/{collection_name}/{snapshot_file_name}`) for each host. -To build the service we will use the FastAPI framework. -It is super easy to use and requires minimal code writing. +## Backup Considerations -To install it, use the command +Backups are incremental for AWS and GCP clusters. For example, if you have two backups, backup number 2 +contains only the data that changed since backup number 1. This reduces the +total cost of your backups. -```bash -pip install fastapi uvicorn +For Azure clusters, backups are based on total disk usage. The cost is calculated +as half of the disk usage when the backup was taken. -``` +You can create multiple backup schedules. -Our service will have only one API endpoint and will look like this: +When you restore a snapshot, any changes made after the date of the snapshot +are lost. -```python -# File: service.py +<|page-190-lllmstxt|> +# Inference in Qdrant Managed Cloud -from fastapi import FastAPI +Inference is the process of creating vector embeddings from text, images, or other data types using a machine learning model. 
-# That is the file where NeuralSearcher is stored
-from neural_searcher import NeuralSearcher

+Qdrant Managed Cloud allows you to use inference directly in the cloud, without the need to set up and maintain your own inference infrastructure.

-app = FastAPI()

+

-# Create an instance of the neural searcher
-neural_searcher = NeuralSearcher(collection_name='startups')

+![Cloud Inference UI](/documentation/cloud/cloud-inference.png)

-@app.get("/api/search")
-def search_startup(q: str):
-    return {
-        "result": neural_searcher.search(text=q)
-    }

+## Supported Models

-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+You can see the list of supported models in the Inference tab of the Cluster Detail page in the Qdrant Cloud Console. The list includes models for text, both to produce dense and sparse vectors, as well as multi-modal models for images.

-```

+## Enabling/Disabling Inference

-Now, if you run the service with

+Inference is enabled by default for all new clusters created after July 7th, 2025. You can enable it for existing clusters directly from the Inference tab of the Cluster Detail page in the Qdrant Cloud Console. Activating inference will trigger a restart of your cluster to apply the new configuration.

-```bash
-python service.py

+## Billing

-```

+Inference is billed based on the number of tokens processed by the model. The cost is calculated per 1,000,000 tokens. The price depends on the model and is displayed on the Inference tab of the Cluster Detail page. You can also see the current usage of each model there.

-and open your browser at [http://localhost:8000/docs](http://localhost:8000/docs) , you should be able to see a debug interface for your service.

+## Using Inference

-![FastAPI Swagger interface](https://gist.githubusercontent.com/generall/c229cc94be8c15095286b0c55a3f19d7/raw/d866e37a60036ebe65508bd736faff817a5d27e9/fastapi_neural_search.png)

+Inference can be used through the Qdrant SDKs and the REST or gRPC APIs, both when upserting points and when querying the database.

-Feel free to play around with it, make queries and check out the results.
-This concludes the tutorial.

+Instead of a vector, you can use special *Inference Objects*:

-### [Anchor](https://qdrant.tech/articles/neural-search-tutorial/\#experience-neural-search-with-qdrants-free-demo) Experience Neural Search With Qdrant’s Free Demo

+* **`Document`** object, used for text inference

-Excited to see neural search in action? Take the next step and book a [free demo](https://qdrant.to/semantic-search-demo) with Qdrant! Experience firsthand how this cutting-edge technology can transform your search capabilities.

+```js
+// Document
+{
+    // Text input
+    text: "Your text",
+    // Name of the model, to do inference with
+    model: "",
+    // Extra parameters for the model, Optional
+    options: {}
+}
+```

-Our demo will help you grow intuition for cases when the neural search is useful. The demo contains a switch that selects between neural and full-text searches. You can turn neural search on and off to compare the result with regular full-text search.
-Try to use a startup description to find similar ones.

+* **`Image`** object, used for image inference

-Join our [Discord community](https://qdrant.to/discord), where we talk about vector search and similarity learning, and publish other examples of neural networks and neural search applications.
+```js +// Image +{ + // Image input + image: "", // Or base64 encoded image + // Name of the model, to do inference with + model: "", + // Extra parameters for the model, Optional + options: {} +} +``` -##### Was this page useful? +* **`Object`** object, reserved for other types of input, which might be implemented in the future. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No -Thank you for your feedback! 🙏 +The Qdrant API supports usage of these Inference Objects in all places, where regular vectors can be used. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/neural-search-tutorial.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +For example: -On this page: +```http +POST /collections//points/query +{ + "query": { + "nearest": [0.12, 0.34, 0.56, 0.78, ...] + } +} +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/neural-search-tutorial.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Can be replaced with -× +```http +POST /collections//points/query +{ + "query": { + "nearest": { + "text": "My Query Text", + "model": "" + } + } +} +``` -[Powered by](https://qdrant.tech/) +In this case, the Qdrant Cloud will use the configured embedding model to automatically create a vector from the Inference Object and then perform the search query with it. All of this happens within a low-latency network. -<|page-110-lllmstxt|> -## multitenancy -- [Articles](https://qdrant.tech/articles/) -- How to Implement Multitenancy and Custom Sharding in Qdrant +The input used for inference will not be saved anywhere. If you want to persist it in Qdrant, make sure to explicitly include it in the payload. -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) -# How to Implement Multitenancy and Custom Sharding in Qdrant +### Text Inference -David Myriel +Let's consider an example of using Cloud Inference with a text model producing dense vectors. -· +Here, we create one point and use a simple search query with a `Document` Inference Object. -February 06, 2024 +```python +from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, Document -![How to Implement Multitenancy and Custom Sharding in Qdrant](https://qdrant.tech/articles_data/multitenancy/preview/title.jpg) +client = QdrantClient( + url="https://xyz-example.qdrant.io:6333", + api_key="", + # IMPORTANT + # If not enabled, inference will be performed locally + cloud_inference=True, +) + +points = [ + PointStruct( + id=1, + payload={"topic": "cooking", "type": "dessert"}, + vector=Document( + text="Recipe for baking chocolate chip cookies", + model="" + ) + ) +] -# [Anchor](https://qdrant.tech/articles/multitenancy/\#scaling-your-machine-learning-setup-the-power-of-multitenancy-and-custom-sharding-in-qdrant) Scaling Your Machine Learning Setup: The Power of Multitenancy and Custom Sharding in Qdrant +client.upsert(collection_name="", points=points) -We are seeing the topics of [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/) and [distributed deployment](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding) pop-up daily on our [Discord support channel](https://qdrant.to/discord). 
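+# Note: the text inside `Document` is only used to generate the embedding on the
+# cloud side; it is not stored with the point. Add it to the payload as well if
+# you want to keep the original text (see the note on persistence above).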
This tells us that many of you are looking to scale Qdrant along with the rest of your machine learning setup. +result = client.query_points( + collection_name="", + query=Document( + text="How to bake cookies?", + model="" + ) +) -Whether you are building a bank fraud-detection system, [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/) for e-commerce, or services for the federal government - you will need to leverage a multitenant architecture to scale your product. -In the world of SaaS and enterprise apps, this setup is the norm. It will considerably increase your application’s performance and lower your hosting costs. +print(result) +``` -## [Anchor](https://qdrant.tech/articles/multitenancy/\#multitenancy--custom-sharding-with-qdrant) Multitenancy & custom sharding with Qdrant +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using Value = Qdrant.Client.Grpc.Value; -We have developed two major features just for this. **You can now scale a single Qdrant cluster and support all of your customers worldwide.** Under [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/), each customer’s data is completely isolated and only accessible by them. At times, if this data is location-sensitive, Qdrant also gives you the option to divide your cluster by region or other criteria that further secure your customer’s access. This is called [custom sharding](https://qdrant.tech/documentation/guides/distributed_deployment/#user-defined-sharding). +var client = new QdrantClient( + host: "xyz-example.qdrant.io", + port: 6334, + https: true, + apiKey: "" +); -Combining these two will result in an efficiently-partitioned architecture that further leverages the convenience of a single Qdrant cluster. This article will briefly explain the benefits and show how you can get started using both features. +await client.UpsertAsync( + collectionName: "", + points: new List { + new() { + Id = 1, + Vectors = new Document() { + Text = "Recipe for baking chocolate chip cookies", + Model = "", + }, + Payload = { + ["topic"] = "cooking", + ["type"] = "dessert" + }, + }, + } +); -## [Anchor](https://qdrant.tech/articles/multitenancy/\#one-collection-many-tenants) One collection, many tenants +var points = await client.QueryAsync( + collectionName: "", + query: new Document() { + Text = "How to bake cookies?", + Model = "" + } +); -When working with Qdrant, you can upsert all your data to a single collection, and then partition each vector via its payload. This means that all your users are leveraging the power of a single Qdrant cluster, but their data is still isolated within the collection. Let’s take a look at a two-tenant collection: +foreach(var point in points) { + Console.WriteLine(point); +} +``` -**Figure 1:** Each individual vector is assigned a specific payload that denotes which tenant it belongs to. This is how a large number of different tenants can share a single Qdrant collection. -![Qdrant Multitenancy](https://qdrant.tech/articles_data/multitenancy/multitenancy-single.png) +```bash +# Create a new vector +curl -X PUT "https://xyz-example.qdrant.io:6333/collections//points?wait=true" \ + -H "Content-Type: application/json" \ + -H "api-key: " \ + -d '{ + "points": [ + { + "id": 1, + "payload": { "topic": "cooking", "type": "dessert" }, + "vector": { + "text": "Recipe for baking chocolate chip cookies", + "model": "" + } + } + ] + }' -Qdrant is built to excel in a single collection with a vast number of tenants. 
You should only create multiple collections when your data is not homogenous or if users’ vectors are created by different embedding models. Creating too many collections may result in resource overhead and cause dependencies. This can increase costs and affect overall performance. +# Perform a search query +curl -X POST "https://xyz-example.qdrant.io:6333/collections//points/query" \ + -H "Content-Type: application/json" \ + -H "api-key: " \ + -d '{ + "query": { + "text": "How to bake cookies?", + "model": "" + } + }' +``` -## [Anchor](https://qdrant.tech/articles/multitenancy/\#sharding-your-database) Sharding your database +```go +package main -With Qdrant, you can also specify a shard for each vector individually. This feature is useful if you want to [control where your data is kept in the cluster](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding). For example, one set of vectors can be assigned to one shard on its own node, while another set can be on a completely different node. +import ( + "context" + "log" + "time" -During vector search, your operations will be able to hit only the subset of shards they actually need. In massive-scale deployments, **this can significantly improve the performance of operations that do not require the whole collection to be scanned**. + "github.com/qdrant/go-client/qdrant" +) -This works in the other direction as well. Whenever you search for something, you can specify a shard or several shards and Qdrant will know where to find them. It will avoid asking all machines in your cluster for results. This will minimize overhead and maximize performance. +func main() { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() -### [Anchor](https://qdrant.tech/articles/multitenancy/\#common-use-cases) Common use cases + client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, + }) + if err != nil { + log.Fatalf("did not connect: %v", err) + } + defer client.Close() -A clear use-case for this feature is managing a multitenant collection, where each tenant (let it be a user or organization) is assumed to be segregated, so they can have their data stored in separate shards. Sharding solves the problem of region-based data placement, whereby certain data needs to be kept within specific locations. To do this, however, you will need to [move your shards between nodes](https://qdrant.tech/documentation/guides/distributed_deployment/#moving-shards). + _, err = client.GetPointsClient().Upsert(ctx, &qdrant.UpsertPoints{ + CollectionName: "", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(uint64(1)), + Vectors: qdrant.NewVectorsDocument(&qdrant.Document{ + Text: "Recipe for baking chocolate chip cookies", + Model: "", + }), + Payload: qdrant.NewValueMap(map[string]any{ + "topic": "cooking", + "type": "dessert", + }), + }, + }, + }) + if err != nil { + log.Fatalf("error creating point: %v", err) + } -**Figure 2:** Users can both upsert and query shards that are relevant to them, all within the same collection. Regional sharding can help avoid cross-continental traffic. 
-![Qdrant Multitenancy](https://qdrant.tech/articles_data/multitenancy/shards.png) + points, err := client.Query(ctx, &qdrant.QueryPoints{ + CollectionName: "", + Query: qdrant.NewQueryNearest( + qdrant.NewVectorInputDocument(&qdrant.Document{ + Text: "How to bake cookies?", + Model: "", + }), + ), + }) + log.Printf("List of points: %s", points) +} +``` -Custom sharding also gives you precise control over other use cases. A time-based data placement means that data streams can index shards that represent latest updates. If you organize your shards by date, you can have great control over the recency of retrieved data. This is relevant for social media platforms, which greatly rely on time-sensitive data. +```http +# Insert new points with cloud-side inference +PUT /collections//points?wait=true +{ + "points": [ + { + "id": 1, + "payload": { "topic": "cooking", "type": "dessert" }, + "vector": { + "text": "Recipe for baking chocolate chip cookies", + "model": "" + } + } + ] +} -## [Anchor](https://qdrant.tech/articles/multitenancy/\#before-i-go-any-furtherhow-secure-is-my-user-data) Before I go any further
..how secure is my user data? +# Search in the collection using cloud-side inference +POST /collections//points/query +{ + "query": { + "text": "How to bake cookies?", + "model": "" + } +} +``` -By design, Qdrant offers three levels of isolation. We initially introduced collection-based isolation, but your scaled setup has to move beyond this level. In this scenario, you will leverage payload-based isolation (from multitenancy) and resource-based isolation (from sharding). The ultimate goal is to have a single collection, where you can manipulate and customize placement of shards inside your cluster more precisely and avoid any kind of overhead. The diagram below shows the arrangement of your data within a two-tier isolation arrangement. +```typescript +import {QdrantClient} from "@qdrant/js-client-rest"; -**Figure 3:** Users can query the collection based on two filters: the `group_id` and the individual `shard_key_selector`. This gives your data two additional levels of isolation. -![Qdrant Multitenancy](https://qdrant.tech/articles_data/multitenancy/multitenancy.png) +const client = new QdrantClient({ + url: 'https://xyz-example.qdrant.io:6333', + apiKey: '', +}); -## [Anchor](https://qdrant.tech/articles/multitenancy/\#create-custom-shards-for-a-single-collection) Create custom shards for a single collection +const points = [ + { + id: 1, + payload: { topic: "cooking", type: "dessert" }, + vector: { + text: "Recipe for baking chocolate chip cookies", + model: "" + } + } +]; -When creating a collection, you will need to configure user-defined sharding. This lets you control the shard placement of your data, so that operations can hit only the subset of shards they actually need. In big clusters, this can significantly improve the performance of operations, since you won’t need to go through the entire collection to retrieve data. +await client.upsert("", { wait: true, points }); -```python -client.create_collection( - collection_name="{tenant_data}", - shard_number=2, - sharding_method=models.ShardingMethod.CUSTOM, - # ... other collection parameters +const result = await client.query( + "", + { + query: { + text: "How to bake cookies?", + model: "" + }, + } ) -client.create_shard_key("{tenant_data}", "canada") -client.create_shard_key("{tenant_data}", "germany") +console.log(result); ``` -In this example, your cluster is divided between Germany and Canada. Canadian and German law differ when it comes to international data transfer. Let’s say you are creating a RAG application that supports the healthcare industry. Your Canadian customer data will have to be clearly separated for compliance purposes from your German customer. - -Even though it is part of the same collection, data from each shard is isolated from other shards and can be retrieved as such. For additional examples on shards and retrieval, consult [Distributed Deployments](https://qdrant.tech/documentation/guides/distributed_deployment/) documentation and [Qdrant Client specification](https://python-client.qdrant.tech/). - -## [Anchor](https://qdrant.tech/articles/multitenancy/\#configure-a-multitenant-setup-for-users) Configure a multitenant setup for users - -Let’s continue and start adding data. As you upsert your vectors to your new collection, you can add a `group_id` field to each vector. If you do this, Qdrant will assign each vector to its respective group. - -Additionally, each vector can now be allocated to a shard. You can specify the `shard_key_selector` for each individual vector. 
In this example, you are upserting data belonging to `tenant_1` to the Canadian region. +```rust +use qdrant_client::qdrant::Query; +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Payload; +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Document}; +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -```python -client.upsert( - collection_name="{tenant_data}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={"group_id": "tenant_1"},\ - vector=[0.9, 0.1, 0.1],\ - ),\ - models.PointStruct(\ - id=2,\ - payload={"group_id": "tenant_1"},\ - vector=[0.1, 0.9, 0.1],\ - ),\ - ], - shard_key_selector="canada", -) +#[tokio::main] +async fn main() { + let client = Qdrant::from_url("https://xyz-example.qdrant.io:6334") + .api_key("") + .build() + .unwrap(); + + let points = vec![ + PointStruct::new( + 1, + Document::new( + "Recipe for baking chocolate chip cookies", + "" + ), + Payload::try_from(serde_json::json!( + {"topic": "cooking", "type": "dessert"} + )).unwrap(), + ) + ]; -``` + let upsert_request = UpsertPointsBuilder::new( + "", + points + ).wait(true); -Keep in mind that the data for each `group_id` is isolated. In the example below, `tenant_1` vectors are kept separate from `tenant_2`. The first tenant will be able to access their data in the Canadian portion of the cluster. However, as shown below `tenant_2 ` might only be able to retrieve information hosted in Germany. + let _ = client.upsert_points(upsert_request).await; -```python -client.upsert( - collection_name="{tenant_data}", - points=[\ - models.PointStruct(\ - id=3,\ - payload={"group_id": "tenant_2"},\ - vector=[0.1, 0.1, 0.9],\ - ),\ - ], - shard_key_selector="germany", -) + let query_document = Document::new( + "How to bake cookies?", + "" + ); + let query_request = QueryPointsBuilder::new("") + .query(Query::new_nearest(query_document)); + + let result = client.query(query_request).await.unwrap(); + println!("Result: {:?}", result); +} ``` -## [Anchor](https://qdrant.tech/articles/multitenancy/\#retrieve-data-via-filters) Retrieve data via filters +```java +package org.example; -The access control setup is completed as you specify the criteria for data retrieval. When searching for vectors, you need to use a `query_filter` along with `group_id` to filter vectors for each user. 
+import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; -```python -client.search( - collection_name="{tenant_data}", - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="group_id",\ - match=models.MatchValue(\ - value="tenant_1",\ - ),\ - ),\ - ] - ), - query_vector=[0.1, 0.1, 0.9], - limit=10, -) +import io.qdrant.client.grpc.Points; +import io.qdrant.client.grpc.Points.Document; +import io.qdrant.client.grpc.Points.PointStruct; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +public class Main { + public static void main(String[] args) + throws ExecutionException, InterruptedException { + QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder("xyz-example.qdrant.io", 6334, true) + .withApiKey("") + .build()); + + client + .upsertAsync( + "", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors( + vectors( + Document.newBuilder() + .setText("Recipe for baking chocolate chip cookies") + .setModel("") + .build())) + .putAllPayload(Map.of("topic", value("cooking"), "type", value("dessert"))) + .build())) + .get(); + + List points = + client + .queryAsync( + Points.QueryPoints.newBuilder() + .setCollectionName("") + .setQuery( + nearest( + Document.newBuilder() + .setText("How to bake cookies?") + .setModel("") + .build())) + .build()) + .get(); + System.out.printf(points.toString()); + } +} ``` -## [Anchor](https://qdrant.tech/articles/multitenancy/\#performance-considerations) Performance considerations +Usage examples, specific to each cluster and model, can also be found in the Inference tab of the Cluster Detail page in the Qdrant Cloud Console. -The speed of indexation may become a bottleneck if you are adding large amounts of data in this way, as each user’s vector will be indexed into the same collection. To avoid this bottleneck, consider _bypassing the construction of a global vector index_ for the entire collection and building it only for individual groups instead. +Note that each model has a context window, which is the maximum number of tokens that can be processed by the model in a single request. If the input text exceeds the context window, it will be truncated to fit within the limit. The context window size is displayed in the Inference tab of the Cluster Detail page. -By adopting this strategy, Qdrant will index vectors for each user independently, significantly accelerating the process. +For dense vector models, you also have to ensure that the vector size configured in the collection matches the output size of the model. If the vector size does not match, the upsert will fail with an error. -To implement this approach, you should: +### Image Inference -1. Set `payload_m` in the HNSW configuration to a non-zero value, such as 16. -2. Set `m` in hnsw config to 0. This will disable building global index for the whole collection. +Here is another example of using Cloud Inference with an image model. This time, we will use the `CLIP` model to encode an image and then use a text query to search for it. -```python -from qdrant_client import QdrantClient, models +Since the `CLIP` model is multimodal, we can use both image and text inputs on the same vector field. 
-client = QdrantClient("localhost", port=6333) +```python +from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, Image, Document -client.create_collection( - collection_name="{tenant_data}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - hnsw_config=models.HnswConfigDiff( - payload_m=16, - m=0, - ), +client = QdrantClient( + url="https://xyz-example.qdrant.io:6333", + api_key="", + # IMPORTANT + # If not enabled, inference will be performed locally + cloud_inference=True, ) -``` +points = [ + PointStruct( + id=1, + vector=Image( + image="https://qdrant.tech/example.png", + model="qdrant/clip-vit-b-32-vision" + ), + payload={ + "title": "Example Image" + } + ) +] -3. Create keyword payload index for `group_id` field. +client.upsert(collection_name="", points=points) -```python -client.create_payload_index( - collection_name="{tenant_data}", - field_name="group_id", - field_schema=models.PayloadSchemaType.KEYWORD, +result = client.query_points( + collection_name="", + query=Document( + text="Mission to Mars", + model="qdrant/clip-vit-b-32-text" + ) ) +print(result) ``` -> Note: Keep in mind that global requests (without the `group_id` filter) will be slower since they will necessitate scanning all groups to identify the nearest neighbors. - -## [Anchor](https://qdrant.tech/articles/multitenancy/\#explore-multitenancy-and-custom-sharding-in-qdrant-for-scalable-solutions) Explore multitenancy and custom sharding in Qdrant for scalable solutions +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using Value = Qdrant.Client.Grpc.Value; -Qdrant is ready to support a massive-scale architecture for your machine learning project. If you want to see whether our [vector database](https://qdrant.tech/) is right for you, try the [quickstart tutorial](https://qdrant.tech/documentation/quick-start/) or read our [docs and tutorials](https://qdrant.tech/documentation/). +var client = new QdrantClient( + host: "xyz-example.qdrant.io", + port: 6334, + https: true, + apiKey: "" +); -To spin up a free instance of Qdrant, sign up for [Qdrant Cloud](https://qdrant.to/cloud) \- no strings attached. +await client.UpsertAsync( + collectionName: "", + points: new List { + new() { + Id = 1, + Vectors = new Image() { + Image = "https://qdrant.tech/example.png", + Model = "qdrant/clip-vit-b-32-vision", + }, + Payload = { + ["title"] = "Example Image" + }, + }, + } +); -Get support or share ideas in our [Discord](https://qdrant.to/discord) community. This is where we talk about vector search theory, publish examples and demos and discuss vector database setups. +var points = await client.QueryAsync( + collectionName: "", + query: new Document() { + Text = "Mission to Mars", + Model = "qdrant/clip-vit-b-32-text" + } +); -##### Was this page useful? +foreach(var point in points) { + Console.WriteLine(point); +} +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```bash +# Create a new vector +curl -X PUT "https://xyz-example.qdrant.io:6333/collections//points?wait=true" \ + -H "Content-Type: application/json" \ + -H "api-key: " \ + -d '{ + "points": [ + { + "id": 1, + "vector": { + "image": "https://qdrant.tech/example.png", + "model": "qdrant/clip-vit-b-32-vision" + }, + "payload": { + "title": "Example Image" + } + } + ] + }' -Thank you for your feedback! 
🙏 +# Perform a search query +curl -X POST "https://xyz-example.qdrant.io:6333/collections//points/query" \ + -H "Content-Type: application/json" \ + -H "api-key: " \ + -d '{ + "query": { + "text": "Mission to Mars", + "model": "qdrant/clip-vit-b-32-text" + } + }' +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/multitenancy.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```go +package main -On this page: +import ( + "context" + "log" + "time" -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/multitenancy.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + "github.com/qdrant/go-client/qdrant" +) -× +func main() { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() -[Powered by](https://qdrant.tech/) + client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, + }) + if err != nil { + log.Fatalf("did not connect: %v", err) + } + defer client.Close() -<|page-111-lllmstxt|> -## fastembed -- [Articles](https://qdrant.tech/articles/) -- FastEmbed: Qdrant's Efficient Python Library for Embedding Generation + _, err = client.GetPointsClient().Upsert(ctx, &qdrant.UpsertPoints{ + CollectionName: "", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(uint64(1)), + Vectors: qdrant.NewVectorsImage(&qdrant.Image{ + Image: "https://qdrant.tech/example.png", + Model: "qdrant/clip-vit-b-32-vision", + }), + Payload: qdrant.NewValueMap(map[string]any{ + "title": "Example image", + }), + }, + }, + }) + if err != nil { + log.Fatalf("error creating point: %v", err) + } -[Back to Ecosystem](https://qdrant.tech/articles/ecosystem/) + points, err := client.Query(ctx, &qdrant.QueryPoints{ + CollectionName: "", + Query: qdrant.NewQueryNearest( + qdrant.NewVectorInputDocument(&qdrant.Document{ + Text: "Mission to Mars", + Model: "qdrant/clip-vit-b-32-text", + }), + ), + }) + log.Printf("List of points: %s", points) +} +``` -# FastEmbed: Qdrant's Efficient Python Library for Embedding Generation +```http +# Insert new points with cloud-side inference +PUT /collections//points?wait=true +{ + "points": [ + { + "id": 1, + "vector": { + "image": "https://qdrant.tech/example.png", + "model": "qdrant/clip-vit-b-32-vision" + }, + "payload": { + "title": "Example Image" + } + } + ] +} -Nirant Kasliwal +# Search in the collection using cloud-side inference +POST /collections//points/query +{ + "query": { + "text": "Mission to Mars", + "model": "qdrant/clip-vit-b-32-text" + } +} +``` -· +```typescript +import {QdrantClient} from "@qdrant/js-client-rest"; -October 18, 2023 +const client = new QdrantClient({ + url: 'https://xyz-example.qdrant.io:6333', + apiKey: '', +}); -![FastEmbed: Qdrant's Efficient Python Library for Embedding Generation](https://qdrant.tech/articles_data/fastembed/preview/title.jpg) +const points = [ + { + id: 1, + vector: { + image: "https://qdrant.tech/example.png", + model: "qdrant/clip-vit-b-32-vision" + }, + payload: { + title: "Example Image" + } + } +]; -Data Science and Machine Learning practitioners often find themselves navigating through a labyrinth of models, libraries, and frameworks. Which model to choose, what embedding size, and how to approach tokenizing, are just some questions you are faced with when starting your work. 
We understood how many data scientists wanted an easier and more intuitive means to do their embedding work. This is why we built FastEmbed, a Python library engineered for speed, efficiency, and usability. We have created easy to use default workflows, handling the 80% use cases in NLP embedding. +await client.upsert("", { wait: true, points }); -## [Anchor](https://qdrant.tech/articles/fastembed/\#current-state-of-affairs-for-generating-embeddings) Current State of Affairs for Generating Embeddings +const result = await client.query( + "", + { + query: { + text: "Mission to Mars", + model: "qdrant/clip-vit-b-32-text" + }, + } +) -Usually you make embedding by utilizing PyTorch or TensorFlow models under the hood. However, using these libraries comes at a cost in terms of ease of use and computational speed. This is at least in part because these are built for both: model inference and improvement e.g. via fine-tuning. +console.log(result); +``` -To tackle these problems we built a small library focused on the task of quickly and efficiently creating text embeddings. We also decided to start with only a small sample of best in class transformer models. By keeping it small and focused on a particular use case, we could make our library focused without all the extraneous dependencies. We ship with limited models, quantize the model weights and seamlessly integrate them with the ONNX Runtime. FastEmbed strikes a balance between inference time, resource utilization and performance (recall/accuracy). +```rust +use qdrant_client::qdrant::Query; +use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::Payload; +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Document, Image}; +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -## [Anchor](https://qdrant.tech/articles/fastembed/\#quick-embedding-text-document-example) Quick Embedding Text Document Example +#[tokio::main] +async fn main() { + let client = Qdrant::from_url("https://xyz-example.qdrant.io:6334") + .api_key("") + .build() + .unwrap(); + + let points = vec![ + PointStruct::new( + 1, + Image::new_from_url( + "https://qdrant.tech/example.png", + "qdrant/clip-vit-b-32-vision" + ), + Payload::try_from(serde_json::json!({ + "title": "Example Image" + })).unwrap(), + ) + ]; -Here is an example of how simple we have made embedding text documents: + let upsert_request = UpsertPointsBuilder::new( + "", + points + ).wait(true); -```python -documents: List[str] = [\ - "Hello, World!",\ - "fastembed is supported by and maintained by Qdrant."\ -] -embedding_model = DefaultEmbedding() -embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) + let _ = client.upsert_points(upsert_request).await; -``` + let query_document = Document::new( + "Mission to Mars", + "qdrant/clip-vit-b-32-text" + ); -These 3 lines of code do a lot of heavy lifting for you: They download the quantized model, load it using ONNXRuntime, and then run a batched embedding creation of your documents. 
+ let query_request = QueryPointsBuilder::new("") + .query(Query::new_nearest(query_document)); -### [Anchor](https://qdrant.tech/articles/fastembed/\#code-walkthrough) Code Walkthrough + let result = client.query(query_request).await.unwrap(); + println!("Result: {:?}", result); +} +``` -Let’s delve into a more advanced example code snippet line-by-line: +```java +package org.example; -```python -from fastembed.embedding import DefaultEmbedding +import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; -``` +import io.qdrant.client.grpc.Points; +import io.qdrant.client.grpc.Points.Document; +import io.qdrant.client.grpc.Points.Image; +import io.qdrant.client.grpc.Points.PointStruct; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +public class Main { + public static void main(String[] args) + throws ExecutionException, InterruptedException { + QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder("xyz-example.qdrant.io", 6334, true) + .withApiKey("") + .build()); -Here, we import the FlagEmbedding class from FastEmbed and alias it as Embedding. This is the core class responsible for generating embeddings based on your chosen text model. This is also the class which you can import directly as DefaultEmbedding which is [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5) + client + .upsertAsync( + "", + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors( + vectors( + Image.newBuilder() + .setImage("https://qdrant.tech/example.png") + .setModel("qdrant/clip-vit-b-32-vision") + .build())) + .putAllPayload(Map.of("title", value("Example Image"))) + .build())) + .get(); -```python -documents: List[str] = [\ - "passage: Hello, World!",\ - "query: How is the World?",\ - "passage: This is an example passage.",\ - "fastembed is supported by and maintained by Qdrant."\ -] + List points = + client + .queryAsync( + Points.QueryPoints.newBuilder() + .setCollectionName("") + .setQuery( + nearest( + Document.newBuilder() + .setText("Mission to Mars") + .setModel("qdrant/clip-vit-b-32-text") + .build())) + .build()) + .get(); + System.out.printf(points.toString()); + } +} ``` -In this list called documents, we define four text strings that we want to convert into embeddings. - -Note the use of prefixes “passage” and “query” to differentiate the types of embeddings to be generated. This is inherited from the cross-encoder implementation of the BAAI/bge series of models themselves. This is particularly useful for retrieval and we strongly recommend using this as well. - -The use of text prefixes like “query” and “passage” isn’t merely syntactic sugar; it informs the algorithm on how to treat the text for embedding generation. A “query” prefix often triggers the model to generate embeddings that are optimized for similarity comparisons, while “passage” embeddings are fine-tuned for contextual understanding. If you omit the prefix, the default behavior is applied, although specifying it is recommended for more nuanced results. +Qdrant Cloud Inference server will download the images using the provided link. Alternatively, you can upload the image as a base64 encoded string. -Next, we initialize the Embedding model with the default model: [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5). 
+Note that each model has limitations on the file size and extensions it can work with. -```python -embedding_model = DefaultEmbedding() +Please refer to the model card for details. -``` +### Local Inference Compatibility -The default model and several other models have a context window of a maximum of 512 tokens. This maximum limit comes from the embedding model training and design itself. If you’d like to embed sequences larger than that, we’d recommend using some pooling strategy to get a single vector out of the sequence. For example, you can use the mean of the embeddings of different chunks of a document. This is also what the [SBERT Paper recommends](https://lilianweng.github.io/posts/2021-05-31-contrastive/#sentence-bert) +The Python SDK offers a unique capability: it supports both [local](/documentation/fastembed/fastembed-semantic-search/) and cloud inference through an identical interface. -This model strikes a balance between speed and accuracy, ideal for real-world applications. +You can easily switch between local and cloud inference by setting the cloud_inference flag when initializing the QdrantClient. For example: ```python -embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) - +client = QdrantClient( + url="https://your-cluster.qdrant.io", + api_key="", + cloud_inference=True, # Set to False to use local inference +) ``` -Finally, we call the `embed()` method on our embedding\_model object, passing in the documents list. The method returns a Python generator, so we convert it to a list to get all the embeddings. These embeddings are NumPy arrays, optimized for fast mathematical operations. +This flexibility allows you to develop and test your applications locally or in continuous integration (CI) environments without requiring access to cloud inference resources. -The `embed()` method returns a list of NumPy arrays, each corresponding to the embedding of a document in your original documents list. The dimensions of these arrays are determined by the model you chose e.g. for “BAAI/bge-small-en-v1.5” it’s a 384-dimensional vector. +* When `cloud_inference` is set to `False`, inference is performed locally usign `fastembed`. +* When set to `True`, inference requests are handled by Qdrant Cloud. -You can easily parse these NumPy arrays for any downstream application—be it clustering, similarity comparison, or feeding them into a machine learning model for further analysis. +<|page-191-lllmstxt|> +# Distributed deployment -## [Anchor](https://qdrant.tech/articles/fastembed/\#3-key-features-of-fastembed) 3 Key Features of FastEmbed +Since version v0.8.0 Qdrant supports a distributed deployment mode. +In this mode, multiple Qdrant services communicate with each other to distribute the data across the peers to extend the storage capabilities and increase stability. -FastEmbed is built for inference speed, without sacrificing (too much) performance: +## How many Qdrant nodes should I run? -1. 50% faster than PyTorch Transformers -2. Better performance than Sentence Transformers and OpenAI Ada-002 -3. Cosine similarity of quantized and original model vectors is 0.92 +The ideal number of Qdrant nodes depends on how much you value cost-saving, resilience, and performance/scalability in relation to each other. -We use `BAAI/bge-small-en-v1.5` as our DefaultEmbedding, hence we’ve chosen that for comparison: +- **Prioritizing cost-saving**: If cost is most important to you, run a single Qdrant node. This is not recommended for production environments. 
Drawbacks: + - Resilience: Users will experience downtime during node restarts, and recovery is not possible unless you have backups or snapshots. + - Performance: Limited to the resources of a single server. -![](https://qdrant.tech/articles_data/fastembed/throughput.png) +- **Prioritizing resilience**: If resilience is most important to you, run a Qdrant cluster with three or more nodes and two or more shard replicas. Clusters with three or more nodes and replication can perform all operations even while one node is down. Additionally, they gain performance benefits from load-balancing and they can recover from the permanent loss of one node without the need for backups or snapshots (but backups are still strongly recommended). This is most recommended for production environments. Drawbacks: + - Cost: Larger clusters are more costly than smaller clusters, which is the only drawback of this configuration. -## [Anchor](https://qdrant.tech/articles/fastembed/\#under-the-hood-of-fastembed) Under the Hood of FastEmbed +- **Balancing cost, resilience, and performance**: Running a two-node Qdrant cluster with replicated shards allows the cluster to respond to most read/write requests even when one node is down, such as during maintenance events. Having two nodes also means greater performance than a single-node cluster while still being cheaper than a three-node cluster. Drawbacks: + - Resilience (uptime): The cluster cannot perform operations on collections when one node is down. Those operations require >50% of nodes to be running, so this is only possible in a 3+ node cluster. Since creating, editing, and deleting collections are usually rare operations, many users find this drawback to be negligible. + - Resilience (data integrity): If the data on one of the two nodes is permanently lost or corrupted, it cannot be recovered aside from snapshots or backups. Only 3+ node clusters can recover from the permanent loss of a single node since recovery operations require >50% of the cluster to be healthy. + - Cost: Replicating your shards requires storing two copies of your data. + - Performance: The maximum performance of a Qdrant cluster increases as you add more nodes. -**Quantized Models**: We quantize the models for CPU (and Mac Metal) – giving you the best buck for your compute model. Our default model is so small, you can run this in AWS Lambda if you’d like! +In summary, single-node clusters are best for non-production workloads, replicated 3+ node clusters are the gold standard, and replicated 2-node clusters strike a good balance. -Shout out to Huggingface’s [Optimum](https://github.com/huggingface/optimum) – which made it easier to quantize models. +## Enabling distributed mode in self-hosted Qdrant -**Reduced Installation Time**: +To enable distributed deployment - enable the cluster mode in the [configuration](/documentation/guides/configuration/) or using the ENV variable: `QDRANT__CLUSTER__ENABLED=true`. -FastEmbed sets itself apart by maintaining a low minimum RAM/Disk usage. +```yaml +cluster: + # Use `enabled: true` to run Qdrant in distributed deployment mode + enabled: true + # Configuration of the inter-cluster communication + p2p: + # Port for internal communication between peers + port: 6335 -It’s designed to be agile and fast, useful for businesses looking to integrate text embedding for production usage. 
For FastEmbed, the list of dependencies is refreshingly brief: + # Configuration related to distributed consensus algorithm + consensus: + # How frequently peers should ping each other. + # Setting this parameter to lower value will allow consensus + # to detect disconnected node earlier, but too frequent + # tick period may create significant network and CPU overhead. + # We encourage you NOT to change this parameter unless you know what you are doing. + tick_period_ms: 100 +``` -> - onnx: Version ^1.11 – We’ll try to drop this also in the future if we can! -> - onnxruntime: Version ^1.15 -> - tqdm: Version ^4.65 – used only at Download -> - requests: Version ^2.31 – used only at Download -> - tokenizers: Version ^0.13 +By default, Qdrant will use port `6335` for its internal communication. +All peers should be accessible on this port from within the cluster, but make sure to isolate this port from outside access, as it might be used to perform write operations. -This minimized list serves two purposes. First, it significantly reduces the installation time, allowing for quicker deployments. Second, it limits the amount of disk space required, making it a viable option even for environments with storage limitations. +Additionally, you must provide the `--uri` flag to the first peer so it can tell other nodes how it should be reached: -Notably absent from the dependency list are bulky libraries like PyTorch, and there’s no requirement for CUDA drivers. This is intentional. FastEmbed is engineered to deliver optimal performance right on your CPU, eliminating the need for specialized hardware or complex setups. +```bash +./qdrant --uri 'http://qdrant_node_1:6335' +``` -**ONNXRuntime**: The ONNXRuntime gives us the ability to support multiple providers. The quantization we do is limited for CPU (Intel), but we intend to support GPU versions of the same in the future as well.  This allows for greater customization and optimization, further aligning with your specific performance and computational requirements. +Subsequent peers in a cluster must know at least one node of the existing cluster to synchronize through it with the rest of the cluster. -## [Anchor](https://qdrant.tech/articles/fastembed/\#current-models) Current Models +To do this, they need to be provided with a bootstrap URL: -We’ve started with a small set of supported models: +```bash +./qdrant --bootstrap 'http://qdrant_node_1:6335' +``` -All the models we support are [quantized](https://pytorch.org/docs/stable/quantization.html) to enable even faster computation! +The URL of the new peers themselves will be calculated automatically from the IP address of their request. +But it is also possible to provide them individually using the `--uri` argument. -If you’re using FastEmbed and you’ve got ideas or need certain features, feel free to let us know. Just drop an issue on our GitHub page. That’s where we look first when we’re deciding what to work on next. Here’s where you can do it: [FastEmbed GitHub Issues](https://github.com/qdrant/fastembed/issues). +```text +USAGE: + qdrant [OPTIONS] -When it comes to FastEmbed’s DefaultEmbedding model, we’re committed to supporting the best Open Source models. +OPTIONS: + --bootstrap + Uri of the peer to bootstrap from in case of multi-peer deployment. If not specified - + this peer will be considered as a first in a new deployment -If anything changes, you’ll see a new version number pop up, like going from 0.0.6 to 0.1. 
So, it’s a good idea to lock in the FastEmbed version you’re using to avoid surprises. + --uri + Uri of this peer. Other peers should be able to reach it by this uri. -## [Anchor](https://qdrant.tech/articles/fastembed/\#using-fastembed-with-qdrant) Using FastEmbed with Qdrant + This value has to be supplied if this is the first peer in a new deployment. -Qdrant is a Vector Store, offering comprehensive, efficient, and scalable [enterprise solutions](https://qdrant.tech/enterprise-solutions/) for modern machine learning and AI applications. Whether you are dealing with billions of data points, require a low latency performant [vector database solution](https://qdrant.tech/qdrant-vector-database/), or specialized quantization methods – [Qdrant is engineered](https://qdrant.tech/documentation/overview/) to meet those demands head-on. + In case this is not the first peer and it bootstraps the value is optional. If not + supplied then qdrant will take internal grpc port from config and derive the IP address + of this peer on bootstrap peer (receiving side) -The fusion of FastEmbed with Qdrant’s vector store capabilities enables a transparent workflow for seamless embedding generation, storage, and retrieval. This simplifies the API design — while still giving you the flexibility to make significant changes e.g. you can use FastEmbed to make your own embedding other than the DefaultEmbedding and use that with Qdrant. +``` -Below is a detailed guide on how to get started with FastEmbed in conjunction with Qdrant. +After a successful synchronization you can observe the state of the cluster through the [REST API](https://api.qdrant.tech/master/api-reference/distributed/cluster-status): -### [Anchor](https://qdrant.tech/articles/fastembed/\#step-1-installation) Step 1: Installation +```http +GET /cluster +``` -Before diving into the code, the initial step involves installing the Qdrant Client along with the FastEmbed library. This can be done using pip: +Example result: +```json +{ + "result": { + "status": "enabled", + "peer_id": 11532566549086892000, + "peers": { + "9834046559507417430": { + "uri": "http://172.18.0.3:6335/" + }, + "11532566549086892528": { + "uri": "http://qdrant_node_1:6335/" + } + }, + "raft_info": { + "term": 1, + "commit": 4, + "pending_operations": 1, + "leader": 11532566549086892000, + "role": "Leader" + } + }, + "status": "ok", + "time": 5.731e-06 +} ``` -pip install qdrant-client[fastembed] -``` +Note that enabling distributed mode does not automatically replicate your data. See the section on [making use of a new distributed Qdrant cluster](#making-use-of-a-new-distributed-qdrant-cluster) for the next steps. -For those using zsh as their shell, you might encounter syntax issues. In such cases, wrap the package name in quotes: +## Enabling distributed mode in Qdrant Cloud -``` -pip install 'qdrant-client[fastembed]' +For best results, first ensure your cluster is running Qdrant v1.7.4 or higher. Older versions of Qdrant do support distributed mode, but improvements in v1.7.4 make distributed clusters more resilient during outages. -``` +In the [Qdrant Cloud console](https://cloud.qdrant.io/), click "Scale Up" to increase your cluster size to >1. Qdrant Cloud configures the distributed mode settings automatically. 
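Whichever way you enable distributed mode, the `GET /cluster` endpoint shown above is a quick way to confirm that every peer has actually joined. Below is a minimal sketch in Python using `requests`; the URL and the `api-key` header value are placeholders for your own deployment:

```python
import requests

QDRANT_URL = "http://localhost:6333"  # placeholder: your node or cluster URL
API_KEY = "your-api-key"              # placeholder: only needed if API-key auth is enabled

response = requests.get(f"{QDRANT_URL}/cluster", headers={"api-key": API_KEY})
response.raise_for_status()

cluster = response.json()["result"]
print("status:", cluster["status"])                # expected: "enabled"
print("peers:", len(cluster["peers"]))             # should match your node count
print("raft role:", cluster["raft_info"]["role"])  # e.g. "Leader" on the leader node
```

If a newly added node does not show up under `peers`, verify that port `6335` is reachable between the nodes.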
-### [Anchor](https://qdrant.tech/articles/fastembed/\#step-2-initializing-the-qdrant-client) Step 2: Initializing the Qdrant Client +Additionally, Qdrant Cloud also offers the ability to automatically rebalance and to reshard your collections, which is not available in self-hosted Qdrant. See the [Resharding](/documentation/cloud/cluster-scaling/#resharding) and [Shard Rebalancing](/documentation/cloud/configure-cluster/#shard-rebalancing) sections in for more details. -After successful installation, the next step involves initializing the Qdrant Client. This can be done either in-memory or by specifying a database path: +After the scale-up process completes, you will have a new empty node running alongside your existing node(s). To replicate data into this new empty node, see the next section. -```python -from qdrant_client import QdrantClient -# Initialize the client -client = QdrantClient(":memory:")  # or QdrantClient(path="path/to/db") +## Making use of a new distributed Qdrant cluster -``` +When you enable distributed mode and scale up to two or more nodes, your data does not move to the new node automatically; it starts out empty. To make use of your new empty node, do one of the following: -### [Anchor](https://qdrant.tech/articles/fastembed/\#step-3-preparing-documents-metadata-and-ids) Step 3: Preparing Documents, Metadata, and IDs +* Create a new replicated collection by setting the [replication_factor](#replication-factor) to 2 or more and setting the [number of shards](#choosing-the-right-number-of-shards) to a multiple of your number of nodes. +* If you have an existing collection which does not contain enough shards for each node, you must create a new collection as described in the previous bullet point. +* If you already have enough shards for each node, and you merely need to replicate your data, follow the directions for [creating new shard replicas](#creating-new-shard-replicas). +* If you already have enough shards for each node, and your data is already replicated, you can move data (without replicating it) onto the new node(s) by [moving shards](#moving-shards). -Once the client is initialized, prepare the text documents you wish to embed, along with any associated metadata and unique IDs: +## Raft -```python -docs = [\ - "Qdrant has Langchain integrations",\ - "Qdrant also has Llama Index integrations"\ -] -metadata = [\ - {"source": "Langchain-docs"},\ - {"source": "LlamaIndex-docs"},\ -] -ids = [42, 2] +Qdrant uses the [Raft](https://raft.github.io/) consensus protocol to maintain consistency regarding the cluster topology and the collections structure. -``` +Operations on points, on the other hand, do not go through the consensus infrastructure. +Qdrant is not intended to have strong transaction guarantees, which allows it to perform point operations with low overhead. +In practice, it means that Qdrant does not guarantee atomic distributed updates but allows you to wait until the [operation is complete](/documentation/concepts/points/#awaiting-result) to see the results of your writes. -Note that the add method we’ll use is overloaded: If you skip the ids, we’ll generate those for you. metadata is obviously optional. So, you can simply use this too: +Operations on collections, on the contrary, are part of the consensus which guarantees that all operations are durable and eventually executed by all nodes. +In practice it means that a majority of nodes agree on what operations should be applied before the service will perform them. 
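As a sketch of what the point-level guarantees above mean in practice: if you need to read your own writes immediately, pass `wait=True` so the call returns only after the update has been applied. A minimal example with the Python client; the collection name and vector are placeholders:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# wait=True blocks until the write is fully applied,
# so a search issued right afterwards will see this point.
client.upsert(
    collection_name="{collection_name}",
    points=[models.PointStruct(id=1, vector=[0.1, 0.2, 0.3])],
    wait=True,
)
```

Collection-level operations need no such flag, since they always go through consensus, as described next.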
-```python -docs = [\ - "Qdrant has Langchain integrations",\ - "Qdrant also has Llama Index integrations"\ -] +Practically, it means that if the cluster is in a transition state - either electing a new leader after a failure or starting up, the collection update operations will be denied. -``` +You may use the cluster [REST API](https://api.qdrant.tech/master/api-reference/distributed/cluster-status) to check the state of the consensus. -### [Anchor](https://qdrant.tech/articles/fastembed/\#step-4-adding-documents-to-a-collection) Step 4: Adding Documents to a Collection +## Sharding -With your documents, metadata, and IDs ready, you can proceed to add these to a specified collection within Qdrant using the add method: +A Collection in Qdrant is made of one or more shards. +A shard is an independent store of points which is able to perform all operations provided by collections. +There are two methods of distributing points across shards: -```python -client.add( - collection_name="demo_collection", - documents=docs, - metadata=metadata, - ids=ids -) +- **Automatic sharding**: Points are distributed among shards by using a [consistent hashing](https://en.wikipedia.org/wiki/Consistent_hashing) algorithm, so that shards are managing non-intersecting subsets of points. This is the default behavior. -``` +- **User-defined sharding**: _Available as of v1.7.0_ - Each point is uploaded to a specific shard, so that operations can hit only the shard or shards they need. Even with this distribution, shards still ensure having non-intersecting subsets of points. [See more...](#user-defined-sharding) -Inside this function, Qdrant Client uses FastEmbed to make the text embedding, generate ids if they’re missing, and then add them to the index with metadata. This uses the DefaultEmbedding model: [BAAI/bge-small-en-v1.5](https://huggingface.co/baai/bge-small-en-v1.5) +Each node knows where all parts of the collection are stored through the [consensus protocol](#raft), so when you send a search request to one Qdrant node, it automatically queries all other nodes to obtain the full search result. -![INDEX TIME: Sequence Diagram for Qdrant and FastEmbed](https://qdrant.tech/articles_data/fastembed/generate-embeddings-from-docs.png) +### Choosing the right number of shards -### [Anchor](https://qdrant.tech/articles/fastembed/\#step-5-performing-queries) Step 5: Performing Queries +When you create a collection, Qdrant splits the collection into `shard_number` shards. If left unset, `shard_number` is set to the number of nodes in your cluster when the collection was created. The `shard_number` cannot be changed without recreating the collection. -Finally, you can perform queries on your stored documents. Qdrant offers a robust querying capability, and the query results can be easily retrieved as follows: +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 300, + "distance": "Cosine" + }, + "shard_number": 6 +} +``` ```python -search_result = client.query( - collection_name="demo_collection", - query_text="This is a query document" -) -print(search_result) - -``` +from qdrant_client import QdrantClient, models -Behind the scenes, we first convert the query\_text to the embedding and use that to query the vector index. 
+client = QdrantClient(url="http://localhost:6333") -![QUERY TIME: Sequence Diagram for Qdrant and FastEmbed integration](https://qdrant.tech/articles_data/fastembed/generate-embeddings-query.png) +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), + shard_number=6, +) +``` -By following these steps, you effectively utilize the combined capabilities of FastEmbed and Qdrant, thereby streamlining your embedding generation and retrieval tasks. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Qdrant is designed to handle large-scale datasets with billions of data points. Its architecture employs techniques like [binary quantization](https://qdrant.tech/articles/binary-quantization/) and [scalar quantization](https://qdrant.tech/articles/scalar-quantization/) for efficient storage and retrieval. When you inject FastEmbed’s CPU-first design and lightweight nature into this equation, you end up with a system that can scale seamlessly while maintaining low latency. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -## [Anchor](https://qdrant.tech/articles/fastembed/\#summary) Summary +client.createCollection("{collection_name}", { + vectors: { + size: 300, + distance: "Cosine", + }, + shard_number: 6, +}); +``` -If you’re curious about how FastEmbed and Qdrant can make your search tasks a breeze, why not take it for a spin? You get a real feel for what it can do. Here are two easy ways to get started: +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::Qdrant; -1. **Cloud**: Get started with a free plan on the [Qdrant Cloud](https://qdrant.to/cloud?utm_source=qdrant&utm_medium=website&utm_campaign=fastembed&utm_content=article). +let client = Qdrant::from_url("http://localhost:6334").build()?; -2. **Docker Container**: If you’re the DIY type, you can set everything up on your own machine. Here’s a quick guide to help you out: [Quick Start with Docker](https://qdrant.tech/documentation/quick-start/?utm_source=qdrant&utm_medium=website&utm_campaign=fastembed&utm_content=article). +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) + .shard_number(6), + ) + .await?; +``` +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -So, go ahead, take it for a test drive. We’re excited to hear what you think! +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -Lastly, If you find FastEmbed useful and want to keep up with what we’re doing, giving our GitHub repo a star would mean a lot to us. Here’s the link to [star the repository](https://github.com/qdrant/fastembed). 
+client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(300) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setShardNumber(6) + .build()) + .get(); +``` -If you ever have questions about FastEmbed, please ask them on the Qdrant Discord: [https://discord.gg/Qy6HCJK9Dc](https://discord.gg/Qy6HCJK9Dc) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -##### Was this page useful? +var client = new QdrantClient("localhost", 6334); -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, + shardNumber: 6 +); +``` -Thank you for your feedback! 🙏 +```go +import ( + "context" -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/fastembed.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + "github.com/qdrant/go-client/qdrant" +) -On this page: +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/fastembed.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 300, + Distance: qdrant.Distance_Cosine, + }), + ShardNumber: qdrant.PtrOf(uint32(6)), +}) +``` -× +To ensure all nodes in your cluster are evenly utilized, the number of shards must be a multiple of the number of nodes you are currently running in your cluster. -[Powered by](https://qdrant.tech/) +> Aside: Advanced use cases such as multitenancy may require an uneven distribution of shards. See [Multitenancy](/articles/multitenancy/). -<|page-112-lllmstxt|> -## usage-statistics -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Usage Statistics +We recommend creating at least 2 shards per node to allow future expansion without having to re-shard. [Resharding](#resharding) is possible when using our cloud offering, but should be avoided if hosting elsewhere as it would require creating a new collection. -# [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#usage-statistics) Usage statistics +If you anticipate a lot of growth, we recommend 12 shards since you can expand from 1 node up to 2, 3, 6, and 12 nodes without having to re-shard. Having more than 12 shards in a small cluster may not be worth the performance overhead. -The Qdrant open-source container image collects anonymized usage statistics from users in order to improve the engine by default. You can [deactivate](https://qdrant.tech/documentation/guides/usage-statistics/#deactivate-telemetry) at any time, and any data that has already been collected can be [deleted on request](https://qdrant.tech/documentation/guides/usage-statistics/#request-information-deletion). 
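One way to follow these guidelines is to derive `shard_number` from the current cluster size. The sketch below (Python, assuming a local node and a placeholder collection name) reads the peer list from `GET /cluster` and creates a collection with two shards per node:

```python
import requests
from qdrant_client import QdrantClient, models

QDRANT_URL = "http://localhost:6333"  # placeholder

# Count the peers currently in the cluster (fall back to 1 for a single node).
peers = requests.get(f"{QDRANT_URL}/cluster").json()["result"].get("peers", {})
node_count = len(peers) or 1

client = QdrantClient(url=QDRANT_URL)
client.create_collection(
    collection_name="{collection_name}",
    vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE),
    # Two shards per node keeps the shard count a multiple of the node count
    # and leaves headroom to grow before resharding becomes necessary.
    shard_number=2 * node_count,
)
```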
+Shards are evenly distributed across all existing nodes when a collection is first created, but Qdrant does not automatically rebalance shards if your cluster size or replication factor changes (since this is an expensive operation on large clusters). See the next section for how to move shards after scaling operations. -Deactivating this will not affect your ability to monitor the Qdrant database yourself by accessing the `/metrics` or `/telemetry` endpoints of your database. It will just stop sending independend, anonymized usage statistics to the Qdrant team. +### Resharding -## [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#why-do-we-collect-usage-statistics) Why do we collect usage statistics? +*Available as of v1.13.0 in Cloud* -We want to make Qdrant fast and reliable. To do this, we need to understand how it performs in real-world scenarios. -We do a lot of benchmarking internally, but it is impossible to cover all possible use cases, hardware, and configurations. +Resharding allows you to change the number of shards in your existing collections if you're hosting with our [Cloud](/documentation/cloud-intro/) offering. -In order to identify bottlenecks and improve Qdrant, we need to collect information about how it is used. +Resharding can change the number of shards both up and down, without having to recreate the collection from scratch. -Additionally, Qdrant uses a bunch of internal heuristics to optimize the performance. -To better set up parameters for these heuristics, we need to collect timings and counters of various pieces of code. -With this information, we can make Qdrant faster for everyone. +Please refer to the [Resharding](/documentation/cloud/cluster-scaling/#resharding) section in our cloud documentation for more details. -## [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#what-information-is-collected) What information is collected? +### Moving shards -There are 3 types of information that we collect: +*Available as of v0.9.0* -- System information - general information about the system, such as CPU, RAM, and disk type. As well as the configuration of the Qdrant instance. -- Performance - information about timings and counters of various pieces of code. -- Critical error reports - information about critical errors, such as backtraces, that occurred in Qdrant. This information would allow to identify problems nobody yet reported to us. +Qdrant allows moving shards between nodes in the cluster and removing nodes from the cluster. This functionality unlocks the ability to dynamically scale the cluster size without downtime. It also allows you to upgrade or migrate nodes without downtime. -### [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#we-never-collect-the-following-information) We **never** collect the following information: +If your cluster is running in Qdrant Cloud, shards are balanced across the cluster nodes automatically. For more information see the [Configuring Cloud Clusters](/documentation/cloud/configure-cluster/#shard-rebalancing) and [Cloud Cluster Scaling](/documentation/cloud/cluster-scaling/) documentation. 
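For self-hosted clusters, the two REST endpoints introduced below (Collection Cluster info and Update collection cluster setup) can also be scripted. A hedged sketch in Python; the collection name and peer IDs are placeholders that you would take from the cluster info response:

```python
import requests

QDRANT_URL = "http://localhost:6333"   # placeholder
COLLECTION = "{collection_name}"       # placeholder

# Inspect which shards currently live on which peer.
info = requests.get(f"{QDRANT_URL}/collections/{COLLECTION}/cluster")
info.raise_for_status()
print(info.json()["result"])

# Ask Qdrant to move shard 0 from one peer to another (placeholder peer IDs).
move = {
    "move_shard": {
        "shard_id": 0,
        "from_peer_id": 381894127,
        "to_peer_id": 467122995,
    }
}
requests.post(f"{QDRANT_URL}/collections/{COLLECTION}/cluster", json=move).raise_for_status()
```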
-- User’s IP address -- Any data that can be used to identify the user or the user’s organization -- Any data, stored in the collections -- Any names of the collections -- Any URLs +Qdrant provides the information regarding the current shard distribution in the cluster with the [Collection Cluster info API](https://api.qdrant.tech/master/api-reference/distributed/collection-cluster-info). -## [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#how-do-we-anonymize-data) How do we anonymize data? +Use the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to initiate the shard transfer: -We understand that some users may be concerned about the privacy of their data. -That is why we make an extra effort to ensure your privacy. +```http +POST /collections/{collection_name}/cluster +{ + "move_shard": { + "shard_id": 0, + "from_peer_id": 381894127, + "to_peer_id": 467122995 + } +} +``` -There are several different techniques that we use to anonymize the data: + -- We use a random UUID to identify instances. This UUID is generated on each startup and is not stored anywhere. There are no other ways to distinguish between different instances. -- We round all big numbers, so that the last digits are always 0. For example, if the number is 123456789, we will store 123456000. -- We replace all names with irreversibly hashed values. So no collection or field names will leak into the telemetry. -- All urls are hashed as well. +After the transfer is initiated, the service will process it based on the used +[transfer method](#shard-transfer-method) keeping both shards in sync. Once the +transfer is completed, the old shard is deleted from the source node. -You can see exact version of anomymized collected data by accessing the [telemetry API](https://api.qdrant.tech/master/api-reference/service/telemetry) with `anonymize=true` parameter. +In case you want to downscale the cluster, you can move all shards away from a peer and then remove the peer using the [remove peer API](https://api.qdrant.tech/master/api-reference/distributed/remove-peer). -For example, [http://localhost:6333/telemetry?details\_level=6&anonymize=true](http://localhost:6333/telemetry?details_level=6&anonymize=true) +```http +DELETE /cluster/peer/{peer_id} +``` -## [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#deactivate-usage-statistics) Deactivate usage statistics +After that, Qdrant will exclude the node from the consensus, and the instance will be ready for shutdown. -You can deactivate usage statistics by: +### User-defined sharding -- setting the `QDRANT__TELEMETRY_DISABLED` environment variable to `true` -- setting the config option `telemetry_disabled` to `true` in the `config/production.yaml` or `config/config.yaml` files -- using cli option `--disable-telemetry` +*Available as of v1.7.0* -Any of these options will prevent Qdrant from sending any usage statistics data. +Qdrant allows you to specify the shard for each point individually. This feature is useful if you want to control the shard placement of your data, so that operations can hit only the subset of shards they actually need. In big clusters, this can significantly improve the performance of operations that do not require the whole collection to be scanned. 
-If you decide to deactivate usage statistics, we kindly ask you to share your feedback with us in the [Discord community](https://qdrant.to/discord) or GitHub [discussions](https://github.com/qdrant/qdrant/discussions) +A clear use-case for this feature is managing a multi-tenant collection, where each tenant (let it be a user or organization) is assumed to be segregated, so they can have their data stored in separate shards. -## [Anchor](https://qdrant.tech/documentation/guides/usage-statistics/\#request-information-deletion) Request information deletion +To enable user-defined sharding, set `sharding_method` to `custom` during collection creation: -We provide an email address so that users can request the complete removal of their data from all of our tools. +```http +PUT /collections/{collection_name} +{ + "shard_number": 1, + "sharding_method": "custom" + // ... other collection parameters +} +``` -To do so, send an email to [privacy@qdrant.com](mailto:privacy@qdrant.com) containing the unique identifier generated for your Qdrant installation. -You can find this identifier in the telemetry API response ( `"id"` field), or in the logs of your Qdrant instance. +```python +from qdrant_client import QdrantClient, models -Any questions regarding the management of the data we collect can also be sent to this email address. +client = QdrantClient(url="http://localhost:6333") -##### Was this page useful? +client.create_collection( + collection_name="{collection_name}", + shard_number=1, + sharding_method=models.ShardingMethod.CUSTOM, + # ... other collection parameters +) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Thank you for your feedback! 🙏 +const client = new QdrantClient({ host: "localhost", port: 6333 }); -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/usage-statistics.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +client.createCollection("{collection_name}", { + shard_number: 1, + sharding_method: "custom", + // ... 
other collection parameters +}); +``` -On this page: +```rust +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, ShardingMethod, VectorParamsBuilder, +}; +use qdrant_client::Qdrant; -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/usage-statistics.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +let client = Qdrant::from_url("http://localhost:6334").build()?; -× +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) + .shard_number(1) + .sharding_method(ShardingMethod::Custom.into()), + ) + .await?; +``` -[Powered by](https://qdrant.tech/) +```java +import static io.qdrant.client.ShardKeyFactory.shardKey; -<|page-113-lllmstxt|> -## data-privacy -- [Articles](https://qdrant.tech/articles/) -- Data Privacy with Qdrant: Implementing Role-Based Access Control (RBAC) +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.ShardingMethod; -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -# Data Privacy with Qdrant: Implementing Role-Based Access Control (RBAC) +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + // ... other collection parameters + .setShardNumber(1) + .setShardingMethod(ShardingMethod.Custom) + .build()) + .get(); +``` -Qdrant Team +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -· +var client = new QdrantClient("localhost", 6334); -June 18, 2024 +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + // ... other collection parameters + shardNumber: 1, + shardingMethod: ShardingMethod.Custom +); +``` -![ Data Privacy with Qdrant: Implementing Role-Based Access Control (RBAC)](https://qdrant.tech/articles_data/data-privacy/preview/title.jpg) +```go +import ( + "context" -Data stored in vector databases is often proprietary to the enterprise and may include sensitive information like customer records, legal contracts, electronic health records (EHR), financial data, and intellectual property. Moreover, strong security measures become critical to safeguarding this data. If the data stored in a vector database is not secured, it may open a vulnerability known as “ [embedding inversion attack](https://arxiv.org/abs/2004.00053),” where malicious actors could potentially [reconstruct the original data from the embeddings](https://arxiv.org/pdf/2305.03010) themselves. + "github.com/qdrant/go-client/qdrant" +) -Strict compliance regulations govern data stored in vector databases across various industries. For instance, healthcare must comply with HIPAA, which dictates how protected health information (PHI) is stored, transmitted, and secured. Similarly, the financial services industry follows PCI DSS to safeguard sensitive financial data. These regulations require developers to ensure data storage and transmission comply with industry-specific legal frameworks across different regions. 
**As a result, features that enable data privacy, security and sovereignty are deciding factors when choosing the right vector database.** +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -This article explores various strategies to ensure the security of your critical data while leveraging the benefits of vector search. Implementing some of these security approaches can help you build privacy-enhanced similarity search algorithms and integrate them into your AI applications. -Additionally, you will learn how to build a fully data-sovereign architecture, allowing you to retain control over your data and comply with relevant data laws and regulations. +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + // ... other collection parameters + ShardNumber: qdrant.PtrOf(uint32(1)), + ShardingMethod: qdrant.ShardingMethod_Custom.Enum(), +}) +``` -> To skip right to the code implementation, [click here](https://qdrant.tech/articles/data-privacy/#jwt-on-qdrant). +In this mode, the `shard_number` means the number of shards per shard key, where points will be distributed evenly. For example, if you have 10 shard keys and a collection config with these settings: -## [Anchor](https://qdrant.tech/articles/data-privacy/\#vector-database-security-an-overview) Vector Database Security: An Overview +```json +{ + "shard_number": 1, + "sharding_method": "custom", + "replication_factor": 2 +} +``` -Vector databases are often unsecured by default to facilitate rapid prototyping and experimentation. This approach allows developers to quickly ingest data, build vector representations, and test similarity search algorithms without initial security concerns. However, in production environments, unsecured databases pose significant data breach risks. +Then you will have `1 * 10 * 2 = 20` total physical shards in the collection. -For production use, robust security systems are essential. Authentication, particularly using static API keys, is a common approach to control access and prevent unauthorized modifications. Yet, simple API authentication is insufficient for enterprise data, which requires granular control. +Physical shards require a large amount of resources, so make sure your custom sharding key has a low cardinality. -The primary challenge with static API keys is their all-or-nothing access, inadequate for role-based data segregation in enterprise applications. Additionally, a compromised key could grant attackers full access to manipulate or steal data. To strengthen the security of the vector database, developers typically need the following: +For large cardinality keys, it is recommended to use [partition by payload](/documentation/guides/multiple-partitions/#partition-by-payload) instead. -1. **Encryption**: This ensures that sensitive data is scrambled as it travels between the application and the vector database. This safeguards against Man-in-the-Middle ( [MitM](https://en.wikipedia.org/wiki/Man-in-the-middle_attack)) attacks, where malicious actors can attempt to intercept and steal data during transmission. -2. **Role-Based Access Control**: As mentioned before, traditional static API keys grant all-or-nothing access, which is a significant security risk in enterprise environments. RBAC offers a more granular approach by defining user roles and assigning specific data access permissions based on those roles. 
For example, an analyst might have read-only access to specific datasets, while an administrator might have full CRUD (Create, Read, Update, Delete) permissions across the database. -3. **Deployment Flexibility**: Data residency regulations like GDPR (General Data Protection Regulation) and industry-specific compliance requirements dictate where data can be stored, processed, and accessed. Developers would need to choose a database solution which offers deployment options that comply with these regulations. This might include on-premise deployments within a company’s private cloud or geographically distributed cloud deployments that adhere to data residency laws. +Now you need to create custom shards ([API reference](https://api.qdrant.tech/api-reference/distributed/create-shard-key#request)): -## [Anchor](https://qdrant.tech/articles/data-privacy/\#how-qdrant-handles-data-privacy-and-security) How Qdrant Handles Data Privacy and Security +```http +PUT /collections/{collection_name}/shards +{ + "shard_key": "{shard_key}" +} +``` -One of the cornerstones of our design choices at Qdrant has been the focus on security features. We have built in a range of features keeping the enterprise user in mind, which allow building of granular access control on a fully data sovereign architecture. +```python +from qdrant_client import QdrantClient, models -A Qdrant instance is unsecured by default. However, when you are ready to deploy in production, Qdrant offers a range of security features that allow you to control access to your data, protect it from breaches, and adhere to regulatory requirements. Using Qdrant, you can build granular access control, segregate roles and privileges, and create a fully data sovereign architecture. +client = QdrantClient(url="http://localhost:6333") -### [Anchor](https://qdrant.tech/articles/data-privacy/\#api-keys-and-tls-encryption) API Keys and TLS Encryption +client.create_shard_key("{collection_name}", "{shard_key}") +``` -For simpler use cases, Qdrant offers API key-based authentication. This includes both regular API keys and read-only API keys. Regular API keys grant full access to read, write, and delete operations, while read-only keys restrict access to data retrieval operations only, preventing write actions. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -On Qdrant Cloud, you can create API keys using the [Cloud Dashboard](https://qdrant.to/cloud). This allows you to generate API keys that give you access to a single node or cluster, or multiple clusters. You can read the steps to do so [here](https://qdrant.tech/documentation/cloud/authentication/). +const client = new QdrantClient({ host: "localhost", port: 6333 }); -![web-ui](https://qdrant.tech/articles_data/data-privacy/web-ui.png) +client.createShardKey("{collection_name}", { + shard_key: "{shard_key}" +}); +``` -For on-premise or local deployments, you’ll need to configure API key authentication. This involves specifying a key in either the Qdrant configuration file or as an environment variable. This ensures that all requests to the server must include a valid API key sent in the header. +```rust +use qdrant_client::qdrant::{ + CreateShardKeyBuilder, CreateShardKeyRequestBuilder +}; +use qdrant_client::Qdrant; -When using the simple API key-based authentication, you should also turn on TLS encryption. Otherwise, you are exposing the connection to sniffing and MitM attacks. 
To secure your connection using TLS, you would need to create a certificate and private key, and then [enable TLS](https://qdrant.tech/documentation/guides/security/#tls) in the configuration. +let client = Qdrant::from_url("http://localhost:6334").build()?; -API authentication, coupled with TLS encryption, offers a first layer of security for your Qdrant instance. However, to enable more granular access control, the recommended approach is to leverage JSON Web Tokens (JWTs). +client + .create_shard_key( + CreateShardKeyRequestBuilder::new("{collection_name}") + .request(CreateShardKeyBuilder::default().shard_key("{shard_key".to_string())), + ) + .await?; +``` -### [Anchor](https://qdrant.tech/articles/data-privacy/\#jwt-on-qdrant) JWT on Qdrant +```java +import static io.qdrant.client.ShardKeyFactory.shardKey; -JSON Web Tokens (JWTs) are a compact, URL-safe, and stateless means of representing _claims_ to be transferred between two parties. These claims are encoded as a JSON object and are cryptographically signed. +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateShardKey; +import io.qdrant.client.grpc.Collections.CreateShardKeyRequest; -JWT is composed of three parts: a header, a payload, and a signature, which are concatenated with dots (.) to form a single string. The header contains the type of token and algorithm being used. The payload contains the claims (explained in detail later). The signature is a cryptographic hash and ensures the token’s integrity. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -In Qdrant, JWT forms the foundation through which powerful access controls can be built. Let’s understand how. +client.createShardKeyAsync(CreateShardKeyRequest.newBuilder() + .setCollectionName("{collection_name}") + .setRequest(CreateShardKey.newBuilder() + .setShardKey(shardKey("{shard_key}")) + .build()) + .build()).get(); +``` -JWT is enabled on the Qdrant instance by specifying the API key and turning on the **jwt\_rbac** feature in the configuration (alternatively, they can be set as environment variables). For any subsequent request, the API key is used to encode or decode the token. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -The way JWT works is that just the API key is enough to generate the token, and doesn’t require any communication with the Qdrant instance or server. There are several libraries that help generate tokens by encoding a payload, such as [PyJWT](https://pyjwt.readthedocs.io/en/stable/) (for Python), [jsonwebtoken](https://www.npmjs.com/package/jsonwebtoken) (for JavaScript), and [jsonwebtoken](https://crates.io/crates/jsonwebtoken) (for Rust). Qdrant uses the HS256 algorithm to encode or decode the tokens. +var client = new QdrantClient("localhost", 6334); -We will look at the payload structure shortly, but here’s how you can generate a token using PyJWT. +await client.CreateShardKeyAsync( + "{collection_name}", + new CreateShardKey { ShardKey = new ShardKey { Keyword = "{shard_key}", } } + ); +``` -```python -import jwt -import datetime +```go +import ( + "context" -# Define your API key and other payload data -api_key = "your_api_key" -payload = { ... 
-} + "github.com/qdrant/go-client/qdrant" +) -token = jwt.encode(payload, api_key, algorithm="HS256") -print(token) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.CreateShardKey(context.Background(), "{collection_name}", &qdrant.CreateShardKey{ + ShardKey: qdrant.NewShardKey("{shard_key}"), +}) ``` -Once you have generated the token, you should include it in the subsequent requests. You can do so by providing it as a bearer token in the Authorization header, or in the API Key header of your requests. +To specify the shard for each point, you need to provide the `shard_key` field in the upsert request: -Below is an example of how to do so using QdrantClient in Python: +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1111, + "vector": [0.1, 0.2, 0.3] + }, + ] + "shard_key": "user_1" +} +``` ```python -from qdrant_client import QdrantClient +from qdrant_client import QdrantClient, models -qdrant_client = QdrantClient( - "http://localhost:6333", - api_key="", # the token goes here -) -# Example search vector -search_vector = [0.1, 0.2, 0.3, 0.4] +client = QdrantClient(url="http://localhost:6333") -# Example similarity search request -response = qdrant_client.search( - collection_name="demo_collection", - query_vector=search_vector, - limit=5 # Number of results to retrieve +client.upsert( + collection_name="{collection_name}", + points=[ + models.PointStruct( + id=1111, + vector=[0.1, 0.2, 0.3], + ), + ], + shard_key_selector="user_1", ) - ``` -For convenience, we have added a JWT generation tool in the Qdrant Web UI, which is present under the 🔑 tab. For your local deployments, you will find it at [http://localhost:6333/dashboard#/jwt](http://localhost:6333/dashboard#/jwt). +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -### [Anchor](https://qdrant.tech/articles/data-privacy/\#payload-configuration) Payload Configuration +const client = new QdrantClient({ host: "localhost", port: 6333 }); -There are several different options (claims) you can use in the JWT payload that help control access and functionality. Let’s look at them one by one. +client.upsert("{collection_name}", { + points: [ + { + id: 1111, + vector: [0.1, 0.2, 0.3], + }, + ], + shard_key: "user_1", +}); +``` -**exp**: This claim is the expiration time of the token, and is a unix timestamp in seconds. After the expiration time, the token will be invalid. +```rust +use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; +use qdrant_client::Payload; -**value\_exists**: This claim validates the token against a specific key-value stored in a collection. By using this claim, you can revoke access by simply changing a value without having to invalidate the API key. +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![PointStruct::new( + 111, + vec![0.1, 0.2, 0.3], + Payload::default(), + )], + ) + .shard_key_selector("user_1".to_string()), + ) + .await?; +``` -**access**: This claim defines the access level of the token. The access level can be global read (r) or manage (m). It can also be specific to a collection, or even a subset of a collection, using read (r) and read-write (rw). +```java +import java.util.List; -Let’s look at a few example JWT payload configurations. 
+import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ShardKeySelectorFactory.shardKeySelector; +import static io.qdrant.client.VectorsFactory.vectors; -**Scenario 1: 1-hour expiry time, and read-only access to a collection** +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Points.UpsertPoints; -```json -{ - "exp": 1690995200, // Set to 1 hour from the current time (Unix timestamp) - "access": [\ - {\ - "collection": "demo_collection",\ - "access": "r" // Read-only access\ - }\ - ] -} +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .upsertAsync( + UpsertPoints.newBuilder() + .setCollectionName("{collection_name}") + .addAllPoints( + List.of( + PointStruct.newBuilder() + .setId(id(111)) + .setVectors(vectors(0.1f, 0.2f, 0.3f)) + .build())) + .setShardKeySelector(shardKeySelector("user_1")) + .build()) + .get(); ``` -**Scenario 2: 1-hour expiry time, and access to user with a specific role** - -Suppose you have a ‘users’ collection and have defined specific roles for each user, such as ‘developer’, ‘manager’, ‘admin’, ‘analyst’, and ‘revoked’. In such a scenario, you can use a combination of **exp** and **value\_exists**. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```json -{ - "exp": 1690995200, - "value_exists": { - "collection": "users", - "matches": [\ - { "key": "username", "value": "john" },\ - { "key": "role", "value": "developer" }\ - ], - }, -} +var client = new QdrantClient("localhost", 6334); +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() { Id = 111, Vectors = new[] { 0.1f, 0.2f, 0.3f } } + }, + shardKeySelector: new ShardKeySelector { ShardKeys = { new List { "user_1" } } } +); ``` -Now, if you ever want to revoke access for a user, simply change the value of their role. All future requests will be invalid using a token payload of the above type. - -**Scenario 3: 1-hour expiry time, and read-write access to a subset of a collection** +```go +import ( + "context" -You can even specify access levels specific to subsets of a collection. This can be especially useful when you are leveraging [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/), and want to segregate access. + "github.com/qdrant/go-client/qdrant" +) -```json -{ - "exp": 1690995200, - "access": [\ - {\ - "collection": "demo_collection",\ - "access": "r",\ - "payload": {\ - "user_id": "user_123456"\ - }\ - }\ - ] -} +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(111), + Vectors: qdrant.NewVectors(0.1, 0.2, 0.3), + }, + }, + ShardKeySelector: &qdrant.ShardKeySelector{ + ShardKeys: []*qdrant.ShardKey{ + qdrant.NewShardKey("user_1"), + }, + }, +}) ``` -By combining the claims, you can fully customize the access level that a user or a role has within the vector store. + + +* When using custom sharding, IDs are only enforced to be unique within a shard key. This means that you can have multiple points with the same ID, if they have different shard keys. +This is a limitation of the current implementation, and is an anti-pattern that should be avoided because it can create scenarios of points with the same ID to have different contents. 
In the future, we plan to add a global ID uniqueness check. + -### [Anchor](https://qdrant.tech/articles/data-privacy/\#creating-role-based-access-control-rbac-using-jwt) Creating Role-Based Access Control (RBAC) Using JWT +Now you can target the operations to specific shard(s) by specifying the `shard_key` on any operation you do. Operations that do not specify the shard key will be executed on __all__ shards. -As we saw above, JWT claims create powerful levers through which you can create granular access control on Qdrant. Let’s bring it all together and understand how it helps you create Role-Based Access Control (RBAC). +Another use-case would be to have shards that track the data chronologically, so that you can do more complex itineraries like uploading live data in one shard and archiving it once a certain age has passed. -In a typical enterprise application, you will have a segregation of users based on their roles and permissions. These could be: +Sharding per day -1. **Admin or Owner:** with full access, and can generate API keys. -2. **Editor:** with read-write access levels to specific collections. -3. **Viewer:** with read-only access to specific collections. -4. **Data Scientist or Analyst:** with read-only access to specific collections. -5. **Developer:** with read-write access to development- or testing-specific collections, but limited access to production data. -6. **Guest:** with limited read-only access to publicly available collections. +### Shard transfer method -In addition, you can create access levels within sections of a collection. In a multi-tenant application, where you have used payload-based partitioning, you can create read-only access for specific user roles for a subset of the collection that belongs to that user. +*Available as of v1.7.0* -Your application requirements will eventually help you decide the roles and access levels you should create. For example, in an application managing customer data, you could create additional roles such as: +There are different methods for transferring a shard, such as moving or +replicating, to another node. Depending on what performance and guarantees you'd +like to have and how you'd like to manage your cluster, you likely want to +choose a specific method. Each method has its own pros and cons. Which is +fastest depends on the size and state of a shard. -**Customer Support Representative**: read-write access to customer service-related data but no access to billing information. +Available shard transfer methods are: -**Billing Department**: read-only access to billing data and read-write access to payment records. +- `stream_records`: _(default)_ transfer by streaming just its records to the target node in batches. +- `snapshot`: transfer including its index and quantized data by utilizing a [snapshot](/documentation/concepts/snapshots/) automatically. +- `wal_delta`: _(auto recovery default)_ transfer by resolving [WAL] difference; the operations that were missed. -**Marketing Analyst**: read-only access to anonymized customer data for analytics. +Each has pros, cons and specific requirements, some of which are: -Each role can be assigned a JWT with claims that specify expiration times, read/write permissions for collections, and validating conditions. +| Method: | Stream records | Snapshot | WAL delta | +|:---|:---|:---|:---| +| **Version** | v0.8.0+ | v1.7.0+ | v1.8.0+ | +| **Target** | New/existing shard | New/existing shard | Existing shard | +| **Connectivity** | Internal gRPC API (6335) | REST API (6333)
Internal gRPC API (6335) | Internal gRPC API (6335) | +| **HNSW index** | Doesn't transfer, will reindex on target. | Does transfer, immediately ready on target. | Doesn't transfer, may index on target. | +| **Quantization** | Doesn't transfer, will requantize on target. | Does transfer, immediately ready on target. | Doesn't transfer, may quantize on target. | +| **Ordering** | Unordered updates on target[^unordered] | Ordered updates on target[^ordered] | Ordered updates on target[^ordered] | +| **Disk space** | No extra required | Extra required for snapshot on both nodes | No extra required | -In such an application, an example JWT payload for a customer support representative role could be: +[^unordered]: Weak ordering for updates: All records are streamed to the target node in order. + New updates are received on the target node in parallel, while the transfer + of records is still happening. We therefore have `weak` ordering, regardless + of what [ordering](#write-ordering) is used for updates. +[^ordered]: Strong ordering for updates: A snapshot of the shard + is created, it is transferred and recovered on the target node. That ensures + the state of the shard is kept consistent. New updates are queued on the + source node, and transferred in order to the target node. Updates therefore + have the same [ordering](#write-ordering) as the user selects, making + `strong` ordering possible. -```json +To select a shard transfer method, specify the `method` like: + +```http +POST /collections/{collection_name}/cluster { - "exp": 1690995200, - "access": [\ - {\ - "collection": "customer_data",\ - "access": "rw",\ - "payload": {\ - "department": "support"\ - }\ - }\ - ], - "value_exists": { - "collection": "departments", - "matches": [\ - { "key": "department", "value": "support" }\ - ] - } + "move_shard": { + "shard_id": 0, + "from_peer_id": 381894127, + "to_peer_id": 467122995, + "method": "snapshot" + } } - ``` -As you can see, by implementing RBAC, you can ensure proper segregation of roles and their privileges, and avoid privacy loopholes in your application. - -## [Anchor](https://qdrant.tech/articles/data-privacy/\#qdrant-hybrid-cloud-and-data-sovereignty) Qdrant Hybrid Cloud and Data Sovereignty - -Data governance varies by country, especially for global organizations dealing with different regulations on data privacy, security, and access. This often necessitates deploying infrastructure within specific geographical boundaries. +The `stream_records` transfer method is the simplest available. It simply +transfers all shard records in batches to the target node until it has +transferred all of them, keeping both shards in sync. It will also make sure the +transferred shard indexing process is keeping up before performing a final +switch. The method has two common disadvantages: 1. It does not transfer index +or quantization data, meaning that the shard has to be optimized again on the +new node, which can be very expensive. 2. The ordering guarantees are +`weak`[^unordered], which is not suitable for some applications. Because it is +so simple, it's also very robust, making it a reliable choice if the above cons +are acceptable in your use case. If your cluster is unstable and out of +resources, it's probably best to use the `stream_records` transfer method, +because it is unlikely to fail. -To address these needs, the vector database you choose should support deployment and scaling within your controlled infrastructure. 
[Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) offers this flexibility, along with features like sharding, replicas, JWT authentication, and monitoring. +The `snapshot` transfer method utilizes [snapshots](/documentation/concepts/snapshots/) +to transfer a shard. A snapshot is created automatically. It is then transferred +and restored on the target node. After this is done, the snapshot is removed +from both nodes. While the snapshot/transfer/restore operation is happening, the +source node queues up all new operations. All queued updates are then sent in +order to the target shard to bring it into the same state as the source. There +are two important benefits: 1. It transfers index and quantization data, so that +the shard does not have to be optimized again on the target node, making them +immediately available. This way, Qdrant ensures that there will be no +degradation in performance at the end of the transfer. Especially on large +shards, this can give a huge performance improvement. 2. The ordering guarantees +can be `strong`[^ordered], required for some applications. -Qdrant Hybrid Cloud integrates Kubernetes clusters from various environments—cloud, on-premises, or edge—into a unified managed service. This allows organizations to manage Qdrant databases through the Qdrant Cloud UI while keeping the databases within their infrastructure. +The `wal_delta` transfer method only transfers the difference between two +shards. More specifically, it transfers all operations that were missed to the +target shard. The [WAL] of both shards is used to resolve this. There are two +benefits: 1. It will be very fast because it only transfers the difference +rather than all data. 2. The ordering guarantees can be `strong`[^ordered], +required for some applications. Two disadvantages are: 1. It can only be used to +transfer to a shard that already exists on the other node. 2. Applicability is +limited because the WALs normally don't hold more than 64MB of recent +operations. But that should be enough for a node that quickly restarts, to +upgrade for example. If a delta cannot be resolved, this method automatically +falls back to `stream_records` which equals transferring the full shard. -With JWT and RBAC, Qdrant Hybrid Cloud provides a secure, private, and sovereign vector store. Enterprises can scale their AI applications geographically, comply with local laws, and maintain strict data control. +The `stream_records` method is currently used as default. This may change in the +future. As of Qdrant 1.9.0 `wal_delta` is used for automatic shard replications +to recover dead shards. -## [Anchor](https://qdrant.tech/articles/data-privacy/\#conclusion) Conclusion +[WAL]: /documentation/concepts/storage/#versioning -Vector similarity is increasingly becoming the backbone of AI applications that leverage unstructured data. By transforming data into vectors – their numerical representations – organizations can build powerful applications that harness semantic search, ranging from better recommendation systems to algorithms that help with personalization, or powerful customer support chatbots. +## Replication -However, to fully leverage the power of AI in production, organizations need to choose a vector database that offers strong privacy and security features, while also helping them adhere to local laws and regulations. +Qdrant allows you to replicate shards between nodes in the cluster. 
-Qdrant provides exceptional efficiency and performance, along with the capability to implement granular access control to data, Role-Based Access Control (RBAC), and the ability to build a fully data-sovereign architecture. +Shard replication increases the reliability of the cluster by keeping several copies of a shard spread across the cluster. +This ensures the availability of the data in case of node failures, except if all replicas are lost. -Interested in mastering vector search security and deployment strategies? [Join our Discord community](https://discord.gg/qdrant) to explore more advanced search strategies, connect with other developers and researchers in the industry, and stay updated on the latest innovations! +### Replication factor -##### Was this page useful? +When you create a collection, you can control how many shard replicas you'd like to store by changing the `replication_factor`. By default, `replication_factor` is set to "1", meaning no additional copy is maintained automatically. The default can be changed in the [Qdrant configuration](/documentation/guides/configuration/#configuration-options). You can change that by setting the `replication_factor` when you create a collection. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +The `replication_factor` can be updated for an existing collection, but the effect of this depends on how you're running Qdrant. If you're hosting the open source version of Qdrant yourself, changing the replication factor after collection creation doesn't do anything. You can manually [create](#creating-new-shard-replicas) or drop shard replicas to achieve your desired replication factor. In Qdrant Cloud (including Hybrid Cloud, Private Cloud) your shards will automatically be replicated or dropped to match your configured replication factor. -Thank you for your feedback! 🙏 +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 300, + "distance": "Cosine" + }, + "shard_number": 6, + "replication_factor": 2 +} +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/data-privacy.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+```python +from qdrant_client import QdrantClient, models -On this page: +client = QdrantClient(url="http://localhost:6333") -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/data-privacy.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), + shard_number=6, + replication_factor=2, +) +``` -× +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -[Powered by](https://qdrant.tech/) +const client = new QdrantClient({ host: "localhost", port: 6333 }); -<|page-114-lllmstxt|> -## changelog -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- Changelog +client.createCollection("{collection_name}", { + vectors: { + size: 300, + distance: "Cosine", + }, + shard_number: 6, + replication_factor: 2, +}); +``` -# [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#changelog) Changelog +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::Qdrant; -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#171-2025-06-03) 1.7.1 (2025-06-03) +let client = Qdrant::from_url("http://localhost:6334").build()?; -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.16.6 | -| operator version | 2.6.0 | -| qdrant-cluster-manager version | v0.3.6 | +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) + .shard_number(6) + .replication_factor(2), + ) + .await?; +``` -- Performance and stability improvements +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#170-2025-05-14) 1.7.0 (2025-05-14) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.16.3 | -| operator version | 2.4.2 | -| qdrant-cluster-manager version | v0.3.5 | +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(300) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setShardNumber(6) + .setReplicationFactor(2) + .build()) + .get(); +``` -- Add optional automatic shard balancing -- Set strict mode by default for new clusters to only allow queries with payload filters on fields that are indexed +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#164-2025-04-17) 1.6.4 (2025-04-17) +var client = new QdrantClient("localhost", 6334); -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.15.5 | -| operator version | 2.3.4 | -| qdrant-cluster-manager version | v0.3.4 | +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, + shardNumber: 6, + replicationFactor: 2 
+); +``` -- Fix bug in operator Helm chart that caused role binding generation to fail when using `watch.namespaces` +```go +import ( + "context" -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#163-2025-03-28) 1.6.3 (2025-03-28) + "github.com/qdrant/go-client/qdrant" +) -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.15.0 | -| operator version | 2.3.3 | -| qdrant-cluster-manager version | v0.3.4 | +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -- Performance and stability improvements for collection re-sharding +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 300, + Distance: qdrant.Distance_Cosine, + }), + ShardNumber: qdrant.PtrOf(uint32(6)), + ReplicationFactor: qdrant.PtrOf(uint32(2)), +}) +``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#162-2025-03-21) 1.6.2 (2025-03-21) +This code sample creates a collection with a total of 6 logical shards backed by a total of 12 physical shards. -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.15.0 | -| operator version | 2.3.2 | -| qdrant-cluster-manager version | v0.3.3 | +Since a replication factor of "2" would require twice as much storage space, it is advised to make sure the hardware can host the additional shard replicas beforehand. -- Allow disabling NetworkPolicy management in Qdrant Cluster operator +### Creating new shard replicas -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#161-2025-03-14) 1.6.1 (2025-03-14) +It is possible to create or delete replicas manually on an existing collection using the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster). This is usually only necessary if you run Qdrant open-source. In Qdrant Cloud shard replication is handled and updated automatically, matching the configured `replication_factor`. -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.14.2 | -| operator version | 2.3.2 | -| qdrant-cluster-manager version | v0.3.3 | +A replica can be added on a specific peer by specifying the peer from which to replicate. -- Add support for GPU instances -- Experimental support for automatic shard balancing +```http +POST /collections/{collection_name}/cluster +{ + "replicate_shard": { + "shard_id": 0, + "from_peer_id": 381894127, + "to_peer_id": 467122995 + } +} +``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#151-2025-03-04) 1.5.1 (2025-03-04) + -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.12.0 | -| operator version | 2.1.26 | -| qdrant-cluster-manager version | v0.3.2 | +And a replica can be removed on a specific peer. -- Fix scaling down clusters that have TLS with self-signed certificates configured -- Various performance improvements and stability fixes +```http +POST /collections/{collection_name}/cluster +{ + "drop_replica": { + "shard_id": 0, + "peer_id": 381894127 + } +} +``` -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#150-2025-02-21) 1.5.0 (2025-02-21) +Keep in mind that a collection must contain at least one active replica of a shard. 
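+The peer and shard identifiers used in these requests are illustrative. If you are unsure which values to use, the collection cluster info endpoint can be queried first; a minimal sketch (the collection name is a placeholder):
+
+```http
+GET /collections/{collection_name}/cluster
+```
+
+The response lists the local `peer_id` together with `local_shards` and `remote_shards`, which can then be plugged into the `replicate_shard` and `drop_replica` operations shown above. Recent Qdrant versions may also accept an optional `method` field in these operations to select one of the shard transfer methods described earlier (`stream_records`, `snapshot` or `wal_delta`); check the API reference for your version.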
-| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.12.0 | -| operator version | 2.1.26 | -| qdrant-cluster-manager version | v0.3.0 | +### Error handling -- Added support for P2P TLS configuration -- Faster node removal on scale down -- Various performance improvements and stability fixes +Replicas can be in different states: -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#140-2025-01-23) 1.4.0 (2025-01-23) +- Active: healthy and ready to serve traffic +- Dead: unhealthy and not ready to serve traffic +- Partial: currently under resynchronization before activation -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.8.0 | -| operator version | 2.1.26 | -| qdrant-cluster-manager version | v0.3.0 | +A replica is marked as dead if it does not respond to internal healthchecks or if it fails to serve traffic. -- Support deleting peers on horizontal scale down, even if they are already offline -- Support removing partially deleted peers +A dead replica will not receive traffic from other peers and might require a manual intervention if it does not recover automatically. -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#130-2025-01-17) 1.3.0 (2025-01-17) +This mechanism ensures data consistency and availability if a subset of the replicas fail during an update operation. -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.8.0 | -| operator version | 2.1.21 | -| qdrant-cluster-manager version | v0.2.10 | +### Node Failure Recovery -- Support for re-sharding with Qdrant >= 1.13.0 +Sometimes hardware malfunctions might render some nodes of the Qdrant cluster unrecoverable. +No system is immune to this. -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#120-2025-01-16) 1.2.0 (2025-01-16) +But several recovery scenarios allow qdrant to stay available for requests and even avoid performance degradation. +Let's walk through them from best to worst. -| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.8.0 | -| operator version | 2.1.20 | -| qdrant-cluster-manager version | v0.2.9 | +**Recover with replicated collection** -- Performance and stability improvements +If the number of failed nodes is less than the replication factor of the collection, then your cluster should still be able to perform read, search and update queries. -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#110-2024-12-03) 1.1.0 (2024-12-03) +Now, if the failed node restarts, consensus will trigger the replication process to update the recovering node with the newest updates it has missed. -\| qdrant-kubernetes-api version \| v1.6.4 \| -\| operator version \| 2.1.10 \| -\| qdrant-cluster-manager version \| v0.2.6 \| +If the failed node never restarts, you can recover the lost shards if you have a 3+ node cluster. You cannot recover lost shards in smaller clusters because recovery operations go through [raft](#raft) which requires >50% of the nodes to be healthy. -- Activate cluster-manager for automatic shard replication +**Recreate node with replicated collections** -## [Anchor](https://qdrant.tech/documentation/private-cloud/changelog/\#100-2024-11-11) 1.0.0 (2024-11-11) +If a node fails and it is impossible to recover it, you should exclude the dead node from the consensus and create an empty node. 
-| | | -| --- | --- | -| qdrant-kubernetes-api version | v1.2.7 | -| operator version | 0.1.3 | -| qdrant-cluster-manager version | v0.2.4 | +To exclude failed nodes from the consensus, use [remove peer](https://api.qdrant.tech/master/api-reference/distributed/remove-peer) API. +Apply the `force` flag if necessary. -- Initial release +When you create a new node, make sure to attach it to the existing cluster by specifying `--bootstrap` CLI parameter with the URL of any of the running cluster nodes. -##### Was this page useful? +Once the new node is ready and synchronized with the cluster, you might want to ensure that the collection shards are replicated enough. Remember that Qdrant will not automatically balance shards since this is an expensive operation. +Use the [Replicate Shard Operation](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to create another copy of the shard on the newly connected node. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +It's worth mentioning that Qdrant only provides the necessary building blocks to create an automated failure recovery. +Building a completely automatic process of collection scaling would require control over the cluster machines themself. +Check out our [cloud solution](https://qdrant.to/cloud), where we made exactly that. -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/changelog.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +**Recover from snapshot** -On this page: +If there are no copies of data in the cluster, it is still possible to recover from a snapshot. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/changelog.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Follow the same steps to detach failed node and create a new one in the cluster: -× +* To exclude failed nodes from the consensus, use [remove peer](https://api.qdrant.tech/master/api-reference/distributed/remove-peer) API. Apply the `force` flag if necessary. +* Create a new node, making sure to attach it to the existing cluster by specifying the `--bootstrap` CLI parameter with the URL of any of the running cluster nodes. -[Powered by](https://qdrant.tech/) +Snapshot recovery, used in single-node deployment, is different from cluster one. +Consensus manages all metadata about all collections and does not require snapshots to recover it. +But you can use snapshots to recover missing shards of the collections. -<|page-115-lllmstxt|> -## data-ingestion-beginners -- [Documentation](https://qdrant.tech/documentation/) -- Data Ingestion for Beginners +Use the [Collection Snapshot Recovery API](/documentation/concepts/snapshots/#recover-in-cluster-deployment) to do it. +The service will download the specified snapshot of the collection and recover shards with data from it. -![data-ingestion-beginners-7](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-7.png) +Once all shards of the collection are recovered, the collection will become operational again. 
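+As an illustration of the steps above, excluding a dead peer and restoring a collection snapshot on the new node could look like this (the peer ID, collection name and snapshot location are placeholders):
+
+```http
+DELETE /cluster/peer/{peer_id}?force=true
+```
+
+```http
+PUT /collections/{collection_name}/snapshots/recover
+{
+  "location": "https://example.com/snapshots/{collection_name}.snapshot"
+}
+```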
-# [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#send-s3-data-to-qdrant-vector-store-with-langchain) Send S3 Data to Qdrant Vector Store with LangChain +### Temporary node failure -| Time: 30 min | Level: Beginner | | | -| --- | --- | --- | --- | +If properly configured, running Qdrant in distributed mode can make your cluster resistant to outages when one node fails temporarily. -**Data ingestion into a vector store** is essential for building effective search and retrieval algorithms, especially since nearly 80% of data is unstructured, lacking any predefined format. +Here is how differently-configured Qdrant clusters respond: -In this tutorial, we’ll create a streamlined data ingestion pipeline, pulling data directly from **AWS S3** and feeding it into Qdrant. We’ll dive into vector embeddings, transforming unstructured data into a format that allows you to search documents semantically. Prepare to discover new ways to uncover insights hidden within unstructured data! +* 1-node clusters: All operations time out or fail for up to a few minutes. It depends on how long it takes to restart and load data from disk. +* 2-node clusters where shards ARE NOT replicated: All operations will time out or fail for up to a few minutes. It depends on how long it takes to restart and load data from disk. +* 2-node clusters where all shards ARE replicated to both nodes: All requests except for operations on collections continue to work during the outage. +* 3+-node clusters where all shards are replicated to at least 2 nodes: All requests continue to work during the outage. -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#ingestion-workflow-architecture) Ingestion Workflow Architecture +## Consistency guarantees -We’ll set up a powerful document ingestion and analysis pipeline in this workflow using cloud storage, natural language processing (NLP) tools, and embedding technologies. Starting with raw data in an S3 bucket, we’ll preprocess it with LangChain, apply embedding APIs for both text and images and store the results in Qdrant – a vector database optimized for similarity search. +By default, Qdrant focuses on availability and maximum throughput of search operations. +For the majority of use cases, this is a preferable trade-off. -**Figure 1: Data Ingestion Workflow Architecture** +During the normal state of operation, it is possible to search and modify data from any peers in the cluster. -![data-ingestion-beginners-5](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-5.png) +Before responding to the client, the peer handling the request dispatches all operations according to the current topology in order to keep the data synchronized across the cluster. -Let’s break down each component of this workflow: +- reads are using a partial fan-out strategy to optimize latency and availability +- writes are executed in parallel on all active sharded replicas -- **S3 Bucket:** This is our starting point—a centralized, scalable storage solution for various file types like PDFs, images, and text. -- **LangChain:** Acting as the pipeline’s orchestrator, LangChain handles extraction, preprocessing, and manages data flow for embedding generation. It simplifies processing PDFs, so you won’t need to worry about applying OCR (Optical Character Recognition) here. 
-- **Qdrant:** As your vector database, Qdrant stores embeddings and their [payloads](https://qdrant.tech/documentation/concepts/payload/), enabling efficient similarity search and retrieval across all content types. +![Embeddings](/docs/concurrent-operations-replicas.png) -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#prerequisites) Prerequisites +However, in some cases, it is necessary to ensure additional guarantees during possible hardware instabilities, mass concurrent updates of same documents, etc. -![data-ingestion-beginners-11](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-11.png) +Qdrant provides a few options to control consistency guarantees: -In this section, you’ll get a step-by-step guide on ingesting data from an S3 bucket. But before we dive in, let’s make sure you’re set up with all the prerequisites: +- `write_consistency_factor` - defines the number of replicas that must acknowledge a write operation before responding to the client. Increasing this value will make write operations tolerant to network partitions in the cluster, but will require a higher number of replicas to be active to perform write operations. +- Read `consistency` param, can be used with search and retrieve operations to ensure that the results obtained from all replicas are the same. If this option is used, Qdrant will perform the read operation on multiple replicas and resolve the result according to the selected strategy. This option is useful to avoid data inconsistency in case of concurrent updates of the same documents. This options is preferred if the update operations are frequent and the number of replicas is low. +- Write `ordering` param, can be used with update and delete operations to ensure that the operations are executed in the same order on all replicas. If this option is used, Qdrant will route the operation to the leader replica of the shard and wait for the response before responding to the client. This option is useful to avoid data inconsistency in case of concurrent updates of the same documents. This options is preferred if read operations are more frequent than update and if search performance is critical. -| | | -| --- | --- | -| Sample Data | We’ll use a sample dataset, where each folder includes product reviews in text format along with corresponding images. | -| AWS Account | An active [AWS account](https://aws.amazon.com/free/) with access to S3 services. | -| Qdrant Cloud | A [Qdrant Cloud account](https://cloud.qdrant.io/) with access to the WebUI for managing collections and running queries. | -| LangChain | You will use this [popular framework](https://www.langchain.com/) to tie everything together. | -#### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#supported-document-types) Supported Document Types +### Write consistency factor -The documents used for ingestion can be of various types, such as PDFs, text files, or images. We will organize a structured S3 bucket with folders with the supported document types for testing and experimentation. +The `write_consistency_factor` represents the number of replicas that must acknowledge a write operation before responding to the client. It is set to 1 by default. +It can be configured at the collection's creation or when updating the +collection parameters. 
-#### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#python-environment) Python Environment +This value can range from 1 to the number of replicas you have for each shard. -Ensure you have a Python environment (Python 3.9 or higher) with these libraries installed: +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 300, + "distance": "Cosine" + }, + "shard_number": 6, + "replication_factor": 2, + "write_consistency_factor": 2 +} +``` ```python -boto3 -langchain-community -langchain -python-dotenv -unstructured -unstructured[pdf] -qdrant_client -fastembed +from qdrant_client import QdrantClient, models -``` +client = QdrantClient(url="http://localhost:6333") -* * * +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=300, distance=models.Distance.COSINE), + shard_number=6, + replication_factor=2, + write_consistency_factor=2, +) +``` -**Access Keys:** Store your AWS access key, S3 secret key, and Qdrant API key in a .env file for easy access. Here’s a sample `.env` file. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```text -ACCESS_KEY = "" -SECRET_ACCESS_KEY = "" -QDRANT_KEY = "" +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + vectors: { + size: 300, + distance: "Cosine", + }, + shard_number: 6, + replication_factor: 2, + write_consistency_factor: 2, +}); ``` -* * * - -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#step-1-ingesting-data-from-s3) Step 1: Ingesting Data from S3 - -![data-ingestion-beginners-9.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-9.png) +```rust +use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder}; +use qdrant_client::Qdrant; -The LangChain framework makes it easy to ingest data from storage services like AWS S3, with built-in support for loading documents in formats such as PDFs, images, and text files. +let client = Qdrant::from_url("http://localhost:6334").build()?; -To connect LangChain with S3, you’ll use the `S3DirectoryLoader`, which lets you load files directly from an S3 bucket into LangChain’s pipeline. 
+client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(300, Distance::Cosine)) + .shard_number(6) + .replication_factor(2) + .write_consistency_factor(2), + ) + .await?; +``` -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#example-configuring-langchain-to-load-files-from-s3) Example: Configuring LangChain to Load Files from S3 +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; -Here’s how to set up LangChain to ingest data from an S3 bucket: +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -```python -from langchain_community.document_loaders import S3DirectoryLoader +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(300) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setShardNumber(6) + .setReplicationFactor(2) + .setWriteConsistencyFactor(2) + .build()) + .get(); +``` -# Initialize the S3 document loader -loader = S3DirectoryLoader( - "product-dataset", # S3 bucket name - "p_1", #S3 Folder name containing the data for the first product - aws_access_key_id=aws_access_key_id, # AWS Access Key - aws_secret_access_key=aws_secret_access_key # AWS Secret Access Key -) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -# Load documents from the specified S3 bucket -docs = loader.load() +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 300, Distance = Distance.Cosine }, + shardNumber: 6, + replicationFactor: 2, + writeConsistencyFactor: 2 +); ``` -* * * - -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#step-2-turning-documents-into-embeddings) Step 2. Turning Documents into Embeddings +```go +import ( + "context" -[Embeddings](https://qdrant.tech/articles/what-are-embeddings/) are the secret sauce here—they’re numerical representations of data (like text, images, or audio) that capture the “meaning” in a form that’s easy to compare. By converting text and images into embeddings, you’ll be able to perform similarity searches quickly and efficiently. Think of embeddings as the bridge to storing and retrieving meaningful insights from your data in Qdrant. + "github.com/qdrant/go-client/qdrant" +) -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#models-well-use-for-generating-embeddings) Models We’ll Use for Generating Embeddings +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -To get things rolling, we’ll use two powerful models: +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 300, + Distance: qdrant.Distance_Cosine, + }), + ShardNumber: qdrant.PtrOf(uint32(6)), + ReplicationFactor: qdrant.PtrOf(uint32(2)), + WriteConsistencyFactor: qdrant.PtrOf(uint32(2)), +}) +``` -1. **`sentence-transformers/all-MiniLM-L6-v2` Embeddings** for transforming text data. -2. 
**`CLIP` (Contrastive Language-Image Pretraining)** for image data. +Write operations will fail if the number of active replicas is less than the +`write_consistency_factor`. In this case, the client is expected to send the +operation again to ensure a consistent state is reached. -* * * +Setting the `write_consistency_factor` to a lower value may allow accepting +writes even if there are unresponsive nodes. Unresponsive nodes are marked as +dead and will automatically be recovered once available to ensure data +consistency. -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#document-processing-function) Document Processing Function +The configuration of the `write_consistency_factor` is important for adjusting the cluster's behavior when some nodes go offline due to restarts, upgrades, or failures. -![data-ingestion-beginners-8.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-8.png) +By default, the cluster continues to accept updates as long as at least one replica of each shard is online. However, this behavior means that once an offline replica is restored, it will require additional synchronization with the rest of the cluster. In some cases, this synchronization can be resource-intensive and undesirable. -Next, we’ll define two functions — `process_text` and `process_image` to handle different file types in our document pipeline. The `process_text` function extracts and returns the raw content from a text-based document, while `process_image` retrieves an image from an S3 source and loads it into memory. +Setting the `write_consistency_factor` to match the replication factor modifies the cluster's behavior so that unreplicated updates are rejected, preventing the need for extra synchronization. -```python -from PIL import Image +If the update is applied to enough replicas - according to the `write_consistency_factor` - the update will return a successful status. Any replicas that failed to apply the update will be temporarily disabled and are automatically recovered to keep data consistency. If the update could not be applied to enough replicas, it'll return an error and may be partially applied. The user must submit the operation again to ensure data consistency. -def process_text(doc): - source = doc.metadata['source'] # Extract document source (e.g., S3 URL) +For asynchronous updates and injection pipelines capable of handling errors and retries, this strategy might be preferable. - text = doc.page_content # Extract the content from the text file - print(f"Processing text from {source}") - return source, text -def process_image(doc): - source = doc.metadata['source'] # Extract document source (e.g., S3 URL) - print(f"Processing image from {source}") +### Read consistency - bucket_name, object_key = parse_s3_url(source) # Parse the S3 URL - response = s3.get_object(Bucket=bucket_name, Key=object_key) # Fetch image from S3 - img_bytes = response['Body'].read() +Read `consistency` can be specified for most read requests and will ensure that the returned result +is consistent across cluster nodes. 
- img = Image.open(io.BytesIO(img_bytes)) - return source, img +- `all` will query all nodes and return points, which present on all of them +- `majority` will query all nodes and return points, which present on the majority of them +- `quorum` will query randomly selected majority of nodes and return points, which present on all of them +- `1`/`2`/`3`/etc - will query specified number of randomly selected nodes and return points which present on all of them +- default `consistency` is `1` +```http +POST /collections/{collection_name}/points/query?consistency=majority +{ + "query": [0.2, 0.1, 0.9, 0.7], + "filter": { + "must": [ + { + "key": "city", + "match": { + "value": "London" + } + } + ] + }, + "params": { + "hnsw_ef": 128, + "exact": false + }, + "limit": 3 +} ``` -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#helper-functions-for-document-processing) Helper Functions for Document Processing - -To retrieve images from S3, a helper function `parse_s3_url` breaks down the S3 URL into its bucket and critical components. This is essential for fetching the image from S3 storage. - ```python -def parse_s3_url(s3_url): - parts = s3_url.replace("s3://", "").split("/", 1) - bucket_name = parts[0] - object_key = parts[1] - return bucket_name, object_key - +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + query_filter=models.Filter( + must=[ + models.FieldCondition( + key="city", + match=models.MatchValue( + value="London", + ), + ) + ] + ), + search_params=models.SearchParams(hnsw_ef=128, exact=False), + limit=3, + consistency="majority", +) ``` -* * * - -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#step-3-loading-embeddings-into-qdrant) Step 3: Loading Embeddings into Qdrant - -![data-ingestion-beginners-10](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-10.png) - -Now that your documents have been processed and converted into embeddings, the next step is to load these embeddings into Qdrant. - -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#creating-a-collection-in-qdrant) Creating a Collection in Qdrant - -In Qdrant, data is organized in collections, each representing a set of embeddings (or points) and their associated metadata (payload). To store the embeddings generated earlier, you’ll first need to create a collection. - -Here’s how to create a collection in Qdrant to store both text and image embeddings: - -```python -def create_collection(collection_name): - qdrant_client.create_collection( - collection_name, - vectors_config={ - "text_embedding": models.VectorParams( - size=384, # Dimension of text embeddings - distance=models.Distance.COSINE, # Cosine similarity is used for comparison - ), - "image_embedding": models.VectorParams( - size=512, # Dimension of image embeddings - distance=models.Distance.COSINE, # Cosine similarity is used for comparison - ), - }, - ) - -create_collection("products-data") - +```typescript +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + filter: { + must: [{ key: "city", match: { value: "London" } }], + }, + params: { + hnsw_ef: 128, + exact: false, + }, + limit: 3, + consistency: "majority", +}); ``` -* * * - -This function creates a collection for storing text (384 dimensions) and image (512 dimensions) embeddings, using cosine similarity to compare embeddings within the collection. - -Once the collection is set up, you can load the embeddings into Qdrant. 
This involves inserting (or updating) the embeddings and their associated metadata (payload) into the specified collection. +```rust +use qdrant_client::qdrant::{ + read_consistency::Value, Condition, Filter, QueryPointsBuilder, ReadConsistencyType, + SearchParamsBuilder, +}; +use qdrant_client::{Qdrant, QdrantError}; -Here’s the code for loading embeddings into Qdrant: +let client = Qdrant::from_url("http://localhost:6334").build()?; -```python -def ingest_data(points): - operation_info = qdrant_client.upsert( - collection_name="products-data", # Collection where data is being inserted - points=points +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .filter(Filter::must([Condition::matches( + "city", + "London".to_string(), + )])) + .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)) + .read_consistency(Value::Type(ReadConsistencyType::Majority.into())), ) - return operation_info - + .await?; ``` -* * * - -**Explanation of Ingestion** - -1. **Upserting the Data Point:** The upsert method on the `qdrant_client` inserts each PointStruct into the specified collection. If a point with the same ID already exists, it will be updated with the new values. -2. **Operation Info:** The function returns `operation_info`, which contains details about the upsert operation, such as success status or any potential errors. - -**Running the Ingestion Code** - -Here’s how to call the function and ingest data: +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.ReadConsistency; +import io.qdrant.client.grpc.Points.ReadConsistencyType; +import io.qdrant.client.grpc.Points.SearchParams; -```python -from qdrant_client import models +import static io.qdrant.client.QueryFactory.nearest; +import static io.qdrant.client.ConditionFactory.matchKeyword; -if __name__ == "__main__": - collection_name = "products-data" - create_collection(collection_name) - for i in range(1,6): # Five documents - folder = f"p_{i}" - loader = S3DirectoryLoader( - "product-dataset", - folder, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key - ) - docs = loader.load() - points, text_review, product_image = [], "", "" - for idx, doc in enumerate(docs): - source = doc.metadata['source'] - if source.endswith(".txt") or source.endswith(".pdf"): - _text_review_source, text_review = process_text(doc) - elif source.endswith(".png"): - product_image_source, product_image = process_image(doc) - if text_review: - point = models.PointStruct( - id=idx, # Unique identifier for each point - vector={ - "text_embedding": models.Document( - text=text_review, model="sentence-transformers/all-MiniLM-L6-v2" - ), - "image_embedding": models.Image( - image=product_image, model="Qdrant/clip-ViT-B-32-vision" - ), - }, - payload={"review": text_review, "product_image": product_image_source}, - ) - points.append(point) - operation_info = ingest_data(points) - print(operation_info) +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London")).build()) + .setQuery(nearest(.2f, 0.1f, 0.9f, 0.7f)) + .setParams(SearchParams.newBuilder().setHnswEf(128).setExact(false).build()) + 
.setLimit(3) + .setReadConsistency( + ReadConsistency.newBuilder().setType(ReadConsistencyType.Majority).build()) + .build()) + .get(); ``` -The `PointStruct` is instantiated with these key parameters: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; -- **id:** A unique identifier for each embedding, typically an incremental index. +var client = new QdrantClient("localhost", 6334); -- **vector:** A dictionary holding the text and image inputs to be embedded. `qdrant-client` uses [FastEmbed](https://github.com/qdrant/fastembed) under the hood to automatically generate vector representations from these inputs locally. +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + filter: MatchKeyword("city", "London"), + searchParams: new SearchParams { HnswEf = 128, Exact = false }, + limit: 3, + readConsistency: new ReadConsistency { Type = ReadConsistencyType.Majority } +); +``` -- **payload:** A dictionary storing additional metadata, like product reviews and image references, which is invaluable for retrieval and context during searches. +```go +import ( + "context" + "github.com/qdrant/go-client/qdrant" +) -The code dynamically loads folders from an S3 bucket, processes text and image files separately, and stores their embeddings and associated data in dedicated lists. It then creates a `PointStruct` for each data entry and calls the ingestion function to load it into Qdrant. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -### [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#exploring-the-qdrant-webui-dashboard) Exploring the Qdrant WebUI Dashboard +client.Query(context.Background(), &qdrant.QueryPoints{ + CollectionName: "{collection_name}", + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewMatch("city", "London"), + }, + }, + Params: &qdrant.SearchParams{ + HnswEf: qdrant.PtrOf(uint64(128)), + }, + Limit: qdrant.PtrOf(uint64(3)), + ReadConsistency: qdrant.NewReadConsistencyType(qdrant.ReadConsistencyType_Majority), +}) +``` -Once the embeddings are loaded into Qdrant, you can use the WebUI dashboard to visualize and manage your collections. The dashboard provides a clear, structured interface for viewing collections and their data. Let’s take a closer look in the next section. +### Write ordering -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#step-4-visualizing-data-in-qdrant-webui) Step 4: Visualizing Data in Qdrant WebUI +Write `ordering` can be specified for any write request to serialize it through a single "leader" node, +which ensures that all write operations (issued with the same `ordering`) are performed and observed +sequentially. -To start visualizing your data in the Qdrant WebUI, head to the **Overview** section and select **Access the database**. +- `weak` _(default)_ ordering does not provide any additional guarantees, so write operations can be freely reordered. +- `medium` ordering serializes all write operations through a dynamically elected leader, which might cause minor inconsistencies in case of leader change. +- `strong` ordering serializes all write operations through the permanent leader, which provides strong consistency, but write operations may be unavailable if the leader is down. 
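+The examples below use `ordering=strong` with an upsert; the same query parameter applies to other write operations as well. As a minimal sketch, a delete request with strong ordering could look like this (the collection name and point IDs are placeholders):
+
+```http
+POST /collections/{collection_name}/points/delete?ordering=strong
+{
+  "points": [1, 2, 3]
+}
+```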
-**Figure 2: Accessing the Database from the Qdrant UI**![data-ingestion-beginners-2.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-2.png) + -When prompted, enter your API key. Once inside, you’ll be able to view your collections and the corresponding data points. You should see your collection displayed like this: +```http +PUT /collections/{collection_name}/points?ordering=strong +{ + "batch": { + "ids": [1, 2, 3], + "payloads": [ + {"color": "red"}, + {"color": "green"}, + {"color": "blue"} + ], + "vectors": [ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9] + ] + } +} +``` -**Figure 3: The product-data Collection in Qdrant**![data-ingestion-beginners-4.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-4.png) +```python +client.upsert( + collection_name="{collection_name}", + points=models.Batch( + ids=[1, 2, 3], + payloads=[ + {"color": "red"}, + {"color": "green"}, + {"color": "blue"}, + ], + vectors=[ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9], + ], + ), + ordering=models.WriteOrdering.STRONG, +) +``` -Here’s a look at the most recent point ingested into Qdrant: +```typescript +client.upsert("{collection_name}", { + batch: { + ids: [1, 2, 3], + payloads: [{ color: "red" }, { color: "green" }, { color: "blue" }], + vectors: [ + [0.9, 0.1, 0.1], + [0.1, 0.9, 0.1], + [0.1, 0.1, 0.9], + ], + }, + ordering: "strong", +}); +``` -**Figure 4: The Latest Point Added to the product-data Collection**![data-ingestion-beginners-6.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-6.png) +```rust +use qdrant_client::qdrant::{ + PointStruct, UpsertPointsBuilder, WriteOrdering, WriteOrderingType +}; +use qdrant_client::Qdrant; -The Qdrant WebUI’s search functionality allows you to perform vector searches across your collections. With options to apply filters and parameters, retrieving relevant embeddings and exploring relationships within your data becomes easy. To start, head over to the **Console** in the left panel, where you can create queries: +let client = Qdrant::from_url("http://localhost:6334").build()?; -**Figure 5: Overview of Console in Qdrant**![data-ingestion-beginners-1.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-1.png) +client + .upsert_points( + UpsertPointsBuilder::new( + "{collection_name}", + vec![ + PointStruct::new(1, vec![0.9, 0.1, 0.1], [("color", "red".into())]), + PointStruct::new(2, vec![0.1, 0.9, 0.1], [("color", "green".into())]), + PointStruct::new(3, vec![0.1, 0.1, 0.9], [("color", "blue".into())]), + ], + ) + .ordering(WriteOrdering { + r#type: WriteOrderingType::Strong.into(), + }), + ) + .await?; +``` -The first query retrieves all collections, the second fetches points from the product-data collection, and the third performs a sample query. This demonstrates how straightforward it is to interact with your data in the Qdrant UI. +```java +import java.util.List; +import java.util.Map; -Now, let’s retrieve some documents from the database using a query!. 
+import static io.qdrant.client.PointIdFactory.id; +import static io.qdrant.client.ValueFactory.value; +import static io.qdrant.client.VectorsFactory.vectors; -**Figure 6: Querying the Qdrant Client to Retrieve Relevant Documents**![data-ingestion-beginners-3.png](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-3.png) +import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Points.UpsertPoints; +import io.qdrant.client.grpc.Points.WriteOrdering; +import io.qdrant.client.grpc.Points.WriteOrderingType; -In this example, we queried **Phones with improved design**. Then, we converted the text to vectors using OpenAI and retrieved a relevant phone review highlighting design improvements. +client + .upsertAsync( + UpsertPoints.newBuilder() + .setCollectionName("{collection_name}") + .addAllPoints( + List.of( + PointStruct.newBuilder() + .setId(id(1)) + .setVectors(vectors(0.9f, 0.1f, 0.1f)) + .putAllPayload(Map.of("color", value("red"))) + .build(), + PointStruct.newBuilder() + .setId(id(2)) + .setVectors(vectors(0.1f, 0.9f, 0.1f)) + .putAllPayload(Map.of("color", value("green"))) + .build(), + PointStruct.newBuilder() + .setId(id(3)) + .setVectors(vectors(0.1f, 0.1f, 0.94f)) + .putAllPayload(Map.of("color", value("blue"))) + .build())) + .setOrdering(WriteOrdering.newBuilder().setType(WriteOrderingType.Strong).build()) + .build()) + .get(); +``` -## [Anchor](https://qdrant.tech/documentation/data-ingestion-beginners/\#conclusion) Conclusion +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -In this guide, we set up an S3 bucket, ingested various data types, and stored embeddings in Qdrant. Using LangChain, we dynamically processed text and image files, making it easy to work with each file type. +var client = new QdrantClient("localhost", 6334); -Now, it’s your turn. Try experimenting with different data types, such as videos, and explore Qdrant’s advanced features to enhance your applications. To get started, [sign up](https://cloud.qdrant.io/signup) for Qdrant today. +await client.UpsertAsync( + collectionName: "{collection_name}", + points: new List + { + new() + { + Id = 1, + Vectors = new[] { 0.9f, 0.1f, 0.1f }, + Payload = { ["color"] = "red" } + }, + new() + { + Id = 2, + Vectors = new[] { 0.1f, 0.9f, 0.1f }, + Payload = { ["color"] = "green" } + }, + new() + { + Id = 3, + Vectors = new[] { 0.1f, 0.1f, 0.9f }, + Payload = { ["color"] = "blue" } + } + }, + ordering: WriteOrderingType.Strong +); +``` -![data-ingestion-beginners-12](https://qdrant.tech/documentation/examples/data-ingestion-beginners/data-ingestion-12.png) +```go +import ( + "context" -##### Was this page useful? + "github.com/qdrant/go-client/qdrant" +) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) -Thank you for your feedback! 
🙏 +client.Upsert(context.Background(), &qdrant.UpsertPoints{ + CollectionName: "{collection_name}", + Points: []*qdrant.PointStruct{ + { + Id: qdrant.NewIDNum(1), + Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"color": "red"}), + }, + { + Id: qdrant.NewIDNum(2), + Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), + Payload: qdrant.NewValueMap(map[string]any{"color": "green"}), + }, + { + Id: qdrant.NewIDNum(3), + Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), + Payload: qdrant.NewValueMap(map[string]any{"color": "blue"}), + }, + }, + Ordering: &qdrant.WriteOrdering{ + Type: qdrant.WriteOrderingType_Strong, + }, +}) +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/data-ingestion-beginners.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Listener mode -On this page: + -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/data-ingestion-beginners.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +In some cases it might be useful to have a Qdrant node that only accumulates data and does not participate in search operations. +There are several scenarios where this can be useful: -× +- Listener option can be used to store data in a separate node, which can be used for backup purposes or to store data for a long time. +- Listener node can be used to synchronize data into another region, while still performing search operations in the local region. -[Powered by](https://qdrant.tech/) -<|page-116-lllmstxt|> -## vectors -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Vectors +To enable listener mode, set `node_type` to `Listener` in the config file: -# [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#vectors) Vectors -Vectors (or embeddings) are the core concept of the Qdrant Vector Search engine. -Vectors define the similarity between objects in the vector space. +```yaml +storage: + node_type: "Listener" +``` -If a pair of vectors are similar in vector space, it means that the objects they represent are similar in some way. +Listener node will not participate in search operations, but will still accept write operations and will store the data in the local storage. -For example, if you have a collection of images, you can represent each image as a vector. -If two images are similar, their vectors will be close to each other in the vector space. +All shards, stored on the listener node, will be converted to the `Listener` state. -In order to obtain a vector representation of an object, you need to apply a vectorization algorithm to the object. -Usually, this algorithm is a neural network that converts the object into a fixed-size vector. +Additionally, all write requests sent to the listener node will be processed with `wait=false` option, which means that the write oprations will be considered successful once they are written to WAL. +This mechanism should allow to minimize upsert latency in case of parallel snapshotting. -The neural network is usually [trained](https://qdrant.tech/articles/metric-learning-tips/) on a pairs or [triplets](https://qdrant.tech/articles/triplet-loss/) of similar and dissimilar objects, so it learns to recognize a specific type of similarity. 
+## Consensus Checkpointing -By using this property of vectors, you can explore your data in a number of ways; e.g. by searching for similar objects, clustering objects, and more. +Consensus checkpointing is a technique used in Raft to improve performance and simplify log management by periodically creating a consistent snapshot of the system state. +This snapshot represents a point in time where all nodes in the cluster have reached agreement on the state, and it can be used to truncate the log, reducing the amount of data that needs to be stored and transferred between nodes. -## [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#vector-types) Vector Types +For example, if you attach a new node to the cluster, it should replay all the log entries to catch up with the current state. +In long-running clusters, this can take a long time, and the log can grow very large. -Modern neural networks can output vectors in different shapes and sizes, and Qdrant supports most of them. -Let’s take a look at the most common types of vectors supported by Qdrant. +To prevent this, one can use a special checkpointing mechanism, that will truncate the log and create a snapshot of the current state. -### [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#dense-vectors) Dense Vectors +To use this feature, simply call the `/cluster/recover` API on required node: -This is the most common type of vector. It is a simple list of numbers, it has a fixed length and each element of the list is a floating-point number. +```http +POST /cluster/recover +``` -It looks like this: +This API can be triggered on any non-leader node, it will send a request to the current consensus leader to create a snapshot. The leader will in turn send the snapshot back to the requesting node for application. -```json +In some cases, this API can be used to recover from an inconsistent cluster state by forcing a snapshot creation. -// A piece of a real-world dense vector -[\ - -0.013052909,\ - 0.020387933,\ - -0.007869,\ - -0.11111383,\ - -0.030188112,\ - -0.0053388323,\ - 0.0010654867,\ - 0.072027855,\ - -0.04167721,\ - 0.014839341,\ - -0.032948174,\ - -0.062975034,\ - -0.024837125,\ - ....\ -] +<|page-192-lllmstxt|> +# Quantization -``` +Quantization is an optional feature in Qdrant that enables efficient storage and search of high-dimensional vectors. +By transforming original vectors into a new representations, quantization compresses data while preserving close to original relative distances between vectors. +Different quantization methods have different mechanics and tradeoffs. We will cover them in this section. -The majority of neural networks create dense vectors, so you can use them with Qdrant without any additional processing. -Although compatible with most embedding models out there, Qdrant has been tested with the following [verified embedding providers](https://qdrant.tech/documentation/embeddings/). +Quantization is primarily used to reduce the memory footprint and accelerate the search process in high-dimensional vector spaces. +In the context of the Qdrant, quantization allows you to optimize the search engine for specific use cases, striking a balance between accuracy, storage efficiency, and search speed. -### [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#sparse-vectors) Sparse Vectors +There are tradeoffs associated with quantization. +On the one hand, quantization allows for significant reductions in storage requirements and faster search times. 
+This can be particularly beneficial in large-scale applications where minimizing the use of resources is a top priority. +On the other hand, quantization introduces an approximation error, which can lead to a slight decrease in search quality. +The level of this tradeoff depends on the quantization method and its parameters, as well as the characteristics of the data. -Sparse vectors are a special type of vectors. -Mathematically, they are the same as dense vectors, but they contain many zeros so they are stored in a special format. +## Scalar Quantization -Sparse vectors in Qdrant don’t have a fixed length, as it is dynamically allocated during vector insertion. -The amount of non-zero values in sparse vectors is currently limited to u32 datatype range (4294967295). +*Available as of v1.1.0* -In order to define a sparse vector, you need to provide a list of non-zero elements and their indexes. +Scalar quantization, in the context of vector search engines, is a compression technique that compresses vectors by reducing the number of bits used to represent each vector component. -```json -// A sparse vector with 4 non-zero elements -{ - "indexes": [1, 3, 5, 7], - "values": [0.1, 0.2, 0.3, 0.4] -} +For instance, Qdrant uses 32-bit floating numbers to represent the original vector components. Scalar quantization allows you to reduce the number of bits used to 8. +In other words, Qdrant performs `float32 -> uint8` conversion for each vector component. +Effectively, this means that the amount of memory required to store a vector is reduced by a factor of 4. -``` +In addition to reducing the memory footprint, scalar quantization also speeds up the search process. +Qdrant uses a special SIMD CPU instruction to perform fast vector comparison. +This instruction works with 8-bit integers, so the conversion to `uint8` allows Qdrant to perform the comparison faster. -Sparse vectors in Qdrant are kept in special storage and indexed in a separate index, so their configuration is different from dense vectors. +The main drawback of scalar quantization is the loss of accuracy. The `float32 -> uint8` conversion introduces an error that can lead to a slight decrease in search quality. +However, this error is usually negligible, and tends to be less significant for high-dimensional vectors. +In our experiments, we found that the error introduced by scalar quantization is usually less than 1%. -To create a collection with sparse vectors: +However, this value depends on the data and the quantization parameters. +Please refer to the [Quantization Tips](#quantization-tips) section for more information on how to optimize the quantization parameters for your use case. -httpbashpythontypescriptrustjavacsharpgo +## Binary Quantization -```http -PUT /collections/{collection_name} -{ - "sparse_vectors": { - "text": { } - } -} +*Available as of v1.5.0* -``` +Binary quantization is an extreme case of scalar quantization. +This feature lets you represent each vector component as a single bit, effectively reducing the memory footprint by a **factor of 32**. -```bash -curl -X PUT http://localhost:6333/collections/{collection_name} \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "sparse_vectors": { - "text": { } - } - }' +This is the fastest quantization method, since it lets you perform a vector comparison with a few CPU instructions. -``` +Binary quantization can achieve up to a **40x** speedup compared to the original vectors. 
-```python -from qdrant_client import QdrantClient, models +However, binary quantization is only efficient for high-dimensional vectors and require a centered distribution of vector components. -client = QdrantClient(url="http://localhost:6333") +At the moment, binary quantization shows good accuracy results with the following models: -client.create_collection( - collection_name="{collection_name}", - vectors_config={}, - sparse_vectors_config={ - "text": models.SparseVectorParams(), - }, -) +- OpenAI `text-embedding-ada-002` - 1536d tested with [dbpedia dataset](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) achieving 0.98 recall@100 with 4x oversampling +- Cohere AI `embed-english-v2.0` - 4096d tested on Wikipedia embeddings - 0.98 recall@50 with 2x oversampling -``` +Models with a lower dimensionality or a different distribution of vector components may require additional experiments to find the optimal quantization parameters. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +We recommend using binary quantization only with rescoring enabled, as it can significantly improve the search quality +with just a minor performance impact. +Additionally, oversampling can be used to tune the tradeoff between search speed and search quality in the query time. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### Binary Quantization as Hamming Distance -client.createCollection("{collection_name}", { - sparse_vectors: { - text: { }, - }, -}); +The additional benefit of this method is that you can efficiently emulate Hamming distance with dot product. -``` +Specifically, if original vectors contain `{-1, 1}` as possible values, then the dot product of two vectors is equal to the Hamming distance by simply replacing `-1` with `0` and `1` with `1`. -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{ - CreateCollectionBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, -}; + -let client = Qdrant::from_url("http://localhost:6334").build()?; +
+ Sample truth table -let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); +| Vector 1 | Vector 2 | Dot product | +|----------|----------|-------------| +| 1 | 1 | 1 | +| 1 | -1 | -1 | +| -1 | 1 | -1 | +| -1 | -1 | 1 | -sparse_vector_config.add_named_vector_params("text", SparseVectorParamsBuilder::default()); +| Vector 1 | Vector 2 | Hamming distance | +|----------|----------|------------------| +| 1 | 1 | 0 | +| 1 | 0 | 1 | +| 0 | 1 | 1 | +| 0 | 0 | 0 | -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vector_config), - ) - .await?; +
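+For a quick standalone check of this equivalence (plain Python, not Qdrant code):
+
+```python
+# Two small vectors with components in {-1, 1}.
+a = [1, -1, -1, 1]
+b = [1, 1, -1, -1]
+
+dot = sum(x * y for x, y in zip(a, b))         # 0
+# Mismatch count == Hamming distance of the vectors after mapping -1 -> 0.
+hamming = sum(x != y for x, y in zip(a, b))    # 2
+
+# dot = d - 2 * hamming, so ranking by dot product and by Hamming distance agree.
+assert dot == len(a) - 2 * hamming
+```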
-```
+As you can see, the two functions are related by a simple linear transformation, which makes similarity search with either of them equivalent.
+Binary quantization makes it efficient to compare vectors using this representation.
-```java
-import io.qdrant.client.QdrantClient;
-import io.qdrant.client.QdrantGrpcClient;
-import io.qdrant.client.grpc.Collections.CreateCollection;
-import io.qdrant.client.grpc.Collections.SparseVectorConfig;
-import io.qdrant.client.grpc.Collections.SparseVectorParams;
+### 1.5-Bit and 2-Bit Quantization
-QdrantClient client =
-    new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build());
+*Available as of v1.15.0*
-client
-    .createCollectionAsync(
-        CreateCollection.newBuilder()
-            .setCollectionName("{collection_name}")
-            .setSparseVectorsConfig(
-                SparseVectorConfig.newBuilder()
-                    .putMap("text", SparseVectorParams.getDefaultInstance()))
-            .build())
-    .get();
+**Binary quantization** storage can use **2 and 1.5 bits** per dimension, improving precision for smaller vectors. One-bit compression resulted in significant data loss and precision drops for vectors smaller than a thousand dimensions, often requiring expensive rescoring. 2-bit quantization offers 16X compression compared to 32X with one bit, improving performance for smaller vector dimensions. 1.5-bit quantization offers 24X compression and intermediate accuracy.
-```
+A major limitation of binary quantization is poor handling of values close to zero.
+2-bit quantization addresses this by explicitly representing zeros using an efficient scoring mechanism. In the case of 1.5-bit quantization, the zero-bit is shared between two values, balancing the efficiency of binary quantization with the accuracy improvements of 2-bit quantization, especially when 2-bit BQ requires too much memory.
-```csharp
-using Qdrant.Client;
-using Qdrant.Client.Grpc;
+In order to build a 2-bit representation, Qdrant computes the distribution of values and then assigns bit patterns to 3 possible buckets:
-var client = new QdrantClient("localhost", 6334);
+- `-1` - 00
+- `0` - 01
+- `1` - 11
-await client.CreateCollectionAsync(
-    collectionName: "{collection_name}",
-    sparseVectorsConfig: ("text", new SparseVectorParams())
-);
+1.5-bit quantization is similar, but merges the buckets of pairs of elements into binary triplets.
-```
+{{
}}
-```go
-import (
-  "context"
+See how to set up 1.5-bit and 2-bit quantization in the [following section](#set-up-bit-depth).
-  "github.com/qdrant/go-client/qdrant"
-)
+### Asymmetric Quantization
-client, err := qdrant.NewClient(&qdrant.Config{
-  Host: "localhost",
-  Port: 6334,
-})
+*Available as of v1.15.0*
-client.CreateCollection(context.Background(), &qdrant.CreateCollection{
-  CollectionName: "{collection_name}",
-  SparseVectorsConfig: qdrant.NewSparseVectorsConfig(
-    map[string]*qdrant.SparseVectorParams{
-      "text": {},
-    }),
-})
+The **Asymmetric Quantization** technique allows Qdrant to use different vector encoding algorithms for stored vectors and for queries.
+A particularly interesting combination is binary-quantized stored vectors with scalar-quantized queries.
-```
+{{
}} -Insert a point with a sparse vector into the created collection: +This approach maintains storage size and RAM usage similar to binary quantization while offering improved precision. It is beneficial for memory-constrained deployments, or where the bottleneck is disk I/O rather than CPU. +This is particularly useful for indexing millions of vectors as it improves precision without sacrificing much because the limitation in such scenarios is disk speed, not CPU. This approach requires less rescoring for the same quality output. -httppythontypescriptrustjavacsharpgo +See how to set up Asymmetric Quantization quantization in the [following section](#set-up-asymmetric-quantization) -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "vector": {\ - "text": {\ - "indices": [1, 3, 5, 7],\ - "values": [0.1, 0.2, 0.3, 0.4]\ - }\ - }\ - }\ - ] -} +## Product Quantization -``` +*Available as of v1.2.0* -```python -from qdrant_client import QdrantClient, models +Product quantization is a method of compressing vectors to minimize their memory usage by dividing them into +chunks and quantizing each segment individually. +Each chunk is approximated by a centroid index that represents the original vector component. +The positions of the centroids are determined through the utilization of a clustering algorithm such as k-means. +For now, Qdrant uses only 256 centroids, so each centroid index can be represented by a single byte. -client = QdrantClient(url="http://localhost:6333") +Product quantization can compress by a more prominent factor than a scalar one. +But there are some tradeoffs. Product quantization distance calculations are not SIMD-friendly, so it is slower than scalar quantization. +Also, product quantization has a loss of accuracy, so it is recommended to use it only for high-dimensional vectors. -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={}, # Add any additional payload if necessary\ - vector={\ - "text": models.SparseVector(\ - indices=[1, 3, 5, 7],\ - values=[0.1, 0.2, 0.3, 0.4]\ - )\ - },\ - )\ - ], -) +Please refer to the [Quantization Tips](#quantization-tips) section for more information on how to optimize the quantization parameters for your use case. -``` +## How to choose the right quantization method -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Here is a brief table of the pros and cons of each quantization method: -const client = new QdrantClient({ host: "localhost", port: 6333 }); +| Quantization method | Accuracy | Speed | Compression | +|---------------------|----------|--------------|-------------| +| Scalar | 0.99 | up to x2 | 4 | +| Product | 0.7 | 0.5 | up to 64 | +| Binary (1 bit) | 0.95* | up to x40 | 32 | +| Binary (1.5 bit) | 0.95** | up to x30 | 24 | +| Binary (2 bit) | 0.95*** | up to x20 | 16 | -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: {\ - text: {\ - indices: [1, 3, 5, 7],\ - values: [0.1, 0.2, 0.3, 0.4]\ - },\ - },\ - }\ - ] -}); +- `*` - for compatible models with high-dimensional vectors (approx. 1536+ dimensions) +- `**` - for compatible models with medium-dimensional vectors (approx. 1024-1536 dimensions) +- `***` - for compatible models with low-dimensional vectors (approx. 768-1024 dimensions) -``` +- **Binary Quantization** is the fastest method and the most memory-efficient, but it requires a centered distribution of vector components. It is recommended to use with tested models only. 
+ - If you are planning to use binary quantization with low or medium-dimensional vectors (approx. 512-1024 dimensions), it is recommended to use 1.5-bit or 2-bit quantization as well as asymmetric quantization feature. -```rust -use qdrant_client::qdrant::{NamedVectors, PointStruct, UpsertPointsBuilder, Vector}; +- **Scalar Quantization** is the most universal method, as it provides a good balance between accuracy, speed, and compression. It is recommended as default quantization if binary quantization is not applicable. +- **Product Quantization** may provide a better compression ratio, but it has a significant loss of accuracy and is slower than scalar quantization. It is recommended if the memory footprint is the top priority and the search speed is not critical. -use qdrant_client::{Payload, Qdrant}; +## Setting up Quantization in Qdrant -let client = Qdrant::from_url("http://localhost:6334").build()?; +You can configure quantization for a collection by specifying the quantization parameters in the `quantization_config` section of the collection configuration. -let points = vec![PointStruct::new(\ - 1,\ - NamedVectors::default().add_vector(\ - "text",\ - Vector::new_sparse(vec![1, 3, 5, 7], vec![0.1, 0.2, 0.3, 0.4]),\ - ),\ - Payload::new(),\ -)]; +Quantization will be automatically applied to all vectors during the indexation process. +Quantized vectors are stored alongside the original vectors in the collection, so you will still have access to the original vectors if you need them. -client - .upsert_points(UpsertPointsBuilder::new("{collection_name}", points)) - .await?; +*Available as of v1.1.1* -``` +The `quantization_config` can also be set on a per vector basis by specifying it in a named vector. -```java -import java.util.List; -import java.util.Map; +### Setting up Scalar Quantization -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorFactory.vector; -import static io.qdrant.client.VectorsFactory.namedVectors; +To enable scalar quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +When enabling scalar quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it's already defined. 
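+For example, enabling it on an existing collection with the Python client could look like this (a sketch; it assumes your client version exposes `quantization_config` on `update_collection`):
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# The vector configuration is omitted because the collection already defines it.
+client.update_collection(
+    collection_name="{collection_name}",
+    quantization_config=models.ScalarQuantization(
+        scalar=models.ScalarQuantizationConfig(
+            type=models.ScalarType.INT8,
+            quantile=0.99,
+            always_ram=True,
+        ),
+    ),
+)
+```
+
+For a brand-new collection, pass the same `quantization_config` to `create_collection`, as shown below.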
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +```python +from qdrant_client import QdrantClient, models -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors( - namedVectors(Map.of( - "text", vector(List.of(1.0f, 2.0f), List.of(6, 7)))) - ) - .build())) - .get(); +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) ``` ```csharp @@ -42994,18 +61055,19 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List < PointStruct > { - new() { - Id = 1, - Vectors = new Dictionary { - ["text"] = ([0.1f, 0.2f, 0.3f, 0.4f], [1, 3, 5, 7]) - } - } +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + quantizationConfig: new QuantizationConfig + { + Scalar = new ScalarQuantization + { + Type = QuantizationType.Int8, + Quantile = 0.99f, + AlwaysRam = true } + } ); - ``` ```go @@ -43020,120 +61082,174 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Upsert(context.Background(), &qdrant.UpsertPoints{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMap( - map[string]*qdrant.Vector{ - "text": qdrant.NewVectorSparse( - []uint32{1, 3, 5, 7}, - []float32{0.1, 0.2, 0.3, 0.4}), - }), + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationScalar( + &qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + Quantile: qdrant.PtrOf(float32(0.99)), + AlwaysRam: qdrant.PtrOf(true), }, - }, + ), }) - ``` -Now you can run a search with sparse vectors: - -httppythontypescriptrustjavacsharpgo - ```http -POST /collections/{collection_name}/points/query +PUT /collections/{collection_name} { - "query": { - "indices": [1, 3, 5, 7], - "values": [0.1, 0.2, 0.3, 0.4] + "vectors": { + "size": 768, + "distance": "Cosine" }, - "using": "text" + "quantization_config": { + "scalar": { + "type": "int8", + "quantile": 0.99, + "always_ram": true + } + } } - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -result = client.query_points( - collection_name="{collection_name}", - query=models.SparseVector(indices=[1, 3, 5, 7], values=[0.1, 0.2, 0.3, 0.4]), - using="text", -).points - ``` ```typescript import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); - -client.query("{collection_name}", { - query: { - indices: [1, 3, 5, 7], - values: [0.1, 0.2, 0.3, 0.4] + +const client = new QdrantClient({ host: "localhost", port: 6333 }); + +client.createCollection("{collection_name}", { + vectors: { + size: 768, + distance: "Cosine", + }, + quantization_config: { + scalar: { + type: "int8", + quantile: 0.99, + always_ram: true, }, - using: "text", - limit: 3, + }, }); - ``` ```rust -use 
qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::qdrant::{ + CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, + VectorParamsBuilder, +}; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![(1, 0.2), (3, 0.1), (5, 0.9), (7, 0.7)]) - .limit(10) - .using("text"), + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .quantization_config( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .quantile(0.99) + .always_ram(true), + ), ) .await?; - ``` ```java -import java.util.List; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; - -import static io.qdrant.client.QueryFactory.nearest; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setUsing("text") - .setQuery(nearest(List.of(0.1f, 0.2f, 0.3f, 0.4f), List.of(1, 3, 5, 7))) - .setLimit(3) - .build()) - .get(); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setQuantile(0.99f) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` + +There are 3 parameters that you can specify in the `quantization_config` section: + +`type` - the type of the quantized vector components. Currently, Qdrant supports only `int8`. + +`quantile` - the quantile of the quantized vector components. +The quantile is used to calculate the quantization bounds. +For instance, if you specify `0.99` as the quantile, 1% of extreme values will be excluded from the quantization bounds. + +Using quantiles lower than `1.0` might be useful if there are outliers in your vector components. +This parameter only affects the resulting precision and not the memory footprint. +It might be worth tuning this parameter if you experience a significant decrease in search quality. + +`always_ram` - whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. +However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. + +In this case, you can set `always_ram` to `true` to store quantized vectors in RAM. + +### Setting up Binary Quantization + +To enable binary quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. 
+ +When enabling binary quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it's already defined. + +```python +from qdrant_client import QdrantClient, models +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + always_ram=True, + ), + ), +) ``` ```csharp using Qdrant.Client; +using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - query: new (float, uint)[] {(0.1f, 1), (0.2f, 3), (0.3f, 5), (0.4f, 7)}, - usingVector: "text", - limit: 3 +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 1536, Distance = Distance.Cosine }, + quantizationConfig: new QuantizationConfig + { + Binary = new BinaryQuantization { AlwaysRam = true } + } ); - ``` ```go @@ -43148,87 +61264,33 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Query: qdrant.NewQuerySparse( - []uint32{1, 3, 5, 7}, - []float32{0.1, 0.2, 0.3, 0.4}), - Using: qdrant.PtrOf("text"), + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 1536, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationBinary( + &qdrant.BinaryQuantization{ + AlwaysRam: qdrant.PtrOf(true), + }, + ), }) - -``` - -### [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#multivectors) Multivectors - -**Available as of v1.10.0** - -Qdrant supports the storing of a variable amount of same-shaped dense vectors in a single point. -This means that instead of a single dense vector, you can upload a matrix of dense vectors. - -The length of the matrix is fixed, but the number of vectors in the matrix can be different for each point. - -Multivectors look like this: - -```json -// A multivector of size 4 -"vector": [\ - [-0.013, 0.020, -0.007, -0.111],\ - [-0.030, -0.055, 0.001, 0.072],\ - [-0.041, 0.014, -0.032, -0.062],\ - ....\ -] - ``` -There are two scenarios where multivectors are useful: - -- **Multiple representation of the same object** \- For example, you can store multiple embeddings for pictures of the same object, taken from different angles. This approach assumes that the payload is same for all vectors. -- **Late interaction embeddings** \- Some text embedding models can output multiple vectors for a single text. -For example, a family of models such as ColBERT output a relatively small vector for each token in the text. - -In order to use multivectors, we need to specify a function that will be used to compare between matrices of vectors - -Currently, Qdrant supports `max_sim` function, which is defined as a sum of maximum similarities between each pair of vectors in the matrices. - -score=∑i=1Nmaxj=1MSim(vectorAi,vectorBj) - -Where N is the number of vectors in the first matrix, M is the number of vectors in the second matrix, and Sim is a similarity function, for example, cosine similarity. 
- -To use multivectors, create a collection with the following configuration: - -httppythontypescriptrustjavacsharpgo - ```http -PUT collections/{collection_name} +PUT /collections/{collection_name} { - "vectors": { - "size": 128, - "distance": "Cosine", - "multivector_config": { - "comparator": "max_sim" + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": { + "binary": { + "always_ram": true + } } - } } - -``` - -```python - -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - ), -) - ``` ```typescript @@ -43238,20 +61300,20 @@ const client = new QdrantClient({ host: "localhost", port: 6333 }); client.createCollection("{collection_name}", { vectors: { - size: 128, + size: 1536, distance: "Cosine", - multivector_config: { - comparator: "max_sim" - } + }, + quantization_config: { + binary: { + always_ram: true, + }, }, }); - ``` ```rust use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, VectorParamsBuilder, - MultiVectorComparator, MultiVectorConfigBuilder, + BinaryQuantizationBuilder, CreateCollectionBuilder, Distance, VectorParamsBuilder, }; use qdrant_client::Qdrant; @@ -43260,36 +61322,69 @@ let client = Qdrant::from_url("http://localhost:6334").build()?; client .create_collection( CreateCollectionBuilder::new("{collection_name}") - .vectors_config( - VectorParamsBuilder::new(100, Distance::Cosine) - .multivector_config( - MultiVectorConfigBuilder::new(MultiVectorComparator::MaxSim) - ), - ), + .vectors_config(VectorParamsBuilder::new(1536, Distance::Cosine)) + .quantization_config(BinaryQuantizationBuilder::new(true)), ) .await?; - ``` ```java import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.BinaryQuantization; +import io.qdrant.client.grpc.Collections.CreateCollection; import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.MultiVectorComparator; -import io.qdrant.client.grpc.Collections.MultiVectorConfig; +import io.qdrant.client.grpc.Collections.QuantizationConfig; import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.createCollectionAsync("{collection_name}", - VectorParams.newBuilder().setSize(128) - .setDistance(Distance.Cosine) - .setMultivectorConfig(MultiVectorConfig.newBuilder() - .setComparator(MultiVectorComparator.MaxSim) - .build()) - .build()).get(); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(1536) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setBinary(BinaryQuantization.newBuilder().setAlwaysRam(true).build()) + .build()) + .build()) + .get(); +``` + +`always_ram` - whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. 
+However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. + +In this case, you can set `always_ram` to `true` to store quantized vectors in RAM. + +#### Set up bit depth + +To enable 2bit or 1.5bit quantization, you need to specify `encoding` parameter in the `quantization_config` section of the collection configuration. Available values are `two_bits` and `one_and_half_bits`. + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + encoding=models.BinaryQuantizationEncoding.TWO_BITS, + always_ram=True, + ), + ), +) ``` ```csharp @@ -43300,82 +61395,58 @@ var client = new QdrantClient("localhost", 6334); await client.CreateCollectionAsync( collectionName: "{collection_name}", - vectorsConfig: new VectorParams { - Size = 128, - Distance = Distance.Cosine, - MultivectorConfig = new() { - Comparator = MultiVectorComparator.MaxSim - } + vectorsConfig: new VectorParams { Size = 1536, Distance = Distance.Cosine }, + quantizationConfig: new QuantizationConfig + { + Binary = new BinaryQuantization { + Encoding = BinaryQuantizationEncoding.TwoBits, + AlwaysRam = true + } } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 128, - Distance: qdrant.Distance_Cosine, - MultivectorConfig: &qdrant.MultiVectorConfig{ - Comparator: qdrant.MultiVectorComparator_MaxSim, - }, - }), + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 1536, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationBinary( + &qdrant.BinaryQuantization{ + Encoding: qdrant.BinaryQuantizationEncoding_TwoBits.Enum(), + AlwaysRam: qdrant.PtrOf(true), + }, + ), }) - ``` -To insert a point with multivector: - -httppythontypescriptrustjavacsharpgo - ```http -PUT collections/{collection_name}/points +PUT /collections/{collection_name} { - "points": [\ - {\ - "id": 1,\ - "vector": [\ - [-0.013, 0.020, -0.007, -0.111, ...],\ - [-0.030, -0.055, 0.001, 0.072, ...],\ - [-0.041, 0.014, -0.032, -0.062, ...]\ - ]\ - }\ - ] + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": { + "binary": { + "encoding": "two_bits", + "always_ram": true + } + } } - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - vector=[\ - [-0.013, 0.020, -0.007, -0.111],\ - [-0.030, -0.055, 0.001, 0.072],\ - [-0.041, 0.014, -0.032, -0.062]\ - ],\ - )\ - ], -) - ``` ```typescript @@ -43383,75 +61454,104 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: [\ - [-0.013, 0.020, -0.007, -0.111, ...],\ - [-0.030, -0.055, 0.001, 0.072, 
...],\ - [-0.041, 0.014, -0.032, -0.062, ...]\ - ],\ - }\ - ] +client.createCollection("{collection_name}", { + vectors: { + size: 1536, + distance: "Cosine", + }, + quantization_config: { + binary: { + encoding: "two_bits", + always_ram: true, + }, + }, }); - ``` ```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder, Vector}; +use qdrant_client::qdrant::{ + BinaryQuantizationBuilder, + CreateCollectionBuilder, + Distance, + VectorParamsBuilder, + BinaryQuantizationEncoding, +}; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; -let points = vec![\ - PointStruct::new(\ - 1,\ - Vector::new_multi(vec![\ - vec![-0.013, 0.020, -0.007, -0.111],\ - vec![-0.030, -0.055, 0.001, 0.072],\ - vec![-0.041, 0.014, -0.032, -0.062],\ - ]),\ - Payload::new()\ - )\ -]; - client - .upsert_points( - UpsertPointsBuilder::new("{collection_name}", points) - ).await?; - + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(1536, Distance::Cosine)) + .quantization_config(BinaryQuantizationBuilder::new(true) + .encoding(BinaryQuantizationEncoding::TwoBits) + ), + ) + .await?; ``` ```java -import java.util.List; - -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorsFactory.vectors; -import static io.qdrant.client.VectorFactory.multiVector; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +import io.qdrant.client.grpc.Collections.BinaryQuantization; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; +import io.qdrant.client.grpc.Collections.BinaryQuantizationEncoding; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); client -.upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(multiVector(new float[][] { - {-0.013f, 0.020f, -0.007f, -0.111f}, - {-0.030f, -0.055f, 0.001f, 0.072f}, - {-0.041f, 0.014f, -0.032f, -0.062f} - }))) - .build() - )) -.get(); + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(1536) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setBinary(BinaryQuantization + .newBuilder() + .setEncoding(BinaryQuantizationEncoding.TwoBits) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` + +#### Set up asymmetric quantization + +To enable asymmetric quantization, you need to specify `query_encoding` parameter in the `quantization_config` section of the collection configuration. Available values are: +- `default` and `binary` - use regular binary quantization for the query. +- `scalar8bits` - use 8bit quantization for the query. +- `scalar4bits` - use 4bit quantization for the query. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), + quantization_config=models.BinaryQuantization( + binary=models.BinaryQuantizationConfig( + query_encoding=models.BinaryQuantizationQueryEncoding.SCALAR8BITS, + always_ram=True, + ), + ), +) ``` ```csharp @@ -43460,80 +61560,63 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.UpsertAsync( +await client.CreateCollectionAsync( collectionName: "{collection_name}", - points: new List { - new() { - Id = 1, - Vectors = new float[][] { - [-0.013f, 0.020f, -0.007f, -0.111f], - [-0.030f, -0.05f, 0.001f, 0.072f], - [-0.041f, 0.014f, -0.032f, -0.062f ], - }, - }, + vectorsConfig: new VectorParams { Size = 1536, Distance = Distance.Cosine }, + quantizationConfig: new QuantizationConfig + { + Binary = new BinaryQuantization { + QueryEncoding = new BinaryQuantizationQueryEncoding + { + Setting = BinaryQuantizationQueryEncoding.Types.Setting.Scalar8Bits, + }, + AlwaysRam = true + } } ); - ``` ```go import ( - "context" + "context" - "github.com/qdrant/go-client/qdrant" + "github.com/qdrant/go-client/qdrant" ) client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, + Host: "localhost", + Port: 6334, }) -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMulti( - [][]float32{ - {-0.013, 0.020, -0.007, -0.111}, - {-0.030, -0.055, 0.001, 0.072}, - {-0.041, 0.014, -0.032, -0.062}}), - }, - }, +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 1536, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationBinary( + &qdrant.BinaryQuantization{ + QueryEncoding: qdrant.NewBinaryQuantizationQueryEncodingSetting(BinaryQuantizationQueryEncoding_Scalar8Bits), + AlwaysRam: qdrant.PtrOf(true), + }, + ), }) - ``` -To search with multivector (available in `query` API): - -httppythontypescriptrustjavacsharpgo - ```http -POST collections/{collection_name}/points/query +PUT /collections/{collection_name} { - "query": [\ - [-0.013, 0.020, -0.007, -0.111, ...],\ - [-0.030, -0.055, 0.001, 0.072, ...],\ - [-0.041, 0.014, -0.032, -0.062, ...]\ - ] + "vectors": { + "size": 1536, + "distance": "Cosine" + }, + "quantization_config": { + "binary": { + "query_encoding": "scalar8bits", + "always_ram": true + } + } } - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.query_points( - collection_name="{collection_name}", - query=[\ - [-0.013, 0.020, -0.007, -0.111],\ - [-0.030, -0.055, 0.001, 0.072],\ - [-0.041, 0.014, -0.032, -0.062]\ - ], -) - ``` ```typescript @@ -43541,70 +61624,121 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.query("{collection_name}", { - "query": [\ - [-0.013, 0.020, -0.007, -0.111],\ - [-0.030, -0.055, 0.001, 0.072],\ - [-0.041, 0.014, -0.032, -0.062]\ - ] +client.createCollection("{collection_name}", { + vectors: { + size: 1536, + distance: "Cosine", + }, + quantization_config: { + binary: { + query_encoding: 
"scalar8bits", + always_ram: true, + }, + }, }); - ``` ```rust +use qdrant_client::qdrant::{ + BinaryQuantizationBuilder, + CreateCollectionBuilder, + Distance, + VectorParamsBuilder, + BinaryQuantizationQueryEncoding, +}; use qdrant_client::Qdrant; -use qdrant_client::qdrant::{ QueryPointsBuilder, VectorInput }; let client = Qdrant::from_url("http://localhost:6334").build()?; -let res = client.query( - QueryPointsBuilder::new("{collection_name}") - .query(VectorInput::new_multi( - vec![\ - vec![-0.013, 0.020, -0.007, -0.111],\ - vec![-0.030, -0.055, 0.001, 0.072],\ - vec![-0.041, 0.014, -0.032, -0.062],\ - ] - )) -).await?; - +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(1536, Distance::Cosine)) + .quantization_config( + BinaryQuantizationBuilder::new(true) + .query_encoding(BinaryQuantizationQueryEncoding::scalar8bits()) + ), + ) + .await?; ``` ```java -import static io.qdrant.client.QueryFactory.nearest; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Collections.BinaryQuantization; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; +import io.qdrant.client.grpc.Collections.BinaryQuantizationQueryEncoding; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(new float[][] { - {-0.013f, 0.020f, -0.007f, -0.111f}, - {-0.030f, -0.055f, 0.001f, 0.072f}, - {-0.041f, 0.014f, -0.032f, -0.062f} - })) - .build()).get(); +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(1536) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setBinary(BinaryQuantization.newBuilder() + .setQueryEncoding(BinaryQuantizationQueryEncoding + .newBuilder() + .setSetting(BinaryQuantizationQueryEncoding.Setting.Scalar8Bits) + .build()) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` + +### Setting up Product Quantization + +To enable product quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. + +When enabling product quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it's already defined. 
+ +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), + quantization_config=models.ProductQuantization( + product=models.ProductQuantizationConfig( + compression=models.CompressionRatio.X16, + always_ram=True, + ), + ), +) ``` ```csharp using Qdrant.Client; +using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[][] { - [-0.013f, 0.020f, -0.007f, -0.111f], - [-0.030f, -0.055f, 0.001 , 0.072f], - [-0.041f, 0.014f, -0.032f, -0.062f], - } +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, + quantizationConfig: new QuantizationConfig + { + Product = new ProductQuantization { Compression = CompressionRatio.X16, AlwaysRam = true } + } ); - ``` ```go @@ -43619,62 +61753,35 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Query: qdrant.NewQueryMulti( - [][]float32{ - {-0.013, 0.020, -0.007, -0.111}, - {-0.030, -0.055, 0.001, 0.072}, - {-0.041, 0.014, -0.032, -0.062}, - }), + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + }), + QuantizationConfig: qdrant.NewQuantizationProduct( + &qdrant.ProductQuantization{ + Compression: qdrant.CompressionRatio_x16, + AlwaysRam: qdrant.PtrOf(true), + }, + ), }) - ``` -## [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#named-vectors) Named Vectors - -In Qdrant, you can store multiple vectors of different sizes and [types](https://qdrant.tech/documentation/concepts/vectors/#vector-types) in the same data [point](https://qdrant.tech/documentation/concepts/points/). This is useful when you need to define your data with multiple embeddings to represent different features or modalities (e.g., image, text or video). - -To store different vectors for each point, you need to create separate named vector spaces in the [collection](https://qdrant.tech/documentation/concepts/collections/). You can define these vector spaces during collection creation and manage them independently. 
- -To create a collection with named vectors, you need to specify a configuration for each vector: - -httppythontypescriptrustjavacsharpgo - ```http PUT /collections/{collection_name} { - "vectors": { - "image": { - "size": 4, - "distance": "Dot" - }, - "text": { - "size": 5, + "vectors": { + "size": 768, "distance": "Cosine" + }, + "quantization_config": { + "product": { + "compression": "x16", + "always_ram": true + } } - }, - "sparse_vectors": { - "text-sparse": {} - } } - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.create_collection( - collection_name="{collection_name}", - vectors_config={ - "image": models.VectorParams(size=4, distance=models.Distance.DOT), - "text": models.VectorParams(size=5, distance=models.Distance.COSINE), - }, - sparse_vectors_config={"text-sparse": models.SparseVectorParams()}, -) - ``` ```typescript @@ -43683,55 +61790,48 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); client.createCollection("{collection_name}", { - vectors: { - image: { size: 4, distance: "Dot" }, - text: { size: 5, distance: "Cosine" }, + vectors: { + size: 768, + distance: "Cosine", + }, + quantization_config: { + product: { + compression: "x16", + always_ram: true, }, - sparse_vectors: { - text_sparse: {} - } + }, }); - ``` ```rust use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, - VectorParamsBuilder, VectorsConfigBuilder, + CompressionRatio, CreateCollectionBuilder, Distance, ProductQuantizationBuilder, + VectorParamsBuilder, }; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; -let mut vector_config = VectorsConfigBuilder::default(); -vector_config.add_named_vector_params("text", VectorParamsBuilder::new(5, Distance::Dot)); -vector_config.add_named_vector_params("image", VectorParamsBuilder::new(4, Distance::Cosine)); - -let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); -sparse_vectors_config - .add_named_vector_params("text-sparse", SparseVectorParamsBuilder::default()); - client .create_collection( CreateCollectionBuilder::new("{collection_name}") - .vectors_config(vector_config) - .sparse_vectors_config(sparse_vectors_config), + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .quantization_config( + ProductQuantizationBuilder::new(CompressionRatio::X16.into()).always_ram(true), + ), ) .await?; - ``` ```java -import java.util.Map; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CompressionRatio; import io.qdrant.client.grpc.Collections.CreateCollection; import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.SparseVectorConfig; -import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.ProductQuantization; +import io.qdrant.client.grpc.Collections.QuantizationConfig; import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorParamsMap; import io.qdrant.client.grpc.Collections.VectorsConfig; QdrantClient client = @@ -43741,22 +61841,58 @@ client .createCollectionAsync( CreateCollection.newBuilder() .setCollectionName("{collection_name}") - .setVectorsConfig(VectorsConfig.newBuilder().setParamsMap( - VectorParamsMap.newBuilder().putAllMap(Map.of("image", - 
VectorParams.newBuilder() - .setSize(4) - .setDistance(Distance.Dot) - .build(), - "text", - VectorParams.newBuilder() - .setSize(5) - .setDistance(Distance.Cosine) - .build())))) - .setSparseVectorsConfig(SparseVectorConfig.newBuilder().putMap( - "text-sparse", SparseVectorParams.getDefaultInstance())) + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setProduct( + ProductQuantization.newBuilder() + .setCompression(CompressionRatio.x16) + .setAlwaysRam(true) + .build()) + .build()) .build()) .get(); +``` + +There are two parameters that you can specify in the `quantization_config` section: + +`compression` - compression ratio. +Compression ratio represents the size of the quantized vector in bytes divided by the size of the original vector in bytes. +In this case, the quantized vector will be 16 times smaller than the original vector. + +`always_ram` - whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. +However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. Then set `always_ram` to `true`. + +### Searching with Quantization + +Once you have configured quantization for a collection, you don't need to do anything extra to search with quantization. +Qdrant will automatically use quantized vectors if they are available. + +However, there are a few options that you can use to control the search process: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + ignore=False, + rescore=True, + oversampling=2.0, + ) + ), +) ``` ```csharp @@ -43765,27 +61901,20 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParamsMap - { - Map = { - ["image"] = new VectorParams { - Size = 4, Distance = Distance.Dot - }, - ["text"] = new VectorParams { - Size = 5, Distance = Distance.Cosine - }, - } - }, - sparseVectorsConfig: new SparseVectorConfig - { - Map = { - ["text-sparse"] = new() - } - } +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + searchParams: new SearchParams + { + Quantization = new QuantizationSearchParams + { + Ignore = false, + Rescore = true, + Oversampling = 2.0 + } + }, + limit: 10 ); - ``` ```go @@ -43800,175 +61929,160 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ +client.Query(context.Background(), &qdrant.QueryPoints{ CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfigMap( - map[string]*qdrant.VectorParams{ - "image": { - Size: 4, - Distance: qdrant.Distance_Dot, - }, - "text": { - Size: 5, - Distance: qdrant.Distance_Cosine, - }, - }), - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "text-sparse": {}, + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Params: &qdrant.SearchParams{ + Quantization: &qdrant.QuantizationSearchParams{ + Ignore: qdrant.PtrOf(false), + Rescore: 
qdrant.PtrOf(true), + Oversampling: qdrant.PtrOf(2.0), }, - ), + }, }) - ``` -To insert a point with named vectors: - -httppythontypescriptrustjavacsharpgo - ```http -PUT /collections/{collection_name}/points?wait=true +POST /collections/{collection_name}/points/query { - "points": [\ - {\ - "id": 1,\ - "vector": {\ - "image": [0.9, 0.1, 0.1, 0.2],\ - "text": [0.4, 0.7, 0.1, 0.8, 0.1],\ - "text-sparse": {\ - "indices": [1, 3, 5, 7],\ - "values": [0.1, 0.2, 0.3, 0.4]\ - }\ - }\ - }\ - ] + "query": [0.2, 0.1, 0.9, 0.7], + "params": { + "quantization": { + "ignore": false, + "rescore": true, + "oversampling": 2.0 + } + }, + "limit": 10 } - ``` -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - vector={\ - "image": [0.9, 0.1, 0.1, 0.2],\ - "text": [0.4, 0.7, 0.1, 0.8, 0.1],\ - "text-sparse": {\ - "indices": [1, 3, 5, 7],\ - "values": [0.1, 0.2, 0.3, 0.4],\ - },\ - },\ - ),\ - ], -) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -``` +const client = new QdrantClient({ host: "localhost", port: 6333 }); -```typescript -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - vector: {\ - image: [0.9, 0.1, 0.1, 0.2],\ - text: [0.4, 0.7, 0.1, 0.8, 0.1],\ - text_sparse: {\ - indices: [1, 3, 5, 7],\ - values: [0.1, 0.2, 0.3, 0.4]\ - }\ - },\ - },\ - ], +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + params: { + quantization: { + ignore: false, + rescore: true, + oversampling: 2.0, + }, + }, + limit: 10, }); - ``` ```rust - use qdrant_client::qdrant::{ - NamedVectors, PointStruct, UpsertPointsBuilder, Vector, + QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, }; -use qdrant_client::Payload; +use qdrant_client::Qdrant; +let client = Qdrant::from_url("http://localhost:6334").build()?; + client - .upsert_points( - UpsertPointsBuilder::new( - "{collection_name}", - vec![PointStruct::new(\ - 1,\ - NamedVectors::default()\ - .add_vector("text", Vector::new_dense(vec![0.4, 0.7, 0.1, 0.8, 0.1]))\ - .add_vector("image", Vector::new_dense(vec![0.9, 0.1, 0.1, 0.2]))\ - .add_vector(\ - "text-sparse",\ - Vector::new_sparse(vec![1, 3, 5, 7], vec![0.1, 0.2, 0.3, 0.4]),\ - ),\ - Payload::default(),\ - )], - ) - .wait(true), + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(10) + .params( + SearchParamsBuilder::default().quantization( + QuantizationSearchParamsBuilder::default() + .ignore(false) + .rescore(true) + .oversampling(2.0), + ), + ), ) .await?; - ``` ```java -import java.util.List; -import java.util.Map; +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QuantizationSearchParams; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; -import static io.qdrant.client.PointIdFactory.id; -import static io.qdrant.client.VectorFactory.vector; -import static io.qdrant.client.VectorsFactory.namedVectors; +import static io.qdrant.client.QueryFactory.nearest; -import io.qdrant.client.grpc.Points.PointStruct; +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors( - namedVectors( - Map.of( - "image", - vector(List.of(0.9f, 0.1f, 0.1f, 0.2f)), - "text", - vector(List.of(0.4f, 0.7f, 0.1f, 0.8f, 0.1f)), - "text-sparse", - vector(List.of(0.1f, 0.2f, 0.3f, 
0.4f), List.of(1, 3, 5, 7))))) - .build())) - .get(); +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setParams( + SearchParams.newBuilder() + .setQuantization( + QuantizationSearchParams.newBuilder() + .setIgnore(false) + .setRescore(true) + .setOversampling(2.0) + .build()) + .build()) + .setLimit(10) + .build()) + .get(); +``` + +`ignore` - Toggle whether to ignore quantized vectors during the search process. By default, Qdrant will use quantized vectors if they are available. + +`rescore` - Having the original vectors available, Qdrant can re-evaluate top-k search results using the original vectors. +This can improve the search quality, but may slightly decrease the search speed, compared to the search without rescore. +It is recommended to disable rescore only if the original vectors are stored on a slow storage (e.g. HDD or network storage). +By default, rescore is enabled. + +**Available as of v1.3.0** +`oversampling` - Defines how many extra vectors should be pre-selected using quantized index, and then re-scored using original vectors. +For example, if oversampling is 2.4 and limit is 100, then 240 vectors will be pre-selected using quantized index, and then top-100 will be returned after re-scoring. +Oversampling is useful if you want to tune the tradeoff between search speed and search quality in the query time. + +## Quantization tips + +#### Accuracy tuning + +In this section, we will discuss how to tune the search precision. +The fastest way to understand the impact of quantization on the search quality is to compare the search results with and without quantization. + +In order to disable quantization, you can set `ignore` to `true` in the search request: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams( + ignore=True, + ) + ), +) ``` ```csharp using Qdrant.Client; using Qdrant.Client.Grpc; -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new Dictionary - { - ["image"] = new() { - Data = {0.9f, 0.1f, 0.1f, 0.2f} - }, - ["text"] = new() { - Data = {0.4f, 0.7f, 0.1f, 0.8f, 0.1f} - }, - ["text-sparse"] = ([0.1f, 0.2f, 0.3f, 0.4f], [1, 3, 5, 7]), - } - } - } -); +var client = new QdrantClient("localhost", 6334); +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + searchParams: new SearchParams + { + Quantization = new QuantizationSearchParams { Ignore = true } + }, + limit: 10 +); ``` ```go @@ -43978,50 +62092,33 @@ import ( "github.com/qdrant/go-client/qdrant" ) -client.Upsert(context.Background(), &qdrant.UpsertPoints{ +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.Query(context.Background(), &qdrant.QueryPoints{ CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectorsMap(map[string]*qdrant.Vector{ - "image": qdrant.NewVector(0.9, 0.1, 0.1, 0.2), - "text": qdrant.NewVector(0.4, 0.7, 0.1, 0.8, 0.1), - "text-sparse": qdrant.NewVectorSparse( - []uint32{1, 3, 5, 7}, - []float32{0.1, 0.2, 0.3, 0.4}), - }), + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Params: &qdrant.SearchParams{ + Quantization: 
&qdrant.QuantizationSearchParams{ + Ignore: qdrant.PtrOf(false), }, }, }) - -``` - -To search with named vectors (available in `query` API): - -httppythontypescriptrustjavacsharpgo +``` ```http POST /collections/{collection_name}/points/query { "query": [0.2, 0.1, 0.9, 0.7], - "using": "image", - "limit": 3 + "params": { + "quantization": { + "ignore": true + } + }, + "limit": 10 } - -``` - -```python -from qdrant_client import QdrantClient - -client = QdrantClient(url="http://localhost:6333") - -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - using="image", - limit=3, -) - ``` ```typescript @@ -44030,15 +62127,19 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - using: "image", - limit: 3, + query: [0.2, 0.1, 0.9, 0.7], + params: { + quantization: { + ignore: true, + }, + }, }); - ``` ```rust -use qdrant_client::qdrant::QueryPointsBuilder; +use qdrant_client::qdrant::{ + QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, +}; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; @@ -44048,45 +62149,90 @@ client QueryPointsBuilder::new("{collection_name}") .query(vec![0.2, 0.1, 0.9, 0.7]) .limit(3) - .using("image"), + .params( + SearchParamsBuilder::default() + .quantization(QuantizationSearchParamsBuilder::default().ignore(true)), + ), ) .await?; - ``` ```java -import java.util.List; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QuantizationSearchParams; import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; import static io.qdrant.client.QueryFactory.nearest; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setUsing("image") - .setLimit(3) - .build()).get(); +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setParams( + SearchParams.newBuilder() + .setQuantization( + QuantizationSearchParams.newBuilder().setIgnore(true).build()) + .build()) + .setLimit(10) + .build()) + .get(); +``` + +- **Adjust the quantile parameter**: The quantile parameter in scalar quantization determines the quantization bounds. +By setting it to a value lower than 1.0, you can exclude extreme values (outliers) from the quantization bounds. +For example, if you set the quantile to 0.99, 1% of the extreme values will be excluded. +By adjusting the quantile, you find an optimal value that will provide the best search quality for your collection. + +- **Enable rescore**: Having the original vectors available, Qdrant can re-evaluate top-k search results using the original vectors. On large collections, this can improve the search quality, with just minor performance impact. + +#### Memory and speed tuning + +In this section, we will discuss how to tune the memory and speed of the search process with quantization. + +There are 3 possible modes to place storage of vectors within the qdrant collection: + +- **All in RAM** - all vector, original and quantized, are loaded and kept in RAM. This is the fastest mode, but requires a lot of RAM. Enabled by default. 
+- **Original on Disk, quantized in RAM** - this is a hybrid mode, allows to obtain a good balance between speed and memory usage. Recommended scenario if you are aiming to shrink the memory footprint while keeping the search speed. + +This mode is enabled by setting `always_ram` to `true` in the quantization config while using memmap storage: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") + +client.create_collection( + collection_name="{collection_name}", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + always_ram=True, + ), + ), +) ``` ```csharp using Qdrant.Client; +using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( +await client.CreateCollectionAsync( collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - usingVector: "image", - limit: 3 + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, + quantizationConfig: new QuantizationConfig + { + Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } + } ); - ``` ```go @@ -44101,93 +62247,35 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Using: qdrant.PtrOf("image"), + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), + QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + AlwaysRam: qdrant.PtrOf(true), + }), }) - -``` - -## [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#datatypes) Datatypes - -Newest versions of embeddings models generate vectors with very large dimentionalities. -With OpenAI’s `text-embedding-3-large` embedding model, the dimensionality can go up to 3072. - -The amount of memory required to store such vectors grows linearly with the dimensionality, -so it is important to choose the right datatype for the vectors. - -The choice between datatypes is a trade-off between memory consumption and precision of vectors. - -Qdrant supports a number of datatypes for both dense and sparse vectors: - -**Float32** - -This is the default datatype for vectors in Qdrant. It is a 32-bit (4 bytes) floating-point number. -The standard OpenAI embedding of 1536 dimensionality will require 6KB of memory to store in Float32. - -You don’t need to specify the datatype for vectors in Qdrant, as it is set to Float32 by default. - -**Float16** - -This is a 16-bit (2 bytes) floating-point number. It is also known as half-precision float. -Intuitively, it looks like this: - -```text -float32 -> float16 delta (float32 - float16).abs - -0.79701585 -> 0.796875 delta 0.00014084578 -0.7850789 -> 0.78515625 delta 0.00007736683 -0.7775044 -> 0.77734375 delta 0.00016063452 -0.85776305 -> 0.85791016 delta 0.00014710426 -0.6616839 -> 0.6616211 delta 0.000062823296 - ``` -The main advantage of Float16 is that it requires half the memory of Float32, while having virtually no impact on the quality of vector search. 
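For intuition, a comparison like the one above can be reproduced on your own embeddings with a short NumPy sketch (NumPy is an assumption here, not something the page requires); the exact deltas will vary with your data:

```python
import numpy as np

# Hypothetical embeddings; replace with vectors from your own model.
vectors = np.random.rand(1000, 1536).astype(np.float32)
halved = vectors.astype(np.float16)

print(f"float32 size: {vectors.nbytes / 1024:.0f} KiB")  # ~6000 KiB
print(f"float16 size: {halved.nbytes / 1024:.0f} KiB")   # ~3000 KiB, half the memory
print(f"max abs delta: {np.abs(vectors - halved.astype(np.float32)).max():.6f}")
```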
- -To use Float16, you need to specify the datatype for vectors in the collection configuration: - -httppythontypescriptrustjavacsharpgo - ```http PUT /collections/{collection_name} { - "vectors": { - "size": 128, - "distance": "Cosine", - "datatype": "float16" // <-- For dense vectors - }, - "sparse_vectors": { - "text": { - "index": { - "datatype": "float16" // <-- And for sparse vectors - } + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + }, + "quantization_config": { + "scalar": { + "type": "int8", + "always_ram": true + } } - } } - -``` - -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") - -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=128, - distance=models.Distance.COSINE, - datatype=models.Datatype.FLOAT16 - ), - sparse_vectors_config={ - "text": models.SparseVectorParams( - index=models.SparseIndexParams(datatype=models.Datatype.FLOAT16) - ), - }, -) - ``` ```typescript @@ -44197,81 +62285,98 @@ const client = new QdrantClient({ host: "localhost", port: 6333 }); client.createCollection("{collection_name}", { vectors: { - size: 128, + size: 768, distance: "Cosine", - datatype: "float16" + on_disk: true, + }, + quantization_config: { + scalar: { + type: "int8", + always_ram: true, + }, }, - sparse_vectors: { - text: { - index: { - datatype: "float16" - } - } - } }); - ``` ```rust use qdrant_client::qdrant::{ - CreateCollectionBuilder, Datatype, Distance, SparseIndexConfigBuilder, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, VectorParamsBuilder + CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, + VectorParamsBuilder, }; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; -let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); -sparse_vector_config.add_named_vector_params( - "text", - SparseVectorParamsBuilder::default() - .index(SparseIndexConfigBuilder::default().datatype(Datatype::Float32)), -); - -let create_collection = CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vector_config) - .vectors_config( - VectorParamsBuilder::new(128, Distance::Cosine).datatype(Datatype::Float16), - ); - -client.create_collection(create_collection).await?; - +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) + .quantization_config( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .always_ram(true), + ), + ) + .await?; ``` ```java import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Datatype; import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.SparseIndexConfig; -import io.qdrant.client.grpc.Collections.SparseVectorConfig; -import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; import io.qdrant.client.grpc.Collections.VectorParams; import io.qdrant.client.grpc.Collections.VectorsConfig; -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, 
false).build()); +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig(VectorsConfig.newBuilder() - .setParams(VectorParams.newBuilder() - .setSize(128) - .setDistance(Distance.Cosine) - .setDatatype(Datatype.Float16) - .build()) - .build()) - .setSparseVectorsConfig( - SparseVectorConfig.newBuilder() - .putMap("text", SparseVectorParams.newBuilder() - .setIndex(SparseIndexConfig.newBuilder() - .setDatatype(Datatype.Float16) - .build()) - .build())) - .build()) - .get(); + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setAlwaysRam(true) + .build()) + .build()) + .build()) + .get(); +``` + +In this scenario, the number of disk reads may play a significant role in the search speed. +In a system with high disk latency, the re-scoring step may become a bottleneck. + +Consider disabling `rescore` to improve the search speed: + +```python +from qdrant_client import QdrantClient, models + +client = QdrantClient(url="http://localhost:6333") +client.query_points( + collection_name="{collection_name}", + query=[0.2, 0.1, 0.9, 0.7], + search_params=models.SearchParams( + quantization=models.QuantizationSearchParams(rescore=False) + ), +) ``` ```csharp @@ -44280,23 +62385,15 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { - Size = 128, - Distance = Distance.Cosine, - Datatype = Datatype.Float16 - }, - sparseVectorsConfig: ( - "text", - new SparseVectorParams { - Index = new SparseIndexConfig { - Datatype = Datatype.Float16 - } - } - ) +await client.QueryAsync( + collectionName: "{collection_name}", + query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, + searchParams: new SearchParams + { + Quantization = new QuantizationSearchParams { Rescore = false } + }, + limit: 3 ); - ``` ```go @@ -44311,60 +62408,98 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ +client.Query(context.Background(), &qdrant.QueryPoints{ CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 128, - Distance: qdrant.Distance_Cosine, - Datatype: qdrant.Datatype_Float16.Enum(), - }), - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "text": { - Index: &qdrant.SparseIndexConfig{ - Datatype: qdrant.Datatype_Float16.Enum(), - }, - }, - }), + Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), + Params: &qdrant.SearchParams{ + Quantization: &qdrant.QuantizationSearchParams{ + Rescore: qdrant.PtrOf(false), + }, + }, }) +``` +```http +POST /collections/{collection_name}/points/query +{ + "query": [0.2, 0.1, 0.9, 0.7], + "params": { + "quantization": { + "rescore": false + } + }, + "limit": 10 +} ``` -**Uint8** +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Another step towards memory optimization is to use the Uint8 datatype for vectors. 
-Unlike Float16, Uint8 is not a floating-point number, but an integer number in the range from 0 to 255. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Not all embeddings models generate vectors in the range from 0 to 255, so you need to be careful when using Uint8 datatype. +client.query("{collection_name}", { + query: [0.2, 0.1, 0.9, 0.7], + params: { + quantization: { + rescore: false, + }, + }, +}); +``` -In order to convert a number from float range to Uint8 range, you need to apply a process called quantization. +```rust +use qdrant_client::qdrant::{ + QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, +}; +use qdrant_client::Qdrant; -Some embedding providers may provide embeddings in a pre-quantized format. -One of the most notable examples is the [Cohere int8 & binary embeddings](https://cohere.com/blog/int8-binary-embeddings). +let client = Qdrant::from_url("http://localhost:6334").build()?; -For other embeddings, you will need to apply quantization yourself. +client + .query( + QueryPointsBuilder::new("{collection_name}") + .query(vec![0.2, 0.1, 0.9, 0.7]) + .limit(3) + .params( + SearchParamsBuilder::default() + .quantization(QuantizationSearchParamsBuilder::default().rescore(false)), + ), + ) + .await?; +``` + +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QuantizationSearchParams; +import io.qdrant.client.grpc.Points.QueryPoints; +import io.qdrant.client.grpc.Points.SearchParams; -httppythontypescriptrustjavacsharpgo +import static io.qdrant.client.QueryFactory.nearest; -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 128, - "distance": "Cosine", - "datatype": "uint8" // <-- For dense vectors - }, - "sparse_vectors": { - "text": { - "index": { - "datatype": "uint8" // <-- For sparse vectors - } - } - } -} +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) + .setParams( + SearchParams.newBuilder() + .setQuantization( + QuantizationSearchParams.newBuilder().setRescore(false).build()) + .build()) + .setLimit(3) + .build()) + .get(); ``` +- **All on Disk** - all vectors, original and quantized, are stored on disk. This mode allows to achieve the smallest memory footprint, but at the cost of the search speed. + +It is recommended to use this mode if you have a large collection and fast storage (e.g. SSD or NVMe). 
+ +This mode is enabled by setting `always_ram` to `false` in the quantization config while using mmap storage: + ```python from qdrant_client import QdrantClient, models @@ -44372,16 +62507,75 @@ client = QdrantClient(url="http://localhost:6333") client.create_collection( collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=128, distance=models.Distance.COSINE, datatype=models.Datatype.UINT8 - ), - sparse_vectors_config={ - "text": models.SparseVectorParams( - index=models.SparseIndexParams(datatype=models.Datatype.UINT8) + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + always_ram=False, ), - }, + ), +) +``` + +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; + +var client = new QdrantClient("localhost", 6334); + +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true}, + quantizationConfig: new QuantizationConfig + { + Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = false } + } +); +``` + +```go +import ( + "context" + + "github.com/qdrant/go-client/qdrant" ) +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "localhost", + Port: 6334, +}) + +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ + CollectionName: "{collection_name}", + VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ + Size: 768, + Distance: qdrant.Distance_Cosine, + OnDisk: qdrant.PtrOf(true), + }), + QuantizationConfig: qdrant.NewQuantizationScalar( + &qdrant.ScalarQuantization{ + Type: qdrant.QuantizationType_Int8, + AlwaysRam: qdrant.PtrOf(false), + }, + ), +}) +``` + +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 768, + "distance": "Cosine", + "on_disk": true + }, + "quantization_config": { + "scalar": { + "type": "int8", + "always_ram": false + } + } +} ``` ```typescript @@ -44391,12780 +62585,14019 @@ const client = new QdrantClient({ host: "localhost", port: 6333 }); client.createCollection("{collection_name}", { vectors: { - size: 128, + size: 768, distance: "Cosine", - datatype: "uint8" + on_disk: true, + }, + quantization_config: { + scalar: { + type: "int8", + always_ram: false, + }, }, - sparse_vectors: { - text: { - index: { - datatype: "uint8" - } - } - } }); - ``` ```rust use qdrant_client::qdrant::{ - CreateCollectionBuilder, Datatype, Distance, SparseIndexConfigBuilder, - SparseVectorParamsBuilder, SparseVectorsConfigBuilder, VectorParamsBuilder, + CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, + VectorParamsBuilder, }; - use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; -let mut sparse_vector_config = SparseVectorsConfigBuilder::default(); - -sparse_vector_config.add_named_vector_params( - "text", - SparseVectorParamsBuilder::default() - .index(SparseIndexConfigBuilder::default().datatype(Datatype::Uint8)), -); -let create_collection = CreateCollectionBuilder::new("{collection_name}") - .sparse_vectors_config(sparse_vector_config) - .vectors_config( - VectorParamsBuilder::new(128, Distance::Cosine) - .datatype(Datatype::Uint8) - ); - -client.create_collection(create_collection).await?; - +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .vectors_config(VectorParamsBuilder::new(768, 
Distance::Cosine).on_disk(true)) + .quantization_config( + ScalarQuantizationBuilder::default() + .r#type(QuantizationType::Int8.into()) + .always_ram(false), + ), + ) + .await?; ``` ```java import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Datatype; import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.SparseIndexConfig; -import io.qdrant.client.grpc.Collections.SparseVectorConfig; -import io.qdrant.client.grpc.Collections.SparseVectorParams; +import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; +import io.qdrant.client.grpc.Collections.QuantizationConfig; +import io.qdrant.client.grpc.Collections.QuantizationType; +import io.qdrant.client.grpc.Collections.ScalarQuantization; import io.qdrant.client.grpc.Collections.VectorParams; import io.qdrant.client.grpc.Collections.VectorsConfig; -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig(VectorsConfig.newBuilder() - .setParams(VectorParams.newBuilder() - .setSize(128) - .setDistance(Distance.Cosine) - .setDatatype(Datatype.Uint8) - .build()) - .build()) - .setSparseVectorsConfig( - SparseVectorConfig.newBuilder() - .putMap("text", SparseVectorParams.newBuilder() - .setIndex(SparseIndexConfig.newBuilder() - .setDatatype(Datatype.Uint8) - .build()) - .build())) - .build()) - .get(); - + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig( + VectorsConfig.newBuilder() + .setParams( + VectorParams.newBuilder() + .setSize(768) + .setDistance(Distance.Cosine) + .setOnDisk(true) + .build()) + .build()) + .setQuantizationConfig( + QuantizationConfig.newBuilder() + .setScalar( + ScalarQuantization.newBuilder() + .setType(QuantizationType.Int8) + .setAlwaysRam(false) + .build()) + .build()) + .build()) + .get(); ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +<|page-193-lllmstxt|> +# Monitoring & Telemetry -var client = new QdrantClient("localhost", 6334); +Qdrant exposes its metrics in [Prometheus](https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format)/[OpenMetrics](https://github.com/OpenObservability/OpenMetrics) format, so you can integrate them easily +with the compatible tools and monitor Qdrant with your own monitoring system. You can +use the `/metrics` endpoint and configure it as a scrape target. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { - Size = 128, - Distance = Distance.Cosine, - Datatype = Datatype.Uint8 - }, - sparseVectorsConfig: ( - "text", - new SparseVectorParams { - Index = new SparseIndexConfig { - Datatype = Datatype.Uint8 - } - } - ) -); +Metrics endpoint: -``` +The integration with Qdrant is easy to +[configure](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) +with Prometheus and Grafana. 
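For illustration, a minimal Prometheus scrape configuration for a single node might look like the sketch below; the job name, interval, and target address are assumptions to adapt to your deployment:

```yaml
# prometheus.yml (sketch): scrape a local Qdrant node every 15 seconds
scrape_configs:
  - job_name: "qdrant"
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ["localhost:6333"]
```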
-```go -import ( - "context" +## Monitoring multi-node clusters - "github.com/qdrant/go-client/qdrant" -) +When scraping metrics from multi-node Qdrant clusters, it is important to scrape from +each node individually instead of using a load-balanced URL. Otherwise, your metrics will appear inconsistent after each scrape. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## Monitoring in Qdrant Cloud -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 128, - Distance: qdrant.Distance_Cosine, - Datatype: qdrant.Datatype_Uint8.Enum(), - }), - SparseVectorsConfig: qdrant.NewSparseVectorsConfig( - map[string]*qdrant.SparseVectorParams{ - "text": { - Index: &qdrant.SparseIndexConfig{ - Datatype: qdrant.Datatype_Uint8.Enum(), - }, - }, - }), -}) +Qdrant Cloud offers additional metrics and telemetry that are not available in the open-source version. For more information, see [Qdrant Cloud Monitoring](/documentation/cloud/cluster-monitoring/). -``` +## Exposed metrics -## [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#quantization) Quantization +There are two endpoints avaliable: -Apart from changing the datatype of the original vectors, Qdrant can create quantized representations of vectors alongside the original ones. -This quantized representation can be used to quickly select candidates for rescoring with the original vectors or even used directly for search. +- `/metrics` is the direct endpoint of the underlying Qdrant database node. -Quantization is applied in the background, during the optimization process. +- `/sys_metrics` is a Qdrant cloud-only endpoint that provides additional operational and infrastructure metrics about your cluster, like CPU, memory and disk utilisation, collection metrics and load balancer telemetry. For more information, see [Qdrant Cloud Monitoring](/documentation/cloud/cluster-monitoring/). -More information about the quantization process can be found in the [Quantization](https://qdrant.tech/documentation/guides/quantization/) section. -## [Anchor](https://qdrant.tech/documentation/concepts/vectors/\#vector-storage) Vector Storage +### Node metrics `/metrics` -Depending on the requirements of the application, Qdrant can use one of the data storage options. -Keep in mind that you will have to tradeoff between search speed and the size of RAM used. +Each Qdrant server will expose the following metrics. -More information about the storage options can be found in the [Storage](https://qdrant.tech/documentation/concepts/storage/#vector-storage) section. 
+| Name | Type | Meaning | +| ----------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| app_info | gauge | Information about Qdrant server | +| app_status_recovery_mode | gauge | If Qdrant is currently started in recovery mode | +| collections_total | gauge | Number of collections | +| collections_vector_total | gauge | Total number of vectors in all collections | +| collections_full_total | gauge | Number of full collections | +| collections_aggregated_total | gauge | Number of aggregated collections | +| rest_responses_total | counter | Total number of responses through REST API | +| rest_responses_fail_total | counter | Total number of failed responses through REST API | +| rest_responses_avg_duration_seconds | gauge | Average response duration in REST API | +| rest_responses_min_duration_seconds | gauge | Minimum response duration in REST API | +| rest_responses_max_duration_seconds | gauge | Maximum response duration in REST API | +| grpc_responses_total | counter | Total number of responses through gRPC API | +| grpc_responses_fail_total | counter | Total number of failed responses through REST API | +| grpc_responses_avg_duration_seconds | gauge | Average response duration in gRPC API | +| grpc_responses_min_duration_seconds | gauge | Minimum response duration in gRPC API | +| grpc_responses_max_duration_seconds | gauge | Maximum response duration in gRPC API | +| cluster_enabled | gauge | Whether the cluster support is enabled. 1 - YES | +| memory_active_bytes | gauge | Total number of bytes in active pages allocated by the application. [Reference](https://jemalloc.net/jemalloc.3.html#stats.active) | +| memory_allocated_bytes | gauge | Total number of bytes allocated by the application. [Reference](https://jemalloc.net/jemalloc.3.html#stats.allocated) | +| memory_metadata_bytes | gauge | Total number of bytes dedicated to allocator metadata. [Reference](https://jemalloc.net/jemalloc.3.html#stats.metadata) | +| memory_resident_bytes | gauge | Maximum number of bytes in physically resident data pages mapped. [Reference](https://jemalloc.net/jemalloc.3.html#stats.resident) | +| memory_retained_bytes | gauge | Total number of bytes in virtual memory mappings. [Reference](https://jemalloc.net/jemalloc.3.html#stats.retained) | +| collection_hardware_metric_cpu | gauge | CPU measurements of a collection (Experimental) | -##### Was this page useful? +**Cluster-related metrics** -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +There are also some metrics which are exposed in distributed mode only. -Thank you for your feedback! 🙏 +| Name | Type | Meaning | +| -------------------------------- | ------- | ---------------------------------------------------------------------- | +| cluster_peers_total | gauge | Total number of cluster peers | +| cluster_term | counter | Current cluster term | +| cluster_commit | counter | Index of last committed (finalized) operation cluster peer is aware of | +| cluster_pending_operations_total | gauge | Total number of pending operations for cluster peer | +| cluster_voter | gauge | Whether the cluster peer is a voter or learner. 1 - VOTER | -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/vectors.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Telemetry endpoint -On this page: +Qdrant also provides a `/telemetry` endpoint, which provides information about the current state of the database, including the number of vectors, shards, and other useful information. You can find a full documentation of this endpoint in the [API reference](https://api.qdrant.tech/api-reference/service/telemetry). -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/vectors.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Kubernetes health endpoints -× +*Available as of v1.5.0* -[Powered by](https://qdrant.tech/) +Qdrant exposes three endpoints, namely +[`/healthz`](http://localhost:6333/healthz), +[`/livez`](http://localhost:6333/livez) and +[`/readyz`](http://localhost:6333/readyz), to indicate the current status of the +Qdrant server. -<|page-117-lllmstxt|> -## io_uring -- [Articles](https://qdrant.tech/articles/) -- Qdrant under the hood: io\_uring +These currently provide the most basic status response, returning HTTP 200 if +Qdrant is started and ready to be used. -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +Regardless of whether an [API key](/documentation/guides/security/#authentication) is configured, +the endpoints are always accessible. -# Qdrant under the hood: io\_uring +You can read more about Kubernetes health endpoints +[here](https://kubernetes.io/docs/reference/using-api/health-checks/). -Andre Bogus +<|page-194-lllmstxt|> +# Configuration -· +Qdrant ships with sensible defaults for collection and network settings that are suitable for most use cases. You can view these defaults in the [Qdrant source](https://github.com/qdrant/qdrant/blob/master/config/config.yaml). If you need to customize the settings, you can do so using configuration files and environment variables. -June 21, 2023 + -![Qdrant under the hood: io_uring](https://qdrant.tech/articles_data/io_uring/preview/title.jpg) +## Configuration Files -With Qdrant [version 1.3.0](https://github.com/qdrant/qdrant/releases/tag/v1.3.0) we -introduce the alternative io\_uring based _async uring_ storage backend on -Linux-based systems. Since its introduction, io\_uring has been known to improve -async throughput wherever the OS syscall overhead gets too high, which tends to -occur in situations where software becomes _IO bound_ (that is, mostly waiting -on disk). +To customize Qdrant, you can mount your configuration file in any of the following locations. This guide uses `.yaml` files, but Qdrant also supports other formats such as `.toml`, `.json`, and `.ini`. -## [Anchor](https://qdrant.tech/articles/io_uring/\#inputoutput) Input+Output +1. **Main Configuration: `qdrant/config/config.yaml`** -Around the mid-90s, the internet took off. The first servers used a process- -per-request setup, which was good for serving hundreds if not thousands of -concurrent request. The POSIX Input + Output (IO) was modeled in a strictly -synchronous way. The overhead of starting a new process for each request made -this model unsustainable. So servers started forgoing process separation, opting -for the thread-per-request model. But even that ran into limitations. 
+ Mount your custom `config.yaml` file to override default settings: -I distinctly remember when someone asked the question whether a server could -serve 10k concurrent connections, which at the time exhausted the memory of -most systems (because every thread had to have its own stack and some other -metadata, which quickly filled up available memory). As a result, the -synchronous IO was replaced by asynchronous IO during the 2.5 kernel update, -either via `select` or `epoll` (the latter being Linux-only, but a small bit -more efficient, so most servers of the time used it). + ```bash + docker run -p 6333:6333 \ + -v $(pwd)/config.yaml:/qdrant/config/config.yaml \ + qdrant/qdrant + ``` -However, even this crude form of asynchronous IO carries the overhead of at -least one system call per operation. Each system call incurs a context switch, -and while this operation is itself not that slow, the switch disturbs the -caches. Today’s CPUs are much faster than memory, but if their caches start to -miss data, the memory accesses required led to longer and longer wait times for -the CPU. +2. **Environment-Specific Configuration: `config/{RUN_MODE}.yaml`** -### [Anchor](https://qdrant.tech/articles/io_uring/\#memory-mapped-io) Memory-mapped IO + Qdrant looks for an environment-specific configuration file based on the `RUN_MODE` variable. By default, the [official Docker image](https://hub.docker.com/r/qdrant/qdrant) uses `RUN_MODE=production`, meaning it will look for `config/production.yaml`. -Another way of dealing with file IO (which unlike network IO doesn’t have a hard -time requirement) is to map parts of files into memory - the system fakes having -that chunk of the file in memory, so when you read from a location there, the -kernel interrupts your process to load the needed data from disk, and resumes -your process once done, whereas writing to the memory will also notify the -kernel. Also the kernel can prefetch data while the program is running, thus -reducing the likelyhood of interrupts. + You can override this by setting `RUN_MODE` to another value (e.g., `dev`), and providing the corresponding file: -Thus there is still some overhead, but (especially in asynchronous -applications) it’s far less than with `epoll`. The reason this API is rarely -used in web servers is that these usually have a large variety of files to -access, unlike a database, which can map its own backing store into memory -once. + ```bash + docker run -p 6333:6333 \ + -v $(pwd)/dev.yaml:/qdrant/config/dev.yaml \ + -e RUN_MODE=dev \ + qdrant/qdrant + ``` -### [Anchor](https://qdrant.tech/articles/io_uring/\#combating-the-poll-ution) Combating the Poll-ution +3. **Local Configuration: `config/local.yaml`** -There were multiple experiments to improve matters, some even going so far as -moving a HTTP server into the kernel, which of course brought its own share of -problems. Others like Intel added their own APIs that ignored the kernel and -worked directly on the hardware. + The `local.yaml` file is typically used for machine-specific settings that are not tracked in version control: -Finally, Jens Axboe took matters into his own hands and proposed a ring buffer -based interface called _io\_uring_. The buffers are not directly for data, but -for operations. User processes can setup a Submission Queue (SQ) and a -Completion Queue (CQ), both of which are shared between the process and the -kernel, so there’s no copying overhead. 
+ ```bash + docker run -p 6333:6333 \ + -v $(pwd)/local.yaml:/qdrant/config/local.yaml \ + qdrant/qdrant + ``` -![io_uring diagram](https://qdrant.tech/articles_data/io_uring/io-uring.png) +4. **Custom Configuration via `--config-path`** -Apart from avoiding copying overhead, the queue-based architecture lends -itself to multithreading as item insertion/extraction can be made lockless, -and once the queues are set up, there is no further syscall that would stop -any user thread. + You can specify a custom configuration file path using the `--config-path` argument. This will override other configuration files: -Servers that use this can easily get to over 100k concurrent requests. Today -Linux allows asynchronous IO via io\_uring for network, disk and accessing -other ports, e.g. for printing or recording video. + ```bash + docker run -p 6333:6333 \ + -v $(pwd)/config.yaml:/path/to/config.yaml \ + qdrant/qdrant \ + ./qdrant --config-path /path/to/config.yaml + ``` -## [Anchor](https://qdrant.tech/articles/io_uring/\#and-what-about-qdrant) And what about Qdrant? +For details on how these configurations are loaded and merged, see the [loading order and priority](#loading-order-and-priority). The full list of available configuration options can be found [below](#configuration-options). -Qdrant can store everything in memory, but not all data sets may fit, which can -require storing on disk. Before io\_uring, Qdrant used mmap to do its IO. This -led to some modest overhead in case of disk latency. The kernel may -stop a user thread trying to access a mapped region, which incurs some context -switching overhead plus the wait time until the disk IO is finished. Ultimately, -this works very well with the asynchronous nature of Qdrant’s core. +## Environment Variables -One of the great optimizations Qdrant offers is quantization (either -[scalar](https://qdrant.tech/articles/scalar-quantization/) or -[product](https://qdrant.tech/articles/product-quantization/)-based). -However unless the collection resides fully in memory, this optimization -method generates significant disk IO, so it is a prime candidate for possible -improvements. +You can also configure Qdrant using environment variables, which always take the highest priority and override any file-based settings. -If you run Qdrant on Linux, you can enable io\_uring with the following in your -configuration: +Environment variables follow this format: they should be prefixed with `QDRANT__`, and nested properties should be separated by double underscores (`__`). For example: + +```bash +docker run -p 6333:6333 \ + -e QDRANT__LOG_LEVEL=INFO \ + -e QDRANT__SERVICE__API_KEY= \ + -e QDRANT__SERVICE__ENABLE_TLS=1 \ + -e QDRANT__TLS__CERT=./tls/cert.pem \ + qdrant/qdrant +``` + +This results in the following configuration: ```yaml -# within the storage config -storage: - # enable the async scorer which uses io_uring - async_scorer: true +log_level: INFO +service: + enable_tls: true + api_key: +tls: + cert: ./tls/cert.pem +``` + +## Loading Order and Priority + +During startup, Qdrant merges multiple configuration sources into a single effective configuration. The loading order is as follows (from least to most significant): + +1. Embedded default configuration +2. `config/config.yaml` +3. `config/{RUN_MODE}.yaml` +4. `config/local.yaml` +5. Custom configuration file +6. 
Environment variables + +### Overriding Behavior +Settings from later sources in the list override those from earlier sources: + +- Settings in `config/{RUN_MODE}.yaml` (3) will override those in `config/config.yaml` (2). +- A custom configuration file provided via `--config-path` (5) will override all other file-based settings. +- Environment variables (6) have the highest priority and will override any settings from files. + +## Configuration Validation + +Qdrant validates the configuration during startup. If any issues are found, the server will terminate immediately, providing information about the error. For example: + +```console +Error: invalid type: 64-bit integer `-1`, expected an unsigned 64-bit or smaller integer for key `storage.hnsw_index.max_indexing_threads` in config/production.yaml ``` -You can return to the mmap based backend by either deleting the `async_scorer` -entry or setting the value to `false`. +This ensures that misconfigurations are caught early, preventing Qdrant from running with invalid settings. -## [Anchor](https://qdrant.tech/articles/io_uring/\#benchmarks) Benchmarks +## Configuration Options -To run the benchmark, use a test instance of Qdrant. If necessary spin up a -docker container and load a snapshot of the collection you want to benchmark -with. You can copy and edit our [benchmark script](https://qdrant.tech/articles_data/io_uring/rescore-benchmark.sh) -to run the benchmark. Run the script with and without enabling -`storage.async_scorer` and once. You can measure IO usage with `iostat` from -another console. +The following YAML example describes the available configuration options. -For our benchmark, we chose the laion dataset picking 5 million 768d entries. -We enabled scalar quantization + HNSW with m=16 and ef\_construct=512. -We do the quantization in RAM, HNSW in RAM but keep the original vectors on -disk (which was a network drive rented from Hetzner for the benchmark). +```yaml +log_level: INFO -If you want to reproduce the benchmarks, you can get snapshots containing the -datasets: +# Logging configuration +# Qdrant logs to stdout. You may configure to also write logs to a file on disk. +# Be aware that this file may grow indefinitely. +# logger: +# # Logging format, supports `text` and `json` +# format: text +# on_disk: +# enabled: true +# log_file: path/to/log/file.log +# log_level: INFO +# # Logging format, supports `text` and `json` +# format: text -- [mmap only](https://storage.googleapis.com/common-datasets-snapshots/laion-768-6m-mmap.snapshot) -- [with scalar quantization](https://storage.googleapis.com/common-datasets-snapshots/laion-768-6m-sq-m16-mmap.shapshot) +storage: + # Where to store all the data + storage_path: ./storage -Running the benchmark, we get the following IOPS, CPU loads and wall clock times: + # Where to store snapshots + snapshots_path: ./snapshots -| | oversampling | parallel | ~max IOPS | CPU% (of 4 cores) | time (s) (avg of 3) | -| --- | --- | --- | --- | --- | --- | -| io\_uring | 1 | 4 | 4000 | 200 | 12 | -| mmap | 1 | 4 | 2000 | 93 | 43 | -| io\_uring | 1 | 8 | 4000 | 200 | 12 | -| mmap | 1 | 8 | 2000 | 90 | 43 | -| io\_uring | 4 | 8 | 7000 | 100 | 30 | -| mmap | 4 | 8 | 2300 | 50 | 145 | + snapshots_config: + # "local" or "s3" - where to store snapshots + snapshots_storage: local + # s3_config: + # bucket: "" + # region: "" + # access_key: "" + # secret_key: "" -Note that in this case, the IO operations have relatively high latency due to -using a network disk. 
Thus, the kernel takes more time to fulfil the mmap -requests, and application threads need to wait, which is reflected in the CPU -percentage. On the other hand, with the io\_uring backend, the application -threads can better use available cores for the rescore operation without any -IO-induced delays. + # Where to store temporary files + # If null, temporary snapshots are stored in: storage/snapshots_temp/ + temp_path: null -Oversampling is a new feature to improve accuracy at the cost of some -performance. It allows setting a factor, which is multiplied with the `limit` -while doing the search. The results are then re-scored using the original vector -and only then the top results up to the limit are selected. + # If true - point payloads will not be stored in memory. + # It will be read from the disk every time it is requested. + # This setting saves RAM by (slightly) increasing the response time. + # Note: those payload values that are involved in filtering and are indexed - remain in RAM. + # + # Default: true + on_disk_payload: true -## [Anchor](https://qdrant.tech/articles/io_uring/\#discussion) Discussion + # Maximum number of concurrent updates to shard replicas + # If `null` - maximum concurrency is used. + update_concurrency: null -Looking back, disk IO used to be very serialized; re-positioning read-write -heads on moving platter was a slow and messy business. So the system overhead -didn’t matter as much, but nowadays with SSDs that can often even parallelize -operations while offering near-perfect random access, the overhead starts to -become quite visible. While memory-mapped IO gives us a fair deal in terms of -ease of use and performance, we can improve on the latter in exchange for -some modest complexity increase. + # Write-ahead-log related configuration + wal: + # Size of a single WAL segment + wal_capacity_mb: 32 -io\_uring is still quite young, having only been introduced in 2019 with kernel -5.1, so some administrators will be wary of introducing it. Of course, as with -performance, the right answer is usually “it depends”, so please review your -personal risk profile and act accordingly. + # Number of WAL segments to create ahead of actual data requirement + wal_segments_ahead: 0 -## [Anchor](https://qdrant.tech/articles/io_uring/\#best-practices) Best Practices + # Normal node - receives all updates and answers all queries + node_type: "Normal" -If your on-disk collection’s query performance is of sufficiently high -priority to you, enable the io\_uring-based async\_scorer to greatly reduce -operating system overhead from disk IO. On the other hand, if your -collections are in memory only, activating it will be ineffective. Also note -that many queries are not IO bound, so the overhead may or may not become -measurable in your workload. Finally, on-device disks typically carry lower -latency than network drives, which may also affect mmap overhead. + # Listener node - receives all updates, but does not answer search/read queries + # Useful for setting up a dedicated backup node + # node_type: "Listener" -Therefore before you roll out io\_uring, perform the above or a similar -benchmark with both mmap and io\_uring and measure both wall time and IOps). -Benchmarks are always highly use-case dependent, so your mileage may vary. -Still, doing that benchmark once is a small price for the possible performance -wins. Also please -[tell us](https://discord.com/channels/907569970500743200/907569971079569410) -about your benchmark results! 
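As a rough outline of such a comparison (assuming a local test instance, the benchmark script linked above, and `iostat` from the sysstat package):

```bash
# Sketch: compare the mmap and io_uring backends on the same collection.
curl -fsSL https://qdrant.tech/articles_data/io_uring/rescore-benchmark.sh -o rescore-benchmark.sh
chmod +x rescore-benchmark.sh

# Run 1: default mmap backend. Record disk activity while the benchmark runs.
iostat -x 5 > iostat-mmap.log & IOSTAT_PID=$!
time ./rescore-benchmark.sh
kill "$IOSTAT_PID"

# Run 2: set `storage.async_scorer: true` in the config, restart Qdrant,
# then repeat the measurement and compare wall time and IOPS.
iostat -x 5 > iostat-uring.log & IOSTAT_PID=$!
time ./rescore-benchmark.sh
kill "$IOSTAT_PID"
```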
+ performance: + # Number of parallel threads used for search operations. If 0 - auto selection. + max_search_threads: 0 + + # Max number of threads (jobs) for running optimizations across all collections, each thread runs one job. + # If 0 - have no limit and choose dynamically to saturate CPU. + # Note: each optimization job will also use `max_indexing_threads` threads by itself for index building. + max_optimization_threads: 0 + + # CPU budget, how many CPUs (threads) to allocate for an optimization job. + # If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size + # If negative - subtract this number of CPUs from the available CPUs. + # If positive - use this exact number of CPUs. + optimizer_cpu_budget: 0 + + # Prevent DDoS of too many concurrent updates in distributed mode. + # One external update usually triggers multiple internal updates, which breaks internal + # timings. For example, the health check timing and consensus timing. + # If null - auto selection. + update_rate_limit: null + + # Limit for number of incoming automatic shard transfers per collection on this node, does not affect user-requested transfers. + # The same value should be used on all nodes in a cluster. + # Default is to allow 1 transfer. + # If null - allow unlimited transfers. + #incoming_shard_transfers_limit: 1 + + # Limit for number of outgoing automatic shard transfers per collection on this node, does not affect user-requested transfers. + # The same value should be used on all nodes in a cluster. + # Default is to allow 1 transfer. + # If null - allow unlimited transfers. + #outgoing_shard_transfers_limit: 1 + + # Enable async scorer which uses io_uring when rescoring. + # Only supported on Linux, must be enabled in your kernel. + # See: + #async_scorer: false + + optimizers: + # The minimal fraction of deleted vectors in a segment, required to perform segment optimization + deleted_threshold: 0.2 + + # The minimal number of vectors in a segment, required to perform segment optimization + vacuum_min_vector_number: 1000 -##### Was this page useful? + # Target amount of segments optimizer will try to keep. + # Real amount of segments may vary depending on multiple parameters: + # - Amount of stored points + # - Current write RPS + # + # It is recommended to select default number of segments as a factor of the number of search threads, + # so that each segment would be handled evenly by one of the threads. + # If `default_segment_number = 0`, will be automatically selected by the number of available CPUs + default_segment_number: 0 -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + # Do not create segments larger this size (in KiloBytes). + # Large segments might require disproportionately long indexation times, + # therefore it makes sense to limit the size of segments. + # + # If indexation speed have more priority for your - make this parameter lower. + # If search speed is more important - make this parameter higher. + # Note: 1Kb = 1 vector of size 256 + # If not set, will be automatically selected considering the number of available CPUs. + max_segment_size_kb: null -Thank you for your feedback! 🙏 + # Maximum size (in KiloBytes) of vectors to store in-memory per segment. + # Segments larger than this threshold will be stored as read-only memmapped file. 
+ # To enable memmap storage, lower the threshold + # Note: 1Kb = 1 vector of size 256 + # To explicitly disable mmap optimization, set to `0`. + # If not set, will be disabled by default. + memmap_threshold_kb: null -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/io_uring.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + # Maximum size (in KiloBytes) of vectors allowed for plain index. + # Default value based on https://github.com/google-research/google-research/blob/master/scann/docs/algorithms.md + # Note: 1Kb = 1 vector of size 256 + # To explicitly disable vector indexing, set to `0`. + # If not set, the default value will be used. + indexing_threshold_kb: 20000 -On this page: + # Interval between forced flushes. + flush_interval_sec: 5 -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/io_uring.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + # Max number of threads (jobs) for running optimizations per shard. + # Note: each optimization job will also use `max_indexing_threads` threads by itself for index building. + # If null - have no limit and choose dynamically to saturate CPU. + # If 0 - no optimization threads, optimizations will be disabled. + max_optimization_threads: null -× + # This section has the same options as 'optimizers' above. All values specified here will overwrite the collections + # optimizers configs regardless of the config above and the options specified at collection creation. + #optimizers_overwrite: + # deleted_threshold: 0.2 + # vacuum_min_vector_number: 1000 + # default_segment_number: 0 + # max_segment_size_kb: null + # memmap_threshold_kb: null + # indexing_threshold_kb: 20000 + # flush_interval_sec: 5 + # max_optimization_threads: null -[Powered by](https://qdrant.tech/) + # Default parameters of HNSW Index. Could be overridden for each collection or named vector individually + hnsw_index: + # Number of edges per node in the index graph. Larger the value - more accurate the search, more space required. + m: 16 -<|page-118-lllmstxt|> -## dataset-quality -- [Articles](https://qdrant.tech/articles/) -- Finding errors in datasets with Similarity Search + # Number of neighbours to consider during the index building. Larger the value - more accurate the search, more time required to build index. + ef_construct: 100 -[Back to Data Exploration](https://qdrant.tech/articles/data-exploration/) + # Minimal size threshold (in KiloBytes) below which full-scan is preferred over HNSW search. + # This measures the total size of vectors being queried against. + # When the maximum estimated amount of points that a condition satisfies is smaller than + # `full_scan_threshold_kb`, the query planner will use full-scan search instead of HNSW index + # traversal for better performance. + # Note: 1Kb = 1 vector of size 256 + full_scan_threshold_kb: 10000 -# Finding errors in datasets with Similarity Search + # Number of parallel threads used for background index building. + # If 0 - automatically select. + # Best to keep between 8 and 16 to prevent likelihood of building broken/inefficient HNSW graphs. + # On small CPUs, less threads are used. + max_indexing_threads: 0 -George Panchuk + # Store HNSW index on disk. If set to false, index will be stored in RAM. 
Default: false + on_disk: false -· + # Custom M param for hnsw graph built for payload index. If not set, default M will be used. + payload_m: null -July 18, 2022 + # Default shard transfer method to use if none is defined. + # If null - don't have a shard transfer preference, choose automatically. + # If stream_records, snapshot or wal_delta - prefer this specific method. + # More info: https://qdrant.tech/documentation/guides/distributed_deployment/#shard-transfer-method + shard_transfer_method: null -![Finding errors in datasets with Similarity Search](https://qdrant.tech/articles_data/dataset-quality/preview/title.jpg) + # Default parameters for collections + collection: + # Number of replicas of each shard that network tries to maintain + replication_factor: 1 -Nowadays, people create a huge number of applications of various types and solve problems in different areas. -Despite such diversity, they have something in common - they need to process data. -Real-world data is a living structure, it grows day by day, changes a lot and becomes harder to work with. + # How many replicas should apply the operation for us to consider it successful + write_consistency_factor: 1 -In some cases, you need to categorize or label your data, which can be a tough problem given its scale. -The process of splitting or labelling is error-prone and these errors can be very costly. -Imagine that you failed to achieve the desired quality of the model due to inaccurate labels. -Worse, your users are faced with a lot of irrelevant items, unable to find what they need and getting annoyed by it. -Thus, you get poor retention, and it directly impacts company revenue. -It is really important to avoid such errors in your data. + # Default parameters for vectors. + vectors: + # Whether vectors should be stored in memory or on disk. + on_disk: null -## [Anchor](https://qdrant.tech/articles/dataset-quality/\#furniture-web-marketplace) Furniture web-marketplace + # shard_number_per_node: 1 -Let’s say you work on an online furniture marketplace. + # Default quantization configuration. + # More info: https://qdrant.tech/documentation/guides/quantization + quantization: null -![Furniture marketplace](https://storage.googleapis.com/demo-dataset-quality-public/article/furniture_marketplace.png) + # Default strict mode parameters for newly created collections. + strict_mode: + # Whether strict mode is enabled for a collection or not. + enabled: false -Furniture marketplace + # Max allowed `limit` parameter for all APIs that don't have their own max limit. + max_query_limit: null -In this case, to ensure a good user experience, you need to split items into different categories: tables, chairs, beds, etc. -One can arrange all the items manually and spend a lot of money and time on this. -There is also another way: train a classification or similarity model and rely on it. -With both approaches it is difficult to avoid mistakes. -Manual labelling is a tedious task, but it requires concentration. -Once you got distracted or your eyes became blurred mistakes won’t keep you waiting. -The model also can be wrong. -You can analyse the most uncertain predictions and fix them, but the other errors will still leak to the site. -There is no silver bullet. You should validate your dataset thoroughly, and you need tools for this. + # Max allowed `timeout` parameter. + max_timeout: null -When you are sure that there are not many objects placed in the wrong category, they can be considered outliers or anomalies. 
-Thus, you can train a model or a bunch of models capable of looking for anomalies, e.g. autoencoder and a classifier on it. -However, this is again a resource-intensive task, both in terms of time and manual labour, since labels have to be provided for classification. -On the contrary, if the proportion of out-of-place elements is high enough, outlier search methods are likely to be useless. + # Allow usage of unindexed fields in retrieval based (eg. search) filters. + unindexed_filtering_retrieve: null -### [Anchor](https://qdrant.tech/articles/dataset-quality/\#similarity-search) Similarity search + # Allow usage of unindexed fields in filtered updates (eg. delete by payload). + unindexed_filtering_update: null -The idea behind similarity search is to measure semantic similarity between related parts of the data. -E.g. between category title and item images. -The hypothesis is, that unsuitable items will be less similar. + # Max HNSW value allowed in search parameters. + search_max_hnsw_ef: null -We can’t directly compare text and image data. -For this we need an intermediate representation - embeddings. -Embeddings are just numeric vectors containing semantic information. -We can apply a pre-trained model to our data to produce these vectors. -After embeddings are created, we can measure the distances between them. + # Whether exact search is allowed or not. + search_allow_exact: null -Assume we want to search for something other than a single bed in «Single beds» category. + # Max oversampling value allowed in search. + search_max_oversampling: null -![Similarity search](https://storage.googleapis.com/demo-dataset-quality-public/article/similarity_search.png) +service: + # Maximum size of POST data in a single request in megabytes + max_request_size_mb: 32 -Similarity search + # Number of parallel workers used for serving the api. If 0 - equal to the number of available cores. + # If missing - Same as storage.max_search_threads + max_workers: 0 -One of the possible pipelines would look like this: + # Host to bind the service on + host: 0.0.0.0 -- Take the name of the category as an anchor and calculate the anchor embedding. -- Calculate embeddings for images of each object placed into this category. -- Compare obtained anchor and object embeddings. -- Find the furthest. + # HTTP(S) port to bind the service on + http_port: 6333 -For instance, we can do it with the [CLIP](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) model. + # gRPC port to bind the service on. + # If `null` - gRPC is disabled. Default: null + # Comment to disable gRPC: + grpc_port: 6334 -![Category vs. Image](https://storage.googleapis.com/demo-dataset-quality-public/article/category_vs_image_transparent.png) + # Enable CORS headers in REST API. + # If enabled, browsers would be allowed to query REST endpoints regardless of query origin. + # More info: https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS + # Default: true + enable_cors: true -Category vs. Image + # Enable HTTPS for the REST and gRPC API + enable_tls: false -We can also calculate embeddings for titles instead of images, or even for both of them to find more errors. + # Check user HTTPS client certificate against CA file specified in tls config + verify_https_client_certificate: false -![Category vs. Title and Image](https://storage.googleapis.com/demo-dataset-quality-public/article/category_vs_name_and_image_transparent.png) + # Set an api-key. + # If set, all requests must include a header with the api-key. 
+ # example header: `api-key: ` + # + # If you enable this you should also enable TLS. + # (Either above or via an external service like nginx.) + # Sending an api-key over an unencrypted channel is insecure. + # + # Uncomment to enable. + # api_key: your_secret_api_key_here -Category vs. Title and Image + # Set an api-key for read-only operations. + # If set, all requests must include a header with the api-key. + # example header: `api-key: ` + # + # If you enable this you should also enable TLS. + # (Either above or via an external service like nginx.) + # Sending an api-key over an unencrypted channel is insecure. + # + # Uncomment to enable. + # read_only_api_key: your_secret_read_only_api_key_here -As you can see, different approaches can find new errors or the same ones. -Stacking several techniques or even the same techniques with different models may provide better coverage. -Hint: Caching embeddings for the same models and reusing them among different methods can significantly speed up your lookup. + # Uncomment to enable JWT Role Based Access Control (RBAC). + # If enabled, you can generate JWT tokens with fine-grained rules for access control. + # Use generated token instead of API key. + # + # jwt_rbac: true -### [Anchor](https://qdrant.tech/articles/dataset-quality/\#diversity-search) Diversity search + # Hardware reporting adds information to the API responses with a + # hint on how many resources were used to execute the request. + # + # Uncomment to enable. + # hardware_reporting: true -Since pre-trained models have only general knowledge about the data, they can still leave some misplaced items undetected. -You might find yourself in a situation when the model focuses on non-important features, selects a lot of irrelevant elements, and fails to find genuine errors. -To mitigate this issue, you can perform a diversity search. +cluster: + # Use `enabled: true` to run Qdrant in distributed deployment mode + enabled: false -Diversity search is a method for finding the most distinctive examples in the data. -As similarity search, it also operates on embeddings and measures the distances between them. -The difference lies in deciding which point should be extracted next. + # Configuration of the inter-cluster communication + p2p: + # Port for internal communication between peers + port: 6335 -Let’s imagine how to get 3 points with similarity search and then with diversity search. + # Use TLS for communication between peers + enable_tls: false -Similarity: + # Configuration related to distributed consensus algorithm + consensus: + # How frequently peers should ping each other. + # Setting this parameter to lower value will allow consensus + # to detect disconnected nodes earlier, but too frequent + # tick period may create significant network and CPU overhead. + # We encourage you NOT to change this parameter unless you know what you are doing. + tick_period_ms: 100 -1. Calculate distance matrix -2. Choose your anchor -3. Get a vector corresponding to the distances from the selected anchor from the distance matrix -4. Sort fetched vector -5. Get top-3 embeddings +# Set to true to prevent service from sending usage statistics to the developers. +# Read more: https://qdrant.tech/documentation/guides/telemetry +telemetry_disabled: false -Diversity: +# TLS configuration. +# Required if either service.enable_tls or cluster.p2p.enable_tls is true. +tls: + # Server certificate chain file + cert: ./tls/cert.pem -1. Calculate distance matrix -2. 
Initialize starting point (randomly or according to the certain conditions) -3. Get a distance vector for the selected starting point from the distance matrix -4. Find the furthest point -5. Get a distance vector for the new point -6. Find the furthest point from all of already fetched points + # Server private key file + key: ./tls/key.pem -![Diversity search](https://storage.googleapis.com/demo-dataset-quality-public/article/diversity_transparent.png) + # Certificate authority certificate file. + # This certificate will be used to validate the certificates + # presented by other nodes during inter-cluster communication. + # + # If verify_https_client_certificate is true, it will verify + # HTTPS client certificate + # + # Required if cluster.p2p.enable_tls is true. + ca_cert: ./tls/cacert.pem -Diversity search + # TTL in seconds to reload certificate from disk, useful for certificate rotations. + # Only works for HTTPS endpoints. Does not support gRPC (and intra-cluster communication). + # If `null` - TTL is disabled. + cert_ttl: 3600 +``` -Diversity search utilizes the very same embeddings, and you can reuse them. -If your data is huge and does not fit into memory, vector search engines like [Qdrant](https://github.com/qdrant/qdrant) might be helpful. +<|page-195-lllmstxt|> +# Security -Although the described methods can be used independently. But they are simple to combine and improve detection capabilities. -If the quality remains insufficient, you can fine-tune the models using a similarity learning approach (e.g. with [Quaterion](https://quaterion.qdrant.tech/) both to provide a better representation of your data and pull apart dissimilar objects in space. +Please read this page carefully. Although there are various ways to secure your Qdrant instances, **they are unsecured by default**. +You need to enable security measures before production use. Otherwise, they are completely open to anyone -## [Anchor](https://qdrant.tech/articles/dataset-quality/\#conclusion) Conclusion +## Authentication -In this article, we enlightened distance-based methods to find errors in categorized datasets. -Showed how to find incorrectly placed items in the furniture web store. -I hope these methods will help you catch sneaky samples leaked into the wrong categories in your data, and make your users\` experience more enjoyable. +*Available as of v1.2.0* -Poke the [demo](https://dataset-quality.qdrant.tech/). +Qdrant supports a simple form of client authentication using a static API key. +This can be used to secure your instance. -Stay tuned :) +To enable API key based authentication in your own Qdrant instance you must +specify a key in the configuration: -##### Was this page useful? +```yaml +service: + # Set an api-key. + # If set, all requests must include a header with the api-key. + # example header: `api-key: ` + # + # If you enable this you should also enable TLS. + # (Either above or via an external service like nginx.) + # Sending an api-key over an unencrypted channel is insecure. + api_key: your_secret_api_key_here +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Or alternatively, you can use the environment variable: -Thank you for your feedback! 🙏 +```bash +docker run -p 6333:6333 \ + -e QDRANT__SERVICE__API_KEY=your_secret_api_key_here \ + qdrant/qdrant +``` -We are sorry to hear that. 
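Once a key is configured, you can quickly confirm that it is enforced. A minimal check, assuming a local instance started as in the example above and TLS not yet enabled (the endpoint and port here are illustrative):

```bash
# Without the header, the request should now be rejected with an authentication error
curl -X GET http://localhost:6333/collections

# With the header, the same request succeeds
curl -X GET http://localhost:6333/collections \
  --header 'api-key: your_secret_api_key_here'
```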
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dataset-quality.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + -On this page: +For using API key based authentication in Qdrant Cloud see the cloud +[Authentication](/documentation/cloud/authentication/) +section. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dataset-quality.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The API key then needs to be present in all REST or gRPC requests to your instance. +All official Qdrant clients for Python, Go, Rust, .NET and Java support the API key parameter. -× + -[Powered by](https://qdrant.tech/) +```bash +curl \ + -X GET https://localhost:6333 \ + --header 'api-key: your_secret_api_key_here' +``` -<|page-119-lllmstxt|> -## qdrant-fundamentals -- [Documentation](https://qdrant.tech/documentation/) -- [Faq](https://qdrant.tech/documentation/faq/) -- Qdrant Fundamentals +```python +from qdrant_client import QdrantClient -# [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#frequently-asked-questions-general-topics) Frequently Asked Questions: General Topics +client = QdrantClient( + url="https://localhost:6333", + api_key="your_secret_api_key_here", +) +``` -| | | | | | -| --- | --- | --- | --- | --- | -| [Vectors](https://qdrant.tech/documentation/faq/qdrant-fundamentals/#vectors) | [Search](https://qdrant.tech/documentation/faq/qdrant-fundamentals/#search) | [Collections](https://qdrant.tech/documentation/faq/qdrant-fundamentals/#collections) | [Compatibility](https://qdrant.tech/documentation/faq/qdrant-fundamentals/#compatibility) | [Cloud](https://qdrant.tech/documentation/faq/qdrant-fundamentals/#cloud) | +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -## [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#vectors) Vectors +const client = new QdrantClient({ + url: "http://localhost", + port: 6333, + apiKey: "your_secret_api_key_here", +}); +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#what-is-the-maximum-vector-dimension-supported-by-qdrant) What is the maximum vector dimension supported by Qdrant? +```rust +use qdrant_client::Qdrant; -Qdrant supports up to 65,535 dimensions by default, but this can be configured to support higher dimensions. +let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") + .api_key("") + .build()?; +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#what-is-the-maximum-size-of-vector-metadata-that-can-be-stored) What is the maximum size of vector metadata that can be stored? +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -There is no inherent limitation on metadata size, but it should be [optimized for performance and resource usage](https://qdrant.tech/documentation/guides/optimize/). Users can set upper limits in the configuration. +QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder( + "xyz-example.eu-central.aws.cloud.qdrant.io", + 6334, + true) + .withApiKey("") + .build()); +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#can-the-same-similarity-search-query-yield-different-results-on-different-machines) Can the same similarity search query yield different results on different machines? 
+```csharp +using Qdrant.Client; -Yes, due to differences in hardware configurations and parallel processing, results may vary slightly. +var client = new QdrantClient( + host: "xyz-example.eu-central.aws.cloud.qdrant.io", + https: true, + apiKey: "" +); +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-do-i-choose-the-right-vector-embeddings-for-my-use-case) How do I choose the right vector embeddings for my use case? +```go +import "github.com/qdrant/go-client/qdrant" -This depends on the nature of your data and the specific application. Consider factors like dimensionality, domain-specific models, and the performance characteristics of different embeddings. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.eu-central.aws.cloud.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, +}) +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-does-qdrant-handle-different-vector-embeddings-from-various-providers-in-the-same-collection) How does Qdrant handle different vector embeddings from various providers in the same collection? + -Qdrant natively [supports multiple vectors per data point](https://qdrant.tech/documentation/concepts/vectors/#multivectors), allowing different embeddings from various providers to coexist within the same collection. +### Read-only API key -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#can-i-migrate-my-embeddings-from-another-vector-store-to-qdrant) Can I migrate my embeddings from another vector store to Qdrant? +*Available as of v1.7.0* -Yes, Qdrant supports migration of embeddings from other vector stores, facilitating easy transitions and adoption of Qdrant’s features. +In addition to the regular API key, Qdrant also supports a read-only API key. +This key can be used to access read-only operations on the instance. -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#why-the-amount-of-indexed-vectors-doesnt-match-the-amount-of-vectors-in-the-collection) Why the amount of indexed vectors doesn’t match the amount of vectors in the collection? +```yaml +service: + read_only_api_key: your_secret_read_only_api_key_here +``` -Qdrant doesn’t always need to index all vectors in the collection. -It stores data is segments, and if the segment is small enough, it is more efficient to perform a full-scan search on it. +Or with the environment variable: -Make sure to check that the collection status is `green` and that the number of unindexed vectors smaller than indexing threshold. +```bash +export QDRANT__SERVICE__READ_ONLY_API_KEY=your_secret_read_only_api_key_here +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#why-collection-info-shows-inaccurate-number-of-points) Why collection info shows inaccurate number of points? +Both API keys can be used simultaneously. -Collection info API in Qdrant returns an approximate number of points in the collection. -If you need an exact number, you can use the [count](https://qdrant.tech/documentation/concepts/points/#counting-points) API. +### Granular access control with JWT -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#vectors-in-the-collection-dont-match-what-i-uploaded) Vectors in the collection don’t match what I uploaded. +*Available as of v1.9.0* -There are two possible reasons for this: +For more complex cases, Qdrant supports granular access control with [JSON Web Tokens (JWT)](https://jwt.io/). 
+This allows you to create tokens which restrict access to data stored in your cluster, and build [Role-based access control (RBAC)](https://en.wikipedia.org/wiki/Role-based_access_control) on top of that. +In this way, you can define permissions for users and restrict access to sensitive endpoints. -- You used the `Cosine` distance metric in the [collection settings](https://qdrant.tech/concepts/collections/#collections). In this case, Qdrant pre-normalizes your vectors for faster distance computation. If you strictly need the original vectors to be preserved, consider using the `Dot` distance metric instead. -- You used the `uint8` [datatype](https://qdrant.tech/documentation/concepts/vectors/#datatypes) to store vectors. `uint8` requires a special format for input values, which might not be compatible with the typical output of embedding models. +To enable JWT-based authentication in your own Qdrant instance you need to specify the `api-key` and enable the `jwt_rbac` feature in the configuration: -## [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#search) Search +```yaml +service: + api_key: you_secret_api_key_here + jwt_rbac: true +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-does-qdrant-handle-real-time-data-updates-and-search) How does Qdrant handle real-time data updates and search? +Or with the environment variables: -Qdrant supports live updates for vector data, with newly inserted, updated and deleted vectors available for immediate search. The system uses full-scan search on unindexed segments during background index updates. +```bash +export QDRANT__SERVICE__API_KEY=your_secret_api_key_here +export QDRANT__SERVICE__JWT_RBAC=true +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#my-search-results-contain-vectors-with-null-values-why) My search results contain vectors with null values. Why? +The `api_key` you set in the configuration will be used to encode and decode the JWTs, so –needless to say– keep it secure. If your `api_key` changes, all existing tokens will be invalid. -By default, Qdrant tries to minimize network traffic and doesn’t return vectors in search results. -But you can force Qdrant to do so by setting the `with_vector` parameter of the Search/Scroll to `true`. +To use JWT-based authentication, you need to provide it as a bearer token in the `Authorization` header, or as an key in the `Api-Key` header of your requests. -If you’re still seeing `"vector": null` in your results, it might be that the vector you’re passing is not in the correct format, or there’s an issue with how you’re calling the upsert method. +```http +Authorization: Bearer -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-can-i-search-without-a-vector) How can I search without a vector? +// or -You are likely looking for the [scroll](https://qdrant.tech/documentation/concepts/points/#scroll-points) method. It allows you to retrieve the records based on filters or even iterate over all the records in the collection. +Api-Key: +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#does-qdrant-support-a-full-text-search-or-a-hybrid-search) Does Qdrant support a full-text search or a hybrid search? +```python +from qdrant_client import QdrantClient -Qdrant is a vector search engine in the first place, and we only implement full-text support as long as it doesn’t compromise the vector search use case. -That includes both the interface and the performance. 
+qdrant_client = QdrantClient( + "xyz-example.eu-central.aws.cloud.qdrant.io", + api_key="", +) +``` -What Qdrant can do: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -- Search with full-text filters -- Apply full-text filters to the vector search (i.e., perform vector search among the records with specific words or phrases) -- Do prefix search and semantic [search-as-you-type](https://qdrant.tech/articles/search-as-you-type/) -- Sparse vectors, as used in [SPLADE](https://github.com/naver/splade) or similar models -- [Multi-vectors](https://qdrant.tech/documentation/concepts/vectors/#multivectors), for example ColBERT and other late-interaction models -- Combination of the [multiple searches](https://qdrant.tech/documentation/concepts/hybrid-queries/) +const client = new QdrantClient({ + host: "xyz-example.eu-central.aws.cloud.qdrant.io", + apiKey: "", +}); +``` -What Qdrant doesn’t plan to support: +```rust +use qdrant_client::Qdrant; -- Non-vector-based retrieval or ranking functions -- Built-in ontologies or knowledge graphs -- Query analyzers and other NLP tools +let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") + .api_key("") + .build()?; +``` -Of course, you can always combine Qdrant with any specialized tool you need, including full-text search engines. -Read more about [our approach](https://qdrant.tech/articles/hybrid-search/) to hybrid search. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; -## [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#collections) Collections +QdrantClient client = + new QdrantClient( + QdrantGrpcClient.newBuilder( + "xyz-example.eu-central.aws.cloud.qdrant.io", + 6334, + true) + .withApiKey("") + .build()); +``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-many-collections-can-i-create) How many collections can I create? +```csharp +using Qdrant.Client; -As many as you want, but be aware that each collection requires additional resources. -It is _highly_ recommended not to create many small collections, as it will lead to significant resource consumption overhead. +var client = new QdrantClient( + host: "xyz-example.eu-central.aws.cloud.qdrant.io", + https: true, + apiKey: "" +); +``` -We consider creating a collection for each user/dialog/document as an antipattern. +```go +import "github.com/qdrant/go-client/qdrant" -Please read more about collections, isolation, and multiple users in our [Multitenancy](https://qdrant.tech/documentation/tutorials/multiple-partitions/) tutorial. +client, err := qdrant.NewClient(&qdrant.Config{ + Host: "xyz-example.eu-central.aws.cloud.qdrant.io", + Port: 6334, + APIKey: "", + UseTLS: true, +}) +``` +#### Generating JSON Web Tokens -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-do-i-upload-a-large-number-of-vectors-into-a-qdrant-collection) How do I upload a large number of vectors into a Qdrant collection? +Due to the nature of JWT, anyone who knows the `api_key` can generate tokens by using any of the existing libraries and tools, it is not necessary for them to have access to the Qdrant instance to generate them. -Read about our recommendations in the [bulk upload](https://qdrant.tech/documentation/tutorials/bulk-upload/) tutorial. +For convenience, we have added a JWT generation tool the Qdrant Web UI under the 🔑 tab, if you're using the default url, it will be at `http://localhost:6333/dashboard#/jwt`. 
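Tokens can also be generated programmatically. Below is a minimal sketch using PyJWT (one of the libraries listed further down); the claims shown are only examples, and each of them is described in the JWT Configuration section that follows:

```python
import jwt  # PyJWT

# Must be the same value as service.api_key in the Qdrant configuration,
# since Qdrant validates the token signature with it.
api_key = "your_secret_api_key_here"

# Example claims; all of them are optional (see "JWT Configuration" below).
claims = {
    "exp": 1640995200,  # expiration time as a Unix timestamp in seconds
    "access": "r",      # global read-only access
}

token = jwt.encode(claims, api_key, algorithm="HS256")

# Send the token instead of the API key, e.g. in an `Api-Key` header
# or as `Authorization: Bearer <token>`.
print(token)
```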
-### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#can-i-only-store-quantized-vectors-and-discard-full-precision-vectors) Can I only store quantized vectors and discard full precision vectors? +- **JWT Header** - Qdrant uses the `HS256` algorithm to decode the tokens. -No, Qdrant requires full precision vectors for operations like reindexing, rescoring, etc. + ```json + { + "alg": "HS256", + "typ": "JWT" + } + ``` -## [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#compatibility) Compatibility +- **JWT Payload** - You can include any combination of the [parameters available](#jwt-configuration) in the payload. Keep reading for more info on each one. -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#is-qdrant-compatible-with-cpus-or-gpus-for-vector-computation) Is Qdrant compatible with CPUs or GPUs for vector computation? + ```json + { + "exp": 1640995200, // Expiration time + "value_exists": ..., // Validate this token by looking for a point with a payload value + "access": "r", // Define the access level. + } + ``` -Qdrant primarily relies on CPU acceleration for scalability and efficiency. However, we also support GPU-accelerated indexing on all major vendors. +**Signing the token** - To confirm that the generated token is valid, it needs to be signed with the `api_key` you have set in the configuration. +That would mean, that someone who knows the `api_key` gives the authorization for the new token to be used in the Qdrant instance. +Qdrant can validate the signature, because it knows the `api_key` and can decode the token. -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#do-you-guarantee-compatibility-across-versions) Do you guarantee compatibility across versions? +The process of token generation can be done on the client side offline, and doesn't require any communication with the Qdrant instance. -In case your version is older, we only guarantee compatibility between two consecutive minor versions. This also applies to client versions. Ensure your client version is never more than one minor version away from your cluster version. -While we will assist with break/fix troubleshooting of issues and errors specific to our products, Qdrant is not accountable for reviewing, writing (or rewriting), or debugging custom code. +Here is an example of libraries that can be used to generate JWT tokens: -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#do-you-support-downgrades) Do you support downgrades? +- Python: [PyJWT](https://pyjwt.readthedocs.io/en/stable/) +- JavaScript: [jsonwebtoken](https://www.npmjs.com/package/jsonwebtoken) +- Rust: [jsonwebtoken](https://crates.io/crates/jsonwebtoken) -We do not support downgrading a cluster on any of our products. If you deploy a newer version of Qdrant, your -data is automatically migrated to the newer storage format. This migration is not reversible. +#### JWT Configuration -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#how-do-i-avoid-issues-when-updating-to-the-latest-version) How do I avoid issues when updating to the latest version? +These are the available options, or **claims** in the JWT lingo. You can use them in the JWT payload to define its functionality. -We only guarantee compatibility if you update between consecutive versions. You would need to upgrade versions one at a time: `1.1 -> 1.2`, then `1.2 -> 1.3`, then `1.3 -> 1.4`. +- **`exp`** - The expiration time of the token. 
This is a Unix timestamp in seconds. The token will be invalid after this time. The check for this claim includes a 30-second leeway to account for clock skew. -## [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#cloud) Cloud + ```json + { + "exp": 1640995200, // Expiration time + } + ``` -### [Anchor](https://qdrant.tech/documentation/faq/qdrant-fundamentals/\#is-it-possible-to-scale-down-a-qdrant-cloud-cluster) Is it possible to scale down a Qdrant Cloud cluster? +- **`value_exists`** - This is a claim that can be used to validate the token against the data stored in a collection. Structure of this claim is as follows: -Yes, it is possible to both vertically and horizontally scale down a Qdrant Cloud cluster. -Note, that during the vertical scaling down, the disk size cannot be reduced. + ```json + { + "value_exists": { + "collection": "my_validation_collection", + "matches": [ + { "key": "my_key", "value": "value_that_must_exist" } + ], + }, + } + ``` -##### Was this page useful? + If this claim is present, Qdrant will check if there is a point in the collection with the specified key-values. If it does, the token is valid. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + This claim is especially useful if you want to have an ability to revoke tokens without changing the `api_key`. + Consider a case where you have a collection of users, and you want to revoke access to a specific user. -Thank you for your feedback! 🙏 + ```json + { + "value_exists": { + "collection": "users", + "matches": [ + { "key": "user_id", "value": "andrey" }, + { "key": "role", "value": "manager" } + ], + }, + } + ``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/faq/qdrant-fundamentals.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + You can create a token with this claim, and when you want to revoke access, you can change the `role` of the user to something else, and the token will be invalid. -On this page: +- **`access`** - This claim defines the [access level](#table-of-access) of the token. If this claim is present, Qdrant will check if the token has the required access level to perform the operation. If this claim is **not** present, **manage** access is assumed. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/faq/qdrant-fundamentals.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + It can provide global access with `r` for read-only, or `m` for manage. For example: -× + ```json + { + "access": "r" + } + ``` -[Powered by](https://qdrant.tech/) + It can also be specific to one or more collections. The `access` level for each collection is `r` for read-only, or `rw` for read-write, like this: -<|page-120-lllmstxt|> -## embeddings -- [Documentation](https://qdrant.tech/documentation/) -- Embeddings + ```json + { + "access": [ + { + "collection": "my_collection", + "access": "rw" + } + ] + } + ``` -# [Anchor](https://qdrant.tech/documentation/embeddings/\#supported-embedding-providers--models) Supported Embedding Providers & Models + You can also specify which subset of the collection the user is able to access by specifying a `payload` restriction that the points must have. 
-Qdrant supports all available text and multimodal dense vector embedding models as well as vector embedding services without any limitations. + ```json + { + "access": [ + { + "collection": "my_collection", + "access": "r", + "payload": { + "user_id": "user_123456" + } + } + ] + } + ``` -## [Anchor](https://qdrant.tech/documentation/embeddings/\#some-of-the-embeddings-you-can-use-with-qdrant) Some of the Embeddings you can use with Qdrant + This `payload` claim will be used to implicitly filter the points in the collection. It will be equivalent to appending this filter to each request: -SentenceTransformers, BERT, SBERT, Clip, OpenClip, Open AI, Vertex AI, Azure AI, AWS Bedrock, Jina AI, Upstage AI, Mistral AI, Cohere AI, Voyage AI, Aleph Alpha, Baidu Qianfan, BGE, Instruct, Watsonx Embeddings, Snowflake Embeddings, NVIDIA NeMo, Nomic, OCI Embeddings, Ollama Embeddings, MixedBread, Together AI, Clarifai, Databricks Embeddings, GPT4All Embeddings, John Snow Labs Embeddings. + ```json + { "filter": { "must": [{ "key": "user_id", "match": { "value": "user_123456" } }] } } + ``` -Additionally, [any open-source embeddings from HuggingFace](https://huggingface.co/spaces/mteb/leaderboard) can be used with Qdrant. +### Table of access -## [Anchor](https://qdrant.tech/documentation/embeddings/\#code-samples) Code samples +Check out this table to see which actions are allowed or denied based on the access level. -| Embeddings Providers | Description | -| --- | --- | -| [Aleph Alpha](https://qdrant.tech/documentation/embeddings/aleph-alpha/) | Multilingual embeddings focused on European languages. | -| [Bedrock](https://qdrant.tech/documentation/embeddings/bedrock/) | AWS managed service for foundation models and embeddings. | -| [Cohere](https://qdrant.tech/documentation/embeddings/cohere/) | Language model embeddings for NLP tasks. | -| [Gemini](https://qdrant.tech/documentation/embeddings/gemini/) | Google’s multimodal embeddings for text and vision. | -| [Jina AI](https://qdrant.tech/documentation/embeddings/jina-embeddings/) | Customizable embeddings for neural search. | -| [Mistral](https://qdrant.tech/documentation/embeddings/mistral/) | Open-source, efficient language model embeddings. | -| [MixedBread](https://qdrant.tech/documentation/embeddings/mixedbread/) | Lightweight embeddings for constrained environments. | -| [Mixpeek](https://qdrant.tech/documentation/embeddings/mixpeek/) | Managed SDK for video chunking, embedding, and post-processing. ​ | -| [Nomic](https://qdrant.tech/documentation/embeddings/nomic/) | Embeddings for data visualization. | -| [Nvidia](https://qdrant.tech/documentation/embeddings/nvidia/) | GPU-optimized embeddings from Nvidia. | -| [Ollama](https://qdrant.tech/documentation/embeddings/ollama/) | Embeddings for conversational AI. | -| [OpenAI](https://qdrant.tech/documentation/embeddings/openai/) | Industry-leading embeddings for NLP. | -| [Prem AI](https://qdrant.tech/documentation/embeddings/premai/) | Precise language embeddings. | -| [Twelve Labs](https://qdrant.tech/documentation/embeddings/twelvelabs/) | Multimodal embeddings from Twelve labs. | -| [Snowflake](https://qdrant.tech/documentation/embeddings/snowflake/) | Scalable embeddings for big data. | -| [Upstage](https://qdrant.tech/documentation/embeddings/upstage/) | Embeddings for speech and language tasks. | -| [Voyage AI](https://qdrant.tech/documentation/embeddings/voyage/) | Navigation and spatial understanding embeddings. | +This is also applicable to using api keys instead of tokens. 
In that case, `api_key` maps to **manage**, while `read_only_api_key` maps to **read-only**. -##### Was this page useful? +
Symbols: ✅ Allowed | ❌ Denied | 🟡 Allowed, but filtered
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +| Action | manage | read-only | collection read-write | collection read-only | collection with payload claim (r / rw) | +|--------|--------|-----------|----------------------|-----------------------|------------------------------------| +| list collections | ✅ | ✅ | 🟡 | 🟡 | 🟡 | +| get collection info | ✅ | ✅ | ✅ | ✅ | ❌ | +| create collection | ✅ | ❌ | ❌ | ❌ | ❌ | +| delete collection | ✅ | ❌ | ❌ | ❌ | ❌ | +| update collection params | ✅ | ❌ | ❌ | ❌ | ❌ | +| get collection cluster info | ✅ | ✅ | ✅ | ✅ | ❌ | +| collection exists | ✅ | ✅ | ✅ | ✅ | ✅ | +| update collection cluster setup | ✅ | ❌ | ❌ | ❌ | ❌ | +| update aliases | ✅ | ❌ | ❌ | ❌ | ❌ | +| list collection aliases | ✅ | ✅ | 🟡 | 🟡 | 🟡 | +| list aliases | ✅ | ✅ | 🟡 | 🟡 | 🟡 | +| create shard key | ✅ | ❌ | ❌ | ❌ | ❌ | +| delete shard key | ✅ | ❌ | ❌ | ❌ | ❌ | +| create payload index | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete payload index | ✅ | ❌ | ✅ | ❌ | ❌ | +| list collection snapshots | ✅ | ✅ | ✅ | ✅ | ❌ | +| create collection snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete collection snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | +| download collection snapshot | ✅ | ✅ | ✅ | ✅ | ❌ | +| upload collection snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| recover collection snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| list shard snapshots | ✅ | ✅ | ✅ | ✅ | ❌ | +| create shard snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete shard snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | +| download shard snapshot | ✅ | ✅ | ✅ | ✅ | ❌ | +| upload shard snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| recover shard snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| list full snapshots | ✅ | ✅ | ❌ | ❌ | ❌ | +| create full snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| delete full snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | +| download full snapshot | ✅ | ✅ | ❌ | ❌ | ❌ | +| get cluster info | ✅ | ✅ | ❌ | ❌ | ❌ | +| recover raft state | ✅ | ❌ | ❌ | ❌ | ❌ | +| delete peer | ✅ | ❌ | ❌ | ❌ | ❌ | +| get point | ✅ | ✅ | ✅ | ✅ | ❌ | +| get points | ✅ | ✅ | ✅ | ✅ | ❌ | +| upsert points | ✅ | ❌ | ✅ | ❌ | ❌ | +| update points batch | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete points | ✅ | ❌ | ✅ | ❌ | ❌ / 🟡 | +| update vectors | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete vectors | ✅ | ❌ | ✅ | ❌ | ❌ / 🟡 | +| set payload | ✅ | ❌ | ✅ | ❌ | ❌ | +| overwrite payload | ✅ | ❌ | ✅ | ❌ | ❌ | +| delete payload | ✅ | ❌ | ✅ | ❌ | ❌ | +| clear payload | ✅ | ❌ | ✅ | ❌ | ❌ | +| scroll points | ✅ | ✅ | ✅ | ✅ | 🟡 | +| query points | ✅ | ✅ | ✅ | ✅ | 🟡 | +| search points | ✅ | ✅ | ✅ | ✅ | 🟡 | +| search groups | ✅ | ✅ | ✅ | ✅ | 🟡 | +| recommend points | ✅ | ✅ | ✅ | ✅ | ❌ | +| recommend groups | ✅ | ✅ | ✅ | ✅ | ❌ | +| discover points | ✅ | ✅ | ✅ | ✅ | ❌ | +| count points | ✅ | ✅ | ✅ | ✅ | 🟡 | +| version | ✅ | ✅ | ✅ | ✅ | ✅ | +| readyz, healthz, livez | ✅ | ✅ | ✅ | ✅ | ✅ | +| telemetry | ✅ | ✅ | ❌ | ❌ | ❌ | +| metrics | ✅ | ✅ | ❌ | ❌ | ❌ | +| update locks | ✅ | ❌ | ❌ | ❌ | ❌ | +| get locks | ✅ | ✅ | ❌ | ❌ | ❌ | -Thank you for your feedback! 🙏 +## TLS -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/embeddings/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +*Available as of v1.2.0* -On this page: +TLS for encrypted connections can be enabled on your Qdrant instance to secure +connections. 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/embeddings/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + -× +First make sure you have a certificate and private key for TLS, usually in +`.pem` format. On your local machine you may use +[mkcert](https://github.com/FiloSottile/mkcert#readme) to generate a self signed +certificate. -[Powered by](https://qdrant.tech/) +To enable TLS, set the following properties in the Qdrant configuration with the +correct paths and restart: -<|page-121-lllmstxt|> -## minicoil -- [Articles](https://qdrant.tech/articles/) -- miniCOIL: on the Road to Usable Sparse Neural Retrieval +```yaml +service: + # Enable HTTPS for the REST and gRPC API + enable_tls: true -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +# TLS configuration. +# Required if either service.enable_tls or cluster.p2p.enable_tls is true. +tls: + # Server certificate chain file + cert: ./tls/cert.pem -# miniCOIL: on the Road to Usable Sparse Neural Retrieval + # Server private key file + key: ./tls/key.pem +``` -Evgeniya Sukhodolskaya +For internal communication when running cluster mode, TLS can be enabled with: -· +```yaml +cluster: + # Configuration of the inter-cluster communication + p2p: + # Use TLS for communication between peers + enable_tls: true +``` -May 13, 2025 +With TLS enabled, you must start using HTTPS connections. For example: -![miniCOIL: on the Road to Usable Sparse Neural Retrieval](https://qdrant.tech/articles_data/minicoil/preview/title.jpg) +```bash +curl -X GET https://localhost:6333 +``` -Have you ever heard of sparse neural retrieval? If so, have you used it in production? +```python +from qdrant_client import QdrantClient -It’s a field with excellent potential – who wouldn’t want to use an approach that combines the strengths of dense and term-based text retrieval? Yet it’s not so popular. Is it due to the common curse of _“What looks good on paper is not going to work in practice”?_? +client = QdrantClient( + url="https://localhost:6333", +) +``` -This article describes our path towards sparse neural retrieval _as it should be_ – lightweight term-based retrievers capable of distinguishing word meanings. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Learning from the mistakes of previous attempts, we created **miniCOIL**, a new sparse neural candidate to take BM25’s place in hybrid searches. We’re happy to share it with you and are awaiting your feedback. +const client = new QdrantClient({ url: "https://localhost", port: 6333 }); +``` -## [Anchor](https://qdrant.tech/articles/minicoil/\#the-good-the-bad-and-the-ugly) The Good, the Bad and the Ugly +```rust +use qdrant_client::Qdrant; -Sparse neural retrieval is not so well known, as opposed to methods it’s based on – term-based and dense retrieval. Their weaknesses motivated this field’s development, guiding its evolution. Let’s follow its path. +let client = Qdrant::from_url("http://localhost:6334").build()?; +``` -![Retrievers evolution](https://qdrant.tech/articles_data/minicoil/models_evolution.png) +Certificate rotation is enabled with a default refresh time of one hour. This +reloads certificate files every hour while Qdrant is running. This way changed +certificates are picked up when they get updated externally. The refresh time +can be tuned by changing the `tls.cert_ttl` setting. 
You can leave this on, even +if you don't plan to update your certificates. Currently this is only supported +for the REST API. -Retrievers evolution +Optionally, you can enable client certificate validation on the server against a +local certificate authority. Set the following properties and restart: -### [Anchor](https://qdrant.tech/articles/minicoil/\#term-based-retrieval) Term-based Retrieval +```yaml +service: + # Check user HTTPS client certificate against CA file specified in tls config + verify_https_client_certificate: false -Term-based retrieval usually treats text as a bag of words. These words play roles of different importance, contributing to the overall relevance score between a document and a query. +# TLS configuration. +# Required if either service.enable_tls or cluster.p2p.enable_tls is true. +tls: + # Certificate authority certificate file. + # This certificate will be used to validate the certificates + # presented by other nodes during inter-cluster communication. + # + # If verify_https_client_certificate is true, it will verify + # HTTPS client certificate + # + # Required if cluster.p2p.enable_tls is true. + ca_cert: ./tls/cacert.pem +``` -Famous **BM25** estimates words’ contribution based on their: +## Hardening -1. Importance in a particular text – Term Frequency (TF) based. -2. Significance within the whole corpus – Inverse Document Frequency (IDF) based. +We recommend reducing the amount of permissions granted to Qdrant containers so that you can reduce the risk of exploitation. Here are some ways to reduce the permissions of a Qdrant container: -It also has several parameters reflecting typical text length in the corpus, the exact meaning of which you can check in [our detailed breakdown of the BM25 formula](https://qdrant.tech/articles/bm42/#why-has-bm25-stayed-relevant-for-so-long). +* Run Qdrant as a non-root user. This can help mitigate the risk of future container breakout vulnerabilities. Qdrant does not need the privileges of the root user for any purpose. + - You can use the image `qdrant/qdrant:-unprivileged` instead of the default Qdrant image. + - You can use the flag `--user=1000:2000` when running [`docker run`](https://docs.docker.com/reference/cli/docker/container/run/). + - You can set [`user: 1000`](https://docs.docker.com/compose/compose-file/05-services/#user) when using Docker Compose. + - You can set [`runAsUser: 1000`](https://kubernetes.io/docs/tasks/configure-pod-container/security-context) when running in Kubernetes (our [Helm chart](https://github.com/qdrant/qdrant-helm) does this by default). -Precisely defining word importance within a text is nontrivial. +* Run Qdrant with a read-only root filesystem. This can help mitigate vulnerabilities that require the ability to modify system files, which is a permission Qdrant does not need. As long as the container uses mounted volumes for storage (`/qdrant/storage` and `/qdrant/snapshots` by default), Qdrant can continue to operate while being prevented from writing data outside of those volumes. + - You can use the flag `--read-only` when running [`docker run`](https://docs.docker.com/reference/cli/docker/container/run/). + - You can set [`read_only: true`](https://docs.docker.com/compose/compose-file/05-services/#read_only) when using Docker Compose. + - You can set [`readOnlyRootFilesystem: true`](https://kubernetes.io/docs/tasks/configure-pod-container/security-context) when running in Kubernetes (our [Helm chart](https://github.com/qdrant/qdrant-helm) does this by default). 
-BM25 is built on the idea that term importance can be defined statistically. -This isn’t far from the truth in long texts, where frequent repetition of a certain word signals that the text is related to this concept. In very short texts – say, chunks for Retrieval Augmented Generation (RAG) – it’s less applicable, with TF of 0 or 1. We approached fixing it in our [BM42 modification of BM25 algorithm.](https://qdrant.tech/articles/bm42/) +* Block Qdrant's external network access. This can help mitigate [server side request forgery attacks](https://owasp.org/www-community/attacks/Server_Side_Request_Forgery), like via the [snapshot recovery API](https://api.qdrant.tech/api-reference/snapshots/recover-from-snapshot). Single-node Qdrant clusters do not require any outbound network access. Multi-node Qdrant clusters only need the ability to connect to other Qdrant nodes via TCP ports 6333, 6334, and 6335. + - You can use [`docker network create --internal `](https://docs.docker.com/reference/cli/docker/network/create/#internal) and use that network when running [`docker run --network `](https://docs.docker.com/reference/cli/docker/container/run/#network). + - You can create an [internal network](https://docs.docker.com/compose/compose-file/06-networks/#internal) when using Docker Compose. + - You can create a [NetworkPolicy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) when using Kubernetes. Note that multi-node Qdrant clusters [will also need access to cluster DNS in Kubernetes](https://github.com/ahmetb/kubernetes-network-policy-recipes/blob/master/11-deny-egress-traffic-from-an-application.md#allowing-dns-traffic). -Yet there is one component of a word’s importance for retrieval, which is not considered in BM25 at all – word meaning. The same words have different meanings in different contexts, and it affects the text’s relevance. Think of _“fruit **bat**”_ and _“baseball **bat**"_—the same importance in the text, different meanings. +There are other techniques for reducing the permissions such as dropping [Linux capabilities](https://www.man7.org/linux/man-pages/man7/capabilities.7.html) depending on your deployment method, but the methods mentioned above are the most important. -### [Anchor](https://qdrant.tech/articles/minicoil/\#dense-retrieval) Dense Retrieval +<|page-196-lllmstxt|> +# Usage statistics -How to capture the meaning? Bag-of-words models like BM25 assume that words are placed in a text independently, while linguists say: +The Qdrant open-source container image collects anonymized usage statistics from users in order to improve the engine by default. You can [deactivate](#deactivate-telemetry) at any time, and any data that has already been collected can be [deleted on request](#request-information-deletion). -> “You shall know a word by the company it keeps” - John Rupert Firth +Deactivating this will not affect your ability to monitor the Qdrant database yourself by accessing the `/metrics` or `/telemetry` endpoints of your database. It will just stop sending independend, anonymized usage statistics to the Qdrant team. -This idea, together with the motivation to numerically express word relationships, powered the development of the second branch of retrieval – dense vectors. Transformer models with attention mechanisms solved the challenge of distinguishing word meanings within text context, making it a part of relevance matching in retrieval. + -Yet dense retrieval didn’t (and can’t) become a complete replacement for term-based retrieval. 
Dense retrievers are capable of broad semantic similarity searches, yet they lack precision when we need results including a specific keyword. +## Why do we collect usage statistics? -It’s a fool’s errand – trying to make dense retrievers do exact matching, as they’re built in a paradigm where every word matches every other word semantically to some extent, and this semantic similarity depends on the training data of a particular model. +We want to make Qdrant fast and reliable. To do this, we need to understand how it performs in real-world scenarios. +We do a lot of benchmarking internally, but it is impossible to cover all possible use cases, hardware, and configurations. -### [Anchor](https://qdrant.tech/articles/minicoil/\#sparse-neural-retrieval) Sparse Neural Retrieval +In order to identify bottlenecks and improve Qdrant, we need to collect information about how it is used. -So, on one side, we have weak control over matching, sometimes leading to too broad retrieval results, and on the other—lightweight, explainable and fast term-based retrievers like BM25, incapable of capturing semantics. +Additionally, Qdrant uses a bunch of internal heuristics to optimize the performance. +To better set up parameters for these heuristics, we need to collect timings and counters of various pieces of code. +With this information, we can make Qdrant faster for everyone. -Of course, we want the best of both worlds, fused in one model, no drawbacks included. Sparse neural retrieval evolution was pushed by this desire. -- Why **sparse**? Term-based retrieval can operate on sparse vectors, where each word in the text is assigned a non-zero value (its importance in this text). -- Why **neural**? Instead of deriving an importance score for a word based on its statistics, let’s use machine learning models capable of encoding words’ meaning. +## What information is collected? -**So why is it not widely used?** +There are 3 types of information that we collect: -![Problems of modern sparse neural retrievers](https://qdrant.tech/articles_data/minicoil/models_problems.png) +* System information - general information about the system, such as CPU, RAM, and disk type. As well as the configuration of the Qdrant instance. +* Performance - information about timings and counters of various pieces of code. +* Critical error reports - information about critical errors, such as backtraces, that occurred in Qdrant. This information would allow to identify problems nobody yet reported to us. -Problems of modern sparse neural retrievers +### We **never** collect the following information: -The detailed history of sparse neural retrieval makes for [a whole other article](https://qdrant.tech/articles/modern-sparse-neural-retrieval/). Summing a big part of it up, there were many attempts to map a word representation produced by a dense encoder to a single-valued importance score, and most of them never saw the real world outside of research papers ( **DeepImpact**, **TILDEv2**, **uniCOIL**). +- User's IP address +- Any data that can be used to identify the user or the user's organization +- Any data, stored in the collections +- Any names of the collections +- Any URLs -Trained end-to-end on a relevance objective, most of the **sparse encoders** estimated word importance well only for a particular domain. Their out-of-domain accuracy, on datasets they hadn’t “seen” during training, [was worse than BM25.](https://arxiv.org/pdf/2307.10488) +## How do we anonymize data? 
-The SOTA of sparse neural retrieval is **SPLADE** – (Sparse Lexical and Expansion Model). This model has made its way into retrieval systems - you can [use SPLADE++ in Qdrant with FastEmbed](https://qdrant.tech/documentation/fastembed/fastembed-splade/). +We understand that some users may be concerned about the privacy of their data. +That is why we make an extra effort to ensure your privacy. -Yet there’s a catch. The “expansion” part of SPLADE’s name refers to a technique that combats against another weakness of term-based retrieval – **vocabulary mismatch**. While dense encoders can successfully connect related terms like “fruit bat” and “flying fox”, term-based retrieval fails at this task. +There are several different techniques that we use to anonymize the data: -SPLADE solves this problem by **expanding documents and queries with additional fitting terms**. However, it leads to SPLADE inference becoming heavy. Additionally, produced representations become not-so-sparse (so, consequently, not lightweight) and far less explainable as expansion choices are made by machine learning models. +- We use a random UUID to identify instances. This UUID is generated on each startup and is not stored anywhere. There are no other ways to distinguish between different instances. +- We round all big numbers, so that the last digits are always 0. For example, if the number is 123456789, we will store 123456000. +- We replace all names with irreversibly hashed values. So no collection or field names will leak into the telemetry. +- All urls are hashed as well. -> “Big man in a suit of armor. Take that off, what are you?” +You can see exact version of anomymized collected data by accessing the [telemetry API](https://api.qdrant.tech/master/api-reference/service/telemetry) with `anonymize=true` parameter. -Experiments showed that SPLADE without its term expansion tells the same old story of sparse encoders — [it performs worse than BM25.](https://arxiv.org/pdf/2307.10488) +For example, -## [Anchor](https://qdrant.tech/articles/minicoil/\#eyes-on-the-prize-usable-sparse-neural-retrieval) Eyes on the Prize: Usable Sparse Neural Retrieval -Striving for perfection on specific benchmarks, the sparse neural retrieval field either produced models performing worse than BM25 out-of-domain(ironically, [trained with BM25-based hard negatives](https://arxiv.org/pdf/2307.10488)) or models based on heavy document expansion, lowering sparsity. +## Deactivate usage statistics -To be usable in production, the minimal criteria a sparse neural retriever should meet are: +You can deactivate usage statistics by: -- **Producing lightweight sparse representations (it’s in the name!).** Inheriting the perks of term-based retrieval, it should be lightweight and simple. For broader semantic search, there are dense retrievers. -- **Being better than BM25 at ranking in different domains.** The goal is a term-based retriever capable of distinguishing word meanings — what BM25 can’t do — preserving BM25’s out-of-domain, time-proven performance. +- setting the `QDRANT__TELEMETRY_DISABLED` environment variable to `true` +- setting the config option `telemetry_disabled` to `true` in the `config/production.yaml` or `config/config.yaml` files +- using cli option `--disable-telemetry` -![The idea behind miniCOIL](https://qdrant.tech/articles_data/minicoil/minicoil.png) +Any of these options will prevent Qdrant from sending any usage statistics data. 
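For example, a minimal Docker invocation using the environment variable (the image and port mapping follow the earlier examples and may differ in your setup):

```bash
docker run -p 6333:6333 \
    -e QDRANT__TELEMETRY_DISABLED=true \
    qdrant/qdrant
```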
-The idea behind miniCOIL +If you decide to deactivate usage statistics, we kindly ask you to share your feedback with us in the [Discord community](https://qdrant.to/discord) or GitHub [discussions](https://github.com/qdrant/qdrant/discussions) -### [Anchor](https://qdrant.tech/articles/minicoil/\#inspired-by-coil) Inspired by COIL +## Request information deletion -One of the attempts in the field of Sparse Neural Retrieval — [Contextualized Inverted Lists (COIL)](https://qdrant.tech/articles/modern-sparse-neural-retrieval/#sparse-neural-retriever-which-understood-homonyms) — stands out with its approach to term weights encoding. +We provide an email address so that users can request the complete removal of their data from all of our tools. -Instead of squishing high-dimensional token representations (usually 768-dimensional BERT embeddings) into a single number, COIL authors project them to smaller vectors of 32 dimensions. They propose storing these vectors in **inverted lists** of an **inverted index** (used in term-based retrieval) as is and comparing vector representations through dot product. +To do so, send an email to privacy@qdrant.com containing the unique identifier generated for your Qdrant installation. +You can find this identifier in the telemetry API response (`"id"` field), or in the logs of your Qdrant instance. -This approach captures deeper semantics, a single number simply cannot convey all the nuanced meanings a word can have. +Any questions regarding the management of the data we collect can also be sent to this email address. -Despite this advantage, COIL failed to gain widespread adoption for several key reasons: +<|page-197-lllmstxt|> +# Solving common errors -- Inverted indexes are usually not designed to store vectors and perform vector operations. -- Trained end-to-end with a relevance objective on [MS MARCO dataset](https://microsoft.github.io/msmarco/), COIL’s performance is heavily domain-bound. -- Additionally, COIL operates on tokens, reusing BERT’s tokenizer. However, working at a word level is far better for term-based retrieval. Imagine we want to search for a _“retriever”_ in our documentation. COIL will break it down into `re`, `#trie`, and `#ver` 32-dimensional vectors and match all three parts separately – not so convenient. +## Too many files open (OS error 24) -However, COIL representations allow distinguishing homographs, a skill BM25 lacks. The best ideas don’t start from zero. We propose an approach **built on top of COIL, keeping in mind what needs fixing**: +Each collection segment needs some files to be open. At some point you may encounter the following errors in your server log: -1. We should **abandon end-to-end training on a relevance objective** to get a model performant on out-of-domain data. There is not enough data to train a model able to generalize. -2. We should **keep representations sparse and reusable in a classic inverted index**. -3. We should **fix tokenization**. This problem is the easiest one to solve, as it was already done in several sparse neural retrievers, and [we also learned to do it in our BM42](https://qdrant.tech/articles/bm42/#wordpiece-retokenization). +```text +Error: Too many files open (OS error 24) +``` -### [Anchor](https://qdrant.tech/articles/minicoil/\#standing-on-the-shoulders-of-bm25) Standing on the Shoulders of BM25 +In such a case you may need to increase the limit of the open files. 
It might be done, for example, while you launch the Docker container: -BM25 has been a decent baseline across various domains for many years – and for a good reason. So why discard a time-proven formula? +```bash +docker run --ulimit nofile=10000:10000 qdrant/qdrant:latest +``` -Instead of training our sparse neural retriever to assign words’ importance scores, let’s add a semantic COIL-inspired component to BM25 formula. +The command above will set both soft and hard limits to `10000`. -score(D,Q)=∑i=1NIDF(qi)⋅ImportanceDqi⋅Meaningqi×dj, where term dj∈D equals qi +If you are not using Docker, the following command will change the limit for the current user session: -Then, if we manage to capture a word’s meaning, our solution alone could work like BM25 combined with a semantically aware reranker – or, in other words: +```bash +ulimit -n 10000 +``` -- It could see the difference between homographs; -- When used with word stems, it could distinguish parts of speech. +Please note, the command should be executed before you run Qdrant server. -![Meaning component](https://qdrant.tech/articles_data/minicoil/examples.png) +## Can't open Collections meta Wal -Meaning component +When starting a Qdrant instance as part of a distributed deployment, you may +come across an error message similar to this: -And if our model stumbles upon a word it hasn’t “seen” during training, we can just fall back to the original BM25 formula! +```bash +Can't open Collections meta Wal: Os { code: 11, kind: WouldBlock, message: "Resource temporarily unavailable" } +``` -### [Anchor](https://qdrant.tech/articles/minicoil/\#bag-of-words-in-4d) Bag-of-words in 4D +It means that Qdrant cannot start because a collection cannot be loaded. Its +associated [WAL](/documentation/concepts/storage/#versioning) files are currently +unavailable, likely because the same files are already being used by another +Qdrant instance. -COIL uses 32 values to describe one term. Do we need this many? How many words with 32 separate meanings could we name without additional research? +Each node must have their own separate storage directory, volume or mount. -Yet, even if we use fewer values in COIL representations, the initial problem of dense vectors not fitting into a classical inverted index persists. +The formed cluster will take care of sharing all data with each node, putting it +all in the correct places for you. If using Kubernetes, each node must have +their own volume. If using Docker, each node must have their own storage mount +or volume. If using Qdrant directly, each node must have their own storage +directory. -Unless
 We perform a simple trick! -![miniCOIL vectors to sparse representation](https://qdrant.tech/articles_data/minicoil/bow_4D.png) +## Using python gRPC client with `multiprocessing` -miniCOIL vectors to sparse representation +When using the Python gRPC client with `multiprocessing`, you may encounter an error like this: -Imagine a bag-of-words sparse vector. Every word from the vocabulary takes up one cell. If the word is present in the encoded text — we assign some weight; if it isn’t — it equals zero. +```text +<_InactiveRpcError of RPC that terminated with: + status = StatusCode.UNAVAILABLE + details = "sendmsg: Socket operation on non-socket (88)" + debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"sendmsg: Socket operation on non-socket (88)", grpc_status:14, created_time:"....."}" +``` -If we have a mini COIL vector describing a word’s meaning, for example, in 4D semantic space, we could just dedicate 4 consecutive cells for word in the sparse vector, one cell per “meaning” dimension. If we don’t, we could fall back to a classic one-cell description with a pure BM25 score. +This error happens, because `multiprocessing` creates copies of gRPC channels, which share the same socket. When the parent process closes the channel, it closes the socket, and the child processes try to use a closed socket. -**Such representations can be used in any standard inverted index.** +To prevent this error, you can use the `forkserver` or `spawn` start methods for `multiprocessing`. -## [Anchor](https://qdrant.tech/articles/minicoil/\#training-minicoil) Training miniCOIL +```python +import multiprocessing -Now, we’re coming to the part where we need to somehow get this low-dimensional encapsulation of a word’s meaning – **a miniCOIL vector**. +multiprocessing.set_start_method("forkserver") # or "spawn" +``` -We want to work smarter, not harder, and rely as much as possible on time-proven solutions. Dense encoders are good at encoding a word’s meaning in its context, so it would be convenient to reuse their output. Moreover, we could kill two birds with one stone if we wanted to add miniCOIL to hybrid search – where dense encoder inference is done regardless. +Alternatively, you can switch to `REST` API, async client, or use built-in parallelization in the Python client - functions like `qdrant.upload_points(...)` -### [Anchor](https://qdrant.tech/articles/minicoil/\#reducing-dimensions) Reducing Dimensions +<|page-198-lllmstxt|> +# Migration -Dense encoder outputs are high-dimensional, so we need to perform **dimensionality reduction, which should preserve the word’s meaning in context**. The goal is to: +Migrating data between vector databases, especially across regions, platforms, or deployment types, can be a hassle. That’s where the [Qdrant Migration Tool](https://github.com/qdrant/migration) comes in. It supports a wide range of migration needs, including transferring data between Qdrant instances and migrating from other vector database providers to Qdrant. -- Avoid relevance objective and dependence on labelled datasets; -- Find a target capturing spatial relations between word’s meanings; -- Use the simplest architecture possible. +You can run the migration tool on any machine where you have connectivity to both the source and the target Qdrant databases. Direct connectivity between both databases is not required. For optimal performance, you should run the tool on a machine with a fast network connection and minimum latency to both databases. 
-### [Anchor](https://qdrant.tech/articles/minicoil/\#training-data) Training Data +In this tutorial, we will learn how to use the migration tool and walk through a practical example of migrating from other vector databases to Qdrant. -We want miniCOIL vectors to be comparable according to a word’s meaning — _fruit **bat**_ and _vampire **bat**_ should be closer to each other in low-dimensional vector space than to _baseball **bat**_. So, we need something to calibrate on when reducing the dimensionality of words’ contextualized representations. -It’s said that a word’s meaning is hidden in the surrounding context or, simply put, in any texts that include this word. In bigger texts, we risk the word’s meaning blending out. So, let’s work at the sentence level and assume that sentences sharing one word should cluster in a way that each cluster contains sentences where this word is used in one specific meaning. +## Why use this instead of Qdrant’s Native Snapshotting? -If that’s true, we could encode various sentences with a sophisticated dense encoder and form a reusable spatial relations target for input dense encoders. It’s not a big problem to find lots of textual data containing frequently used words when we have datasets like the [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext), spanning the whole web. With this amount of data available, we could afford generalization and domain independence, which is hard to achieve with the relevance objective. +Qdrant supports [snapshot-based backups](https://qdrant.tech/documentation/concepts/snapshots/), low-level disk operations built for same cluster recovery or local backups. These snapshots: -#### [Anchor](https://qdrant.tech/articles/minicoil/\#its-going-to-work-i-bat) It’s Going to Work, I Bat +* Require snapshot consistency across nodes. +* Can be hard to port across machines or cloud zones. -Let’s test our assumption and take a look at the word _“bat”_. +On the other hand, the Qdrant Migration Tool: -We took several thousand sentences with this word, which we sampled from [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext) and vectorized with a [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) encoder. The goal was to check if we could distinguish any clusters containing sentences where _“bat”_ shares the same meaning. +* Streams data in live batches. +* Can resume interrupted migrations. +* Works even when data is being inserted. +* Supports collection reconfiguration (e.g., change replication, and quantization) +* Supports migrating from other vector DBs (Pinecone, Chroma, Weaviate, etc.) -![Sentences with "bat" in 2D](https://qdrant.tech/articles_data/minicoil/bat.png) +## How to Use the Qdrant Migration Tool -Sentences with “bat” in 2D. +You can run the tool via Docker. -A very important observation: _Looks like a bat_:) +Installation: -The result had two big clusters related to _“bat”_ as an animal and _“bat”_ as a sports equipment, and two smaller ones related to fluttering motion and the verb used in sports. Seems like it could work! +```shell +docker pull registry.cloud.qdrant.io/library/qdrant-migration +``` -### [Anchor](https://qdrant.tech/articles/minicoil/\#architecture-and-training-objective) Architecture and Training Objective +Here is an example of how to perform a Qdrant to Qdrant migration: -Let’s continue dealing with _“bats”_. 
+```bash +docker run --rm -it \ + -e SOURCE_API_KEY='your-source-key' \ + -e TARGET_API_KEY='your-target-key' \ + registry.cloud.qdrant.io/library/qdrant-migration qdrant \ + --source-url 'https://source-instance.cloud.qdrant.io' \ + --source-collection 'benchmark' \ + --target-url 'https://target-instance.cloud.qdrant.io' \ + --target-collection 'benchmark' -We have a training pool of sentences containing the word _“bat”_ in different meanings. Using a dense encoder of choice, we get a contextualized embedding of _“bat”_ from each sentence and learn to compress it into a low-dimensional miniCOIL _“bat”_ space, guided by [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) sentence embeddings. +``` -We’re dealing with only one word, so it should be enough to use just one linear layer for dimensionality reduction, with a [`Tanh activation`](https://pytorch.org/docs/stable/generated/torch.nn.Tanh.html) on top, mapping values of compressed vectors to (-1, 1) range. The activation function choice is made to align miniCOIL representations with dense encoder ones, which are mainly compared through `cosine similarity`. +## Example: Migrate from Pinecone to Qdrant -![miniCOIL architecture on a word level](https://qdrant.tech/articles_data/minicoil/miniCOIL_one_word.png) +Let’s now walk through an example of migrating from Pinecone to Qdrant. Assuming your Pinecone index looks like this: -miniCOIL architecture on a word level +![Pinecone Dashboard showing index details](/documentation/guides/pinecone-index.png) -As a training objective, we can select the minimization of [triplet loss](https://qdrant.tech/articles/triplet-loss/), where triplets are picked and aligned based on distances between [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) sentence embeddings. We rely on the confidence (size of the margin) of [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) to guide our _“bat”_ miniCOIL compression. +The information you need from Pinecone is: -![miniCOIL training](https://qdrant.tech/articles_data/minicoil/training_objective.png) +* Your Pinecone API key +* The index name +* The index host URL -miniCOIL training +With that information, you can migrate your vector database from Pinecone to Qdrant with the following command: -#### [Anchor](https://qdrant.tech/articles/minicoil/\#eating-elephant-one-bite-at-a-time) Eating Elephant One Bite at a Time +```bash +docker run --net=host --rm -it registry.cloud.qdrant.io/library/qdrant-migration pinecone \ + --pinecone.index-host 'https://sample-movies-efgjrye.svc.aped-4627-b74a.pinecone.io' \ + --pinecone.index-name 'sample-movies' \ + --pinecone.api-key 'pcsk_7Dh5MW_
' \ + --qdrant.url 'https://5f1a5c6c-7d47-45c3-8d47-d7389b1fad66.eu-west-1-0.aws.cloud.qdrant.io:6334' \ + --qdrant.api-key 'eyJhbGciOiJIUzI1NiIsInR5c
' \ + --qdrant.collection 'sample-movies' \ + --migration.batch-size 64 -Now, we have the full idea of how to train miniCOIL for one word. How do we scale to a whole vocabulary? -What if we keep it simple and continue training a model per word? It has certain benefits: +``` +When the migration is complete, you will see the new collection on Qdrant with all the vectors. -1. Extremely simple architecture: even one layer per word can suffice. -2. Super fast and easy training process. -3. Cheap and fast inference due to the simple architecture. -4. Flexibility to discover and tune underperforming words. -5. Flexibility to extend and shrink the vocabulary depending on the domain and use case. +## Conclusion -Then we could train all the words we’re interested in and simply combine (stack) all models into one big miniCOIL. +The **Qdrant Migration Tool** makes data transfer across vector database instances effortless. Whether you're moving between cloud regions, upgrading from self-hosted to Qdrant Cloud, or switching from other databases such as Pinecone, this tool saves you hours of manual effort. [Try it today](https://github.com/qdrant/migration). -![miniCOIL model](https://qdrant.tech/articles_data/minicoil/miniCOIL_full.png) +<|page-199-lllmstxt|> +# Static Embeddings: should you pay attention? +In the world of resource-constrained computing, a quiet revolution is taking place. While transformers dominate +leaderboards with their impressive capabilities, static embeddings are making an unexpected comeback, offering +remarkable speed improvements with surprisingly small quality trade-offs. **We evaluated how Qdrant users can benefit +from this renaissance, and the results are promising**. -miniCOIL model +## What makes static embeddings different? -### [Anchor](https://qdrant.tech/articles/minicoil/\#implementation-details) Implementation Details +Transformers are often seen as the only way to go when it comes to embeddings. The use of attention mechanisms helps to +capture the relationships between the input tokens, so each token gets a vector representation that is context-aware +and defined not only by the token itself but also by the surrounding tokens. Transformer-based models easily beat the +quality of the older methods, such as word2vec or GloVe, which could only create a single vector embedding per each +word. As a result, the word "bank" would have identical representation in the context of "river bank" and "financial +institution". -The code of the training approach sketched above is open-sourced [in this repository](https://github.com/qdrant/miniCOIL). +![Static embeddings](/documentation/tutorials/static-embeddings/financial-river-bank.png) -Here are the specific characteristics of the miniCOIL model we trained based on this approach: +Transformer-based models would represent the word "bank" differently in each of the contexts. However, transformers come +with a cost. They are computationally expensive and usually require a lot of memory, although the embeddings models +usually have fewer parameters than the Large Language Models. Still, GPUs are preferred to be used, even for inference. 
-| Component | Description | -| --- | --- | -| **Input Dense Encoder** | [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) (512 dimensions) | -| **miniCOIL Vectors Size** | 4 dimensions | -| **miniCOIL Vocabulary** | List of 30,000 of the most common English words, cleaned of stop words and words shorter than 3 letters, [taken from here](https://github.com/arstgit/high-frequency-vocabulary/tree/master). Words are stemmed to align miniCOIL with our BM25 implementation. | -| **Training Data** | 40 million sentences — a random subset of the [OpenWebText dataset](https://paperswithcode.com/dataset/openwebtext). To make triplet sampling convenient, we uploaded sentences and their [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) embeddings to Qdrant and built a [full-text payload index](https://qdrant.tech/documentation/concepts/indexing/#full-text-index) on sentences with a tokenizer of type `word`. | -| **Training Data per Word** | We sample 8000 sentences per word and form triplets with a margin of at least **0.1**.
Additionally, we apply **augmentation** — take a sentence and cut out the target word plus its 1–3 neighbours. We reuse the same similarity score between original and augmented sentences for simplicity. | -| **Training Parameters** | **Epochs**: 60
**Optimizer**: Adam with a learning rate of 1e-4
**Validation set**: 20% | +Static embeddings are still a thing, though! [MinishLab](https://minishlab.github.io/) introduced their [model2vec +technique](https://huggingface.co/blog/Pringled/model2vec) in October 2024, achieving a remarkable 15x reduction in +model size and up to 500x speed increase while maintaining impressive performance levels. Their idea was to distill the +knowledge from the transformer-based sentence transformer and create a static embedding model that would be much faster +and less memory-consuming. This introduction seems to be a catalyst for the static embeddings renaissance, as we can see +static embeddings to be integrated even into popular [Sentence Transformers](https://www.sbert.net/) library. The +[recent blog post on the Hugging Face blog](https://huggingface.co/blog/static-embeddings) by [Tom +Aarsen](https://www.tomaarsen.com) reveals how to train a static embedding model using Sentence Transformers and still +get up to 85% of transformer-level quality at a fraction of computational cost. The blog post also introduces an +embedding model for English text retrieval, which is called `static-retrieval-mrl-en-v1`. -Each word was **trained on just one CPU**, and it took approximately fifty seconds per word to train. -We included this `minicoil-v1` version in the [v0.7.0 release of our FastEmbed library](https://github.com/qdrant/fastembed). +## Static embeddings in Qdrant -You can check an example of `minicoil-v1` usage with FastEmbed in the [HuggingFace card](https://huggingface.co/Qdrant/minicoil-v1). +From the vector database perspective, static embeddings are not different from any other embedding models. They are +dense vectors after all, and you can simply store them in a Qdrant collection. Here is how you do it with the +`sentence-transformers/static-retrieval-mrl-en-v1` model: -## [Anchor](https://qdrant.tech/articles/minicoil/\#results) Results +```python +import uuid -### [Anchor](https://qdrant.tech/articles/minicoil/\#validation-loss) Validation Loss +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient, models -Input transformer [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) approximates the “role model” transformer [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) context relations with a (measured though triplets) quality of 83%. That means that in 17% of cases, [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) will take a sentence triplet from [`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) and embed it in a way that the negative example from the perspective of `mxbai` will be closer to the anchor than the positive one. +# The model produces vectors of size 1024 +model = SentenceTransformer( + "sentence-transformers/static-retrieval-mrl-en-v1" +) -The validation loss we obtained, depending on the miniCOIL vector size (4, 8, or 16), demonstrates miniCOIL correctly distinguishing from 76% (60 failed triplets on average per batch of size 256) to 85% (38 failed triplets on average per batch of size 256) triplets respectively. 
+# Let's assume we have a collection "my_collection" +# with a single vector called "static" +client = QdrantClient("http://localhost:6333") -![Validation loss](https://qdrant.tech/articles_data/minicoil/validation_loss.png) +# Calling the sentence transformer model to encode +# the text is not different compared to any other model +client.upsert( + "my_collection", + points=[ + models.PointStruct( + id=uuid.uuid4().hex, + vector=model.encode("Hello, world!"), + payload={"static": "Hello, world!"}, + ) + ] +) +``` -Validation loss +The retrieval is not going to be any faster just because you use static embeddings. However, **you will experience a +huge speedup in creating the vectors from your data**, what is usually a bottleneck. The Hugging Face blog post mentions +that the model might be even up to 400x faster on a CPU than the state-of-the-art embedding model. + +We didn't perform any proper benchmarking of the encoding speed, but one of the experiments done on `TREC-COVID` dataset +from [BeIR](https://github.com/beir-cellar/beir) shows that we can **encode and fully index 171K documents in Qdrant in +around 7.5 minutes**. All of it done on a consumer-grade laptop, without GPU acceleration. + +## Quantization of the static embeddings + +What can actually make the retrieval faster is the use of Matryoshka Embeddings, as the `static-retrieval-mrl-en-v1` +model was trained with that technique in mind. However, that's not the only way to speed up search. Quantization +methods are really popular among our users, and we were curious to check if they might be applied to the static +embeddings with the same success. + +We took the `static-retrieval-mrl-en-v1` model and tested it on various subsets of +[BeIR](https://github.com/beir-cellar/beir) with and without Binary Quantization, to see how much if affects the +retrieval quality. The results are really promising, as shown in our NDCG@10 measurements (a metric that evaluates the +ranking quality of search results, with higher scores indicating better performance): + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Dataset | NDCG@10 (Original vectors) | NDCG@10 (Binary Quantization, no rescoring) |
| --- | --- | --- |
| SciFact | 0.59348 | 0.54195 |
| TREC-COVID | 0.4428 | 0.44185 |
| ArguAna | 0.44393 | 0.42164 |
| NFCorpus | 0.30045 | 0.28027 |
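For reference, enabling Binary Quantization on a collection that already holds static embeddings only takes a configuration update. The following is a minimal sketch, reusing the `my_collection` name and the `static-retrieval-mrl-en-v1` model from the earlier snippet; the exact settings are assumptions, not a recommendation:

```python
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

client = QdrantClient("http://localhost:6333")
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1")

# Turn on Binary Quantization for an existing collection;
# the original vectors are kept and the binary index is built on top.
client.update_collection(
    collection_name="my_collection",
    quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True),
    ),
)

# Query without rescoring, matching the "no rescoring" column above.
hits = client.query_points(
    collection_name="my_collection",
    query=model.encode("How long is the incubation period?").tolist(),
    search_params=models.SearchParams(
        quantization=models.QuantizationSearchParams(rescore=False),
    ),
)
```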
+ +Binary Quantization definitely speeds up the retrieval, and make it cheaper, but also seems not to affect the quality of +the retrieval much in some cases. **However, that's something you should carefully verify on your own data**. If you are +a Qdrant user, then you can just enable quantization on an existing collection and [measure the impact on the retrieval +quality](/documentation/beginner-tutorials/retrieval-quality/). + +All the tests we did were performed using [`beir-qdrant`](https://github.com/kacperlukawski/beir-qdrant), and might be +reproduced by running [the script available on the project +repo](https://github.com/kacperlukawski/beir-qdrant/blob/main/examples/retrieval/search/evaluate_static_embeddings.py). + +## Who should use static embeddings? + +Static embeddings seem to be a budget-friendly option for those who would like to use semantic search in their +applications, but can't afford hosting standard representation models, or cannot do it, i.e. due to hardware +constraints. Some of the use cases might be: + +- **Mobile applications** - although many smartphones have powerful CPUs or even GPUs, the battery life is still a + concern, and the static embeddings might be a good compromise between the quality and the power consumption. Moreover, + the static embeddings can be used in the applications that require offline mode. +- **Web browser extensions** - running a transformer-based model in a web browser is usually not quite an option, but + static embeddings might be a good choice, as they have fewer parameters and are faster to encode. +- **Embedded systems** - the static embeddings might be a good choice for the devices with limited computational power, + such as IoT devices or microcontrollers. + +If you are one of the above, then you should definitely give static embeddings a try. **However, if the search quality +is not the top of your priorities, then you might consider using static embeddings even in the high-performance +environments**. The speedup in the encoding process might be a game-changer for you. + +### Customization of the static embeddings + +Last, but not least. The training pipeline published by [Tom Aarsen](https://www.tomaarsen.com) can help you to train +your own static embeddings models, so **you can adjust it the specifics of your data easily**. This training process +will also be way faster than for a transformer-based model, so you can even retrain it more often. Recomputing the +embeddings is a bottleneck of the semantic search systems, and the static embeddings might be a good solution to this +problem. Whether a custom static embedding model can beat a general pre-trained model remains an open question, but it's +definitely worth trying. -### [Anchor](https://qdrant.tech/articles/minicoil/\#benchmarking) Benchmarking +<|page-200-lllmstxt|> +# OpenLLMetry -The benchmarking code is open-sourced in [this repository](https://github.com/qdrant/mini-coil-demo/tree/master/minicoil_demo). +OpenLLMetry from [Traceloop](https://www.traceloop.com/) is a set of extensions built on top of [OpenTelemetry](https://opentelemetry.io/) that gives you complete observability over your LLM application. -To check our 4D miniCOIL version performance in different domains, we, ironically, chose a subset of the same [BEIR datasets](https://github.com/beir-cellar/beir), high benchmark values on which became an end in itself for many sparse neural retrievers. 
Yet the difference is that **miniCOIL wasn’t trained on BEIR datasets and shouldn’t be biased towards them**. +OpenLLMetry supports instrumenting the `qdrant_client` Python library and exporting the traces to various observability platforms, as described in their [Integrations catalog](https://www.traceloop.com/docs/openllmetry/integrations/introduction#the-integrations-catalog). -We’re testing our 4D miniCOIL model versus [our BM25 implementation](https://huggingface.co/Qdrant/bm25). BEIR datasets are indexed to Qdrant using the following parameters for both methods: +This page assumes you're using `qdrant-client` version 1.7.3 or above. +## Usage -- `k = 1.2`, `b = 0.75` default values recommended to use with BM25 scoring; -- `avg_len` estimated on 50,000 documents from a respective dataset. +To set up OpenLLMetry, follow these steps: -We compare models based on the `NDCG@10` metric, as we’re interested in the ranking performance of miniCOIL compared to BM25. Both retrieve the same subset of indexed documents based on exact matches, but miniCOIL should ideally rank this subset better based on its semantics understanding. +1. Install the SDK: -The result on several domains we tested is the following: +```console +pip install traceloop-sdk +``` -| Dataset | BM25 (NDCG@10) | MiniCOIL (NDCG@10) | -| --- | --- | --- | -| MS MARCO | 0.237 | **0.244** | -| NQ | 0.304 | **0.319** | -| Quora | 0.784 | **0.802** | -| FiQA-2018 | 0.252 | **0.257** | -| HotpotQA | **0.634** | 0.633 | +1. Instantiate the SDK: -We can see miniCOIL performing slightly better than BM25 in four out of five tested domains. It shows that **we’re moving in the right direction**. +```python +from traceloop.sdk import Traceloop -## [Anchor](https://qdrant.tech/articles/minicoil/\#key-takeaways) Key Takeaways +Traceloop.init() +``` -This article describes our attempt to make a lightweight sparse neural retriever that is able to generalize to out-of-domain data. Sparse neural retrieval has a lot of potential, and we hope to see it gain more traction. +You're now tracing your `qdrant_client` usage with OpenLLMetry! -### [Anchor](https://qdrant.tech/articles/minicoil/\#why-is-this-approach-useful) Why is this Approach Useful? +## Without the SDK -This approach to training sparse neural retrievers: +Since Traceloop provides standard OpenTelemetry instrumentations, you can use them as standalone packages. To do so, follow these steps: -1. Doesn’t rely on a relevance objective because it is trained in a self-supervised way, so it doesn’t need labeled datasets to scale. -2. Builds on the proven BM25 formula, simply adding a semantic component to it. -3. Creates lightweight sparse representations that fit into a standard inverted index. -4. Fully reuses the outputs of dense encoders, making it adaptable to different models. This also makes miniCOIL a cheap upgrade for hybrid search solutions. -5. Uses an extremely simple model architecture, with one trainable layer per word in miniCOIL’s vocabulary. This results in very fast training and inference. Also, this word-level training makes it easy to expand miniCOIL’s vocabulary for a specific use case. +1. Install the package: -### [Anchor](https://qdrant.tech/articles/minicoil/\#the-right-tool-for-the-right-job) The Right Tool for the Right Job +```console +pip install opentelemetry-instrumentation-qdrant +``` -When are miniCOIL retrievers applicable? +1. Instantiate the `QdrantInstrumentor`. 
-If you need precise term matching but BM25-based retrieval doesn’t meet your needs, ranking higher documents with words of the right form but the wrong semantical meaning. +```python +from opentelemetry.instrumentation.qdrant import QdrantInstrumentor -Say you’re implementing search in your documentation. In this use case, keywords-based search prevails, but BM25 won’t account for different context-based meanings of these keywords. For example, if you’re searching for a _“data **point**”_ in our documentation, you’d prefer to see _“a **point** is a record in Qdrant”_ ranked higher than _floating **point** precision_, and here miniCOIL-based retrieval is an alternative to consider. +QdrantInstrumentor().instrument() +``` -Additionally, miniCOIL fits nicely as a part of a hybrid search, as it enhances sparse retrieval without any noticeable increase in resource consumption, directly reusing contextual word representations produced by a dense encoder. +## Further Reading -To sum up, miniCOIL should work as if BM25 understood the meaning of words and ranked documents based on this semantic knowledge. It operates only on exact matches, so if you aim for documents semantically similar to the query but expressed in different words, dense encoders are the way to go. +- 📚 OpenLLMetry [API reference](https://www.traceloop.com/docs/api-reference/introduction) +- 📄 [Source Code](https://github.com/traceloop/openllmetry/tree/main/packages/opentelemetry-instrumentation-qdrant) -### [Anchor](https://qdrant.tech/articles/minicoil/\#whats-next) What’s Next? +<|page-201-lllmstxt|> +# OpenLIT -We will continue working on improving our approach – both in-depth, searching for ways to improve the model’s quality, and in-width, extending it to various dense encoders and languages beyond English. +[OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native LLM Application Observability tool and includes OpenTelemetry auto-instrumentation to monitor Qdrant and provide insights to improve database operations and application performance. -And we would love to share this road to usable sparse neural retrieval with you! -##### Was this page useful? +This page assumes you're using `qdrant-client` version 1.7.3 or above. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Usage -Thank you for your feedback! 🙏 +### Step 1: Install OpenLIT -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/miniCOIL.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Open your command line or terminal and run: -On this page: +```bash +pip install openlit +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/miniCOIL.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Step 2: Initialize OpenLIT in your Application +Integrating OpenLIT into LLM applications is straightforward with just **two lines of code**: -× +```python +import openlit -[Powered by](https://qdrant.tech/) +openlit.init() +``` -<|page-122-lllmstxt|> -## vector-search-resource-optimization -- [Articles](https://qdrant.tech/articles/) -- Vector Search Resource Optimization Guide +OpenLIT directs the trace to your console by default. 
To forward telemetry data to an HTTP OTLP endpoint, configure the `otlp_endpoint` parameter or the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +For OpenTelemetry backends requiring authentication, use the `otlp_headers` parameter or the `OTEL_EXPORTER_OTLP_HEADERS` environment variable with the required values. -# Vector Search Resource Optimization Guide +## Further Reading -David Myriel +With the LLM Observability data now being collected by OpenLIT, the next step is to visualize and analyze this data to get insights Qdrant's performance, behavior, and identify areas of improvement. -· +To begin exploring your LLM Application's performance data within the OpenLIT UI, please see the [Quickstart Guide](https://docs.openlit.io/latest/quickstart). -February 09, 2025 +If you want to integrate and send the generated metrics and traces to your existing observability tools like Promethues+Jaeger, Grafana or more, refer to the [Official Documentation for OpenLIT Connections](https://docs.openlit.io/latest/connections/intro) for detailed instructions. -![Vector Search Resource Optimization Guide](https://qdrant.tech/articles_data/vector-search-resource-optimization/preview/title.jpg) +<|page-202-lllmstxt|> +# Airbyte -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#whats-in-this-guide) What’s in This Guide? +[Airbyte](https://airbyte.com/) is an open-source data integration platform that helps you replicate your data +between different systems. It has a [growing list of connectors](https://docs.airbyte.io/integrations) that can +be used to ingest data from multiple sources. Building data pipelines is also crucial for managing the data in +Qdrant, and Airbyte is a great tool for this purpose. -[**Resource Management Strategies:**](https://qdrant.tech/articles/vector-search-resource-optimization/#storage-disk-vs-ram) If you are trying to scale your app on a budget - this is the guide for you. We will show you how to avoid wasting compute resources and get the maximum return on your investment. +Airbyte may take care of the data ingestion from a selected source, while Qdrant will help you to build a search +engine on top of it. There are three supported modes of how the data can be ingested into Qdrant: -[**Performance Improvement Tricks:**](https://qdrant.tech/articles/vector-search-resource-optimization/#configure-indexing-for-faster-searches) We’ll dive into advanced techniques like indexing, compression, and partitioning. Our tips will help you get better results at scale, while reducing total resource expenditure. +* **Full Refresh Sync** +* **Incremental - Append Sync** +* **Incremental - Append + Deduped** -[**Query Optimization Methods:**](https://qdrant.tech/articles/vector-search-resource-optimization/#query-optimization) Improving your vector database setup isn’t just about saving costs. We’ll show you how to build search systems that deliver consistently high precision while staying adaptable. +You can read more about these modes in the [Airbyte documentation](https://docs.airbyte.io/integrations/destinations/qdrant). -* * * +## Prerequisites -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#remember-optimization-is-a-balancing-act) Remember: Optimization is a Balancing Act +Before you start, make sure you have the following: -In this guide, we will show you how to use Qdrant’s features to meet your performance needs. 
-However - there are resource tradeoffs and you can’t have it all. -It is up to you to choose the optimization strategy that best fits your goals. +1. Airbyte instance, either [Open Source](https://airbyte.com/solutions/airbyte-open-source), + [Self-Managed](https://airbyte.com/solutions/airbyte-enterprise), or [Cloud](https://airbyte.com/solutions/airbyte-cloud). +2. Running instance of Qdrant. It has to be accessible by URL from the machine where Airbyte is running. + You can follow the [installation guide](/documentation/guides/installation/) to set up Qdrant. -![optimization](https://qdrant.tech/articles_data/vector-search-resource-optimization/optimization.png) +## Setting up Qdrant as a destination -Let’s take a look at some common goals and optimization strategies: +Once you have a running instance of Airbyte, you can set up Qdrant as a destination directly in the UI. +Airbyte's Qdrant destination is connected with a single collection in Qdrant. -| Intended Result | Optimization Strategy | -| --- | --- | -| [**High Search Precision + Low Memory Expenditure**](https://qdrant.tech/documentation/guides/optimize/#1-high-speed-search-with-low-memory-usage) | [**On-Disk Indexing**](https://qdrant.tech/documentation/guides/optimize/#1-high-speed-search-with-low-memory-usage) | -| [**Low Memory Expenditure + Fast Search Speed**](https://qdrant.tech/documentation/guides/quantization/) | [**Quantization**](https://qdrant.tech/documentation/guides/quantization/) | -| [**High Search Precision + Fast Search Speed**](https://qdrant.tech/documentation/guides/optimize/#3-high-precision-with-high-speed-search) | [**RAM Storage + Quantization**](https://qdrant.tech/documentation/guides/optimize/#3-high-precision-with-high-speed-search) | -| [**Balance Latency vs Throughput**](https://qdrant.tech/documentation/guides/optimize/#balancing-latency-and-throughput) | [**Segment Configuration**](https://qdrant.tech/documentation/guides/optimize/#balancing-latency-and-throughput) | +![Airbyte Qdrant destination](/documentation/frameworks/airbyte/qdrant-destination.png) -After this article, check out the code samples in our docs on [**Qdrant’s Optimization Methods**](https://qdrant.tech/documentation/guides/optimize/). +### Text processing -* * * +Airbyte has some built-in mechanisms to transform your texts into embeddings. You can choose how you want to +chunk your fields into pieces before calculating the embeddings, but also which fields should be used to +create the point payload. -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#configure-indexing-for-faster-searches) Configure Indexing for Faster Searches +![Processing settings](/documentation/frameworks/airbyte/processing.png) -![indexing](https://qdrant.tech/articles_data/vector-search-resource-optimization/index.png) +### Embeddings -A vector index is the central location where Qdrant calculates vector similarity. It is the backbone of your search process, retrieving relevant results from vast amounts of data. +You can choose the model that will be used to calculate the embeddings. Currently, Airbyte supports multiple +models, including OpenAI and Cohere. -Qdrant uses the [**HNSW (Hierarchical Navigable Small World Graph) algorithm**](https://qdrant.tech/documentation/concepts/indexing/#vector-index) as its dense vector index, which is both powerful and scalable. +![Embeddings settings](/documentation/frameworks/airbyte/embedding.png) -**Figure 2:** A sample HNSW vector index with three layers. 
Follow the blue arrow on the top layer to see how a query travels throughout the database index. The closest result is on the bottom level, nearest to the gray query point. +Using some precomputed embeddings from your data source is also possible. In this case, you can pass the field +name containing the embeddings and their dimensionality. -![hnsw](https://qdrant.tech/articles_data/vector-search-resource-optimization/hnsw.png) +![Precomputed embeddings settings](/documentation/frameworks/airbyte/precomputed-embedding.png) -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#vector-index-optimization-parameters) Vector Index Optimization Parameters +### Qdrant connection details -Working with massive datasets that contain billions of vectors demands significant resources—and those resources come with a price. While Qdrant provides reasonable defaults, tailoring them to your specific use case can unlock optimal performance. Here’s what you need to know. +Finally, we can configure the target Qdrant instance and collection. In case you use the built-in authentication +mechanism, here is where you can pass the token. -The following parameters give you the flexibility to fine-tune Qdrant’s performance for your specific workload. You can modify them directly in Qdrant’s [**configuration**](https://qdrant.tech/documentation/guides/configuration/) files or at the collection and named vector levels for more granular control. +![Qdrant connection details](/documentation/frameworks/airbyte/qdrant-config.png) -**Figure 3:** A description of three key HNSW parameters. +Once you confirm creating the destination, Airbyte will test if a specified Qdrant cluster is accessible and +might be used as a destination. -![hnsw-parameters](https://qdrant.tech/articles_data/vector-search-resource-optimization/hnsw-parameters.png) +## Setting up connection -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#1-the-m-parameter-determines-edges-per-node) 1\. The `m` parameter determines edges per node +Airbyte combines sources and destinations into a single entity called a connection. Once you have a destination +configured and a source, you can create a connection between them. It doesn't matter what source you use, as +long as Airbyte supports it. The process is pretty straightforward, but depends on the source you use. -This controls the number of edges in the graph. A higher value enhances search accuracy but demands more memory and build time. Fine-tune this to balance memory usage and precision. +![Airbyte connection](/documentation/frameworks/airbyte/connection.png) -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#2-the-ef_construct-parameter-controls-the-index-build-range) 2\. The `ef_construct` parameter controls the index build range +## Further Reading -This parameter sets how many neighbors are considered during index construction. A larger value improves the accuracy of the index but increases the build time. Use this to customize your indexing speed versus quality. +* [Airbyte documentation](https://docs.airbyte.com/understanding-airbyte/connections/). 
+* [Source Code](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/destination-qdrant) -You need to set both the `m` and `ef parameters` as you create the collection: +<|page-203-lllmstxt|> +# Using Aleph Alpha Embeddings with Qdrant -```python -client.update_collection( - collection_name="{collection_name}", - vectors_config={ - "my_vector": models.VectorParamsDiff( - hnsw_config=models.HnswConfigDiff( - m=32, - ef_construct=123, - ), - ), - } -) +Aleph Alpha is a multimodal and multilingual embeddings' provider. Their API allows creating the embeddings for text and images, both +in the same latent space. They maintain an [official Python client](https://github.com/Aleph-Alpha/aleph-alpha-client) that might be +installed with pip: +```bash +pip install aleph-alpha-client ``` -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#3-the-ef-parameter-updates-vector-search-range) 3\. The `ef` parameter updates vector search range - -This determines how many neighbors are evaluated during a search query. You can adjust this to balance query speed and accuracy. - -The `ef` parameter is configured during the search process: +There is both synchronous and asynchronous client available. Obtaining the embeddings for an image and storing it into Qdrant might +be done in the following way: ```python -client.query_points( - collection_name="{collection_name}", - query=[...] - search_params=models.SearchParams(hnsw_ef=128, exact=False), -) - -``` - -* * * - -These are just the basics of HNSW. Learn More about [**Indexing**](https://qdrant.tech/documentation/concepts/indexing/). - -* * * - -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#data-compression-techniques) Data Compression Techniques +import qdrant_client +from qdrant_client.models import Batch -![compression](https://qdrant.tech/articles_data/vector-search-resource-optimization/compress.png) +from aleph_alpha_client import ( + Prompt, + AsyncClient, + SemanticEmbeddingRequest, + SemanticRepresentation, + ImagePrompt +) -Efficient data compression is a cornerstone of resource optimization in vector databases. By reducing memory usage, you can achieve faster query performance without sacrificing too much accuracy. +aa_token = "<< your_token >>" +model = "luminous-base" -One powerful technique is [**quantization**](https://qdrant.tech/documentation/guides/quantization/), which transforms high-dimensional vectors into compact representations while preserving relative similarity. Let’s explore the quantization options available in Qdrant. +qdrant_client = qdrant_client.QdrantClient() +async with AsyncClient(token=aa_token) as client: + prompt = ImagePrompt.from_file("./path/to/the/image.jpg") + prompt = Prompt.from_image(prompt) -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#scalar-quantization) Scalar Quantization + query_params = { + "prompt": prompt, + "representation": SemanticRepresentation.Symmetric, + "compress_to_size": 128, + } + query_request = SemanticEmbeddingRequest(**query_params) + query_response = await client.semantic_embed( + request=query_request, model=model + ) + + qdrant_client.upsert( + collection_name="MyCollection", + points=Batch( + ids=[1], + vectors=[query_response.embedding], + ) + ) +``` -Scalar quantization strikes an excellent balance between compression and performance, making it the go-to choice for most use cases. 
+If we wanted to create text embeddings with the same model, we wouldn't use `ImagePrompt.from_file`, but simply provide the input +text into the `Prompt.from_text` method. -This method minimizes the number of bits used to represent each vector component. For instance, Qdrant compresses 32-bit floating-point values ( **float32**) into 8-bit unsigned integers ( **uint8**), slashing memory usage by an impressive 75%. +<|page-204-lllmstxt|> +# Apache Airflow -**Figure 4:** The top example shows a float32 vector with a size of 40 bytes. Converting it to int8 format reduces its size by a factor of four, while maintaining approximate similarity relationships between vectors. The loss in precision compared to the original representation is typically negligible for most practical applications. +[Apache Airflow](https://airflow.apache.org/) is an open-source platform for authoring, scheduling and monitoring data and computing workflows. Airflow uses Python to create workflows that can be easily scheduled and monitored. -![scalar-quantization](https://qdrant.tech/articles_data/vector-search-resource-optimization/scalar-quantization.png) +Qdrant is available as a [provider](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/stable/index.html) in Airflow to interface with the database. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#benefits-of-scalar-quantization) Benefits of Scalar Quantization: +## Prerequisites -| Benefit | Description | -| --- | --- | -| **Memory usage will drop** | Compression cuts memory usage by a factor of 4. Qdrant compresses 32-bit floating-point values (float32) into 8-bit unsigned integers (uint8). | -| **Accuracy loss is minimal** | Converting from float32 to uint8 introduces a small loss in precision. Typical error rates remain below 1%, making this method highly efficient. | -| **Best for specific use cases** | To be used with high-dimensional vectors where minor accuracy losses are acceptable. | +Before configuring Airflow, you need: -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#set-it-up-as-you-create-the-collection) Set it up as you create the collection: +1. A Qdrant instance to connect to. You can set one up in our [installation guide](/documentation/guides/installation/). -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - quantile=0.99, - always_ram=True, - ), - ), -) +2. A running Airflow instance. You can use their [Quick Start Guide](https://airflow.apache.org/docs/apache-airflow/stable/start.html). -``` +## Installation -When working with Qdrant, you can fine-tune the quantization configuration to optimize precision, memory usage, and performance. Here’s what the key configuration options include: +You can install the Qdrant provider by running `pip install apache-airflow-providers-qdrant` in your Airflow shell. -| Configuration Option | Description | -| --- | --- | -| `type` | Specifies the quantized vector type (currently supports only int8). | -| `quantile` | Sets bounds for quantization, excluding outliers. For example, 0.99 excludes the top 1% of extreme values to maintain better accuracy. | -| `always_ram` | Keeps quantized vectors in RAM to speed up searches. 
| +**NOTE**: You'll have to restart your Airflow session for the provider to be available. -Adjust these settings to strike the right balance between precision and efficiency for your specific workload. +## Setting up a connection -* * * +Open the `Admin-> Connections` section of the Airflow UI. Click the `Create` link to create a new [Qdrant connection](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/stable/connections.html). -Learn More about [**Scalar Quantization**](https://qdrant.tech/documentation/guides/quantization/) +![Qdrant connection](/documentation/frameworks/airflow/connection.png) -* * * +You can also set up a connection using [environment variables](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html#environment-variables-connections) or an [external secret backend](https://airflow.apache.org/docs/apache-airflow/stable/security/secrets/secrets-backend/index.html). -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#binary-quantization) Binary Quantization +## Qdrant hook -**Binary quantization** takes scalar quantization to the next level by compressing each vector component into just **a single bit**. This method achieves unparalleled memory efficiency and query speed, reducing memory usage by a factor of 32 and enabling searches up to 40x faster. +An Airflow hook is an abstraction of a specific API that allows Airflow to interact with an external system. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#benefits-of-binary-quantization)**Benefits of Binary Quantization:** +```python +from airflow.providers.qdrant.hooks.qdrant import QdrantHook -Binary quantization is ideal for large-scale datasets and compatible embedding models, where compression and speed are paramount. +hook = QdrantHook(conn_id="qdrant_connection") -**Figure 5:** This method causes maximum compression. It reduces memory usage by 32x and speeds up searches by up to 40x. +hook.verify_connection() +``` -![binary-quantization](https://qdrant.tech/articles_data/vector-search-resource-optimization/binary-quantization.png) +A [`qdrant_client#QdrantClient`](https://pypi.org/project/qdrant-client/) instance is available via `@property conn` of the `QdrantHook` instance for use within your Airflow workflows. -| Benefit | Description | -| --- | --- | -| **Efficient similarity calculations** | Emulates Hamming distance through dot product comparisons, making it fast and effective. | -| **Perfect for high-dimensional vectors** | Works well with embedding models like OpenAI’s text-embedding-ada-002 or Cohere’s embed-english-v3.0. | -| **Precision management** | Consider rescoring or oversampling to offset precision loss. | +```python +from qdrant_client import models -Here’s how you can enable binary quantization in Qdrant: +hook.conn.count("") -```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ), - ), +hook.conn.upsert( + "", + points=[ + models.PointStruct(id=32, vector=[0.32, 0.12, 0.123], payload={"color": "red"}) + ], ) ``` -> By default, quantized vectors load like original vectors unless you set `always_ram` to `True` for instant access and faster queries. 
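Where that precision drop matters, rescoring and oversampling can be requested at query time, trading a little speed back for accuracy. A minimal sketch, using the same placeholder collection name as the examples above and an illustrative query vector:

```python
client.query_points(
    collection_name="{collection_name}",
    query=[0.2, 0.1, 0.9, 0.7],
    search_params=models.SearchParams(
        quantization=models.QuantizationSearchParams(
            rescore=True,      # re-evaluate top candidates with the original vectors
            oversampling=2.0,  # fetch 2x candidates from the quantized index before rescoring
        ),
    ),
)
```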
- -* * * - -Learn more about [**Binary Quantization**](https://qdrant.tech/documentation/guides/quantization/) - -* * * - -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#scaling-the-database) Scaling the Database - -![sharding](https://qdrant.tech/articles_data/vector-search-resource-optimization/shards.png) - -Efficiently managing large datasets in distributed systems like Qdrant requires smart strategies for data isolation. **Multitenancy** and **Sharding** are essential tools to help you handle high volumes of user-specific data while maintaining performance and scalability. - -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#multitenancy) Multitenancy - -**Multitenancy** is a software architecture where multiple independent users (or tenants) share the same resources or environment. In Qdrant, a single collection with logical partitioning is often the most efficient setup for multitenant use cases. - -**Figure 5:** Each individual vector is assigned a specific payload that denotes which tenant it belongs to. This is how a large number of different tenants can share a single Qdrant collection. - -![multitenancy](https://qdrant.tech/articles_data/vector-search-resource-optimization/multitenancy.png) +## Qdrant Ingest Operator -**Why Choose Multitenancy?** - -- **Logical Isolation**: Ensures each tenant’s data remains separate while residing in the same collection. -- **Minimized Overhead**: Reduces resource consumption compared to maintaining separate collections for each user. -- **Scalability**: Handles high user volumes without compromising performance. - -Here’s how you can implement multitenancy efficiently in Qdrant: +The Qdrant provider also provides a convenience operator for uploading data to a Qdrant collection that internally uses the Qdrant hook. ```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="group_id", - field_schema=models.KeywordIndexParams( - type="keyword", - is_tenant=True, - ), -) - -``` - -Creating a keyword payload index, with the `is_tenant` parameter set to `True`, modifies the way the vectors will be logically stored. Storage structure will be organized to co-locate vectors of the same tenant together. +from airflow.providers.qdrant.operators.qdrant import QdrantIngestOperator -Now, each point stored in Qdrant should have the `group_id` payload attribute set: +vectors = [ + [0.11, 0.22, 0.33, 0.44], + [0.55, 0.66, 0.77, 0.88], + [0.88, 0.11, 0.12, 0.13], +] +ids = [32, 21, "b626f6a9-b14d-4af9-b7c3-43d8deb719a6"] +payload = [{"meta": "data"}, {"meta": "data_2"}, {"meta": "data_3", "extra": "data"}] -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={"group_id": "user_1"},\ - vector=[0.9, 0.1, 0.1],\ - ),\ -\ - models.PointStruct(\ - id=2,\ - payload={"group_id": "user_2"},\ - vector=[0.5, 0.9, 0.4],\ - )\ - ] +QdrantIngestOperator( + conn_id="qdrant_connection", + task_id="qdrant_ingest", + collection_name="", + vectors=vectors, + ids=ids, + payload=payload, ) - ``` -* * * - -To ensure proper data isolation in a multitenant environment, you can assign a unique identifier, such as a **group\_id**, to each vector. This approach ensures that each user’s data remains segregated, allowing users to access only their own data. You can further enhance this setup by applying filters during queries to restrict access to the relevant data. 
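A tenant-scoped query might look like the following sketch, reusing the `group_id` payload field from the example above (the query vector is illustrative):

```python
client.query_points(
    collection_name="{collection_name}",
    query=[0.9, 0.1, 0.1],
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="group_id",
                match=models.MatchValue(value="user_1"),
            )
        ]
    ),
)
```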
- -* * * +## Reference +- 📩 [Provider package PyPI](https://pypi.org/project/apache-airflow-providers-qdrant/) +- 📚 [Provider docs](https://airflow.apache.org/docs/apache-airflow-providers-qdrant/stable/index.html) +- 📄 [Source Code](https://github.com/apache/airflow/tree/main/providers/qdrant) -Learn More about [**Multitenancy**](https://qdrant.tech/documentation/guides/multiple-partitions/) +<|page-205-lllmstxt|> +# Apache Spark -* * * +[Spark](https://spark.apache.org/) is a distributed computing framework designed for big data processing and analytics. The [Qdrant-Spark connector](https://github.com/qdrant/qdrant-spark) enables Qdrant to be a storage destination in Spark. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#sharding) Sharding +## Installation -Sharding is a critical strategy in Qdrant for splitting collections into smaller units, called **shards**, to efficiently distribute data across multiple nodes. It’s a powerful tool for improving scalability and maintaining performance in large-scale systems. +To integrate the connector into your Spark environment, get the JAR file from one of the sources listed below. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#user-defined-sharding) User-Defined Sharding: +- GitHub Releases -**User-Defined Sharding** allows you to take control of data placement by specifying a shard key. This feature is particularly useful in multi-tenant setups, as it enables the isolation of each tenant’s data within separate shards, ensuring better organization and enhanced data security. - -**Figure 6:** Users can both upsert and query shards that are relevant to them, all within the same collection. Regional sharding can help avoid cross-continental traffic. - -![user-defined-sharding](https://qdrant.tech/articles_data/vector-search-resource-optimization/user-defined-sharding.png) +The packaged `jar` file with all the required dependencies can be found [here](https://github.com/qdrant/qdrant-spark/releases). -**Example:** +- Building from Source -```python -client.create_collection( - collection_name="my_custom_sharded_collection", - shard_number=1, - sharding_method=models.ShardingMethod.CUSTOM -) -client.create_shard_key("my_custom_sharded_collection", "tenant_id") +To build the `jar` from source, you need [JDK@8](https://www.azul.com/downloads/#zulu) and [Maven](https://maven.apache.org/) installed. Once the requirements have been satisfied, run the following command in the [project root](https://github.com/qdrant/qdrant-spark). +```bash +mvn package -DskipTests ``` -* * * - -When implementing user-defined sharding in Qdrant, two key parameters are critical to achieving efficient data distribution: - -1. **Shard Key**: - -The shard key determines how data points are distributed across shards. For example, using a key like `tenant_id` allows you to control how Qdrant partitions the data. Each data point added to the collection will be assigned to a shard based on the value of this key, ensuring logical isolation of data. +The JAR file will be written into the `target` directory by default. -2. **Shard Number**: +- Maven Central -This defines the total number of physical shards for each shard key, influencing resource allocation and query performance. +Find the project on Maven Central [here](https://central.sonatype.com/artifact/io.qdrant/spark). 
+## Usage -Here’s how you can add a data point to a collection with user-defined sharding: +### Creating a Spark session with Qdrant support ```python -client.upsert( - collection_name="my_custom_sharded_collection", - points=[\ - models.PointStruct(\ - id=1111,\ - vector=[0.1, 0.2, 0.3]\ - )\ - ], - shard_key_selector="tenant_1" -) +from pyspark.sql import SparkSession +spark = SparkSession.builder.config( + "spark.jars", + "path/to/file/spark-VERSION.jar", # Specify the path to the downloaded JAR file + ) + .master("local[*]") + .appName("qdrant") + .getOrCreate() ``` -* * * - -This code assigns the point to a specific shard based on the `tenant_1` shard key, ensuring proper data placement. +```scala +import org.apache.spark.sql.SparkSession -Here’s how to choose the shard\_number: +val spark = SparkSession.builder + .config("spark.jars", "path/to/file/spark-VERSION.jar") // Specify the path to the downloaded JAR file + .master("local[*]") + .appName("qdrant") + .getOrCreate() +``` -| Recommendation | Description | -| --- | --- | -| **Match Shards to Nodes** | The number of shards should align with the number of nodes in your cluster to balance resource utilization and query performance. | -| **Plan for Scalability** | Start with at least **2 shards per node** to allow room for future growth. | -| **Future-Proofing** | Starting with around **12 shards** is a good rule of thumb. This setup allows your system to scale seamlessly from 1 to 12 nodes without requiring re-sharding. | +```java +import org.apache.spark.sql.SparkSession; + +public class QdrantSparkJavaExample { + public static void main(String[] args) { + SparkSession spark = SparkSession.builder() + .config("spark.jars", "path/to/file/spark-VERSION.jar") // Specify the path to the downloaded JAR file + .master("local[*]") + .appName("qdrant") + .getOrCreate(); + } +} +``` -Learn more about [**Sharding in Distributed Deployment**](https://qdrant.tech/documentation/guides/distributed_deployment/) +### Loading data -* * * +Before loading the data using this connector, a collection has to be [created](https://qdrant.tech/documentation/concepts/collections/#create-a-collection) in advance with the appropriate vector dimensions and configurations. -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#query-optimization) Query Optimization +The connector supports ingesting multiple named/unnamed, dense/sparse vectors. -![qdrant](https://qdrant.tech/articles_data/vector-search-resource-optimization/query.png) -Improving vector database performance is critical when dealing with large datasets and complex queries. By leveraging techniques like **filtering**, **batch processing**, **reranking**, **rescoring**, and **oversampling**, so you can ensure fast response times and maintain efficiency even at scale. +_Click each to expand._ -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#filtering) Filtering +
+ Unnamed/Default vector -Filtering allows you to select only the required fields in your query results. By limiting the output size, you can significantly reduce response time and improve performance. +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", ) + .option("collection_name", ) + .option("embedding_field", ) # Expected to be a field of type ArrayType(FloatType) + .option("schema", .schema.json()) + .mode("append") + .save() +``` -The filterable vector index is Qdrant’s solves pre and post-filtering problems by adding specialized links to the search graph. It aims to maintain the speed advantages of vector search while allowing for precise filtering, addressing the inefficiencies that can occur when applying filters after the vector search. +
-**Example:** +
+ Named vector ```python -results = client.search( - collection_name="my_collection", - query_vector=[0.1, 0.2, 0.3], - query_filter=models.Filter(must=[\ - models.FieldCondition(\ - key="category",\ - match=models.MatchValue(value="my-category-name"),\ - )\ - ]), - limit=10, -) - + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", ) + .option("collection_name", ) + .option("embedding_field", ) # Expected to be a field of type ArrayType(FloatType) + .option("vector_name", ) + .option("schema", .schema.json()) + .mode("append") + .save() ``` -**Figure 7:** The filterable vector index adds specialized links to the search graph to speed up traversal. - -![filterable-vector-index](https://qdrant.tech/articles_data/vector-search-resource-optimization/filterable-vector-index.png) - -[**Filterable vector index**](https://qdrant.tech/documentation/concepts/indexing/): This technique builds additional links **(orange)** between leftover data points. The filtered points which stay behind are now traversible once again. Qdrant uses special category-based methods to connect these data points. +> #### NOTE +> +> The `embedding_field` and `vector_name` options are maintained for backward compatibility. It is recommended to use `vector_fields` and `vector_names` for named vectors as shown below. -* * * +
-Read more about [**Filtering Docs**](https://qdrant.tech/documentation/concepts/filtering/) and check out the [**Complete Filtering Guide**](https://qdrant.tech/articles/vector-search-filtering/). +
+ Multiple named vectors -* * * +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("vector_fields", ",") + .option("vector_names", ",") + .option("schema", .schema.json()) + .mode("append") + .save() +``` -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#batch-processing) Batch Processing +
-Batch processing consolidates multiple operations into a single execution cycle, reducing request overhead and enhancing throughput. It’s an effective strategy for both data insertion and query execution. +
+ Sparse vectors -![batch-processing](https://qdrant.tech/articles_data/vector-search-resource-optimization/batch-processing.png) +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("sparse_vector_value_fields", "") + .option("sparse_vector_index_fields", "") + .option("sparse_vector_names", "") + .option("schema", .schema.json()) + .mode("append") + .save() +``` -**Batch Insertions**: Instead of inserting vectors individually, group them into medium-sized batches to minimize the number of database requests and the overhead of frequent writes. +
-**Example:** +
+ Multiple sparse vectors ```python -vectors = [\ - [.1, .0, .0, .0],\ - [.0, .1, .0, .0],\ - [.0, .0, .1, .0],\ - [.0, .0, .0, .1],\ - 
\ -] -client.upload_collection( - collection_name="test_collection", - vectors=vectors, -) - + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("sparse_vector_value_fields", ",") + .option("sparse_vector_index_fields", ",") + .option("sparse_vector_names", ",") + .option("schema", .schema.json()) + .mode("append") + .save() ``` -This reduces write operations and ensures faster data ingestion. - -**Batch Queries**: Similarly, you can batch multiple queries together rather than executing them one by one. This reduces the number of round trips to the database, optimizing performance and reducing latency. +
-**Example:** +
+ Combination of named dense and sparse vectors ```python -results = client.search_batch( - collection_name="test_collection", - requests=[\ - SearchRequest(\ - vector=[0., 0., 2., 0.],\ - limit=1,\ - ),\ - SearchRequest(\ - vector=[0., 0., 0., 0.01],\ - with_vector=True,\ - limit=2,\ - )\ - ] -) - + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("vector_fields", ",") + .option("vector_names", ",") + .option("sparse_vector_value_fields", ",") + .option("sparse_vector_index_fields", ",") + .option("sparse_vector_names", ",") + .option("schema", .schema.json()) + .mode("append") + .save() ``` -Batch queries are particularly useful when processing a large number of similar queries or when handling multiple user requests simultaneously. - -* * * +
-#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#hybrid-search) Hybrid Search +
+ Multi-vectors -Hybrid search combines **keyword filtering** with **vector similarity search**, enabling faster and more precise results. Keywords help narrow down the dataset quickly, while vector similarity ensures semantic accuracy. This search method combines [**dense and sparse vectors**](https://qdrant.tech/documentation/concepts/vectors/). - -Hybrid search in Qdrant uses both fusion and reranking. The former is about combining the results from different search methods, based solely on the scores returned by each method. That usually involves some normalization, as the scores returned by different methods might be in different ranges. +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("multi_vector_fields", "") + .option("multi_vector_names", "") + .option("schema", .schema.json()) + .mode("append") + .save() +``` -**Figure 8**: Hybrid Search Architecture +
-![hybrid-search](https://qdrant.tech/articles_data/vector-search-resource-optimization/hybrid-search.png) +
+ Multiple Multi-vectors -After that, there is a formula that takes the relevancy measures and calculates the final score that we use later on to reorder the documents. Qdrant has built-in support for the Reciprocal Rank Fusion method, which is the de facto standard in the field. +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("multi_vector_fields", ",") + .option("multi_vector_names", ",") + .option("schema", .schema.json()) + .mode("append") + .save() +``` -* * * +
-Learn more about [**Hybrid Search**](https://qdrant.tech/articles/hybrid-search/) and read out [**Hybrid Queries docs**](https://qdrant.tech/documentation/concepts/hybrid-queries/). +
+ No vectors - Entire dataframe is stored as payload -* * * +```python + + .write + .format("io.qdrant.spark.Qdrant") + .option("qdrant_url", "") + .option("collection_name", "") + .option("schema", .schema.json()) + .mode("append") + .save() +``` -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#oversampling) Oversampling +
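Putting the options together, a minimal end-to-end write could look like the sketch below. It assumes the connector JAR is already on the session classpath (as shown in the Usage section), that a collection named `my_collection` already exists with 4-dimensional vectors, and that Qdrant's gRPC endpoint is reachable on port 6334; all names and values are illustrative.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StringType, StructField, StructType

# Assumes the connector JAR is already configured via spark.jars / spark.jars.packages.
spark = SparkSession.builder.master("local[*]").appName("qdrant").getOrCreate()

# A toy dataframe: one payload column plus an embedding column of ArrayType(FloatType).
schema = StructType([
    StructField("title", StringType(), True),
    StructField("embedding", ArrayType(FloatType()), True),
])
df = spark.createDataFrame(
    [
        ("first document", [0.1, 0.2, 0.3, 0.4]),
        ("second document", [0.5, 0.6, 0.7, 0.8]),
    ],
    schema,
)

# "my_collection" must already exist with 4-dimensional vectors;
# the connector talks to Qdrant over gRPC (port 6334 by default).
df.write \
    .format("io.qdrant.spark.Qdrant") \
    .option("qdrant_url", "http://localhost:6334") \
    .option("collection_name", "my_collection") \
    .option("embedding_field", "embedding") \
    .option("schema", df.schema.json()) \
    .mode("append") \
    .save()
```

Every column that is not consumed as a vector or ID ends up in the point payload, which is why the dataframe schema is passed along as JSON.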
-Oversampling is a technique that helps compensate for any precision lost due to quantization. Since quantization simplifies vectors, some relevant matches could be missed in the initial search. To avoid this, you can **retrieve more candidates**, increasing the chances that the most relevant vectors make it into the final results. +## Databricks -You can control the number of extra candidates by setting an `oversampling` parameter. For example, if your desired number of results ( `limit`) is 4 and you set an `oversampling` factor of 2, Qdrant will retrieve 8 candidates (4 × 2). + -You can adjust the oversampling factor to control how many extra vectors Qdrant includes in the initial pool. More candidates mean a better chance of obtaining high-quality top-K results, especially after rescoring with the original vectors. +You can use the `qdrant-spark` connector as a library in [Databricks](https://www.databricks.com/). -* * * +- Go to the `Libraries` section in your Databricks cluster dashboard. +- Select `Install New` to open the library installation modal. +- Search for `io.qdrant:spark:VERSION` in the Maven packages and click `Install`. -Learn more about [**Oversampling**](https://qdrant.tech/articles/what-is-vector-quantization/#2-oversampling). +![Databricks](/documentation/frameworks/spark/databricks.png) -* * * +## Datatype support -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#rescoring) Rescoring +The appropriate Spark data types are mapped to the Qdrant payload based on the provided `schema`. -After oversampling to gather more potential matches, each candidate is re-evaluated based on additional criteria to ensure higher accuracy and relevance to the query. +## Options and Spark types -The rescoring process maps the quantized vectors to their corresponding original vectors, allowing you to consider factors like context, metadata, or additional relevance that wasn’t included in the initial search, leading to more accurate results. +| Option | Description | Column DataType | Required | +| :--------------------------- | :----------------------------------------------------------------------------------- | :-------------------------------- | :------- | +| `qdrant_url` | gRPC URL of the Qdrant instance. Eg: | - | ✅ | +| `collection_name` | Name of the collection to write data into | - | ✅ | +| `schema` | JSON string of the dataframe schema | - | ✅ | +| `embedding_field` | Name of the column holding the embeddings (Deprecated - Use `vector_fields` instead) | `ArrayType(FloatType)` | ❌ | +| `id_field` | Name of the column holding the point IDs. Default: Random UUID | `StringType` or `IntegerType` | ❌ | +| `batch_size` | Max size of the upload batch. Default: 64 | - | ❌ | +| `retries` | Number of upload retries. Default: 3 | - | ❌ | +| `api_key` | Qdrant API key for authentication | - | ❌ | +| `vector_name` | Name of the vector in the collection. | - | ❌ | +| `vector_fields` | Comma-separated names of columns holding the vectors. | `ArrayType(FloatType)` | ❌ | +| `vector_names` | Comma-separated names of vectors in the collection. | - | ❌ | +| `sparse_vector_index_fields` | Comma-separated names of columns holding the sparse vector indices. | `ArrayType(IntegerType)` | ❌ | +| `sparse_vector_value_fields` | Comma-separated names of columns holding the sparse vector values. | `ArrayType(FloatType)` | ❌ | +| `sparse_vector_names` | Comma-separated names of the sparse vectors in the collection. 
| - | ❌ | +| `multi_vector_fields` | Comma-separated names of columns holding the multi-vector values. | `ArrayType(ArrayType(FloatType))` | ❌ | +| `multi_vector_names` | Comma-separated names of the multi-vectors in the collection. | - | ❌ | +| `shard_key_selector` | Comma-separated names of custom shard keys to use during upsert. | - | ❌ | +| `wait` | Wait for each batch upsert to complete. `true` or `false`. Defaults to `true`. | - | ❌ | -**Example of Rescoring and Oversampling:**: +For more information, be sure to check out the [Qdrant-Spark GitHub repository](https://github.com/qdrant/qdrant-spark). The Apache Spark guide is available [here](https://spark.apache.org/docs/latest/quick-start.html). Happy data processing! -```python -client.query_points( - collection_name="my_collection", - query_vector=[0.22, -0.01, -0.98, 0.37], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - rescore=True, # Enables rescoring with original vectors - oversampling=2 # Retrieves extra candidates for rescoring - ) - ), - limit=4 # Desired number of final results -) +<|page-206-lllmstxt|> +# Apify -``` +[Apify](https://apify.com/) is a web scraping and browser automation platform featuring an [app store](https://apify.com/store) with over 1,500 pre-built micro-apps known as Actors. These serverless cloud programs, which are essentially dockers under the hood, are designed for various web automation applications, including data collection. -* * * +One such Actor, built especially for AI and RAG applications, is [Website Content Crawler](https://apify.com/apify/website-content-crawler). -Learn more about [**Rescoring**](https://qdrant.tech/articles/what-is-vector-quantization/#3-rescoring-with-original-vectors). +It's ideal for this purpose because it has built-in HTML processing and data-cleaning functions. That means you can easily remove fluff, duplicates, and other things on a web page that aren't relevant, and provide only the necessary data to the language model. -* * * +The Markdown can then be used to feed Qdrant to train AI models or supply them with fresh web content. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#reranking) Reranking +Qdrant is available as an [official integration](https://apify.com/apify/qdrant-integration) to load Apify datasets into a collection. -Reranking adjusts the order of search results based on additional criteria, ensuring the most relevant results are prioritized. +You can refer to the [Apify documentation](https://docs.apify.com/platform/integrations/qdrant) to set up the integration via the Apify UI. -This method is about taking the results from different search methods and reordering them based on some additional processing using the content of the documents, not just the scores. This processing may rely on an additional neural model, such as a cross-encoder which would be inefficient enough to be used on the whole dataset. +## Programmatic Usage -![reranking](https://qdrant.tech/articles_data/vector-search-resource-optimization/reranking.png) +Apify also supports programmatic access to integrations via the [Apify Python SDK](https://docs.apify.com/sdk/python/). -These methods are practically applicable only when used on a smaller subset of candidates returned by the faster search methods. Late interaction models, such as ColBERT, are way more efficient in this case, as they can be used to rerank the candidates without the need to access all the documents in the collection. +1. 
Install the Apify Python SDK by running the following command: -**Example:** + ```sh + pip install apify-client + ``` -```python -client.query_points( - "collection-name", - prefetch=prefetch, # Previous results - query=late_vectors, # Colbert converted query - using="colbertv2.0", - with_payload=True, - limit=10, -) +2. Create a Python script and import all the necessary modules: -``` + ```python + from apify_client import ApifyClient -* * * + APIFY_API_TOKEN = "YOUR-APIFY-TOKEN" + OPENAI_API_KEY = "YOUR-OPENAI-API-KEY" + # COHERE_API_KEY = "YOUR-COHERE-API-KEY" -Learn more about [**Reranking**](https://qdrant.tech/documentation/search-precision/reranking-hybrid-search/#rerank). + QDRANT_URL = "YOUR-QDRANT-URL" + QDRANT_API_KEY = "YOUR-QDRANT-API-KEY" -* * * + client = ApifyClient(APIFY_API_TOKEN) + ``` -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#storage-disk-vs-ram) Storage: Disk vs RAM +3. Call the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor to crawl the Qdrant documentation and extract text content from the web pages: -![disk](https://qdrant.tech/articles_data/vector-search-resource-optimization/disk.png) + ```python + actor_call = client.actor("apify/website-content-crawler").call( + run_input={"startUrls": [{"url": "https://qdrant.tech/documentation/"}]} + ) + ``` + +4. Call the Qdrant integration and store all data in the Qdrant Vector Database: + + ```python + qdrant_integration_inputs = { + "qdrantUrl": QDRANT_URL, + "qdrantApiKey": QDRANT_API_KEY, + "qdrantCollectionName": "apify", + "qdrantAutoCreateCollection": True, + "datasetId": actor_call["defaultDatasetId"], + "datasetFields": ["text"], + "enableDeltaUpdates": True, + "deltaUpdatesPrimaryDatasetFields": ["url"], + "expiredObjectDeletionPeriodDays": 30, + "embeddingsProvider": "OpenAI", # "Cohere" + "embeddingsApiKey": OPENAI_API_KEY, + "performChunking": True, + "chunkSize": 1000, + "chunkOverlap": 0, + } + actor_call = client.actor("apify/qdrant-integration").call(run_input=qdrant_integration_inputs) -| Storage | Description | -| --- | --- | -| **RAM** | Crucial for fast access to frequently used data, such as indexed vectors. The amount of RAM required can be estimated based on your dataset size and dimensionality. For example, storing **1 million vectors with 1024 dimensions** would require approximately **5.72 GB of RAM**. | -| **Disk** | Suitable for less frequently accessed data, such as payloads and non-critical information. Disk-backed storage reduces memory demands but can introduce slight latency. | + ``` -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#which-disk-type) Which Disk Type? +Upon running the script, the data from will be scraped, transformed into vector embeddings and stored in the Qdrant collection. -**Local SSDs** are recommended for optimal performance, as they provide the fastest query response times with minimal latency. While network-attached storage is also viable, it typically introduces additional latency that can affect performance, so local SSDs are preferred when possible, particularly for workloads requiring high-speed random access. 
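As a quick sanity check after the integration run, you can inspect the target collection directly with the Qdrant Python client. This is a hedged sketch that reuses the `QDRANT_URL` and `QDRANT_API_KEY` constants from step 2 and the `apify` collection name from the integration inputs above.

```python
from qdrant_client import QdrantClient

qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# How many points did the integration write?
print(qdrant.count(collection_name="apify"))

# Peek at a few stored payloads (the crawled text ends up in the payload).
points, _next_offset = qdrant.scroll(collection_name="apify", limit=3, with_payload=True)
for point in points:
    print(point.payload)
```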
+## Further Reading -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#memory-management-for-vectors-and-payload) Memory Management for Vectors and Payload +- Apify [Documentation](https://docs.apify.com/) +- Apify [Templates](https://apify.com/templates) +- Integration [Source Code](https://github.com/apify/actor-vector-database-integrations) -As your data scales, effective resource management becomes crucial to keeping costs low while ensuring your application remains reliable and performant. One of the key areas to focus on is **memory management**. +<|page-207-lllmstxt|> +# Microsoft Autogen -Understanding how Qdrant handles memory can help you make informed decisions about scaling your vector database. Qdrant supports two main methods for storing vectors: +[AutoGen](https://github.com/microsoft/autogen/tree/0.2) is an open-source programming framework for building AI agents and facilitating cooperation among multiple agents to solve tasks. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#1-in-memory-storage) 1\. In-Memory Storage +- Multi-agent conversations: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM. -- **How it works**: All data is stored in RAM, providing the fastest access times for queries and operations. -- **When to use it**: This setup is ideal for applications where performance is critical, and your RAM capacity can accommodate all data. -- **Advantages**: Maximum speed for queries and updates. -- **Limitations**: RAM usage can become a bottleneck as your dataset grows. +- Customization: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ. -#### [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#2-memmap-storage) 2\. Memmap Storage +- Human participation: AutoGen allows human participation. This means that humans can provide input and feedback to the agents as needed. -- **How it works**: Instead of loading all data into memory, memmap storage maps data files directly to a virtual address space on disk. The system’s page cache handles data access, making it highly efficient. -- **When to use it**: Perfect for storing large collections that exceed your available RAM while still maintaining near in-memory performance when enough RAM is available. -- **Advantages**: Balances performance and memory usage, allowing you to work with datasets larger than your physical RAM. -- **Limitations**: Slightly slower than pure in-memory storage but significantly more scalable. +With the [Autogen-Qdrant integration](https://microsoft.github.io/autogen/0.2/docs/reference/agentchat/contrib/vectordb/qdrant/), you build Autogen workflows backed by Qdrant't performant retrievals. + +## Installation + +```bash +pip install "autogen-agentchat[retrievechat-qdrant]" +``` -To enable memmap vector storage in Qdrant, you can set the **on\_disk** parameter to `true` when creating or updating a collection. +## Usage + +#### Configuration ```python -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - 
 - on_disk=True - ) -) +import autogen +config_list = autogen.config_list_from_json("OAI_CONFIG_LIST") ``` -To do the same for payloads: +The `config_list_from_json` function first looks for the environment variable `OAI_CONFIG_LIST` which needs to be a valid JSON string. If not found, it then looks for a JSON file named `OAI_CONFIG_LIST`. A sample file can be found [here](https://github.com/microsoft/autogen/blob/0.2/OAI_CONFIG_LIST_sample). + +#### Construct agents for RetrieveChat + +We start by initializing the RetrieveAssistantAgent and QdrantRetrieveUserProxyAgent. The system message needs to be set to "You are a helpful assistant." for RetrieveAssistantAgent. The detailed instructions are given in the user message. ```python -client.create_collection( - collection_name="{collection_name}", - on_disk_payload= True +from qdrant_client import QdrantClient +from sentence_transformers import SentenceTransformer + +from autogen import AssistantAgent +from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent + +# 1. Create an AssistantAgent instance named "assistant" +assistant = AssistantAgent( + name="assistant", + system_message="You are a helpful assistant.", + llm_config={ + "timeout": 600, + "cache_seed": 42, + "config_list": config_list, + }, ) +sentence_transformer_ef = SentenceTransformer("all-distilroberta-v1").encode +client = QdrantClient(url="http://localhost:6333/") + +# 2. Create the RetrieveUserProxyAgent instance named "ragproxyagent" +# Refer to https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent +# for more information on the RetrieveUserProxyAgent +ragproxyagent = RetrieveUserProxyAgent( + name="ragproxyagent", + human_input_mode="NEVER", + max_consecutive_auto_reply=10, + retrieve_config={ + "task": "code", + "docs_path": [ + "path/to/some/doc.md", + "path/to/some/other/doc.md", + ], + "chunk_token_size": 2000, + "model": config_list[0]["model"], + "vector_db": "qdrant", + "db_config": {"client": client}, + "get_or_create": True, + "overwrite": True, + "embedding_function": sentence_transformer_ef, # Defaults to "BAAI/bge-small-en-v1.5" via FastEmbed + }, + code_execution_config=False, +) ``` -The general guideline for selecting a storage method in Qdrant is to use **InMemory storage** when high performance is a priority, and sufficient RAM is available to accommodate the dataset. This approach ensures the fastest access speeds by keeping data readily accessible in memory. +#### Run the agent -However, for larger datasets or scenarios where memory is limited, **Memmap** and **OnDisk storage** are more suitable. These methods significantly reduce memory usage by storing data on disk while leveraging advanced techniques like page caching and indexing to maintain efficient and relatively fast data access. +```python +# Always reset the assistant before starting a new conversation. +assistant.reset() -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#monitoring-the-database) Monitoring the Database +# We use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message. +# The assistant receives it and generates a response. The response will be sent back to the ragproxyagent for processing. +# The conversation continues until the termination condition is met. -![monitoring](https://qdrant.tech/articles_data/vector-search-resource-optimization/monitor.png) +qa_problem = "What is the .....?" 
+chat_results = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=qa_problem) +``` -Continuous monitoring is essential for maintaining system health and identifying potential issues before they escalate. Tools like **Prometheus** and **Grafana** are widely used to achieve this. +## Next steps -- **Prometheus**: An open-source monitoring and alerting toolkit, Prometheus collects and stores metrics in a time-series database. It scrapes metrics from predefined endpoints and supports powerful querying and visualization capabilities. -- **Grafana**: Often paired with Prometheus, Grafana provides an intuitive interface for visualizing metrics and creating interactive dashboards. +- AutoGen [documentation](https://microsoft.github.io/autogen/0.2) +- Autogen [examples](https://microsoft.github.io/autogen/0.2/docs/Examples) +- [Source Code](https://github.com/microsoft/autogen/blob/0.2/autogen/agentchat/contrib/vectordb/qdrant.py) -Qdrant exposes metrics in the **Prometheus/OpenMetrics** format through the /metrics endpoint. Prometheus can scrape this endpoint to monitor various aspects of the Qdrant system. +<|page-208-lllmstxt|> +# Bedrock Embeddings -For a local Qdrant instance, the metrics endpoint is typically available at: +You can use [AWS Bedrock](https://aws.amazon.com/bedrock/) with Qdrant. AWS Bedrock supports multiple [embedding model providers](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html). -```python -http://localhost:6333/metrics +You'll need the following information from your AWS account: -``` +- Region +- Access key ID +- Secret key -* * * +To configure your credentials, review the following AWS article: [How do I create an AWS access key](https://repost.aws/knowledge-center/create-access-key). -Here are some important metrics to monitor: +With the following code sample, you can generate embeddings using the [Titan Embeddings G1 - Text model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) which produces sentence embeddings of size 1536. -| **Metric Name** | | **Meaning** | -| --- | --- | --- | -| collections\_total | | Total number of collections | -| collections\_vector\_total | | Total number of vectors in all collections | -| rest\_responses\_avg\_duration\_seconds | | Average response duration in REST API | -| grpc\_responses\_avg\_duration\_seconds | | Average response duration in gRPC API | -| rest\_responses\_fail\_total | | Total number of failed responses (REST) | +```python +# Install the required dependencies +# pip install boto3 qdrant_client -Read more about [**Qdrant Open Source Monitoring**](https://qdrant.tech/documentation/guides/monitoring/) and [**Qdrant Cloud Monitoring**](https://qdrant.tech/documentation/cloud/cluster-monitoring/) for managed clusters. +import json +import boto3 -* * * +from qdrant_client import QdrantClient, models -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#recap-when-should-you-optimize) Recap: When Should You Optimize? +session = boto3.Session() -![solutions](https://qdrant.tech/articles_data/vector-search-resource-optimization/solutions.png) +bedrock_client = session.client( + "bedrock-runtime", + region_name="", + aws_access_key_id="", + aws_secret_access_key="", +) -| Scenario | Description | -| --- | --- | -| **When You Scale Up** | As data grows and the request surge, optimizing resource usage ensures your systems stay responsive and cost-efficient, even under heavy loads. 
| -| **If Facing Budget Constraints** | Strike the perfect balance between performance and cost, cutting unnecessary expenses while maintaining essential capabilities. | -| **You Need Better Performance** | If you’re noticing slow query speeds, latency issues, or frequent timeouts, it’s time to fine-tune your resource allocation. | -| **When System Stability is Paramount** | To manage high-traffic environments you will need to prevent crashes or failures caused by resource exhaustion. | +qdrant_client = QdrantClient(url="http://localhost:6333") -## [Anchor](https://qdrant.tech/articles/vector-search-resource-optimization/\#get-the-cheatsheet) Get the Cheatsheet +qdrant_client.create_collection( + "{collection_name}", + vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), +) -Want to download a printer-friendly version of this guide? [**Download it now.**](https://try.qdrant.tech/resource-optimization-guide). +body = json.dumps({"inputText": "Some text to generate embeddings for"}) -[![downloadable vector search resource optimization guide](https://qdrant.tech/articles_data/vector-search-resource-optimization/downloadable-guide.jpg)](https://try.qdrant.tech/resource-optimization-guide) +response = bedrock_client.invoke_model( + body=body, + modelId="amazon.titan-embed-text-v1", + accept="application/json", + contentType="application/json", +) -##### Was this page useful? +response_body = json.loads(response.get("body").read()) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +qdrant_client.upsert( + "{collection_name}", + points=[models.PointStruct(id=1, vector=response_body["embedding"])], +) +``` -Thank you for your feedback! 🙏 +```javascript +// Install the required dependencies +// npm install @aws-sdk/client-bedrock-runtime @qdrant/js-client-rest -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-search-resource-optimization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+import { + BedrockRuntimeClient, + InvokeModelCommand, +} from "@aws-sdk/client-bedrock-runtime"; +import { QdrantClient } from '@qdrant/js-client-rest'; -On this page: +const main = async () => { + const bedrockClient = new BedrockRuntimeClient({ + region: "", + credentials: { + accessKeyId: "",, + secretAccessKey: "", + }, + }); -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-search-resource-optimization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + const qdrantClient = new QdrantClient({ url: 'http://localhost:6333' }); -× + await qdrantClient.createCollection("{collection_name}", { + vectors: { + size: 1536, + distance: 'Cosine', + } + }); + + const response = await bedrockClient.send( + new InvokeModelCommand({ + modelId: "amazon.titan-embed-text-v1", + body: JSON.stringify({ + inputText: "Some text to generate embeddings for", + }), + contentType: "application/json", + accept: "application/json", + }) + ); -[Powered by](https://qdrant.tech/) + const body = new TextDecoder().decode(response.body); -<|page-123-lllmstxt|> -## immutable-data-structures -- [Articles](https://qdrant.tech/articles/) -- Qdrant Internals: Immutable Data Structures + await qdrantClient.upsert("{collection_name}", { + points: [ + { + id: 1, + vector: JSON.parse(body).embedding, + }, + ], + }); +} -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +main(); +``` -# Qdrant Internals: Immutable Data Structures +<|page-209-lllmstxt|> +# AWS Lakechain -Andrey Vasnetsov +[Project Lakechain](https://awslabs.github.io/project-lakechain/) is a framework based on the AWS Cloud Development Kit (CDK), allowing to express and deploy scalable document processing pipelines on AWS using infrastructure-as-code. It emphasizes on modularity and extensibility of pipelines, and provides 60+ ready to use components for prototyping complex processing pipelines that scale out of the box to millions of documents. -· +The Qdrant storage connector available with Lakechain enables uploading vector embeddings produced by other middlewares to a Qdrant collection. -August 20, 2024 + -![Qdrant Internals: Immutable Data Structures](https://qdrant.tech/articles_data/immutable-data-structures/preview/title.jpg) +To use the Qdrant storage connector, you import it in your CDK stack, and connect it to a data source providing document embeddings. -## [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#data-structures-101) Data Structures 101 +> You need to specify a Qdrant API key to the connector, by specifying a reference to an [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/) secret containing the API key. -Those who took programming courses might remember that there is no such thing as a universal data structure. -Some structures are good at accessing elements by index (like arrays), while others shine in terms of insertion efficiency (like linked lists). 
+```typescript +import { QdrantStorageConnector } from '@project-lakechain/qdrant-storage-connector'; +import { CacheStorage } from '@project-lakechain/core'; -![Hardware-optimized data structure](https://qdrant.tech/articles_data/immutable-data-structures/hardware-optimized.png) +class Stack extends cdk.Stack { + constructor(scope: cdk.Construct, id: string) { + const cache = new CacheStorage(this, 'Cache'); -Hardware-optimized data structure + const qdrantApiKey = secrets.Secret.fromSecretNameV2( + this, + 'QdrantApiKey', + process.env.QDRANT_API_KEY_SECRET_NAME as string + ); -However, when we move from theoretical data structures to real-world systems, and particularly in performance-critical areas such as [vector search](https://qdrant.tech/use-cases/), things become more complex. [Big-O notation](https://en.wikipedia.org/wiki/Big_O_notation) provides a good abstraction, but it doesn’t account for the realities of modern hardware: cache misses, memory layout, disk I/O, and other low-level considerations that influence actual performance. + const connector = new QdrantStorageConnector.Builder() + .withScope(this) + .withIdentifier('QdrantStorageConnector') + .withCacheStorage(cache) + .withSource(source) // 👈 Specify a data source + .withApiKey(qdrantApiKey) + .withCollectionName('{collection_name}') + .withUrl('https://xyz-example.eu-central.aws.cloud.qdrant.io:6333') + .build(); + } +} +``` -> From the perspective of hardware efficiency, the ideal data structure is a contiguous array of bytes that can be read sequentially in a single thread. This scenario allows hardware optimizations like prefetching, caching, and branch prediction to operate at their best. +When the document being processed is a text document, you can choose to store the text of the document in the Qdrant payload. To do so, you can use the `withStoreText` and `withTextKey` options. If the document is not a text, this option is ignored. -However, real-world use cases require more complex structures to perform various operations like insertion, deletion, and search. -These requirements increase complexity and introduce performance trade-offs. +```typescript +const connector = new QdrantStorageConnector.Builder() + .withScope(this) + .withIdentifier('QdrantStorageConnector') + .withCacheStorage(cache) + .withSource(source) + .withApiKey(qdrantApiKey) + .withCollectionName('{collection_name}') + .withStoreText(true) + .withTextKey('my-content') + .withUrl('https://xyz-example.eu-central.aws.cloud.qdrant.io:6333') + .build(); +``` -### [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#mutability) Mutability +Since Qdrant supports [multiple vectors](/documentation/concepts/vectors/#named-vectors) per point, you can use the `withVectorName` option to specify one. The connector defaults to unnamed (default) vector. -One of the most significant challenges when working with data structures is ensuring **mutability — the ability to change the data structure after it’s created**, particularly with fast update operations. +```typescript +const connector = new QdrantStorageConnector.Builder() + .withScope(this) + .withIdentifier('QdrantStorageConnector') + .withCacheStorage(cache) + .withSource(source) + .withApiKey(qdrantApiKey) + .withCollectionName('collection_name') + .withVectorName('my-vector-name') + .withUrl('https://xyz-example.eu-central.aws.cloud.qdrant.io:6333') + .build(); +``` -Let’s consider a simple example: we want to iterate over items in sorted order. 
-Without a mutability requirement, we can use a simple array and sort it once. -This is very close to our ideal scenario. We can even put the structure on disk - which is trivial for an array. +## Further Reading -However, if we need to insert an item into this array, **things get more complicated**. -Inserting into a sorted array requires shifting all elements after the insertion point, which leads to linear time complexity for each insertion, which is not acceptable for many applications. +- [Introduction to Lakechain](https://awslabs.github.io/project-lakechain/general/introduction/) +- [Lakechain Examples](https://github.com/awslabs/project-lakechain/tree/main/examples) -To handle such cases, more complex structures like [B-trees](https://en.wikipedia.org/wiki/B-tree) come into play. B-trees are specifically designed to optimize both insertion and read operations for large data sets. However, they sacrifice the raw speed of array reads for better insertion performance. +<|page-210-lllmstxt|> +# BuildShip -Here’s a benchmark that illustrates the difference between iterating over a plain array and a BTreeSet in Rust: +[BuildShip](https://buildship.com/) is a low-code visual builder to create APIs, scheduled jobs, and backend workflows with AI assitance. -```rust -use std::collections::BTreeSet; -use rand::Rng; +You can use the [Qdrant integration](https://buildship.com/integrations/qdrant) to development workflows with semantic-search capabilites. -fn main() { - // Benchmark plain vector VS btree in a task of iteration over all elements - let mut rand = rand::thread_rng(); - let vector: Vec<_> = (0..1000000).map(|_| rand.gen::()).collect(); - let btree: BTreeSet<_> = vector.iter().copied().collect(); +## Prerequisites - { - let mut sum = 0; - for el in vector { - sum += el; - } - } // Elapsed: 850.924”s +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A [BuildsShip](https://buildship.app/) for developing workflows. - { - let mut sum = 0; - for el in btree { - sum += el; - } - } // Elapsed: 5.213025ms, ~6x slower +## Nodes -} +Nodes are are fundamental building blocks of BuildShip. Each responsible for an operation in your workflow. -``` +The Qdrant integration includes the following nodes with extensibility if required. -[Vector databases](https://qdrant.tech/), like Qdrant, have to deal with a large variety of data structures. -If we could make them immutable, it would significantly improve performance and optimize memory usage. +### Add Point -## [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#how-does-immutability-help) How Does Immutability Help? +![Add Point](/documentation/frameworks/buildship/add.png) -A large part of the immutable advantage comes from the fact that we know the exact data we need to put into the structure even before we start building it. -The simplest example is a sorted array: we would know exactly how many elements we have to put into the array so we can allocate the exact amount of memory once. +### Retrieve Points -More complex data structures might require additional statistics to be collected before the structure is built. -A Qdrant-related example of this is [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/#conversion-to-integers): in order to select proper quantization levels, we have to know the distribution of the data. 
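As a rough, self-contained illustration of why the distribution matters (a generic sketch, not Qdrant's actual implementation), quantile-based scalar quantization can be expressed as follows; the figure below shows the quantile cut-off it relies on.

```python
import numpy as np

def scalar_quantize(vectors: np.ndarray, quantile: float = 0.99):
    # Clip the value range at the chosen quantile so a few outliers do not
    # stretch the 8-bit grid -- this is exactly why the full distribution
    # has to be known before the immutable structure is built.
    lo = float(np.quantile(vectors, 1.0 - quantile))
    hi = float(np.quantile(vectors, quantile))
    step = (hi - lo) / 255.0
    codes = np.clip(np.round((vectors - lo) / step), 0, 255).astype(np.uint8)
    return codes, lo, step  # lo and step allow an approximate reconstruction
```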
+![Retrieve Points](/documentation/frameworks/buildship/get.png) -![Scalar Quantization Quantile](https://qdrant.tech/articles_data/immutable-data-structures/quantization-quantile.png) +### Delete Points -Scalar Quantization Quantile +![Delete Points](/documentation/frameworks/buildship/delete.png) -Computing this distribution requires knowing all the data in advance, but once we have it, applying scalar quantization is a simple operation. +### Search Points -Let’s take a look at a non-exhaustive list of data structures and potential improvements we can get from making them immutable: +![Search Points](/documentation/frameworks/buildship/search.png) -| Function | Mutable Data Structure | Immutable Alternative | Potential improvements | -| --- | --- | --- | --- | -| Read by index | Array | Fixed chunk of memory | Allocate exact amount of memory | -| Vector Storage | Array or Arrays | Memory-mapped file | Offload data to disk | -| Read sorted ranges | B-Tree | Sorted Array | Store all data close, avoid cache misses | -| Read by key | Hash Map | Hash Map with Perfect Hashing | Avoid hash collisions | -| Get documents by keyword | Inverted Index | Inverted Index with Sorted
and BitPacked Postings | Less memory usage, faster search | -| Vector Search | HNSW graph | HNSW graph with
payload-aware connections | Better precision with filters | -| Tenant Isolation | Vector Storage | Defragmented Vector Storage | Faster access to on-disk data | +## Further Reading -For more info on payload-aware connections in HNSW, read our [previous article](https://qdrant.tech/articles/filtrable-hnsw/). +- [BuildShip Docs](https://docs.buildship.com/basics/node). +- [BuildShip Integrations](https://buildship.com/integrations) -This time around, we will focus on the latest additions to Qdrant: +<|page-211-lllmstxt|> +# Camel -- **the immutable hash map with perfect hashing** -- **defragmented vector storage**. +[Camel](https://www.camel-ai.org) is a Python framework to build and use LLM-based agents for real-world task solving. -### [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#perfect-hashing) Perfect Hashing +Qdrant is available as a storage mechanism in Camel for ingesting and retrieving semantically similar data. -A hash table is one of the most commonly used data structures implemented in almost every programming language, including Rust. -It provides fast access to elements by key, with an average time complexity of O(1) for read and write operations. +## Usage With Qdrant -There is, however, the assumption that should be satisfied for the hash table to work efficiently: _hash collisions should not cause too much overhead_. -In a hash table, each key is mapped to a “bucket,” a slot where the value is stored. -When different keys map to the same bucket, a collision occurs. +- Install Camel with the `vector-databases` extra. -In regular mutable hash tables, minimization of collisions is achieved by: +```bash +pip install "camel[vector-databases]" +``` -- making the number of buckets bigger so the probability of collision is lower -- using a linked list or a tree to store multiple elements with the same hash +- Configure the `QdrantStorage` class. -However, these strategies have overheads, which become more significant if we consider using high-latency storage like disk. +```python +from camel.storages import QdrantStorage, VectorDBQuery, VectorRecord +from camel.types import VectorDistance -Indeed, every read operation from disk is several orders of magnitude slower than reading from RAM, so we want to know the correct location of the data from the first attempt. +qdrant_storage = QdrantStorage( + url_and_api_key=( + "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333", + "", + ), + collection_name="{collection_name}", + distance=VectorDistance.COSINE, + vector_dim=384, +) +``` -In order to achieve this, we can use a so-called minimal perfect hash function (MPHF). -This special type of hash function is constructed specifically for a given set of keys, and it guarantees no collisions while using minimal amount of buckets. +The `QdrantStorage` class implements methods to read and write to a Qdrant instance. An instance of this class can now be passed to retrievers for interfacing with your Qdrant collections. -In Qdrant, we decided to use _fingerprint-based minimal perfect hash function_ implemented in the [ph crate 🩀](https://crates.io/crates/ph) by [Piotr Beling](https://dl.acm.org/doi/10.1145/3596453). 
-According to our benchmarks, using the perfect hash function does introduce some overhead in terms of hashing time, but it significantly reduces the time for the whole operation: +```python +qdrant_storage.add([VectorRecord( + vector=[-0.1, 0.1, ...], + payload={'key1': 'value1'}, + ), + VectorRecord( + vector=[-0.1, 0.1, ...], + payload={'key2': 'value2'}, + ),]) -| Volume | `ph::Function` | `std::hash::Hash` | `HashMap::get` | -| --- | --- | --- | --- | -| 1000 | 60ns | ~20ns | 34ns | -| 100k | 90ns | ~20ns | 220ns | -| 10M | 238ns | ~20ns | 500ns | +query_results = qdrant_storage.query(VectorDBQuery(query_vector=[0.1, 0.2, ...], top_k=10)) +for result in query_results: + print(result.record.payload, result.similarity) -Even thought the absolute time for hashing is higher, the time for the whole operation is lower, because PHF guarantees no collisions. -The difference is even more significant when we consider disk read time, which -might up to several milliseconds (10^6 ns). +qdrant_storage.clear() +``` -PHF RAM size scales linearly for `ph::Function`: 3.46 kB for 10k elements, 119MB for 350M elements. -The construction time required to build the hash function is surprisingly low, and we only need to do it once: +- Use the `QdrantStorage` in Camel's Vector Retriever. -| Volume | `ph::Function` (construct) | PHF size | Size of int64 keys (for reference) | -| --- | --- | --- | --- | -| 1M | 52ms | 0.34Mb | 7.62Mb | -| 100M | 7.4s | 33.7Mb | 762.9Mb | +```python +from camel.embeddings import OpenAIEmbedding +from camel.retrievers import VectorRetriever -The usage of PHF in Qdrant lets us minimize the latency of cold reads, which is especially important for large-scale multi-tenant systems. With PHF, it is enough to read a single page from a disk to get the exact location of the data. +# Initialize the VectorRetriever with an embedding model +vr = VectorRetriever(embedding_model=OpenAIEmbedding()) -### [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#defragmentation) Defragmentation +content_input_path = "" -When you read data from a disk, you almost never read a single byte. Instead, you read a page, which is a fixed-size chunk of data. -On many systems, the page size is 4KB, which means that every read operation will read 4KB of data, even if you only need a single byte. +vr.process(content_input_path, qdrant_storage) -Vector search, on the other hand, requires reading a lot of small vectors, which might create a large overhead. -It is especially noticeable if we use binary quantization, where the size of even large OpenAI 1536d vectors is compressed down to **192 bytes**. +# Execute the query and retrieve results +results = vr.query("", vector_storage) +``` -![Overhead when reading a single vector](https://qdrant.tech/articles_data/immutable-data-structures/page-vector.png) +- Camel also provides an Auto Retriever implementation that handles both embedding and storing data and executing queries. -Overhead when reading single vector +```python +from camel.retrievers import AutoRetriever +from camel.types import StorageType -That means if the vectors we access during the search are randomly scattered across the disk, we will have to read 4KB for each vector, which is 20 times more than the actual data size. +ar = AutoRetriever( + url_and_api_key=( + "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333", + "", + ), + storage_type=StorageType.QDRANT, +) -There is, however, a simple way to avoid this overhead: **defragmentation**. 
-If we knew some additional information about the data, we could combine all relevant vectors into a single page. +retrieved_info = ar.run_vector_retriever( + contents=[""], + query="""", + return_detailed_info=True, +) -![Defragmentation](https://qdrant.tech/articles_data/immutable-data-structures/defragmentation.png) +print(retrieved_info) +``` -Defragmentation +You can refer to the Camel [documentation](https://docs.camel-ai.org/index.html) for more information about the retrieval mechansims. -This additional information is available to Qdrant via the [payload index](https://qdrant.tech/documentation/concepts/indexing/#payload-index). +## End-To-End Examples -By specifying the payload index, which is going to be used for filtering most of the time, we can put all vectors with the same payload together. -This way, reading a single page will also read nearby vectors, which will be used in the search. +- [Camel RAG Cookbook](https://docs.camel-ai.org/cookbooks/agents_with_rag.html) +- [Customer Service Discord Bot with Agentic RAG](https://docs.camel-ai.org/cookbooks/customer_service_Discord_bot_with_agentic_RAG.html) -This approach is especially efficient for [multi-tenant systems](https://qdrant.tech/documentation/guides/multiple-partitions/), where only a small subset of vectors is actively used for search. -The capacity of such a deployment is typically defined by the size of the hot subset, which is much smaller than the total number of vectors. +<|page-212-lllmstxt|> +# Cheshire Cat -> Grouping relevant vectors together allows us to optimize the size of the hot subset by avoiding caching of irrelevant data. -> The following benchmark data compares RPS for defragmented and non-defragmented storage: +[Cheshire Cat](https://cheshirecat.ai/) is an open-source framework that allows you to develop intelligent agents on top of many Large Language Models (LLM). You can develop your custom AI architecture to assist you in a wide range of tasks. -| % of hot subset | Tenant Size (vectors) | RPS, Non-defragmented | RPS, Defragmented | -| --- | --- | --- | --- | -| 2.5% | 50k | 1.5 | 304 | -| 12.5% | 50k | 0.47 | 279 | -| 25% | 50k | 0.4 | 63 | -| 50% | 50k | 0.3 | 8 | -| 2.5% | 5k | 56 | 490 | -| 12.5% | 5k | 5.8 | 488 | -| 25% | 5k | 3.3 | 490 | -| 50% | 5k | 3.1 | 480 | -| 75% | 5k | 2.9 | 130 | -| 100% | 5k | 2.7 | 95 | +![Cheshire cat](/documentation/frameworks/cheshire-cat/cat.jpg) -**Dataset size:** 2M 768d vectors (~6Gb Raw data), binary quantization, 650Mb of RAM limit. -All benchmarks are made with minimal RAM allocation to demonstrate disk cache efficiency. +## Cheshire Cat and Qdrant -As you can see, the biggest impact is on the small tenant size, where defragmentation allows us to achieve **100x more RPS**. -Of course, the real-world impact of defragmentation depends on the specific workload and the size of the hot subset, but enabling this feature can significantly improve the performance of Qdrant. +Cheshire Cat uses Qdrant as the default [Vector Memory](https://cheshire-cat-ai.github.io/docs/faq/llm-concepts/vector-memory/) for ingesting and retrieving documents. -Please find more details on how to enable defragmentation in the [indexing documentation](https://qdrant.tech/documentation/concepts/indexing/#tenant-index). +``` +# Decide host and port for your Cat. 
Default will be localhost:1865 +CORE_HOST=localhost +CORE_PORT=1865 -## [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#updating-immutable-data-structures) Updating Immutable Data Structures +# Qdrant server +# QDRANT_HOST=localhost +# QDRANT_PORT=6333 +``` -One may wonder how Qdrant allows updating collection data if everything is immutable. -Indeed, [Qdrant API](https://api.qdrant.tech/) allows the change of any vector or payload at any time, so from the user’s perspective, the whole collection is mutable at any time. +Cheshire Cat takes great advantage of the following features of Qdrant: -As it usually happens with every decent magic trick, the secret is disappointingly simple: not all data in Qdrant is immutable. -In Qdrant, storage is divided into segments, which might be either mutable or immutable. -New data is always written to the mutable segment, which is later converted to the immutable one by the optimization process. +* [Collection Aliases](/documentation/concepts/collections/#collection-aliases) to manage the change from one embedder to another. +* [Quantization](/documentation/guides/quantization/) to obtain a good balance between speed, memory usage and quality of the results. +* [Snapshots](/documentation/concepts/snapshots/) to not miss any information. +* [Community](https://discord.com/invite/tdtYvXjC4h) -![Optimization process](https://qdrant.tech/articles_data/immutable-data-structures/optimization.png) +![RAG Pipeline](/documentation/frameworks/cheshire-cat/stregatto.jpg) -Optimization process +## How to use the Cheshire Cat -If we need to update the data in the immutable or currenly optimized segment, instead of changing the data in place, we perform a copy-on-write operation, move the data to the mutable segment, and update it there. +### Requirements -Data in the original segment is marked as deleted, and later vacuumed by the optimization process. +To run the Cheshire Cat, you need to have [Docker](https://docs.docker.com/engine/install/) and [docker-compose](https://docs.docker.com/compose/install/) already installed on your system. -## [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#downsides-and-how-to-compensate) Downsides and How to Compensate +```shell +docker run --rm -it -p 1865:80 ghcr.io/cheshire-cat-ai/core:latest +``` -While immutable data structures are great for read-heavy operations, they come with trade-offs: +* Chat with the Cheshire Cat on [localhost:1865/admin](http://localhost:1865/admin). +* You can also interact via REST API and try out the endpoints on [localhost:1865/docs](http://localhost:1865/docs) -- **Higher update costs:** Immutable structures are less efficient for updates. The amortized time complexity might be the same as mutable structures, but the constant factor is higher. -- **Rebuilding overhead:** In some cases, we may need to rebuild indices or structures for the same data more than once. -- **Read-heavy workloads:** Immutability assumes a search-heavy workload, which is typical for search engines but not for all applications. +Check the [instructions on github](https://github.com/cheshire-cat-ai/core/blob/main/README.md) for a more comprehensive quick start. -In Qdrant, we mitigate these downsides by allowing the user to adapt the system to their specific workload. -For example, changing the default size of the segment might help to reduce the overhead of rebuilding indices. 
+### First configuration of the LLM -In extreme cases, multi-segment storage can act as a single segment, falling back to the mutable data structure when needed. +* Open the Admin Portal in your browser at [localhost:1865/admin](http://localhost:1865/admin). +* Configure the LLM in the `Settings` tab. +* If you don't explicitly choose it using `Settings` tab, the Embedder follows the LLM. -## [Anchor](https://qdrant.tech/articles/immutable-data-structures/\#conclusion) Conclusion +## Next steps -Immutable data structures, while tricky to implement correctly, offer significant performance gains, especially for read-heavy systems like search engines. They allow us to take full advantage of hardware optimizations, reduce memory overhead, and improve cache performance. +For more information, refer to the Cheshire Cat [documentation](https://cheshire-cat-ai.github.io/docs/) and [blog](https://cheshirecat.ai/blog/). -In Qdrant, the combination of techniques like perfect hashing and defragmentation brings further benefits, making our vector search operations faster and more efficient. While there are trade-offs, the flexibility of Qdrant’s architecture — including segment-based storage — allows us to balance the best of both worlds. +* [Getting started](https://cheshirecat.ai/hello-world/) +* [How the Cat works](https://cheshirecat.ai/how-the-cat-works/) +* [Write Your First Plugin](https://cheshirecat.ai/write-your-first-plugin/) +* [Cheshire Cat's use of Qdrant - Vector Space](https://cheshirecat.ai/dont-get-lost-in-vector-space/) +* [Cheshire Cat's use of Qdrant - Aliases](https://cheshirecat.ai/the-drunken-cat-effect/) +* [Cheshire Cat's use of Qdrant - Quantization](https://cheshirecat.ai/gentle-introduction-to-cheshire-cat-vector-search/) +* [Cheshire Cat at Qdrant vector Space Talks](https://qdrant.tech/blog/meow-with-cheshire-cat/) +* [Discord Community](https://discord.com/invite/bHX5sNFCYU) -##### Was this page useful? +<|page-213-lllmstxt|> +# CocoIndex -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +[CocoIndex](https://cocoindex.com) is a high performance ETL framework to transform data for AI, with real-time incremental processing. -Thank you for your feedback! 🙏 +Qdrant is available as a native built-in vector database to store and retrieve embeddings. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/immutable-data-structures.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. -On this page: +Install CocoIndex: +```bash +pip install -U cocoindex +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/immutable-data-structures.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Install Postgres with [Docker Compose](https://docs.docker.com/compose/install/): +```bash +docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/postgres.yaml) up -d +``` +CocoIndex is a stateful ETL framework and only processes data that has changed. It uses Postgres as a metadata store to track the state of the data. 
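+The export spec below sets `setup_by_user=True`, so the target collection is expected to exist before the flow runs. A minimal sketch that creates it with `qdrant-client`; the vector name `text_embedding` and the size of 384 are assumptions that must match the embedding field and model used in your CocoIndex flow:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # placeholder URL
+
+client.create_collection(
+    collection_name="cocoindex",
+    vectors_config={
+        # The named vector must match the vector field exported by your CocoIndex flow.
+        "text_embedding": models.VectorParams(size=384, distance=models.Distance.COSINE),
+    },
+)
+```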
-× +```python +import cocoindex -[Powered by](https://qdrant.tech/) +doc_embeddings.export( + "doc_embeddings", + cocoindex.storages.Qdrant( + collection_name="cocoindex", + grpc_url="https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6334/", + api_key="", + ), + primary_key_fields=["id_field"], + setup_by_user=True, +) +``` -<|page-124-lllmstxt|> -## natural-language-search-oracle-cloud-infrastructure-cohere-langchain -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- RAG System for Employee Onboarding +The spec takes the following fields: -# [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#rag-system-for-employee-onboarding) RAG System for Employee Onboarding +- `collection_name` (type: str, required): The name of the collection to export the data to. +- `grpc_url` (type: str, optional): The gRPC URL of the Qdrant instance. Defaults to http://localhost:6334/. +- `api_key` (type: str, optional). API key to authenticate requests with. -Public websites are a great way to share information with a wide audience. However, finding the right information can be -challenging, if you are not familiar with the website’s structure or the terminology used. That’s what the search bar is -for, but it is not always easy to formulate a query that will return the desired results, if you are not yet familiar -with the content. This is even more important in a corporate environment, and for the new employees, who are just -starting to learn the ropes, and don’t even know how to ask the right questions yet. You may have even the best intranet -pages, but onboarding is more than just reading the documentation, it is about understanding the processes. Semantic -search can help with finding right resources easier, but wouldn’t it be easier to just chat with the website, like you -would with a colleague? +Before exporting, you must create a collection with a vector name that matches the vector field name in CocoIndex, and set `setup_by_user=True` during export. -Technological advancements have made it possible to interact with websites using natural language. This tutorial will -guide you through the process of integrating [Cohere](https://cohere.com/)’s language models with Qdrant to enable -natural language search on your documentation. We are going to use [LangChain](https://langchain.com/) as an -orchestrator. Everything will be hosted on [Oracle Cloud Infrastructure (OCI)](https://www.oracle.com/cloud/), so you -can scale your application as needed, and do not send your data to third parties. That is especially important when you -are working with confidential or sensitive data. +## Further Reading -## [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#building-up-the-application) Building up the application +- [CocoIndex Documentation](https://cocoindex.io/docs/ops/storages#qdrant) +- [Example Code to build text embeddings with Qdrant](https://github.com/cocoindex-io/cocoindex/tree/main/examples/text_embedding_qdrant) -Our application will consist of two main processes: indexing and searching. Langchain will glue everything together, -as we will use a few components, including Cohere and Qdrant, as well as some OCI services. 
Here is a high-level -overview of the architecture: +<|page-214-lllmstxt|> +# cognee -![Architecture diagram of the target system](https://qdrant.tech/documentation/examples/faq-oci-cohere-langchain/architecture-diagram.png) +[cognee](https://www.cognee.ai) is a memory management tool for AI Apps and Agents -### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#prerequisites) Prerequisites +Qdrant is available as a native built-in vector database to store and retrieve embeddings. -Before we dive into the implementation, make sure to set up all the necessary accounts and tools. -#### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#libraries) Libraries -We are going to use a few Python libraries. Of course, Langchain will be our main framework, but the Cohere models on -OCI are accessible via the [OCI SDK](https://docs.oracle.com/en-us/iaas/tools/python/2.125.1/). Let’s install all the -necessary libraries: -```shell -pip install langchain oci qdrant-client langchainhub +## 📩 Installation +You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager. +Cognee supports Python 3.8 to 3.12 + +### With pip + +```bash +pip install cognee ``` -#### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#oracle-cloud) Oracle Cloud +## Local Cognee installation -Our application will be fully running on Oracle Cloud Infrastructure (OCI). It’s up to you to choose how you want to -deploy your application. Qdrant Hybrid Cloud will be running in your [Kubernetes cluster running on Oracle Cloud\\ -(OKE)](https://www.oracle.com/cloud/cloud-native/container-engine-kubernetes/), so all the processes might be also -deployed there. You can get started with signing up for an account on [Oracle Cloud](https://signup.cloud.oracle.com/). +You can install the local Cognee repo using **pip**, **poetry** and **uv**. +For local pip installation please make sure your pip version is above version 21.3. -Cohere models are available on OCI as a part of the [Generative AI\\ -Service](https://www.oracle.com/artificial-intelligence/generative-ai/generative-ai-service/). We need both the -[Generation models](https://docs.oracle.com/en-us/iaas/Content/generative-ai/use-playground-generate.htm) and the -[Embedding models](https://docs.oracle.com/en-us/iaas/Content/generative-ai/use-playground-embed.htm). Please follow the -linked tutorials to grasp the basics of using Cohere models there. +### with UV with all optional dependencies -Accessing the models programmatically requires knowing the compartment OCID. Please refer to the [documentation that\\ -describes how to find it](https://docs.oracle.com/en-us/iaas/Content/GSG/Tasks/contactingsupport_topic-Locating_Oracle_Cloud_Infrastructure_IDs.htm#Finding_the_OCID_of_a_Compartment). 
-For the further reference, we will assume that the compartment OCID is stored in the environment variable: +```bash +uv sync --all-extras +``` -shellpython +## đŸ’» Basic Usage -```shell -export COMPARTMENT_OCID="" +### Setup ``` - -```python import os - -os.environ["COMPARTMENT_OCID"] = "" +os.environ["LLM_API_KEY"] = "YOUR OPENAI_API_KEY" +VECTOR_DB_PROVIDER="qdrant" +VECTOR_DB_URL=https://url-to-your-qdrant-cloud-instance.cloud.qdrant.io:6333 +VECTOR_DB_KEY=your-qdrant-api-key ``` -#### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#qdrant-hybrid-cloud) Qdrant Hybrid Cloud +You can also set the variables by creating .env file, using our template. +To use different LLM providers, for more info check out our documentation -Qdrant Hybrid Cloud running on Oracle Cloud helps you build a solution without sending your data to external services. Our documentation provides a step-by-step guide on how to [deploy Qdrant Hybrid Cloud on Oracle\\ -Cloud](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#oracle-cloud-infrastructure). -Qdrant will be running on a specific URL and access will be restricted by the API key. Make sure to store them both as environment variables as well: +### Simple example -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +This script will run the default pipeline: -``` +```python +import cognee +import asyncio -_Optional:_ Whenever you use LangChain, you can also [configure LangSmith](https://docs.smith.langchain.com/), which will help us trace, monitor and debug LangChain applications. You can sign up for LangSmith [here](https://smith.langchain.com/). -```shell -export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY="your-api-key" -export LANGCHAIN_PROJECT="your-project" # if not specified, defaults to "default" +async def main(): + # Add text to cognee + await cognee.add("Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval.") + + # Generate the knowledge graph + await cognee.cognify() + + # Query the knowledge graph + results = await cognee.search("Tell me about NLP") + + # Display the results + for result in results: + print(result) + +if __name__ == '__main__': + asyncio.run(main()) + +``` +Example output: ``` + Natural Language Processing (NLP) is a cross-disciplinary and interdisciplinary field that involves computer science and information retrieval. It focuses on the interaction between computers and human language, enabling machines to understand and process natural language. -Now you can get started: +``` -```python -import os +<|page-215-lllmstxt|> +# Cohere -os.environ["QDRANT_URL"] = "https://qdrant.example.com" -os.environ["QDRANT_API_KEY"] = "your-api-key" +Qdrant is compatible with Cohere [co.embed API](https://docs.cohere.ai/reference/embed) and its official Python SDK that +might be installed as any other package: +```bash +pip install cohere ``` -Let’s create the collection that will store the indexed documents. We will use the `qdrant-client` library, and our -collection will be named `oracle-cloud-website`. Our embedding model, `cohere.embed-english-v3.0`, produces embeddings -of size 1024, and we have to specify that when creating the collection. 
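+The upsert examples below assume that the target collection (`MyCollection`) already exists. A minimal sketch that creates it; the vector size has to match the embedding model you use, for example 1024 dimensions for `embed-english-v3.0`:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # placeholder URL
+
+client.create_collection(
+    collection_name="MyCollection",
+    vectors_config=models.VectorParams(
+        size=1024,  # dimensionality of embed-english-v3.0; adjust for other models
+        distance=models.Distance.COSINE,
+    ),
+)
+```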
+The embeddings returned by co.embed API might be used directly in the Qdrant client's calls: ```python -from qdrant_client import QdrantClient, models +import cohere +import qdrant_client +from qdrant_client.models import Batch -client = QdrantClient( - location=os.environ.get("QDRANT_URL"), - api_key=os.environ.get("QDRANT_API_KEY"), -) -client.create_collection( - collection_name="oracle-cloud-website", - vectors_config=models.VectorParams( - size=1024, - distance=models.Distance.COSINE, +cohere_client = cohere.Client("<< your_api_key >>") +qdrant_client = qdrant_client.QdrantClient() +qdrant_client.upsert( + collection_name="MyCollection", + points=Batch( + ids=[1], + vectors=cohere_client.embed( + model="large", + texts=["The best vector database"], + ).embeddings, ), ) - ``` -### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#indexing-process) Indexing process +If you are interested in seeing an end-to-end project created with co.embed API and Qdrant, please check out the +"[Question Answering as a Service with Cohere and Qdrant](/articles/qa-with-cohere-and-qdrant/)" article. -We have all the necessary tools set up, so let’s start with the indexing process. We will use the Cohere Embedding -models to convert the text into vectors, and then store them in Qdrant. Langchain is integrated with OCI Generative AI -Service, so we can easily access the models. +## Embed v3 -Our dataset will be fairly simple, as it will consist of the questions and answers from the [Oracle Cloud Free Tier\\ -FAQ page](https://www.oracle.com/cloud/free/faq/). +Embed v3 is a new family of Cohere models, released in November 2023. The new models require passing an additional +parameter to the API call: `input_type`. It determines the type of task you want to use the embeddings for. -![Some examples of the Oracle Cloud FAQ](https://qdrant.tech/documentation/examples/faq-oci-cohere-langchain/oracle-faq.png) +- `input_type="search_document"` - for documents to store in Qdrant +- `input_type="search_query"` - for search queries to find the most relevant documents +- `input_type="classification"` - for classification tasks +- `input_type="clustering"` - for text clustering -Questions and answers are presented in an HTML format, but we don’t want to manually extract the text and adapt it for -each subpage. Instead, we will use the `WebBaseLoader` that just loads the HTML content from given URL and converts it -to text. +While implementing semantic search applications, such as RAG, you should use `input_type="search_document"` for the +indexed documents and `input_type="search_query"` for the search queries. The following example shows how to index +documents with the Embed v3 model: ```python -from langchain_community.document_loaders.web_base import WebBaseLoader - -loader = WebBaseLoader("https://www.oracle.com/cloud/free/faq/") -documents = loader.load() +import cohere +import qdrant_client +from qdrant_client.models import Batch +cohere_client = cohere.Client("<< your_api_key >>") +client = qdrant_client.QdrantClient() +client.upsert( + collection_name="MyCollection", + points=Batch( + ids=[1], + vectors=cohere_client.embed( + model="embed-english-v3.0", # New Embed v3 model + input_type="search_document", # Input type for documents + texts=["Qdrant is the a vector database written in Rust"], + ).embeddings, + ), +) ``` -Our `documents` is a list with just a single element, which is the text of the whole page. 
We need to split it into -meaningful parts, so we will use the `RecursiveCharacterTextSplitter` component. It will try to keep all paragraphs (and -then sentences, and then words) together as long as possible, as those would generically seem to be the strongest -semantically related pieces of text. The chunk size and overlap are both parameters that can be adjusted to fit the -specific use case. +Once the documents are indexed, you can search for the most relevant documents using the Embed v3 model: ```python -from langchain_text_splitters import RecursiveCharacterTextSplitter +client.query_points( + collection_name="MyCollection", + query=cohere_client.embed( + model="embed-english-v3.0", # New Embed v3 model + input_type="search_query", # Input type for search queries + texts=["The best vector database"], + ).embeddings[0], +) +``` -splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100) -split_documents = splitter.split_documents(documents) + -``` +<|page-216-lllmstxt|> +![Confluent Logo](/documentation/frameworks/confluent/confluent-logo.png) -Our documents might be now indexed, but we need to convert them into vectors. Let’s configure the embeddings so the -`cohere.embed-english-v3.0` is used. Not all the regions support the Generative AI Service, so we need to specify the -region where the models are stored. We will use the `us-chicago-1`, but please check the -[documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions) for the most up-to-date -list of supported regions. +Built by the original creators of Apache KafkaÂź, [Confluent Cloud](https://www.confluent.io/confluent-cloud/?utm_campaign=tm.pmm_cd.cwc_partner_Qdrant_generic&utm_source=Qdrant&utm_medium=partnerref) is a cloud-native and complete data streaming platform available on AWS, Azure, and Google Cloud. The platform includes a fully managed, elastically scaling Kafka engine, 120+ connectors, serverless Apache FlinkÂź, enterprise-grade security controls, and a robust governance suite. -```python -from langchain_community.embeddings.oci_generative_ai import OCIGenAIEmbeddings +With our [Qdrant-Kafka Sink Connector](https://github.com/qdrant/qdrant-kafka), Qdrant is part of the [Connect with Confluent](https://www.confluent.io/partners/connect/) technology partner program. It brings fully managed data streams directly to organizations from Confluent Cloud, making it easier for organizations to stream any data to Qdrant with a fully managed Apache Kafka service. -embeddings = OCIGenAIEmbeddings( - model_id="cohere.embed-english-v3.0", - service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", - compartment_id=os.environ.get("COMPARTMENT_OCID"), -) +## Usage -``` +### Pre-requisites -Now we can embed the documents and store them in Qdrant. We will create an instance of `Qdrant` and add the split -documents to the collection. +- A Confluent Cloud account. You can begin with a [free trial](https://www.confluent.io/confluent-cloud/tryfree/?utm_campaign=tm.pmm_cd.cwc_partner_qdrant_tryfree&utm_source=qdrant&utm_medium=partnerref) with credits for the first 30 days. +- Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). 
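+Once the connector is configured (see the installation steps below), any Kafka producer can feed it. A minimal sketch with the `confluent-kafka` Python package; the bootstrap server, API credentials, topic name, and collection name are placeholders, and the payload follows the unnamed/default vector format documented further down this page:
+
+```python
+import json
+
+from confluent_kafka import Producer
+
+producer = Producer(
+    {
+        "bootstrap.servers": "<BOOTSTRAP_SERVERS>",  # placeholder Confluent Cloud endpoint
+        "security.protocol": "SASL_SSL",
+        "sasl.mechanisms": "PLAIN",
+        "sasl.username": "<API_KEY>",  # placeholder
+        "sasl.password": "<API_SECRET>",  # placeholder
+    }
+)
+
+message = {
+    "collection_name": "<COLLECTION_NAME>",  # an existing collection in your Qdrant instance
+    "id": 1,
+    "vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
+    "payload": {"name": "kafka"},
+}
+
+producer.produce("<TOPIC_NAME>", value=json.dumps(message))  # placeholder topic
+producer.flush()
+```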
-```python -from langchain.vectorstores.qdrant import Qdrant +### Installation -qdrant = Qdrant( - client=client, - collection_name="oracle-cloud-website", - embeddings=embeddings, -) +1) Download the latest connector zip file from [Confluent Hub](https://www.confluent.io/hub/qdrant/qdrant-kafka). -qdrant.add_documents(split_documents, batch_size=20) +2) Configure an environment and cluster on Confluent and create a topic to produce messages for. -``` +3) Navigate to the `Connectors` section of the Confluent cluster and click `Add Plugin`. Upload the zip file with the following info. -Our documents should be now indexed and ready for searching. Let’s move to the next step. +![Qdrant Connector Install](/documentation/frameworks/confluent/install.png) -### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#speaking-to-the-website) Speaking to the website +4) Once installed, navigate to the connector and set the following configuration values. -The intended method of interaction with the website is through the chatbot. Large Language Model, in our case [Cohere\\ -Command](https://cohere.com/command), will be answering user’s questions based on the relevant documents that Qdrant -will return using the question as a query. Our LLM is also hosted on OCI, so we can access it similarly to the embedding -model: +![Qdrant Connector Config](/documentation/frameworks/confluent/config.png) -```python -from langchain_community.llms.oci_generative_ai import OCIGenAI +Replace the placeholder values with your credentials. -llm = OCIGenAI( - model_id="cohere.command", - service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", - compartment_id=os.environ.get("COMPARTMENT_OCID"), -) +5) Add the Qdrant instance host to the allowed networking endpoints. -``` +![Qdrant Connector Endpoint](/documentation/frameworks/confluent/endpoint.png) -Connection to Qdrant might be established in the same way as we did during the indexing process. We can use it to create -a retrieval chain, which implements the question-answering process. The retrieval chain also requires an additional -chain that will combine retrieved documents before sending them to an LLM. +7) Start the connector. -```python -from langchain.chains.combine_documents import create_stuff_documents_chain -from langchain.chains.retrieval import create_retrieval_chain -from langchain import hub +## Producing Messages -retriever = qdrant.as_retriever() -combine_docs_chain = create_stuff_documents_chain( - llm=llm, - # Default prompt is loaded from the hub, but we can also modify it - prompt=hub.pull("langchain-ai/retrieval-qa-chat"), -) -retrieval_qa_chain = create_retrieval_chain( - retriever=retriever, - combine_docs_chain=combine_docs_chain, -) -response = retrieval_qa_chain.invoke({"input": "What is the Oracle Cloud Free Tier?"}) +You can now produce messages for the configured topic, and they'll be written into the configured Qdrant instance. -``` +![Qdrant Connector Message](/documentation/frameworks/confluent/message.png) -The output of the `.invoke` method is a dictionary-like structure with the query and answer, but we can also access the -source documents used to generate the response. This might be useful for debugging or for further processing. +## Message Formats -```python +The connector supports messages in the following formats. + +_Click each to expand._ + +
+ Unnamed/Default vector + +Reference: [Creating a collection with a default vector](https://qdrant.tech/documentation/concepts/collections/#create-a-collection). + +```json { - 'input': 'What is the Oracle Cloud Free Tier?', - 'context': [\ - Document(\ - page_content='* Free Tier is generally available in regions where commercial Oracle Cloud Infrastructure service is available. See the data regions page for detailed service availability (the exact regions available for Free Tier may differ during the sign-up process). The US$300 cloud credit is available in',\ - metadata={\ - 'language': 'en-US',\ - 'source': 'https://www.oracle.com/cloud/free/faq/',\ - 'title': "FAQ on Oracle's Cloud Free Tier",\ - '_id': 'c8cf98e0-4b88-4750-be42-4157495fed2c',\ - '_collection_name': 'oracle-cloud-website'\ - }\ - ),\ - Document(\ - page_content='Oracle Cloud Free Tier allows you to sign up for an Oracle Cloud account which provides a number of Always Free services and a Free Trial with US$300 of free credit to use on all eligible Oracle Cloud Infrastructure services for up to 30 days. The Always Free services are available for an unlimited',\ - metadata={\ - 'language': 'en-US',\ - 'source': 'https://www.oracle.com/cloud/free/faq/',\ - 'title': "FAQ on Oracle's Cloud Free Tier",\ - '_id': 'dc291430-ff7b-4181-944a-39f6e7a0de69',\ - '_collection_name': 'oracle-cloud-website'\ - }\ - ),\ - Document(\ - page_content='Oracle Cloud Free Tier does not include SLAs. Community support through our forums is available to all customers. Customers using only Always Free resources are not eligible for Oracle Support. Limited support is available for Oracle Cloud Free Tier with Free Trial credits. After you use all of',\ - metadata={\ - 'language': 'en-US',\ - 'source': 'https://www.oracle.com/cloud/free/faq/',\ - 'title': "FAQ on Oracle's Cloud Free Tier",\ - '_id': '9e831039-7ccc-47f7-9301-20dbddd2fc07',\ - '_collection_name': 'oracle-cloud-website'\ - }\ - ),\ - Document(\ - page_content='looking to test things before moving to cloud, a student wanting to learn, or an academic developing curriculum in the cloud, Oracle Cloud Free Tier enables you to learn, explore, build and test for free.',\ - metadata={\ - 'language': 'en-US',\ - 'source': 'https://www.oracle.com/cloud/free/faq/',\ - 'title': "FAQ on Oracle's Cloud Free Tier",\ - '_id': 'e2dc43e1-50ee-4678-8284-6df60a835cf5',\ - '_collection_name': 'oracle-cloud-website'\ - }\ - )\ + "collection_name": "{collection_name}", + "id": 1, + "vector": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 ], - 'answer': ' Oracle Cloud Free Tier is a subscription that gives you access to Always Free services and a Free Trial with $300 of credit that can be used on all eligible Oracle Cloud Infrastructure services for up to 30 days. \n\nThrough this Free Tier, you can learn, explore, build, and test for free. It is aimed at those who want to experiment with cloud services before making a commitment, as wellTheir use cases range from testing prior to cloud migration to learning and academic curriculum development. ' + "payload": { + "name": "kafka", + "description": "Kafka is a distributed streaming platform", + "url": "https://kafka.apache.org/" + } } - ``` -#### [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#other-experiments) Other experiments +
-Asking the basic questions is just the beginning. What you want to avoid is a hallucination, where the model generates -an answer that is not based on the actual content. The default prompt of Langchain should already prevent this, but you -might still want to check it. Let’s ask a question that is not directly answered on the FAQ page: +
+ Named multiple vectors -```python -response = retrieval_qa.invoke({ - "input": "Is Oracle Generative AI Service included in the free tier?" -}) +Reference: [Creating a collection with multiple vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors). +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vector": { + "some-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ], + "some-other-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ] + }, + "payload": { + "name": "kafka", + "description": "Kafka is a distributed streaming platform", + "url": "https://kafka.apache.org/" + } +} ``` -Output: +
-> Oracle Generative AI Services are not specifically mentioned as being available in the free tier. As per the text, the -> $300 free credit can be used on all eligible services for up to 30 days. To confirm if Oracle Generative AI Services -> are included in the free credit offer, it is best to check the official Oracle Cloud website or contact their support. +
+ Sparse vectors -It seems that Cohere Command model could not find the exact answer in the provided documents, but it tried to interpret -the context and provide a reasonable answer, without making up the information. This is a good sign that the model is -not hallucinating in that case. +Reference: [Creating a collection with sparse vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-sparse-vectors). + +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vector": { + "some-sparse": { + "indices": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + "values": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ] + } + }, + "payload": { + "name": "kafka", + "description": "Kafka is a distributed streaming platform", + "url": "https://kafka.apache.org/" + } +} +``` -## [Anchor](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/\#wrapping-up) Wrapping up +
-This tutorial has shown how to integrate Cohere’s language models with Qdrant to enable natural language search on your -website. We have used Langchain as an orchestrator, and everything was hosted on Oracle Cloud Infrastructure (OCI). -Real world would require integrating this mechanism into your organization’s systems, but we built a solid foundation -that can be further developed. +
+ Multi-vectors -##### Was this page useful? +Reference: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +- [Multi-vectors](https://qdrant.tech/documentation/concepts/vectors/#multivectors) -Thank you for your feedback! 🙏 +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vector": { + "some-multi": [ + [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ], + [ + 1.0, + 0.9, + 0.8, + 0.5, + 0.4, + 0.8, + 0.6, + 0.4, + 0.2, + 0.1 + ] + ] + }, + "payload": { + "name": "kafka", + "description": "Kafka is a distributed streaming platform", + "url": "https://kafka.apache.org/" + } +} +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +
-On this page: +
+ Combination of named dense and sparse vectors -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Reference: -× +- [Creating a collection with multiple vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors). -[Powered by](https://qdrant.tech/) +- [Creating a collection with sparse vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-sparse-vectors). -<|page-125-lllmstxt|> -## what-are-embeddings -- [Articles](https://qdrant.tech/articles/) -- What are Vector Embeddings? - Revolutionize Your Search Experience +```json +{ + "collection_name": "{collection_name}", + "id": "a10435b5-2a58-427a-a3a0-a5d845b147b7", + "vector": { + "some-other-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ], + "some-sparse": { + "indices": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + "values": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ] + } + }, + "payload": { + "name": "kafka", + "description": "Kafka is a distributed streaming platform", + "url": "https://kafka.apache.org/" + } +} +``` -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +
-# What are Vector Embeddings? - Revolutionize Your Search Experience +## Further Reading -Sabrina Aquino +- [Kafka Connect Docs](https://docs.confluent.io/platform/current/connect/index.html) +- [Confluent Connectors Docs](https://docs.confluent.io/cloud/current/connectors/bring-your-connector/custom-connector-qs.html) -· +<|page-217-lllmstxt|> +# CrewAI -February 06, 2024 +[CrewAI](https://www.crewai.com) is a framework for orchestrating role-playing, autonomous AI agents. By leveraging collaborative intelligence, CrewAI allows agents to work together seamlessly, tackling complex tasks. -![What are Vector Embeddings? - Revolutionize Your Search Experience](https://qdrant.tech/articles_data/what-are-embeddings/preview/title.jpg) +The framework has a sophisticated memory system designed to significantly enhance the capabilities of AI agents. This system aids agents to remember, reason, and learn from past interactions. You can use Qdrant to store short-term memory and entity memories of CrewAI agents. -> **Embeddings** are numerical machine learning representations of the semantic of the input data. They capture the meaning of complex, high-dimensional data, like text, images, or audio, into vectors. Enabling algorithms to process and analyze the data more efficiently. +- Short-Term Memory -You know when you’re scrolling through your social media feeds and the content just feels incredibly tailored to you? There’s the news you care about, followed by a perfect tutorial with your favorite tech stack, and then a meme that makes you laugh so hard you snort. +Temporarily stores recent interactions and outcomes using RAG, enabling agents to recall and utilize information relevant to their current context during the current executions. -Or what about how YouTube recommends videos you ended up loving. It’s by creators you’ve never even heard of and you didn’t even send YouTube a note about your ideal content lineup. +- Entity Memory -This is the magic of embeddings. +Entity Memory Captures and organizes information about entities (people, places, concepts) encountered during tasks, facilitating deeper understanding and relationship mapping. Uses RAG for storing entity information. -These are the result of **deep learning models** analyzing the data of your interactions online. From your likes, shares, comments, searches, the kind of content you linger on, and even the content you decide to skip. It also allows the algorithm to predict future content that you are likely to appreciate. +## Usage with Qdrant -The same embeddings can be repurposed for search, ads, and other features, creating a highly personalized user experience. +We'll learn how to customize CrewAI's default memory storage to use Qdrant. -![How embeddings are applied to perform recommendantions and other use cases](https://qdrant.tech/articles_data/what-are-embeddings/Embeddings-Use-Case.jpg) +### Installation -They make [high-dimensional](https://www.sciencedirect.com/topics/computer-science/high-dimensional-data) data more manageable. This reduces storage requirements, improves computational efficiency, and makes sense of a ton of **unstructured** data. +First, install CrewAI and Qdrant client packages: -## [Anchor](https://qdrant.tech/articles/what-are-embeddings/\#why-use-vector-embeddings) Why use vector embeddings? 
+```shell +pip install 'crewai[tools]' 'qdrant-client[fastembed]' +``` -The **nuances** of natural language or the hidden **meaning** in large datasets of images, sounds, or user interactions are hard to fit into a table. Traditional relational databases can’t efficiently query most types of data being currently used and produced, making the **retrieval** of this information very limited. +### Setup a CrewAI Project -In the embeddings space, synonyms tend to appear in similar contexts and end up having similar embeddings. The space is a system smart enough to understand that “pretty” and “attractive” are playing for the same team. Without being explicitly told so. +You can learn to set up a CrewAI project [here](https://docs.crewai.com/installation#create-a-new-crewai-project). Let's assume the project was name `mycrew`. -That’s the magic. +### Define the Qdrant storage -At their core, vector embeddings are about semantics. They take the idea that “a word is known by the company it keeps” and apply it on a grand scale. +> src/mycrew/storage.py -![Example of how synonyms are placed closer together in the embeddings space](https://qdrant.tech/articles_data/what-are-embeddings/Similar-Embeddings.jpg) +```python +from typing import Any, Dict, List, Optional -This capability is crucial for creating search systems, recommendation engines, retrieval augmented generation (RAG) and any application that benefits from a deep understanding of content. +from crewai.memory.storage.rag_storage import RAGStorage +from qdrant_client import QdrantClient -## [Anchor](https://qdrant.tech/articles/what-are-embeddings/\#how-do-embeddings-work) How do embeddings work? -Embeddings are created through neural networks. They capture complex relationships and semantics into [dense vectors](https://www1.se.cuhk.edu.hk/~seem5680/lecture/semantics-with-dense-vectors-2018.pdf) which are more suitable for machine learning and data processing applications. They can then project these vectors into a proper **high-dimensional** space, specifically, a [Vector Database](https://qdrant.tech/articles/what-is-a-vector-database/). +class QdrantStorage(RAGStorage): + """ + Extends Storage to handle embeddings for memory entries using Qdrant. -![The process for turning raw data into embeddings and placing them into the vector space](https://qdrant.tech/articles_data/what-are-embeddings/How-Embeddings-Work.jpg) + """ -The meaning of a data point is implicitly defined by its **position** on the vector space. After the vectors are stored, we can use their spatial properties to perform [nearest neighbor searches](https://en.wikipedia.org/wiki/Nearest_neighbor_search#:~:text=Nearest%20neighbor%20search%20%28NNS%29%2C,the%20larger%20the%20function%20values.). These searches retrieve semantically similar items based on how close they are in this space. + def __init__(self, type, allow_reset=True, embedder_config=None, crew=None): + super().__init__(type, allow_reset, embedder_config, crew) + + def search( + self, + query: str, + limit: int = 3, + filter: Optional[dict] = None, + score_threshold: float = 0, + ) -> List[Any]: + points = self.client.query( + self.type, + query_text=query, + query_filter=filter, + limit=limit, + score_threshold=score_threshold, + ) + results = [ + { + "id": point.id, + "metadata": point.metadata, + "context": point.document, + "score": point.score, + } + for point in points + ] -> The quality of the vector representations drives the performance. 
The embedding model that works best for you depends on your use case. + return results + + def reset(self) -> None: + self.client.delete_collection(self.type) + + def _initialize_app(self): + self.client = QdrantClient() + # uncomment the next line of code + # and choose from the [supported embedders](https://qdrant.github.io/fastembed/examples/Supported_Models/) + # if you don't want to use the default one + # self.client._embedding_model_name = 'jinaai/jina-embeddings-v2-small-en' + if not self.client.collection_exists(self.type): + self.client.create_collection( + collection_name=self.type, + vectors_config=self.client.get_fastembed_vector_params(), + sparse_vectors_config=self.client.get_fastembed_sparse_vector_params(), + ) -### [Anchor](https://qdrant.tech/articles/what-are-embeddings/\#creating-vector-embeddings) Creating vector embeddings + def save(self, value: Any, metadata: Dict[str, Any]) -> None: + self.client.add(self.type, documents=[value], metadata=[metadata or {}]) +``` -Embeddings translate the complexities of human language to a format that computers can understand. It uses neural networks to assign **numerical values** to the input data, in a way that similar data has similar values. +The `add` AND `query` methods use [FastEmbed](https://github.com/qdrant/fastembed/) to vectorize data. You can however customize it if required. -![The process of using Neural Networks to create vector embeddings](https://qdrant.tech/articles_data/what-are-embeddings/How-Do-Embeddings-Work_.jpg) +### Instantiate your crew -For example, if I want to make my computer understand the word ‘right’, I can assign a number like 1.3. So when my computer sees 1.3, it sees the word ‘right’. +You can learn about setting up agents and tasks for your crew [here](https://docs.crewai.com/quickstart). We can update the instantiation of `Crew` to use our storage mechanism. -Now I want to make my computer understand the context of the word ‘right’. I can use a two-dimensional vector, such as \[1.3, 0.8\], to represent ‘right’. The first number 1.3 still identifies the word ‘right’, but the second number 0.8 specifies the context. +> src/mycrew/crew.py -We can introduce more dimensions to capture more nuances. For example, a third dimension could represent formality of the word, a fourth could indicate its emotional connotation (positive, neutral, negative), and so on. +```python +from crewai import Crew +from crewai.memory.entity.entity_memory import EntityMemory +from crewai.memory.short_term.short_term_memory import ShortTermMemory -The evolution of this concept led to the development of embedding models like [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) and [GloVe](https://en.wikipedia.org/wiki/GloVe). They learn to understand the context in which words appear to generate high-dimensional vectors for each word, capturing far more complex properties. +from mycrew.storage import QdrantStorage -![How Word2Vec model creates the embeddings for a word](https://qdrant.tech/articles_data/what-are-embeddings/Word2Vec-model.jpg) +Crew( + # Import the agents and tasks here. + memory=True, + entity_memory=EntityMemory(storage=QdrantStorage("entity")), + short_term_memory=ShortTermMemory(storage=QdrantStorage("short-term")), +) +``` -However, these models still have limitations. They generate a single vector per word, based on its usage across texts. This means all the nuances of the word “right” are blended into one vector representation. 
That is not enough information for computers to fully understand the context. +You can now run your Crew workflow with `crew run`. It'll use Qdrant for memory ingestion and retrieval. -So, how do we help computers grasp the nuances of language in different contexts? In other words, how do we differentiate between: +## Further Reading -- “your answer is right” -- “turn right at the corner” -- “everyone has the right to freedom of speech” +- [CrewAI Documentation](https://docs.crewai.com/introduction) +- [CrewAI Examples](https://github.com/crewAIInc/crewAI-examples) -Each of these sentences use the word ‘right’, with different meanings. +<|page-218-lllmstxt|> +# Dagster -More advanced models like [BERT](https://en.wikipedia.org/wiki/BERT_%28language_model%29) and [GPT](https://en.wikipedia.org/wiki/Generative_pre-trained_transformer) use deep learning models based on the [transformer architecture](https://arxiv.org/abs/1706.03762), which helps computers consider the full context of a word. These models pay attention to the entire context. The model understands the specific use of a word in its **surroundings**, and then creates different embeddings for each. +[Dagster](https://dagster.io) is a Python framework for data orchestration built for data engineers, with integrated lineage, observability, a declarative programming model, and best-in-class testability. -![How the BERT model creates the embeddings for a word](https://qdrant.tech/articles_data/what-are-embeddings/BERT-model.jpg) +The `dagster-qdrant` library lets you integrate Qdrant's vector database with Dagster, making it easy to build AI-driven data pipelines. You can run vector searches and manage data directly within Dagster. -But how does this process of understanding and interpreting work in practice? Think of the term: “biophilic design”, for example. To generate its embedding, the transformer architecture can use the following contexts: +### Installation -- “Biophilic design incorporates natural elements into architectural planning.” -- “Offices with biophilic design elements report higher employee well-being.” -- “
plant life, natural light, and water features are key aspects of biophilic design.” +```bash +pip install dagster dagster-qdrant +``` -And then it compares contexts to known architectural and design principles: +### Example -- “Sustainable designs prioritize environmental harmony.” -- “Ergonomic spaces enhance user comfort and health.” +```py +from dagster_qdrant import QdrantConfig, QdrantResource -The model creates a vector embedding for “biophilic design” that encapsulates the concept of integrating natural elements into man-made environments. Augmented with attributes that highlight the correlation between this integration and its positive impact on health, well-being, and environmental sustainability. +import dagster as dg -### [Anchor](https://qdrant.tech/articles/what-are-embeddings/\#integration-with-embedding-apis) Integration with embedding APIs -Selecting the right embedding model for your use case is crucial to your application performance. Qdrant makes it easier by offering seamless integration with the best selection of embedding APIs, including [Cohere](https://qdrant.tech/documentation/embeddings/cohere/), [Gemini](https://qdrant.tech/documentation/embeddings/gemini/), [Jina Embeddings](https://qdrant.tech/documentation/embeddings/jina-embeddings/), [OpenAI](https://qdrant.tech/documentation/embeddings/openai/), [Aleph Alpha](https://qdrant.tech/documentation/embeddings/aleph-alpha/), [Fastembed](https://github.com/qdrant/fastembed), and [AWS Bedrock](https://qdrant.tech/documentation/embeddings/bedrock/). +@dg.asset +def my_table(qdrant_resource: QdrantResource): + with qdrant_resource.get_client() as qdrant: + qdrant.add( + collection_name="test_collection", + documents=[ + "This is a document about oranges", + "This is a document about pineapples", + "This is a document about strawberries", + "This is a document about cucumbers", + ], + ) + results = qdrant.query( + collection_name="test_collection", query_text="hawaii", limit=3 + ) -If you’re looking for NLP and rapid prototyping, including language translation, question-answering, and text generation, OpenAI is a great choice. Gemini is ideal for image search, duplicate detection, and clustering tasks. -Fastembed, which we’ll use on the example below, is designed for efficiency and speed, great for applications needing low-latency responses, such as autocomplete and instant content recommendations. +defs = dg.Definitions( + assets=[my_table], + resources={ + "qdrant_resource": QdrantResource( + config=QdrantConfig( + host="xyz-example.eu-central.aws.cloud.qdrant.io", + api_key="", + ) + ) + }, +) +``` -We plan to go deeper into selecting the best model based on performance, cost, integration ease, and scalability in a future post. +## Next steps -## [Anchor](https://qdrant.tech/articles/what-are-embeddings/\#create-a-neural-search-service-with-fastmbed) Create a neural search service with Fastmbed +- Dagster [documentation](https://docs.dagster.io) -Now that you’re familiar with the core concepts around vector embeddings, how about start building your own [Neural Search Service](https://qdrant.tech/documentation/tutorials/neural-search/)? +- Dagster [examples](https://github.com/dagster-io/dagster/tree/b985d57aadc7d9bf88d8dcbd32b16d3487e433cc/examples) -Tutorial guides you through a practical application of how to use Qdrant for document management based on descriptions of companies from [startups-list.com](https://www.startups-list.com/). 
From embedding data, integrating it with Qdrant’s vector database, constructing a search API, and finally deploying your solution with FastAPI. +<|page-219-lllmstxt|> +![Datadog Cover](/documentation/observability/datadog/datadog-cover.jpg) -Check out what the final version of this project looks like on the [live online demo](https://qdrant.to/semantic-search-demo). +[Datadog](https://www.datadoghq.com/) is a cloud-based monitoring and analytics platform that offers real-time monitoring of servers, databases, and numerous other tools and services. It provides visibility into the performance of applications and enables businesses to detect issues before they affect users. -Let us know what you’re building with embeddings! Join our [Discord](https://discord.gg/qdrant-907569970500743200) community and share your projects! +You can install the [Qdrant integration](https://docs.datadoghq.com/integrations/qdrant/) to get real-time metrics to monitor your Qdrant deployment within Datadog including: -##### Was this page useful? +- The performance of REST and gRPC interfaces with metrics such as total requests, total failures, and time to serve to identify potential bottlenecks and mitigate them. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +- Information about the readiness of the cluster, and deployment (total peers, pending operations, etc.) to gain insights into your Qdrant deployment. -Thank you for your feedback! 🙏 +### Usage -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-are-embeddings.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +- With the [Datadog Agent installed](https://docs.datadoghq.com/agent/basic_agent_usage), run the following command to add the Qdrant integration: -On this page: +```shell +datadog-agent integration install -t qdrant==1.0.0 +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-are-embeddings.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +- Edit the `conf.d/qdrant.d/conf.yaml` file in your [Agent's configuration directory](https://docs.datadoghq.com/agent/guide/agent-configuration-files/#agent-configuration-directory) to start collecting your [Qdrant metrics](/documentation/guides/monitoring/). -× +Most importantly, set the `openmetrics_endpoint` value to the `/metrics` endpoint of your Qdrant instance. -[Powered by](https://qdrant.tech/) +```yaml +instances: + ## @param openmetrics_endpoint - string - optional + ## The URL exposing metrics in the OpenMetrics format. + - openmetrics_endpoint: http://localhost:6333/metrics +``` -<|page-126-lllmstxt|> -## serverless -- [Articles](https://qdrant.tech/articles/) -- Serverless Semantic Search +If the Qdrant instance requires authentication, you can specify the token by configuring [`extra_headers`](https://github.com/DataDog/integrations-core/blob/26f9ae7660f042c43f5d771f0c937ff805cf442c/openmetrics/datadog_checks/openmetrics/data/conf.yaml.example#L553C1-L558C35). -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +```yaml +# @param extra_headers - mapping - optional +# Additional headers to send with every request. +extra_headers: + api-key: +``` -# Serverless Semantic Search +- Restart the Datadog agent. 
-Andre Bogus +- You can now head over to the Datadog dashboard to view the [metrics](https://docs.datadoghq.com/integrations/qdrant/#data-collected) emitted by the Qdrant check. -· +## Further Reading -July 12, 2023 +- [Getting started with Datadog](https://docs.datadoghq.com/getting_started/) +- [Qdrant integration source](https://github.com/DataDog/integrations-extras/tree/master/qdrant) -![Serverless Semantic Search](https://qdrant.tech/articles_data/serverless/preview/title.jpg) +<|page-220-lllmstxt|> +# DeepEval -Do you want to insert a semantic search function into your website or online app? Now you can do so - without spending any money! In this example, you will learn how to create a free prototype search engine for your own non-commercial purposes. +[DeepEval](https://deepeval.com) by Confident AI is an open-source framework for testing large language model systems. Similar to Pytest but designed for LLM outputs, it evaluates metrics like G-Eval, hallucination, answer relevancy. -## [Anchor](https://qdrant.tech/articles/serverless/\#ingredients) Ingredients +DeepEval can be integrated with Qdrant to evaluate RAG pipelines — ensuring your LLM applications return relevant, grounded, and faithful responses based on retrieved vector search context. -- A [Rust](https://rust-lang.org/) toolchain -- [cargo lambda](https://cargo-lambda.info/) (install via package manager, [download](https://github.com/cargo-lambda/cargo-lambda/releases) binary or `cargo install cargo-lambda`) -- The [AWS CLI](https://aws.amazon.com/cli) -- Qdrant instance ( [free tier](https://cloud.qdrant.io/) available) -- An embedding provider service of your choice (see our [Embeddings docs](https://qdrant.tech/documentation/embeddings/). You may be able to get credits from [AI Grant](https://aigrant.org/), also Cohere has a [rate-limited non-commercial free tier](https://cohere.com/pricing)) -- AWS Lambda account (12-month free tier available) +## How it works -## [Anchor](https://qdrant.tech/articles/serverless/\#what-youre-going-to-build) What you’re going to build +A test case is a blueprint provided by DeepEval to unit test LLM outputs. There are two types of test cases in DeepEval: -You’ll combine the embedding provider and the Qdrant instance to a neat semantic search, calling both services from a small Lambda function. +`LLMTestCase`: Used to evaluate a single input-output pair, such as RAG responses or agent actions. -![lambda integration diagram](https://qdrant.tech/articles_data/serverless/lambda_integration.png) +`ConversationalTestCase`: A sequence of `LLMTestCase` turns representing a back-and-forth interaction with an LLM system. This is especially useful for chatbot or assistant testing. -Now lets look at how to work with each ingredient before connecting them. +## Metrics Overview -## [Anchor](https://qdrant.tech/articles/serverless/\#rust-and-cargo-lambda) Rust and cargo-lambda +DeepEval offers a suite of metrics to evaluate various aspects of LLM outputs, including: -You want your function to be quick, lean and safe, so using Rust is a no-brainer. To compile Rust code for use within Lambda functions, the `cargo-lambda` subcommand has been built. `cargo-lambda` can put your Rust code in a zip file that AWS Lambda can then deploy on a no-frills `provided.al2` runtime. +- **Answer Relevancy**: Measures how relevant the LLM's output is to the given input query. +- **Faithfulness**: Assesses whether the LLM's response is grounded in the provided context, ensuring factual accuracy. 
+- **Contextual Precision**: Determines whether the most relevant pieces of context are ranked higher than less relevant ones. +- **G-Eval**: A versatile metric that uses LLM-as-a-judge with chain-of-thought reasoning to evaluate outputs based on custom criteria. +- **Hallucination**: Detects instances where the LLM generates information not present in the source context. +- **Toxicity**: Assesses the presence of harmful or offensive content in the LLM's output. +- **Bias**: Evaluates the output for any unintended biases. +- **Summarization**: Measures the quality and accuracy of generated summaries. -To interface with AWS Lambda, you will need a Rust project with the following dependencies in your `Cargo.toml`: +For a comprehensive list and detailed explanations of all available metrics, please refer to the [DeepEval metrics reference](https://deepeval.com/docs/metrics-introduction). -```toml -[dependencies] -tokio = { version = "1", features = ["macros"] } -lambda_http = { version = "0.8", default-features = false, features = ["apigw_http"] } -lambda_runtime = "0.8" +## Using Qdrant with DeepEval -``` +Install the client libraries. -This gives you an interface consisting of an entry point to start the Lambda runtime and a way to register your handler for HTTP calls. Put the following snippet into `src/helloworld.rs`: +```bash +$ pip install deepeval qdrant-client -```rust -use lambda_http::{run, service_fn, Body, Error, Request, RequestExt, Response}; +$ deepeval login +``` -/// This is your callback function for responding to requests at your URL -async fn function_handler(_req: Request) -> Result, Error> { - Response::from_text("Hello, Lambda!") -} +You can use Qdrant to power your RAG system by retrieving relevant documents for a query, feeding them into your prompt, and evaluating the generated output using DeepEval. -#[tokio::main] -async fn main() { - run(service_fn(function_handler)).await -} +```python +from deepeval.test_case import LLMTestCase, ConversationalTestCase +from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ... -``` +# 1. Query context from Qdrant +context = qdrant_client.query_points(...) -You can also use a closure to bind other arguments to your function handler (the `service_fn` call then becomes `service_fn(|req| function_handler(req, ...))`). Also if you want to extract parameters from the request, you can do so using the [Request](https://docs.rs/lambda_http/latest/lambda_http/type.Request.html) methods (e.g. `query_string_parameters` or `query_string_parameters_ref`). +# 2. Construct prompt using query + retrieved context +prompt = build_prompt(query, context) -Add the following to your `Cargo.toml` to define the binary: +# 3. Generate response from your LLM +response = llm.generate(prompt) -```toml -[[bin]] -name = "helloworld" -path = "src/helloworld.rs" +# 4. Create a test case for evaluation +test_case = LLMTestCase( + input=query, + actual_output=response, + expected_output=ground_truth_answer, + retrieval_context=context +) +# 5. Evaluate the output using DeepEval +evaluate( + test_cases=[test_case], + metrics=[ + AnswerRelevancyMetric(), + FaithfulnessMetric(), + ContextualPrecisionMetric(), + ... + ], +) ``` -On the AWS side, you need to setup a Lambda and IAM role to use with your function. +All evaluations performed using DeepEval can be viewed on the [Confident AI Dashboard](https://app.confident-ai.com). 
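+As a more concrete illustration of the steps above, here is a minimal sketch that retrieves context from Qdrant and scores the generated answer with two DeepEval metrics. The collection name, the `text` payload field, and the `embed_query` and `generate_answer` helpers are stand-ins for your own embedding model and LLM call:
+
+```python
+from deepeval import evaluate
+from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
+from deepeval.test_case import LLMTestCase
+from qdrant_client import QdrantClient
+
+client = QdrantClient(url="http://localhost:6333")  # placeholder URL
+
+query = "What is a vector database?"
+
+# Stand-ins: plug in your own embedding model and LLM here.
+query_vector = embed_query(query)
+hits = client.query_points(collection_name="docs", query=query_vector, limit=3).points
+retrieval_context = [hit.payload["text"] for hit in hits]  # assumes a "text" payload field
+answer = generate_answer(query, retrieval_context)
+
+test_case = LLMTestCase(
+    input=query,
+    actual_output=answer,
+    retrieval_context=retrieval_context,
+)
+
+evaluate(
+    test_cases=[test_case],
+    metrics=[AnswerRelevancyMetric(), FaithfulnessMetric()],
+)
+```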
-![create lambda web page](https://qdrant.tech/articles_data/serverless/create_lambda.png)
+You can scale this process with a dataset (e.g. from Hugging Face) and evaluate multiple test cases at once by looping through question-answer pairs, querying Qdrant for context, and scoring with DeepEval metrics.
-Choose your function name, select “Provide your own bootstrap on Amazon Linux 2”. As architecture, use `arm64`. You will also activate a function URL. Here it is up to you if you want to protect it via IAM or leave it open, but be aware that open end points can be accessed by anyone, potentially costing money if there is too much traffic.
+## Further Reading
-By default, this will also create a basic role. To look up the role, you can go into the Function overview:
+- [End-to-end Evaluation Example](https://github.com/qdrant/qdrant-rag-eval/blob/master/workshop-rag-eval-qdrant-deepeval/notebook/rag_eval_qdrant_deepeval.ipynb)
+- [DeepEval documentation](https://deepeval.com)
+
+<|page-221-lllmstxt|>
+# DLT (Data Load Tool)
-![function overview](https://qdrant.tech/articles_data/serverless/lambda_overview.png)
+[DLT](https://dlthub.com/) is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets.
-Click on the “Info” link near the “▾ Function overview” heading, and select the “Permissions” tab on the left.
+With the DLT-Qdrant integration, you can now select Qdrant as a DLT destination to load data into.
-You will find the “Role name” directly under _Execution role_. Note it down for later.
+**DLT Enables**
-![function overview](https://qdrant.tech/articles_data/serverless/lambda_role.png)
+- Automated maintenance - with schema inference, alerts and short declarative code, maintenance becomes simple.
+- Run it where Python runs - on Airflow, serverless functions, notebooks. Scales on micro and large infrastructure alike.
+- User-friendly, declarative interface that removes knowledge obstacles for beginners while empowering senior professionals.
-To test that your “Hello, Lambda” service works, you can compile and upload the function:
+## Usage
+
+To get started, install `dlt` with the `qdrant` extra.
```bash
-$ export LAMBDA_FUNCTION_NAME=hello
-$ export LAMBDA_ROLE=
-$ export LAMBDA_REGION=us-east-1
-$ cargo lambda build --release --arm --bin helloworld --output-format zip
- Downloaded libc v0.2.137
-# [..] output omitted for brevity
- Finished release [optimized] target(s) in 1m 27s
-$ # Delete the old empty definition
-$ aws lambda delete-function-url-config --region $LAMBDA_REGION --function-name $LAMBDA_FUNCTION_NAME
-$ aws lambda delete-function --region $LAMBDA_REGION --function-name $LAMBDA_FUNCTION_NAME
-$ # Upload the function
-$ aws lambda create-function --function-name $LAMBDA_FUNCTION_NAME \
- --handler bootstrap \
- --architectures arm64 \
- --zip-file fileb://./target/lambda/helloworld/bootstrap.zip \
- --runtime provided.al2 \
- --region $LAMBDA_REGION \
- --role $LAMBDA_ROLE \
- --tracing-config Mode=Active
-$ # Add the function URL
-$ aws lambda add-permission \
- --function-name $LAMBDA_FUNCTION_NAME \
- --action lambda:InvokeFunctionUrl \
- --principal "*" \
- --function-url-auth-type "NONE" \
- --region $LAMBDA_REGION \
- --statement-id url
-$ # Here for simplicity unauthenticated URL access. Beware!
-$ aws lambda create-function-url-config \ - --function-name $LAMBDA_FUNCTION_NAME \ - --region $LAMBDA_REGION \ - --cors "AllowOrigins=*,AllowMethods=*,AllowHeaders=*" \ - --auth-type NONE +pip install "dlt[qdrant]" +``` + +Configure the destination in the DLT secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file. +```toml +[destination.qdrant.credentials] +location = "https://your-qdrant-url" +api_key = "your-qdrant-api-key" ``` -Now you can go to your _Function Overview_ and click on the Function URL. You should see something like this: +The location will default to `http://localhost:6333` and `api_key` is not defined - which are the defaults for a local Qdrant instance. +Find more information about DLT configurations [here](https://dlthub.com/docs/general-usage/credentials). -```text -Hello, Lambda! +Define the source of the data. -``` +```python +import dlt +from dlt.destinations.qdrant import qdrant_adapter -Bearer ! You have set up a Lambda function in Rust. On to the next ingredient: +movies = [ + { + "title": "Blade Runner", + "year": 1982, + "description": "The film is about a dystopian vision of the future that combines noir elements with sci-fi imagery." + }, + { + "title": "Ghost in the Shell", + "year": 1995, + "description": "The film is about a cyborg policewoman and her partner who set out to find the main culprit behind brain hacking, the Puppet Master." + }, + { + "title": "The Matrix", + "year": 1999, + "description": "The movie is set in the 22nd century and tells the story of a computer hacker who joins an underground group fighting the powerful computers that rule the earth." + } +] +``` -## [Anchor](https://qdrant.tech/articles/serverless/\#embedding) Embedding + -Most providers supply a simple https GET or POST interface you can use with an API key, which you have to supply in an authentication header. If you are using this for non-commercial purposes, the rate limited trial key from Cohere is just a few clicks away. Go to [their welcome page](https://dashboard.cohere.ai/welcome/register), register and you’ll be able to get to the dashboard, which has an “API keys” menu entry which will bring you to the following page: -[cohere dashboard](https://qdrant.tech/articles_data/serverless/cohere-dashboard.png) +Define the pipeline. -From there you can click on the ⎘ symbol next to your API key to copy it to the clipboard. _Don’t put your API key in the code!_ Instead read it from an env variable you can set in the lambda environment. This avoids accidentally putting your key into a public repo. Now all you need to get embeddings is a bit of code. First you need to extend your dependencies with `reqwest` and also add `anyhow` for easier error handling: +```python +pipeline = dlt.pipeline( + pipeline_name="movies", + destination="qdrant", + dataset_name="movies_dataset", +) +``` -```toml -anyhow = "1.0" -reqwest = { version = "0.11.18", default-features = false, features = ["json", "rustls-tls"] } -serde = "1.0" +Run the pipeline. +```python +info = pipeline.run( + qdrant_adapter( + movies, + embed=["title", "description"] + ) +) ``` -Now given the API key from above, you can make a call to get the embedding vectors: +The data is now loaded into Qdrant. -```rust -use anyhow::Result; -use serde::Deserialize; -use reqwest::Client; +To use vector search after the data has been loaded, you must specify which fields Qdrant needs to generate embeddings for. 
You do that by wrapping the data (or [DLT resource](https://dlthub.com/docs/general-usage/resource)) with the `qdrant_adapter` function. -#[derive(Deserialize)] -struct CohereResponse { outputs: Vec> } +## Write disposition -pub async fn embed(client: &Client, text: &str, api_key: &str) -> Result>> { - let CohereResponse { outputs } = client - .post("https://api.cohere.ai/embed") - .header("Authorization", &format!("Bearer {api_key}")) - .header("Content-Type", "application/json") - .header("Cohere-Version", "2021-11-08") - .body(format!("{{\"text\":[\"{text}\"],\"model\":\"small\"}}")) - .send() - .await? - .json() - .await?; - Ok(outputs) -} +A DLT [write disposition](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant/#write-disposition) defines how the data should be written to the destination. All write dispositions are supported by the Qdrant destination. -``` +## DLT Sync -Note that this may return multiple vectors if the text overflows the input dimensions. -Cohere’s `small` model has 1024 output dimensions. +Qdrant destination supports syncing of the [`DLT` state](https://dlthub.com/docs/general-usage/state#syncing-state-with-destination). -Other providers have similar interfaces. Consult our [Embeddings docs](https://qdrant.tech/documentation/embeddings/) for further information. See how little code it took to get the embedding? +## Next steps -While you’re at it, it’s a good idea to write a small test to check if embedding works and the vectors are of the expected size: +- The comprehensive Qdrant DLT destination documentation can be found [here](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant/). +- [Source Code](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/impl/qdrant) -```rust -#[tokio::test] -async fn check_embedding() { - // ignore this test if API_KEY isn't set - let Ok(api_key) = &std::env::var("API_KEY") else { return; } - let embedding = crate::embed("What is semantic search?", api_key).unwrap()[0]; - // Cohere's `small` model has 1024 output dimensions. - assert_eq!(1024, embedding.len()); -} +<|page-222-lllmstxt|> +# Dynamiq -``` +Dynamiq is your all-in-one Gen AI framework, designed to streamline the development of AI-powered applications. Dynamiq specializes in orchestrating retrieval-augmented generation (RAG) and large language model (LLM) agents. -Run this while setting the `API_KEY` environment variable to check if the embedding works. +Qdrant is a vector database available in Dynamiq, capable of serving multiple roles. It can be used for writing and retrieving documents, acting as memory for agent interactions, and functioning as a retrieval tool that agents can call when needed. -## [Anchor](https://qdrant.tech/articles/serverless/\#qdrant-search) Qdrant search +## Installing -Now that you have embeddings, it’s time to put them into your Qdrant. You could of course use `curl` or `python` to set up your collection and upload the points, but as you already have Rust including some code to obtain the embeddings, you can stay in Rust, adding `qdrant-client` to the mix. 
+First, ensure you have the `dynamiq` library installed: -```rust -use anyhow::Result; -use qdrant_client::prelude::*; -use qdrant_client::qdrant::{VectorsConfig, VectorParams}; -use qdrant_client::qdrant::vectors_config::Config; -use std::collections::HashMap; +```bash +$ pip install dynamiq +``` -fn setup<'i>( - embed_client: &reqwest::Client, - embed_api_key: &str, - qdrant_url: &str, - api_key: Option<&str>, - collection_name: &str, - data: impl Iterator)>, -) -> Result<()> { - let mut config = QdrantClientConfig::from_url(qdrant_url); - config.api_key = api_key; - let client = QdrantClient::new(Some(config))?; +## Retriever node - // create the collections - if !client.has_collection(collection_name).await? { - client - .create_collection(&CreateCollection { - collection_name: collection_name.into(), - vectors_config: Some(VectorsConfig { - config: Some(Config::Params(VectorParams { - size: 1024, // output dimensions from above - distance: Distance::Cosine as i32, - ..Default::default() - })), - }), - ..Default::default() - }) - .await?; - } - let mut id_counter = 0_u64; - let points = data.map(|(text, payload)| { - let id = std::mem::replace(&mut id_counter, *id_counter + 1); - let vectors = Some(embed(embed_client, text, embed_api_key).unwrap()); - PointStruct { id, vectors, payload } - }).collect(); - client.upsert_points(collection_name, points, None).await?; - Ok(()) -} +The QdrantDocumentRetriever node enables efficient retrieval of relevant documents based on vector similarity search. -``` +```python +from dynamiq.nodes.retrievers import QdrantDocumentRetriever +from dynamiq import Workflow -Depending on whether you want to efficiently filter the data, you can also add some indexes. I’m leaving this out for brevity. Also this does not implement chunking (splitting the data to upsert in multiple requests, which avoids timeout errors). +# Define a retriever node to fetch most relevant documents +retriever_node = QdrantDocumentRetriever( + index_name="default", + top_k=5, # Optional: Maximum number of documents to retrieve + filters={...} # Optional: Additional filtering conditions +) -Add a suitable `main` method and you can run this code to insert the points (or just use the binary from the example). Be sure to include the port in the `qdrant_url`. +# Create a workflow and add the retriever node +wf = Workflow() +wf.flow.add_nodes(retriever_node) -Now that you have the points inserted, you can search them by embedding: +# Execute retrieval +result = wf.run(input_data={ + 'embedding': query_embedding # Provide an embedded query for similarity search +}) -```rust -use anyhow::Result; -use qdrant_client::prelude::*; -pub async fn search( - text: &str, - collection_name: String, - client: &Client, - api_key: &str, - qdrant: &QdrantClient, -) -> Result> { - Ok(qdrant.search_points(&SearchPoints { - collection_name, - limit: 5, // use what fits your use case here - with_payload: Some(true.into()), - vector: embed(client, text, api_key)?, - ..Default::default() - }).await?.result) -} +``` -``` +## Writer node -You can also filter by adding a `filter: ...` field to the `SearchPoints`, and you will likely want to process the result further, but the example code already does that, so feel free to start from there in case you need this functionality. +The QdrantDocumentWriter node allows storing documents in the Qdrant vector database. 
-## [Anchor](https://qdrant.tech/articles/serverless/\#putting-it-all-together) Putting it all together +```python +from dynamiq.nodes.writers import QdrantDocumentWriter -Now that you have all the parts, it’s time to join them up. Now copying and wiring up the snippets above is left as an exercise to the reader. +# Define a writer node to store documents in Qdrant +writer_node = QdrantDocumentWriter( + index_name="default", + create_if_not_exist=True +) -You’ll want to extend the `main` method a bit to connect with the Client once at the start, also get API keys from the environment so you don’t need to compile them into the code. To do that, you can get them with `std::env::var(_)` from the rust code and set the environment from the AWS console. +# Create a workflow and add the writer node +wf = Workflow() +wf.flow.add_nodes(writer_node) -```bash -$ export QDRANT_URI= -$ export QDRANT_API_KEY= -$ export COHERE_API_KEY= -$ export COLLECTION_NAME=site-cohere -$ aws lambda update-function-configuration \ - --function-name $LAMBDA_FUNCTION_NAME \ - --environment "Variables={QDRANT_URI=$QDRANT_URI,\ - QDRANT_API_KEY=$QDRANT_API_KEY,COHERE_API_KEY=${COHERE_API_KEY},\ - COLLECTION_NAME=${COLLECTION_NAME}"` +# Execute writing +result = wf.run(input_data={ + 'documents': embedded_documents # Provide embedded documents for storage +}) +``` -``` +# Additional Tutorials -In any event, you will arrive at one command line program to insert your data and one Lambda function. The former can just be `cargo run` to set up the collection. For the latter, you can again call `cargo lambda` and the AWS console: +Discover additional examples and use cases of Qdrant with Dynamiq: -```bash -$ export LAMBDA_FUNCTION_NAME=search -$ export LAMBDA_REGION=us-east-1 -$ cargo lambda build --release --arm --output-format zip - Downloaded libc v0.2.137 -# [..] output omitted for brevity - Finished release [optimized] target(s) in 1m 27s -$ # Update the function -$ aws lambda update-function-code --function-name $LAMBDA_FUNCTION_NAME \ - --zip-file fileb://./target/lambda/page-search/bootstrap.zip \ - --region $LAMBDA_REGION +- [Using Qdrant with Dynamiq – A Hands-on Tutorial](https://colab.research.google.com/drive/1rlZJW4lOM36b7ZxK-dVJv5dE2xrgwxU_?usp=sharing) +- [End-to-End Application with Qdrant and Dynamiq](https://colab.research.google.com/drive/1RaR25BCj_D5wzQ70ejUQyKzdCM6DUXMF?usp=sharing) -``` -## [Anchor](https://qdrant.tech/articles/serverless/\#discussion) Discussion -Lambda works by spinning up your function once the URL is called, so they don’t need to keep the compute on hand unless it is actually used. This means that the first call will be burdened by some 1-2 seconds of latency for loading the function, later calls will resolve faster. Of course, there is also the latency for calling the embeddings provider and Qdrant. On the other hand, the free tier doesn’t cost a thing, so you certainly get what you pay for. And for many use cases, a result within one or two seconds is acceptable. +## For more details, please refer to: -Rust minimizes the overhead for the function, both in terms of file size and runtime. Using an embedding service means you don’t need to care about the details. Knowing the URL, API key and embedding size is sufficient. Finally, with free tiers for both Lambda and Qdrant as well as free credits for the embedding provider, the only cost is your time to set everything up. Who could argue with free? 
+- [Dynamiq Documentation](https://docs.getdynamiq.ai/)
+- [Dynamiq GitHub](https://github.com/dynamiq-ai/dynamiq)
-##### Was this page useful?
+<|page-223-lllmstxt|>
+## Feast
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+[Feast (**Fe**ature **St**ore)](https://docs.feast.dev) is an open-source feature store that helps teams operate production ML systems at scale by allowing them to define, manage, validate, and serve features for production AI/ML.
-Thank you for your feedback! 🙏
+Qdrant is available as a supported vector store in Feast to integrate into your workflows.
-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/serverless.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+## Installation
-On this page:
+To use the Qdrant online store, you need to install Feast with the `qdrant` extra.
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/serverless.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+```bash
+pip install 'feast[qdrant]'
+```
-×
+## Usage
-[Powered by](https://qdrant.tech/)
+An example config with Qdrant could look like:
-<|page-127-lllmstxt|>
-## rag-chatbot-vultr-dspy-ollama
-- [Documentation](https://qdrant.tech/documentation/)
-- [Examples](https://qdrant.tech/documentation/examples/)
-- Private RAG Information Extraction Engine
+```yaml
+project: my_feature_repo
+registry: data/registry.db
+provider: local
+online_store:
+  type: qdrant
+  host: xyz-example.eu-central.aws.cloud.qdrant.io
+  port: 6333
+  api_key:
+  vector_len: 384
+  # Reference: https://qdrant.tech/documentation/concepts/vectors/#named-vectors
+  # vector_name: text-vec
+  write_batch_size: 100
+```
-# [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#private-rag-information-extraction-engine) Private RAG Information Extraction Engine
+You can refer to the Feast [documentation](https://docs.feast.dev/reference/alpha-vector-database#configuration-and-installation) for the full list of configuration options.
-| Time: 90 min | Level: Advanced | | |
+## Retrieving Documents
-| --- | --- | --- | --- |
-Handling private documents is a common task in many industries. Various businesses possess a large amount of
-unstructured data stored as huge files that must be processed and analyzed. Industry reports, financial analysis, legal
-documents, and many other documents are stored in PDF, Word, and other formats. Conversational chatbots built on top of
-RAG pipelines are one of the viable solutions for finding the relevant answers in such documents. However, if we want to
-extract structured information from these documents, and pass them to downstream systems, we need to use a different
-approach.
+The Qdrant online store supports retrieving document vectors for a given list of entity keys. The document vectors are returned as a dictionary where the key is the entity key and the value is the vector.
-Information extraction is a process of structuring unstructured data into a format that can be easily processed by
-machines. In this tutorial, we will show you how to use [DSPy](https://dspy-docs.vercel.app/) to perform that process on
-a set of documents.
Assuming we cannot send our data to an external service, we will use [Ollama](https://ollama.com/) -to run our own LLM model on our premises, using [Vultr](https://www.vultr.com/) as a cloud provider. Qdrant, acting in -this setup as a knowledge base providing the relevant pieces of documents for a given query, will also be hosted in the -Hybrid Cloud mode on Vultr. The last missing piece, the DSPy application will be also running in the same environment. -If you work in a regulated industry, or just need to keep your data private, this tutorial is for you. +```python +from feast import FeatureStore -![Architecture diagram](https://qdrant.tech/documentation/examples/information-extraction-ollama-vultr/architecture-diagram.png) +feature_store = FeatureStore(repo_path="feature_store.yaml") -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#deploying-qdrant-hybrid-cloud-on-vultr) Deploying Qdrant Hybrid Cloud on Vultr +query_vector = [1.0, 2.0, 3.0, 4.0, 5.0] +top_k = 5 -All the services we are going to use in this tutorial will be running on [Vultr Kubernetes\\ -Engine](https://www.vultr.com/kubernetes/). That gives us a lot of flexibility in terms of scaling and managing the resources. Vultr manages the control plane and worker nodes and provides integration with other managed services such as Load Balancers, Block Storage, and DNS. +feature_values = feature_store.retrieve_online_documents( + feature="my_feature", + query=query_vector, + top_k=top_k +) +``` -1. To start using managed Kubernetes on Vultr, follow the [platform-specific documentation](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#vultr). -2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/). +## 📚 Further Reading -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#installing-the-necessary-packages) Installing the necessary packages +- [Feast Documentation](http://docs.feast.dev/) +- [Source](https://github.com/feast-dev/feast/tree/master/sdk/python/feast/infra/online_stores/qdrant_online_store) -We are going to need a couple of Python packages to run our application. They might be installed together with the -`dspy-ai` package and `qdrant` extra: +<|page-224-lllmstxt|> +# FiftyOne -```shell -pip install dspy-ai dspy-qdrant +[FiftyOne](https://voxel51.com/) is an open-source toolkit designed to enhance computer vision workflows by optimizing dataset quality +and providing valuable insights about your models. FiftyOne 0.20, which includes a native integration with Qdrant, supporting workflows +like [image similarity search](https://docs.voxel51.com/user_guide/brain.html#image-similarity) and +[text search](https://docs.voxel51.com/user_guide/brain.html#text-similarity). + +Qdrant helps FiftyOne to find the most similar images in the dataset using vector embeddings. + +FiftyOne is available as a Python package that might be installed in the following way: +```bash +pip install fiftyone ``` -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#qdrant-hybrid-cloud) Qdrant Hybrid Cloud +Please check out the documentation of FiftyOne on [Qdrant integration](https://docs.voxel51.com/integrations/qdrant.html). -Our [documentation](https://qdrant.tech/documentation/hybrid-cloud/) contains a comprehensive guide on how to set up Qdrant in the Hybrid Cloud mode on Vultr. 
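+As a minimal sketch of what that workflow can look like (assuming a Qdrant server reachable on the default `localhost:6333` and `qdrant-client` installed alongside FiftyOne; the dataset, model, and `brain_key` below are illustrative):
+
+```python
+import fiftyone as fo
+import fiftyone.brain as fob
+import fiftyone.zoo as foz
+
+# Load a small sample dataset that ships with FiftyOne
+dataset = foz.load_zoo_dataset("quickstart")
+
+# Build an image-similarity index whose embeddings are stored in Qdrant
+fob.compute_similarity(
+    dataset,
+    model="clip-vit-base32-torch",  # zoo model used to embed the images
+    brain_key="qdrant_similarity",  # illustrative name for this index
+    backend="qdrant",               # connection settings default to localhost:6333
+)
+
+# Sort the dataset by similarity to its first sample and open the App on the result
+query_id = dataset.first().id
+view = dataset.sort_by_similarity(query_id, brain_key="qdrant_similarity", k=10)
+session = fo.launch_app(view)
+```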
Please follow it carefully to get your Qdrant instance up and running. Once it’s done, we need to store the Qdrant URL and the API key in the environment variables. You can do it by running the following commands: +<|page-225-lllmstxt|> +# Firebase Genkit -shellpython +[Genkit](https://firebase.google.com/products/genkit) is a framework to build, deploy, and monitor production-ready AI-powered apps. -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +You can build apps that generate custom content, use semantic search, handle unstructured inputs, answer questions with your business data, autonomously make decisions, orchestrate tool calls, and more. -``` +You can use Qdrant for indexing/semantic retrieval of data in your Genkit applications via the [Qdrant-Genkit plugin](https://github.com/qdrant/qdrant-genkit). -```python -import os +Genkit currently supports server-side development in JavaScript/TypeScript (Node.js) with Go support in active development. -os.environ["QDRANT_URL"] = "https://qdrant.example.com" -os.environ["QDRANT_API_KEY"] = "your-api-key" +## Installation +```bash +npm i genkitx-qdrant ``` -DSPy is framework we are going to use. It’s integrated with Qdrant already, but it assumes you use -[FastEmbed](https://qdrant.github.io/fastembed/) to create the embeddings. DSPy does not provide a way to index the -data, but leaves this task to the user. We are going to create a collection on our own, and fill it with the embeddings -of our document chunks. +## Configuration -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#data-indexing) Data indexing +To use this plugin, specify it when you call `configureGenkit()`: -FastEmbed uses the `BAAI/bge-small-en` as the default embedding model. We are going to use it as well. Our collection -will be created automatically if we call the `.add` method on an existing `QdrantClient` instance. In this tutorial we -are not going to focus much on the document parsing, as there are plenty of tools that can help with that. The -[`unstructured`](https://github.com/Unstructured-IO/unstructured) library is one of the options you can launch on your -infrastructure. In our simplified example, we are going to use a list of strings as our documents. These are the -descriptions of the made up technical events. Each of them should contain the name of the event along with the location -and start and end dates. 
- -```python -documents = [\ - "Taking place in San Francisco, USA, from the 10th to the 12th of June, 2024, the Global Developers Conference is the annual gathering spot for developers worldwide, offering insights into software engineering, web development, and mobile applications.",\ - "The AI Innovations Summit, scheduled for 15-17 September 2024 in London, UK, aims at professionals and researchers advancing artificial intelligence and machine learning.",\ - "Berlin, Germany will host the CyberSecurity World Conference between November 5th and 7th, 2024, serving as a key forum for cybersecurity professionals to exchange strategies and research on threat detection and mitigation.",\ - "Data Science Connect in New York City, USA, occurring from August 22nd to 24th, 2024, connects data scientists, analysts, and engineers to discuss data science's innovative methodologies, tools, and applications.",\ - "Set for July 14-16, 2024, in Tokyo, Japan, the Frontend Developers Fest invites developers to delve into the future of UI/UX design, web performance, and modern JavaScript frameworks.",\ - "The Blockchain Expo Global, happening May 20-22, 2024, in Dubai, UAE, focuses on blockchain technology's applications, opportunities, and challenges for entrepreneurs, developers, and investors.",\ - "Singapore's Cloud Computing Summit, scheduled for October 3-5, 2024, is where IT professionals and cloud experts will convene to discuss strategies, architectures, and cloud solutions.",\ - "The IoT World Forum, taking place in Barcelona, Spain from December 1st to 3rd, 2024, is the premier conference for those focused on the Internet of Things, from smart cities to IoT security.",\ - "Los Angeles, USA, will become the hub for game developers, designers, and enthusiasts at the Game Developers Arcade, running from April 18th to 20th, 2024, to showcase new games and discuss development tools.",\ - "The TechWomen Summit in Sydney, Australia, from March 8-10, 2024, aims to empower women in tech with workshops, keynotes, and networking opportunities.",\ - "Seoul, South Korea's Mobile Tech Conference, happening from September 29th to October 1st, 2024, will explore the future of mobile technology, including 5G networks and app development trends.",\ - "The Open Source Summit, to be held in Helsinki, Finland from August 11th to 13th, 2024, celebrates open source technologies and communities, offering insights into the latest software and collaboration techniques.",\ - "Vancouver, Canada will play host to the VR/AR Innovation Conference from June 20th to 22nd, 2024, focusing on the latest in virtual and augmented reality technologies.",\ - "Scheduled for May 5-7, 2024, in London, UK, the Fintech Leaders Forum brings together experts to discuss the future of finance, including innovations in blockchain, digital currencies, and payment technologies.",\ - "The Digital Marketing Summit, set for April 25-27, 2024, in New York City, USA, is designed for marketing professionals and strategists to discuss digital marketing and social media trends.",\ - "EcoTech Symposium in Paris, France, unfolds over 2024-10-09 to 2024-10-11, spotlighting sustainable technologies and green innovations for environmental scientists, tech entrepreneurs, and policy makers.",\ - "Set in Tokyo, Japan, from 16th to 18th May '24, the Robotic Innovations Conference showcases automation, robotics, and AI-driven solutions, appealing to enthusiasts and engineers.",\ - "The Software Architecture World Forum in Dublin, Ireland, occurring 22-24 Sept 
2024, gathers software architects and IT managers to discuss modern architecture patterns.",\ - "Quantum Computing Summit, convening in Silicon Valley, USA from 2024/11/12 to 2024/11/14, is a rendezvous for exploring quantum computing advancements with physicists and technologists.",\ - "From March 3 to 5, 2024, the Global EdTech Conference in London, UK, discusses the intersection of education and technology, featuring e-learning and digital classrooms.",\ - "Bangalore, India's NextGen DevOps Days, from 28 to 30 August 2024, is a hotspot for IT professionals keen on the latest DevOps tools and innovations.",\ - "The UX/UI Design Conference, slated for April 21-23, 2024, in New York City, USA, invites discussions on the latest in user experience and interface design among designers and developers.",\ - "Big Data Analytics Summit, taking place 2024 July 10-12 in Amsterdam, Netherlands, brings together data professionals to delve into big data analysis and insights.",\ - "Toronto, Canada, will see the HealthTech Innovation Forum from June 8 to 10, '24, focusing on technology's impact on healthcare with professionals and innovators.",\ - "Blockchain for Business Summit, happening in Singapore from 2024-05-02 to 2024-05-04, focuses on blockchain's business applications, from finance to supply chain.",\ - "Las Vegas, USA hosts the Global Gaming Expo from October 18th to 20th, 2024, a premiere event for game developers, publishers, and enthusiasts.",\ - "The Renewable Energy Tech Conference in Copenhagen, Denmark, from 2024/09/05 to 2024/09/07, discusses renewable energy innovations and policies.",\ - "Set for 2024 Apr 9-11 in Boston, USA, the Artificial Intelligence in Healthcare Summit gathers healthcare professionals to discuss AI's healthcare applications.",\ - "Nordic Software Engineers Conference, happening in Stockholm, Sweden from June 15 to 17, 2024, focuses on software development in the Nordic region.",\ - "The International Space Exploration Symposium, scheduled in Houston, USA from 2024-08-05 to 2024-08-07, invites discussions on space exploration technologies and missions."\ -] +```js +import { qdrant } from 'genkitx-qdrant'; +const ai = genkit({ + plugins: [ + qdrant([ + { + embedder: googleAI.embedder('text-embedding-004'), + collectionName: 'collectionName', + clientParams: { + url: 'http://localhost:6333', + } + } + ]), + ], +}); ``` -We’ll be able to ask general questions, for example, about topics we are interested in or events happening in a specific -location, but expect the results to be returned in a structured format. +You'll need to specify a collection name, the embedding model you want to use and the Qdrant client parameters. In +addition, there are a few optional parameters: -![An example of extracted information](https://qdrant.tech/documentation/examples/information-extraction-ollama-vultr/extracted-information.png) +- `embedderOptions`: Additional options to pass options to the embedder: -Indexing in Qdrant is a single call if we have the documents defined: + ```js + embedderOptions: { taskType: 'RETRIEVAL_DOCUMENT' }, + ``` -```python -client.add( - collection_name="document-parts", - documents=documents, - metadata=[{"document": document} for document in documents], -) +- `contentPayloadKey`: Name of the payload filed with the document content. Defaults to "content". -``` + ```js + contentPayloadKey: 'content'; + ``` -Our collection is ready to be queried. We can now move to the next step, which is setting up the Ollama model. 
+- `metadataPayloadKey`: Name of the payload filed with the document metadata. Defaults to "metadata". -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#ollama-on-vultr) Ollama on Vultr + ```js + metadataPayloadKey: 'metadata'; + ``` -Ollama is a great tool for running the LLM models on your own infrastructure. It’s designed to be lightweight and easy -to use, and [an official Docker image](https://hub.docker.com/r/ollama/ollama) is available. We can use it to run Ollama -on our Vultr Kubernetes cluster. In case of LLMs we may have some special requirements, like a GPU, and Vultr provides -the [Vultr Kubernetes Engine for Cloud GPU](https://www.vultr.com/products/cloud-gpu/) so the model can be run on a -specialized machine. Please refer to the official documentation to get Ollama up and running within your environment. -Once it’s done, we need to store the Ollama URL in the environment variable: +- `dataTypePayloadKey`: Name of the payload filed with the document datatype. Defaults to "_content_type". -shellpython + ```js + dataTypePayloadKey: '_datatype'; + ``` -```shell -export OLLAMA_URL="https://ollama.example.com" +- `collectionCreateOptions`: [Additional options](<(https://qdrant.tech/documentation/concepts/collections/#create-a-collection)>) when creating the Qdrant collection. + +## Usage +Import retriever and indexer references like so: + +```js +import { qdrantIndexerRef, qdrantRetrieverRef } from 'genkitx-qdrant'; ``` -```python -os.environ["OLLAMA_URL"] = "https://ollama.example.com" +Then, pass their references to `retrieve()` and `index()`: +```js +// To export an indexer reference: +export const qdrantIndexer = qdrantIndexerRef('collectionName', 'displayName'); ``` -We will refer to this URL later on when configuring the Ollama model in our application. +```js +// To export a retriever reference: +export const qdrantRetriever = qdrantRetrieverRef('collectionName', 'displayName'); +``` -#### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#setting-up-the-large-language-model) Setting up the Large Language Model +You can refer to [Retrieval-augmented generation](https://genkit.dev/docs/rag/) for a general +discussion on indexers and retrievers. -We are going to use one of the lightweight LLMs available in Ollama, a `gemma:2b` model. It was developed by Google -DeepMind team and has 3B parameters. The [Ollama version](https://ollama.com/library/gemma:2b) uses 4-bit quantization. -Installing the model is as simple as running the following command on the machine where Ollama is running: +## Further Reading -```shell -ollama run gemma:2b +- [Introduction to Genkit](https://genkit.dev/) +- [Genkit Documentation](https://genkit.dev/docs/get-started/) +- [Source Code](https://github.com/qdrant/qdrant-genkit) -``` +<|page-226-lllmstxt|> +| Time: 10 min | Level: Beginner | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/gemini-getting-started/gemini-getting-started/gemini-getting-started.ipynb) | +| --- | ----------- | ----------- | -Ollama models are also integrated with DSPy, so we can use them directly in our application. 
+# Gemini -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#implementing-the-information-extraction-pipeline) Implementing the information extraction pipeline +Qdrant is compatible with Gemini Embedding Model API and its official Python SDK that can be installed as any other package: -DSPy is a bit different from the other LLM frameworks. It’s designed to optimize the prompts and weights of LMs in a -pipeline. It’s a bit like a compiler for LMs: you write a pipeline in a high-level language, and DSPy generates the -prompts and weights for you. This means you can build complex systems without having to worry about the details of how -to prompt your LMs, as DSPy will do that for you. It is somehow similar to PyTorch but for LLMs. +Gemini is a new family of Google PaLM models, released in December 2023. The new embedding models succeed the previous Gecko Embedding Model. -First of all, we will define the Language Model we are going to use: +In the latest models, an additional parameter, `task_type`, can be passed to the API call. This parameter serves to designate the intended purpose for the embeddings utilized. -```python -import dspy +The Embedding Model API supports various task types, outlined as follows: -gemma_model = dspy.OllamaLocal( - model="gemma:2b", - base_url=os.environ.get("OLLAMA_URL"), - max_tokens=500, -) +1. `retrieval_query`: query in a search/retrieval setting +2. `retrieval_document`: document from the corpus being searched +3. `semantic_similarity`: semantic text similarity +4. `classification`: embeddings to be used for text classification +5. `clustering`: the generated embeddings will be used for clustering +6. `task_type_unspecified`: Unset value, which will default to one of the other values. -``` -Similarly, we have to define connection to our Qdrant Hybrid Cloud cluster: +If you're building a semantic search application, such as RAG, you should use `task_type="retrieval_document"` for the indexed documents and `task_type="retrieval_query"` for the search queries. -```python -from dspy_qdrant import QdrantRM -from qdrant_client import QdrantClient, models +The following example shows how to do this with Qdrant: -client = QdrantClient( - os.environ.get("QDRANT_URL"), - api_key=os.environ.get("QDRANT_API_KEY"), -) -qdrant_retriever = QdrantRM( - qdrant_collection_name="document-parts", - qdrant_client=client, -) +## Setup +```bash +pip install google-generativeai ``` -Finally, both components have to be configured in DSPy with a simple call to one of the functions: +Let's see how to use the Embedding Model API to embed a document for retrieval. 
+ +The following example shows how to embed a document with the `models/embedding-001` with the `retrieval_document` task type: + +## Embedding a document ```python -dspy.configure(lm=gemma_model, rm=qdrant_retriever) +import google.generativeai as gemini_client +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, PointStruct, VectorParams + +collection_name = "example_collection" + +GEMINI_API_KEY = "YOUR GEMINI API KEY" # add your key here +client = QdrantClient(url="http://localhost:6333") +gemini_client.configure(api_key=GEMINI_API_KEY) +texts = [ + "Qdrant is a vector database that is compatible with Gemini.", + "Gemini is a new family of Google PaLM models, released in December 2023.", +] + +results = [ + gemini_client.embed_content( + model="models/embedding-001", + content=sentence, + task_type="retrieval_document", + title="Qdrant x Gemini", + ) + for sentence in texts +] ``` -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#application-logic) Application logic +## Creating Qdrant Points and Indexing documents with Qdrant -There is a concept of signatures which defines input and output formats of the pipeline. We are going to define a simple -signature for the event: +### Creating Qdrant Points ```python -class Event(dspy.Signature): - description = dspy.InputField( - desc="Textual description of the event, including name, location and dates" +points = [ + PointStruct( + id=idx, + vector=response['embedding'], + payload={"text": text}, ) - event_name = dspy.OutputField(desc="Name of the event") - location = dspy.OutputField(desc="Location of the event") - start_date = dspy.OutputField(desc="Start date of the event, YYYY-MM-DD") - end_date = dspy.OutputField(desc="End date of the event, YYYY-MM-DD") - + for idx, (response, text) in enumerate(zip(results, texts)) +] ``` -It is designed to derive the structured information from the textual description of the event. Now, we can build our -module that will use it, along with Qdrant and Ollama model. Let’s call it `EventExtractor`: +### Create Collection ```python -class EventExtractor(dspy.Module): +client.create_collection(collection_name, vectors_config= + VectorParams( + size=768, + distance=Distance.COSINE, + ) +) +``` - def __init__(self): - super().__init__() - # Retrieve module to get relevant documents - self.retriever = dspy.Retrieve(k=3) - # Predict module for the created signature - self.predict = dspy.Predict(Event) +### Add these into the collection - def forward(self, query: str): - # Retrieve the most relevant documents - results = self.retriever.forward(query) +```python +client.upsert(collection_name, points) +``` - # Try to extract events from the retrieved documents - events = [] - for document in results.passages: - event = self.predict(description=document) - events.append(event) +## Searching for documents with Qdrant - return events +Once the documents are indexed, you can search for the most relevant documents using the same model with the `retrieval_query` task type: +```python +client.search( + collection_name=collection_name, + query_vector=gemini_client.embed_content( + model="models/embedding-001", + content="Is Qdrant compatible with Gemini?", + task_type="retrieval_query", + )["embedding"], +) ``` -The logic is simple: we retrieve the most relevant documents from Qdrant, and then try to extract the structured -information from them using the `Event` signature. 
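+If you want to experiment with the Binary Quantization setup whose results are reported further below, one hedged way to do it is to enable quantization when the collection is created and then control oversampling and rescoring at query time. The collection name here is illustrative:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Same 768-dimensional Gemini vectors, but with binary-quantized copies kept in RAM
+client.create_collection(
+    collection_name="example_collection_bq",  # illustrative name
+    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
+    quantization_config=models.BinaryQuantization(
+        binary=models.BinaryQuantizationConfig(always_ram=True),
+    ),
+)
+
+# At query time, oversampling and rescoring trade accuracy against speed;
+# these are the parameters varied in the table below
+results = client.search(
+    collection_name="example_collection_bq",
+    query_vector=[0.0] * 768,  # replace with a real Gemini query embedding
+    limit=10,
+    search_params=models.SearchParams(
+        quantization=models.QuantizationSearchParams(rescore=True, oversampling=3.0),
+    ),
+)
+```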
We can simply call it and see the results: - -```python -extractor = EventExtractor() -extractor.forward("Blockchain events close to Europe") +## Using Gemini Embedding Models with Binary Quantization -``` +You can use Gemini Embedding Models with [Binary Quantization](/articles/binary-quantization/) - a technique that allows you to reduce the size of the embeddings by 32 times without losing the quality of the search results too much. -Output: +In this table, you can see the results of the search with the `models/embedding-001` model with Binary Quantization in comparison with the original model: -```python -[\ - Prediction(\ - event_name='Event Name: Blockchain Expo Global',\ - location='Dubai, UAE',\ - start_date='2024-05-20',\ - end_date='2024-05-22'\ - ),\ - Prediction(\ - event_name='Event Name: Blockchain for Business Summit',\ - location='Singapore',\ - start_date='2024-05-02',\ - end_date='2024-05-04'\ - ),\ - Prediction(\ - event_name='Event Name: Open Source Summit',\ - location='Helsinki, Finland',\ - start_date='2024-08-11',\ - end_date='2024-08-13'\ - )\ -] +At an oversampling of 3 and a limit of 100, we've a 95% recall against the exact nearest neighbors with rescore enabled. -``` +| Oversampling | | 1 | 1 | 2 | 2 | 3 | 3 | +|--------------|---------|----------|----------|----------|----------|----------|----------| +| | **Rescore** | False | True | False | True | False | True | +| **Limit** | | | | | | | | +| 10 | | 0.523333 | 0.831111 | 0.523333 | 0.915556 | 0.523333 | 0.950000 | +| 20 | | 0.510000 | 0.836667 | 0.510000 | 0.912222 | 0.510000 | 0.937778 | +| 50 | | 0.489111 | 0.841556 | 0.489111 | 0.913333 | 0.488444 | 0.947111 | +| 100 | | 0.485778 | 0.846556 | 0.485556 | 0.929000 | 0.486000 | **0.956333** | -The task was solved successfully, even without any optimization. However, each of the events has the “Event Name: " -prefix that we might want to remove. DSPy allows optimizing the module, so we can improve the results. Optimization -might be done in different ways, and it’s [well covered in the DSPy\\ -documentation](https://dspy.ai/learn/optimization/optimizers/). +That's it! You can now use Gemini Embedding Models with Qdrant! -We are not going to go through the optimization process in this tutorial. However, we encourage you to experiment with -it, as it might significantly improve the performance of your pipeline. +<|page-227-lllmstxt|> +# Haystack -Created module might be easily stored on a specific path, and loaded later on: +[Haystack](https://haystack.deepset.ai/) serves as a comprehensive NLP framework, offering a modular methodology for constructing +cutting-edge generative AI, QA, and semantic knowledge base search systems. A critical element in contemporary NLP systems is an +efficient database for storing and retrieving extensive text data. Vector databases excel in this role, as they house vector +representations of text and implement effective methods for swift retrieval. Thus, we are happy to announce the integration +with Haystack - `QdrantDocumentStore`. This document store is unique, as it is maintained externally by the Qdrant team. -```python -extractor.save("event_extractor") +The new document store comes as a separate package and can be updated independently of Haystack: +```bash +pip install qdrant-haystack ``` -To load, just create an instance of the module and call the `load` method: +`QdrantDocumentStore` supports [all the configuration properties](/documentation/collections/#create-collection) available in +the Qdrant Python client. 
If you want to customize the default configuration of the collection used under the hood, you can +provide that settings when you create an instance of the `QdrantDocumentStore`. For example, if you'd like to enable the +Scalar Quantization, you'd make that in the following way: ```python -second_extractor = EventExtractor() -second_extractor.load("event_extractor") +from qdrant_haystack.document_stores import QdrantDocumentStore +from qdrant_client import models +document_store = QdrantDocumentStore( + ":memory:", + index="Document", + embedding_dim=512, + recreate_index=True, + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) ``` -This is especially useful when you optimize the module, as the optimized version might be stored and loaded later on -without redoing the optimization process each time you run the application. - -### [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#deploying-the-extraction-pipeline) Deploying the extraction pipeline +## Further Reading -Vultr gives us a lot of flexibility in terms of deploying the applications. Perfectly, we would use the Kubernetes -cluster we set up earlier to run it. The deployment is as simple as running any other Python application. This time we -don’t need a GPU, as Ollama is already running on a separate machine, and DSPy just interacts with it. +- [Haystack Documentation](https://haystack.deepset.ai/integrations/qdrant-document-store) +- [Source Code](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/qdrant) -## [Anchor](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/\#wrapping-up) Wrapping up +<|page-228-lllmstxt|> +# HoneyHive -In this tutorial, we showed you how to set up a private environment for information extraction using DSPy, Ollama, and -Qdrant. All the components might be securely hosted on the Vultr cloud, giving you full control over your data. +[HoneyHive](https://www.honeyhive.ai/) is an AI evaluation and observability platform for Generative AI applications. HoneyHive’s platform gives developers enterprise-grade tools to debug complex retrieval pipelines, evaluate performance over large test suites, monitor usage in real-time, and manage prompts within a shared workspace. Teams use HoneyHive to iterate faster, detect failures at scale, and deliver exceptional AI products. -##### Was this page useful? +By integrating Qdrant with HoneyHive, you can: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +- Trace vector database operations +- Monitor latency, embedding quality, and context relevance +- Evaluate retrieval performance in your RAG pipelines +- Optimize paramaters such as `chunk_size` or `chunk_overlap` -Thank you for your feedback! 🙏 +## Prerequisites -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-vultr-dspy-ollama.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+- A HoneyHive account and API key +- Python 3.8+ -On this page: +## Installation -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-chatbot-vultr-dspy-ollama.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Install the required packages: -× +```bash +pip install qdrant-client openai honeyhive +``` -[Powered by](https://qdrant.tech/) +## Basic Integration Example -<|page-128-lllmstxt|> -## benchmarks-intro -# How vector search should be benchmarked? +The following example demonstrates a complete RAG pipeline with HoneyHive tracing for Qdrant operations. We'll break down each component step by step. -January 01, 0001 +### Initialize Clients and Setup -# [Anchor](https://qdrant.tech/benchmarks/benchmarks-intro/\#benchmarking-vector-databases) Benchmarking Vector Databases +First, set up the necessary clients and configuration for HoneyHive, OpenAI, and Qdrant: -At Qdrant, performance is the top-most priority. We always make sure that we use system resources efficiently so you get the **fastest and most accurate results at the cheapest cloud costs**. So all of our decisions from [choosing Rust](https://qdrant.tech/articles/why-rust/), [io optimisations](https://qdrant.tech/articles/io_uring/), [serverless support](https://qdrant.tech/articles/serverless/), [binary quantization](https://qdrant.tech/articles/binary-quantization/), to our [fastembed library](https://qdrant.tech/articles/fastembed/) are all based on our principle. In this article, we will compare how Qdrant performs against the other vector search engines. +```python +from qdrant_client import QdrantClient +from qdrant_client.http.models import PointStruct, VectorParams, Distance +import openai +import os +from honeyhive.tracer import HoneyHiveTracer +from honeyhive.tracer.custom import trace +from openai import OpenAI -Here are the principles we followed while designing these benchmarks: +# Set API Keys +openai.api_key = os.getenv("OPENAI_API_KEY") +honeyhive_api_key = os.getenv("HONEYHIVE_API_KEY") -- We do comparative benchmarks, which means we focus on **relative numbers** rather than absolute numbers. -- We use affordable hardware, so that you can reproduce the results easily. -- We run benchmarks on the same exact machines to avoid any possible hardware bias. -- All the benchmarks are [open-sourced](https://github.com/qdrant/vector-db-benchmark), so you can contribute and improve them. +# Initialize HoneyHive Tracer +HoneyHiveTracer.init( + api_key=honeyhive_api_key, + project="qdrant-rag-example", + session_name="qdrant-integration-demo" +) -Scenarios we tested +# Initialize OpenAI client +openai_client = OpenAI(api_key=openai.api_key) +``` -1. Upload & Search benchmark on single node [Benchmark](https://qdrant.tech/benchmarks/single-node-speed-benchmark/) -2. Filtered search benchmark - [Benchmark](https://qdrant.tech/benchmarks/#filtered-search-benchmark) -3. Memory consumption benchmark - Coming soon -4. Cluster mode benchmark - Coming soon +### Connect to Qdrant -Some of our experiment design decisions are described in the [F.A.Q Section](https://qdrant.tech/benchmarks/#benchmarks-faq). -Reach out to us on our [Discord channel](https://qdrant.to/discord) if you want to discuss anything related Qdrant or these benchmarks. 
+You can connect to Qdrant in two ways: self-hosted (local) or cloud-hosted (Qdrant Cloud): -Share this article +#### Option 1: Self-Hosted Qdrant (Local) -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fbenchmarks-intro%2F&text=How%20vector%20search%20should%20be%20benchmarked? "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fbenchmarks-intro%2F "LinkedIn") +To run Qdrant locally, you need to have Docker installed and run the following command: -Up! +```bash +docker pull qdrant/qdrant +docker run -p 6333:6333 -p 6334:6334 -v "$(pwd)/qdrant_storage:/qdrant/storage" qdrant/qdrant +``` -<|page-129-lllmstxt|> -## rag-and-genai -- [Articles](https://qdrant.tech/articles/) -- RAG & GenAI - -#### RAG & GenAI - -Leverage Qdrant for Retrieval-Augmented Generation (RAG) and build AI Agents - -[![Preview](https://qdrant.tech/articles_data/agentic-rag/preview/preview.jpg)\\ -**What is Agentic RAG? Building Agents with Qdrant** \\ -Agents are a new paradigm in AI, and they are changing how we build RAG systems. Learn how to build agents with Qdrant and which framework to choose.\\ -\\ -Kacper Ɓukawski\\ -\\ -November 22, 2024](https://qdrant.tech/articles/agentic-rag/)[![Preview](https://qdrant.tech/articles_data/rapid-rag-optimization-with-qdrant-and-quotient/preview/preview.jpg)\\ -**Optimizing RAG Through an Evaluation-Based Methodology** \\ -Learn how Qdrant-powered RAG applications can be tested and iteratively improved using LLM evaluation tools like Quotient.\\ -\\ -Atita Arora\\ -\\ -June 12, 2024](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/)[![Preview](https://qdrant.tech/articles_data/semantic-cache-ai-data-retrieval/preview/preview.jpg)\\ -**Semantic Cache: Accelerating AI with Lightning-Fast Data Retrieval** \\ -Semantic cache is reshaping AI applications by enabling rapid data retrieval. Discover how its implementation benefits your RAG setup.\\ -\\ -Daniel Romero, David Myriel\\ -\\ -May 07, 2024](https://qdrant.tech/articles/semantic-cache-ai-data-retrieval/)[![Preview](https://qdrant.tech/articles_data/what-is-rag-in-ai/preview/preview.jpg)\\ -**What is RAG: Understanding Retrieval-Augmented Generation** \\ -Explore how RAG enables LLMs to retrieve and utilize relevant external data when generating responses, rather than being limited to their original training data alone.\\ -\\ -Sabrina Aquino\\ -\\ -March 19, 2024](https://qdrant.tech/articles/what-is-rag-in-ai/)[![Preview](https://qdrant.tech/articles_data/rag-is-dead/preview/preview.jpg)\\ -**Is RAG Dead? The Role of Vector Databases in Vector Search \| Qdrant** \\ -Uncover the necessity of vector databases for RAG and learn how Qdrant's vector database empowers enterprise AI with unmatched accuracy and cost-effectiveness.\\ -\\ -David Myriel\\ -\\ -February 27, 2024](https://qdrant.tech/articles/rag-is-dead/) - -× - -[Powered by](https://qdrant.tech/) +Then connect to the local Qdrant instance: -<|page-130-lllmstxt|> -## security -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Security +```python +# Connect to local Qdrant +client = QdrantClient(url="http://localhost:6333") +print("Connected to local Qdrant instance") +``` -# [Anchor](https://qdrant.tech/documentation/guides/security/\#security) Security +#### Option 2: Qdrant Cloud -Please read this page carefully. 
Although there are various ways to secure your Qdrant instances, **they are unsecured by default**. -You need to enable security measures before production use. Otherwise, they are completely open to anyone +For Qdrant Cloud, you'll need your cluster host and API key: -## [Anchor](https://qdrant.tech/documentation/guides/security/\#authentication) Authentication +```python +# Qdrant Cloud configuration +QDRANT_HOST = os.getenv("QDRANT_HOST") # e.g., "your-cluster-id.eu-central.aws.cloud.qdrant.io" +QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") -_Available as of v1.2.0_ +# Connect to Qdrant Cloud +client = QdrantClient(url=QDRANT_HOST, api_key=QDRANT_API_KEY) +print("Connected to Qdrant Cloud") +``` -Qdrant supports a simple form of client authentication using a static API key. -This can be used to secure your instance. +### Create a Collection -To enable API key based authentication in your own Qdrant instance you must -specify a key in the configuration: +Create a collection to store document embeddings: -```yaml -service: - # Set an api-key. - # If set, all requests must include a header with the api-key. - # example header: `api-key: ` - # - # If you enable this you should also enable TLS. - # (Either above or via an external service like nginx.) - # Sending an api-key over an unencrypted channel is insecure. - api_key: your_secret_api_key_here +```python +collection_name = "documents" +vector_size = 1536 # For text-embedding-3-small +vector_distance = Distance.COSINE +# Create collection if it doesn't exist +if not client.collection_exists(collection_name): + client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=vector_size, distance=vector_distance) + ) ``` -Or alternatively, you can use the environment variable: +### Define Embedding Function with Tracing -```bash -docker run -p 6333:6333 \ - -e QDRANT__SERVICE__API_KEY=your_secret_api_key_here \ - qdrant/qdrant +Create a function to generate embeddings with HoneyHive tracing: +```python +@trace() +def embed_text(text: str) -> list: + """Generate embeddings for a text using OpenAI's API.""" + response = openai_client.embeddings.create( + model="text-embedding-3-small", + input=text + ) + return response.data[0].embedding ``` -For using API key based authentication in Qdrant Cloud see the cloud -[Authentication](https://qdrant.tech/documentation/cloud/authentication/) -section. - -The API key then needs to be present in all REST or gRPC requests to your instance. -All official Qdrant clients for Python, Go, Rust, .NET and Java support the API key parameter. 
+### Insert Documents with Tracing -bashpythontypescriptrustjavacsharpgo +Create a function to insert documents into Qdrant with tracing: -```bash -curl \ - -X GET https://localhost:6333 \ - --header 'api-key: your_secret_api_key_here' +```python +@trace() +def insert_documents(docs): + """Insert documents into Qdrant collection.""" + points = [] + for idx, doc in enumerate(docs): + vector = embed_text(doc) + points.append(PointStruct( + id=idx + 1, + vector=vector, + payload={"text": doc} + )) + + client.upsert( + collection_name=collection_name, + points=points + ) + return len(points) + +# Sample documents +documents = [ + "Qdrant is a vector database optimized for storing and searching high-dimensional vectors.", + "HoneyHive provides observability for AI applications, including RAG pipelines.", + "Retrieval-Augmented Generation (RAG) combines retrieval systems with generative models.", + "Vector databases like Qdrant are essential for efficient similarity search in RAG systems.", + "OpenAI's embedding models convert text into high-dimensional vectors for semantic search." +] +# Insert documents +num_inserted = insert_documents(documents) ``` -```python -from qdrant_client import QdrantClient +### Retrieve Documents with Tracing -client = QdrantClient( - url="https://localhost:6333", - api_key="your_secret_api_key_here", -) +Create a function to retrieve relevant documents from Qdrant with tracing: +```python +@trace() +def get_relevant_docs(query: str, top_k: int = 3) -> list: + """Retrieve relevant documents for a query.""" + # Embed the query + q_vector = embed_text(query) + + # Search in Qdrant + search_response = client.query_points( + collection_name=collection_name, + query=q_vector, + limit=top_k, + with_payload=True + ) + + # Extract results + docs = [] + for point in search_response.points: + docs.append({ + "id": point.id, + "text": point.payload.get("text"), + "score": point.score + }) + + return docs ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### Generate Response with Tracing -const client = new QdrantClient({ - url: "http://localhost", - port: 6333, - apiKey: "your_secret_api_key_here", -}); +Create a function to generate a response using OpenAI with tracing: -``` +```python +@trace() +def answer_query(query: str, relevant_docs: list) -> str: + """Generate an answer for a query using retrieved documents.""" + if not relevant_docs: + return "Could not retrieve relevant documents to answer the query." -```rust -use qdrant_client::Qdrant; + # Format context from retrieved documents + context_parts = [] + for i, doc in enumerate(relevant_docs): + context_parts.append(f"Document {i+1} (ID: {doc['id']}, Score: {doc['score']:.4f}):\n{doc['text']}") + context = "\n\n".join(context_parts) -let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") - .api_key("") - .build()?; + # Create prompt + prompt = f"""Answer the question based ONLY on the following context: -``` +Context: +{context} -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +Question: {query} -QdrantClient client = - new QdrantClient( - QdrantGrpcClient.newBuilder( - "xyz-example.eu-central.aws.cloud.qdrant.io", - 6334, - true) - .withApiKey("") - .build()); +Answer:""" + + # Generate answer + completion = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a helpful assistant that answers questions based strictly on the provided context. 
If the answer is not in the context, say so clearly."}, + {"role": "user", "content": prompt} + ], + temperature=0.2 + ) + return completion.choices[0].message.content.strip() ``` -```csharp -using Qdrant.Client; +### Complete RAG Pipeline -var client = new QdrantClient( - host: "xyz-example.eu-central.aws.cloud.qdrant.io", - https: true, - apiKey: "" -); +Create a function to run the complete RAG pipeline with tracing: +```python +@trace() +def rag_pipeline(query: str) -> dict: + """End-to-end RAG pipeline.""" + # Get relevant documents + relevant_docs = get_relevant_docs(query) + + # Generate answer + answer = answer_query(query, relevant_docs) + + return { + "query": query, + "answer": answer, + "retrieved_documents": relevant_docs + } ``` -```go -import "github.com/qdrant/go-client/qdrant" - -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "xyz-example.eu-central.aws.cloud.qdrant.io", - Port: 6334, - APIKey: "", - UseTLS: true, -}) - +### Batch Processing + +For larger document sets, you can use batch processing to improve performance: + +```python +@trace() +def batch_insert_documents(documents_to_insert, batch_size=10, start_id_offset=0): + """Insert documents in batches.""" + total_inserted = 0 + + for i in range(0, len(documents_to_insert), batch_size): + batch_docs = documents_to_insert[i:i+batch_size] + points = [] + + for local_idx, doc in enumerate(batch_docs): + relative_idx = i + local_idx + vector = embed_text(doc) + point_id = relative_idx + start_id_offset + 1 + points.append(PointStruct( + id=point_id, + vector=vector, + payload={"text": doc} + )) + + if points: + client.upsert( + collection_name=collection_name, + points=points + ) + total_inserted += len(points) + + return total_inserted ``` -### [Anchor](https://qdrant.tech/documentation/guides/security/\#read-only-api-key) Read-only API key +### Test the RAG Pipeline -_Available as of v1.7.0_ - -In addition to the regular API key, Qdrant also supports a read-only API key. -This key can be used to access read-only operations on the instance. +Here's how to test the complete RAG pipeline: -```yaml -service: - read_only_api_key: your_secret_read_only_api_key_here +```python +# Test query +test_query = "What is Qdrant used for?" +result = rag_pipeline(test_query) +print(f"Query: {result['query']}") +print(f"Answer: {result['answer']}") +print("\nRetrieved Documents:") +for i, doc in enumerate(result['retrieved_documents']): + print(f"Document {i+1} (ID: {doc['id']}, Score: {doc['score']:.4f}): {doc['text']}") ``` -Or with the environment variable: - -```bash -export QDRANT__SERVICE__READ_ONLY_API_KEY=your_secret_read_only_api_key_here +## Viewing Traces in HoneyHive -``` +After running your RAG pipeline with Qdrant, you can view the traces in the HoneyHive UI: -Both API keys can be used simultaneously. +1. Navigate to your project in the HoneyHive dashboard +2. Click on the "Traces" tab to see all the traces from your RAG pipeline +3. Click on a specific trace to see detailed information about each step in the pipeline +4. Analyze the performance of your vector operations, embeddings, and retrieval processes -### [Anchor](https://qdrant.tech/documentation/guides/security/\#granular-access-control-with-jwt) Granular access control with JWT +With HoneyHive, you can easily monitor and optimize your Qdrant-powered RAG pipeline, ensuring that it delivers the best possible results for your users. 
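Before moving on, here is a short usage sketch for the `batch_insert_documents` helper defined earlier. The extra documents are illustrative; passing `start_id_offset=len(documents)` keeps point IDs contiguous with the five points inserted at the start of the tutorial:

```python
# Hypothetical follow-up: extend the collection in batches, continuing IDs
# after the initial five points.
more_documents = [
    "Payload filters in Qdrant combine vector similarity with structured conditions.",
    "HNSW is the graph-based index Qdrant uses for approximate nearest-neighbor search.",
]
added = batch_insert_documents(more_documents, batch_size=10, start_id_offset=len(documents))
print(f"Inserted {added} additional documents")
```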
-_Available as of v1.9.0_ +## Further Reading -For more complex cases, Qdrant supports granular access control with [JSON Web Tokens (JWT)](https://jwt.io/). -This allows you to create tokens which restrict access to data stored in your cluster, and build [Role-based access control (RBAC)](https://en.wikipedia.org/wiki/Role-based_access_control) on top of that. -In this way, you can define permissions for users and restrict access to sensitive endpoints. +- [HoneyHive Documentation](https://docs.honeyhive.ai/introduction/what-is-hhai) -To enable JWT-based authentication in your own Qdrant instance you need to specify the `api-key` and enable the `jwt_rbac` feature in the configuration: +<|page-229-lllmstxt|> +![Fluvio Logo](/documentation/data-management/fluvio/fluvio-logo.png) -```yaml -service: - api_key: you_secret_api_key_here - jwt_rbac: true +[InfinyOn Fluvio](https://www.fluvio.io/) is an open-source platform written in Rust for high speed, real-time data processing. It is cloud native, designed to work with any infrastructure type, from bare metal hardware to containerized platforms. -``` +## Usage with Qdrant -Or with the environment variables: +With the [Qdrant Fluvio Connector](https://github.com/qdrant/qdrant-fluvio), you can stream records from Fluvio topics to Qdrant collections, leveraging Fluvio's delivery guarantees and high-throughput. -```bash -export QDRANT__SERVICE__API_KEY=your_secret_api_key_here -export QDRANT__SERVICE__JWT_RBAC=true +### Pre-requisites -``` +- A Fluvio installation. You can refer to the [Fluvio Quickstart](https://www.fluvio.io/docs/fluvio/quickstart/) for instructions. +- Qdrant server to connect to. You can set up a [local instance](/documentation/quickstart/) or a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). -The `api_key` you set in the configuration will be used to encode and decode the JWTs, so –needless to say– keep it secure. If your `api_key` changes, all existing tokens will be invalid. +### Downloading the connector -To use JWT-based authentication, you need to provide it as a bearer token in the `Authorization` header, or as an key in the `Api-Key` header of your requests. +Run the following commands after [setting up Fluvio](https://www.fluvio.io/docs/fluvio/quickstart). -httppythontypescriptrustjavacsharpgo +```console +cdk hub download qdrant/qdrant-sink@0.1.0 +``` -```http -Authorization: Bearer +### Example Config -// or +> _config.yaml_ -Api-Key: +```yaml +apiVersion: 0.1.0 +meta: + version: 0.1.0 + name: my-qdrant-connector + type: qdrant-sink + topic: topic-name + secrets: + - name: QDRANT_API_KEY +qdrant: + url: https://xyz-example.eu-central.aws.cloud.qdrant.io:6334 + api_key: "${{ secrets.QDRANT_API_KEY }}" ``` -```python -from qdrant_client import QdrantClient - -qdrant_client = QdrantClient( - "xyz-example.eu-central.aws.cloud.qdrant.io", - api_key="", -) +> _secrets.txt_ +```text +QDRANT_API_KEY= ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ - host: "xyz-example.eu-central.aws.cloud.qdrant.io", - apiKey: "", -}); +### Running +```console +cdk deploy start --ipkg qdrant-qdrant-sink-0.1.0.ipkg -c config.yaml --secrets secrets.txt ``` -```rust -use qdrant_client::Qdrant; +### Produce Messages -let client = Qdrant::from_url("https://xyz-example.eu-central.aws.cloud.qdrant.io:6334") - .api_key("") - .build()?; +You can now run the following to generate messages to be written into Qdrant. 
+```console +fluvio produce topic-name ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; +### Message Formats -QdrantClient client = - new QdrantClient( - QdrantGrpcClient.newBuilder( - "xyz-example.eu-central.aws.cloud.qdrant.io", - 6334, - true) - .withApiKey("") - .build()); +This sink connector supports messages with dense/sparse/multi vectors. -``` +_Click each to expand._ -```csharp -using Qdrant.Client; +
+ Unnamed/Default vector -var client = new QdrantClient( - host: "xyz-example.eu-central.aws.cloud.qdrant.io", - https: true, - apiKey: "" -); +Reference: [Creating a collection with a default vector](https://qdrant.tech/documentation/concepts/collections/#create-a-collection). +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vectors": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ], + "payload": { + "name": "fluvio", + "description": "Solution for distributed stream processing", + "url": "https://www.fluvio.io/" + } +} ``` -```go -import "github.com/qdrant/go-client/qdrant" +
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "xyz-example.eu-central.aws.cloud.qdrant.io", - Port: 6334, - APIKey: "", - UseTLS: true, -}) +
+ Named multiple vectors + +Reference: [Creating a collection with multiple vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors). +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vectors": { + "some-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ], + "some-other-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ] + }, + "payload": { + "name": "fluvio", + "description": "Solution for distributed stream processing", + "url": "https://www.fluvio.io/" + } +} ``` -#### [Anchor](https://qdrant.tech/documentation/guides/security/\#generating-json-web-tokens) Generating JSON Web Tokens +
-Due to the nature of JWT, anyone who knows the `api_key` can generate tokens by using any of the existing libraries and tools, it is not necessary for them to have access to the Qdrant instance to generate them. +
+ Sparse vectors -For convenience, we have added a JWT generation tool the Qdrant Web UI under the 🔑 tab, if you’re using the default url, it will be at `http://localhost:6333/dashboard#/jwt`. +Reference: [Creating a collection with sparse vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-sparse-vectors). -- **JWT Header** \- Qdrant uses the `HS256` algorithm to decode the tokens. +```json +{ + "collection_name": "{collection_name}", + "id": 1, + "vectors": { + "some-sparse": { + "indices": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + "values": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ] + } + }, + "payload": { + "name": "fluvio", + "description": "Solution for distributed stream processing", + "url": "https://www.fluvio.io/" + } +} +``` +
+
+ Multi-vector ```json { - "alg": "HS256", - "typ": "JWT" + "collection_name": "{collection_name}", + "id": 1, + "vectors": { + "some-multi": [ + [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ], + [ + 1.0, + 0.9, + 0.8, + 0.5, + 0.4, + 0.8, + 0.6, + 0.4, + 0.2, + 0.1 + ] + ] + }, + "payload": { + "name": "fluvio", + "description": "Solution for distributed stream processing", + "url": "https://www.fluvio.io/" + } } - ``` -- **JWT Payload** \- You can include any combination of the [parameters available](https://qdrant.tech/documentation/guides/security/#jwt-configuration) in the payload. Keep reading for more info on each one. +
+ +
+ Combination of named dense and sparse vectors +Reference: +- [Creating a collection with multiple vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors). + +- [Creating a collection with sparse vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-sparse-vectors). ```json { - "exp": 1640995200, // Expiration time - "value_exists": ..., // Validate this token by looking for a point with a payload value - "access": "r", // Define the access level. + "collection_name": "{collection_name}", + "id": "a10435b5-2a58-427a-a3a0-a5d845b147b7", + "vectors": { + "some-other-dense": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8 + ], + "some-sparse": { + "indices": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + "values": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 1.0 + ] + } + }, + "payload": { + "name": "fluvio", + "description": "Solution for distributed stream processing", + "url": "https://www.fluvio.io/" + } } - ``` +
-**Signing the token** \- To confirm that the generated token is valid, it needs to be signed with the `api_key` you have set in the configuration. -That would mean, that someone who knows the `api_key` gives the authorization for the new token to be used in the Qdrant instance. -Qdrant can validate the signature, because it knows the `api_key` and can decode the token. +### Further Reading -The process of token generation can be done on the client side offline, and doesn’t require any communication with the Qdrant instance. +- [Fluvio Quickstart](https://www.fluvio.io/docs/fluvio/quickstart) +- [Fluvio Tutorials](https://www.fluvio.io/docs/fluvio/tutorials/) +- [Connector Source](https://github.com/qdrant/qdrant-fluvio) -Here is an example of libraries that can be used to generate JWT tokens: +<|page-230-lllmstxt|> +# Jina Embeddings -- Python: [PyJWT](https://pyjwt.readthedocs.io/en/stable/) -- JavaScript: [jsonwebtoken](https://www.npmjs.com/package/jsonwebtoken) -- Rust: [jsonwebtoken](https://crates.io/crates/jsonwebtoken) +Qdrant is compatible with [Jina AI](https://jina.ai/) embeddings. You can get a free trial key from [Jina Embeddings](https://jina.ai/embeddings/) to get embeddings. -#### [Anchor](https://qdrant.tech/documentation/guides/security/\#jwt-configuration) JWT Configuration +Qdrant users can receive a 10% discount on Jina AI APIs by using the code **QDRANT**. -These are the available options, or **claims** in the JWT lingo. You can use them in the JWT payload to define its functionality. +## Technical Summary -- **`exp`** \- The expiration time of the token. This is a Unix timestamp in seconds. The token will be invalid after this time. The check for this claim includes a 30-second leeway to account for clock skew. +| Model | Dimension | Language | MRL (matryoshka) | Context | +|:----------------------:|:---------:|:---------:|:-----------:|:---------:| +| **jina-embeddings-v4** | **2048 (single-vector), 128 (multi-vector)** | **Multilingual (30+)** | **Yes** | **32768 + Text/Image** | +| jina-clip-v2 | 1024 | Multilingual (100+, focus on 30) | Yes | Text/Image | +| jina-embeddings-v3 | 1024 | Multilingual (89 languages) | Yes | 8192 | +| jina-embeddings-v2-base-en | 768 | English | No | 8192 | +| jina-embeddings-v2-base-de | 768 | German & English | No | 8192 | +| jina-embeddings-v2-base-es | 768 | Spanish & English | No | 8192 | +| jina-embeddings-v2-base-zh | 768 | Chinese & English | No | 8192 | +> Jina recommends using `jina-embeddings-v4` for all tasks. -```json -{ - "exp": 1640995200, // Expiration time -} +On top of the backbone, `jina-embeddings-v4` has been trained with 5 task-specific adapters for different embedding uses. Include `task` in your request to optimize your downstream application: -``` ++ **retrieval.query**: Used to encode user queries or questions in retrieval tasks. ++ **retrieval.passage**: Used to encode large documents in retrieval tasks at indexing time. ++ **code.query**: Used to encode user queries or questions in code related retrieval tasks. ++ **code.passage**: Used to encode large documents in code related retrieval tasks at indexing time. ++ **text-matching**: Used to encode text for similarity matching, such as measuring similarity between two sentences. -- **`value_exists`** \- This is a claim that can be used to validate the token against the data stored in a collection. Structure of this claim is as follows: +Similarly, `jina-embeddings-v3` has been trained with 5 task-specific adapters for different embedding uses. 
Include `task` in your request to optimize your downstream application: ++ **retrieval.query**: Used to encode user queries or questions in retrieval tasks. ++ **retrieval.passage**: Used to encode large documents in retrieval tasks at indexing time. ++ **classification**: Used to encode text for text classification tasks. ++ **text-matching**: Used to encode text for similarity matching, such as measuring similarity between two sentences. ++ **separation**: Used for clustering or reranking tasks. +`jina-embeddings-v4`, `jina-embeddings-v3` and `jina-clip-v2` support **Matryoshka Representation Learning**, allowing users to control the embedding dimension with minimal performance loss. +Include `dimensions` in your request to select the desired dimension. +By default, **dimensions** is set to 2048 (`jina-embeddings-v4`) or 1024 (`jina-embeddings-v3` and `jina-clip-v2`), and a number between 256 and 2048 is recommended. +You can reference the table below for hints on dimension vs. performance for the `jina-embeddings-v3` model. Similar results hold for the others. -```json -{ - "value_exists": { - "collection": "my_validation_collection", - "matches": [\ - { "key": "my_key", "value": "value_that_must_exist" }\ - ], - }, -} +| Dimension | 32 | 64 | 128 | 256 | 512 | 768 | 1024 | +|:----------------------:|:---------:|:---------:|:-----------:|:---------:|:----------:|:---------:|:---------:| +| Average Retrieval Performance (nDCG@10) | 52.54 | 58.54 | 61.64 | 62.72 | 63.16 | 63.3 | 63.35 | -``` +`jina-embeddings-v4` and `jina-embeddings-v3` supports [Late Chunking](https://jina.ai/news/late-chunking-in-long-context-embedding-models/), the technique to leverage the model's long-context capabilities for generating contextual chunk embeddings. Include `late_chunking=True` in your request to enable contextual chunked representation. When set to true, Jina AI API will concatenate all sentences in the input field and feed them as a single string to the model. Internally, the model embeds this long concatenated string and then performs late chunking, returning a list of embeddings that matches the size of the input list. +## Example +### Text-to-Text Retrieval -If this claim is present, Qdrant will check if there is a point in the collection with the specified key-values. If it does, the token is valid. +The code below demonstrates how to use `jina-embeddings-v4` with Qdrant: + +```python +import requests -This claim is especially useful if you want to have an ability to revoke tokens without changing the `api_key`. -Consider a case where you have a collection of users, and you want to revoke access to a specific user. +import qdrant_client +from qdrant_client.models import Distance, VectorParams, Batch +# Provide Jina API key and choose one of the available models. +JINA_API_KEY = "jina_xxxxxxxxxxx" +MODEL = "jina-embeddings-v4" +DIMENSIONS = 2048 # Or choose your desired output vector dimensionality. 
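# Note on Matryoshka Representation Learning: per the dimension/performance
# table above, 512 or 256 dimensions give up only a fraction of an nDCG@10
# point versus the full size, so a smaller value is a reasonable choice when
# storage or memory is the main constraint.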
+TASK = 'retrieval.passage' # For indexing, or set to retrieval.query for querying +# Get embeddings from the API +url = "https://api.jina.ai/v1/embeddings" -```json -{ - "value_exists": { - "collection": "users", - "matches": [\ - { "key": "user_id", "value": "andrey" },\ - { "key": "role", "value": "manager" }\ - ], - }, +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {JINA_API_KEY}", } -``` +data = { + "input": ["Your text string goes here", "You can send multiple texts"], + "model": MODEL, + "dimensions": DIMENSIONS, + "task": TASK, + "late_chunking": True, +} + +response = requests.post(url, headers=headers, json=data) +embeddings = [d["embedding"] for d in response.json()["data"]] +# Index the embeddings into Qdrant +client = qdrant_client.QdrantClient(":memory:") +client.create_collection( + collection_name="MyCollection", + vectors_config=VectorParams(size= DIMENSIONS, distance=Distance.DOT), +) -You can create a token with this claim, and when you want to revoke access, you can change the `role` of the user to something else, and the token will be invalid. -- **`access`** \- This claim defines the [access level](https://qdrant.tech/documentation/guides/security/#table-of-access) of the token. If this claim is present, Qdrant will check if the token has the required access level to perform the operation. If this claim is **not** present, **manage** access is assumed. +qdrant_client.upsert( + collection_name="MyCollection", + points=Batch( + ids=list(range(len(embeddings))), + vectors=embeddings, + ), +) -It can provide global access with `r` for read-only, or `m` for manage. For example: +``` +### Text-to-Image Retrieval +The code below demonstrates how to use `jina-embeddings-v4` with Qdrant: -```json -{ - "access": "r" +```python +import requests +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams, PointStruct + +# Provide your Jina API key and choose the model. +JINA_API_KEY = "jina_xxxxxxxxxxx" +MODEL = "jina-embeddings-v4" +DIMENSIONS = 2048 # Set the desired output vector dimensionality. + +# Define the inputs +text_input = "A blue cat" +image_url = "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg" + +# Get embeddings from the Jina API +url = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {JINA_API_KEY}", +} +data = { + "input": [ + {"text": text_input}, + {"image": image_url}, + ], + "model": MODEL, + "dimensions": DIMENSIONS, } -``` +response = requests.post(url, headers=headers, json=data) +response_data = response.json()["data"] +# The model doesn't differentiate between images and text, so we extract output based on the input order. +text_embedding = response_data[0]["embedding"] +image_embedding = response_data[1]["embedding"] +# Initialize Qdrant client +client = QdrantClient(url="http://localhost:6333/") -It can also be specific to one or more collections. 
The `access` level for each collection is `r` for read-only, or `rw` for read-write, like this: +# Create a collection with named vectors +collection_name = "MyCollection" +client.recreate_collection( + collection_name=collection_name, + vectors_config={ + "text_vector": VectorParams(size=DIMENSIONS, distance=Distance.DOT), + "image_vector": VectorParams(size=DIMENSIONS, distance=Distance.DOT), + }, +) +client.upsert( + collection_name=collection_name, + points=[ + PointStruct( + id=0, + vector={ + "text_vector": text_embedding, + "image_vector": image_embedding, + } + ) + ], +) +# Now let's query the collection +search_query = "A purple cat" -```json -{ - "access": [\ - {\ - "collection": "my_collection",\ - "access": "rw"\ - }\ - ] +# Get the embedding for the search query from the Jina API +url = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {JINA_API_KEY}", } +data = { + "input": [{"text": search_query}], + "model": MODEL, + "dimensions": DIMENSIONS, + # "task": "retrieval.query" # Uncomment this line for text-to-text retrieval tasks +} + +response = requests.post(url, headers=headers, json=data) +query_embedding = response.json()["data"][0]["embedding"] + +search_results = client.query_points( + collection_name=collection_name, + query=query_embedding, + using="image_vector", + limit=5 +).points +for result in search_results: + print(f"ID: {result.id}, Score: {result.score}") ``` +<|page-231-lllmstxt|> +# Keboola +[Keboola](https://www.keboola.com/) is a data operations platform that integrates data engineering, analytics, and machine learning tools into a single environment. It helps businesses unify their data sources, transform data, and deploy ML models to production. -You can also specify which subset of the collection the user is able to access by specifying a `payload` restriction that the points must have. +## Prerequisites +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A [Keboola](https://www.keboola.com/) account to develop your data workflows. +## Setting Up -```json -{ - "access": [\ - {\ - "collection": "my_collection",\ - "access": "r",\ - "payload": {\ - "user_id": "user_123456"\ - }\ - }\ - ] -} +- In your Keboola platform, navigate to the Components section. +- Find and add the Qdrant component from the component marketplace. +- Configure the connection to your Qdrant instance using your URL and API key. -``` +## Using Qdrant in Keboola +With Keboola's Qdrant integration, you can: +- **Data Pipeline Integration**: Extract data from any source in Keboola, transform it, and load vector embeddings into Qdrant for semantic search capabilities. -This `payload` claim will be used to implicitly filter the points in the collection. It will be equivalent to appending this filter to each request: +- **Vector Database Management**: Create, manage, and update collections in Qdrant directly from your Keboola workflows. +- **Orchestration**: Schedule and automate your vector database operations as part of your data pipeline. +- **ML Operations**: Combine your machine learning models with vector search capabilities for advanced AI applications. -```json -{ "filter": { "must": [{ "key": "user_id", "match": { "value": "user_123456" } }] } } +## Example Use Case -``` +A common use case is to build a RAG (Retrieval Augmented Generation) system where: +1. Data is extracted from multiple sources in Keboola +2. 
Text is processed and transformed in Keboola's transformation engine +3. Embeddings are generated and stored in Qdrant +4. Applications query the Qdrant vectors for semantic search capabilities -### [Anchor](https://qdrant.tech/documentation/guides/security/\#table-of-access) Table of access +## Further Reading -Check out this table to see which actions are allowed or denied based on the access level. +- [Keboola Documentation](https://help.keboola.com/) +- [Keboola Academy](https://academy.keboola.com/) +- [Data Operations with Keboola](https://www.keboola.com/blog/data-operations) -This is also applicable to using api keys instead of tokens. In that case, `api_key` maps to **manage**, while `read_only_api_key` maps to **read-only**. +<|page-232-lllmstxt|> +# Kotaemon -**Symbols:** ✅ Allowed \| ❌ Denied \| 🟡 Allowed, but filtered +[Kotaemon](https://github.com/Cinnamon/kotaemon) is open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind. -| Action | manage | read-only | collection read-write | collection read-only | collection with payload claim (r / rw) | -| --- | --- | --- | --- | --- | --- | -| list collections | ✅ | ✅ | 🟡 | 🟡 | 🟡 | -| get collection info | ✅ | ✅ | ✅ | ✅ | ❌ | -| create collection | ✅ | ❌ | ❌ | ❌ | ❌ | -| delete collection | ✅ | ❌ | ❌ | ❌ | ❌ | -| update collection params | ✅ | ❌ | ❌ | ❌ | ❌ | -| get collection cluster info | ✅ | ✅ | ✅ | ✅ | ❌ | -| collection exists | ✅ | ✅ | ✅ | ✅ | ✅ | -| update collection cluster setup | ✅ | ❌ | ❌ | ❌ | ❌ | -| update aliases | ✅ | ❌ | ❌ | ❌ | ❌ | -| list collection aliases | ✅ | ✅ | 🟡 | 🟡 | 🟡 | -| list aliases | ✅ | ✅ | 🟡 | 🟡 | 🟡 | -| create shard key | ✅ | ❌ | ❌ | ❌ | ❌ | -| delete shard key | ✅ | ❌ | ❌ | ❌ | ❌ | -| create payload index | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete payload index | ✅ | ❌ | ✅ | ❌ | ❌ | -| list collection snapshots | ✅ | ✅ | ✅ | ✅ | ❌ | -| create collection snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete collection snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | -| download collection snapshot | ✅ | ✅ | ✅ | ✅ | ❌ | -| upload collection snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| recover collection snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| list shard snapshots | ✅ | ✅ | ✅ | ✅ | ❌ | -| create shard snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete shard snapshot | ✅ | ❌ | ✅ | ❌ | ❌ | -| download shard snapshot | ✅ | ✅ | ✅ | ✅ | ❌ | -| upload shard snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| recover shard snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| list full snapshots | ✅ | ✅ | ❌ | ❌ | ❌ | -| create full snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| delete full snapshot | ✅ | ❌ | ❌ | ❌ | ❌ | -| download full snapshot | ✅ | ✅ | ❌ | ❌ | ❌ | -| get cluster info | ✅ | ✅ | ❌ | ❌ | ❌ | -| recover raft state | ✅ | ❌ | ❌ | ❌ | ❌ | -| delete peer | ✅ | ❌ | ❌ | ❌ | ❌ | -| get point | ✅ | ✅ | ✅ | ✅ | ❌ | -| get points | ✅ | ✅ | ✅ | ✅ | ❌ | -| upsert points | ✅ | ❌ | ✅ | ❌ | ❌ | -| update points batch | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete points | ✅ | ❌ | ✅ | ❌ | ❌ / 🟡 | -| update vectors | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete vectors | ✅ | ❌ | ✅ | ❌ | ❌ / 🟡 | -| set payload | ✅ | ❌ | ✅ | ❌ | ❌ | -| overwrite payload | ✅ | ❌ | ✅ | ❌ | ❌ | -| delete payload | ✅ | ❌ | ✅ | ❌ | ❌ | -| clear payload | ✅ | ❌ | ✅ | ❌ | ❌ | -| scroll points | ✅ | ✅ | ✅ | ✅ | 🟡 | -| query points | ✅ | ✅ | ✅ | ✅ | 🟡 | -| search points | ✅ | ✅ | ✅ | ✅ | 🟡 | -| search groups | ✅ | ✅ | ✅ | ✅ | 🟡 | -| recommend points | ✅ | ✅ | ✅ | ✅ | ❌ | -| recommend groups | ✅ | ✅ | ✅ | ✅ | ❌ | -| discover points | ✅ | ✅ | ✅ | ✅ | ❌ | -| count points | ✅ | ✅ | ✅ | ✅ | 🟡 | -| version | ✅ | ✅ | ✅ | ✅ | ✅ 
| -| readyz, healthz, livez | ✅ | ✅ | ✅ | ✅ | ✅ | -| telemetry | ✅ | ✅ | ❌ | ❌ | ❌ | -| metrics | ✅ | ✅ | ❌ | ❌ | ❌ | -| update locks | ✅ | ❌ | ❌ | ❌ | ❌ | -| get locks | ✅ | ✅ | ❌ | ❌ | ❌ | +Qdrant is supported as a vectorstore in Kotaemon for ingesting and retrieving documents. -## [Anchor](https://qdrant.tech/documentation/guides/security/\#tls) TLS +## Configuration -_Available as of v1.2.0_ +- Refer to [Getting started](https://cinnamon.github.io/kotaemon/) guide to set up Kotaemon. -TLS for encrypted connections can be enabled on your Qdrant instance to secure -connections. +- To configure Kotaemon to use Qdrant as the vector store, update the `flowsettings.py` as follows. -First make sure you have a certificate and private key for TLS, usually in -`.pem` format. On your local machine you may use -[mkcert](https://github.com/FiloSottile/mkcert#readme) to generate a self signed -certificate. +```python +KH_VECTORSTORE = { + "__type__": "kotaemon.storages.QdrantVectorStore", + "url": "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333", + "api_key": "'", + "client_kwargs": {} # Additional options to pass to qdrant_client.QdrantClient +} +``` -To enable TLS, set the following properties in the Qdrant configuration with the -correct paths and restart: +- Restart Kotaemon for the changes to take effect. -```yaml -service: - # Enable HTTPS for the REST and gRPC API - enable_tls: true +The reference for all the Qdrant client options can be found [here](https://python-client.qdrant.tech/qdrant_client.qdrant_client) -# TLS configuration. -# Required if either service.enable_tls or cluster.p2p.enable_tls is true. -tls: - # Server certificate chain file - cert: ./tls/cert.pem +## Further reading - # Server private key file - key: ./tls/key.pem +- [Kotaemon Documentation](https://cinnamon.github.io/kotaemon/) +- [Source](https://github.com/Cinnamon/kotaemon) -``` +<|page-233-lllmstxt|> +# Langchain -For internal communication when running cluster mode, TLS can be enabled with: +Langchain is a library that makes developing Large Language Model-based applications much easier. It unifies the interfaces +to different libraries, including major embedding providers and Qdrant. Using Langchain, you can focus on the business value instead of writing the boilerplate. -```yaml -cluster: - # Configuration of the inter-cluster communication - p2p: - # Use TLS for communication between peers - enable_tls: true +Langchain distributes the Qdrant integration as a partner package. +It might be installed with pip: + +```bash +pip install langchain-qdrant ``` -With TLS enabled, you must start using HTTPS connections. For example: +The integration supports searching for relevant documents usin dense/sparse and hybrid retrieval. -bashpythontypescriptrust +Qdrant acts as a vector index that may store the embeddings with the documents used to generate them. 
There are various ways to use it, but calling `QdrantVectorStore.from_texts` or `QdrantVectorStore.from_documents` is probably the most straightforward way to get started: -```bash -curl -X GET https://localhost:6333 +```python +from langchain_qdrant import QdrantVectorStore +from langchain_openai import OpenAIEmbeddings +embeddings = OpenAIEmbeddings() + +doc_store = QdrantVectorStore.from_texts( + texts, embeddings, url="", api_key="", collection_name="texts" +) ``` -```python -from qdrant_client import QdrantClient +## Using an existing collection -client = QdrantClient( - url="https://localhost:6333", +To get an instance of `langchain_qdrant.QdrantVectorStore` without loading any new documents or texts, you can use the `QdrantVectorStore.from_existing_collection()` method. + +```python +doc_store = QdrantVectorStore.from_existing_collection( + embeddings=embeddings, + collection_name="my_documents", + url="", + api_key="", ) +``` + +## Local mode + +Python client allows you to run the same code in local mode without running the Qdrant server. That's great for testing things +out and debugging or if you plan to store just a small amount of vectors. The embeddings might be fully kept in memory or +persisted on disk. + +### In-memory + +For some testing scenarios and quick experiments, you may prefer to keep all the data in memory only, so it gets lost when the +client is destroyed - usually at the end of your script/notebook. +```python +qdrant = QdrantVectorStore.from_documents( + docs, + embeddings, + location=":memory:", # Local mode with in-memory storage only + collection_name="my_documents", +) ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### On-disk storage -const client = new QdrantClient({ url: "https://localhost", port: 6333 }); +Local mode, without using the Qdrant server, may also store your vectors on disk so they’re persisted between runs. +```python +qdrant = Qdrant.from_documents( + docs, + embeddings, + path="/tmp/local_qdrant", + collection_name="my_documents", +) ``` -```rust -use qdrant_client::Qdrant; +### On-premise server deployment -let client = Qdrant::from_url("http://localhost:6334").build()?; +No matter if you choose to launch QdrantVectorStore locally with [a Docker container](/documentation/guides/installation/), or +select a Kubernetes deployment with [the official Helm chart](https://github.com/qdrant/qdrant-helm), the way you're +going to connect to such an instance will be identical. You'll need to provide a URL pointing to the service. +```python +url = "<---qdrant url here --->" +qdrant = QdrantVectorStore.from_documents( + docs, + embeddings, + url, + prefer_grpc=True, + collection_name="my_documents", +) ``` -Certificate rotation is enabled with a default refresh time of one hour. This -reloads certificate files every hour while Qdrant is running. This way changed -certificates are picked up when they get updated externally. The refresh time -can be tuned by changing the `tls.cert_ttl` setting. You can leave this on, even -if you don’t plan to update your certificates. Currently this is only supported -for the REST API. +## Similarity search -Optionally, you can enable client certificate validation on the server against a -local certificate authority. Set the following properties and restart: +`QdrantVectorStore` supports 3 modes for similarity searches. They can be configured using the `retrieval_mode` parameter when setting up the class. 
-```yaml -service: - # Check user HTTPS client certificate against CA file specified in tls config - verify_https_client_certificate: false +- Dense Vector Search(Default) +- Sparse Vector Search +- Hybrid Search -# TLS configuration. -# Required if either service.enable_tls or cluster.p2p.enable_tls is true. -tls: - # Certificate authority certificate file. - # This certificate will be used to validate the certificates - # presented by other nodes during inter-cluster communication. - # - # If verify_https_client_certificate is true, it will verify - # HTTPS client certificate - # - # Required if cluster.p2p.enable_tls is true. - ca_cert: ./tls/cacert.pem +### Dense Vector Search + +To search with only dense vectors, + +- The `retrieval_mode` parameter should be set to `RetrievalMode.DENSE`(default). +- A [dense embeddings](https://python.langchain.com/v0.2/docs/integrations/text_embedding/) value should be provided for the `embedding` parameter. + +```py +from langchain_qdrant import RetrievalMode + +qdrant = QdrantVectorStore.from_documents( + docs, + embedding=embeddings, + location=":memory:", + collection_name="my_documents", + retrieval_mode=RetrievalMode.DENSE, +) +query = "What did the president say about Ketanji Brown Jackson" +found_docs = qdrant.similarity_search(query) ``` -## [Anchor](https://qdrant.tech/documentation/guides/security/\#hardening) Hardening +### Sparse Vector Search -We recommend reducing the amount of permissions granted to Qdrant containers so that you can reduce the risk of exploitation. Here are some ways to reduce the permissions of a Qdrant container: +To search with only sparse vectors, -- Run Qdrant as a non-root user. This can help mitigate the risk of future container breakout vulnerabilities. Qdrant does not need the privileges of the root user for any purpose. +- The `retrieval_mode` parameter should be set to `RetrievalMode.SPARSE`. +- An implementation of the [SparseEmbeddings interface](https://github.com/langchain-ai/langchain/blob/master/libs/partners/qdrant/langchain_qdrant/sparse_embeddings.py) using any sparse embeddings provider has to be provided as value to the `sparse_embedding` parameter. - - You can use the image `qdrant/qdrant:-unprivileged` instead of the default Qdrant image. - - You can use the flag `--user=1000:2000` when running [`docker run`](https://docs.docker.com/reference/cli/docker/container/run/). - - You can set [`user: 1000`](https://docs.docker.com/compose/compose-file/05-services/#user) when using Docker Compose. - - You can set [`runAsUser: 1000`](https://kubernetes.io/docs/tasks/configure-pod-container/security-context) when running in Kubernetes (our [Helm chart](https://github.com/qdrant/qdrant-helm) does this by default). -- Run Qdrant with a read-only root filesystem. This can help mitigate vulnerabilities that require the ability to modify system files, which is a permission Qdrant does not need. As long as the container uses mounted volumes for storage ( `/qdrant/storage` and `/qdrant/snapshots` by default), Qdrant can continue to operate while being prevented from writing data outside of those volumes. +The `langchain-qdrant` package provides a [FastEmbed](https://github.com/qdrant/fastembed) based implementation out of the box. - - You can use the flag `--read-only` when running [`docker run`](https://docs.docker.com/reference/cli/docker/container/run/). - - You can set [`read_only: true`](https://docs.docker.com/compose/compose-file/05-services/#read_only) when using Docker Compose. 
- - You can set [`readOnlyRootFilesystem: true`](https://kubernetes.io/docs/tasks/configure-pod-container/security-context) when running in Kubernetes (our [Helm chart](https://github.com/qdrant/qdrant-helm) does this by default). -- Block Qdrant’s external network access. This can help mitigate [server side request forgery attacks](https://owasp.org/www-community/attacks/Server_Side_Request_Forgery), like via the [snapshot recovery API](https://api.qdrant.tech/api-reference/snapshots/recover-from-snapshot). Single-node Qdrant clusters do not require any outbound network access. Multi-node Qdrant clusters only need the ability to connect to other Qdrant nodes via TCP ports 6333, 6334, and 6335. +To use it, install the [FastEmbed package](https://github.com/qdrant/fastembed#-installation). - - You can use [`docker network create --internal `](https://docs.docker.com/reference/cli/docker/network/create/#internal) and use that network when running [`docker run --network `](https://docs.docker.com/reference/cli/docker/container/run/#network). - - You can create an [internal network](https://docs.docker.com/compose/compose-file/06-networks/#internal) when using Docker Compose. - - You can create a [NetworkPolicy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) when using Kubernetes. Note that multi-node Qdrant clusters [will also need access to cluster DNS in Kubernetes](https://github.com/ahmetb/kubernetes-network-policy-recipes/blob/master/11-deny-egress-traffic-from-an-application.md#allowing-dns-traffic). +```python +from langchain_qdrant import FastEmbedSparse, RetrievalMode -There are other techniques for reducing the permissions such as dropping [Linux capabilities](https://www.man7.org/linux/man-pages/man7/capabilities.7.html) depending on your deployment method, but the methods mentioned above are the most important. +sparse_embeddings = FastEmbedSparse(model_name="Qdrant/BM25") -##### Was this page useful? +qdrant = QdrantVectorStore.from_documents( + docs, + sparse_embedding=sparse_embeddings, + location=":memory:", + collection_name="my_documents", + retrieval_mode=RetrievalMode.SPARSE, +) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +query = "What did the president say about Ketanji Brown Jackson" +found_docs = qdrant.similarity_search(query) +``` -Thank you for your feedback! 🙏 +### Hybrid Vector Search -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/security.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +To perform a hybrid search using dense and sparse vectors with score fusion, -On this page: +- The `retrieval_mode` parameter should be set to `RetrievalMode.HYBRID`. +- A [dense embeddings](https://python.langchain.com/v0.2/docs/integrations/text_embedding/) value should be provided for the `embedding` parameter. +- An implementation of the [SparseEmbeddings interface](https://github.com/langchain-ai/langchain/blob/master/libs/partners/qdrant/langchain_qdrant/sparse_embeddings.py) using any sparse embeddings provider has to be provided as value to the `sparse_embedding` parameter. 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/security.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```python +from langchain_qdrant import FastEmbedSparse, RetrievalMode -× +sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25") -[Powered by](https://qdrant.tech/) +qdrant = QdrantVectorStore.from_documents( + docs, + embedding=embeddings, + sparse_embedding=sparse_embeddings, + location=":memory:", + collection_name="my_documents", + retrieval_mode=RetrievalMode.HYBRID, +) -<|page-131-lllmstxt|> -## async-api -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Build With Async API +query = "What did the president say about Ketanji Brown Jackson" +found_docs = qdrant.similarity_search(query) +``` -# [Anchor](https://qdrant.tech/documentation/database-tutorials/async-api/\#using-qdrants-async-api-for-efficient-python-applications) Using Qdrant’s Async API for Efficient Python Applications +Note that if you've added documents with HYBRID mode, you can switch to any retrieval mode when searching. Since both the dense and sparse vectors are available in the collection. -Asynchronous programming is being broadly adopted in the Python ecosystem. Tools such as FastAPI [have embraced this new\\ -paradigm](https://fastapi.tiangolo.com/async/), but it is also becoming a standard for ML models served as SaaS. For example, the Cohere SDK -[provides an async client](https://github.com/cohere-ai/cohere-python/blob/856a4c3bd29e7a75fa66154b8ac9fcdf1e0745e0/src/cohere/client.py#L189) next to its synchronous counterpart. +## Next steps -Databases are often launched as separate services and are accessed via a network. All the interactions with them are IO-bound and can -be performed asynchronously so as not to waste time actively waiting for a server response. In Python, this is achieved by -using [`async/await`](https://docs.python.org/3/library/asyncio-task.html) syntax. That lets the interpreter switch to another task -while waiting for a response from the server. +If you'd like to know more about running Qdrant in a Langchain-based application, please read our article +[Question Answering with Langchain and Qdrant without boilerplate](/articles/langchain-integration/). Some more information +might also be found in the [Langchain documentation](https://python.langchain.com/docs/integrations/vectorstores/qdrant). -## [Anchor](https://qdrant.tech/documentation/database-tutorials/async-api/\#when-to-use-async-api) When to use async API +- [Source Code](https://github.com/langchain-ai/langchain/tree/master/libs%2Fpartners%2Fqdrant) -There is no need to use async API if the application you are writing will never support multiple users at once (e.g it is a script that runs once per day). However, if you are writing a web service that multiple users will use simultaneously, you shouldn’t be -blocking the threads of the web server as it limits the number of concurrent requests it can handle. In this case, you should use -the async API. +<|page-234-lllmstxt|> +# LangChain for Java -Modern web frameworks like [FastAPI](https://fastapi.tiangolo.com/) and [Quart](https://quart.palletsprojects.com/en/latest/) support -async API out of the box. Mixing asynchronous code with an existing synchronous codebase might be a challenge. The `async/await` syntax -cannot be used in synchronous functions. 
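The hybrid store also works with the rest of the standard LangChain vector-store API. A small usage sketch, reusing the `qdrant` store and `query` from the example above:

```python
# Scored results: each hit comes back as a (Document, score) pair.
results = qdrant.similarity_search_with_score(query, k=3)
for doc, score in results:
    print(f"{score:.3f}  {doc.page_content[:80]}")

# Or wrap the store as a retriever for use in a chain or an agent.
retriever = qdrant.as_retriever(search_kwargs={"k": 5})
relevant_docs = retriever.invoke(query)
```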
On the other hand, calling an IO-bound operation synchronously in async code is considered -an antipattern. Therefore, if you build an async web service, exposed through an [ASGI](https://asgi.readthedocs.io/en/latest/) server, -you should use the async API for all the interactions with Qdrant. +LangChain for Java, also known as [Langchain4J](https://github.com/langchain4j/langchain4j), is a community port of [Langchain](https://www.langchain.com/) for building context-aware AI applications in Java -### [Anchor](https://qdrant.tech/documentation/database-tutorials/async-api/\#using-qdrant-asynchronously) Using Qdrant asynchronously +You can use Qdrant as a vector store in Langchain4J through the [`langchain4j-qdrant`](https://central.sonatype.com/artifact/dev.langchain4j/langchain4j-qdrant) module. -The simplest way of running asynchronous code is to use define `async` function and use the `asyncio.run` in the following way to run it: +## Setup -```python -from qdrant_client import models +Add the `langchain4j-qdrant` to your project dependencies. -import qdrant_client -import asyncio +```xml + + dev.langchain4j + langchain4j-qdrant + VERSION + +``` -async def main(): - client = qdrant_client.AsyncQdrantClient("localhost") +## Usage - # Create a collection - await client.create_collection( - collection_name="my_collection", - vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE), - ) +Before you use the following code sample, customize the following values for your configuration: - # Insert a vector - await client.upsert( - collection_name="my_collection", - points=[\ - models.PointStruct(\ - id="5c56c793-69f3-4fbf-87e6-c4bf54c28c26",\ - payload={\ - "color": "red",\ - },\ - vector=[0.9, 0.1, 0.1, 0.5],\ - ),\ - ], - ) +- `YOUR_COLLECTION_NAME`: Use our [Collections](/documentation/concepts/collections/) guide to create or + list collections. +- `YOUR_HOST_URL`: Use the GRPC URL for your system. If you used the [Quick Start](/documentation/quick-start/) guide, + it may be http://localhost:6334. If you've deployed in the [Qdrant Cloud](/documentation/cloud/), you may have a + longer URL such as `https://example.location.cloud.qdrant.io:6334`. +- `YOUR_API_KEY`: Substitute the API key associated with your configuration. +```java +import dev.langchain4j.store.embedding.EmbeddingStore; +import dev.langchain4j.store.embedding.qdrant.QdrantEmbeddingStore; - # Search for nearest neighbors - points = await client.query_points( - collection_name="my_collection", - query=[0.9, 0.1, 0.1, 0.5], - limit=2, - ).points +EmbeddingStore embeddingStore = + QdrantEmbeddingStore.builder() + // Ensure the collection is configured with the appropriate dimensions + // of the embedding model. + // Reference https://qdrant.tech/documentation/concepts/collections/ + .collectionName("YOUR_COLLECTION_NAME") + .host("YOUR_HOST_URL") + // GRPC port of the Qdrant server + .port(6334) + .apiKey("YOUR_API_KEY") + .build(); +``` - # Your async code using AsyncQdrantClient might be put here - # ... +`QdrantEmbeddingStore` supports all the semantic features of Langchain4J. -asyncio.run(main()) +## Further Reading + +- You can refer to the [Langchain4J examples](https://github.com/langchain4j/langchain4j-examples/) to get started. 
+- [Source Code](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-qdrant) + +<|page-235-lllmstxt|> +# LangGraph +[LangGraph](https://github.com/langchain-ai/langgraph) is a library for building stateful, multi-actor applications, ideal for creating agentic workflows. It provides fine-grained control over both the flow and state of your application, crucial for creating reliable agents. + +You can define flows that involve cycles, essential for most agentic architectures, differentiating it from DAG-based solutions. Additionally, LangGraph includes built-in persistence, enabling advanced human-in-the-loop and memory features. + +LangGraph works seamlessly with all the components of LangChain. This means we can utilize Qdrant's [Langchain integration](/documentation/frameworks/langchain/) to create retrieval nodes in LangGraph, available in both Python and Javascript! + +## Usage + +- Install the required dependencies + +```python +$ pip install langgraph langchain_community langchain_qdrant fastembed ``` -The `AsyncQdrantClient` provides the same methods as the synchronous counterpart `QdrantClient`. If you already have a synchronous -codebase, switching to async API is as simple as replacing `QdrantClient` with `AsyncQdrantClient` and adding `await` before each -method call. +```typescript +$ npm install @langchain/langgraph langchain @langchain/qdrant @langchain/openai +``` -##### Was this page useful? +- Create a retriever tool to add to the LangGraph workflow. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python -Thank you for your feedback! 🙏 +from langchain.tools.retriever import create_retriever_tool +from langchain_community.embeddings import FastEmbedEmbeddings -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/async-api.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode -On this page: +# We'll set up Qdrant to retrieve documents using Hybrid search. +# Learn more at https://qdrant.tech/articles/hybrid-search/ +retriever = QdrantVectorStore.from_texts( + url="http://localhost:6333/", + collection_name="langgraph-collection", + embedding=FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5"), + sparse_embedding=FastEmbedSparse(model_name="Qdrant/bm25"), + retrieval_mode=RetrievalMode.HYBRID, + texts=["", "", ...] 
+).as_retriever() -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/async-api.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +retriever_tool = create_retriever_tool( + retriever, + "retrieve_my_texts", + "Retrieve texts stored in the Qdrant collection", +) +``` -× +```typescript +import { QdrantVectorStore } from "@langchain/qdrant"; +import { OpenAIEmbeddings } from "@langchain/openai"; +import { createRetrieverTool } from "langchain/tools/retriever"; + +const vectorStore = await QdrantVectorStore.fromTexts( + ["", ""], + [{ id: 2 }, { id: 1 }], + new OpenAIEmbeddings(), + { + url: "http://localhost:6333/", + collectionName: "goldel_escher_bach", + } +); -[Powered by](https://qdrant.tech/) +const retriever = vectorStore.asRetriever(); -<|page-132-lllmstxt|> -## qdrant-dspy-medicalbot -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy +const tool = createRetrieverTool( + retriever, + { + name: "retrieve_my_texts", + description: + "Retrieve texts stored in the Qdrant collection", + }, +); +``` -# [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#building-a-chain-of-thought-medical-chatbot-with-qdrant-and-dspy) Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy +- Add the retriever tool as a node in LangGraph -Accessing medical information from LLMs can lead to hallucinations or outdated information. Relying on this type of information can result in serious medical consequences. Building a trustworthy and context-aware medical chatbot can solve this. +```python +from langgraph.graph import StateGraph +from langgraph.prebuilt import ToolNode -In this article, we will look at how to tackle these challenges using: +workflow = StateGraph() -- **Retrieval-Augmented Generation (RAG)**: Instead of answering the questions from scratch, the bot retrieves the information from medical literature before answering questions. -- **Filtering**: Users can filter the results by specialty and publication year, ensuring the information is accurate and up-to-date. +# Define other the nodes which we'll cycle between. +workflow.add_node("retrieve_qdrant", ToolNode([retriever_tool])) -Let’s discover the technologies needed to build the medical bot. +graph = workflow.compile() +``` -## [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#tech-stack-overview) Tech Stack Overview +```typescript +import { StateGraph } from "@langchain/langgraph"; +import { ToolNode } from "@langchain/langgraph/prebuilt"; -To build a robust and trustworthy medical chatbot, we will combine the following technologies: +// Define the graph +const workflow = new StateGraph(SomeGraphState) + // Define the nodes which we'll cycle between. + .addNode("retrieve", new ToolNode([tool])); -- [**Qdrant Cloud**](https://qdrant.tech/cloud/): Qdrant is a high-performance vector search engine for storing and retrieving large collections of embeddings. In this project, we will use it to enable fast and accurate search across millions of medical documents, supporting dense and multi-vector (ColBERT) retrieval for context-aware answers. -- [**Stanford DSPy**](https://qdrant.tech/documentation/frameworks/dspy/) **:** DSPy is the AI framework we will use to obtain the final answer. 
It allows the medical bot to retrieve the relevant information and reason step-by-step to produce accurate and explainable answers. +const graph = workflow.compile(); +``` -![medicalbot flow chart](https://qdrant.tech/articles_data/Qdrant-DSPy-medicalbot/medicalbot.png) +## Further Reading -## [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#dataset-preparation-and-indexing) Dataset Preparation and Indexing +- [LangGraph Documentation](https://langchain-ai.github.io/langgraph/) +- [LangGraph End-to-End Guides](https://langchain-ai.github.io/langgraph/tutorials/) -A medical chatbot is only as good as the knowledge it has access to. For this project, we will leverage the [MIRIAD medical dataset](https://huggingface.co/datasets/miriad/miriad-5.8M), a large-scale collection of medical passages enriched with metadata such as publication year and specialty. +<|page-236-lllmstxt|> +# LlamaIndex -### [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#indexing-with-dense-and-colbert-multivectors) Indexing with Dense and ColBERT Multivectors +Llama Index acts as an interface between your external data and Large Language Models. So you can bring your +private data and augment LLMs with it. LlamaIndex simplifies data ingestion and indexing, integrating Qdrant as a vector index. -To enable high-quality retrieval, we will embed each medical passage with two models: +Installing Llama Index is straightforward if we use pip as a package manager. Qdrant is not installed by default, so we need to +install it separately. The integration of both tools also comes as another package. + +```bash +pip install llama-index llama-index-vector-stores-qdrant +``` -- **Dense Embeddings**: These are generated using the `BAAI/bge-small-en` model and capture the passages’ general semantic meaning. -- **ColBERT Multivectors**: These provide more fine-grained representations, enabling precise ranking of results. +Llama Index requires providing an instance of `QdrantClient`, so it can interact with Qdrant server. ```python -dense_documents = [\ - models.Document(text=doc, model="BAAI/bge-small-en") for doc in ds["passage_text"]\ -] +from llama_index.core.indices.vector_store.base import VectorStoreIndex +from llama_index.vector_stores.qdrant import QdrantVectorStore -colbert_documents = [\ - models.Document(text=doc, model="colbert-ir/colbertv2.0")\ - for doc in ds["passage_text"]\ -] +import qdrant_client -collection_name = "miriad" +client = qdrant_client.QdrantClient( + "", + api_key="", # For Qdrant Cloud, None for local instance +) -# Create collection -if not client.collection_exists(collection_name): - client.create_collection( - collection_name=collection_name, - vectors_config={ - "dense": models.VectorParams(size=384, distance=models.Distance.COSINE), - "colbert": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - hnsw_config=models.HnswConfigDiff(m=0), # reranker: no indexing - ), - }, - ) +vector_store = QdrantVectorStore(client=client, collection_name="documents") +index = VectorStoreIndex.from_vector_store(vector_store=vector_store) ``` -We disable indexing for the ColBERT multivector since it will only be used for reranking. To learn more about this, check out the [How to Effectively Use Multivector Representations in Qdrant for Reranking](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/) article. 
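+
+With the index bound to the Qdrant collection, you can query it through a retriever. The snippet below is a minimal sketch, assuming the `documents` collection already holds indexed nodes; the query text and `similarity_top_k` value are only illustrative.
+
+```python
+# Retrieve the most similar nodes from the Qdrant-backed index.
+retriever = index.as_retriever(similarity_top_k=3)
+
+for node_with_score in retriever.retrieve("What is Qdrant used for?"):
+    print(node_with_score.score, node_with_score.node.get_content())
+```
+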
+## Further Reading -### [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#batch-uploading-to-qdrant) Batch Uploading to Qdrant +- [LlamaIndex Documentation](https://docs.llamaindex.ai/en/stable/examples/vector_stores/QdrantIndexDemo/) +- [Example Notebook](https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/QdrantIndexDemo.ipynb) +- [Source Code](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/vector_stores/llama-index-vector-stores-qdrant) -To avoid hitting API limits, we upload the data in batches, each batch containing: +<|page-237-lllmstxt|> +# Make.com -- The passage text -- ColBERT and dense embeddings. -- `year` and `specialty` metadata fields. +[Make](https://www.make.com/) is a platform for anyone to design, build, and automate anything—from tasks and workflows to apps and systems without code. -```python -BATCH_SIZE = 3 -points_batch = [] +Find the comprehensive list of available Make apps [here](https://www.make.com/en/integrations). -for i in range(len(ds["passage_text"])): - point = models.PointStruct( - id=i, - vector={"dense": dense_documents[i], "colbert": colbert_documents[i]}, - payload={ - "passage_text": ds["passage_text"][i], - "year": ds["year"][i], - "specialty": ds["specialty"][i], - }, - ) - points_batch.append(point) +Qdrant is available as an [app](https://www.make.com/en/integrations/qdrant) within Make to add to your scenarios. - if len(points_batch) == BATCH_SIZE: - client.upsert(collection_name=collection_name, points=points_batch) - print(f"Uploaded batch ending at index {i}") - points_batch = [] +![Qdrant Make hero](/documentation/frameworks/make/hero-page.png) -# Final flush -if points_batch: - client.upsert(collection_name=collection_name, points=points_batch) - print("Uploaded final batch.") +## Prerequisites -``` +Before you start, make sure you have the following: -## [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#retrieval-augmented-generation-rag-pipeline) Retrieval-Augmented Generation (RAG) Pipeline +1. A Qdrant instance to connect to. You can get free cloud instance [cloud.qdrant.io](https://cloud.qdrant.io/). +2. An account at Make.com. You can register yourself [here](https://www.make.com/en/register). -Our chatbot will use a Retrieval-Augmented Generation (RAG) pipeline to ensure its answers are grounded in medical literature. +## Setting up a connection -### [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#integration-of-dspy-and-qdrant) Integration of DSPy and Qdrant +Navigate to your scenario on the Make dashboard and select a Qdrant app module to start a connection. +![Qdrant Make connection](/documentation/frameworks/make/connection.png) -At the heart of the application is the Qdrant vector database that provides the information sent to DSPy to generate the final answer. This is what happens when a user submits a query: +You can now establish a connection to Qdrant using your [instance credentials](/documentation/cloud/authentication/). -- DSPy searches against the Qdrant vector database to retrieve the top documents and answers the query. The results are also filtered with a particular year range for a specific specialty. -- The retrieved passages are then reranked using ColBERT multivector embeddings, leading to the most relevant and contextually appropriate answers. 
-- DSPy uses these passages to guide the language model through a chain-of-thought reasoning to generate the most accurate answer. +![Qdrant Make form](/documentation/frameworks/make/connection-form.png) -```python -def rerank_with_colbert(query_text, min_year, max_year, specialty): - from fastembed import TextEmbedding, LateInteractionTextEmbedding +## Modules + + Modules represent actions that Make performs with an app. - # Encode query once with both models - dense_model = TextEmbedding("BAAI/bge-small-en") - colbert_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0") +The Qdrant Make app enables you to trigger the following app modules. +![Qdrant Make modules](/documentation/frameworks/make/modules.png) - dense_query = list(dense_model.embed(query_text))[0] - colbert_query = list(colbert_model.embed(query_text))[0] +The modules support mapping to connect the data retrieved by one module to another module to perform the desired action. You can read more about the data processing options available for the modules in the [Make reference](https://www.make.com/en/help/modules). - # Combined query: retrieve with dense, - # rerank with ColBERT - results = client.query_points( - collection_name=collection_name, - prefetch=models.Prefetch(query=dense_query, using="dense"), - query=colbert_query, - using="colbert", - limit=5, - with_payload=True, - query_filter=Filter( - must=[\ - FieldCondition(key="specialty", match=MatchValue(value=specialty)),\ - FieldCondition(\ - key="year",\ - range=models.Range(gt=None, gte=min_year, lt=None, lte=max_year),\ - ),\ - ] - ), - ) +## Next steps - points = results.points - docs = [] +- Find a list of Make workflow templates to connect with Qdrant [here](https://www.make.com/en/templates). - for point in points: - docs.append(point.payload["passage_text"]) +- Make scenario reference docs can be found [here](https://www.make.com/en/help/scenarios). - return docs +<|page-238-lllmstxt|> +# Mastra + +[Mastra](https://mastra.ai/) is a Typescript framework to build AI applications and features quickly. It gives you the set of primitives you need: workflows, agents, RAG, integrations, syncs and evals. You can run Mastra on your local machine, or deploy to a serverless cloud. +Qdrant is available as a vector store in Mastra node to augment application with retrieval capabilities. +## Setup + +```bash +npm install @mastra/core ``` -The pipeline ensures that each response is grounded in real and recent medical literature and is aligned with the user’s needs. +## Usage -## [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#guardrails-and-medical-question-detection) Guardrails and Medical Question Detection +```typescript +import { QdrantVector } from "@mastra/rag"; -Since this is a medical chatbot, we can introduce a simple guardrail to ensure it doesn’t respond to unrelated questions like the weather. This can be implemented using a DSPy module. +const qdrant = new QdrantVector({ + url: "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333" + apiKey: "", + https: true +}); +``` -The chatbot checks if every question is medical-related before attempting to answer it. This is achieved by a DSPy module that classifies each incoming query as medical or not. If the question is not medical-related, the chatbot declines to answer, reducing the risk of misinformation or inappropriate responses. 
+## Constructor Options -```python -class MedicalGuardrail(dspy.Module): - def forward(self, question): - prompt = ( - """ - Is the following question a medical question? - Answer with 'Yes' or 'No'.n" - f"Question: {question}n" - "Answer: - """ - ) - response = dspy.settings.lm(prompt) - answer = response[0].strip().lower() - return answer.startswith("yes") +| Name | Type | Description | +|--------|-----------|-------------------------------------------------------------------------------------------------------| +| `url` | `string` | REST URL of the Qdrant instance. Eg. | +| `apiKey` | `string` | Optional Qdrant API key | +| `https` | `boolean` | Whether to use TLS when setting up the connection. Recommended. | -if not self.guardrail.forward(question): +## Methods - class DummyResult: - final_answer = """ - Sorry, I can only answer medical questions. - Please ask a question related to medicine or healthcare - """ +### `createIndex()` - return DummyResult() +| Name | Type | Description | Default Value | +|------------|------------------------------------------|-------------------------------------------------|--------------| +| `indexName` | `string` | Name of the index to create | | +| `dimension` | `number` | Vector dimension size | | +| `metric` | `string` | Distance metric for similarity search | `cosine` | -``` +### `upsert()` -By combining this guardrail with specialty and year filtering, we ensure that the chatbot: +| Name | Type | Description | Default Value | +|-------------|---------------------------|-----------------------------------------|--------------| +| `vectors` | `number[][]` | Array of embedding vectors | | +| `metadata` | `Record[]` | Metadata for each vector (optional) | | +| `namespace` | `string` | Optional namespace for organization | | -- Only answers medical questions. -- Answers questions from recent medical literature. -- Doesn’t make up answers by grounding its answers in the provided literature. +### `query()` -![medicalbot demo](https://qdrant.tech/articles_data/Qdrant-DSPy-medicalbot/medicaldemo.png) +| Name | Type | Description | Default Value | +|------------|-------------------------|---------------------------------------------|--------------| +| `vector` | `number[]` | Query vector to find similar vectors | | +| `topK` | `number` | Number of results to return (optional) | `10` | +| `filter` | `Record` | Metadata filters for the query (optional) | | -## [Anchor](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/\#conclusion) Conclusion +### `listIndexes()` -By leveraging Qdrant and DSPy, you can build a medical chatbot that generates accurate and up-to-date medical responses. Qdrant provides the technology and enables fast and scalable retrieval, while DSPy synthesizes this information to provide correct answers grounded in the medical literature. As a result, you can achieve a medical system that is truthful, safe, and provides relevant responses. Check out the entire project from this [notebook](https://github.com/qdrant/examples/blob/master/DSPy-medical-bot/medical_bot_DSPy_Qdrant.ipynb). You’ll need a free [Qdrant Cloud](https://qdrant.tech/cloud/) account to run the notebook. +Returns an array of index names as strings. -##### Was this page useful? 
+### `describeIndex()` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +| Name | Type | Description | +|-------------|----------|----------------------------------| +| `indexName` | `string` | Name of the index to describe | -Thank you for your feedback! 🙏 +#### Returns -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/Qdrant-DSPy-medicalbot.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```typescript +interface IndexStats { + dimension: number; + count: number; + metric: "cosine" | "euclidean" | "dotproduct"; +} +``` -On this page: +### `deleteIndex()` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/Qdrant-DSPy-medicalbot.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +| Name | Type | Description | +|-------------|----------|----------------------------------| +| `indexName` | `string` | Name of the index to delete | -× +## Response Types -[Powered by](https://qdrant.tech/) +Query results are returned in this format: -<|page-133-lllmstxt|> -## beginner-tutorials -- [Documentation](https://qdrant.tech/documentation/) -- Vector Search Basics +```typescript +interface QueryResult { + id: string; + score: number; + metadata: Record; +} +``` + +## Further Reading -# [Anchor](https://qdrant.tech/documentation/beginner-tutorials/\#beginner-tutorials) Beginner Tutorials +- [Mastra Examples](https://github.com/mastra-ai/mastra/tree/main/examples) +- [Mastra Documentation](http://mastra.ai/docs/) -| | -| --- | -| [Build Your First Semantic Search Engine in 5 Minutes](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/) | -| [Build a Neural Search Service with Sentence Transformers and Qdrant](https://qdrant.tech/documentation/beginner-tutorials/neural-search/) | -| [Build a Hybrid Search Service with FastEmbed and Qdrant](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/) | -| [Measure and Improve Retrieval Quality in Semantic Search](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/) | +<|page-239-lllmstxt|> +![Mem0 Logo](/documentation/frameworks/mem0/mem0-banner.png) -##### Was this page useful? +[Mem0](https://mem0.ai) is a self-improving memory layer for LLM applications, enabling personalized AI experiences that save costs and delight users. Mem0 remembers user preferences, adapts to individual needs, and continuously improves over time, ideal for chatbots and AI systems. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Mem0 supports various vector store providers, including Qdrant, for efficient data handling and search capabilities. -Thank you for your feedback! 🙏 +## Installation -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+To install Mem0 with Qdrant support, use the following command: -On this page: +```sh +pip install mem0ai +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Usage -× +Here's a basic example of how to use Mem0 with Qdrant: -[Powered by](https://qdrant.tech/) +```python +import os +from mem0 import Memory -<|page-134-lllmstxt|> -## binary-quantization -- [Articles](https://qdrant.tech/articles/) -- Binary Quantization - Vector Search, 40x Faster +os.environ["OPENAI_API_KEY"] = "sk-xx" + +config = { + "vector_store": { + "provider": "qdrant", + "config": { + "collection_name": "test", + "host": "localhost", + "port": 6333, + } + } +} -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +m = Memory.from_config(config) +m.add("Likes to play cricket on weekends", user_id="alice", metadata={"category": "hobbies"}) +``` -# Binary Quantization - Vector Search, 40x Faster +## Configuration -Nirant Kasliwal +When configuring Mem0 to use Qdrant as the vector store, you can specify [various parameters](https://docs.mem0.ai/components/vectordbs/dbs/qdrant#config) in the `config` dictionary. -· +## Advanced Usage -September 18, 2023 +Mem0 provides additional functionality for managing and querying your vector data. Here are some examples: -![Binary Quantization - Vector Search, 40x Faster ](https://qdrant.tech/articles_data/binary-quantization/preview/title.jpg) +```python +# Search memories +related_memories = m.search(query="What are Alice's hobbies?", user_id="alice") -# [Anchor](https://qdrant.tech/articles/binary-quantization/\#optimizing-high-dimensional-vectors-with-binary-quantization) Optimizing High-Dimensional Vectors with Binary Quantization +# Update existing memory +result = m.update(memory_id="m1", data="Likes to play tennis on weekends") -Qdrant is built to handle typical scaling challenges: high throughput, low latency and efficient indexing. **Binary quantization (BQ)** is our latest attempt to give our customers the edge they need to scale efficiently. This feature is particularly excellent for collections with large vector lengths and a large number of points. +# Get memory history +history = m.history(memory_id="m1") +``` -Our results are dramatic: Using BQ will reduce your memory consumption and improve retrieval speeds by up to 40x. +## Further Reading -As is the case with other quantization methods, these benefits come at the cost of recall degradation. However, our implementation lets you balance the tradeoff between speed and recall accuracy at time of search, rather than time of index creation. +- [Mem0 GitHub Repository](https://github.com/mem0ai/mem0) +- [Mem0 Documentation](https://docs.mem0.ai/) -The rest of this article will cover: +<|page-240-lllmstxt|> +# NLWeb -1. The importance of binary quantization -2. Basic implementation using our Python client -3. Benchmark analysis and usage recommendations +Microsoft's [NLWeb](https://github.com/microsoft/NLWeb) is a proposed framework that enables natural language interfaces for websites, using Schema.org, formats like RSS and the emerging [MCP protocol](https://github.com/microsoft/NLWeb/blob/main/docs/RestAPI.md). -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#what-is-binary-quantization) What is Binary Quantization? 
+Qdrant is supported as a vector store backend within NLWeb for embedding storage and context retrieval. -Binary quantization (BQ) converts any vector embedding of floating point numbers into a vector of binary or boolean values. This feature is an extension of our past work on [scalar quantization](https://qdrant.tech/articles/scalar-quantization/) where we convert `float32` to `uint8` and then leverage a specific SIMD CPU instruction to perform fast vector comparison. +## Usage -![What is binary quantization](https://qdrant.tech/articles_data/binary-quantization/bq-2.png) +NLWeb includes Qdrant integration by default. You can install and configure it to use Qdrant as the retrieval engine. -**This binarization function is how we convert a range to binary values. All numbers greater than zero are marked as 1. If it’s zero or less, they become 0.** +### Installation -The benefit of reducing the vector embeddings to binary values is that boolean operations are very fast and need significantly less CPU instructions. In exchange for reducing our 32 bit embeddings to 1 bit embeddings we can see up to a 40x retrieval speed up gain! +Clone the repo and set up your environment: -One of the reasons vector search still works with such a high compression rate is that these large vectors are over-parameterized for retrieval. This is because they are designed for ranking, clustering, and similar use cases, which typically need more information encoded in the vector. +```bash +git clone https://github.com/microsoft/NLWeb +cd NLWeb +python -m venv .venv +source venv/bin/activate # or `venv\Scripts\activate` on Windows +cd code +pip install -r requirements.txt +``` -For example, The 1536 dimension OpenAI embedding is worse than Open Source counterparts of 384 dimension at retrieval and ranking. Specifically, it scores 49.25 on the same [Embedding Retrieval Benchmark](https://huggingface.co/spaces/mteb/leaderboard) where the Open Source `bge-small` scores 51.82. This 2.57 points difference adds up quite soon. +### Configuring Qdrant -Our implementation of quantization achieves a good balance between full, large vectors at ranking time and binary vectors at search and retrieval time. It also has the ability for you to adjust this balance depending on your use case. +To use **Qdrant**, update your configuration. -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#faster-search-and-retrieval) Faster search and retrieval +#### 1. Copy and edit the environment variables file -Unlike product quantization, binary quantization does not rely on reducing the search space for each probe. Instead, we build a binary index that helps us achieve large increases in search speed. +```bash +cp .env.template .env +``` -![Speed by quantization method](https://qdrant.tech/articles_data/binary-quantization/bq-3.png) +Ensure the following values are set in your `.env` file: -HNSW is the approximate nearest neighbor search. This means our accuracy improves up to a point of diminishing returns, as we check the index for more similar candidates. In the context of binary quantization, this is referred to as the **oversampling rate**. +```text +QDRANT_URL="https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6333" +QDRANT_API_KEY="" +``` -For example, if `oversampling=2.0` and the `limit=100`, then 200 vectors will first be selected using a quantized index. For those 200 vectors, the full 32 bit vector will be used with their HNSW index to a much more accurate 100 item result set. 
As opposed to doing a full HNSW search, we oversample a preliminary search and then only do the full search on this much smaller set of vectors. +#### 2. Update config files in `code/config` -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#improved-storage-efficiency) Improved storage efficiency +* **`config_retrieval.yaml`** -The following diagram shows the binarization function, whereby we reduce 32 bits storage to 1 bit information. +```yaml +retrieval_engine: qdrant_url +``` -Text embeddings can be over 1024 elements of floating point 32 bit numbers. For example, remember that OpenAI embeddings are 1536 element vectors. This means each vector is 6kB for just storing the vector. +Alternatively, you can use an in-memory Qdrant instance for experimentation. -![Improved storage efficiency](https://qdrant.tech/articles_data/binary-quantization/bq-4.png) +```yaml +retrieval_engine: qdrant_local -In addition to storing the vector, we also need to maintain an index for faster search and retrieval. Qdrant’s formula to estimate overall memory consumption is: +endpoints: + qdrant_local: + # Path to a local directory + database_path: "../data/" + # Set the collection name to use + index_name: nlweb_collection + # Specify the database type + db_type: qdrant +``` -`memory_size = 1.5 * number_of_vectors * vector_dimension * 4 bytes` +### Loading Data -For 100K OpenAI Embedding ( `ada-002`) vectors we would need 900 Megabytes of RAM and disk space. This consumption can start to add up rapidly as you create multiple collections or add more items to the database. +Once configured, load your content using RSS feeds. -**With binary quantization, those same 100K OpenAI vectors only require 128 MB of RAM.** We benchmarked this result using methods similar to those covered in our [Scalar Quantization memory estimation](https://qdrant.tech/articles/scalar-quantization/#benchmarks). +From the `code` directory: -This reduction in RAM usage is achieved through the compression that happens in the binary conversion. HNSW and quantized vectors will live in RAM for quick access, while original vectors can be offloaded to disk only. For searching, quantized HNSW will provide oversampled candidates, then they will be re-evaluated using their disk-stored original vectors to refine the final results. All of this happens under the hood without any additional intervention on your part. +```bash +python -m tools.db_load https://feeds.libsyn.com/121695/rss Behind-the-Tech +``` + +This will ingest the content into your local Qdrant instance. -### [Anchor](https://qdrant.tech/articles/binary-quantization/\#when-should-you-not-use-bq) When should you not use BQ? +### Running the Server -Since this method exploits the over-parameterization of embedding, you can expect poorer results for small embeddings i.e. less than 1024 dimensions. With the smaller number of elements, there is not enough information maintained in the binary vector to achieve good results. +To start NLWeb, from the `code` directory, run: -You will still get faster boolean operations and reduced RAM usage, but the accuracy degradation might be too high. +```bash +python app-file.py +``` -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#sample-implementation) Sample implementation +You can now query your content via natural language using either the web UI at or directly through the MCP-compatible [REST API](https://github.com/microsoft/NLWeb/blob/main/docs/RestAPI.md). 
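+
+To sanity-check the setup from a script, you can also call the server over HTTP. The snippet below is a minimal sketch: the port and the `ask` route are assumptions based on the REST API document linked above, so adjust both to match your deployment.
+
+```python
+import requests
+
+# Ask the locally running NLWeb server a natural-language question.
+# The base URL and the /ask route are assumptions; see the linked REST API doc for the exact interface.
+response = requests.get(
+    "http://localhost:8000/ask",
+    params={"query": "podcast episodes about machine learning"},
+    timeout=30,
+)
+response.raise_for_status()
+print(response.text)
+```
+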
-Now that we have introduced you to binary quantization, let’s try our a basic implementation. In this example, we will be using OpenAI and Cohere with Qdrant. +## Further Reading -#### [Anchor](https://qdrant.tech/articles/binary-quantization/\#create-a-collection-with-binary-quantization-enabled) Create a collection with Binary Quantization enabled +* [Source](https://github.com/microsoft/NLWeb) +* [Life of a Chat Query](https://github.com/microsoft/NLWeb/tree/main/docs/LifeOfAChatQuery.md) +* [Modifying behavior by changing prompts](https://github.com/microsoft/NLWeb/tree/main/docs/Prompts.md) +* [Modifying control flow](https://github.com/microsoft/NLWeb/tree/main/docs/ControlFlow.md) +* [Modifying the user interface](https://github.com/microsoft/NLWeb/tree/main/docs/UserInterface.md) -Here is what you should do at indexing time when you create the collection: +<|page-241-lllmstxt|> +| Time: 10 min | Level: Beginner | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/mistral-getting-started/mistral-embed-getting-started/mistral_qdrant_getting_started.ipynb) | +| --- | ----------- | ----------- | -1. We store all the “full” vectors on disk. -2. Then we set the binary embeddings to be in RAM. +# Mistral +Qdrant is compatible with the new released Mistral Embed and its official Python SDK that can be installed as any other package: + +## Setup + +### Install the client + +```bash +pip install mistralai +``` -By default, both the full vectors and BQ get stored in RAM. We move the full vectors to disk because this saves us memory and allows us to store more vectors in RAM. By doing this, we explicitly move the binary vectors to memory by setting `always_ram=True`. +And then we set this up: ```python +from mistralai.client import MistralClient from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, VectorParams, Distance -#collect to our Qdrant Server -client = QdrantClient( - url="http://localhost:6333", - prefer_grpc=True, -) - -#Create the collection to hold our embeddings -# on_disk=True and the quantization_config are the areas to focus on -collection_name = "binary-quantization" -if not client.collection_exists(collection_name): - client.create_collection( - collection_name=f"{collection_name}", - vectors_config=models.VectorParams( - size=1536, - distance=models.Distance.DOT, - on_disk=True, - ), - optimizers_config=models.OptimizersConfigDiff( - default_segment_number=5, - ), - hnsw_config=models.HnswConfigDiff( - m=0, - ), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig(always_ram=True), - ), - ) +collection_name = "example_collection" +MISTRAL_API_KEY = "your_mistral_api_key" +client = QdrantClient(":memory:") +mistral_client = MistralClient(api_key=MISTRAL_API_KEY) +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] ``` -#### [Anchor](https://qdrant.tech/articles/binary-quantization/\#what-is-happening-in-the-hnswconfig) What is happening in the HnswConfig? +Let's see how to use the Embedding Model API to embed a document for retrieval. -We’re setting `m` to 0 i.e. disabling the HNSW graph construction. This allows faster uploads of vectors and payloads. We will turn it back on down below, once all the data is loaded. 
+The following example shows how to embed a document with the `models/embedding-001` with the `retrieval_document` task type: -#### [Anchor](https://qdrant.tech/articles/binary-quantization/\#next-we-upload-our-vectors-to-this-and-then-enable-the-graph-construction) Next, we upload our vectors to this and then enable the graph construction: +## Embedding a document ```python -batch_size = 10000 -client.upload_collection( - collection_name=collection_name, - ids=range(len(dataset)), - vectors=dataset["openai"], - payload=[\ - {"text": x} for x in dataset["text"]\ - ], - parallel=10, # based on the machine +result = mistral_client.embeddings( + model="mistral-embed", + input=texts, ) +``` + +The returned result has a data field with a key: `embedding`. The value of this key is a list of floats representing the embedding of the document. + +### Converting this into Qdrant Points +```python +points = [ + PointStruct( + id=idx, + vector=response.embedding, + payload={"text": text}, + ) + for idx, (response, text) in enumerate(zip(result.data, texts)) +] ``` -Enable HNSW graph construction again: +## Create a collection and Insert the documents ```python -client.update_collection( - collection_name=f"{collection_name}", - hnsw_config=models.HnswConfigDiff( - m=16, - , +client.create_collection(collection_name, vectors_config=VectorParams( + size=1024, + distance=Distance.COSINE, + ) ) - +client.upsert(collection_name, points) ``` -#### [Anchor](https://qdrant.tech/articles/binary-quantization/\#configure-the-search-parameters) Configure the search parameters: +## Searching for documents with Qdrant -When setting search parameters, we specify that we want to use `oversampling` and `rescore`. Here is an example snippet: +Once the documents are indexed, you can search for the most relevant documents using the same model with the `retrieval_query` task type: ```python client.search( - collection_name="{collection_name}", - query_vector=[0.2, 0.1, 0.9, 0.7, ...], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - ignore=False, - rescore=True, - oversampling=2.0, - ) + collection_name=collection_name, + query_vector=mistral_client.embeddings( + model="mistral-embed", input=["What is the best to use for vector search scaling?"] + ).data[0].embedding, +) +``` + +## Using Mistral Embedding Models with Binary Quantization + +You can use Mistral Embedding Models with [Binary Quantization](/articles/binary-quantization/) - a technique that allows you to reduce the size of the embeddings by 32 times without losing the quality of the search results too much. + +At an oversampling of 3 and a limit of 100, we've a 95% recall against the exact nearest neighbors with rescore enabled. + +| Oversampling | | 1 | 1 | 2 | 2 | 3 | 3 | +|--------------|---------|----------|----------|----------|----------|----------|--------------| +| | **Rescore** | False | True | False | True | False | True | +| **Limit** | | | | | | | | +| 10 | | 0.53444 | 0.857778 | 0.534444 | 0.918889 | 0.533333 | 0.941111 | +| 20 | | 0.508333 | 0.837778 | 0.508333 | 0.903889 | 0.508333 | 0.927778 | +| 50 | | 0.492222 | 0.834444 | 0.492222 | 0.903556 | 0.492889 | 0.940889 | +| 100 | | 0.499111 | 0.845444 | 0.498556 | 0.918333 | 0.497667 | **0.944556** | + +That's it! You can now use Mistral Embedding Models with Qdrant! + +<|page-242-lllmstxt|> +# Using MixedBread with Qdrant + +MixedBread is a unique provider offering embeddings across multiple domains. 
Their models are versatile for various search tasks when integrated with Qdrant. MixedBread is creating state-of-the-art models and tools that make search smarter, faster, and more relevant. Whether you're building a next-gen search engine or RAG (Retrieval Augmented Generation) systems, or whether you're enhancing your existing search solution, they've got the ingredients to make it happen. + +## Installation + +You can install the required package using the following pip command: + +```bash +pip install mixedbread +``` + +## Integration Example + +Below is an example of how to obtain embeddings using MixedBread's API and store them in a Qdrant collection: + +```python +import qdrant_client +from qdrant_client.models import Batch +from mixedbread import MixedBreadModel + +# Initialize MixedBread model +model = MixedBreadModel("mixedbread-variant") + +# Generate embeddings +text = "MixedBread provides versatile embeddings for various domains." +embeddings = model.embed(text) + +# Initialize Qdrant client +qdrant_client = qdrant_client.QdrantClient(host="localhost", port=6333) + +# Upsert the embedding into Qdrant +qdrant_client.upsert( + collection_name="VersatileEmbeddings", + points=Batch( + ids=[1], + vectors=[embeddings], ) ) ``` -After Qdrant pulls the oversampled vectors set, the full vectors which will be, say 1536 dimensions for OpenAI will then be pulled up from disk. Qdrant computes the nearest neighbor with the query vector and returns the accurate, rescored order. This method produces much more accurate results. We enabled this by setting `rescore=True`. +<|page-243-lllmstxt|> +# Mixpeek Video Embeddings -These two parameters are how you are going to balance speed versus accuracy. The larger the size of your oversample, the more items you need to read from disk and the more elements you have to search with the relatively slower full vector index. On the other hand, doing this will produce more accurate results. +Mixpeek's video processing capabilities allow you to chunk and embed videos, while Qdrant provides efficient storage and retrieval of these embeddings. -If you have lower accuracy requirements you can even try doing a small oversample without rescoring. Or maybe, for your data set combined with your accuracy versus speed requirements you can just search the binary index and no rescoring, i.e. leaving those two parameters out of the search query. +## Prerequisites -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#benchmark-results) Benchmark results +- Python 3.7+ +- Mixpeek API key +- Mixpeek client installed (`pip install mixpeek`) +- Qdrant client installed (`pip install qdrant-client`) -We retrieved some early results on the relationship between limit and oversampling using the the DBPedia OpenAI 1M vector dataset. We ran all these experiments on a Qdrant instance where 100K vectors were indexed and used 100 random queries. +## Installation -We varied the 3 parameters that will affect query time and accuracy: limit, rescore and oversampling. We offer these as an initial exploration of this new feature. You are highly encouraged to reproduce these experiments with your data sets. +1. Install the required packages: -> Aside: Since this is a new innovation in vector databases, we are keen to hear feedback and results. [Join our Discord server](https://discord.gg/Qy6HCJK9Dc) for further discussion! 
+```bash +pip install mixpeek qdrant-client +``` -**Oversampling:** -In the figure below, we illustrate the relationship between recall and number of candidates: +2. Set up your Mixpeek API key: -![Correct vs candidates](https://qdrant.tech/articles_data/binary-quantization/bq-5.png) +```python +from mixpeek import Mixpeek -We see that “correct” results i.e. recall increases as the number of potential “candidates” increase (limit x oversampling). To highlight the impact of changing the `limit`, different limit values are broken apart into different curves. For example, we see that the lowest recall for limit 50 is around 94 correct, with 100 candidates. This also implies we used an oversampling of 2.0 +mixpeek = Mixpeek('your_api_key_here') +``` -As oversampling increases, we see a general improvement in results – but that does not hold in every case. +3. Initialize the Qdrant client: -**Rescore:** -As expected, rescoring increases the time it takes to return a query. -We also repeated the experiment with oversampling except this time we looked at how rescore impacted result accuracy. +```python +from qdrant_client import QdrantClient -![Relationship between limit and rescore on correct](https://qdrant.tech/articles_data/binary-quantization/bq-7.png) +client = QdrantClient("localhost", port=6333) +``` -**Limit:** -We experiment with limits from Top 1 to Top 50 and we are able to get to 100% recall at limit 50, with rescore=True, in an index with 100K vectors. +## Usage -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#recommendations) Recommendations +### 1. Create Qdrant Collection -Quantization gives you the option to make tradeoffs against other parameters: -Dimension count/embedding size -Throughput and Latency requirements -Recall requirements +Make sure to create a Qdrant collection before inserting vectors. You can create a collection with the appropriate vector size (768 for "vuse-generic-v1" model) using: -If you’re working with OpenAI or Cohere embeddings, we recommend the following oversampling settings: +```python +client.create_collection( + collection_name="video_chunks", + vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE) +) +``` -| Method | Dimensionality | Test Dataset | Recall | Oversampling | -| --- | --- | --- | --- | --- | -| OpenAI text-embedding-3-large | 3072 | [DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 0.9966 | 3x | -| OpenAI text-embedding-3-small | 1536 | [DBpedia 100K](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K) | 0.9847 | 3x | -| OpenAI text-embedding-3-large | 1536 | [DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M) | 0.9826 | 3x | -| OpenAI text-embedding-ada-002 | 1536 | [DbPedia 1M](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) | 0.98 | 4x | -| Gemini | 768 | No Open Data | 0.9563 | 3x | -| Mistral Embed | 768 | No Open Data | 0.9445 | 3x | +### 2. 
Process and Embed Video -If you determine that binary quantization is appropriate for your datasets and queries then we suggest the following: +First, process the video into chunks and embed each chunk: -- Binary Quantization with always\_ram=True -- Vectors stored on disk -- Oversampling=2.0 (or more) -- Rescore=True +```python +from mixpeek import Mixpeek +from qdrant_client import QdrantClient, models -## [Anchor](https://qdrant.tech/articles/binary-quantization/\#whats-next) What’s next? +mixpeek = Mixpeek('your_api_key_here') +client = QdrantClient("localhost", port=6333) -Binary quantization is exceptional if you need to work with large volumes of data under high recall expectations. You can try this feature either by spinning up a [Qdrant container image](https://hub.docker.com/r/qdrant/qdrant) locally or, having us create one for you through a [free account](https://cloud.qdrant.io/signup) in our cloud hosted service. +video_url = "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/starter/jurassic_park_trailer.mp4" -The article gives examples of data sets and configuration you can use to get going. Our documentation covers [adding large datasets to Qdrant](https://qdrant.tech/documentation/tutorials/bulk-upload/) to your Qdrant instance as well as [more quantization methods](https://qdrant.tech/documentation/guides/quantization/). +# Process video chunks +processed_chunks = mixpeek.tools.video.process( + video_source=video_url, + chunk_interval=1, # 1 second intervals + resolution=[720, 1280] +) -If you have any feedback, drop us a note on Twitter or LinkedIn to tell us about your results. [Join our lively Discord Server](https://discord.gg/Qy6HCJK9Dc) if you want to discuss BQ with like-minded people! +# Embed each chunk and insert into Qdrant +for index, chunk in enumerate(processed_chunks): + print(f"Processing video chunk: {index}") -##### Was this page useful? + embedding = mixpeek.embed.video( + model_id="vuse-generic-v1", + input=chunk['base64_chunk'], + input_type="base64" + )['embedding'] -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + # Insert into Qdrant + client.upsert( + collection_name="video_chunks", + points=[models.PointStruct( + id=index, + vector=embedding, + payload={ + "start_time": chunk["start_time"], + "end_time": chunk["end_time"] + } + )] + ) -Thank you for your feedback! 🙏 + print(f" Embedding preview: {embedding[:5] + ['...'] + embedding[-5:]}") -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/binary-quantization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +print(f"Processed and inserted {len(processed_chunks)} chunks") +``` -On this page: +### 3. 
Search for Similar Video Chunks -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/binary-quantization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +To search for similar video chunks, you can use either text or video queries: -× +#### Text Query -[Powered by](https://qdrant.tech/) +```python +query_text = "a car chase scene" -<|page-135-lllmstxt|> -## monitoring -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Monitoring & Telemetry +# Embed the text query +query_embedding = mixpeek.embed.video( + model_id="vuse-generic-v1", + input=query_text, + input_type="text" +)['embedding'] -# [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#monitoring--telemetry) Monitoring & Telemetry +# Search in Qdrant +search_results = client.query_points( + collection_name="video_chunks", + query=query_embedding, + limit=5 +).points -Qdrant exposes its metrics in [Prometheus](https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format)/ [OpenMetrics](https://github.com/OpenObservability/OpenMetrics) format, so you can integrate them easily -with the compatible tools and monitor Qdrant with your own monitoring system. You can -use the `/metrics` endpoint and configure it as a scrape target. +for result in search_results: + print(f"Chunk ID: {result.id}, Score: {result.score}") + print(f"Time range: {result.payload['start_time']} - {result.payload['end_time']}") +``` -Metrics endpoint: [http://localhost:6333/metrics](http://localhost:6333/metrics) +#### Video Query -The integration with Qdrant is easy to -[configure](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) -with Prometheus and Grafana. +```python +query_video_url = "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/starter/jurassic_bunny.mp4" -## [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#monitoring-multi-node-clusters) Monitoring multi-node clusters +# Embed the video query +query_embedding = mixpeek.embed.video( + model_id="vuse-generic-v1", + input=query_video_url, + input_type="url" +)['embedding'] -When scraping metrics from multi-node Qdrant clusters, it is important to scrape from -each node individually instead of using a load-balanced URL. Otherwise, your metrics will appear inconsistent after each scrape. +# Search in Qdrant +search_results = client.query_points( + collection_name="video_chunks", + query=query_embedding, + limit=5 +).points -## [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#monitoring-in-qdrant-cloud) Monitoring in Qdrant Cloud +for result in search_results: + print(f"Chunk ID: {result.id}, Score: {result.score}") + print(f"Time range: {result.payload['start_time']} - {result.payload['end_time']}") +``` -Qdrant Cloud offers additional metrics and telemetry that are not available in the open-source version. For more information, see [Qdrant Cloud Monitoring](https://qdrant.tech/documentation/cloud/cluster-monitoring/). 
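+
+Because each point stores `start_time` and `end_time` in its payload, you can optionally narrow either query to a time window with a Qdrant payload filter. The snippet below is a minimal sketch that reuses the `query_embedding` from above; the range values are only illustrative.
+
+```python
+from qdrant_client import models
+
+# Keep only chunks that start within the first 60 seconds of the video.
+filtered_results = client.query_points(
+    collection_name="video_chunks",
+    query=query_embedding,
+    query_filter=models.Filter(
+        must=[
+            models.FieldCondition(
+                key="start_time",
+                range=models.Range(gte=0, lte=60),
+            )
+        ]
+    ),
+    limit=5,
+).points
+
+for result in filtered_results:
+    print(f"Chunk ID: {result.id}, Score: {result.score}")
+```
+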
+## Resources +For more information on Mixpeek Embed, review the official documentation: https://docs.mixpeek.com/api-documentation/inference/embed -## [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#exposed-metrics) Exposed metrics +<|page-244-lllmstxt|> +# N8N -There are two endpoints avaliable: +[N8N](https://n8n.io/) is an automation platform that allows you to build flexible workflows focused on deep data integration. -- `/metrics` is the direct endpoint of the underlying Qdrant database node. +[Qdrant's official node](https://github.com/qdrant/n8n-nodes-qdrant) for n8n enables semantic search capabilities in your workflows. -- `/sys_metrics` is a Qdrant cloud-only endpoint that provides additional operational and infrastructure metrics about your cluster, like CPU, memory and disk utilisation, collection metrics and load balancer telemetry. For more information, see [Qdrant Cloud Monitoring](https://qdrant.tech/documentation/cloud/cluster-monitoring/). +## Prerequisites +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A running N8N instance. You can learn more about using the N8N cloud or self-hosting [here](https://docs.n8n.io/choose-n8n/). -### [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#node-metrics-metrics) Node metrics `/metrics` +## Setting up the node -Each Qdrant server will expose the following metrics. +- Select and install the official Qdrant node from the list of nodes in your workflow editor. -| Name | Type | Meaning | -| --- | --- | --- | -| app\_info | gauge | Information about Qdrant server | -| app\_status\_recovery\_mode | gauge | If Qdrant is currently started in recovery mode | -| collections\_total | gauge | Number of collections | -| collections\_vector\_total | gauge | Total number of vectors in all collections | -| collections\_full\_total | gauge | Number of full collections | -| collections\_aggregated\_total | gauge | Number of aggregated collections | -| rest\_responses\_total | counter | Total number of responses through REST API | -| rest\_responses\_fail\_total | counter | Total number of failed responses through REST API | -| rest\_responses\_avg\_duration\_seconds | gauge | Average response duration in REST API | -| rest\_responses\_min\_duration\_seconds | gauge | Minimum response duration in REST API | -| rest\_responses\_max\_duration\_seconds | gauge | Maximum response duration in REST API | -| grpc\_responses\_total | counter | Total number of responses through gRPC API | -| grpc\_responses\_fail\_total | counter | Total number of failed responses through REST API | -| grpc\_responses\_avg\_duration\_seconds | gauge | Average response duration in gRPC API | -| grpc\_responses\_min\_duration\_seconds | gauge | Minimum response duration in gRPC API | -| grpc\_responses\_max\_duration\_seconds | gauge | Maximum response duration in gRPC API | -| cluster\_enabled | gauge | Whether the cluster support is enabled. 1 - YES | -| memory\_active\_bytes | gauge | Total number of bytes in active pages allocated by the application. [Reference](https://jemalloc.net/jemalloc.3.html#stats.active) | -| memory\_allocated\_bytes | gauge | Total number of bytes allocated by the application. [Reference](https://jemalloc.net/jemalloc.3.html#stats.allocated) | -| memory\_metadata\_bytes | gauge | Total number of bytes dedicated to allocator metadata. 
[Reference](https://jemalloc.net/jemalloc.3.html#stats.metadata) | -| memory\_resident\_bytes | gauge | Maximum number of bytes in physically resident data pages mapped. [Reference](https://jemalloc.net/jemalloc.3.html#stats.resident) | -| memory\_retained\_bytes | gauge | Total number of bytes in virtual memory mappings. [Reference](https://jemalloc.net/jemalloc.3.html#stats.retained) | -| collection\_hardware\_metric\_cpu | gauge | CPU measurements of a collection | +![Qdrant n8n node](/documentation/frameworks/n8n/node.png) -**Cluster-related metrics** +- Once installed, you can create a connection to Qdrant using your [credentials](/documentation/cloud/authentication/). -There are also some metrics which are exposed in distributed mode only. +![Qdrant Credentials](/documentation/frameworks/n8n/credentials.png) -| Name | Type | Meaning | -| --- | --- | --- | -| cluster\_peers\_total | gauge | Total number of cluster peers | -| cluster\_term | counter | Current cluster term | -| cluster\_commit | counter | Index of last committed (finalized) operation cluster peer is aware of | -| cluster\_pending\_operations\_total | gauge | Total number of pending operations for cluster peer | -| cluster\_voter | gauge | Whether the cluster peer is a voter or learner. 1 - VOTER | +## Operations -## [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#telemetry-endpoint) Telemetry endpoint +The node supports the following operations: -Qdrant also provides a `/telemetry` endpoint, which provides information about the current state of the database, including the number of vectors, shards, and other useful information. You can find a full documentation of this endpoint in the [API reference](https://api.qdrant.tech/api-reference/service/telemetry). +### Collection -## [Anchor](https://qdrant.tech/documentation/guides/monitoring/\#kubernetes-health-endpoints) Kubernetes health endpoints +- [List Collections](https://api.qdrant.tech/v-1-14-x/api-reference/collections/get-collections) - List all collections in the Qdrant instance +- [Create Collection](https://api.qdrant.tech/v-1-14-x/api-reference/collections/create-collection) - Create a new collection with specified vector parameters +- [Update Collection](https://api.qdrant.tech/v-1-14-x/api-reference/collections/update-collection) - Update parameters of an existing collection +- [Get Collection](https://api.qdrant.tech/v-1-14-x/api-reference/collections/get-collection) - Get information about a specific collection +- [Collection Exists](https://api.qdrant.tech/v-1-14-x/api-reference/collections/collection-exists) - Check if a collection exists +- [Delete Collection](https://api.qdrant.tech/v-1-14-x/api-reference/collections/delete-collection) - Delete a collection -_Available as of v1.5.0_ +### Point -Qdrant exposes three endpoints, namely -[`/healthz`](http://localhost:6333/healthz), -[`/livez`](http://localhost:6333/livez) and -[`/readyz`](http://localhost:6333/readyz), to indicate the current status of the -Qdrant server. 
+- [Upsert Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/upsert-points) - Insert or update points in a collection +- [Retrieve Point](https://api.qdrant.tech/v-1-14-x/api-reference/points/get-point) - Get a single point by ID +- [Retrieve Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/get-points) - Get multiple points by their IDs +- [Delete Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/delete-points) - Remove points from a collection +- [Count Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/count-points) - Count points in a collection with optional filtering +- [Scroll Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/scroll-points) - Scroll through all points in a collection +- [Batch Update Points](https://api.qdrant.tech/v-1-14-x/api-reference/points/batch-update) - Perform multiple point operations in a single request -These currently provide the most basic status response, returning HTTP 200 if -Qdrant is started and ready to be used. +### Vector -Regardless of whether an [API key](https://qdrant.tech/documentation/guides/security/#authentication) is configured, -the endpoints are always accessible. +- [Update Vectors](https://api.qdrant.tech/v-1-14-x/api-reference/points/update-vectors) - Update vectors for existing points +- [Delete Vectors](https://api.qdrant.tech/v-1-14-x/api-reference/points/delete-vectors) - Remove vectors from points -You can read more about Kubernetes health endpoints -[here](https://kubernetes.io/docs/reference/using-api/health-checks/). +### Search -##### Was this page useful? +- [Query Points](https://api.qdrant.tech/v-1-14-x/api-reference/search/query-points) - Search for similar vectors +- [Query Points In Batch](https://api.qdrant.tech/v-1-14-x/api-reference/search/query-batch-points) - Perform multiple vector searches in batch +- [Query Points Groups](https://api.qdrant.tech/v-1-14-x/api-reference/search/query-points-groups) - Group search results by payload field +- [Matrix Pairs](https://api.qdrant.tech/v-1-14-x/api-reference/search/matrix-pairs) - Calculate distance matrix between pairs of points +- [Matrix Offsets](https://api.qdrant.tech/v-1-14-x/api-reference/search/matrix-offsets) - Calculate distance matrix using offsets -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Payload -Thank you for your feedback! 🙏 +- [Set Payload](https://api.qdrant.tech/v-1-14-x/api-reference/points/set-payload) - Set payload for points +- [Overwrite Payload](https://api.qdrant.tech/v-1-14-x/api-reference/points/overwrite-payload) - Replace entire payload for points +- [Delete Payload](https://api.qdrant.tech/v-1-14-x/api-reference/points/delete-payload) - Remove payload from points +- [Clear Payload](https://api.qdrant.tech/v-1-14-x/api-reference/points/clear-payload) - Clear all payload fields +- [Payload Facets](https://api.qdrant.tech/v-1-14-x/api-reference/points/facet) - Get payload field statistics +- [Create Payload Index](https://api.qdrant.tech/v-1-14-x/api-reference/indexes/create-field-index) - Create an index for payload fields +- [Delete Payload Index](https://api.qdrant.tech/v-1-14-x/api-reference/indexes/delete-field-index) - Remove a payload field index -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/monitoring.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Further Reading -On this page: +- [N8N Reference](https://docs.n8n.io). +- [Qdrant Node Source](https://github.com/qdrant/n8n-nodes-qdrant). -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/monitoring.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +<|page-245-lllmstxt|> +# Neo4j GraphRAG -× +[Neo4j GraphRAG](https://neo4j.com/docs/neo4j-graphrag-python/current/) is a Python package to build graph retrieval augmented generation (GraphRAG) applications using Neo4j and Python. As a first-party library, it offers a robust, feature-rich, and high-performance solution, with the added assurance of long-term support and maintenance directly from Neo4j. It offers a Qdrant retriever natively to search for vectors stored in a Qdrant collection. -[Powered by](https://qdrant.tech/) +## Installation -<|page-136-lllmstxt|> -## cloud-pricing-payments -- [Documentation](https://qdrant.tech/documentation/) -- Billing & Payments +```bash +pip install neo4j-graphrag[qdrant] +``` -# [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#qdrant-cloud-billing--payments) Qdrant Cloud Billing & Payments +## Usage -Qdrant database clusters in Qdrant Cloud are priced based on CPU, memory, and disk storage usage. To get a clearer idea for the pricing structure, based on the amounts of vectors you want to store, please use our [Pricing Calculator](https://cloud.qdrant.io/calculator). +A vector query with Neo4j and Qdrant could look like: -## [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#billing) Billing +```python +from neo4j import GraphDatabase +from neo4j_graphrag.retrievers import QdrantNeo4jRetriever +from qdrant_client import QdrantClient +from examples.embedding_biology import EMBEDDING_BIOLOGY -You can pay for your Qdrant Cloud database clusters either with a credit card or through an AWS, GCP, or Azure Marketplace subscription. +NEO4J_URL = "neo4j://localhost:7687" +NEO4J_AUTH = ("neo4j", "password") -Your payment method is charged at the beginning of each month for the previous month’s usage. There is no difference in pricing between the different payment methods. +with GraphDatabase.driver(NEO4J_URL, auth=NEO4J_AUTH) as neo4j_driver: + retriever = QdrantNeo4jRetriever( + driver=neo4j_driver, + client=QdrantClient(url="http://localhost:6333"), + collection_name="{collection_name}", + id_property_external="neo4j_id", + id_property_neo4j="id", + ) -If you choose to pay through a marketplace, the Qdrant Cloud usage costs are added as usage units to your existing billing for your cloud provider services. A detailed breakdown of your usage is available in the Qdrant Cloud Console. +retriever.search(query_vector=[0.5523, 0.523, 0.132, 0.523, ...], top_k=5) +``` -Note: Even if you pay using a marketplace subscription, your database clusters will still be deployed into Qdrant-owned infrastructure. The setup and management of Qdrant database clusters will also still be done via the Qdrant Cloud Console UI. +Alternatively, you can use any [Langchain embeddings providers](https://python.langchain.com/docs/integrations/text_embedding/), to vectorize text queries automatically. 
-If you wish to deploy Qdrant database clusters into your own environment from Qdrant Cloud then we recommend our [Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) solution. +```python +from langchain_huggingface.embeddings import HuggingFaceEmbeddings +from neo4j import GraphDatabase +from neo4j_graphrag.retrievers import QdrantNeo4jRetriever +from qdrant_client import QdrantClient -![Payment Options](https://qdrant.tech/documentation/cloud/payment-options.png) +NEO4J_URL = "neo4j://localhost:7687" +NEO4J_AUTH = ("neo4j", "password") -### [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#credit-card) Credit Card +with GraphDatabase.driver(NEO4J_URL, auth=NEO4J_AUTH) as neo4j_driver: + embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") + retriever = QdrantNeo4jRetriever( + driver=neo4j_driver, + client=QdrantClient(url="http://localhost:6333"), + collection_name="{collection_name}", + id_property_external="neo4j_id", + id_property_neo4j="id", + embedder=embedder, + ) -Credit card payments are processed through Stripe. To set up a credit card, go to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/), select **Stripe** as the payment method, and enter your credit card details. +retriever.search(query_text="my user query", top_k=10) +``` -### [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#aws-marketplace) AWS Marketplace +## Further Reading -Our [AWS Marketplace](https://aws.amazon.com/marketplace/pp/prodview-rtphb42tydtzg) listing streamlines access to Qdrant for users who rely on Amazon Web Services for hosting and application development. +- [Neo4j GraphRAG Reference](https://neo4j.com/docs/neo4j-graphrag-python/current/index.html) +- [Qdrant Retriever Reference](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_rag.html#qdrant-neo4j-retriever-user-guide) +- [Source](https://github.com/neo4j/neo4j-graphrag-python/tree/main/src/neo4j_graphrag/retrievers/external/qdrant) -To subscribe: +<|page-246-lllmstxt|> +# Nomic -1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/) -2. Select **AWS Marketplace** as the payment method. You will be redirected to the AWS Marketplace listing for Qdrant. -3. Click the bright orange button - **View purchase options**. -4. On the next screen, under Purchase, click **Subscribe**. -5. Up top, on the green banner, click **Set up your account**. +The `nomic-embed-text-v1` model is an open source [8192 context length](https://github.com/nomic-ai/contrastors) text encoder. +While you can find it on the [Hugging Face Hub](https://huggingface.co/nomic-ai/nomic-embed-text-v1), +you may find it easier to obtain them through the [Nomic Text Embeddings](https://docs.nomic.ai/reference/endpoints/nomic-embed-text). +Once installed, you can configure it with the official Python client, FastEmbed or through direct HTTP requests. -You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters. + -### [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#gcp-marketplace) GCP Marketplace +You can use Nomic embeddings directly in Qdrant client calls. There is a difference in the way the embeddings +are obtained for documents and queries. 
-Our [GCP Marketplace](https://console.cloud.google.com/marketplace/product/qdrant-public/qdrant) listing streamlines access to Qdrant for users who rely on the Google Cloud Platform for hosting and application development.
+#### Upsert using [Nomic SDK](https://github.com/nomic-ai/nomic)
-To subscribe:
+The `task_type` parameter defines the embeddings that you get.
+For documents, set the `task_type` to `search_document`:
-1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/)
-2. Select **GCP Marketplace** as the payment method. You will be redirected to the GCP Marketplace listing for Qdrant.
-3. Select **Subscribe**. (If you have already subscribed, select **Manage on Provider**.)
-4. On the next screen, choose options as required, and select **Subscribe**.
-5. On the pop-up window that appers, select **Sign up with Qdrant**.
+```python
+from qdrant_client import QdrantClient, models
+from nomic import embed
-You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters.
+output = embed.text(
+    texts=["Qdrant is the best vector database!"],
+    model="nomic-embed-text-v1",
+    task_type="search_document",
+)
-### [Anchor](https://qdrant.tech/documentation/cloud-pricing-payments/\#azure-marketplace) Azure Marketplace
+client = QdrantClient()
+client.upsert(
+    collection_name="my-collection",
+    points=models.Batch(
+        ids=[1],
+        vectors=output["embeddings"],
+    ),
+)
+```
-Our [Azure Marketplace](https://portal.azure.com/#view/Microsoft_Azure_Marketplace/GalleryItemDetailsBladeNopdl/id/qdrantsolutionsgmbh1698769709989.qdrant-db/selectionMode~/false/resourceGroupId//resourceGroupLocation//dontDiscardJourney~/false/selectedMenuId/home/launchingContext~/%7B%22galleryItemId%22%3A%22qdrantsolutionsgmbh1698769709989.qdrant-dbqdrant_cloud_unit%22%2C%22source%22%3A%5B%22GalleryFeaturedMenuItemPart%22%2C%22VirtualizedTileDetails%22%5D%2C%22menuItemId%22%3A%22home%22%2C%22subMenuItemId%22%3A%22Search%20results%22%2C%22telemetryId%22%3A%221df5537b-8b29-4200-80ce-0cd38c7e0e56%22%7D/searchTelemetryId/6b44fb90-7b9c-4286-aad8-59f88f3cc2ff) listing streamlines access to Qdrant for users who rely on Microsoft Azure for hosting and application development.
+#### Upsert using [FastEmbed](https://github.com/qdrant/fastembed)
-To subscribe:
+```python
+from fastembed import TextEmbedding
+from qdrant_client import QdrantClient, models
-1. Go to Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/)
-2. Select **Azure Marketplace** as the payment method. You will be redirected to the Azure Marketplace listing for Qdrant.
-3. Select **Subscribe**.
-4. On the next screen, choose options as required, and select **Review + Subscribe**.
-5. After reviewing all settings, select **Subscribe**.
-6. Once the SaaS subscription is created, select **Configure account now**.
+model = TextEmbedding("nomic-ai/nomic-embed-text-v1")
-You will be redirected to the Billing Details screen in the [Qdrant Cloud Console](https://cloud.qdrant.io/). From there you can start to create Qdrant database clusters.
+output = model.embed(["Qdrant is the best vector database!"])
-##### Was this page useful?
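+# FastEmbed's embed() lazily yields one numpy array per input text; the upsert below converts each to a plain Python list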
+client = QdrantClient() +client.upsert( + collection_name="my-collection", + points=models.Batch( + ids=[1], + vectors=[embeddings.tolist() for embeddings in output], + ), +) +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +#### Search using [Nomic SDK](https://github.com/nomic-ai/nomic) -Thank you for your feedback! 🙏 +To query the collection, set the `task_type` to `search_query`: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-pricing-payments.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +output = embed.text( + texts=["What is the best vector database?"], + model="nomic-embed-text-v1", + task_type="search_query", +) -On this page: +client.search( + collection_name="my-collection", + query_vector=output["embeddings"][0], +) +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-pricing-payments.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +#### Search using [FastEmbed](https://github.com/qdrant/fastembed) -× +```python +output = next(model.embed("What is the best vector database?")) -[Powered by](https://qdrant.tech/) +client.search( + collection_name="my-collection", + query_vector=output.tolist(), +) +``` -<|page-137-lllmstxt|> -## frameworks -- [Documentation](https://qdrant.tech/documentation/) -- Frameworks +For more information, see the Nomic documentation on [Text embeddings](https://docs.nomic.ai/reference/endpoints/nomic-embed-text). -## [Anchor](https://qdrant.tech/documentation/frameworks/\#framework-integrations) Framework Integrations +<|page-247-lllmstxt|> +# Nvidia -| Framework | Description | -| --- | --- | -| [AutoGen](https://qdrant.tech/documentation/frameworks/autogen/) | Framework from Microsoft building LLM applications using multiple conversational agents. | -| [Camel](https://qdrant.tech/documentation/frameworks/camel/) | Framework to build and use LLM-based agents for real-world task solving | -| [Canopy](https://qdrant.tech/documentation/frameworks/canopy/) | Framework from Pinecone for building RAG applications using LLMs and knowledge bases. | -| [Cheshire Cat](https://qdrant.tech/documentation/frameworks/cheshire-cat/) | Framework to create personalized AI assistants using custom data. | -| [CrewAI](https://qdrant.tech/documentation/frameworks/crewai/) | CrewAI is a framework to build automated workflows using multiple AI agents that perform complex tasks. | -| [Dagster](https://qdrant.tech/documentation/frameworks/dagster/) | Python framework for data orchestration with integrated lineage, observability. | -| [DeepEval](https://qdrant.tech/documentation/frameworks/deepeval/) | Python framework for testing large language model systems. | -| [DocArray](https://qdrant.tech/documentation/frameworks/docarray/) | Python library for managing data in multi-modal AI applications. | -| [DSPy](https://qdrant.tech/documentation/frameworks/dspy/) | Framework for algorithmically optimizing LM prompts and weights. | -| [dsRAG](https://qdrant.tech/documentation/frameworks/dsrag/) | High-performance Python retrieval engine for unstructured data. 
| -| [Dynamiq](https://qdrant.tech/documentation/frameworks/dynamiq/) | Dynamiq is all-in-one Gen AI framework, designed to streamline the development of AI-powered applications. | -| [Feast](https://qdrant.tech/documentation/frameworks/feast/) | Open-source feature store to operate production ML systems at scale as a set of features. | -| [Fifty-One](https://qdrant.tech/documentation/frameworks/fifty-one/) | Toolkit for building high-quality datasets and computer vision models. | -| [Genkit](https://qdrant.tech/documentation/frameworks/genkit/) | Framework to build, deploy, and monitor production-ready AI-powered apps. | -| [Haystack](https://qdrant.tech/documentation/frameworks/haystack/) | LLM orchestration framework to build customizable, production-ready LLM applications. | -| [HoneyHive](https://qdrant.tech/documentation/frameworks/honeyhive/) | AI observability and evaluation platform that provides tracing and monitoring tools for GenAI pipelines. | -| [Lakechain](https://qdrant.tech/documentation/frameworks/lakechain/) | Python framework for deploying document processing pipelines on AWS using infrastructure-as-code. | -| [Langchain](https://qdrant.tech/documentation/frameworks/langchain/) | Python framework for building context-aware, reasoning applications using LLMs. | -| [Langchain-Go](https://qdrant.tech/documentation/frameworks/langchain-go/) | Go framework for building context-aware, reasoning applications using LLMs. | -| [Langchain4j](https://qdrant.tech/documentation/frameworks/langchain4j/) | Java framework for building context-aware, reasoning applications using LLMs. | -| [LangGraph](https://qdrant.tech/documentation/frameworks/langgraph/) | Python, Javascript libraries for building stateful, multi-actor applications. | -| [LlamaIndex](https://qdrant.tech/documentation/frameworks/llama-index/) | A data framework for building LLM applications with modular integrations. | -| [Mastra](https://qdrant.tech/documentation/frameworks/mastra/) | Typescript framework to build AI applications and features quickly. | -| [Mirror Security](https://qdrant.tech/documentation/frameworks/mirror-security/) | Python framework for vector encryption and access control. | -| [Mem0](https://qdrant.tech/documentation/frameworks/mem0/) | Self-improving memory layer for LLM applications, enabling personalized AI experiences. | -| [Neo4j GraphRAG](https://qdrant.tech/documentation/frameworks/neo4j-graphrag/) | Package to build graph retrieval augmented generation (GraphRAG) applications using Neo4j and Python. | -| [NLWeb](https://qdrant.tech/documentation/frameworks/nlweb/) | A framework to turn websites into chat-ready data using schema.org and associated data formats. | -| [OpenAI Agents](https://qdrant.tech/documentation/frameworks/openai-agents/) | Python framework for managing multiple AI agents that can work together. | -| [Pandas-AI](https://qdrant.tech/documentation/frameworks/pandas-ai/) | Python library to query/visualize your data (CSV, XLSX, PostgreSQL, etc.) in natural language | -| [Ragbits](https://qdrant.tech/documentation/frameworks/ragbits/) | Python package that offers essential “bits” for building powerful Retrieval-Augmented Generation (RAG) applications. | -| [Rig-rs](https://qdrant.tech/documentation/frameworks/rig-rs/) | Rust library for building scalable, modular, and ergonomic LLM-powered applications. 
| -| [Semantic Router](https://qdrant.tech/documentation/frameworks/semantic-router/) | Python library to build a decision-making layer for AI applications using vector search. | -| [SmolAgents](https://qdrant.tech/documentation/frameworks/smolagents/) | Barebones library for agents. Agents write python code to call tools and orchestrate other agent. | -| [Solon](https://qdrant.tech/documentation/frameworks/solon/) | A lightweight, high-performance Java enterprise framework | -| [Spring AI](https://qdrant.tech/documentation/frameworks/spring-ai/) | Java AI framework for building with Spring design principles such as portability and modular design. | -| [Superduper](https://qdrant.tech/documentation/frameworks/superduper/) | Framework for building flexible, compositional AI apps which may be applied directly to databases. | -| [Sycamore](https://qdrant.tech/documentation/frameworks/sycamore/) | Document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data. | -| [Testcontainers](https://qdrant.tech/documentation/frameworks/testcontainers/) | Framework for providing throwaway, lightweight instances of systems for testing | -| [txtai](https://qdrant.tech/documentation/frameworks/txtai/) | Python library for semantic search, LLM orchestration and language model workflows. | -| [Vanna AI](https://qdrant.tech/documentation/frameworks/vanna-ai/) | Python RAG framework for SQL generation and querying. | - -##### Was this page useful? - -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No - -Thank you for your feedback! 🙏 - -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/frameworks/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. - -On this page: - -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/frameworks/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) - -× - -[Powered by](https://qdrant.tech/) +Qdrant supports working with [Nvidia embeddings](https://build.nvidia.com/explore/retrieval). -<|page-138-lllmstxt|> -## rag-is-dead -- [Articles](https://qdrant.tech/articles/) -- Is RAG Dead? The Role of Vector Databases in Vector Search \| Qdrant +You can generate an API key to authenticate the requests from the [Nvidia Playground](). -[Back to RAG & GenAI](https://qdrant.tech/articles/rag-and-genai/) +### Setting up the Qdrant client and Nvidia session -# Is RAG Dead? The Role of Vector Databases in Vector Search \| Qdrant +```python +import requests +from qdrant_client import QdrantClient -David Myriel +NVIDIA_BASE_URL = "https://ai.api.nvidia.com/v1/retrieval/nvidia/embeddings" -· +NVIDIA_API_KEY = "" -February 27, 2024 +nvidia_session = requests.Session() -![Is RAG Dead? The Role of Vector Databases in Vector Search | Qdrant](https://qdrant.tech/articles_data/rag-is-dead/preview/title.jpg) +client = QdrantClient(":memory:") -# [Anchor](https://qdrant.tech/articles/rag-is-dead/\#is-rag-dead-the-role-of-vector-databases-in-ai-efficiency-and-vector-search) Is RAG Dead? 
The Role of Vector Databases in AI Efficiency and Vector Search +headers = { + "Authorization": f"Bearer {NVIDIA_API_KEY}", + "Accept": "application/json", +} -When Anthropic came out with a context window of 100K tokens, they said: “ _[Vector search](https://qdrant.tech/solutions/) is dead. LLMs are getting more accurate and won’t need RAG anymore._” +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -Google’s Gemini 1.5 now offers a context window of 10 million tokens. [Their supporting paper](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf) claims victory over accuracy issues, even when applying Greg Kamradt’s [NIAH methodology](https://twitter.com/GregKamradt/status/1722386725635580292). +```typescript +import { QdrantClient } from '@qdrant/js-client-rest'; -_It’s over. [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/) (Retrieval Augmented Generation) must be completely obsolete now. Right?_ +const NVIDIA_BASE_URL = "https://ai.api.nvidia.com/v1/retrieval/nvidia/embeddings" +const NVIDIA_API_KEY = "" -No. +const client = new QdrantClient({ url: 'http://localhost:6333' }); -Larger context windows are never the solution. Let me repeat. Never. They require more computational resources and lead to slower processing times. +const headers = { + "Authorization": "Bearer " + NVIDIA_API_KEY, + "Accept": "application/json", + "Content-Type": "application/json" +} -The community is already stress testing Gemini 1.5: +const texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -![RAG and Gemini 1.5](https://qdrant.tech/articles_data/rag-is-dead/rag-is-dead-1.png) +The following example shows how to embed documents with the `embed-qa-4` model that generates sentence embeddings of size 1024. -This is not surprising. LLMs require massive amounts of compute and memory to run. To cite Grant, running such a model by itself “would deplete a small coal mine to generate each completion”. Also, who is waiting 30 seconds for a response? +### Embedding documents -## [Anchor](https://qdrant.tech/articles/rag-is-dead/\#context-stuffing-is-not-the-solution) Context stuffing is not the solution +```python +payload = { + "input": texts, + "input_type": "passage", + "model": "NV-Embed-QA", +} -> Relying on context is expensive, and it doesn’t improve response quality in real-world applications. Retrieval based on [vector search](https://qdrant.tech/solutions/) offers much higher precision. +response_body = nvidia_session.post( + NVIDIA_BASE_URL, headers=headers, json=payload +).json() +``` -If you solely rely on an [LLM](https://qdrant.tech/articles/what-is-rag-in-ai/) to perfect retrieval and precision, you are doing it wrong. +```typescript +let body = { + "input": texts, + "input_type": "passage", + "model": "NV-Embed-QA" +} -A large context window makes it harder to focus on relevant information. This increases the risk of errors or hallucinations in its responses. +let response = await fetch(NVIDIA_BASE_URL, { + method: "POST", + body: JSON.stringify(body), + headers +}); -Google found Gemini 1.5 significantly more accurate than GPT-4 at shorter context lengths and “a very small decrease in recall towards 1M tokens”. The recall is still below 0.8. 
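+// Parse the response body; the service returns one embedding object per input text in its `data` array, used below to build points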
+let response_body = await response.json() +``` -![Gemini 1.5 Data](https://qdrant.tech/articles_data/rag-is-dead/rag-is-dead-2.png) +### Converting the model outputs to Qdrant points -We don’t think 60-80% is good enough. The LLM might retrieve enough relevant facts in its context window, but it still loses up to 40% of the available information. +```python +from qdrant_client.models import PointStruct -> The whole point of vector search is to circumvent this process by efficiently picking the information your app needs to generate the best response. A [vector database](https://qdrant.tech/) keeps the compute load low and the query response fast. You don’t need to wait for the LLM at all. +points = [ + PointStruct( + id=idx, + vector=data["embedding"], + payload={"text": text}, + ) + for idx, (data, text) in enumerate(zip(response_body["data"], texts)) +] +``` -Qdrant’s benchmark results are strongly in favor of accuracy and efficiency. We recommend that you consider them before deciding that an LLM is enough. Take a look at our [open-source benchmark reports](https://qdrant.tech/benchmarks/) and [try out the tests](https://github.com/qdrant/vector-db-benchmark) yourself. +```typescript +let points = response_body.data.map((data, i) => { + return { + id: i, + vector: data.embedding, + payload: { + text: texts[i] + } + } +}) +``` -## [Anchor](https://qdrant.tech/articles/rag-is-dead/\#vector-search-in-compound-systems) Vector search in compound systems +### Creating a collection to insert the documents -The future of AI lies in careful system engineering. As per [Zaharia et al.](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/), results from Databricks find that “60% of LLM applications use some form of RAG, while 30% use multi-step chains.” +```python +from qdrant_client.models import VectorParams, Distance -Even Gemini 1.5 demonstrates the need for a complex strategy. When looking at [Google’s MMLU Benchmark](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf), the model was called 32 times to reach a score of 90.0% accuracy. This shows us that even a basic compound arrangement is superior to monolithic models. +collection_name = "example_collection" -As a retrieval system, a [vector database](https://qdrant.tech/) perfectly fits the need for compound systems. Introducing them into your design opens the possibilities for superior applications of LLMs. It is superior because it’s faster, more accurate, and much cheaper to run. +client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1024, + distance=Distance.COSINE, + ), +) +client.upsert(collection_name, points) +``` -> The key advantage of RAG is that it allows an LLM to pull in real-time information from up-to-date internal and external knowledge sources, making it more dynamic and adaptable to new information. - Oliver Molander, CEO of IMAGINAI +```typescript +const COLLECTION_NAME = "example_collection" -## [Anchor](https://qdrant.tech/articles/rag-is-dead/\#qdrant-scales-to-enterprise-rag-scenarios) Qdrant scales to enterprise RAG scenarios +await client.createCollection(COLLECTION_NAME, { + vectors: { + size: 1024, + distance: 'Cosine', + } +}); -People still don’t understand the economic benefit of vector databases. Why would a large corporate AI system need a standalone vector database like [Qdrant](https://qdrant.tech/)? In our minds, this is the most important question. Let’s pretend that LLMs cease struggling with context thresholds altogether. 
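+// Upload the prepared points; `wait: true` makes the request block until the changes are applied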
+await client.upsert(COLLECTION_NAME, { + wait: true, + points +}) +``` -**How much would all of this cost?** +## Searching for documents with Qdrant -If you are running a RAG solution in an enterprise environment with petabytes of private data, your compute bill will be unimaginable. Let’s assume 1 cent per 1K input tokens (which is the current GPT-4 Turbo pricing). Whatever you are doing, every time you go 100 thousand tokens deep, it will cost you $1. +Once the documents are added, you can search for the most relevant documents. -That’s a buck a question. +```python +payload = { + "input": "What is the best to use for vector search scaling?", + "input_type": "query", + "model": "NV-Embed-QA", +} -> According to our estimations, vector search queries are **at least** 100 million times cheaper than queries made by LLMs. +response_body = nvidia_session.post( + NVIDIA_BASE_URL, headers=headers, json=payload +).json() -Conversely, the only up-front investment with vector databases is the indexing (which requires more compute). After this step, everything else is a breeze. Once setup, Qdrant easily scales via [features like Multitenancy and Sharding](https://qdrant.tech/articles/multitenancy/). This lets you scale up your reliance on the vector retrieval process and minimize your use of the compute-heavy LLMs. As an optimization measure, Qdrant is irreplaceable. +client.search( + collection_name=collection_name, + query_vector=response_body["data"][0]["embedding"], +) +``` -Julien Simon from HuggingFace says it best: +```typescript +body = { + "input": "What is the best to use for vector search scaling?", + "input_type": "query", + "model": "NV-Embed-QA", +} -> RAG is not a workaround for limited context size. For mission-critical enterprise use cases, RAG is a way to leverage high-value, proprietary company knowledge that will never be found in public datasets used for LLM training. At the moment, the best place to index and query this knowledge is some sort of vector index. In addition, RAG downgrades the LLM to a writing assistant. Since built-in knowledge becomes much less important, a nice small 7B open-source model usually does the trick at a fraction of the cost of a huge generic model. +response = await fetch(NVIDIA_BASE_URL, { + method: "POST", + body: JSON.stringify(body), + headers +}); -## [Anchor](https://qdrant.tech/articles/rag-is-dead/\#get-superior-accuracy-with-qdrants-vector-database) Get superior accuracy with Qdrant’s vector database +response_body = await response.json() -As LLMs continue to require enormous computing power, users will need to leverage vector search and [RAG](https://qdrant.tech/rag/rag-evaluation-guide/). +await client.search(COLLECTION_NAME, { + vector: response_body.data[0].embedding, +}); +``` -Our customers remind us of this fact every day. As a product, [our vector database](https://qdrant.tech/) is highly scalable and business-friendly. We develop our features strategically to follow our company’s Unix philosophy. +<|page-248-lllmstxt|> +# Using Ollama with Qdrant -We want to keep Qdrant compact, efficient and with a focused purpose. This purpose is to empower our customers to use it however they see fit. +[Ollama](https://ollama.com) provides specialized embeddings for niche applications. Ollama supports a [variety of embedding models](https://ollama.com/search?c=embedding), making it possible to build retrieval augmented generation (RAG) applications that combine text prompts with existing documents or other data in specialized areas. 
-When large enterprises release their generative AI into production, they need to keep costs under control, while retaining the best possible quality of responses. Qdrant has the [vector search solutions](https://qdrant.tech/solutions/) to do just that. Revolutionize your vector search capabilities and get started with [a Qdrant demo](https://qdrant.tech/contact-us/). +## Installation -##### Was this page useful? +You can install the required packages using the following pip command: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```bash +pip install ollama qdrant-client +``` -Thank you for your feedback! 🙏 +## Integration Example -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/rag-is-dead.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +The following code assumes Ollama is accessible at port `11434` and Qdrant at port `6334`. -On this page: +```python +from qdrant_client import QdrantClient, models +import ollama -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/rag-is-dead.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +COLLECTION_NAME = "NicheApplications" -× +# Initialize Ollama client +oclient = ollama.Client(host="localhost") -[Powered by](https://qdrant.tech/) +# Initialize Qdrant client +qclient = QdrantClient(host="localhost", port=6333) -<|page-139-lllmstxt|> -## bm42 -- [Articles](https://qdrant.tech/articles/) -- BM42: New Baseline for Hybrid Search +# Text to embed +text = "Ollama excels in niche applications with specific embeddings" -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +# Generate embeddings +response = oclient.embeddings(model="llama3.2", prompt=text) +embeddings = response["embedding"] -# BM42: New Baseline for Hybrid Search +# Create a collection if it doesn't already exist +if not qclient.collection_exists(COLLECTION_NAME): + qclient.create_collection( + collection_name=COLLECTION_NAME, + vectors_config=models.VectorParams( + size=len(embeddings), distance=models.Distance.COSINE + ), + ) -Andrey Vasnetsov +# Upload the vectors to the collection along with the original text as payload +qclient.upsert( + collection_name=COLLECTION_NAME, + points=[models.PointStruct(id=1, vector=embeddings, payload={"text": text})], +) -· +``` -July 01, 2024 +<|page-249-lllmstxt|> +# OpenAI -![BM42: New Baseline for Hybrid Search](https://qdrant.tech/articles_data/bm42/preview/title.jpg) +Qdrant supports working with [OpenAI embeddings](https://platform.openai.com/docs/guides/embeddings/embeddings). -For the last 40 years, BM25 has served as the standard for search engines. -It is a simple yet powerful algorithm that has been used by many search engines, including Google, Bing, and Yahoo. +There is an official OpenAI Python package that simplifies obtaining them, and it can be installed with pip: -Though it seemed that the advent of vector search would diminish its influence, it did so only partially. -The current state-of-the-art approach to retrieval nowadays tries to incorporate BM25 along with embeddings into a hybrid search system. +```bash +pip install openai +``` -However, the use case of text retrieval has significantly shifted since the introduction of RAG. 
-Many assumptions upon which BM25 was built are no longer valid. +### Setting up the OpenAI and Qdrant clients -For example, the typical length of documents and queries vary significantly between traditional web search and modern RAG systems. +```python +import openai +import qdrant_client -In this article, we will recap what made BM25 relevant for so long and why alternatives have struggled to replace it. Finally, we will discuss BM42, as the next step in the evolution of lexical search. +openai_client = openai.Client( + api_key="" +) -## [Anchor](https://qdrant.tech/articles/bm42/\#why-has-bm25-stayed-relevant-for-so-long) Why has BM25 stayed relevant for so long? +client = qdrant_client.QdrantClient(":memory:") -To understand why, we need to analyze its components. +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -The famous BM25 formula is defined as: +The following example shows how to embed a document with the `text-embedding-3-small` model that generates sentence embeddings of size 1536. You can find the list of all supported models [here](https://platform.openai.com/docs/models/embeddings). -score(D,Q)=∑i=1NIDF(qi)×f(qi,D)⋅(k1+1)f(qi,D)+k1⋅(1−b+b⋅\|D\|avgdl) +### Embedding a document -Let’s simplify this to gain a better understanding. +```python +embedding_model = "text-embedding-3-small" -- The score(D,Q) \- means that we compute the score for each pair of document D and query Q. +result = openai_client.embeddings.create(input=texts, model=embedding_model) +``` -- The ∑i=1N \- means that each of N terms in the query contribute to the final score as a part of the sum. +### Converting the model outputs to Qdrant points -- The IDF(qi) \- is the inverse document frequency. The more rare the term qi is, the more it contributes to the score. A simplified formula for this is: +```python +from qdrant_client.models import PointStruct +points = [ + PointStruct( + id=idx, + vector=data.embedding, + payload={"text": text}, + ) + for idx, (data, text) in enumerate(zip(result.data, texts)) +] +``` -IDF(qi)=Number of documentsNumber of documents with qi +### Creating a collection to insert the documents -It is fair to say that the `IDF` is the most important part of the BM25 formula. -`IDF` selects the most important terms in the query relative to the specific document collection. -So intuitively, we can interpret the `IDF` as **term importance within the corpora**. +```python +from qdrant_client.models import VectorParams, Distance -That explains why BM25 is so good at handling queries, which dense embeddings consider out-of-domain. +collection_name = "example_collection" -The last component of the formula can be intuitively interpreted as **term importance within the document**. -This might look a bit complicated, so let’s break it down. +client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1536, + distance=Distance.COSINE, + ), +) +client.upsert(collection_name, points) +``` + +## Searching for documents with Qdrant -Term importance in document (qi)=f(qi,D)⋅(k1+1)f(qi,D)+k1⋅(1−b+b⋅\|D\|avgdl) +Once the documents are indexed, you can search for the most relevant documents using the same model. 
+ +```python +client.search( + collection_name=collection_name, + query_vector=openai_client.embeddings.create( + input=["What is the best to use for vector search scaling?"], + model=embedding_model, + ) + .data[0] + .embedding, +) +``` -- The f(qi,D) \- is the frequency of the term qi in the document D. Or in other words, the number of times the term qi appears in the document D. -- The k1 and b are the hyperparameters of the BM25 formula. In most implementations, they are constants set to k1=1.5 and b=0.75. Those constants define relative implications of the term frequency and the document length in the formula. -- The \|D\|avgdl \- is the relative length of the document D compared to the average document length in the corpora. The intuition befind this part is following: if the token is found in the smaller document, it is more likely that this token is important for this document. +## Using OpenAI Embedding Models with Qdrant's Binary Quantization -#### [Anchor](https://qdrant.tech/articles/bm42/\#will-bm25-term-importance-in-the-document-work-for-rag) Will BM25 term importance in the document work for RAG? +You can use OpenAI embedding Models with [Binary Quantization](/articles/binary-quantization/) - a technique that allows you to reduce the size of the embeddings by 32 times without losing the quality of the search results too much. -As we can see, the _term importance in the document_ heavily depends on the statistics within the document. Moreover, statistics works well if the document is long enough. -Therefore, it is suitable for searching webpages, books, articles, etc. -However, would it work as well for modern search applications, such as RAG? Let’s see. +|Method|Dimensionality|Test Dataset|Recall|Oversampling| +|-|-|-|-|-| +|OpenAI text-embedding-3-large|3072|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 0.9966|3x| +|OpenAI text-embedding-3-small|1536|[DBpedia 100K](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K)| 0.9847|3x| +|OpenAI text-embedding-3-large|1536|[DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M)| 0.9826|3x| +|OpenAI text-embedding-ada-002|1536|[DbPedia 1M](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) |0.98|4x| -The typical length of a document in RAG is much shorter than that of web search. In fact, even if we are working with webpages and articles, we would prefer to split them into chunks so that -a) Dense models can handle them and -b) We can pinpoint the exact part of the document which is relevant to the query +<|page-250-lllmstxt|> +# Pipedream -As a result, the document size in RAG is small and fixed. +[Pipedream](https://pipedream.com/) is a development platform that allows developers to connect many different applications, data sources, and APIs in order to build automated cross-platform workflows. It also offers code-level control with Node.js, Python, Go, or Bash if required. -That effectively renders the term importance in the document part of the BM25 formula useless. -The term frequency in the document is always 0 or 1, and the relative length of the document is always 1. +You can use the [Qdrant app](https://pipedream.com/apps/qdrant) in Pipedream to add vector search capabilities to your workflows. -So, the only part of the BM25 formula that is still relevant for RAG is `IDF`. Let’s see how we can leverage it. 
+## Prerequisites -## [Anchor](https://qdrant.tech/articles/bm42/\#why-splade-is-not-always-the-answer) Why SPLADE is not always the answer +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A [Pipedream project](https://pipedream.com/) to develop your workflows. -Before discussing our new approach, let’s examine the current state-of-the-art alternative to BM25 - SPLADE. +## Setting Up -The idea behind SPLADE is interesting—what if we let a smart, end-to-end trained model generate a bag-of-words representation of the text for us? -It will assign all the weights to the tokens, so we won’t need to bother with statistics and hyperparameters. -The documents are then represented as a sparse embedding, where each token is represented as an element of the sparse vector. +Search for the Qdrant app in your workflow apps. -And it works in academic benchmarks. Many papers report that SPLADE outperforms BM25 in terms of retrieval quality. -This performance, however, comes at a cost. +![Qdrant Pipedream App](/documentation/frameworks/pipedream/qdrant-app.png) -- **Inappropriate Tokenizer**: To incorporate transformers for this task, SPLADE models require using a standard transformer tokenizer. These tokenizers are not designed for retrieval tasks. For example, if the word is not in the (quite limited) vocabulary, it will be either split into subwords or replaced with a `[UNK]` token. This behavior works well for language modeling but is completely destructive for retrieval tasks. +The Qdrant app offers extensible API interface and pre-built actions. -- **Expensive Token Expansion**: In order to compensate the tokenization issues, SPLADE uses _token expansion_ technique. This means that we generate a set of similar tokens for each token in the query. There are a few problems with this approach: +![Qdrant App Features](/documentation/frameworks/pipedream/app-features.png) - - It is computationally and memory expensive. We need to generate more values for each token in the document, which increases both the storage size and retrieval time. - - It is not always clear where to stop with the token expansion. The more tokens we generate, the more likely we are to get the relevant one. But simultaneously, the more tokens we generate, the more likely we are to get irrelevant results. - - Token expansion dilutes the interpretability of the search. We can’t say which tokens were used in the document and which were generated by the token expansion. -- **Domain and Language Dependency**: SPLADE models are trained on specific corpora. This means that they are not always generalizable to new or rare domains. As they don’t use any statistics from the corpora, they cannot adapt to the new domain without fine-tuning. +Select any of the actions of the app to set up a connection. -- **Inference Time**: Additionally, currently available SPLADE models are quite big and slow. They usually require a GPU to make the inference in a reasonable time. +![Qdrant Connect Account](/documentation/frameworks/pipedream/app-upsert-action.png) +Configure connection with the credentials of your Qdrant instance. -At Qdrant, we acknowledge the aforementioned problems and are looking for a solution. -Our idea was to combine the best of both worlds - the simplicity and interpretability of BM25 and the intelligence of transformers while avoiding the pitfalls of SPLADE. 
+![Qdrant Connection Credentials](/documentation/frameworks/pipedream/app-connection.png) -And here is what we came up with. +You can verify your credentials using the "Test Connection" button. -## [Anchor](https://qdrant.tech/articles/bm42/\#the-best-of-both-worlds) The best of both worlds +Once a connection is set up, you can use the app to build workflows with the [2000+ apps supported by Pipedream](https://pipedream.com/apps/). -As previously mentioned, `IDF` is the most important part of the BM25 formula. In fact it is so important, that we decided to build its calculation into the Qdrant engine itself. -Check out our latest [release notes](https://github.com/qdrant/qdrant/releases/tag/v1.10.0). This type of separation allows streaming updates of the sparse embeddings while keeping the `IDF` calculation up-to-date. +## Further Reading -As for the second part of the formula, _the term importance within the document_ needs to be rethought. +- [Pipedream Documentation](https://pipedream.com/docs). +- [Qdrant Cloud Authentication](https://qdrant.tech/documentation/cloud/authentication/). +- [Source Code](https://github.com/PipedreamHQ/pipedream/tree/master/components/qdrant) -Since we can’t rely on the statistics within the document, we can try to use the semantics of the document instead. -And semantics is what transformers are good at. Therefore, we only need to solve two problems: +<|page-251-lllmstxt|> +# Power Apps -- How does one extract the importance information from the transformer? -- How can tokenization issues be avoided? +Microsoft [Power Apps](https://www.microsoft.com/en-us/power-platform/products/power-apps) is a suite of apps, services, and connectors that provides a rapid development environment to build custom apps for your business needs. You can quickly build custom business apps that connect to your data stored in many online and on-premises data sources. -### [Anchor](https://qdrant.tech/articles/bm42/\#attention-is-all-you-need) Attention is all you need +You can use the [Qdrant Connector](https://learn.microsoft.com/en-us/connectors/qdrant/) in Power Apps to add vector search capabilities to your flows. -Transformer models, even those used to generate embeddings, generate a bunch of different outputs. -Some of those outputs are used to generate embeddings. +## Prerequisites -Others are used to solve other kinds of tasks, such as classification, text generation, etc. +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A [Power Apps account](https://www.microsoft.com/en-in/power-platform/products/power-apps/) to develop your flows. -The one particularly interesting output for us is the attention matrix. +## Setting Up -![Attention matrix](https://qdrant.tech/articles_data/bm42/attention-matrix.png) +Search for the Qdrant connector when adding a new action in a Power Apps flow. The connector offers an exhaustive list of pre-built Qdrant actions. -Attention matrix +![Qdrant Connector](/documentation/platforms/powerapps/qdrant-operations.png) -The attention matrix is a square matrix, where each row and column corresponds to the token in the input sequence. -It represents the importance of each token in the input sequence for each other. +Set up a connection with the credentials of your Qdrant instance. The connection is verified upon creation. 
-The classical transformer models are trained to predict masked tokens in the context, so the attention weights define which context tokens influence the masked token most. +![Qdrant Connection Credentials](/documentation/platforms/powerapps/qdrant-connection.png) -Apart from regular text tokens, the transformer model also has a special token called `[CLS]`. This token represents the whole sequence in the classification tasks, which is exactly what we need. +Once a connection is set up, you can now build flows paired with the various connectors [supported by Power Apps](https://learn.microsoft.com/en-us/connectors/connector-reference/connector-reference-powerapps-connectors/). -By looking at the attention row for the `[CLS]` token, we can get the importance of each token in the document for the whole document. +## Further Reading -```python -sentences = "Hello, World - is the starting point in most programming languages" +- [Power Apps Documentation](https://learn.microsoft.com/en-us/power-apps/). +- [Power Apps Guides](https://learn.microsoft.com/en-us/training/powerplatform/power-apps). -features = transformer.tokenize(sentences) +<|page-252-lllmstxt|> +# Prem AI -# ... +[PremAI](https://premai.io/) is a unified generative AI development platform for fine-tuning deploying, and monitoring AI models. -attentions = transformer.auto_model(**features, output_attentions=True).attentions +Qdrant is compatible with PremAI APIs. -weights = torch.mean(attentions[-1][0,:,0], axis=0) -# â–Č â–Č â–Č â–Č -# │ │ │ └─── [CLS] token is the first one -# │ │ └─────── First item of the batch -# │ └────────── Last transformer layer -# └────────────────────────── Average all 6 attention heads +### Installing the SDKs -for weight, token in zip(weights, tokens): - print(f"{token}: {weight}") +```bash +pip install premai qdrant-client +``` -# [CLS] : 0.434 // Filter out the [CLS] token -# hello : 0.039 -# , : 0.039 -# world : 0.107 // <-- The most important token -# - : 0.033 -# is : 0.024 -# the : 0.031 -# starting : 0.054 -# point : 0.028 -# in : 0.018 -# most : 0.016 -# programming : 0.060 // <-- The third most important token -# languages : 0.062 // <-- The second most important token -# [SEP] : 0.047 // Filter out the [SEP] token +To install the npm package: +```bash +npm install @premai/prem-sdk @qdrant/js-client-rest ``` -The resulting formula for the BM42 score would look like this: +### Import all required packages -score(D,Q)=∑i=1NIDF(qi)×Attention(CLS,qi) +```python +from premai import Prem -Note that classical transformers have multiple attention heads, so we can get multiple importance vectors for the same document. The simplest way to combine them is to simply average them. +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams +``` -These averaged attention vectors make up the importance information we were looking for. -The best part is, one can get them from any transformer model, without any additional training. -Therefore, BM42 can support any natural language as long as there is a transformer model for it. +```typescript +import Prem from '@premai/prem-sdk'; +import { QdrantClient } from '@qdrant/js-client-rest'; +``` -In our implementation, we use the `sentence-transformers/all-MiniLM-L6-v2` model, which gives a huge boost in the inference speed compared to the SPLADE models. In practice, any transformer model can be used. -It doesn’t require any additional training, and can be easily adapted to work as BM42 backend. 
+### Define all the constants -### [Anchor](https://qdrant.tech/articles/bm42/\#wordpiece-retokenization) WordPiece retokenization +We need to define the project ID and the embedding model to use. You can learn more about obtaining these in the PremAI [docs](https://docs.premai.io/quick-start). -The final piece of the puzzle we need to solve is the tokenization issue. In order to get attention vectors, we need to use native transformer tokenization. -But this tokenization is not suitable for the retrieval tasks. What can we do about it? -Actually, the solution we came up with is quite simple. We reverse the tokenization process after we get the attention vectors. +```python +PROJECT_ID = 123 +EMBEDDING_MODEL = "text-embedding-3-large" +COLLECTION_NAME = "prem-collection-py" +QDRANT_SERVER_URL = "http://localhost:6333" +DOCUMENTS = [ + "This is a sample python document", + "We will be using qdrant and premai python sdk" +] +``` -Transformers use [WordPiece](https://huggingface.co/learn/nlp-course/en/chapter6/6) tokenization. -In case it sees the word, which is not in the vocabulary, it splits it into subwords. +```typescript +const PROJECT_ID = 123; +const EMBEDDING_MODEL = "text-embedding-3-large"; +const COLLECTION_NAME = "prem-collection-js"; +const SERVER_URL = "http://localhost:6333" +const DOCUMENTS = [ + "This is a sample javascript document", + "We will be using qdrant and premai javascript sdk" +]; +``` -Here is how that looks: +### Set up PremAI and Qdrant clients -```text -"unbelievable" -> ["un", "##believ", "##able"] +```python +prem_client = Prem(api_key="xxxx-xxx-xxx") +qdrant_client = QdrantClient(url=QDRANT_SERVER_URL) ``` -What can merge the subwords back into the words. Luckily, the subwords are marked with the `##` prefix, so we can easily detect them. -Since the attention weights are normalized, we can simply sum the attention weights of the subwords to get the attention weight of the word. +```typescript +const premaiClient = new Prem({ + apiKey: "xxxx-xxx-xxx" +}) +const qdrantClient = new QdrantClient({ url: SERVER_URL }); +``` -After that, we can apply the same traditional NLP techniques, as +### Generating Embeddings -- Removing of the stop-words -- Removing of the punctuation -- Lemmatization +```python +from typing import Union, List + +def get_embeddings( + project_id: int, + embedding_model: str, + documents: Union[str, List[str]] +) -> List[List[float]]: + """ + Helper function to get the embeddings from premai sdk + Args + project_id (int): The project id from prem saas platform. + embedding_model (str): The embedding model alias to choose + documents (Union[str, List[str]]): Single texts or list of texts to embed + Returns: + List[List[int]]: A list of list of integers that represents different + embeddings + """ + embeddings = [] + documents = [documents] if isinstance(documents, str) else documents + for embedding in prem_client.embeddings.create( + project_id=project_id, + model=embedding_model, + input=documents + ).data: + embeddings.append(embedding.embedding) + + return embeddings +``` -In this way, we can significantly reduce the number of tokens, and therefore minimize the memory footprint of the sparse embeddings. We won’t simultaneously compromise the ability to match (almost) exact tokens. 
+```typescript +async function getEmbeddings(projectID, embeddingModel, documents) { + const response = await premaiClient.embeddings.create({ + project_id: projectID, + model: embeddingModel, + input: documents + }); + return response; +} +``` -## [Anchor](https://qdrant.tech/articles/bm42/\#practical-examples) Practical examples +### Converting Embeddings to Qdrant Points -| Trait | BM25 | SPLADE | BM42 | -| --- | --- | --- | --- | -| Interpretability | High ✅ | Ok 🆗 | High ✅ | -| Document Inference speed | Very high ✅ | Slow 🐌 | High ✅ | -| Query Inference speed | Very high ✅ | Slow 🐌 | Very high ✅ | -| Memory footprint | Low ✅ | High ❌ | Low ✅ | -| In-domain accuracy | Ok 🆗 | High ✅ | High ✅ | -| Out-of-domain accuracy | Ok 🆗 | Low ❌ | Ok 🆗 | -| Small documents accuracy | Low ❌ | High ✅ | High ✅ | -| Large documents accuracy | High ✅ | Low ❌ | Ok 🆗 | -| Unknown tokens handling | Yes ✅ | Bad ❌ | Yes ✅ | -| Multi-lingual support | Yes ✅ | No ❌ | Yes ✅ | -| Best Match | Yes ✅ | No ❌ | Yes ✅ | -Starting from Qdrant v1.10.0, BM42 can be used in Qdrant via FastEmbed inference. +```python +from qdrant_client.models import PointStruct -Let’s see how you can setup a collection for hybrid search with BM42 and [jina.ai](https://jina.ai/embeddings/) dense embeddings. +embeddings = get_embeddings( + project_id=PROJECT_ID, + embedding_model=EMBEDDING_MODEL, + documents=DOCUMENTS +) -httppython +points = [ + PointStruct( + id=idx, + vector=embedding, + payload={"text": text}, + ) for idx, (embedding, text) in enumerate(zip(embeddings, DOCUMENTS)) +] +``` -```http -PUT collections/my-hybrid-collection -{ - "vectors": { - "jina": { - "size": 768, - "distance": "Cosine" - } - }, - "sparse_vectors": { - "bm42": { - "modifier": "idf" // <--- This parameter enables the IDF calculation - } - } +```typescript +function convertToQdrantPoints(embeddings, texts) { + return embeddings.data.map((data, i) => { + return { + id: i, + vector: data.embedding, + payload: { + text: texts[i] + } + }; + }); } +const embeddings = await getEmbeddings(PROJECT_ID, EMBEDDING_MODEL, DOCUMENTS); +const points = convertToQdrantPoints(embeddings, DOCUMENTS); ``` -```python -from qdrant_client import QdrantClient, models - -client = QdrantClient() +### Set up a Qdrant Collection -client.create_collection( - collection_name="my-hybrid-collection", - vectors_config={ - "jina": models.VectorParams( - size=768, - distance=models.Distance.COSINE, - ) - }, - sparse_vectors_config={ - "bm42": models.SparseVectorParams( - modifier=models.Modifier.IDF, - ) - } +```python +qdrant_client.create_collection( + collection_name=COLLECTION_NAME, + vectors_config=VectorParams(size=3072, distance=Distance.DOT) ) - +``` +```typescript +await qdrantClient.createCollection(COLLECTION_NAME, { + vectors: { + size: 3072, + distance: 'Cosine' + } +}) ``` -The search query will retrieve the documents with both dense and sparse embeddings and combine the scores -using the Reciprocal Rank Fusion (RRF) algorithm. +### Insert Documents into the Collection ```python -from fastembed import SparseTextEmbedding, TextEmbedding +doc_ids = list(range(len(embeddings))) -query_text = "best programming language for beginners?" 
+qdrant_client.upsert( + collection_name=COLLECTION_NAME, + points=points + ) +``` -model_bm42 = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions") -model_jina = TextEmbedding(model_name="jinaai/jina-embeddings-v2-base-en") +```typescript +await qdrantClient.upsert(COLLECTION_NAME, { + wait: true, + points + }); +``` -sparse_embedding = list(model_bm42.query_embed(query_text))[0] -dense_embedding = list(model_jina.query_embed(query_text))[0] +### Perform a Search -client.query_points( - collection_name="my-hybrid-collection", - prefetch=[\ - models.Prefetch(query=sparse_embedding.as_object(), using="bm42", limit=10),\ - models.Prefetch(query=dense_embedding.tolist(), using="jina", limit=10),\ - ], - query=models.FusionQuery(fusion=models.Fusion.RRF), # <--- Combine the scores - limit=10 +```python +query = "what is the extension of python document" + +query_embedding = get_embeddings( + project_id=PROJECT_ID, + embedding_model=EMBEDDING_MODEL, + documents=query ) +qdrant_client.search(collection_name=COLLECTION_NAME, query_vector=query_embedding[0]) ``` +```typescript +const query = "what is the extension of javascript document" +const query_embedding_response = await getEmbeddings(PROJECT_ID, EMBEDDING_MODEL, query) -### [Anchor](https://qdrant.tech/articles/bm42/\#benchmarks) Benchmarks +await qdrantClient.search(COLLECTION_NAME, { + vector: query_embedding_response.data[0].embedding +}); +``` -To prove the point further we have conducted some benchmarks to highlight the cases where BM42 outperforms BM25. -Please note, that we didn’t intend to make an exhaustive evaluation, as we are presenting a new approach, not a new model. +<|page-253-lllmstxt|> +# PrivateGPT -For out experiments we choose [quora](https://huggingface.co/datasets/BeIR/quora) dataset, which represents a question-deduplication task ~~the Question-Answering task~~. +[PrivateGPT](https://docs.privategpt.dev/) is a production-ready AI project that allows you to inquire about your documents using Large Language Models (LLMs) with offline support. -The typical example of the dataset is the following: +PrivateGPT uses Qdrant as the default vectorstore for ingesting and retrieving documents. -```text -{"_id": "109", "text": "How GST affects the CAs and tax officers?"} -{"_id": "110", "text": "Why can't I do my homework?"} -{"_id": "111", "text": "How difficult is it get into RSI?"} +## Configuration -``` +Qdrant settings can be configured by setting values to the qdrant property in the `settings.yaml` file. By default, Qdrant tries to connect to an instance at http://localhost:3000. -As you can see, it has pretty short texts, there are not much of the statistics to rely on. +Example: +```yaml +qdrant: + url: "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333" + api_key: "" +``` -After encoding with BM42, the average vector size is only **5.6 elements per document**. +The available [configuration options](https://docs.privategpt.dev/manual/storage/vector-stores#qdrant-configuration) are: +| Field | Description | +|--------------|-------------| +| location | If `:memory:` - use in-memory Qdrant instance.
If `str` - use it as a `url` parameter.| +| url | Either host or str of `Optional[scheme], host, Optional[port], Optional[prefix]`.
Eg. `http://localhost:6333` | +| port | Port of the REST API interface. Default: `6333` | +| grpc_port | Port of the gRPC interface. Default: `6334` | +| prefer_grpc | If `true` - use gRPC interface whenever possible in custom methods. | +| https | If `true` - use HTTPS(SSL) protocol.| +| api_key | API key for authentication in Qdrant Cloud.| +| prefix | If set, add `prefix` to the REST URL path.
Example: `service/v1` will result in `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.| +| timeout | Timeout for REST and gRPC API requests.
Default: 5.0 seconds for REST and unlimited for gRPC | +| host | Host name of Qdrant service. If url and host are not set, defaults to 'localhost'.| +| path | Persistence path for QdrantLocal. Eg. `local_data/private_gpt/qdrant`| +| force_disable_check_same_thread | Force disable check_same_thread for QdrantLocal sqlite connection.| -With `datatype: uint8` available in Qdrant, the total size of the sparse vector index is about **13MB** for ~530k documents. +## Next steps -As a reference point, we use: +Find the PrivateGPT docs [here](https://docs.privategpt.dev/). -- BM25 with tantivy -- the [sparse vector BM25 implementation](https://github.com/qdrant/bm42_eval/blob/master/index_bm25_qdrant.py) with the same preprocessing pipeline like for BM42: tokenization, stop-words removal, and lemmatization +<|page-254-lllmstxt|> +![Pulumi Logo](/documentation/platforms/pulumi/pulumi-logo.png) -| | BM25 (tantivy) | BM25 (Sparse) | BM42 | -| --- | --- | --- | --- | -| ~~Precision @ 10~~ \* | ~~0.45~~ | ~~0.45~~ | ~~0.49~~ | -| Recall @ 10 | ~~0.71~~ **0.89** | 0.83 | 0.85 | +Pulumi is an open source infrastructure as code tool for creating, deploying, and managing cloud infrastructure. -\\* \- values were corrected after the publication due to a mistake in the evaluation script. +A Qdrant SDK in any of Pulumi's supported languages can be generated based on the [Qdrant Terraform Provider](https://registry.terraform.io/providers/qdrant/qdrant-cloud/latest). -To make our benchmarks transparent, we have published scripts we used for the evaluation: see [github repo](https://github.com/qdrant/bm42_eval). +## Pre-requisites -Please note, that both BM25 and BM42 won’t work well on their own in a production environment. -Best results are achieved with a combination of sparse and dense embeddings in a hybrid approach. -In this scenario, the two models are complementary to each other. -The sparse model is responsible for exact token matching, while the dense model is responsible for semantic matching. +1. A [Pulumi Installation](https://www.pulumi.com/docs/install/). +2. An [API key](/documentation/qdrant-cloud-api/#authentication-connecting-to-cloud-api) to access the Qdrant cloud API. -Some more advanced models might outperform default `sentence-transformers/all-MiniLM-L6-v2` model we were using. -We encourage developers involved in training embedding models to include a way to extract attention weights and contribute to the BM42 backend. +## Setup -## [Anchor](https://qdrant.tech/articles/bm42/\#fostering-curiosity-and-experimentation) Fostering curiosity and experimentation +- Create a Pulumi project in any of the [supported languages](https://www.pulumi.com/docs/languages-sdks/) by running -Despite all of its advantages, BM42 is not always a silver bullet. -For large documents without chunks, BM25 might still be a better choice. +```bash +mkdir qdrant-pulumi && cd qdrant-pulumi +pulumi new "" -y +``` -There might be a smarter way to extract the importance information from the transformer. There could be a better method to weigh IDF against attention scores. +- Generate a Pulumi SDK for Qdrant by running the following in your Pulumi project directory. -Qdrant does not specialize in model training. Our core project is the search engine itself. However, we understand that we are not operating in a vacuum. By introducing BM42, we are stepping up to empower our community with novel tools for experimentation. 
+```bash +pulumi package add terraform-provider registry.terraform.io/qdrant/qdrant-cloud +``` -We truly believe that the sparse vectors method is at exact level of abstraction to yield both powerful and flexible results. +- Set the Qdrant cloud API as a config value. -Many of you are sharing your recent Qdrant projects in our [Discord channel](https://discord.com/invite/qdrant). Feel free to try out BM42 and let us know what you come up with. +```bash +pulumi config set qdrant-cloud:apiKey "" --secret +``` -##### Was this page useful? +- You can now import the SDK as: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +```python +import pulumi_qdrant_cloud as qdrant_cloud +``` -Thank you for your feedback! 🙏 +```typescript +import * as qdrantCloud from "qdrant-cloud"; +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/bm42.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```java +import com.pulumi.qdrantcloud.*; +``` -On this page: +## Usage -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/bm42.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The provider includes the following data-sources and resources to work with: -× +### Data Sources -[Powered by](https://qdrant.tech/) +- `qdrant-cloud_booking_packages` - Get IDs and detailed information about the packages/subscriptions available. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/booking_packages.md) -<|page-140-lllmstxt|> -## memory-consumption -- [Articles](https://qdrant.tech/articles/) -- Minimal RAM you need to serve a million vectors +```python +qdrant_cloud.get_booking_packages(cloud_provider="aws", cloud_region="us-west-2") +``` -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +```typescript +qdrantCloud.getBookingPackages({ + cloudProvider: "aws", + cloudRegion: "us-west-2" +}) +``` -# Minimal RAM you need to serve a million vectors +```java +import com.pulumi.qdrantcloud.inputs.GetBookingPackagesArgs; -Andrei Vasnetsov +QdrantcloudFunctions.getBookingPackages(GetBookingPackagesArgs.builder() + .cloudProvider("aws") + .cloudRegion("us-west-2") + .build()); +``` -· +- `qdrant-cloud_accounts_auth_keys` - List API keys for Qdrant clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_auth_keys.md) -December 07, 2022 +```python +qdrant_cloud.get_accounts_auth_keys(account_id="") +``` -![Minimal RAM you need to serve a million vectors](https://qdrant.tech/articles_data/memory-consumption/preview/title.jpg) +```typescript +qdrantCloud.getAccountsAuthKeys({ + accountId: "" +}) +``` -When it comes to measuring the memory consumption of our processes, we often rely on tools such as `htop` to give us an indication of how much RAM is being used. However, this method can be misleading and doesn’t always accurately reflect the true memory usage of a process. +```java +import com.pulumi.qdrantcloud.inputs.GetAccountsAuthKeysArgs; -There are many different ways in which `htop` may not be a reliable indicator of memory usage. 
-For instance, a process may allocate memory in advance but not use it, or it may not free deallocated memory, leading to overstated memory consumption. -A process may be forked, which means that it will have a separate memory space, but it will share the same code and data with the parent process. -This means that the memory consumption of the child process will be counted twice. -Additionally, a process may utilize disk cache, which is also accounted as resident memory in the `htop` measurements. +QdrantcloudFunctions.getAccountsAuthKeys(GetAccountsAuthKeysArgs.builder() + .accountId("") + .build()); +``` -As a result, even if `htop` shows that a process is using 10GB of memory, it doesn’t necessarily mean that the process actually requires 10GB of RAM to operate efficiently. -In this article, we will explore how to properly measure RAM usage and optimize [Qdrant](https://qdrant.tech/) for optimal memory consumption. +- `qdrant-cloud_accounts_cluster` - Get Cluster Information. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_cluster.md) -## [Anchor](https://qdrant.tech/articles/memory-consumption/\#how-to-measure-actual-ram-requirements) How to measure actual RAM requirements +```python +qdrant_cloud.get_accounts_cluster( + account_id="", + id="", +) +``` -We need to know memory consumption in order to estimate how much RAM is required to run the program. -So in order to determine that, we can conduct a simple experiment. -Let’s limit the allowed memory of the process and observe at which point it stops functioning. -In this way we can determine the minimum amount of RAM the program needs to operate. +```typescript +qdrantCloud.getAccountsCluster({ + accountId: "", + id: "" +}) +``` -One way to do this is by conducting a grid search, but a more efficient method is to use binary search to quickly find the minimum required amount of RAM. -We can use docker to limit the memory usage of the process. +```java +import com.pulumi.qdrantcloud.inputs.GetAccountsClusterArgs; -Before running each benchmark, it is important to clear the page cache with the following command: +QdrantcloudFunctions.getAccountsCluster(GetAccountsClusterArgs + .builder() + .accountId("") + .id("") + .build()); +``` -```bash -sudo bash -c 'sync; echo 1 > /proc/sys/vm/drop_caches' +- `qdrant-cloud_accounts_clusters` - List Qdrant clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_clusters.md) +```python +qdrant_cloud.get_accounts_clusters(account_id="") ``` -This ensures that the process doesn’t utilize any data from previous runs, providing more accurate and consistent results. - -We can use the following command to run Qdrant with a memory limit of 1GB: +```typescript +qdrantCloud.getAccountsClusters({ + accountId: "" +}) +``` -```bash -docker run -it --rm \ - --memory 1024mb \ - --network=host \ - -v "$(pwd)/data/storage:/qdrant/storage" \ - qdrant/qdrant:latest +```java +import com.pulumi.qdrantcloud.inputs.GetAccountsClustersArgs; +QdrantcloudFunctions.getAccountsClusters( + GetAccountsClustersArgs.builder().accountId("").build()); ``` -## [Anchor](https://qdrant.tech/articles/memory-consumption/\#lets-run-some-benchmarks) Let’s run some benchmarks +### Resources -Let’s run some benchmarks to see how much RAM Qdrant needs to serve 1 million vectors. 
+- `qdrant-cloud_accounts_cluster` - Create clusters on Qdrant cloud - [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/resources/accounts_cluster.md) -We can use the `glove-100-angular` and scripts from the [vector-db-benchmark](https://github.com/qdrant/vector-db-benchmark) project to upload and query the vectors. -With the first run we will use the default configuration of Qdrant with all data stored in RAM. +```python +qdrant_cloud.AccountsCluster( + resource_name="pl-example-cluster-resource", + name="pl-example-cluster", + cloud_provider="gcp", + cloud_region="us-east4", + configuration=qdrant_cloud.AccountsClusterConfigurationArgs( + number_of_nodes=1, + node_configuration=qdrant_cloud.AccountsClusterConfigurationNodeConfigurationArgs( + package_id="3920d1eb-d3eb-4117-9578-b12d89bb1c5d" + ), + ), + account_id="", +) +``` -```bash -# Upload vectors -python run.py --engines qdrant-all-in-ram --datasets glove-100-angular +```typescript +new qdrantCloud.AccountsCluster("pl-example-cluster-resource", { + cloudProvider: "gcp", + cloudRegion: "us-east4", + configuration: { + numberOfNodes: 1, + nodeConfiguration: { + packageId: "3920d1eb-d3eb-4117-9578-b12d89bb1c5d" + } + }, + accountId: "" +}) +``` +```java +import com.pulumi.qdrantcloud.AccountsClusterArgs; +import com.pulumi.qdrantcloud.inputs.AccountsClusterConfigurationArgs; +import com.pulumi.qdrantcloud.inputs.AccountsClusterConfigurationNodeConfigurationArgs; + +new AccountsCluster("pl-example-cluster-resource", AccountsClusterArgs.builder() + .name("pl-example-cluster") + .cloudProvider("gcp") + .cloudRegion("us-east4") + .configuration(AccountsClusterConfigurationArgs.builder() + .numberOfNodes(1.0) + .nodeConfiguration(AccountsClusterConfigurationNodeConfigurationArgs.builder() + .packageId("3920d1eb-d3eb-4117-9578-b12d89bb1c5d") + .build()) + .build()) + .accountId("") + .build()); ``` -After uploading vectors, we will repeat the same experiment with different RAM limits to see how they affect the memory consumption and search speed. +- `qdrant-cloud_accounts_auth_key` - Create API keys for Qdrant cloud clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/resources/accounts_auth_key.md) -```bash -# Search vectors -python run.py --engines qdrant-all-in-ram --datasets glove-100-angular --skip-upload +```python +qdrant_cloud.AccountsAuthKey( + resource_name="pl-example-key-resource", + cluster_ids=[""], +) +``` +```typescript +new qdrantCloud.AccountsAuthKey("pl-example-cluster-resource", { + clusterIds: ["", ""] +}) ``` -### [Anchor](https://qdrant.tech/articles/memory-consumption/\#all-in-memory) All in Memory +```java +import com.pulumi.qdrantcloud.AccountsAuthKey; +import com.pulumi.qdrantcloud.AccountsAuthKeyArgs; -In the first experiment, we tested how well our system performs when all vectors are stored in memory. -We tried using different amounts of memory, ranging from 1512mb to 1024mb, and measured the number of requests per second (rps) that our system was able to handle. 
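+// Issue an API key scoped to the clusters whose IDs are passed to the builder;
+// the empty strings below are placeholders for real cluster IDs.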
+new AccountsAuthKey("pl-example-key-resource", AccountsAuthKeyArgs.builder() + .clusterIds("", "") + .build()); +``` -| Memory | Requests/s | -| --- | --- | -| 1512mb | 774.38 | -| 1256mb | 760.63 | -| 1200mb | 794.72 | -| 1152mb | out of memory | -| 1024mb | out of memory | +## Further Reading -We found that 1152MB memory limit resulted in our system running out of memory, but using 1512mb, 1256mb, and 1200mb of memory resulted in our system being able to handle around 780 RPS. -This suggests that about 1.2GB of memory is needed to serve around 1 million vectors, and there is no speed degradation when limiting memory usage above 1.2GB. +- [Provider Documentation](https://registry.terraform.io/providers/qdrant/qdrant-cloud/latest/docs) +- [Pulumi Quickstart](https://www.pulumi.com/docs/get-started/) -### [Anchor](https://qdrant.tech/articles/memory-consumption/\#vectors-stored-using-mmap) Vectors stored using MMAP +<|page-255-lllmstxt|> +![Redpanda Cover](/documentation/data-management/redpanda/redpanda-cover.png) -Let’s go a bit further! -In the second experiment, we tested how well our system performs when **vectors are stored using the memory-mapped file** (mmap). -Create collection with: +[Redpanda Connect](https://www.redpanda.com/connect) is a declarative data-agnostic streaming service designed for efficient, stateless processing steps. It offers transaction-based resiliency with back pressure, ensuring at-least-once delivery when connecting to at-least-once sources with sinks, without the need to persist messages during transit. -```http -PUT /collections/benchmark -{ - "vectors": { - ... - "on_disk": true - } -} +Connect pipelines are configured using a YAML file, which organizes components hierarchically. Each section represents a different component type, such as inputs, processors and outputs, and these can have nested child components and [dynamic values](https://docs.redpanda.com/redpanda-connect/configuration/interpolation/). -``` +The [Qdrant Output](https://docs.redpanda.com/redpanda-connect/components/outputs/qdrant/) component enables streaming vector data into Qdrant collections in your RedPanda pipelines. -This configuration tells Qdrant to use mmap for vectors if the segment size is greater than 20000Kb (which is approximately 40K 128d-vectors). 
+## Example -Now the out-of-memory happens when we allow using **600mb** RAM only +An example configuration of the output once the inputs and processors are set, would look like: -Experiments details +```yaml +input: + # https://docs.redpanda.com/redpanda-connect/components/inputs/about/ -| Memory | Requests/s | -| --- | --- | -| 1200mb | 759.94 | -| 1100mb | 687.00 | -| 1000mb | 10 | +pipeline: + processors: + # https://docs.redpanda.com/redpanda-connect/components/processors/about/ -— use a bit faster disk — +output: + label: "qdrant-output" + qdrant: + max_in_flight: 64 + batching: + count: 8 + grpc_host: xyz-example.eu-central.aws.cloud.qdrant.io:6334 + api_token: "" + tls: + enabled: true + # skip_cert_verify: false + # enable_renegotiation: false + # root_cas: "" + # root_cas_file: "" + # client_certs: [] + collection_name: "" + id: root = uuid_v4() + vector_mapping: 'root = {"some_dense": this.vector, "some_sparse": {"indices": [23,325,532],"values": [0.352,0.532,0.532]}}' + payload_mapping: 'root = {"field": this.value, "field_2": 987}' +``` -| Memory | Requests/s | -| --- | --- | -| 1000mb | 25 rps | -| 750mb | 5 rps | -| 625mb | 2.5 rps | -| 600mb | out of memory | +## Further Reading -At this point we have to switch from network-mounted storage to a faster disk, as the network-based storage is too slow to handle the amount of sequential reads that our system needs to serve the queries. +- [Getting started with Connect](https://docs.redpanda.com/redpanda-connect/guides/getting_started/) +- [Qdrant Output Reference](https://docs.redpanda.com/redpanda-connect/components/outputs/qdrant/) -But let’s first see how much RAM we need to serve 1 million vectors and then we will discuss the speed optimization as well. +<|page-256-lllmstxt|> +# Rig-rs -### [Anchor](https://qdrant.tech/articles/memory-consumption/\#vectors-and-hnsw-graph-stored-using-mmap) Vectors and HNSW graph stored using MMAP +[Rig](http://rig.rs) is a Rust library for building scalable, modular, and ergonomic LLM-powered applications. It has full support for LLM completion and embedding workflows with minimal boiler plate. -In the third experiment, we tested how well our system performs when vectors and [HNSW](https://qdrant.tech/articles/filtrable-hnsw/) graph are stored using the memory-mapped files. -Create collection with: +Rig supports Qdrant as a vectorstore to ingest and search for documents semantically. -```http -PUT /collections/benchmark -{ - "vectors": { - ... - "on_disk": true - }, - "hnsw_config": { - "on_disk": true - }, - ... -} +## Installation +```console +cargo add rig-core rig-qdrant qdrant-client ``` -With this configuration we are able to serve 1 million vectors with **only 135mb of RAM**! +## Usage -Experiments details +Here's an example ingest and retrieve flow using Rig and Qdrant. -| Memory | Requests/s | -| --- | --- | -| 600mb | 5 rps | -| 300mb | 0.9 rps / 1.1 sec per query | -| 150mb | 0.4 rps / 2.5 sec per query | -| 135mb | 0.33 rps / 3 sec per query | -| 125mb | out of memory | +```rust +use qdrant_client::{ + qdrant::{PointStruct, QueryPointsBuilder, UpsertPointsBuilder}, + Payload, Qdrant, +}; +use rig::{ + embeddings::EmbeddingsBuilder, + providers::openai::{Client, TEXT_EMBEDDING_3_SMALL}, + vector_store::VectorStoreIndex, +}; +use rig_qdrant::QdrantVectorStore; +use serde_json::json; -At this point the importance of the disk speed becomes critical. -We can serve the search requests with 135mb of RAM, but the speed of the requests makes it impossible to use the system in production. 
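+// Note: the `.await?` calls below assume an async context that returns a
+// Result (for example a `#[tokio::main]` main function); that scaffolding is
+// not shown in this snippet.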
+const COLLECTION_NAME: &str = "rig-collection"; -Let’s see how we can improve the speed. +// Initialize Qdrant client. +let client = Qdrant::from_url("http://localhost:6334").build()?; +// Initialize OpenAI client. +let openai_client = Client::new(""); +let model = openai_client.embedding_model(TEXT_EMBEDDING_3_SMALL); + +let documents = EmbeddingsBuilder::new(model.clone()) + .simple_document("0981d983-a5f8-49eb-89ea-f7d3b2196d2e", "Definition of a *flurbo*: A flurbo is a green alien that lives on cold planets") + .simple_document("62a36d43-80b6-4fd6-990c-f75bb02287d1", "Definition of a *glarb-glarb*: A glarb-glarb is a ancient tool used by the ancestors of the inhabitants of planet Jiro to farm the land.") + .simple_document("f9e17d59-32e5-440c-be02-b2759a654824", "Definition of a *linglingdong*: A term used by inhabitants of the far side of the moon to describe humans.") + .build() + .await?; -## [Anchor](https://qdrant.tech/articles/memory-consumption/\#how-to-speed-up-the-search) How to speed up the search +let points: Vec = documents + .into_iter() + .map(|d| { + let vec: Vec = d.embeddings[0].vec.iter().map(|&x| x as f32).collect(); + PointStruct::new( + d.id, + vec, + Payload::try_from(json!({ + "document": d.document, + })) + .unwrap(), + ) + }) + .collect(); -To measure the impact of disk parameters on search speed, we used the `fio` tool to test the speed of different types of disks. +client + .upsert_points(UpsertPointsBuilder::new(COLLECTION_NAME, points)) + .await?; -```bash -# Install fio -sudo apt-get install fio +let query_params = QueryPointsBuilder::new(COLLECTION_NAME).with_payload(true); +let vector_store = QdrantVectorStore::new(client, model, query_params.build()); -# Run fio to check the random reads speed -fio --randrepeat=1 \ - --ioengine=libaio \ - --direct=1 \ - --gtod_reduce=1 \ - --name=fiotest \ - --filename=testfio \ - --bs=4k \ - --iodepth=64 \ - --size=8G \ - --readwrite=randread +let results = vector_store + .top_n::("Define a glarb-glarb?", 1) + .await?; +println!("Results: {:?}", results); ``` -Initially, we tested on a network-mounted disk, but its performance was too slow, with a read IOPS of 6366 and a bandwidth of 24.9 MiB/s: +## Further reading -```text -read: IOPS=6366, BW=24.9MiB/s (26.1MB/s)(8192MiB/329424msec) +- [Rig-rs Documentation](https://rig.rs) +- [Source Code](https://github.com/0xPlaygrounds/rig) -``` +<|page-257-lllmstxt|> +# Salesforce Mulesoft -To improve performance, we switched to a local disk, which showed much faster results, with a read IOPS of 63.2k and a bandwidth of 247 MiB/s: +[MuleSoft Anypoint](https://www.salesforce.com/in/mulesoft/anypoint-platform/) is an integration platform to connect applications, data, and devices across on-premises and cloud environments. It provides a unified platform to build, manage, and secure APIs and integrations, making digital transformation smoother and more scalable. -```text -read: IOPS=63.2k, BW=247MiB/s (259MB/s)(8192MiB/33207msec) +[MAC Project](https://mac-project.ai) is an open-source initiative to bring AI capabilities into the MuleSoft ecosystem. It provides connectors to add AI capabilities to an Anypoint project by integrating LLMs, vector databases including Qdrant. -``` +## Setup -That gave us a significant speed boost, but we wanted to see if we could improve performance even further. 
-To do that, we switched to a machine with a local SSD, which showed even better results, with a read IOPS of 183k and a bandwidth of 716 MiB/s: +To use Qdrant with Anypoint, you can install the [Mulesoft Vectors connector](https://mac-project.ai/docs/ms-vectors). Paste the following Maven Dependency into your Mule application pom file. -```text -read: IOPS=183k, BW=716MiB/s (751MB/s)(8192MiB/11438msec) +```xml + + io.github.mulesoft-ai-chain-project + mule4-vectors-connector + 0.3.0 + mule-plugin + +``` + +The project will now rebuild with the connector. You also need to install the optional dependencies for the Qdrant connector. +```xml + + + +... + + + + ... + + org.mule.tools.maven + mule-maven-plugin + 4.3.0 + true + + + + io.github.mulesoft-ai-chain-project + mule4-vectors-connector + + + + dev.langchain4j + langchain4j-qdrant + 0.35.0 + + + + + + + + + +... + ``` -Let’s see how these results translate into search speed: +## Usage -| Memory | RPS with IOPS=63.2k | RPS with IOPS=183k | -| --- | --- | --- | -| 600mb | 5 | 50 | -| 300mb | 0.9 | 13 | -| 200mb | 0.5 | 8 | -| 150mb | 0.4 | 7 | +The MuleSoft Vectors connector is shipped with 3 different categories of operations: Document, Embedding and Store. For each category a dedicated configuration must be defined. -As you can see, the speed of the disk has a significant impact on the search speed. -With a local SSD, we were able to increase the search speed by 10x! +The store configuration allows to pick-up the right vector store option among the available ones. When configuring the connection to a specific vector store it's also possible to test it. -With the production-grade disk, the search speed could be even higher. -Some configurations of the SSDs can reach 1M IOPS and more. +Go to the `Global Elements` in your MuleSoft project, and create a new configuration. In the `Connector Configuration`, you will find the `MuleSoft Vectors Connector Store` config. -Which might be an interesting option to serve large datasets with low search latency in Qdrant. +Upon selecting `Qdrant`, you'll be presented with the following parameters to set up the connection to a Qdrant instance. -## [Anchor](https://qdrant.tech/articles/memory-consumption/\#conclusion) Conclusion +![Qdrant Connection](/documentation/platforms/mulesoft/qdrant-connection.png) -In this article, we showed that Qdrant has flexibility in terms of RAM usage and can be used to serve large datasets. It provides configurable trade-offs between RAM usage and search speed. If you’re interested to learn more about Qdrant, [book a demo today](https://qdrant.tech/contact-us/)! +Once a connection is set up, you can now use the following Qdrant operations in your workflows. -We are eager to learn more about how you use Qdrant in your projects, what challenges you face, and how we can help you solve them. -Please feel free to join our [Discord](https://qdrant.to/discord) and share your experience with us! +### Store Add -##### Was this page useful? +The Add operation adds a document or text to a collection. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +![Qdrant Add](/documentation/platforms/mulesoft/qdrant-add.png) -Thank you for your feedback! 🙏 +### Store List -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/memory-consumption.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +The List sources operation lists all entries in a collection. -On this page: +![Qdrant List](/documentation/platforms/mulesoft/qdrant-list.png) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/memory-consumption.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Store Query -× +The Query operation retrieves information from a collection based on a query a embedding and an optional filter. -[Powered by](https://qdrant.tech/) +![Qdrant Query](/documentation/platforms/mulesoft/qdrant-query.png) -<|page-141-lllmstxt|> -## distance-based-exploration -- [Articles](https://qdrant.tech/articles/) -- Distance-based data exploration +### Store Remove -[Back to Data Exploration](https://qdrant.tech/articles/data-exploration/) +The Remove operation remove all entries from a collection based on a filter. -# Distance-based data exploration +![Qdrant Add](/documentation/platforms/mulesoft/qdrant-remove.png) -Andrey Vasnetsov +## Further reading -· +- [Mulesoft Anypoint Studio](https://docs.mulesoft.com/studio/latest/) +- [MAC Project](https://mac-project.ai) -March 11, 2025 +<|page-258-lllmstxt|> +# Semantic-Router -![Distance-based data exploration](https://qdrant.tech/articles_data/distance-based-exploration/preview/title.jpg) +[Semantic-Router](https://www.aurelio.ai/semantic-router/) is a library to build decision-making layers for your LLMs and agents. It uses vector embeddings to make tool-use decisions rather than LLM generations, routing our requests using semantic meaning. -## [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#hidden-structure) Hidden Structure +Qdrant is available as a supported index in Semantic-Router for you to ingest route data and perform retrievals. -When working with large collections of documents, images, or other arrays of unstructured data, it often becomes useful to understand the big picture. -Examining data points individually is not always the best way to grasp the structure of the data. +## Installation -![Data visualization](https://qdrant.tech/articles_data/distance-based-exploration/no-context-data.png) +To use Semantic-Router with Qdrant, install the `qdrant` extra: -Datapoints without context, pretty much useless +```console +pip install semantic-router[qdrant] +``` -As numbers in a table obtain meaning when plotted on a graph, visualising distances (similar/dissimilar) between unstructured data items can reveal hidden structures and patterns. +## Usage -![Data visualization](https://qdrant.tech/articles_data/distance-based-exploration/data-on-chart.png) +Set up `QdrantIndex` with the appropriate configurations: -Vizualized chart, very intuitive +```python +from semantic_router.index import QdrantIndex -There are many tools to investigate data similarity, and Qdrant’s [1.12 release](https://qdrant.tech/blog/qdrant-1.12.x/) made it much easier to start this investigation. With the new [Distance Matrix API](https://qdrant.tech/documentation/concepts/explore/#distance-matrix), Qdrant handles the most computationally expensive part of the process—calculating the distances between data points. 
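+# Point the index at a running Qdrant instance; the URL and API key below are
+# placeholders for your own deployment.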
+qdrant_index = QdrantIndex( + url="https://xyz-example.eu-central.aws.cloud.qdrant.io", api_key="" +) +``` -In many implementations, the distance matrix calculation was part of the clustering or visualization processes, requiring either brute-force computation or building a temporary index. With Qdrant, however, the data is already indexed, and the distance matrix can be computed relatively cheaply. +Once the Qdrant index is set up with the appropriate configurations, we can pass it to the `RouteLayer`. -In this article, we will explore several methods for data exploration using the Distance Matrix API. +```python +from semantic_router.layer import RouteLayer -## [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#dimensionality-reduction) Dimensionality Reduction +RouteLayer(encoder=some_encoder, routes=some_routes, index=qdrant_index) +``` -Initially, we might want to visualize an entire dataset, or at least a large portion of it, at a glance. However, high-dimensional data cannot be directly visualized. We must apply dimensionality reduction techniques to convert data into a lower-dimensional representation while preserving important data properties. +## Complete Example -In this article, we will use [UMAP](https://github.com/lmcinnes/umap) as our dimensionality reduction algorithm. +
-Here is a **very** simplified but intuitive explanation of UMAP: +Click to expand -1. _Randomly generate points in 2D space_: Assign a random 2D point to each high-dimensional point. -2. _Compute distance matrix for high-dimensional points_: Calculate distances between all pairs of points. -3. _Compute distance matrix for 2D points_: Perform similarly to step 2. -4. _Match both distance matrices_: Adjust 2D points to minimize differences. +```python +import os -![UMAP](https://qdrant.tech/articles_data/distance-based-exploration/umap.png) +from semantic_router import Route +from semantic_router.encoders import OpenAIEncoder +from semantic_router.index import QdrantIndex +from semantic_router.layer import RouteLayer + +# we could use this as a guide for our chatbot to avoid political conversations +politics = Route( + name="politics value", + utterances=[ + "isn't politics the best thing ever", + "why don't you tell me about your political opinions", + "don't you just love the president", + "they're going to destroy this country!", + "they will save the country!", + ], +) -Canonical example of UMAP results, [source](https://github.com/lmcinnes/umap?tab=readme-ov-file#performance-and-examples) +# this could be used as an indicator to our chatbot to switch to a more +# conversational prompt +chitchat = Route( + name="chitchat", + utterances=[ + "how's the weather today?", + "how are things going?", + "lovely weather today", + "the weather is horrendous", + "let's go to the chippy", + ], +) -UMAP preserves the relative distances between high-dimensional points; the actual coordinates are not essential. If we already have the distance matrix, step 2 can be skipped entirely. +# we place both of our decisions together into single list +routes = [politics, chitchat] -Let’s use Qdrant to calculate the distance matrix and apply UMAP. -We will use one of the default datasets perfect for experimenting in Qdrant– [Midjourney Styles dataset](https://midlibrary.io/). +os.environ["OPENAI_API_KEY"] = "" +encoder = OpenAIEncoder() -Use this command to download and import the dataset into Qdrant: +rl = RouteLayer( + encoder=encoder, + routes=routes, + index=QdrantIndex(location=":memory:"), +) -```http -PUT /collections/midlib/snapshots/recover -{ - "location": "http://snapshots.qdrant.io/midlib.snapshot" -} +print(rl("What have you been upto?").name) +``` + +This returns: +```console +[Out]: 'chitchat' ``` -We also need to prepare our python enviroment: +
-```bash -pip install umap-learn seaborn matplotlib qdrant-client +## 📚 Further Reading + +- Semantic-Router [Documentation](https://github.com/aurelio-labs/semantic-router/tree/main/docs) +- Semantic-Router [Video Course](https://www.aurelio.ai/course/semantic-router) +- [Source Code](https://github.com/aurelio-labs/semantic-router/blob/main/semantic_router/index/qdrant.py) + +<|page-259-lllmstxt|> +# SmolAgents + +HuggingFace [SmolAgents](https://github.com/huggingface/smolagents) is a Python library for building AI agents. These agents write Python code to call tools and orchestrate other agents. +It uses `CodeAgent`. An LLM engine that writes its actions in code. SmolAgents suggests that this approach is demonstrated to work better than the current industry practice of letting the LLM output a dictionary of the tools it wants to call: [uses 30% fewer steps](https://huggingface.co/papers/2402.01030) (thus 30% fewer LLM calls) +and [reaches higher performance on difficult benchmarks](https://huggingface.co/papers/2411.01747). + +## Usage with Qdrant + +We'll demonstrate how you can pair SmolAgents with Qdrant's retrieval by building a movie recommendation agent. + +### Installation + +```shell +pip install smolagents qdrant-client fastembed ``` -Import the necessary libraries: +### Setup a Qdrant tool -```python -# Used to talk to Qdrant +We'll build a SmolAgents tool that can query a Qdrant collection. This tool will vectorise queries locally using [FastEmbed](https://github.com/qdrant/fastembed). + +Initially, we'll be populating a Qdrant collection with information about 1000 movies from IMDb that we can search across. + +```py +from fastembed import TextEmbedding from qdrant_client import QdrantClient -# Package with original UMAP implementation -from umap import UMAP -# Python implementation for sparse matrices -from scipy.sparse import csr_matrix -# For vizualization -import seaborn as sns +from smolagents import Tool -``` -Establish connection to Qdrant: +class QdrantQueryTool(Tool): + name = "qdrant_query" + description = "Uses semantic search to retrieve movies from a Qdrant collection." + inputs = { + "query": { + "type": "string", + "description": "The query to perform. This should be semantically close to your target documents.", + } + } + output_type = "string" -```python -client = QdrantClient("http://localhost:6333") + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.collection_name = "smolagents" + self.client = QdrantClient() + + if not self.client.collection_exists(self.collection_name): + self.client.recover_snapshot( + collection_name=self.collection_name, + location="https://snapshots.qdrant.io/imdb-1000-jina.snapshot", + ) + self.embedder = TextEmbedding(model_name="jinaai/jina-embeddings-v2-base-en") + + def forward(self, query: str) -> str: + points = self.client.query_points( + self.collection_name, query=next(self.embedder.query_embed(query)), limit=5 + ).points + docs = "Retrieved documents:\n" + "".join( + [ + f"== Document {str(i)} ==\n" + + f"MOVIE TITLE: {point.payload['movie_name']}\n" + + f"MOVIE SUMMARY: {point.payload['description']}\n" + for i, point in enumerate(points) + ] + ) + return docs ``` -After this is done, we can compute the distance matrix: +### Define the agent + +We can now set up `CodeAgent` to use our `QdrantQueryTool`. ```python +from smolagents import CodeAgent, HfApiModel +import os -# Request distances matrix from Qdrant -# `_offsets` suffix defines a format of the output matrix. 
-result = client.search_matrix_offsets( - collection_name="midlib", - sample=1000, # Select a subset of the data, as the whole dataset might be too large - limit=20, # For performance reasons, limit the number of closest neighbors to consider -) +# HuggingFace Access Token +# https://huggingface.co/docs/hub/en/security-tokens +os.environ["HF_TOKEN"] = "----------" -# Convert distances matrix to python-native format -matrix = csr_matrix( - (result.scores, (result.offsets_row, result.offsets_col)) +agent = CodeAgent( + tools=[QdrantQueryTool()], model=HfApiModel(), max_iterations=4, verbose=True ) +``` -# Make the matrix symmetric, as UMAP expects it. -# Distance matrix is always symmetric, but qdrant only computes half of it. -matrix = matrix + matrix.T +Finally, we can run the agent with a user query. +```python +agent_output = agent.run("Movie about people taking a strong action for justice") +print(agent_output) ``` -Now we can apply UMAP to the distance matrix: +We should results similar to: + +```console +[...truncated] + +Out - Final answer: Jai Bhim +[Step 1: Duration 0.25 seconds| Input tokens: 4,497 | Output tokens: 134] +Jai Bhim +``` + +## Further Reading + +- [SmolAgents Blog](https://huggingface.co/blog/smolagents#code-agents) +- [SmolAgents Source](https://github.com/huggingface/smolagents) + +<|page-260-lllmstxt|> +# Snowflake + +Qdrant supports working with [Snowflake](https://www.snowflake.com/blog/introducing-snowflake-arctic-embed-snowflakes-state-of-the-art-text-embedding-family-of-models/) text embedding models. You can find all the available models on [HuggingFace](https://huggingface.co/Snowflake). + +### Setting up the Qdrant and Snowflake models ```python -umap = UMAP( - metric="precomputed", # We provide ready-made distance matrix - n_components=2, # output dimension - n_neighbors=20, # Same as the limit in the search_matrix_offsets -) +from qdrant_client import QdrantClient +from fastembed import TextEmbedding -vectors_2d = umap.fit_transform(matrix) +qclient = QdrantClient(":memory:") +embedding_model = TextEmbedding("snowflake/snowflake-arctic-embed-s") +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] ``` -That’s all that is needed to get the 2d representation of the data. +```typescript +import {QdrantClient} from '@qdrant/js-client-rest'; +import { pipeline } from '@xenova/transformers'; -![UMAP on Midlib](https://qdrant.tech/articles_data/distance-based-exploration/umap-midlib.png) +const client = new QdrantClient({ url: 'http://localhost:6333' }); -UMAP applied to Midlib dataset +const extractor = await pipeline('feature-extraction', 'Snowflake/snowflake-arctic-embed-s'); -UMAP isn’t the only algorithm compatible with our distance matrix API. For example, `scikit-learn` also offers: +const texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -- [Isomap](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html) \- Non-linear dimensionality reduction through Isometric Mapping. -- [SpectralEmbedding](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html) \- Forms an affinity matrix given by the specified function and applies spectral decomposition to the corresponding graph Laplacian. 
-- [TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) \- well-known algorithm for dimensionality reduction. +The following example shows how to embed documents with the [`snowflake-arctic-embed-s`](https://huggingface.co/Snowflake/snowflake-arctic-embed-s) model that generates sentence embeddings of size 384. -## [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#clustering) Clustering +### Embedding documents -Another approach to data structure understanding is clustering–grouping similar items. +```python +embeddings = embedding_model.embed(texts) +``` -_Note that there’s no universally best clustering criterion or algorithm._ +```typescript +const embeddings = await extractor(texts, { normalize: true, pooling: 'cls' }); +``` -![Clustering](https://qdrant.tech/articles_data/distance-based-exploration/clustering.png) +### Converting the model outputs to Qdrant points -Clustering example, [source](https://scikit-learn.org/) +```python +from qdrant_client.models import PointStruct -Many clustering algorithms accept precomputed distance matrix as input, so we can use the same distance matrix we calculated before. +points = [ + PointStruct( + id=idx, + vector=embedding, + payload={"text": text}, + ) + for idx, (embedding, text) in enumerate(zip(embeddings, texts)) +] +``` -Let’s consider a simple example of clustering the Midlib dataset with **KMeans algorithm**. +```typescript +let points = embeddings.tolist().map((embedding, i) => { + return { + id: i, + vector: embedding, + payload: { + text: texts[i] + } + } +}); +``` -From [scikit-learn.cluster documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) we know that `fit()` method of KMeans algorithm prefers as an input: +### Creating a collection to insert the documents -> `X : {array-like, sparse matrix} of shape (n_samples, n_features)`: -> -> Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. If a sparse matrix is passed, a copy will be made if it’s not in CSR format. +```python +from qdrant_client.models import VectorParams, Distance -So we can re-use `matrix` from the previous example: +COLLECTION_NAME = "example_collection" -```python -from sklearn.cluster import KMeans +qclient.create_collection( + COLLECTION_NAME, + vectors_config=VectorParams( + size=384, + distance=Distance.COSINE, + ), +) +qclient.upsert(COLLECTION_NAME, points) +``` -# Initialize KMeans with 10 clusters -kmeans = KMeans(n_clusters=10) +```typescript +const COLLECTION_NAME = "example_collection" -# Generate index of the cluster each sample belongs to -cluster_labels = kmeans.fit_predict(matrix) +await client.createCollection(COLLECTION_NAME, { + vectors: { + size: 384, + distance: 'Cosine', + } +}); +await client.upsert(COLLECTION_NAME, { + wait: true, + points +}); ``` -With this simple code, we have clustered the data into 10 clusters, while the main CPU-intensive part of the process was done by Qdrant. - -![Clustering on Midlib](https://qdrant.tech/articles_data/distance-based-exploration/clustering-midlib.png) +### Searching for documents with Qdrant -Clustering applied to Midlib dataset - -How to plot this chart +Once the documents are added, you can search for the most relevant documents. 
```python -sns.scatterplot( - # Coordinates obtained from UMAP - x=vectors_2d[:, 0], y=vectors_2d[:, 1], - # Color datapoints by cluster - hue=cluster_labels, - palette=sns.color_palette("pastel", 10), - legend="full", -) +query_embedding = next(embedding_model.query_embed("What is the best to use for vector search scaling?")) +qclient.search( + collection_name=COLLECTION_NAME, + query_vector=query_embedding, +) ``` -## [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#graphs) Graphs - -Clustering and dimensionality reduction both aim to provide a more transparent overview of the data. -However, they share a common characteristic - they require a training step before the results can be visualized. +```typescript +const query_embedding = await extractor("What is the best to use for vector search scaling?", { + normalize: true, + pooling: 'cls' +}); -This also implies that introducing new data points necessitates re-running the training step, which may be computationally expensive. +await client.search(COLLECTION_NAME, { + vector: query_embedding.tolist()[0], +}); +``` -Graphs offer an alternative approach to data exploration, enabling direct, interactive visualization of relationships between data points. -In a graph representation, each data point is a node, and similarities between data points are represented as edges connecting the nodes. +<|page-261-lllmstxt|> +# Spring AI -Such a graph can be rendered in real-time using [force-directed layout](https://en.wikipedia.org/wiki/Force-directed_graph_drawing) algorithms, which aim to minimize the system’s energy by repositioning nodes dynamically–the more similar the data points are, the stronger the edges between them. +[Spring AI](https://docs.spring.io/spring-ai/reference/) is a Java framework that provides a [Spring-friendly](https://spring.io/) API and abstractions for developing AI applications. -Adding new data points to the graph is as straightforward as inserting new nodes and edges without the need to re-run any training steps. +Qdrant is available as supported vector database for use within your Spring AI projects. -In practice, rendering a graph for an entire dataset at once may be computationally expensive and overwhelming for the user. Therefore, let’s explore a few strategies to address this issue. +## Installation -### [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#expanding-from-a-single-node) Expanding from a single node +You can find the Spring AI installation instructions [here](https://docs.spring.io/spring-ai/reference/getting-started.html). -This is the simplest approach, where we start with a single node and expand the graph by adding the most similar nodes to the graph. +Add the Qdrant boot starter package. -![Graph](https://qdrant.tech/articles_data/distance-based-exploration/graph.gif) +```xml + + org.springframework.ai + spring-ai-qdrant-store-spring-boot-starter + +``` -Graph representation of the data +## Usage -### [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#sampling-from-a-collection) Sampling from a collection +Configure Qdrant with Spring Boot’s `application.properties`. -Expanding a single node works well if you want to explore neighbors of a single point, but what if you want to explore the whole dataset? -If your dataset is small enough, you can render relations for all the data points at once. But it is a rare case in practice. 
+``` +spring.ai.vectorstore.qdrant.host= +spring.ai.vectorstore.qdrant.port= +spring.ai.vectorstore.qdrant.api-key= +spring.ai.vectorstore.qdrant.collection-name= +``` -Instead, we can sample a subset of the data and render the graph for this subset. -This way, we can get a good overview of the data without overwhelming the user with too much information. +Learn more about these options in the [configuration reference](https://docs.spring.io/spring-ai/reference/api/vectordbs/qdrant.html#qdrant-vectorstore-properties). -Let’s try to do so in [Qdrant’s Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool): +Or you can set up the Qdrant vector store with the `QdrantVectorStoreConfig` options. -```json -{ - "limit": 5, # node neighbors to consider - "sample": 100 # nodes +```java +@Bean +public QdrantVectorStoreConfig qdrantVectorStoreConfig() { + + return QdrantVectorStoreConfig.builder() + .withHost("") + .withPort() + .withCollectionName("") + .withApiKey("") + .build(); } +``` +Build the vector store using the config and any of the support [Spring AI embedding providers](https://docs.spring.io/spring-ai/reference/api/embeddings.html#available-implementations). + +```java +@Bean +public VectorStore vectorStore(QdrantVectorStoreConfig config, EmbeddingClient embeddingClient) { + return new QdrantVectorStore(config, embeddingClient); +} ``` -![Graph](https://qdrant.tech/articles_data/distance-based-exploration/graph-sampled.png) +You can now use the `VectorStore` instance backed by Qdrant as a vector store in the Spring AI APIs. -Graph representation of the data ( [Qdrant’s Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool)) + -This graph captures some high-level structure of the data, but as you might have noticed, it is quite noisy. -This is because the differences in similarities are relatively small, and they might be overwhelmed by the stretches and compressions of the force-directed layout algorithm. +## 📚 Further Reading -To make the graph more readable, let’s concentrate on the most important similarities and build a so called [Minimum/Maximum Spanning Tree](https://en.wikipedia.org/wiki/Minimum_spanning_tree). +- Spring AI [Qdrant reference](https://docs.spring.io/spring-ai/reference/api/vectordbs/qdrant.html) +- Spring AI [API reference](https://docs.spring.io/spring-ai/reference/index.html) +- [Source Code](https://github.com/spring-projects/spring-ai/tree/main/vector-stores/spring-ai-qdrant-store) -```json -{ - "limit": 5, - "sample": 100, - "tree": true -} +<|page-262-lllmstxt|> +# Stanford DSPy -``` +[DSPy](https://github.com/stanfordnlp/dspy) is the framework for solving advanced tasks with language models (LMs) and retrieval models (RMs). It unifies techniques for prompting and fine-tuning LMs — and approaches for reasoning, self-improvement, and augmentation with retrieval and tools. -![Graph](https://qdrant.tech/articles_data/distance-based-exploration/spanning-tree.png) +- Provides composable and declarative modules for instructing LMs in a familiar Pythonic syntax. -Spanning tree of the graph ( [Qdrant’s Graph Exploration Tool](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool)) +- Introduces an automatic compiler that teaches LMs how to conduct the declarative steps in your program. -This algorithm will only keep the most important edges and remove the rest while keeping the graph connected. 
-By doing so, we can reveal clusters of the data and the most important relations between them. +Qdrant can be used as a retrieval mechanism in the DSPy flow. -In some sense, this is similar to hierarchical clustering, but with the ability to interactively explore the data. -Another analogy might be a dynamically constructed mind map. +## Installation -## [Anchor](https://qdrant.tech/articles/distance-based-exploration/\#conclusion) Conclusion +For the Qdrant retrieval integration, include `dspy-ai` with the `qdrant` extra: +```bash +pip install dspy-ai dspy-qdrant fastembed +``` -Vector similarity goes beyond looking up the nearest neighbors–it provides a powerful tool for data exploration. -Many algorithms can construct human-readable data representations, and Qdrant makes using them easy. +## Usage + +We can configure `DSPy` settings to use the Qdrant retriever model like so: +```python +import os +import dspy +from dspy_qdrant import QdrantRM +from qdrant_client import QdrantClient -Several data exploration instruments are available in the Qdrant Web UI ( [Visualization and Graph Exploration Tools](https://qdrant.tech/articles/web-ui-gsoc/)), and for more advanced use cases, you could directly utilise our distance matrix API. +lm = dspy.LM("gpt-4o-mini", max_tokens=512,api_key=os.environ.get("OPENAI_API_KEY")) +client = QdrantClient(url=os.environ.get("QDRANT_CLOUD_URL"), api_key=os.environ.get("QDRANT_API_KEY")) +collection_name = "collection_name" +rm = QdrantRM( + qdrant_collection_name=collection_name, + qdrant_client=client, + vector_name="dense", # <-- MATCHES your vector name + document_field="passage_text", # <-- MATCHES your payload field + k=20) -Try it with your data and see what hidden structures you can reveal! +dspy.settings.configure(lm=lm, rm=rm) +``` +Using the retriever is pretty simple. The `dspy.Retrieve(k)` module will search for the top-k passages that match a given query. -##### Was this page useful? +```python +retrieve = dspy.Retrieve(k=3) +question = "Some question about my data" +topK_passages = retrieve(question).passages -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +print(f"Top {retrieve.k} passages for question: {question} \n", "\n") -Thank you for your feedback! 🙏 +for idx, passage in enumerate(topK_passages): + print(f"{idx+1}]", passage, "\n") +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/distance-based-exploration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +With Qdrant configured as the retriever for contexts, you can set up a DSPy module like so: +```python +class RAG(dspy.Module): + def __init__(self, num_passages=3): + super().__init__() -On this page: + self.retrieve = dspy.Retrieve(k=num_passages) + ... -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/distance-based-exploration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) + def forward(self, question): + context = self.retrieve(question).passages + ... -× +``` -[Powered by](https://qdrant.tech/) +With the generic RAG blueprint now in place, you can add the many interactions offered by DSPy with context retrieval powered by Qdrant. 
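As a concrete illustration, here is one way the elided pieces of the `RAG` module above might be filled in, using `dspy.ChainOfThought` with a plain string signature. This is a sketch: the signature text, the `num_passages` default, and the fields returned in `dspy.Prediction` are illustrative choices, not the only way to wire it up.

```python
import dspy

class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        # Retrieve passages from the Qdrant-backed retriever configured above
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Generate an answer conditioned on the retrieved context
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

rag = RAG()
print(rag(question="Some question about my data").answer)
```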
-<|page-142-lllmstxt|> -## role-management -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud rbac](https://qdrant.tech/documentation/cloud-rbac/) -- Role Management +## Next steps -# [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#role-management) Role Management +- Find DSPy usage docs and examples [here](https://github.com/stanfordnlp/dspy#4-documentation--tutorials). -> 💡 You can access this in **Access Management > User & Role Management** _if available see [this page for details](https://qdrant.tech/documentation/cloud-rbac/)._ +- [Source Code](https://github.com/stanfordnlp/dspy/blob/main/dspy/retrieve/qdrant_rm.py) -A **Role** contains a set of **permissions** that define the ability to perform or control specific actions in Qdrant Cloud. Permissions are accessible through the Permissions tab in the Role Details page and offer fine-grained access control, logically grouped for easy identification. +<|page-263-lllmstxt|> +## Sycamore -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#built-in-roles) Built-In Roles +[Sycamore](https://sycamore.readthedocs.io/en/stable/) is an LLM-powered data preparation, processing, and analytics system for complex, unstructured documents like PDFs, HTML, presentations, and more. With Aryn, you can prepare data for GenAI and RAG applications, power high-quality document processing workflows, and run analytics on large document collections with natural language. -Qdrant Cloud includes some built-in roles for common use-cases. The permissions for these built-in roles cannot be changed. +You can use the Qdrant connector to write into and read documents from Qdrant collections. -There are three types: + -- The **Base Role** is assigned to all users, and provides the minimum privileges required to access Qdrant Cloud. -- The **Admin Role**  has all available permissions, except for account write permissions. -- The **Owner Role** has all available permissions assigned, including account write permissions. There can only be one Owner per account currently. +## Writing to Qdrant -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/built-in-roles.png) +To write a Docset to a Qdrant collection in Sycamore, use the `docset.write.qdrant(....)` function. The Qdrant writer accepts the following arguments: -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#custom-roles) Custom Roles +- `client_params`: Parameters that are passed to the Qdrant client constructor. See more information in the [Client API Reference](https://python-client.qdrant.tech/qdrant_client.qdrant_client). +- `collection_params`: Parameters that are passed into the `qdrant_client.QdrantClient.create_collection` method. See more information in the [Client API Reference](https://python-client.qdrant.tech/_modules/qdrant_client/qdrant_client#QdrantClient.create_collection). +- `vector_name`: The name of the vector in the Qdrant collection. Defaults to `None`. +- `execute`: Execute the pipeline and write to Qdrant on adding this operator. If `False`, will return a `DocSet` with this write in the plan. Defaults to `True`. +- `kwargs`: Keyword arguments to pass to the underlying execution engine. -An authorized user can create their own custom roles with specific sets of permissions, giving them more control over who has what access to which resource. 
+```python +ds.write.qdrant( + { + "url": "http://localhost:6333", + "timeout": 50, + }, + { + "collection_name": "{collection_name}", + "vectors_config": { + "size": 384, + "distance": "Cosine", + }, + }, +) -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/custom-roles.png) +``` -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#creating-a-custom-role) Creating a Custom Role +## Reading from Qdrant -To create a new custom role, click on the **Add** button at the top-right corner of the **Custom Roles** list. +To read a Docset from a Qdrant collection in Sycamore, use the `docset.read.qdrant(....)` function. The Qdrant reader accepts the following arguments: -- **Role Name**: Must be unique across roles. -- **Role Description**: Brief description of the role’s purpose. +- `client_params`: Parameters that are passed to the Qdrant client constructor. See more information in the[Client API Reference](https://python-client.qdrant.tech/qdrant_client.qdrant_client). +- `query_params`: Parameters that are passed into the `qdrant_client.QdrantClient.query_points` method. See more information in the [Client API Reference](https://python-client.qdrant.tech/_modules/qdrant_client/qdrant_client#QdrantClient.query_points). +- `kwargs`: Keyword arguments to pass to the underlying execution engine. -Once created, the new role will appear under the **Custom Roles** section in the navigation. +```python +docs = ctx.read.qdrant( + { + "url": "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333", + "api_key": "", + }, + {"collection_name": "{collection_name}", "limit": 100, "using": "{optional_vector_name}"}, +).take_all() -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/create-custom-role.png) +``` -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#editing-a-custom-role) Editing a Custom Role +## 📚 Further Reading -To update a specific role’s permissions, select it from the list and click on the **Permissions** tab. Here, you’ll find logically grouped options that are easy to identify and edit as needed. Once you’ve made your changes, save them to apply the updated permissions to the role. +- [Sycamore Reference](https://sycamore.readthedocs.io/en/stable/) +- [Sycamore](https://github.com/aryn-ai/sycamore/tree/main/examples) -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/update-permission.png) +<|page-264-lllmstxt|> +![Terraform Logo](/documentation/platforms/terraform/terraform.png) -### [Anchor](https://qdrant.tech/documentation/cloud-rbac/role-management/\#renaming-deleting-and-duplicating-a-custom-role) Renaming, Deleting and Duplicating a Custom Role +HashiCorp Terraform is an infrastructure as code tool that lets you define both cloud and on-prem resources in human-readable configuration files that you can version, reuse, and share. You can then use a consistent workflow to provision and manage all of your infrastructure throughout its lifecycle. -Each custom role can be renamed, duplicated or deleted via the action buttons located to the right of the role title bar. +With the [Qdrant Terraform Provider](https://registry.terraform.io/providers/qdrant/qdrant-cloud/latest), you can manage the Qdrant cloud lifecycle leveraging all the goodness of Terraform. -- **Rename**: Opens a dialog allowing users to update both the role name and description. -- **Delete**: Triggers a confirmation prompt to confirm the deletion. 
Once confirmed, this action is irreversible. Any users assigned to the deleted role will automatically be unassigned from it. -- **Duplicate:** Opens a dialog asking for a confirmation and also allowing users to view the list of permissions that will be assigned to the duplicated role +## Pre-requisites -![image.png](https://qdrant.tech/documentation/cloud/role-based-access-control/role-actions.png) +To use the Qdrant Terraform Provider, you'll need: -##### Was this page useful? +1. A [Terraform installation](https://developer.hashicorp.com/terraform/install). +2. An [API key](/documentation/qdrant-cloud-api/#authentication-connecting-to-cloud-api) to access the Qdrant cloud API. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Example Usage -Thank you for your feedback! 🙏 +The following example creates a new Qdrant cluster in Google Cloud Platform (GCP) and returns the URL of the cluster. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/role-management.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```terraform +terraform { + required_version = ">= 1.7.0" + required_providers { + qdrant-cloud = { + source = "qdrant/qdrant-cloud" + version = ">=1.1.0" + } + } +} -On this page: +provider "qdrant-cloud" { + api_key = "" + account_id = "QDRANT_ACCOUNT_ID>" // Account ID from cloud.qdrant.io/accounts// (can be overriden on resource level) +} -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/role-management.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +resource "qdrant-cloud_accounts_cluster" "example" { + name = "tf-example-cluster" + cloud_provider = "gcp" + cloud_region = "us-east4" + configuration { + number_of_nodes = 1 + node_configuration { + package_id = "7c939d96-d671-4051-aa16-3b8b7130fa42" + } + } +} -× +output "url" { + value = qdrant-cloud_accounts_cluster.example.url +} +``` -[Powered by](https://qdrant.tech/) +The provider includes the following resources and data-sources to work with: -<|page-143-lllmstxt|> -## databricks -- [Documentation](https://qdrant.tech/documentation/) -- [Send data](https://qdrant.tech/documentation/send-data/) -- Qdrant on Databricks +## Resources -# [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#qdrant-on-databricks) Qdrant on Databricks +- `qdrant-cloud_accounts_cluster` - Create clusters on Qdrant cloud - [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/resources/accounts_cluster.md) -| Time: 30 min | Level: Intermediate | [Complete Notebook](https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/4750876096379825/93425612168199/6949977306828869/latest.html) | -| --- | --- | --- | +- `qdrant-cloud_accounts_auth_key` - Create API keys for Qdrant cloud clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/resources/accounts_auth_key.md) -[Databricks](https://www.databricks.com/) is a unified analytics platform for working with big data and AI. It’s built around Apache Spark, a powerful open-source distributed computing system well-suited for processing large-scale datasets and performing complex analytics tasks. 
+## Data Sources -Apache Spark is designed to scale horizontally, meaning it can handle expensive operations like generating vector embeddings by distributing computation across a cluster of machines. This scalability is crucial when dealing with large datasets. +- `qdrant-cloud_accounts_auth_keys` - List API keys for Qdrant clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_auth_keys.md) -In this example, we will demonstrate how to vectorize a dataset with dense and sparse embeddings using Qdrant’s [FastEmbed](https://qdrant.github.io/fastembed/) library. We will then load this vectorized data into a Qdrant cluster using the [Qdrant Spark connector](https://qdrant.tech/documentation/frameworks/spark/) on Databricks. +- `qdrant-cloud_accounts_cluster` - Get Cluster Information. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_cluster.md) -### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#setting-up-a-databricks-project) Setting up a Databricks project +- `qdrant-cloud_accounts_clusters` - List Qdrant clusters. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/accounts_clusters.md) -- Set up a **[Databricks cluster](https://docs.databricks.com/en/compute/configure.html)** following the official documentation guidelines. +- `qdrant-cloud_booking_packages` - Get detailed information about the packages/subscriptions available. [Reference](https://github.com/qdrant/terraform-provider-qdrant-cloud/blob/main/docs/data-sources/booking_packages.md) -- Install the **[Qdrant Spark connector](https://qdrant.tech/documentation/frameworks/spark/)** as a library: +## Further Reading - - Navigate to the `Libraries` section in your cluster dashboard. +- [Provider Documentation](https://registry.terraform.io/providers/qdrant/qdrant-cloud/latest/docs) +- [Terraform Quickstart](https://developer.hashicorp.com/terraform/tutorials) - - Click on `Install New` at the top-right to open the library installation modal. +<|page-265-lllmstxt|> +# Testcontainers - - Search for `io.qdrant:spark:VERSION` in the Maven packages and click on `Install`. +[Testcontainers](https://testcontainers.com/) is a testing library that provides easy and lightweight APIs for bootstrapping integration tests with real services wrapped in Docker containers. - ![Install the library](https://qdrant.tech/documentation/examples/databricks/library-install.png) -- Create a new **[Databricks notebook](https://docs.databricks.com/en/notebooks/index.html)** on your cluster to begin working with your data and libraries. +Qdrant is available as a [Testcontainers module](https://testcontainers.com/modules/qdrant/) in multiple languages. It facilitates the spawning of a Qdrant instance for end-to-end testing. 
+## Usage -### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#download-a-dataset) Download a dataset +```java +import org.testcontainers.qdrant.QdrantContainer; -- **Install the required dependencies:** +QdrantContainer qdrantContainer = new QdrantContainer("qdrant/qdrant"); +``` -```python -%pip install fastembed datasets +```go +import ( + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/modules/qdrant" +) +qdrantContainer, err := qdrant.RunContainer(ctx, testcontainers.WithImage("qdrant/qdrant")) ``` -- **Download the dataset:** +```typescript +import { QdrantContainer } from "@testcontainers/qdrant"; + +const qdrantContainer = await new QdrantContainer("qdrant/qdrant").start(); +``` ```python -from datasets import load_dataset +from testcontainers.qdrant import QdrantContainer -dataset_name = "tasksource/med" -dataset = load_dataset(dataset_name, split="train") -# We'll use the first 100 entries from this dataset and exclude some unused columns. -dataset = dataset.select(range(100)).remove_columns(["gold_label", "genre"]) +qdrant_container = QdrantContainer("qdrant/qdrant").start() +``` + +```csharp +var qdrantContainer = new QdrantBuilder() + .WithImage("qdrant/qdrant") + .Build(); +await qdrantContainer.StartAsync(); ``` -- **Convert the dataset into a Spark dataframe:** +Testcontainers modules provide options/methods to configure ENVs, volumes, and virtually everything you can configure in a Docker container. -```python -dataset.to_parquet("/dbfs/pq.pq") -dataset_df = spark.read.parquet("file:/dbfs/pq.pq") +## Further reading -``` +- [Testcontainers Guides](https://testcontainers.com/guides/) +- [Testcontainers Qdrant Module](https://testcontainers.com/modules/qdrant/) -### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#vectorizing-the-data) Vectorizing the data +<|page-266-lllmstxt|> +# ToolJet -In this section, we’ll be generating both dense and sparse vectors for our rows using [FastEmbed](https://qdrant.github.io/fastembed/). We’ll create a user-defined function (UDF) to handle this step. +[ToolJet](https://www.tooljet.com) is a low-code platform for building business applications. Connect to databases, cloud storages, GraphQL, API endpoints, Airtable, Google sheets, OpenAI, etc and build apps using drag and drop application builder. -#### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#creating-the-vectorization-function) Creating the vectorization function +## Prerequisites -```python -from fastembed import TextEmbedding, SparseTextEmbedding +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. A [ToolJet instance](https://www.tooljet.com) to develop your workflows. -def vectorize(partition_data): - # Initialize dense and sparse models - dense_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5") - sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25") +## Setting Up - for row in partition_data: - # Generate dense and sparse vectors - dense_vector = next(dense_model.embed(row.sentence1)) - sparse_vector = next(sparse_model.embed(row.sentence2)) +- Search for the Qdrant plugin in the Tooljet [plugins marketplace](https://docs.tooljet.ai/docs/marketplace/plugins/marketplace-plugin-qdrant/). 
- yield [\ - row.sentence1, # 1st column: original text\ - row.sentence2, # 2nd column: original text\ - dense_vector.tolist(), # 3rd column: dense vector\ - sparse_vector.indices.tolist(), # 4th column: sparse vector indices\ - sparse_vector.values.tolist(), # 5th column: sparse vector values\ - ] +- Set up the connection to Qdrant using your instance credentials. -``` +![Qdrant Connection](/documentation/platforms/tooljet/tooljet-connection.png) -We’re using the [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model for dense embeddings and [BM25](https://huggingface.co/Qdrant/bm25) for sparse embeddings. +You can interface with the Qdrant instance using the following Tooljet operations. -#### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#applying-the-udf-on-our-dataframe) Applying the UDF on our dataframe +- List Collections - Get the names of all the available collections in the Qdrant instance. -Next, let’s apply our `vectorize` UDF on our Spark dataframe to generate embeddings. +![Qdrant List Collections](/documentation/platforms/tooljet/tooljet-list-collections.png) -```python -embeddings = dataset_df.rdd.mapPartitions(vectorize) +- Collection Info - Get the configuration of a specific collection. -``` +![Qdrant Collection Info](/documentation/platforms/tooljet/tooljet-collection-info.png) -The `mapPartitions()` method returns a [Resilient Distributed Dataset (RDD)](https://www.databricks.com/glossary/what-is-rdd) which should then be converted back to a Spark dataframe. +- Upsert Points - Add points to a collection. -#### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#building-the-new-spark-dataframe-with-the-vectorized-data) Building the new Spark dataframe with the vectorized data +![Qdrant Upsert Points](/documentation/platforms/tooljet/tooljet-upsert-points.png) -We’ll now create a new Spark dataframe ( `embeddings_df`) with the vectorized data using the specified schema. +- Get Points - Get points from a collection by IDs or [filters](https://qdrant.tech/documentation/concepts/filtering/). -```python -from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType +![Qdrant Get Points](/documentation/platforms/tooljet/tooljet-get-points.png) -# Define the schema for the new dataframe -schema = StructType([\ - StructField("sentence1", StringType()),\ - StructField("sentence2", StringType()),\ - StructField("dense_vector", ArrayType(FloatType())),\ - StructField("sparse_vector_indices", ArrayType(IntegerType())),\ - StructField("sparse_vector_values", ArrayType(FloatType()))\ -]) +- Delete Points - Delete points from a collection by [filters](https://qdrant.tech/documentation/concepts/filtering/). -# Create the new dataframe with the vectorized data -embeddings_df = spark.createDataFrame(data=embeddings, schema=schema) +![Qdrant Delete Points](/documentation/platforms/tooljet/tooljet-delete-points.png) -``` +- Query Points - [Search](https://qdrant.tech/documentation/concepts/search/) for points in a collection. -### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#uploading-the-data-to-qdrant) Uploading the data to Qdrant +![Qdrant Query Points](/documentation/platforms/tooljet/tooljet-query-points.png) -- **Create a Qdrant collection:** +## Further Reading +- [ToolJet Documentation](https://docs.tooljet.com/docs/). +- [ToolJet Qdrant Plugin](https://docs.tooljet.ai/docs/marketplace/plugins/marketplace-plugin-qdrant/). 
- - [Follow the documentation](https://qdrant.tech/documentation/concepts/collections/#create-a-collection) to create a collection with the appropriate configurations. Here’s an example request to support both dense and sparse vectors: +<|page-267-lllmstxt|> +# Twelve Labs -```json -PUT /collections/{collection_name} -{ - "vectors": { - "dense": { - "size": 384, - "distance": "Cosine" - } - }, - "sparse_vectors": { - "sparse": {} - } -} +[Twelve Labs](https://twelvelabs.io) Embed API provides powerful embeddings that represent videos, texts, images, and audio in a unified vector space. This space enables any-to-any searches across different types of content. -``` +By natively processing all modalities, it captures interactions like visual expressions, speech, and context, enabling advanced applications such as sentiment analysis, anomaly detection, and recommendation systems with precision and efficiency. -- **Upload the dataframe to Qdrant:** +We'll look at how to work with Twelve Labs embeddings in Qdrant via the Python and Node SDKs. +### Installing the SDKs ```python -options = { - "qdrant_url": "", - "api_key": "", - "collection_name": "", - "vector_fields": "dense_vector", - "vector_names": "dense", - "sparse_vector_value_fields": "sparse_vector_values", - "sparse_vector_index_fields": "sparse_vector_indices", - "sparse_vector_names": "sparse", - "schema": embeddings_df.schema.json(), -} - -embeddings_df.write.format("io.qdrant.spark.Qdrant").options(**options).mode( - "append" -).save() +$ pip install twelvelabs qdrant-client +``` +```typescript +$ npm install twelvelabs-js @qdrant/js-client-rest ``` -Ensure to replace the placeholder values ( ``, ``, ``) with your actual values. If the `id_field` option is not specified, Qdrant Spark connector generates random UUIDs for each point. +### Setting up the clients -The command output you should see is similar to: +```python +from twelvelabs import TwelveLabs +from qdrant_client import QdrantClient -```console -Command took 40.37 seconds -- by xxxxx90@xxxxxx.com at 4/17/2024, 12:13:28 PM on fastembed +# Get your API keys from: +# https://playground.twelvelabs.io/dashboard/api-key +TL_API_KEY = "" +twelvelabs_client = TwelveLabs(api_key=TL_API_KEY) +qdrant_client = QdrantClient(url="http://localhost:6333/") ``` -### [Anchor](https://qdrant.tech/documentation/send-data/databricks/\#conclusion) Conclusion - -That wraps up our tutorial! Feel free to explore more functionalities and experiments with different models, parameters, and features available in Databricks, Spark, and Qdrant. +```typescript +import { QdrantClient } from '@qdrant/js-client-rest'; +import { TwelveLabs, EmbeddingsTask, SegmentEmbedding } from 'twelvelabs-js'; -Happy data engineering! +// Get your API keys from: +// https://playground.twelvelabs.io/dashboard/api-key +const TL_API_KEY = "" -##### Was this page useful? +const twelveLabsClient = new TwelveLabs({ apiKey: TL_API_KEY }); +const qdrantClient = new QdrantClient({ url: 'http://localhost:6333' }); +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +The following example uses the `"Marengo-retrieval-2.7"` model to embed a video. It generates vector embeddings of 1024 dimensionality and works with cosine similarity. -Thank you for your feedback! 🙏 +You can use the same model to embed audio, text and images into a common vector space. Enabling cross-modality searches! -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/databricks.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### Embedding videos -On this page: +```python +task = twelvelabs_client.embed.task.create( + model_name="Marengo-retrieval-2.7", + video_url="https://sample-videos.com/video321/mp4/720/big_buck_bunny_720p_2mb.mp4" +) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/send-data/databricks.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +task.wait_for_done(sleep_interval=3) -× +task_result = twelvelabs_client.embed.task.retrieve(task.id) +``` -[Powered by](https://qdrant.tech/) +```typescript +const task = await twelveLabsClient.embed.task.create("Marengo-retrieval-2.7", { + url: "https://sample-videos.com/video321/mp4/720/big_buck_bunny_720p_2mb.mp4" +}) -<|page-144-lllmstxt|> -## agentic-rag -- [Articles](https://qdrant.tech/articles/) -- What is Agentic RAG? Building Agents with Qdrant +await task.waitForDone(3) -[Back to RAG & GenAI](https://qdrant.tech/articles/rag-and-genai/) +const taskResult = await twelveLabsClient.embed.task.retrieve(task.id) +``` -# What is Agentic RAG? Building Agents with Qdrant +### Converting the model outputs to Qdrant points -Kacper Ɓukawski +```python +from qdrant_client.models import PointStruct -· +points = [ + PointStruct( + id=idx, + vector=v.embeddings_float, + payload={ + "start_offset_sec": v.start_offset_sec, + "end_offset_sec": v.end_offset_sec, + "embedding_scope": v.embedding_scope, + }, + ) + for idx, v in enumerate(task_result.video_embedding.segments) +] +``` -November 22, 2024 +```typescript +let points = taskResult.videoEmbedding.segments.map((data, i) => { + return { + id: i, + vector: data.embeddingsFloat, + payload: { + startOffsetSec: data.startOffsetSec, + endOffsetSec: data.endOffsetSec, + embeddingScope: data.embeddingScope + } + } +}) +``` -![What is Agentic RAG? Building Agents with Qdrant](https://qdrant.tech/articles_data/agentic-rag/preview/title.jpg) +### Creating a collection to insert the vectors -Standard [Retrieval Augmented Generation](https://qdrant.tech/articles/what-is-rag-in-ai/) follows a predictable, linear path: receive -a query, retrieve relevant documents, and generate a response. In many cases that might be enough to solve a particular -problem. In the worst case scenario, your LLM will just decide to not answer the question, because the context does not -provide enough information. +```python +from qdrant_client.models import VectorParams, Distance -![Standard, linear RAG pipeline](https://qdrant.tech/articles_data/agentic-rag/linear-rag.png) +collection_name = "twelve_labs_collection" -On the other hand, we have agents. These systems are given more freedom to act, and can take multiple non-linear steps -to achieve a certain goal. There isn’t a single definition of what an agent is, but in general, it is an application -that uses LLM and usually some tools to communicate with the outside world. LLMs are used as decision-makers which -decide what action to take next. Actions can be anything, but they are usually well-defined and limited to a certain -set of possibilities. One of these actions might be to query a vector database, like Qdrant, to retrieve relevant -documents, if the context is not enough to make a decision. 
However, RAG is just a single tool in the agent’s arsenal. +qdrant_client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1024, + distance=Distance.COSINE, + ), +) +qdrant_client.upsert(collection_name, points) +``` -![AI Agent](https://qdrant.tech/articles_data/agentic-rag/ai-agent.png) +```typescript +const COLLECTION_NAME = "twelve_labs_collection" -## [Anchor](https://qdrant.tech/articles/agentic-rag/\#agentic-rag-combining-rag-with-agents) Agentic RAG: Combining RAG with Agents +await qdrantClient.createCollection(COLLECTION_NAME, { + vectors: { + size: 1024, + distance: 'Cosine', + } +}); -Since the agent definition is vague, the concept of **Agentic RAG** is also not well-defined. In general, it refers to -the combination of RAG with agents. This allows the agent to use external knowledge sources to make decisions, and -primarily to decide when the external knowledge is needed. We can describe a system as Agentic RAG if it breaks the -linear flow of a standard RAG system, and gives the agent the ability to take multiple steps to achieve a goal. +await qdrantClient.upsert(COLLECTION_NAME, { + wait: true, + points +}) +``` -A simple router that chooses a path to follow is often described as the simplest form of an agent. Such a system has -multiple paths with conditions describing when to take a certain path. In the context of Agentic RAG, the agent can -decide to query a vector database if the context is not enough to answer, or skip the query if it’s enough, or when the -question refers to common knowledge. Alternatively, there might be multiple collections storing different kinds of -information, and the agent can decide which collection to query based on the context. The key factor is that the -decision of choosing a path is made by the LLM, which is the core of the agent. A routing agent never comes back to the -previous step, so it’s ultimately just a conditional decision-making system. +## Perform a search -![Routing Agent](https://qdrant.tech/articles_data/agentic-rag/routing-agent.png) +Once the vectors are added, you can run semantic searches across different modalities. Let's try text. -However, routing is just the beginning. Agents can be much more complex, and extreme forms of agents can have complete -freedom to act. In such cases, the agent is given a set of tools and can autonomously decide which ones to use, how to -use them, and in which order. LLMs are asked to plan and execute actions, and the agent can take multiple steps to -achieve a goal, including taking steps back if needed. Such a system does not have to follow a DAG structure (Directed -Acyclic Graph), and can have loops that help to self-correct the decisions made in the past. An agentic RAG system -built in that manner can have tools not only to query a vector database, but also to play with the query, summarize the -results, or even generate new data to answer the question. Options are endless, but there are some common patterns -that can be observed in the wild. 
+```python +text_segment = twelvelabs_client.embed.create( + model_name="Marengo-retrieval-2.7", + text="", +).text_embedding.segments[0] -![Autonomous Agent](https://qdrant.tech/articles_data/agentic-rag/autonomous-agent.png) +qdrant_client.query_points( + collection_name=collection_name, + query=text_segment.embeddings_float, +) +``` -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#solving-information-retrieval-problems-with-llms) Solving Information Retrieval Problems with LLMs +```typescript +const textSegment = (await twelveLabsClient.embed.create({ + modelName: "Marengo-retrieval-2.7", + text: "" +})).textEmbedding.segments[0] -Generally speaking, tools exposed in an agentic RAG system are used to solve information retrieval problems which are -not new to the search community. LLMs have changed how we approach these problems, but the core of the problem remains -the same. What kind of tools you can consider using in an agentic RAG? Here are some examples: +await qdrantClient.query(COLLECTION_NAME, { + query: textSegment.embeddingsFloat, +}); +``` -- **Querying a vector database** \- the most common tool used in agentic RAG systems. It allows the agent to retrieve -relevant documents based on the query. -- **Query expansion** \- a tool that can be used to improve the query. It can be used to add synonyms, correct typos, or -even to generate new queries based on the original one. -![Query expansion example](https://qdrant.tech/articles_data/agentic-rag/query-expansion.png) -- **Extracting filters** \- vector search alone is sometimes not enough. In many cases, you might want to narrow down -the results based on specific parameters. This extraction process can automatically identify relevant conditions from -the query. Otherwise, your users would have to manually define these search constraints. -![Extracting filters](https://qdrant.tech/articles_data/agentic-rag/extracting-filters.png) -- **Quality judgement** \- knowing the quality of the results for given query can be used to decide whether they are good -enough to answer, or if the agent should take another step to improve them somehow. Alternatively it can also admit -the failure to provide good response. -![Quality judgement](https://qdrant.tech/articles_data/agentic-rag/quality-judgement.png) +Let's try audio: -These are just some of the examples, but the list is not exhaustive. For example, your LLM could possibly play with -Qdrant search parameters or choose different methods to query it. An example? If your users are searching using some -specific keywords, you may prefer sparse vectors to dense vectors, as they are more efficient in such cases. In that -case you have to arm your agent with tools to decide when to use sparse vectors and when to use dense vectors. Agent -aware of the collection structure can make such decisions easily. +```python +audio_segment = twelvelabs_client.embed.create( + model_name="Marengo-retrieval-2.7", + audio_url="https://codeskulptor-demos.commondatastorage.googleapis.com/descent/background%20music.mp3", +).audio_embedding.segments[0] -Each of these tools might be a separate agent on its own, and multi-agent systems are not uncommon. In such cases, -agents can communicate with each other, and one agent can decide to use another agent to solve a particular problem. -Pretty useful component of an agentic RAG is also a human in the loop, which can be used to correct the agent’s -decisions, or steer it in the right direction. 
+qdrant_client.query_points( + collection_name=collection_name, + query=audio_segment.embeddings_float, +) +``` -## [Anchor](https://qdrant.tech/articles/agentic-rag/\#where-are-agents-used) Where are Agents Used? +```typescript +const audioSegment = (await twelveLabsClient.embed.create({ + modelName: "Marengo-retrieval-2.7", + audioUrl: "https://codeskulptor-demos.commondatastorage.googleapis.com/descent/background%20music.mp3" +})).audioEmbedding.segments[0] -Agents are an interesting concept, but since they heavily rely on LLMs, they are not applicable to all problems. Using -Large Language Models is expensive and tend to be slow, what in many cases, it’s not worth the cost. Standard RAG -involves just a single call to the LLM, and the response is generated in a predictable way. Agents, on the other hand, -can take multiple steps, and the latency experienced by the user adds up. In many cases, it’s not acceptable. -Agentic RAG is probably not that widely applicable in ecommerce search, where the user expects a quick response, but -might be fine for customer support, where the user is willing to wait a bit longer for a better answer. +await qdrantClient.query(COLLECTION_NAME, { + query: audioSegment.embeddingsFloat, +}); +``` -## [Anchor](https://qdrant.tech/articles/agentic-rag/\#which-framework-is-best) Which Framework is Best? +Similarly, querying by image: -There are lots of frameworks available to build agents, and choosing the best one is not easy. It depends on your -existing stack or the tools you are familiar with. Some of the most popular LLM libraries have already drifted towards -the agent paradigm, and they are offering tools to build them. There are, however, some tools built primarily for -agents development, so let’s focus on them. +```python +image_segment = twelvelabs_client.embed.create( + model_name="Marengo-retrieval-2.7", + image_url="https://gratisography.com/wp-content/uploads/2024/01/gratisography-cyber-kitty-1170x780.jpg", +).image_embedding.segments[0] -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#langgraph) LangGraph +qdrant_client.query_points( + collection_name=collection_name, + query=image_segment.embeddings_float, +) +``` -Developed by the LangChain team, LangGraph seems like a natural extension for those who already use LangChain for -building their RAG systems, and would like to start with agentic RAG. - -Surprisingly, LangGraph has nothing to do with Large Language Models on its own. It’s a framework for building -graph-based applications in which each **node** is a step of the workflow. Each node takes an application **state** as -an input, and produces a modified state as an output. The state is then passed to the next node, and so on. **Edges** -between the nodes might be conditional what makes branching possible. Contrary to some DAG-based tool (i.e. Apache -Airflow), LangGraph allows for loops in the graph, which makes it possible to implement cyclic workflows, so an agent -can achieve self-reflection and self-correction. Theoretically, LangGraph can be used to build any kind of applications -in a graph-based manner, not only LLM agents. 
+```typescript +const imageSegment = (await twelveLabsClient.embed.create({ + modelName: "Marengo-retrieval-2.7", + imageUrl: "https://gratisography.com/wp-content/uploads/2024/01/gratisography-cyber-kitty-1170x780.jpg" +})).imageEmbedding.segments[0] -Some of the strengths of LangGraph include: +await qdrantClient.query(COLLECTION_NAME, { + query: imageSegment.embeddingsFloat, +}); +``` -- **Persistence** \- the state of the workflow graph is stored as a checkpoint. That happens at each so-called super-step -(which is a single sequential node of a graph). It enables replying certain steps of the workflow, fault-tolerance, -and including human-in-the-loop interactions. This mechanism also acts as a **short-term memory**, accessible in a -context of a particular workflow execution. -- **Long-term memory** \- LangGraph also has a concept of memories that are shared between different workflow runs. -However, this mechanism has to explicitly handled by our nodes. **Qdrant with its semantic search capabilities is** -**often used as a long-term memory layer**. -- **Multi-agent support** \- while there is no separate concept of multi-agent systems in LangGraph, it’s possible to -create such an architecture by building a graph that includes multiple agents and some kind of supervisor that -makes a decision which agent to use in a given situation. If a node might be anything, then it might be another agent -as well. +## Further Reading -Some other interesting features of LangGraph include the ability to visualize the graph, automate the retries of failed -steps, and include human-in-the-loop interactions. +- [Twelve Labs Documentation](https://docs.twelvelabs.io/) +- [Twelve Labs Examples](https://docs.twelvelabs.io/docs/sample-applications) -A minimal example of an agentic RAG could improve the user query, e.g. by fixing typos, expanding it with synonyms, or -even generating a new query based on the original one. The agent could then retrieve documents from a vector database -based on the improved query, and generate a response. The LangGraph app implementing this approach could look like this: +<|page-268-lllmstxt|> +# txtai -```python -from typing import Sequence -from typing_extensions import TypedDict, Annotated -from langchain_core.messages import BaseMessage -from langgraph.constants import START, END -from langgraph.graph import add_messages, StateGraph +Qdrant might be also used as an embedding backend in [txtai](https://neuml.github.io/txtai/) semantic applications. -class AgentState(TypedDict): - # The state of the agent includes at least the messages exchanged between the agent(s) - # and the user. It is, however, possible to include other information in the state, as - # it depends on the specific agent. - messages: Annotated[Sequence[BaseMessage], add_messages] +txtai simplifies building AI-powered semantic search applications using Transformers. It leverages the neural embeddings and their +properties to encode high-dimensional data in a lower-dimensional space and allows to find similar objects based on their embeddings' +proximity. -def improve_query(state: AgentState): - ... +Qdrant is not built-in txtai backend and requires installing an additional dependency: -def retrieve_documents(state: AgentState): - ... +```bash +pip install qdrant-txtai +``` -def generate_response(state: AgentState): - ... +The examples and some more information might be found in [qdrant-txtai repository](https://github.com/qdrant/qdrant-txtai). 
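+As a rough illustration, a txtai configuration pointing at Qdrant might look like the sketch below. The backend path and the `qdrant` option names are assumptions based on the qdrant-txtai project and may differ between versions, so check the repository linked above:
+
+```python
+from txtai.embeddings import Embeddings
+
+# Store the index in Qdrant instead of the default ANN backend (assumed backend path)
+embeddings = Embeddings({
+    "path": "sentence-transformers/all-MiniLM-L6-v2",
+    "backend": "qdrant_txtai.ans.qdrant.Qdrant",
+    "qdrant": {"host": "localhost", "port": 6333},
+})
+
+embeddings.index([(0, "Qdrant is the best vector search engine!", None)])
+print(embeddings.search("vector similarity search", 1))
+```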
-# Building a graph requires defining nodes and building the flow between them with edges. -builder = StateGraph(AgentState) +<|page-269-lllmstxt|> +# Unstructured -builder.add_node("improve_query", improve_query) -builder.add_node("retrieve_documents", retrieve_documents) -builder.add_node("generate_response", generate_response) +[Unstructured](https://unstructured.io/) is a library designed to help preprocess, structure unstructured text documents for downstream machine learning tasks. -builder.add_edge(START, "improve_query") -builder.add_edge("improve_query", "retrieve_documents") -builder.add_edge("retrieve_documents", "generate_response") -builder.add_edge("generate_response", END) +Qdrant can be used as an ingestion destination in Unstructured. -# Compiling the graph performs some checks and prepares the graph for execution. -compiled_graph = builder.compile() +## Setup -# Compiled graph might be invoked with the initial state to start. -compiled_graph.invoke({ - "messages": [\ - ("user", "Why Qdrant is the best vector database out there?"),\ - ] -}) +Install Unstructured with the `qdrant` extra. +```bash +pip install "unstructured-ingest[qdrant]" ``` -Each node of the process is just a Python function that does certain operation. You can call an LLM of your choice -inside of them, if you want to, but there is no assumption about the messages being created by any AI. **LangGraph** -**rather acts as a runtime that launches these functions in a specific order, and passes the state between them**. While -[LangGraph](https://www.langchain.com/langgraph) integrates well with the LangChain ecosystem, it can be used -independently. For teams looking for additional support and features, there’s also a commercial offering called -LangGraph Platform. The framework is available for both Python and JavaScript environments, making it possible to be -used in different tech stacks. +## Usage -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#crewai) CrewAI -CrewAI is another popular choice for building agents, including agentic RAG. It’s a high-level framework that assumes -there are some LLM-based agents working together to achieve a common goal. That’s where the “crew” in CrewAI comes from. -CrewAI is designed with multi-agent systems in mind. Contrary to LangGraph, the developer does not create a graph of -processing, but defines agents and their roles within the crew. +Depending on the use case you can prefer the command line or using it within your application. -Some of the key concepts of CrewAI include: +### CLI -- **Agent** \- a unit that has a specific role and goal, controlled by an LLM. It can optionally use some external tools -to communicate with the outside world, but generally steered by prompt we provide to the LLM. -- **Process** \- currently either sequential or hierarchical. It defines how the task will be executed by the agents. -In a sequential process, agents are executed one after another, while in a hierarchical process, agent is selected -by the manager agent, which is responsible for making decisions about which agent to use in a given situation. -- **Roles and goals** \- each agent has a certain role within the crew, and the goal it should aim to achieve. These are -set when we define an agent and are used to make decisions about which agent to use in a given situation. -- **Memory** \- an extensive memory system consists of short-term memory, long-term memory, entity memory, and contextual -memory that combines the other three. 
There is also user memory for preferences and personalization. **This is where** -**Qdrant comes into play, as it might be used as a long-term memory layer.** +```bash +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + qdrant-cloud \ + --url $QDRANT_URL \ + --api-key $QDRANT_API_KEY \ + --collection-name $QDRANT_COLLECTION \ + --batch-size 50 \ + --num-processes 1 +``` -CrewAI provides a rich set of tools integrated into the framework. That may be a huge advantage for those who want to -combine RAG with e.g. code execution, or image generation. The ecosystem is rich, however brining your own tools is -not a big deal, as CrewAI is designed to be extensible. +For a full list of the options the CLI accepts, run `unstructured-ingest qdrant --help` -A simple agentic RAG application implemented in CrewAI could look like this: +### Programmatic usage ```python -from crewai import Crew, Agent, Task -from crewai.memory.entity.entity_memory import EntityMemory -from crewai.memory.short_term.short_term_memory import ShortTermMemory -from crewai.memory.storage.rag_storage import RAGStorage - -class QdrantStorage(RAGStorage): - ... +import os -response_generator_agent = Agent( - role="Generate response based on the conversation", - goal="Provide the best response, or admit when the response is not available.", - backstory=( - "I am a response generator agent. I generate " - "responses based on the conversation." - ), - verbose=True, -) +from unstructured_ingest.pipeline.pipeline import Pipeline +from unstructured_ingest.interfaces import ProcessorConfig -query_reformulation_agent = Agent( - role="Reformulate the query", - goal="Rewrite the query to get better results. Fix typos, grammar, word choice, etc.", - backstory=( - "I am a query reformulation agent. I reformulate the " - "query to get better results." 
- ), - verbose=True, +from unstructured_ingest.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig ) +from unstructured_ingest.processes.partitioner import PartitionerConfig +from unstructured_ingest.processes.chunker import ChunkerConfig +from unstructured_ingest.processes.embedder import EmbedderConfig -task = Task( - description="Let me know why Qdrant is the best vector database out there.", - expected_output="3 bullet points", - agent=response_generator_agent, +from unstructured_ingest.processes.connectors.qdrant.cloud import ( + CloudQdrantConnectionConfig, + CloudQdrantAccessConfig, + CloudQdrantUploadStagerConfig, + CloudQdrantUploaderConfig ) -crew = Crew( - agents=[response_generator_agent, query_reformulation_agent], - tasks=[task], - memory=True, - entity_memory=EntityMemory(storage=QdrantStorage("entity")), - short_term_memory=ShortTermMemory(storage=QdrantStorage("short-term")), -) -crew.kickoff() +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=CloudQdrantConnectionConfig( + access_config=CloudQdrantAccessConfig( + api_key=os.getenv("QDRANT_API_KEY") + ), + url=os.getenv("QDRANT_URL") + ), + stager_config=CloudQdrantUploadStagerConfig(), + uploader_config=CloudQdrantUploaderConfig( + collection_name=os.getenv("QDRANT_COLLECTION"), + batch_size=50, + num_processes=1 + ) + ).run() ``` -_Disclaimer: QdrantStorage is not a part of the CrewAI framework, but it’s taken from the Qdrant documentation on [how\_\ -_to integrate Qdrant with CrewAI](https://qdrant.tech/documentation/frameworks/crewai/)._ +## Next steps -Although it’s not a technical advantage, CrewAI has a [great documentation](https://docs.crewai.com/introduction). The -framework is available for Python, and it’s easy to get started with it. CrewAI also has a commercial offering, CrewAI -Enterprise, which provides a platform for building and deploying agents at scale. +- Unstructured API [reference](https://unstructured-io.github.io/unstructured/api.html). +- Qdrant ingestion destination [reference](https://docs.unstructured.io/ui/destinations/qdrant). +- [Source Code](https://github.com/Unstructured-IO/unstructured-ingest/tree/main/unstructured_ingest/processes/connectors/qdrant) -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#autogen) AutoGen +<|page-270-lllmstxt|> +# Upstage -AutoGen emphasizes multi-agent architectures as a fundamental design principle. The framework requires at least two -agents in any system to really call an application agentic - typically an assistant and a user proxy exchange messages -to achieve a common goal. Sequential chat with more than two agents is also supported, as well as group chat and nested -chat for internal dialogue. 
However, AutoGen does not assume there is a structured state that is passed between the -agents, and the chat conversation is the only way to communicate between them. +Qdrant supports working with the Solar Embeddings API from [Upstage](https://upstage.ai/). -There are many interesting concepts in the framework, some of them even quite unique: +[Solar Embeddings](https://developers.upstage.ai/docs/apis/embeddings) API features dual models for user queries and document embedding, within a unified vector space, designed for performant text processing. -- **Tools/functions** \- external components that can be used by agents to communicate with the outside world. They are -defined as Python callables, and can be used for any external interaction we want to allow the agent to do. Type -annotations are used to define the input and output of the tools, and Pydantic models are supported for more complex -type schema. AutoGen supports only OpenAI-compatible tool call API for the time being. -- **Code executors** \- built-in code executors include local command, Docker command, and Jupyter. An agent can write -and launch code, so theoretically the agents can do anything that can be done in Python. None of the other frameworks -made code generation and execution that prominent. Code execution being the first-class citizen in AutoGen is an -interesting concept. +You can generate an API key to authenticate the requests from the [Upstage Console](). -Each AutoGen agent uses at least one of the components: human-in-the-loop, code executor, tool executor, or LLM. -A simple agentic RAG, based on the conversation of two agents which can retrieve documents from a vector database, -or improve the query, could look like this: +### Setting up the Qdrant client and Upstage session ```python -from os import environ - -from autogen import ConversableAgent -from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent +import requests from qdrant_client import QdrantClient -client = QdrantClient(...) +UPSTAGE_BASE_URL = "https://api.upstage.ai/v1/solar/embeddings" -response_generator_agent = ConversableAgent( - name="response_generator_agent", - system_message=( - "You answer user questions based solely on the provided context. You ask to retrieve relevant documents for " - "your query, or reformulate the query, if it is incorrect in some way." 
- ), - description="A response generator agent that can answer your queries.", - llm_config={"config_list": [{"model": "gpt-4", "api_key": environ.get("OPENAI_API_KEY")}]}, - human_input_mode="NEVER", -) +UPSTAGE_API_KEY = "" -user_proxy = RetrieveUserProxyAgent( - name="retrieval_user", - llm_config={"config_list": [{"model": "gpt-4", "api_key": environ.get("OPENAI_API_KEY")}]}, - human_input_mode="NEVER", - retrieve_config={ - "task": "qa", - "chunk_token_size": 2000, - "vector_db": "qdrant", - "db_config": {"client": client}, - "get_or_create": True, - "overwrite": True, - }, -) +upstage_session = requests.Session() -result = user_proxy.initiate_chat( - response_generator_agent, - message=user_proxy.message_generator, - problem="Why Qdrant is the best vector database out there?", - max_turns=10, -) +client = QdrantClient(url="http://localhost:6333") + +headers = { + "Authorization": f"Bearer {UPSTAGE_API_KEY}", + "Accept": "application/json", +} +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] ``` -For those new to agent development, AutoGen offers AutoGen Studio, a low-code interface for prototyping agents. While -not intended for production use, it significantly lowers the barrier to entry for experimenting with agent -architectures. +```typescript +import { QdrantClient } from '@qdrant/js-client-rest'; -![AutoGen Studio](https://qdrant.tech/articles_data/agentic-rag/autogen-studio.png) +const UPSTAGE_BASE_URL = "https://api.upstage.ai/v1/solar/embeddings" +const UPSTAGE_API_KEY = "" -It’s worth noting that AutoGen is currently undergoing significant updates, with version 0.4.x in development -introducing substantial API changes compared to the stable 0.2.x release. While the framework currently has limited -built-in persistence and state management capabilities, these features may evolve in future releases. +const client = new QdrantClient({ url: 'http://localhost:6333' }); -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#openai-swarm) OpenAI Swarm +const headers = { + "Authorization": "Bearer " + UPSTAGE_API_KEY, + "Accept": "application/json", + "Content-Type": "application/json" +} -Unliked the other frameworks described in this article, OpenAI Swarm is an educational project, and it’s not ready for -production use. It’s worth mentioning, though, as it’s pretty lightweight and easy to get started with. OpenAI Swarm -is an experimental framework for orchestrating multi-agent workflows that focuses on agent coordination through direct -handoffs rather than complex orchestration patterns. +const texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -With that setup, **agents** are just exchanging messages in a chat, optionally calling some Python functions to -communicate with external services, or handing off the conversation to another agent, if the other one seems to be more -suitable to answer the question. Each agent has a certain role, defined by the instructions we have to define. -We have to decide which LLM will a particular agent use, and a set of functions it can call. For example, **a retrieval** -**agent could use a vector database to retrieve documents**, and return the results to the next agent. That means, there -should be a function that performs the semantic search on its behalf, but the model will decide how the query should -look like. 
+The following example shows how to embed documents with the recommended `solar-embedding-1-large-passage` and `solar-embedding-1-large-query` models that generates sentence embeddings of size 4096. -Here is how a similar agentic RAG application, implemented in OpenAI Swarm, could look like: +### Embedding documents ```python -from swarm import Swarm, Agent +body = { + "input": texts, + "model": "solar-embedding-1-large-passage", +} -client = Swarm() +response_body = upstage_session.post( + UPSTAGE_BASE_URL, headers=headers, json=body +).json() +``` -def retrieve_documents(query: str) -> list[str]: - """ - Retrieve documents based on the query. - """ - ... +```typescript +let body = { + "input": texts, + "model": "solar-embedding-1-large-passage", +} -def transfer_to_query_improve_agent(): - return query_improve_agent +let response = await fetch(UPSTAGE_BASE_URL, { + method: "POST", + body: JSON.stringify(body), + headers +}); -query_improve_agent = Agent( - name="Query Improve Agent", - instructions=( - "You are a search expert that takes user queries and improves them to get better results. You fix typos and " - "extend queries with synonyms, if needed. You never ask the user for more information." - ), -) +let response_body = await response.json() +``` -response_generation_agent = Agent( - name="Response Generation Agent", - instructions=( - "You take the whole conversation and generate a final response based on the chat history. " - "If you don't have enough information, you can retrieve the documents from the knowledge base or " - "reformulate the query by transferring to other agent. You never ask the user for more information. " - "You have to always be the last participant of each conversation." - ), - functions=[retrieve_documents, transfer_to_query_improve_agent], -) +### Converting the model outputs to Qdrant points -response = client.run( - agent=response_generation_agent, - messages=[\ - {\ - "role": "user",\ - "content": "Why Qdrant is the best vector database out there?"\ - }\ - ], -) +```python +from qdrant_client.models import PointStruct +points = [ + PointStruct( + id=idx, + vector=data["embedding"], + payload={"text": text}, + ) + for idx, (data, text) in enumerate(zip(response_body["data"], texts)) +] ``` -Even though we don’t explicitly define the graph of processing, the agents can still decide to hand off the processing -to a different agent. There is no concept of a state, so everything relies on the messages exchanged between different -components. - -OpenAI Swarm does not focus on integration with external tools, and **if you would like to integrate semantic search** -**with Qdrant, you would have to implement it fully yourself**. Obviously, the library is tightly coupled with OpenAI -models, and while using some other ones is possible, it requires some additional work like setting up proxy that will -adjust the interface to OpenAI API. - -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#the-winner) The winner? +```typescript +let points = response_body.data.map((data, i) => { + return { + id: i, + vector: data.embedding, + payload: { + text: texts[i] + } + } +}) +``` -Choosing the best framework for your agentic RAG system depends on your existing stack, team expertise, and the -specific requirements of your project. All the described tools are strong contenders, and they are developed at rapid -pace. It’s worth keeping an eye on all of them, as they are likely to evolve and improve over time. 
Eventually, you -should be able to build the same processes with any of them, but some of them may be more suitable in a specific -ecosystem of the tools you want your agent to interact with. +### Creating a collection to insert the documents -There are, however, some important factors to consider when choosing a framework for your agentic RAG system: +```python +from qdrant_client.models import VectorParams, Distance -- **Human-in-the-loop** \- even though we aim to build autonomous agents, it’s often important to include the feedback -from the human, so our agents cannot perform malicious actions. -- **Observability** \- how easy it is to debug the system, and how easy it is to understand what’s happening inside. -Especially important, since we are dealing with lots of LLM prompts. +collection_name = "example_collection" -Still, choosing the right toolkit depends on the state of your project, and the specific requirements you have. If you -want to integrate your agent with number of external tools, CrewAI might be the best choice, as the set of -out-of-the-box integrations is the biggest. However, LangGraph integrates well with LangChain, so if you are familiar -with that ecosystem, it may suit you better. +client.create_collection( + collection_name, + vectors_config=VectorParams( + size=4096, + distance=Distance.COSINE, + ), +) +client.upsert(collection_name, points) +``` -All the frameworks have different approaches to building agents, so it’s worth experimenting with all of them to see -which one fits your needs the best. LangGraph and CrewAI are more mature and have more features, while AutoGen and -OpenAI Swarm are more lightweight and more experimental. However, **none of the existing frameworks solves all the** -**mentioned Information Retrieval problems**, so you still have to build your own tools to fill the gaps. +```typescript +const COLLECTION_NAME = "example_collection" -## [Anchor](https://qdrant.tech/articles/agentic-rag/\#building-agentic-rag-with-qdrant) Building Agentic RAG with Qdrant +await client.createCollection(COLLECTION_NAME, { + vectors: { + size: 4096, + distance: 'Cosine', + } +}); -No matter which framework you choose, Qdrant is a great tool to build agentic RAG systems. Please check out [our\\ -integrations](https://qdrant.tech/documentation/frameworks/) to choose the best one for your use case and preferences. The easiest way to -start using Qdrant is to use our managed service, [Qdrant Cloud](https://cloud.qdrant.io/). A free 1GB cluster is -available for free, so you can start building your agentic RAG system in minutes. +await client.upsert(COLLECTION_NAME, { + wait: true, + points +}) +``` -### [Anchor](https://qdrant.tech/articles/agentic-rag/\#further-reading) Further Reading +## Searching for documents with Qdrant -See how Qdrant integrates with: +Once all the documents are added, you can search for the most relevant documents. -- [Autogen](https://qdrant.tech/documentation/frameworks/autogen/) -- [CrewAI](https://qdrant.tech/documentation/frameworks/crewai/) -- [LangGraph](https://qdrant.tech/documentation/frameworks/langgraph/) -- [Swarm](https://qdrant.tech/documentation/frameworks/swarm/) +```python +body = { + "input": "What is the best to use for vector search scaling?", + "model": "solar-embedding-1-large-query", +} -##### Was this page useful? 
+response_body = upstage_session.post(
+    UPSTAGE_BASE_URL, headers=headers, json=body
+).json()

-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No

+client.search(
+    collection_name=collection_name,
+    query_vector=response_body["data"][0]["embedding"],
+)
+```

-Thank you for your feedback! 🙏

+```typescript
+body = {
+    "input": "What is the best to use for vector search scaling?",
+    "model": "solar-embedding-1-large-query",
+}

-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/agentic-rag.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.

+response = await fetch(UPSTAGE_BASE_URL, {
+    method: "POST",
+    body: JSON.stringify(body),
+    headers
+});

-On this page:

+response_body = await response.json()

-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/agentic-rag.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)

+await client.search(COLLECTION_NAME, {
+    vector: response_body.data[0].embedding,
+});
+```

-×

+<|page-271-lllmstxt|>
+# Vanna.AI

-[Powered by](https://qdrant.tech/)

+[Vanna](https://vanna.ai/) is a Python package that uses retrieval augmentation to help you generate accurate SQL queries for your database using LLMs.

-![Company Logo](https://cdn.cookielaw.org/logos/static/ot_company_logo.png)

+Vanna works in two easy steps - train a RAG "model" on your data, and then ask questions which will return SQL queries that can be set up to automatically run on your database.

-## Privacy Preference Center

+Qdrant is available as a supported vector store for ingesting and retrieving your RAG data.

-Cookies used on the site are categorized, and below, you can read about each category and allow or deny some or all of them. When categories that have been previously allowed are disabled, all cookies assigned to that category will be removed from your browser.
-Additionally, you can see a list of cookies assigned to each category and detailed information in the cookie declaration.

+## Installation
+```console
+pip install 'vanna[qdrant]'
+```

-[More information](https://qdrant.tech/legal/privacy-policy/#cookies-and-web-beacons)

-Allow All

+## Setup

-### Manage Consent Preferences

+You can set up a Vanna agent using Qdrant as your vector store and any of the [LLMs supported by Vanna](https://vanna.ai/docs/postgres-openai-vanna-vannadb/).

-#### Targeting Cookies

+We'll use OpenAI for demonstration.

-Targeting Cookies

+```python
+from vanna.openai import OpenAI_Chat
+from vanna.qdrant import Qdrant_VectorStore
+from qdrant_client import QdrantClient

-These cookies may be set through our site by our advertising partners. They may be used by those companies to build a profile of your interests and show you relevant adverts on other sites. They do not store directly personal information, but are based on uniquely identifying your browser and internet device. If you do not allow these cookies, you will experience less targeted advertising.

+class MyVanna(Qdrant_VectorStore, OpenAI_Chat):
+    def __init__(self, config=None):
+        Qdrant_VectorStore.__init__(self, config=config)
+        OpenAI_Chat.__init__(self, config=config)
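+# A note on the config keys below (an inference from the imports above):
+# `client` accepts any configured QdrantClient, e.g. QdrantClient("http://localhost:6333")
+# for a local instance or QdrantClient(url=..., api_key=...) for Qdrant Cloud,
+# while `api_key` and `model` are the OpenAI credentials and chat model.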
+vn = MyVanna(config={
+    'client': QdrantClient(...),
+    'api_key': 'sk-...',
+    'model': 'gpt-4-...',
+})
+```

-#### Functional Cookies

+## Usage

-Functional Cookies

+Once a Vanna agent is instantiated, you can connect it to [any SQL database](https://vanna.ai/docs/FAQ/#can-i-use-this-with-my-sql-database) of your choosing.

-These cookies enable the website to provide enhanced functionality and personalisation. They may be set by us or by third party providers whose services we have added to our pages. If you do not allow these cookies then some or all of these services may not function properly.

+For example, Postgres.

-#### Strictly Necessary Cookies

+```python
+vn.connect_to_postgres(host='my-host', dbname='my-dbname', user='my-user', password='my-password', port='my-port')
+```

-Always Active

+You can now train and begin querying your database with SQL.

-These cookies are necessary for the website to function and cannot be switched off in our systems. They are usually only set in response to actions made by you which amount to a request for services, such as setting your privacy preferences, logging in or filling in forms. You can set your browser to block or alert you about these cookies, but some parts of the site will not then work. These cookies do not store any personally identifiable information.

+```python
+# You can add DDL statements that specify table names, column names, types, and potentially relationships
+vn.train(ddl="""
+    CREATE TABLE IF NOT EXISTS my_table (
+        id INT PRIMARY KEY,
+        name VARCHAR(100),
+        age INT
+    )
+""")

-#### Performance Cookies

+# You can add documentation about your business terminology or definitions.
+vn.train(documentation="Our business defines OTIF score as the percentage of orders that are delivered on time and in full")

-Performance Cookies

+# You can also add SQL queries to your training data. This is useful if you have some queries already lying around.
+vn.train(sql="SELECT * FROM my_table WHERE name = 'John Doe'")

-These cookies allow us to count visits and traffic sources so we can measure and improve the performance of our site. They help us to know which pages are the most and least popular and see how visitors move around the site. All information these cookies collect is aggregated and therefore anonymous. If you do not allow these cookies we will not know when you have visited our site, and will not be able to monitor its performance.

+# You can remove training data if there's obsolete/incorrect information.
+vn.remove_training_data(id='1-ddl')

-Back Button

+# Whenever you ask a new question, Vanna will retrieve the 10 most relevant pieces of training data and use them as part of the LLM prompt to generate the SQL.

-### Cookie List

+vn.ask(question="")
+```

-Search Icon

+## Further reading

-Filter Icon

+- [Getting started with Vanna.AI](https://vanna.ai/docs/app/)
+- [Vanna.AI documentation](https://vanna.ai/docs/)
+- [Source Code](https://github.com/vanna-ai/vanna/tree/main/src/vanna/qdrant)

-Clear

+<|page-272-lllmstxt|>
+![VectaX Logo](/documentation/frameworks/mirror-security/vectax-logo.png)

-checkbox labellabel

+[VectaX](https://mirrorsecurity.io/vectax) by Mirror Security is an AI-centric access control and encryption system designed for managing and protecting vector embeddings. It combines similarity-preserving encryption with fine-grained RBAC to enable secure storage, retrieval, and operations on vector data.

-ApplyCancel

+It can be integrated with Qdrant to secure vector searches.
-ConsentLeg.Interest +We'll see how to do so using basic VectaX vector encryption and the sophisticated RBAC mechanism. You can obtain an API key and the Mirror SDK from the [Mirror Security Platform](https://platform.mirrorsecurity.io/en/login). -checkbox labellabel +Let's set up both the VectaX and Qdrant clients. -checkbox labellabel +```python +from mirror_sdk.core.mirror_core import MirrorSDK, MirrorConfig +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams -checkbox labellabel +# Get your API key from +# https://platform.mirrorsecurity.io +config = MirrorConfig( + api_key="", + server_url="https://mirrorapi.azure-api.net/v1", + secret="", +) +mirror_sdk = MirrorSDK(config) -Reject AllConfirm My Choices +# Connects to http://localhost:6333/ by default +qdrant = QdrantClient() +``` -[![Powered by Onetrust](https://cdn.cookielaw.org/logos/static/powered_by_logo.svg)](https://www.onetrust.com/products/cookie-consent/) +## Vector Encryption -<|page-145-lllmstxt|> -## database-tutorials -- [Documentation](https://qdrant.tech/documentation/) -- Using the Database +Now, let's secure vector embeddings using VectaX encryption. -# [Anchor](https://qdrant.tech/documentation/database-tutorials/\#database-tutorials) Database Tutorials +```python +from qdrant_client.models import PointStruct +from mirror_sdk.core.models import VectorData -| | -| --- | -| [Bulk Upload Vectors to a Qdrant Collection](https://qdrant.tech/documentation/database-tutorials/bulk-upload/) | -| [Large Scale Search](https://qdrant.tech/documentation/database-tutorials/large-scale-search/) | -| [Backup and Restore Qdrant Collections Using Snapshots](https://qdrant.tech/documentation/database-tutorials/create-snapshot/) | -| [Load and Search Hugging Face Datasets with Qdrant](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/) | -| [Using Qdrant’s Async API for Efficient Python Applications](https://qdrant.tech/documentation/database-tutorials/async-api/) | -| [Qdrant Migration Guide](https://qdrant.tech/documentation/database-tutorials/migration/) | +# Generate or retrieve vector embeddings +# embedding = generate_document_embedding() -##### Was this page useful? +vector_data = VectorData(vector=embedding, id="doc1") +encrypted = mirror_sdk.vectax.encrypt(vector_data) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +point = PointStruct( + id=0, + vector=encrypted.ciphertext, + payload={ + "content": "Document content", + "iv": encrypted.iv, + "auth_hash": encrypted.auth_hash + } +) +qdrant.upsert(collection_name="vectax", points=[point]) -Thank you for your feedback! 🙏 +# Encrypt a query vector for secure search +# query_embedding = generate_query_embedding(...) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
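+# `generate_query_embedding` is a placeholder: use the same embedding model that
+# produced the stored document vectors. Purely as an illustration (fastembed is an
+# assumption here, not part of the Mirror SDK), it could look like:
+#
+#     from fastembed import TextEmbedding
+#     model = TextEmbedding("BAAI/bge-small-en-v1.5")
+#     query_embedding = list(model.embed(["my search query"]))[0].tolist()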
+encrypted_query = mirror_sdk.vectax.encrypt( + VectorData(vector=query_embedding, id="query") +) -On this page: +results = qdrant.query_points( + collection_name="vectax", + query=encrypted_query.ciphertext, + limit=5 +).points +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Vector Search with RBAC -× +RBAC allows fine-grained access control over encrypted vector data based on roles, groups, and departments. -[Powered by](https://qdrant.tech/) +### Defining Access Policies -<|page-146-lllmstxt|> -## single-node-speed-benchmark -# Single node benchmarks +```python +app_policy = { + "roles": ["admin", "analyst", "user"], + "groups": ["team_a", "team_b"], + "departments": ["research", "engineering"], +} +mirror_sdk.set_policy(app_policy) +``` -August 23, 2022 +### Generating Access Keys -Dataset:dbpedia-openai-1M-1536-angulardeep-image-96-angulargist-960-euclideanglove-100-angular +```python +# Generate a secret key for use by the 'admin' role holders. +admin_key = mirror_sdk.rbac.generate_user_secret_key( + {"roles": ["admin"], "groups": ["team_a"], "departments": ["research"]} +) +``` -Search threads:1001 +### Storing Encrypted Data with RBAC Policies -Plot values: +We can now store data that is only accessible to users with the "admin" role. -RPS +```python +from mirror_sdk.core.models import RBACVectorData +from mirror_sdk.utils import encode_binary_data -Latency +policy = { + "roles": ["admin"], + "groups": ["team_a"], + "departments": ["research"], +} +# vector_embedding = generate_vector_embedding(...) +vector_data = RBACVectorData( + # Generate or retrieve vector embeddings + vector=vector_embedding, + id=1, + access_policy=policy, +) +encrypted = mirror_sdk.rbac.encrypt(vector_data) + +qdrant.upsert( + collection_name="vectax", + points=[ + models.PointStruct( + id=1, + vector=encrypted.crypto.ciphertext, + payload={ + "encrypted_header": encrypted.encrypted_header, + "encrypted_vector_metadata": encode_binary_data( + encrypted.crypto.serialize() + ), + "content": "My content", + }, + ) + ], +) +``` -p95 latency +### Querying with Role-Based Decryption -Index time +Using the admin key, only accessible data will be decrypted. -| Engine | Setup | Dataset | Upload Time(m) | Upload + Index Time(m) | Latency(ms) | P95(ms) | P99(ms) | RPS | Precision | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| qdrant | qdrant-sq-rps-m-64-ef-512 | dbpedia-openai-1M-1536-angular | 3.51 | 24.43 | 3.54 | 4.95 | 8.62 | 1238.0016 | 0.99 | -| weaviate | latest-weaviate-m32 | dbpedia-openai-1M-1536-angular | 13.94 | 13.94 | 4.99 | 7.16 | 11.33 | 1142.13 | 0.97 | -| elasticsearch | elasticsearch-m-32-ef-128 | dbpedia-openai-1M-1536-angular | 19.18 | 83.72 | 22.10 | 72.53 | 135.68 | 716.80 | 0.98 | -| redis | redis-m-32-ef-256 | dbpedia-openai-1M-1536-angular | 92.49 | 92.49 | 140.65 | 160.85 | 167.35 | 625.27 | 0.97 | -| milvus | milvus-m-16-ef-128 | dbpedia-openai-1M-1536-angular | 0.27 | 1.16 | 393.31 | 441.32 | 576.65 | 219.11 | 0.99 | +```python +from mirror_sdk.core import MirrorError +from mirror_sdk.core.models import MirrorCrypto +from mirror_sdk.utils import decode_binary_data -_Download raw data: [here](https://qdrant.tech/benchmarks/results-1-100-thread-2024-06-15.json)_ +# Encrypt a query vector for secure search +# query_embedding = generate_query_embedding(...) 
-## [Anchor](https://qdrant.tech/benchmarks/single-node-speed-benchmark/\#observations) Observations +query_data = RBACVectorData(vector=query_embedding, id="query", access_policy=policy) +encrypted_query = mirror_sdk.rbac.encrypt(query_data) -Most of the engines have improved since [our last run](https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022/). Both life and software have trade-offs but some clearly do better: +results = qdrant.query_points( + collection_name="vectax", query=encrypted_query.crypto.ciphertext, limit=10 +) -- **`Qdrant` achives highest RPS and lowest latencies in almost all the scenarios, no matter the precision threshold and the metric we choose.** It has also shown 4x RPS gains on one of the datasets. -- `Elasticsearch` has become considerably fast for many cases but it’s very slow in terms of indexing time. It can be 10x slower when storing 10M+ vectors of 96 dimensions! (32mins vs 5.5 hrs) -- `Milvus` is the fastest when it comes to indexing time and maintains good precision. However, it’s not on-par with others when it comes to RPS or latency when you have higher dimension embeddings or more number of vectors. -- `Redis` is able to achieve good RPS but mostly for lower precision. It also achieved low latency with single thread, however its latency goes up quickly with more parallel requests. Part of this speed gain comes from their custom protocol. -- `Weaviate` has improved the least since our last run. +accessible_results = [] +for point in results.points: + try: + encrypted_vector_metadata = decode_binary_data( + point.payload["encrypted_vector_metadata"] + ) + mirror_data = MirrorCrypto.deserialize(encrypted_vector_metadata) + admin_decrypted = mirror_sdk.rbac.decrypt( + mirror_data, + point.payload["encrypted_header"], + admin_key, + ) + accessible_results.append( + { + "id": point.id, + "content": point.payload["content"], + "score": point.score, + "accessible": True, + } + ) -## [Anchor](https://qdrant.tech/benchmarks/single-node-speed-benchmark/\#how-to-read-the-results) How to read the results + except MirrorError as e: + print(f"Access denied for point {point.id}: {e}") -- Choose the dataset and the metric you want to check. -- Select a precision threshold that would be satisfactory for your usecase. This is important because ANN search is all about trading precision for speed. This means in any vector search benchmark, **two results must be compared only when you have similar precision**. However most benchmarks miss this critical aspect. -- The table is sorted by the value of the selected metric (RPS / Latency / p95 latency / Index time), and the first entry is always the winner of the category 🏆 +# Proceed to only use results within `accessible_results`. +``` -### [Anchor](https://qdrant.tech/benchmarks/single-node-speed-benchmark/\#latency-vs-rps) Latency vs RPS +## Further Reading -In our benchmark we test two main search usage scenarios that arise in practice. +- [Mirror Security Docs](https://docs.mirrorsecurity.io/introduction) +- [Mirror Security Blog](https://mirrorsecurity.io/blog) -- **Requests-per-Second (RPS)**: Serve more requests per second in exchange of individual requests taking longer (i.e. higher latency). This is a typical scenario for a web application, where multiple users are searching at the same time. -To simulate this scenario, we run client requests in parallel with multiple threads and measure how many requests the engine can handle per second. 
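+As a closing sanity check for the RBAC flow above, the same primitives can be used to confirm that a key scoped to a less privileged profile is denied access. A minimal sketch, reusing `mirror_sdk`, the `vectax` collection, and the admin-only point stored earlier (the "user"/"team_b"/"engineering" values are taken from the example policy; retrieving the point by id is an assumption made for illustration):

+```python
+from mirror_sdk.core import MirrorError
+from mirror_sdk.core.models import MirrorCrypto
+from mirror_sdk.utils import decode_binary_data
+
+# Key for a profile that does not satisfy the stored point's access policy
+user_key = mirror_sdk.rbac.generate_user_secret_key(
+    {"roles": ["user"], "groups": ["team_b"], "departments": ["engineering"]}
+)
+
+# Fetch the RBAC-protected point and try to decrypt its metadata with that key
+stored = qdrant.retrieve(collection_name="vectax", ids=[1], with_payload=True)[0]
+mirror_data = MirrorCrypto.deserialize(
+    decode_binary_data(stored.payload["encrypted_vector_metadata"])
+)
+
+try:
+    mirror_sdk.rbac.decrypt(mirror_data, stored.payload["encrypted_header"], user_key)
+    print(f"Unexpectedly decrypted point {stored.id} with a non-admin key")
+except MirrorError as e:
+    print(f"Access denied for point {stored.id}, as expected: {e}")
+```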
-- **Latency**: React quickly to individual requests rather than serving more requests in parallel. This is a typical scenario for applications where server response time is critical. Self-driving cars, manufacturing robots, and other real-time systems are good examples of such applications. -To simulate this scenario, we run client in a single thread and measure how long each request takes. +<|page-273-lllmstxt|> +# Vectorize.io -### [Anchor](https://qdrant.tech/benchmarks/single-node-speed-benchmark/\#tested-datasets) Tested datasets +[Vectorize](https://vectorize.io/) is a SaaS platform that automates data extraction from [several sources](https://docs.vectorize.io/integrations/source-connectors) and lets you quickly deploy real-time RAG pipelines for your unstructured data. It also includes evaluation to help figure out the best strategies for the RAG system. -Our [benchmark tool](https://github.com/qdrant/vector-db-benchmark) is inspired by [github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/). We used the following datasets to test the performance of the engines on ANN Search tasks: +Vectorize pipelines natively integrate with Qdrant by converting unstructured data into vector embeddings and storing them in a collection. When a pipeline is running, any new change in the source data is immediately processed, keeping the vector index up-to-date. -| Datasets | \# Vectors | Dimensions | Distance | -| --- | --- | --- | --- | -| [dbpedia-openai-1M-angular](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) | 1M | 1536 | cosine | -| [deep-image-96-angular](http://sites.skoltech.ru/compvision/noimi/) | 10M | 96 | cosine | -| [gist-960-euclidean](http://corpus-texmex.irisa.fr/) | 1M | 960 | euclidean | -| [glove-100-angular](https://nlp.stanford.edu/projects/glove/) | 1.2M | 100 | cosine | +## Watch the Video -### [Anchor](https://qdrant.tech/benchmarks/single-node-speed-benchmark/\#setup) Setup +

-![Benchmarks configuration](https://qdrant.tech/benchmarks/client-server.png) +## Prerequisites -Benchmarks configuration +1. A Qdrant instance to connect to. You can get a free cloud instance at [cloud.qdrant.io](https://cloud.qdrant.io/). +2. An account at [Vectorize.io](https://vectorize.io) for building those seamless pipelines. -- This was our setup for this experiment: - - Client: 8 vcpus, 16 GiB memory, 64GiB storage ( `Standard D8ls v5` on Azure Cloud) - - Server: 8 vcpus, 32 GiB memory, 64GiB storage ( `Standard D8s v3` on Azure Cloud) -- The Python client uploads data to the server, waits for all required indexes to be constructed, and then performs searches with configured number of threads. We repeat this process with different configurations for each engine, and then select the best one for a given precision. -- We ran all the engines in docker and limited their memory to 25GB. This was used to ensure fairness by avoiding the case of some engine configs being too greedy with RAM usage. This 25 GB limit is completely fair because even to serve the largest `dbpedia-openai-1M-1536-angular` dataset, one hardly needs `1M * 1536 * 4bytes * 1.5 = 8.6GB` of RAM (including vectors + index). Hence, we decided to provide all the engines with ~3x the requirement. +## Set Up -Please note that some of the configs of some engines crashed on some datasets because of the 25 GB memory limit. That’s why you might see fewer points for some engines on choosing higher precision thresholds. +- From the Vectorize dashboard, click `Vector Databases` -> `New Vector Database Integration` and select Qdrant. -Share this article +- Set up a connection using the hostname and API key of your Qdrant instance. -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fsingle-node-speed-benchmark%2F&text=Single%20node%20benchmarks "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Fsingle-node-speed-benchmark%2F "LinkedIn") + -Up! +![Vectorize connection](/documentation/platforms/vectorize/vectorize-connection.png) -<|page-147-lllmstxt|> -## overview -- [Documentation](https://qdrant.tech/documentation/) -- What is Qdrant? +- You can now select this Qdrant instance when setting up a [RAG pipeline](https://docs.vectorize.io/rag-pipelines/creating). Enter the name of the collection to use. It'll be created automatically if it doesn't exist. -# [Anchor](https://qdrant.tech/documentation/overview/\#introduction) Introduction +![Vectorize collection](/documentation/platforms/vectorize/vectorize-collection.png) -Vector databases are a relatively new way for interacting with abstract data representations -derived from opaque machine learning models such as deep learning architectures. These -representations are often called vectors or embeddings and they are a compressed version of -the data used to train a machine learning model to accomplish a task like sentiment analysis, -speech recognition, object detection, and many others. +- Select an embeddings provider. -These new databases shine in many applications like [semantic search](https://en.wikipedia.org/wiki/Semantic_search) -and [recommendation systems](https://en.wikipedia.org/wiki/Recommender_system), and here, we’ll -learn about one of the most popular and fastest growing vector databases in the market, [Qdrant](https://github.com/qdrant/qdrant). 
+![Vectorize Embeddings](/documentation/platforms/vectorize/vectorize-embeddings.png) -## [Anchor](https://qdrant.tech/documentation/overview/\#what-is-qdrant) What is Qdrant? +- Select a source from which to ingest data. -[Qdrant](https://github.com/qdrant/qdrant) “is a vector similarity search engine that provides a production-ready -service with a convenient API to store, search, and manage points (i.e. vectors) with an additional -payload.” You can think of the payloads as additional pieces of information that can help you -hone in on your search and also receive useful information that you can give to your users. +![Vectorize Sources](/documentation/platforms/vectorize/vectorize-sources.png) -You can get started using Qdrant with the Python `qdrant-client`, by pulling the latest docker -image of `qdrant` and connecting to it locally, or by trying out [Qdrant’s Cloud](https://cloud.qdrant.io/) -free tier option until you are ready to make the full switch. +Your Vectorize pipeline powered by Qdrant should now be up and ready to be scheduled and monitored. -With that out of the way, let’s talk about what are vector databases. +## Further Reading -## [Anchor](https://qdrant.tech/documentation/overview/\#what-are-vector-databases) What Are Vector Databases? +- Vectorize [Documentation](https://docs.vectorize.io) +- Vectorize [Tutorials](https://docs.vectorize.io/tutorials/). -![dbs](https://raw.githubusercontent.com/ramonpzg/mlops-sydney-2023/main/images/databases.png) +<|page-274-lllmstxt|> +# VoltAgent -Vector databases are a type of database designed to store and query high-dimensional vectors -efficiently. In traditional [OLTP](https://www.ibm.com/topics/oltp) and [OLAP](https://www.ibm.com/topics/olap) -databases (as seen in the image above), data is organized in rows and columns (and these are -called **Tables**), and queries are performed based on the values in those columns. However, -in certain applications including image recognition, natural language processing, and recommendation -systems, data is often represented as vectors in a high-dimensional space, and these vectors, plus -an id and a payload, are the elements we store in something called a **Collection** within a vector -database like Qdrant. +[VoltAgent](https://github.com/VoltAgent/voltagent) is a TypeScript-based open-source framework designed for developing AI agents that support modular tool integration, LLM coordination, and adaptable multi-agent architectures. The framework includes an integrated observability dashboard similar to n8n, enabling visual monitoring of agent operations, action tracking, and streamlined debugging capabilities. -A vector in this context is a mathematical representation of an object or data point, where elements of -the vector implicitly or explicitly correspond to specific features or attributes of the object. For example, -in an image recognition system, a vector could represent an image, with each element of the vector -representing a pixel value or a descriptor/characteristic of that pixel. In a music recommendation -system, each vector could represent a song, and elements of the vector would capture song characteristics -such as tempo, genre, lyrics, and so on. 
+## Installation -Vector databases are optimized for **storing** and **querying** these high-dimensional vectors -efficiently, and they often use specialized data structures and indexing techniques such as -Hierarchical Navigable Small World (HNSW) – which is used to implement Approximate Nearest -Neighbors – and Product Quantization, among others. These databases enable fast similarity -and semantic search while allowing users to find vectors that are the closest to a given query -vector based on some distance metric. The most commonly used distance metrics are Euclidean -Distance, Cosine Similarity, and Dot Product, and these three are fully supported Qdrant. +Create a new VoltAgent project with Qdrant integration: -Here’s a quick overview of the three: +```bash +npm create voltagent-app@latest -- --example with-qdrant +cd with-qdrant +``` -- [**Cosine Similarity**](https://en.wikipedia.org/wiki/Cosine_similarity) \- Cosine similarity -is a way to measure how similar two vectors are. To simplify, it reflects whether the vectors -have the same direction (similar) or are poles apart. Cosine similarity is often used with text representations -to compare how similar two documents or sentences are to each other. The output of cosine similarity ranges -from -1 to 1, where -1 means the two vectors are completely dissimilar, and 1 indicates maximum similarity. -- [**Dot Product**](https://en.wikipedia.org/wiki/Dot_product) \- The dot product similarity metric is another way -of measuring how similar two vectors are. Unlike cosine similarity, it also considers the length of the vectors. -This might be important when, for example, vector representations of your documents are built -based on the term (word) frequencies. The dot product similarity is calculated by multiplying the respective values -in the two vectors and then summing those products. The higher the sum, the more similar the two vectors are. -If you normalize the vectors (so the numbers in them sum up to 1), the dot product similarity will become -the cosine similarity. -- [**Euclidean Distance**](https://en.wikipedia.org/wiki/Euclidean_distance) \- Euclidean -distance is a way to measure the distance between two points in space, similar to how we -measure the distance between two places on a map. It’s calculated by finding the square root -of the sum of the squared differences between the two points’ coordinates. This distance metric -is also commonly used in machine learning to measure how similar or dissimilar two vectors are. +This command generates a fully configured project combining VoltAgent and Qdrant, including example data and two distinct agent implementation patterns. -Now that we know what vector databases are and how they are structurally different than other -databases, let’s go over why they are important. +Install the dependencies: -## [Anchor](https://qdrant.tech/documentation/overview/\#why-do-we-need-vector-databases) Why do we need Vector Databases? +```bash +npm install +``` -Vector databases play a crucial role in various applications that require similarity search, such -as recommendation systems, content-based image retrieval, and personalized search. By taking -advantage of their efficient indexing and searching techniques, vector databases enable faster -and more accurate retrieval of unstructured data already represented as vectors, which can -help put in front of users the most relevant results to their queries. 
+## Environment Setup -In addition, other benefits of using vector databases include: +Create a `.env` file with your configuration: -1. Efficient storage and indexing of high-dimensional data. -2. Ability to handle large-scale datasets with billions of data points. -3. Support for real-time analytics and queries. -4. Ability to handle vectors derived from complex data types such as images, videos, and natural language text. -5. Improved performance and reduced latency in machine learning and AI applications. -6. Reduced development and deployment time and cost compared to building a custom solution. +```env +# Qdrant URL +# docker run -p 6333:6333 qdrant/qdrant +QDRANT_URL=http://localhost:6333 -Keep in mind that the specific benefits of using a vector database may vary depending on the -use case of your organization and the features of the database you ultimately choose. +# Qdrant API key (Optional) +QDRANT_API_KEY=your-qdrant-api-key-here -Let’s now evaluate, at a high-level, the way Qdrant is architected. +# OpenAI API key for embeddings and LLM +OPENAI_API_KEY=your-openai-api-key-here +``` -## [Anchor](https://qdrant.tech/documentation/overview/\#high-level-overview-of-qdrants-architecture) High-Level Overview of Qdrant’s Architecture +Start your VoltAgent application: -![qdrant](https://raw.githubusercontent.com/ramonpzg/mlops-sydney-2023/main/images/qdrant_overview_high_level.png) +```bash +npm run dev +``` -The diagram above represents a high-level overview of some of the main components of Qdrant. Here -are the terminologies you should get familiar with. +Refer to source code of example [here](https://github.com/VoltAgent/voltagent/tree/main/examples/with-qdrant). -- [Collections](https://qdrant.tech/documentation/concepts/collections/): A collection is a named set of points (vectors with a payload) among which you can search. The vector of each point within the same collection must have the same dimensionality and be compared by a single metric. [Named vectors](https://qdrant.tech/documentation/concepts/collections/#collection-with-multiple-vectors) can be used to have multiple vectors in a single point, each of which can have their own dimensionality and metric requirements. -- [Distance Metrics](https://en.wikipedia.org/wiki/Metric_space): These are used to measure -similarities among vectors and they must be selected at the same time you are creating a -collection. The choice of metric depends on the way the vectors were obtained and, in particular, -on the neural network that will be used to encode new queries. -- [Points](https://qdrant.tech/documentation/concepts/points/): The points are the central entity that -Qdrant operates with and they consist of a vector and an optional id and payload. - - id: a unique identifier for your vectors. - - Vector: a high-dimensional representation of data, for example, an image, a sound, a document, a video, etc. - - [Payload](https://qdrant.tech/documentation/concepts/payload/): A payload is a JSON object with additional data you can add to a vector. -- [Storage](https://qdrant.tech/documentation/concepts/storage/): Qdrant can use one of two options for -storage, **In-memory** storage (Stores all vectors in RAM, has the highest speed since disk -access is required only for persistence), or **Memmap** storage, (creates a virtual address -space associated with the file on disk). -- Clients: the programming languages you can use to connect to Qdrant. 
+## How It Works -## [Anchor](https://qdrant.tech/documentation/overview/\#next-steps) Next Steps +The sections below demonstrate the construction of this example and provide guidance on adapting it to your needs. -Now that you know more about vector databases and Qdrant, you are ready to get started with one -of our tutorials. If you’ve never used a vector database, go ahead and jump straight into -the **Getting Started** section. Conversely, if you are a seasoned developer in these -technology, jump to the section most relevant to your use case. +### Create the Qdrant Retriever -As you go through the tutorials, please let us know if any questions come up in our -[Discord channel here](https://qdrant.to/discord). 😎 +Create `src/retriever/index.ts`: -##### Was this page useful? +```typescript +import { BaseRetriever, type BaseMessage, type RetrieveOptions } from "@voltagent/core"; +import { QdrantClient } from "@qdrant/js-client-rest"; -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +// Initialize Qdrant client +const qdrant = new QdrantClient({ + url: process.env.QDRANT_URL || "http://localhost:6333", + apiKey: process.env.QDRANT_API_KEY, +}); -Thank you for your feedback! 🙏 +const collectionName = "voltagent-knowledge-base"; +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/overview/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +**Key Components Explained**: -On this page: +- **Qdrant Client**: Connects to Qdrant's REST API +- **Collection**: A named container for your vectors in Qdrant +- **Open Source & Cloud**: Use locally or as a managed service -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/overview/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +### Initialize Collection and Sample Data -× +The provided example handles automatic creation and initialization of your Qdrant collection with data: -[Powered by](https://qdrant.tech/) +```typescript +async function initializeCollection() { + try { + // Check if collection exists + let exists = false; + try { + await qdrant.getCollection(collectionName); + exists = true; + console.log(`📋 Collection "${collectionName}" already exists`); + } catch (error) { + console.log(`📋 Creating new collection "${collectionName}"...`); + } -<|page-148-lllmstxt|> -## practicle-examples -- [Articles](https://qdrant.tech/articles/) -- Practical Examples - -#### Practical Examples - -Building blocks and reference implementations to help you get started with Qdrant. Learn how to use Qdrant to solve real-world problems and build the next generation of AI applications. - -[![Preview](https://qdrant.tech/articles_data/binary-quantization-openai/preview/preview.jpg)\\ -**Optimizing OpenAI Embeddings: Enhance Efficiency with Qdrant's Binary Quantization** \\ -Explore how Qdrant's Binary Quantization can significantly improve the efficiency and performance of OpenAI's Ada-003 embeddings. Learn best practices for real-time search applications.\\ -\\ -Nirant Kasliwal\\ -\\ -February 21, 2024](https://qdrant.tech/articles/binary-quantization-openai/)[![Preview](https://qdrant.tech/articles_data/food-discovery-demo/preview/preview.jpg)\\ -**Food Discovery Demo** \\ -Feeling hungry? 
Find the perfect meal with Qdrant's multimodal semantic search.\\ -\\ -Kacper Ɓukawski\\ -\\ -September 05, 2023](https://qdrant.tech/articles/food-discovery-demo/)[![Preview](https://qdrant.tech/articles_data/search-as-you-type/preview/preview.jpg)\\ -**Semantic Search As You Type** \\ -To show off Qdrant's performance, we show how to do a quick search-as-you-type that will come back within a few milliseconds.\\ -\\ -Andre Bogus\\ -\\ -August 14, 2023](https://qdrant.tech/articles/search-as-you-type/)[![Preview](https://qdrant.tech/articles_data/serverless/preview/preview.jpg)\\ -**Serverless Semantic Search** \\ -Create a serverless semantic search engine using nothing but Qdrant and free cloud services.\\ -\\ -Andre Bogus\\ -\\ -July 12, 2023](https://qdrant.tech/articles/serverless/)[![Preview](https://qdrant.tech/articles_data/chatgpt-plugin/preview/preview.jpg)\\ -**Extending ChatGPT with a Qdrant-based knowledge base** \\ -ChatGPT factuality might be improved with semantic search. Here is how.\\ -\\ -Kacper Ɓukawski\\ -\\ -March 23, 2023](https://qdrant.tech/articles/chatgpt-plugin/)[![Preview](https://qdrant.tech/articles_data/langchain-integration/preview/preview.jpg)\\ -**Using LangChain for Question Answering with Qdrant** \\ -We combined LangChain, a pre-trained LLM from OpenAI, SentenceTransformers & Qdrant to create a question answering system with just a few lines of code. Learn more!\\ -\\ -Kacper Ɓukawski\\ -\\ -January 31, 2023](https://qdrant.tech/articles/langchain-integration/)[![Preview](https://qdrant.tech/articles_data/qa-with-cohere-and-qdrant/preview/preview.jpg)\\ -**Question Answering as a Service with Cohere and Qdrant** \\ -End-to-end Question Answering system for the biomedical data with SaaS tools: Cohere co.embed API and Qdrant\\ -\\ -Kacper Ɓukawski\\ -\\ -November 29, 2022](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/)[![Preview](https://qdrant.tech/articles_data/faq-question-answering/preview/preview.jpg)\\ -**Q&A with Similarity Learning** \\ -A complete guide to building a Q&A system using Quaterion and SentenceTransformers.\\ -\\ -George Panchuk\\ -\\ -June 28, 2022](https://qdrant.tech/articles/faq-question-answering/) - -× - -[Powered by](https://qdrant.tech/) + // Create collection if it doesn't exist + if (!exists) { + await qdrant.createCollection(collectionName, { + vectors: { size: 1536, distance: "Cosine" }, + }); + console.log(`✅ Collection "${collectionName}" created successfully`); + } -<|page-149-lllmstxt|> -## filtered-search-benchmark -February 13, 2023 + // Check if we need to populate with sample data + const stats = await qdrant.count(collectionName); + if (stats.count === 0) { + console.log("📚 Populating collection with sample documents..."); + // Generate embeddings for sample documents using OpenAI + const OpenAI = await import("openai"); + const openai = new OpenAI.default({ + apiKey: process.env.OPENAI_API_KEY!, + }); + const points = []; + for (const record of sampleRecords) { + try { + const embeddingResponse = await openai.embeddings.create({ + model: "text-embedding-3-small", + input: record.payload.text, + }); + points.push({ + id: record.id, + vector: embeddingResponse.data[0].embedding, + payload: record.payload, + }); + } catch (error) { + console.error(`Error generating embedding for ${record.id}:`, error); + } + } + if (points.length > 0) { + await qdrant.upsert(collectionName, { points }); + console.log(`✅ Successfully upserted ${points.length} documents to collection`); + } + } else { + console.log(`📊 
Collection already contains ${stats.count} documents`); + } + } catch (error) { + console.error("Error initializing Qdrant collection:", error); + } +} +``` -Dataset:keyword-100range-100int-2048100-kw-small-vocabkeyword-2048geo-radius-100range-2048geo-radius-2048int-100h-and-m-2048arxiv-titles-384 +**What This Does**: -Plot values: +- Creates a Qdrant collection with cosine similarity +- Generates embeddings using OpenAI's API +- Adds the embeddings and payloads to Qdrant -Regular search +### Implement the Retriever Class -Filter search +Implement the primary retriever class for vector search functionality: -_Download raw data: [here](https://qdrant.tech/benchmarks/filter-result-2023-02-03.json)_ +```typescript +// Retriever function +async function retrieveDocuments(query: string, topK = 3) { + try { + // Generate embedding for the query + const OpenAI = await import("openai"); + const openai = new OpenAI.default({ + apiKey: process.env.OPENAI_API_KEY!, + }); + const embeddingResponse = await openai.embeddings.create({ + model: "text-embedding-3-small", + input: query, + }); + const queryVector = embeddingResponse.data[0].embedding; + // Perform search in Qdrant + const searchResults = ( + await qdrant.query(collectionName, { + query: queryVector, + limit: topK, + with_payload: true, + }) + ).points; + // Format results + return ( + searchResults.map((match: any) => ({ + content: match.payload?.text || "", + metadata: match.payload || {}, + score: match.score || 0, + id: match.id, + })) || [] + ); + } catch (error) { + console.error("Error retrieving documents from Qdrant:", error); + return []; + } +} -## [Anchor](https://qdrant.tech/benchmarks/filtered-search-benchmark/\#filtered-results) Filtered Results +/** + * Qdrant-based retriever implementation for VoltAgent + */ +export class QdrantRetriever extends BaseRetriever { + /** + * Retrieve documents from Qdrant based on semantic similarity + * @param input - The input to use for retrieval (string or BaseMessage[]) + * @param options - Configuration and context for the retrieval + * @returns Promise resolving to a formatted context string + */ + async retrieve(input: string | BaseMessage[], options: RetrieveOptions): Promise { + // Convert input to searchable string + let searchText = ""; + if (typeof input === "string") { + searchText = input; + } else if (Array.isArray(input) && input.length > 0) { + const lastMessage = input[input.length - 1]; + if (Array.isArray(lastMessage.content)) { + const textParts = lastMessage.content + .filter((part: any) => part.type === "text") + .map((part: any) => part.text); + searchText = textParts.join(" "); + } else { + searchText = lastMessage.content as string; + } + } + // Perform semantic search using Qdrant + const results = await retrieveDocuments(searchText, 3); + // Add references to userContext if available + if (options.userContext && results.length > 0) { + const references = results.map((doc: any, index: number) => ({ + id: doc.id, + title: doc.metadata.topic || `Document ${index + 1}`, + source: "Qdrant Knowledge Base", + score: doc.score, + category: doc.metadata.category, + })); + options.userContext.set("references", references); + } + // Return the concatenated content for the LLM + if (results.length === 0) { + return "No relevant documents found in the knowledge base."; + } + return results + .map( + (doc: any, index: number) => + `Document ${index + 1} (ID: ${doc.id}, Score: ${doc.score.toFixed(4)}, Category: ${doc.metadata.category}):\n${doc.content}` + ) + .join("\n\n---\n\n"); 
+ } +} -As you can see from the charts, there are three main patterns: +// Create retriever instance +export const retriever = new QdrantRetriever(); +``` -- **Speed boost** \- for some engines/queries, the filtered search is faster than the unfiltered one. It might happen if the filter is restrictive enough, to completely avoid the usage of the vector index. +### Create Your Agents -- **Speed downturn** \- some engines struggle to keep high RPS, it might be related to the requirement of building a filtering mask for the dataset, as described above. +Configure agents with various retrieval strategies in `src/index.ts`: -- **Accuracy collapse** \- some engines are loosing accuracy dramatically under some filters. It is related to the fact that the HNSW graph becomes disconnected, and the search becomes unreliable. +```typescript +import { openai } from "@ai-sdk/openai"; +import { Agent, VoltAgent } from "@voltagent/core"; +import { createPinoLogger } from "@voltagent/logger"; +import { VercelAIProvider } from "@voltagent/vercel-ai"; + +import { retriever } from "./retriever/index.js"; + +// Agent 1: Using retriever directly +const agentWithRetriever = new Agent({ + name: "Assistant with Retriever", + description: + "A helpful assistant that can retrieve information from the Qdrant knowledge base using semantic search to provide better answers. I automatically search for relevant information when needed.", + llm: new VercelAIProvider(), + model: openai("gpt-4o-mini"), + retriever: retriever, +}); +// Agent 2: Using retriever as tool +const agentWithTools = new Agent({ + name: "Assistant with Tools", + description: + "A helpful assistant that can search the Qdrant knowledge base using tools. The agent will decide when to search for information based on user questions.", + llm: new VercelAIProvider(), + model: openai("gpt-4o-mini"), + tools: [retriever.tool], +}); -Qdrant avoids all these problems and also benefits from the speed boost, as it implements an advanced [query planning strategy](https://qdrant.tech/documentation/search/#query-planning). +// Create logger +const logger = createPinoLogger({ + name: "with-qdrant", + level: "info", +}); -Share this article +new VoltAgent({ + agents: { + agentWithRetriever, + agentWithTools, + }, + logger, +}); +``` -[x](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Ffiltered-search-benchmark%2F&text= "x")[LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fbenchmarks%2Ffiltered-search-benchmark%2F "LinkedIn") +## Further Reading -Up! +- [VoltAgent Documentation](https://voltagent.dev/docs/) +- [VoltAgent Examples](https://github.com/VoltAgent/voltagent/tree/main/examples) +- [VoltAgent Qdrant Official Docs](https://voltagent.dev/docs/rag/qdrant/) -<|page-150-lllmstxt|> -## database-optimization -- [Documentation](https://qdrant.tech/documentation/) -- [Faq](https://qdrant.tech/documentation/faq/) -- Database Optimization +<|page-275-lllmstxt|> +# Voyage AI -# [Anchor](https://qdrant.tech/documentation/faq/database-optimization/\#frequently-asked-questions-database-optimization) Frequently Asked Questions: Database Optimization +Qdrant supports working with [Voyage AI](https://voyageai.com/) embeddings. The supported models' list can be found [here](https://docs.voyageai.com/docs/embeddings). -### [Anchor](https://qdrant.tech/documentation/faq/database-optimization/\#how-do-i-reduce-memory-usage) How do I reduce memory usage? 
+You can generate an API key from the [Voyage AI dashboard]() to authenticate the requests. -The primary source of memory usage is vector data. There are several ways to address that: +### Setting up the Qdrant and Voyage clients -- Configure [Quantization](https://qdrant.tech/documentation/guides/quantization/) to reduce the memory usage of vectors. -- Configure on-disk vector storage +```python +from qdrant_client import QdrantClient +import voyageai -The choice of the approach depends on your requirements. -Read more about [configuring the optimal](https://qdrant.tech/documentation/tutorials/optimize/) use of Qdrant. +VOYAGE_API_KEY = "" -### [Anchor](https://qdrant.tech/documentation/faq/database-optimization/\#how-do-you-choose-the-machine-configuration) How do you choose the machine configuration? +qclient = QdrantClient(":memory:") +vclient = voyageai.Client(api_key=VOYAGE_API_KEY) -There are two main scenarios of Qdrant usage in terms of resource consumption: +texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -- **Performance-optimized** – when you need to serve vector search as fast (many) as possible. In this case, you need to have as much vector data in RAM as possible. Use our [calculator](https://cloud.qdrant.io/calculator) to estimate the required RAM. -- **Storage-optimized** – when you need to store many vectors and minimize costs by compromising some search speed. In this case, pay attention to the disk speed instead. More about it in the article about [Memory Consumption](https://qdrant.tech/articles/memory-consumption/). +```typescript +import {QdrantClient} from '@qdrant/js-client-rest'; -### [Anchor](https://qdrant.tech/documentation/faq/database-optimization/\#i-configured-on-disk-vector-storage-but-memory-usage-is-still-high-why) I configured on-disk vector storage, but memory usage is still high. Why? +const VOYAGEAI_BASE_URL = "https://api.voyageai.com/v1/embeddings" +const VOYAGEAI_API_KEY = "" -Firstly, memory usage metrics as reported by `top` or `htop` may be misleading. They are not showing the minimal amount of memory required to run the service. -If the RSS memory usage is 10 GB, it doesn’t mean that it won’t work on a machine with 8 GB of RAM. +const client = new QdrantClient({ url: 'http://localhost:6333' }); -Qdrant uses many techniques to reduce search latency, including caching disk data in RAM and preloading data from disk to RAM. -As a result, the Qdrant process might use more memory than the minimum required to run the service. +const headers = { + "Authorization": "Bearer " + VOYAGEAI_API_KEY, + "Content-Type": "application/json" +} -> Unused RAM is wasted RAM +const texts = [ + "Qdrant is the best vector search engine!", + "Loved by Enterprises and everyone building for low latency, high performance, and scale.", +] +``` -If you want to limit the memory usage of the service, we recommend using [limits in Docker](https://docs.docker.com/config/containers/resource_constraints/#memory) or Kubernetes. +The following example shows how to embed documents with the [`voyage-large-2`](https://docs.voyageai.com/docs/embeddings#model-choices) model that generates sentence embeddings of size 1536. -### [Anchor](https://qdrant.tech/documentation/faq/database-optimization/\#my-requests-are-very-slow-or-time-out-what-should-i-do) My requests are very slow or time out. What should I do? 
+### Embedding documents -There are several possible reasons for that: +```python +response = vclient.embed(texts, model="voyage-large-2", input_type="document") +``` -- **Using filters without payload index** – If you’re performing a search with a filter but you don’t have a payload index, Qdrant will have to load whole payload data from disk to check the filtering condition. Ensure you have adequately configured [payload indexes](https://qdrant.tech/documentation/concepts/indexing/#payload-index). -- **Usage of on-disk vector storage with slow disks** – If you’re using on-disk vector storage, ensure you have fast enough disks. We recommend using local SSDs with at least 50k IOPS. Read more about the influence of the disk speed on the search latency in the article about [Memory Consumption](https://qdrant.tech/articles/memory-consumption/). -- **Large limit or non-optimal query parameters** – A large limit or offset might lead to significant performance degradation. Please pay close attention to the query/collection parameters that significantly diverge from the defaults. They might be the reason for the performance issues. +```typescript +let body = { + "input": texts, + "model": "voyage-large-2", + "input_type": "document", +} -##### Was this page useful? +let response = await fetch(VOYAGEAI_BASE_URL, { + method: "POST", + body: JSON.stringify(body), + headers +}); -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +let response_body = await response.json(); +``` -Thank you for your feedback! 🙏 +### Converting the model outputs to Qdrant points -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/faq/database-optimization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +```python +from qdrant_client.models import PointStruct -On this page: +points = [ + PointStruct( + id=idx, + vector=embedding, + payload={"text": text}, + ) + for idx, (embedding, text) in enumerate(zip(response.embeddings, texts)) +] +``` -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/faq/database-optimization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```typescript +let points = response_body.data.map((data, i) => { + return { + id: i, + vector: data.embedding, + payload: { + text: texts[i] + } + } +}); +``` -× +### Creating a collection to insert the documents -[Powered by](https://qdrant.tech/) +```python +from qdrant_client.models import VectorParams, Distance -<|page-151-lllmstxt|> -## cloud-getting-started -- [Documentation](https://qdrant.tech/documentation/) -- Getting Started +COLLECTION_NAME = "example_collection" -# [Anchor](https://qdrant.tech/documentation/cloud-getting-started/\#getting-started-with-qdrant-managed-cloud) Getting Started with Qdrant Managed Cloud +qclient.create_collection( + COLLECTION_NAME, + vectors_config=VectorParams( + size=1536, + distance=Distance.COSINE, + ), +) +qclient.upsert(COLLECTION_NAME, points) +``` -Welcome to Qdrant Managed Cloud! This document contains all the information you need to get started. 
+```typescript +const COLLECTION_NAME = "example_collection" -## [Anchor](https://qdrant.tech/documentation/cloud-getting-started/\#prerequisites) Prerequisites +await client.createCollection(COLLECTION_NAME, { + vectors: { + size: 1536, + distance: 'Cosine', + } +}); -Before creating a cluster, make sure you have a Qdrant Cloud account. Detailed instructions for signing up can be found in the [Qdrant Cloud Setup](https://qdrant.tech/documentation/cloud/qdrant-cloud-setup/) guide. You also need to provide [payment details](https://qdrant.tech/documentation/cloud/pricing-payments/). If you have a custom payment agreement, first create your account, then [contact our Support Team](https://support.qdrant.io/) to finalize the setup. +await client.upsert(COLLECTION_NAME, { + wait: true, + points +}); +``` -Premium Plan subscribers can enable single sign-on (SSO) for their organizations. To activate SSO, please reach out to the Support Team at [https://support.qdrant.io/](https://support.qdrant.io/) for guidance. +### Searching for documents with Qdrant -## [Anchor](https://qdrant.tech/documentation/cloud-getting-started/\#cluster-sizing) Cluster Sizing +Once the documents are added, you can search for the most relevant documents. -Before deploying any cluster, consider the resources needed for your specific workload. Our [Capacity Planning guide](https://qdrant.tech/documentation/guides/capacity-planning/) describes how to assess the required CPU, memory, and storage. Additionally, the [Pricing Calculator](https://cloud.qdrant.io/calculator) helps you estimate associated costs based on your projected usage. +```python +response = vclient.embed( + ["What is the best to use for vector search scaling?"], + model="voyage-large-2", + input_type="query", +) -## [Anchor](https://qdrant.tech/documentation/cloud-getting-started/\#creating-and-managing-clusters) Creating and Managing Clusters +qclient.search( + collection_name=COLLECTION_NAME, + query_vector=response.embeddings[0], +) +``` -After setting up your account, you can create a Qdrant Cluster by following the steps in [Create a Cluster](https://qdrant.tech/documentation/cloud/create-cluster/). +```typescript +body = { + "input": ["What is the best to use for vector search scaling?"], + "model": "voyage-large-2", + "input_type": "query", +}; -## [Anchor](https://qdrant.tech/documentation/cloud-getting-started/\#preparing-for-production) Preparing for Production +response = await fetch(VOYAGEAI_BASE_URL, { + method: "POST", + body: JSON.stringify(body), + headers +}); -For a production-ready environment, consider deploying a multi-node Qdrant cluster (at least three nodes) with replication enabled. Instructions for configuring distributed clusters are available in the [Distributed Deployment](https://qdrant.tech/documentation/guides/distributed_deployment/) guide. +response_body = await response.json(); -If you are looking to optimize costs, you can reduce memory usage through [Quantization](https://qdrant.tech/documentation/guides/quantization/) or by [offloading vectors to disk](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage). +await client.search(COLLECTION_NAME, { + vector: response_body.data[0].embedding, +}); +``` -##### Was this page useful? +<|page-276-lllmstxt|> +We’re excited to share that Qdrant and [Vultr](https://www.vultr.com/) are partnering to provide seamless scalability and performance for vector search workloads. 
With Vultr’s global footprint and customizable platform, deploying vector search workloads becomes incredibly flexible. Qdrant’s new [Qdrant Hybrid Cloud](/hybrid-cloud/) offering and its Kubernetes-native design, coupled with Vultr’s straightforward virtual machine provisioning, allows for simple setup when prototyping and building next-gen AI apps. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +#### Adapting to Diverse AI Development Needs with Customization and Deployment Flexibility -Thank you for your feedback! 🙏 +In the fast-paced world of AI and ML, businesses are eagerly integrating AI and generative AI to enhance their products with new features like AI assistants, develop new innovative solutions, and streamline internal workflows with AI-driven processes. Given the diverse needs of these applications, it’s clear that a one-size-fits-all approach doesn’t apply to AI development. This variability in requirements underscores the need for adaptable and customizable development environments. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-getting-started.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Recognizing this, Qdrant and Vultr have teamed up to offer developers unprecedented flexibility and control. The collaboration enables the deployment of a fully managed vector database on Vultr’s adaptable platform, catering to the specific needs of diverse AI projects. This unique setup offers developers the ideal Vultr environment for their vector search workloads. It ensures seamless adaptability and data privacy with all data residing in their environment. For the first time, Qdrant Hybrid Cloud allows for fully managing a vector database on Vultr, promoting rapid development cycles without the hassle of modifying existing setups and ensuring that data remains secure within the organization. Moreover, this partnership empowers developers with centralized management over their vector database clusters via Qdrant’s control plane, enabling precise size adjustments based on workload demands. This joint setup marks a significant step in providing the AI and ML field with flexible, secure, and efficient application development tools. -On this page: +> *“Our collaboration with Qdrant empowers developers to unlock the potential of vector search applications, such as RAG, by deploying Qdrant Hybrid Cloud with its high-performance search capabilities directly on Vultr’s global, automated cloud infrastructure. This partnership creates a highly scalable and customizable platform, uniquely designed for deploying and managing AI workloads with unparalleled efficiency.”* Kevin Cochrane, Vultr CMO. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-getting-started.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +#### The Benefits of Deploying Qdrant Hybrid Cloud on Vultr -× +Together, Qdrant Hybrid Cloud and Vultr offer enhanced AI and ML development with streamlined benefits: -[Powered by](https://qdrant.tech/) +- **Simple and Flexible Deployment:** Deploy Qdrant Hybrid Cloud on Vultr in a few minutes with a simple “one-click” installation by adding your Vultr environment as a Hybrid Cloud Environment to Qdrant.
-<|page-152-lllmstxt|> -## dedicated-vector-search -- [Articles](https://qdrant.tech/articles/) -- Built for Vector Search +- **Scalability and Customizability**: Qdrant’s efficient data handling and Vultr’s scalable infrastructure means projects can be adjusted dynamically to workload demands, optimizing costs without compromising performance or capabilities. -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +- **Unified AI Stack Management:** Seamlessly manage the entire lifecycle of AI applications, from vector search with Qdrant Hybrid Cloud to deployment and scaling with the Vultr platform and its AI and ML solutions, all within a single, integrated environment. This setup simplifies workflows, reduces complexity, accelerates development cycles, and simplifies the integration with other elements of the AI stack like model development, finetuning, or inference and training. -# Built for Vector Search +- **Global Reach, Local Execution**: With Vultr's worldwide infrastructure and Qdrant's fast vector search, deploy AI solutions globally while ensuring low latency and compliance with local data regulations, enhancing user satisfaction. -Evgeniya Sukhodolskaya & Andrey Vasnetsov +#### Getting Started with Qdrant Hybrid Cloud and Vultr -· +We've compiled an in-depth guide for leveraging Qdrant Hybrid Cloud on Vultr to kick off your journey into building cutting-edge AI solutions. For further insights into the deployment process, refer to our comprehensive documentation. -February 17, 2025 +![hybrid-cloud-vultr-tutorial](/blog/hybrid-cloud-vultr/hybrid-cloud-vultr-tutorial.png) -![Built for Vector Search](https://qdrant.tech/articles_data/dedicated-vector-search/preview/title.jpg) +#### Tutorial: Crafting a Personalized AI Assistant with RAG -Any problem with even a bit of complexity requires a specialized solution. You can use a Swiss Army knife to open a bottle or poke a hole in a cardboard box, but you will need an axe to chop wood — the same goes for software. +This tutorial outlines creating a personalized AI assistant using Qdrant Hybrid Cloud on Vultr, incorporating advanced vector search to power dynamic, interactive experiences. We will develop a RAG pipeline powered by DSPy and detail how to maintain data privacy within your Vultr environment. -In this article, we will describe the unique challenges vector search poses and why a dedicated solution is the best way to tackle them. +[Try the Tutorial](/documentation/tutorials/rag-chatbot-vultr-dspy-ollama/) -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vectors) Vectors +#### Documentation: Effortless Deployment with Qdrant -![vectors](https://qdrant.tech/articles_data/dedicated-vector-search/image1.jpg) +Our Kubernetes-native framework simplifies the deployment of Qdrant Hybrid Cloud on Vultr, enabling you to get started in just a few straightforward steps. Dive into our documentation to learn more. -Let’s look at the central concept of vector databases — [**vectors**](https://qdrant.tech/documentation/concepts/vectors/). +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Vectors (also known as embeddings) are high-dimensional representations of various data points — texts, images, videos, etc. Many state-of-the-art (SOTA) embedding models generate representations of over 1,500 dimensions. 
When it comes to state-of-the-art PDF retrieval, the representations can reach [**over 100,000 dimensions per page**](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/). +#### Ready to Get Started? -This brings us to the first challenge of vector search — vectors are heavy. +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vectors-are-heavy) Vectors are Heavy +<|page-277-lllmstxt|> +Qdrant and [STACKIT](https://www.stackit.de/en/) are thrilled to announce that developers are now able to deploy a fully managed vector database to their STACKIT environment with the introduction of [Qdrant Hybrid Cloud](/hybrid-cloud/). This is a great step forward for the German AI ecosystem as it enables developers and businesses to build cutting edge AI applications that run on German data centers with full control over their data. -To put this in perspective, consider one million records stored in a relational database. It’s a relatively small amount of data for modern databases, which a free tier of many cloud providers could easily handle. +Vector databases are an essential component of the modern AI stack. They enable rapid and accurate retrieval of high-dimensional data, crucial for powering search, recommendation systems, and augmenting machine learning models. In the rising field of GenAI, vector databases power retrieval-augmented-generation (RAG) scenarios as they are able to enhance the output of large language models (LLMs) by injecting relevant contextual information. However, this contextual information is often rooted in confidential internal or customer-related information, which is why enterprises are in pursuit of solutions that allow them to make this data available for their AI applications without compromising data privacy, losing data control, or letting data exit the company's secure environment. -Now, generate a 1536-dimensional embedding with OpenAI’s `text-embedding-ada-002` model from each record, and you are looking at around **6GB of storage**. As a result, vector search workloads, especially if not optimized, will quickly dominate the main use cases of a non-vector database. +Qdrant Hybrid Cloud is the first managed vector database that can be deployed in an existing STACKIT environment. The Kubernetes-native setup allows businesses to operate a fully managed vector database, while maintaining control over their data through complete data isolation. Qdrant Hybrid Cloud's managed service seamlessly integrates into STACKIT's cloud environment, allowing businesses to deploy fully managed vector search workloads, secure in the knowledge that their operations are backed by the stringent data protection standards of Germany's data centers and in full compliance with GDPR. This setup not only ensures that data remains under the businesses control but also paves the way for secure, AI-driven application development. -Having vectors as a part of a main database is a potential issue for another reason — vectors are always a transformation of other data. 
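As a rough sanity check of the ~6GB figure above, here is a minimal back-of-the-envelope sketch. It assumes float32 values and ignores index structures and payload overhead, so real deployments will need somewhat more than this.

```python
# Rough storage estimate for one million 1536-dimensional embeddings,
# assuming float32 values and ignoring index and payload overhead.
num_records = 1_000_000
dims = 1536            # e.g. OpenAI text-embedding-ada-002
bytes_per_value = 4    # float32

total_bytes = num_records * dims * bytes_per_value
print(f"{total_bytes / 10**9:.1f} GB")  # ~6.1 GB of raw vector data
```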
+#### Key Features and Benefits of Qdrant on STACKIT: -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vectors-are-a-transformation) Vectors are a Transformation +- **Seamless Integration and Deployment**: With Qdrant’s Kubernetes-native design, businesses can effortlessly connect their STACKIT cloud as a Hybrid Cloud Environment, enabling a one-step, scalable Qdrant deployment. -Vectors are obtained from some other source-of-truth data. They can be restored if lost with the same embedding model previously used. At the same time, even small changes in that model can shift the geometry of the vector space, so if you update or change the embedding model, you need to update and reindex all the data to maintain accurate vector comparisons. +- **Enhanced Data Privacy**: Leveraging STACKIT's German data centers ensures that all data processing complies with GDPR and other relevant European data protection standards, providing businesses with unparalleled control over their data. -If coupled with the main database, this update process can lead to significant complications and even unavailability of the whole system. +- **Scalable and Managed AI Solutions**: Deploying Qdrant on STACKIT provides a fully managed vector search engine with the ability to scale vertically and horizontally, with robust support for zero-downtime upgrades and disaster recovery, all within STACKIT's secure infrastructure. -However, vectors have positive properties as well. One of the most important is that vectors are fixed-size. +#### Use Case: AI-enabled Contract Management built with Qdrant Hybrid Cloud, STACKIT, and Aleph Alpha -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vectors-are-fixed-size) Vectors are Fixed-Size +![hybrid-cloud-stackit-tutorial](/blog/hybrid-cloud-stackit/hybrid-cloud-stackit-tutorial.png) -Embedding models are designed to produce vectors of a fixed size. We have to use it to our advantage. +To demonstrate the power of Qdrant Hybrid Cloud on STACKIT, we’ve developed a comprehensive tutorial showcasing how to build secure, AI-driven applications focusing on data sovereignty. This tutorial specifically shows how to build a contract management platform that enables users to upload documents (PDF or DOCx), which are then segmented for searchable access. Designed with multitenancy, users can only access their team or organization's documents. It also features custom sharding for location-specific document storage. Beyond search, the application offers rephrasing of document excerpts for clarity to those without context. -For fast search, vectors need to be instantly accessible. Whether in [**RAM or disk**](https://qdrant.tech/documentation/concepts/storage/), vectors should be stored in a format that allows quick access and comparison. This is essential, as vector comparison is a very hot operation in vector search workloads. It is often performed thousands of times per search query, so even a small overhead can lead to a significant slowdown. +[Try the Tutorial](/documentation/tutorials/rag-contract-management-stackit-aleph-alpha/) -For dedicated storage, vectors’ fixed size comes as a blessing. Knowing how much space one data point needs, we don’t have to deal with the usual overhead of locating data — the location of elements in storage is straightforward to calculate. +#### Start Using Qdrant with STACKIT -Everything becomes far less intuitive if vectors are stored together with other data types, for example, texts or JSONs. 
The size of a single data point is not fixed anymore, so accessing it becomes non-trivial, especially if data is added, updated, and deleted over time. +Deploying Qdrant Hybrid Cloud on STACKIT is straightforward, thanks to the seamless integration facilitated by Kubernetes. Here are the steps to kickstart your journey: -![Fixed size columns VS Variable length table](https://qdrant.tech/articles_data/dedicated-vector-search/dedicated_storage.png) +1. **Qdrant Hybrid Cloud Activation**: Start by activating ‘Hybrid Cloud’ in your [Qdrant Cloud account](https://cloud.qdrant.io/login). -Fixed size columns VS Variable length table +2. **Cluster Integration**: Add your STACKIT Kubernetes clusters as a Hybrid Cloud Environment in the Hybrid Cloud section. -**Storing vectors together with other types of data, we lose all the benefits of their characteristics**; however, we fully “enjoy” their drawbacks, polluting the storage with an extremely heavy transformation of data already existing in that storage. +3. **Effortless Deployment**: Use the Qdrant Management Console to effortlessly create and manage your Qdrant clusters on STACKIT. -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vector-search) Vector Search +We invite you to explore the detailed documentation on deploying Qdrant on STACKIT, designed to guide you through each step of the process seamlessly. -![vector-search](https://qdrant.tech/articles_data/dedicated-vector-search/image2.jpg) +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Unlike traditional databases that serve as data stores, **vector databases are more like search engines**. They are designed to be **scalable**, always **available**, and capable of delivering high-speed search results even under heavy loads. Just as Google or Bing can handle billions of queries at once, vector databases are designed for scenarios where rapid, high-throughput, low-latency retrieval is a must. +#### Ready to Get Started? -![Database Compass](https://qdrant.tech/articles_data/dedicated-vector-search/compass.png) +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -Database Compass +<|page-278-lllmstxt|> +In a move to empower the next wave of AI innovation, Qdrant and [Scaleway](https://www.scaleway.com/en/) collaborate to introduce [Qdrant Hybrid Cloud](/hybrid-cloud/), a fully managed vector database that can be deployed on existing Scaleway environments. This collaboration is set to democratize access to advanced AI capabilities, enabling developers to easily deploy and scale vector search technologies within Scaleway's robust and developer-friendly cloud infrastructure. By focusing on the unique needs of startups and the developer community, Qdrant and Scaleway are providing access to intuitive and easy to use tools, making cutting-edge AI more accessible than ever before. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#pick-any-two) Pick Any Two +Building on this vision, the integration between Scaleway and Qdrant Hybrid Cloud leverages the strengths of both Qdrant, with its leading open-source vector database, and Scaleway, known for its innovative and scalable cloud solutions. 
This integration means startups and developers can now harness the power of vector search - essential for AI applications like recommendation systems, image recognition, and natural language processing - within their existing environment without the complexity of maintaining such advanced setups. -Distributed systems are perfect for scalability — horizontal scaling in these systems allows you to add more machines as needed. In the world of distributed systems, one well-known principle — the **CAP theorem** — illustrates that you cannot have it all. The theorem states that a distributed system can guarantee only two out of three properties: **Consistency**, **Availability**, and **Partition Tolerance**. +*"With our partnership with Qdrant, Scaleway reinforces its status as Europe's leading cloud provider for AI innovation. The integration of Qdrant's fast and accurate vector database enriches our expanding suite of AI solutions. This means you can build smarter, faster AI projects with us, worry-free about performance and security." FrĂ©dĂ©ric BARDOLLE, Lead PM AI @ Scaleway* -As network partitions are inevitable in any real-world distributed system, all modern distributed databases are designed with partition tolerance in mind, forcing a trade-off between **consistency** (providing the most up-to-date data) and **availability** (remaining responsive). +#### Developing a Retrieval Augmented Generation (RAG) Application with Qdrant Hybrid Cloud, Scaleway, and LangChain -There are two main design philosophies for databases in this context: +Retrieval Augmented Generation (RAG) enhances Large Language Models (LLMs) by integrating vector search to provide precise, context-rich responses. This combination allows LLMs to access and incorporate specific data in real-time, vastly improving the quality of AI-generated content. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#acid-prioritizing-consistency) ACID: Prioritizing Consistency +RAG applications often rely on sensitive or proprietary internal data, emphasizing the importance of data sovereignty. Running the entire stack within your own environment becomes crucial for maintaining control over this data. Qdrant Hybrid Cloud deployed on Scaleway addresses this need perfectly, offering a secure, scalable platform that respects data sovereignty requirements while leveraging the full potential of RAG for sophisticated AI solutions. -The ACID model ensures that every transaction (a group of operations treated as a single unit, such as transferring money between accounts) is executed fully or not at all (reverted), leaving the database in a valid state. When a system is distributed, achieving ACID properties requires complex coordination between nodes. Each node must communicate and agree on the state of a transaction, which can **limit system availability** — if a node is uncertain about the state of another, it may refuse to process a transaction until consistency is assured. This coordination also makes **scaling more challenging**. +![hybrid-cloud-scaleway-tutorial](/blog/hybrid-cloud-scaleway/hybrid-cloud-scaleway-tutorial.png) -Financial institutions use ACID-compliant databases when dealing with money transfers, where even a momentary discrepancy in an account balance is unacceptable. +We created a tutorial that guides you through setting up and leveraging Qdrant Hybrid Cloud on Scaleway for a RAG application, providing insights into efficiently managing data within a secure, sovereign framework. 
It highlights practical steps to integrate vector search with LLMs, optimizing the generation of high-quality, relevant AI content, while ensuring data sovereignty is maintained throughout. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#base-prioritizing-availability) BASE: Prioritizing Availability +[Try the Tutorial](/documentation/tutorials/rag-chatbot-scaleway/) -On the other hand, the BASE model favors high availability and partition tolerance. BASE systems distribute data and workload across multiple nodes, enabling them to respond to read and write requests immediately. They operate under the principle of **eventual consistency** — although data may be temporarily out-of-date, the system will converge on a consistent state given time. +#### The Benefits of Running Qdrant Hybrid Cloud on Scaleway -Social media platforms, streaming services, and search engines all benefit from the BASE approach. For these applications, having immediate responsiveness is more critical than strict consistency. +Choosing Qdrant Hybrid Cloud and Scaleway for AI applications offers several key advantages: -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#based-vector-search) BASEd Vector Search +- **AI-Focused Resources:** Scaleway aims to be the cloud provider of choice for AI companies, offering the resources and infrastructure to power complex AI and machine learning workloads, helping to advance the development and deployment of AI technologies. This, paired with Qdrant Hybrid Cloud, provides a strong foundational platform for advanced AI applications. -Considering the specifics of vector search — its nature demanding availability & scalability — it should be served on BASE-oriented architecture. This choice is made due to the need for horizontal scaling, high availability, low latency, and high throughput. For example, having BASE-focused architecture allows us to [**easily manage resharding**](https://qdrant.tech/documentation/cloud/cluster-scaling/#resharding). +- **Scalable Vector Search:** Qdrant Hybrid Cloud provides a fully managed vector database that allows you to scale the setup effortlessly through vertical or horizontal scaling. Deployed on Scaleway, this is a robust setup that is designed to meet the needs of businesses at every stage of growth, from startups to large enterprises, ensuring a full spectrum of solutions for various projects and workloads. -A strictly consistent transactional approach also loses its attractiveness when we remember that vectors are heavy transformations of data at our disposal — what’s the point in limiting data protection mechanisms if we can always restore vectorized data through a transformation? +- **European Roots and Focus**: With a strong presence in Europe and a commitment to supporting the European tech ecosystem, Scaleway is ideally positioned to partner with European-based companies like Qdrant, providing local expertise and infrastructure that aligns with European regulatory standards. -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vector-index) Vector Index +- **Sustainability Commitment**: Scaleway leads with an eco-conscious approach, featuring adiabatic data centers that significantly reduce cooling costs and environmental impact. Scaleway prioritizes extending hardware lifecycle beyond industry norms to lessen its ecological footprint.
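To make the retrieval-then-generate flow described above more concrete, here is a minimal, illustrative sketch in plain Python. It is not taken from the tutorial (which uses LangChain); the collection name, the `text` payload field, and the model names are assumptions chosen purely for illustration.

```python
from openai import OpenAI
from qdrant_client import QdrantClient

llm = OpenAI()                                      # any OpenAI-compatible endpoint
qdrant = QdrantClient(url="http://localhost:6333")  # or your Hybrid Cloud cluster URL

question = "How do I scale a Qdrant cluster?"

# 1. Embed the question (model name is an assumption).
query_vector = llm.embeddings.create(
    model="text-embedding-3-small", input=question
).data[0].embedding

# 2. Retrieve the most relevant chunks from Qdrant.
hits = qdrant.query_points(collection_name="docs", query=query_vector, limit=3).points
context = "\n".join(hit.payload["text"] for hit in hits)

# 3. Let the LLM answer using only the retrieved context.
answer = llm.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}],
)
print(answer.choices[0].message.content)
```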
-![vector-index](https://qdrant.tech/articles_data/dedicated-vector-search/image3.jpg) +#### Get Started in a Few Seconds -[**Vector search**](https://qdrant.tech/documentation/concepts/search/) relies on high-dimensional vector mathematics, making it computationally heavy at scale. A brute-force similarity search would require comparing a query against every vector in the database. In a database with 100 million 1536-dimensional vectors, performing 100 million comparisons per one query is unfeasible for production scenarios. Instead of a brute-force approach, vector databases have specialized approximate nearest neighbour (ANN) indexes that balance search precision and speed. These indexes require carefully designed architectures to make their maintenance in production feasible. +Setting up Qdrant Hybrid Cloud on Scaleway is streamlined and quick, thanks to its Kubernetes-native architecture. Follow these simple three steps to launch: -![HNSW Index](https://qdrant.tech/articles_data/dedicated-vector-search/hnsw.png) +1. **Activate Hybrid Cloud**: First, log into your [Qdrant Cloud account](https://cloud.qdrant.io/login) and select ‘Hybrid Cloud’ to activate. -HNSW Index +2. **Integrate Your Clusters**: Navigate to the Hybrid Cloud settings and add your Scaleway Kubernetes clusters as a Hybrid Cloud Environment. -One of the most popular vector indexes is **HNSW (Hierarchical Navigable Small World)**, which we picked for its capability to provide simultaneously high search speed and accuracy. High performance came with a cost — implementing it in production is untrivial due to several challenges, so to make it shine all the system’s architecture has to be structured around it, serving the capricious index. +3. **Simplified Management**: Use the Qdrant Management Console for easy creation and oversight of your Qdrant clusters on Scaleway. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#index-complexity) Index Complexity +For more comprehensive guidance, our documentation provides step-by-step instructions for deploying Qdrant on Scaleway. -[**HNSW**](https://qdrant.tech/documentation/concepts/indexing/) is structured as a multi-layered graph. With a new data point inserted, the algorithm must compare it to existing nodes across several layers to index it. As the number of vectors grows, these comparisons will noticeably slow down the construction process, making updates increasingly time-consuming. The indexing operation can quickly become the bottleneck in the system, slowing down search requests. +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Building an HNSW monolith means limiting the scalability of your solution — its size has to be capped, as its construction time scales **non-linearly** with the number of elements. To keep the construction process feasible and ensure it doesn’t affect the search time, we came up with a layered architecture that breaks down all data management into small units called **segments**. +#### Ready to Get Started? -![Storage structure](https://qdrant.tech/articles_data/dedicated-vector-search/segments.png) +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). 
-Storage structure +<|page-279-lllmstxt|> +We’re excited about our collaboration with Red Hat to bring the Qdrant vector database to [Red Hat OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift) customers! With the release of [Qdrant Hybrid Cloud](/hybrid-cloud/), developers can now deploy and run the Qdrant vector database directly in their Red Hat OpenShift environment. This collaboration enables developers to scale more seamlessly, operate more consistently across hybrid cloud environments, and maintain complete control over their vector data. This is a big step forward in simplifying AI infrastructure and empowering data-driven projects, like retrieval augmented generation (RAG) use cases, advanced search scenarios, or recommendations systems. -Each segment isolates a subset of vectorized corpora and supports all collection-level operations on it, from searching to indexing, for example segments build their own index on the subset of data available to them. For users working on a collection level, the specifics of segmentation are unnoticeable. The search results they get span the whole collection, as sub-results are gathered from segments and then merged & deduplicated. +In the rapidly evolving field of Artificial Intelligence and Machine Learning, the demand for being able to manage the modern AI stack within the existing infrastructure becomes increasingly relevant for businesses. As enterprises are launching new AI applications and use cases into production, they require the ability to maintain complete control over their data, since these new apps often work with sensitive internal and customer-centric data that needs to remain within the owned premises. This is why enterprises are increasingly looking for maximum deployment flexibility for their AI workloads. -By balancing between size and number of segments, we can ensure the right balance between search speed and indexing time, making the system flexible for different workloads. +>*“Red Hat is committed to driving transparency, flexibility and choice for organizations to more easily unlock the power of AI. By working with partners like Qdrant to enable streamlined integration experiences on Red Hat OpenShift for AI use cases, organizations can more effectively harness critical data and deliver real business outcomes,”* said Steven Huels, Vice President and General Manager, AI Business Unit, Red Hat. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#immutability) Immutability +#### The Synergy of Qdrant Hybrid Cloud and Red Hat OpenShift -With index maintenance divided between segments, Qdrant can ensure high performance even during heavy load, and additional optimizations secure that further. These optimizations come from an idea that working with immutable structures introduces plenty of benefits: the possibility of using internally fixed sized lists (so no dynamic updates), ordering stored data accordingly to access patterns (so no unpredictable random accesses). With this in mind, to optimize search speed and memory management further, we use a strategy that combines and manages [**mutable and immutable segments**](https://qdrant.tech/articles/immutable-data-structures/). +Qdrant Hybrid Cloud is the first vector database that can be deployed anywhere, with complete database isolation, while still providing a fully managed cluster management. 
Running Qdrant Hybrid Cloud on Red Hat OpenShift allows enterprises to deploy and run a fully managed vector database in their own environment, ultimately allowing businesses to run managed vector search on their existing cloud and infrastructure environments, with full data sovereignty. -| | | -| --- | --- | -| **Mutable Segments** | These are used for quickly ingesting new data and handling changes (updates) to existing data. | -| **Immutable Segments** | Once a mutable segment reaches a certain size, an optimization process converts it into an immutable segment, constructing an HNSW index – you could [**read about these optimizers here**](https://qdrant.tech/documentation/concepts/optimizer/#optimizer) in detail. This immutability trick allowed us, for example, to ensure effective [**tenant isolation**](https://qdrant.tech/documentation/concepts/indexing/#tenant-index). | +Red Hat OpenShift, the industry’s leading hybrid cloud application platform powered by Kubernetes, helps streamline the deployment of Qdrant Hybrid Cloud within an enterprise's secure premises. Red Hat OpenShift provides features like auto-scaling, load balancing, and advanced security controls that can help you manage and maintain your vector database deployments more effectively. In addition, Red Hat OpenShift supports deployment across multiple environments, including on-premises, public, private and hybrid cloud landscapes. This flexibility, coupled with Qdrant Hybrid Cloud, allows organizations to choose the deployment model that best suits their needs. -Immutable segments are an implementation detail transparent for users — they can delete vectors at any time, while additions and updates are applied to a mutable segment instead. This combination of mutability and immutability allows search and indexing to smoothly run simultaneously, even under heavy loads. This approach minimizes the performance impact of indexing time and allows on-the-fly configuration changes on a collection level (such as enabling or disabling data quantization) without downtimes. +#### Why Run Qdrant Hybrid Cloud on Red Hat OpenShift? -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#filterable-index) Filterable Index +- **Scalability**: Red Hat OpenShift's container orchestration effortlessly scales Qdrant Hybrid Cloud components, accommodating fluctuating workload demands with ease. -Vector search wasn’t historically designed for filtering — imposing strict constraints on results. It’s inherently fuzzy; every document is, to some extent, both similar and dissimilar to any query — there’s no binary “ _fits/doesn’t fit_” segregation. As a result, vector search algorithms weren’t originally built with filtering in mind. +- **Portability**: The consistency across hybrid cloud environments provided by Red Hat OpenShift allows for smoother operation of Qdrant Hybrid Cloud across various infrastructures. -At the same time, filtering is unavoidable in many vector search applications, such as [**e-commerce search/recommendations**](https://qdrant.tech/recommendations/). Searching for a Christmas present, you might want to filter out everything over 100 euros while still benefiting from the vector search’s semantic nature. +- **Automation**: Deployment, scaling, and management tasks are automated, reducing operational overhead and simplifying the management of Qdrant Hybrid Cloud. 
-In many vector search solutions, filtering is approached in two ways: **pre-filtering** (computes a binary mask for all vectors fitting the condition before running HNSW search) or **post-filtering** (running HNSW as usual and then filtering the results). +- **Security**: Red Hat OpenShift provides built-in security features, including container isolation, network policies, and role-based access control (RBAC), enhancing the security posture of Qdrant Hybrid Cloud deployments. -| | | | -| --- | --- | --- | -| ❌ | **Pre-filtering** | Has the linear complexity of computing the vector mask and becomes a bottleneck for large datasets. | -| ❌ | **Post-filtering** | The problem with **post-filtering** is tied to vector search “ _everything fits and doesn’t at the same time_” nature: imagine a low-cardinality filter that leaves only a few matching elements in the database. If none of them are similar enough to the query to appear in the top-X retrieved results, they’ll all be filtered out. | +- **Flexibility:** Red Hat OpenShift supports a wide range of programming languages, frameworks, and tools, providing flexibility in developing and deploying Qdrant Hybrid Cloud applications. -Qdrant [**took filtering in vector search further**](https://qdrant.tech/articles/vector-search-filtering/), recognizing the limitations of pre-filtering & post-filtering strategies. We developed an adaptation of HNSW — [**filterable HNSW**](https://qdrant.tech/articles/filtrable-hnsw/) — that also enables **in-place filtering** during graph traversal. To make this possible, we condition HNSW index construction on possible filtering conditions reflected by [**payload indexes**](https://qdrant.tech/documentation/concepts/indexing/#payload-index) (inverted indexes built on vectors’ [**metadata**](https://qdrant.tech/documentation/concepts/payload/)). +- **Integration:** Red Hat OpenShift can be integrated with various Red Hat and third-party tools, facilitating seamless integration of Qdrant Hybrid Cloud with other enterprise systems and services. -**Qdrant was designed with a vector index being a central component of the system.** That made it possible to organize optimizers, payload indexes and other components around the vector index, unlocking the possibility of building a filterable HNSW. -![Filterable Vector Index](https://qdrant.tech/articles_data/dedicated-vector-search/filterable-vector-index.png) +#### Get Started with Qdrant Hybrid Cloud on Red Hat OpenShift -Filterable Vector Index +We're thrilled about our collaboration with Red Hat to help simplify AI infrastructure for developers and enterprises alike. By deploying Qdrant Hybrid Cloud on Red Hat OpenShift, developers can gain the ability to more easily scale and maintain greater operational consistency across hybrid cloud environments. -In general, optimizing vector search requires a custom, finely tuned approach to data and index management that secures high performance even as data grows and changes dynamically. This specialized architecture is the key reason why **dedicated vector databases will always outperform general-purpose databases in production settings**. +To get started, we created a comprehensive tutorial that shows how to build next-gen AI applications with Qdrant Hybrid Cloud on Red Hat OpenShift. 
Additionally, you can find more details on the seamless deployment process in our documentation: -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#vector-search-beyond-rag) Vector Search Beyond RAG +![hybrid-cloud-red-hat-openshift-tutorial](/blog/hybrid-cloud-red-hat-openshift/hybrid-cloud-red-hat-openshift-tutorial.png) -![Vector Search is not Text Search Extension](https://qdrant.tech/articles_data/dedicated-vector-search/venn-diagram.png) +#### Tutorial: Private Chatbot for Interactive Learning -Vector Search is not Text Search Extension +In this tutorial, you will build a chatbot without public internet access. The goal is to keep sensitive data secure and isolated. Your RAG system will be built with Qdrant Hybrid Cloud on Red Hat OpenShift, leveraging Haystack for enhanced generative AI capabilities. This tutorial especially explores how this setup ensures that not a single data point leaves the environment. -Many discussions about the purpose of vector databases focus on Retrieval-Augmented Generation (RAG) — or its more advanced variant, agentic RAG — where vector databases are used as a knowledge source to retrieve context for large language models (LLMs). This is a legitimate use case, however, the hype wave of RAG solutions has overshadowed the broader potential of vector search, which goes [**beyond augmenting generative AI**](https://qdrant.tech/articles/vector-similarity-beyond-search/). +[Try the Tutorial](/documentation/tutorials/rag-chatbot-red-hat-openshift-haystack/) -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#discovery) Discovery +#### Documentation: Deploy Qdrant in a Few Clicks -The strength of vector search lies in its ability to facilitate [**discovery**](https://qdrant.tech/articles/discovery-search/). Vector search allows you to refine your choices as you search rather than starting with a fixed query. Say, [**you’re ordering food not knowing exactly what you want**](https://qdrant.tech/articles/food-discovery-demo/) — just that it should contain meat & not a burger, or that it should be meat with cheese & not tacos. Instead of searching for a specific dish, vector search helps you navigate options based on similarity and dissimilarity, guiding you toward something that matches your taste without requiring you to define it upfront. +> Our simple Kubernetes-native design allows you to deploy Qdrant Hybrid Cloud on your Red Hat OpenShift instance in just a few steps. Learn how in our documentation. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#recommendations) Recommendations +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Vector search is perfect for [**recommendations**](https://qdrant.tech/documentation/concepts/explore/#recommendation-api). Imagine browsing for a new book or movie. Instead of searching for an exact match, you might look for stories that capture a certain mood or theme but differ in key aspects from what you already know. For example, you may [**want a film featuring wizards without the familiar feel of the “Harry Potter” series**](https://www.youtube.com/watch?v=O5mT8M7rqQQ). This flexibility is possible because vector search is not tied to the binary “match/not match” concept but operates on distances in a vector space. +This collaboration marks an important milestone in the quest for simplified AI infrastructure, offering a robust, scalable, and security-optimized solution for managing vector databases in a hybrid cloud environment. 
The combination of Qdrant's performance and Red Hat OpenShift's operational excellence opens new avenues for enterprises looking to leverage the power of AI and ML. -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#big-unstructured-data-analysis) Big Unstructured Data Analysis +#### Ready to Get Started? -Vector search nature makes it also ideal for [**big unstructured data analysis**](https://www.youtube.com/watch?v=_BQTnXpuH-E), for instance, anomaly detection. In large, unstructured, and often unlabelled datasets, vector search can help identify clusters and outliers by analyzing distance relationships between data points. +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#fundamentally-different) Fundamentally Different +<|page-280-lllmstxt|> +With the official release of [Qdrant Hybrid Cloud](/hybrid-cloud/), businesses running their data infrastructure on [OVHcloud](https://ovhcloud.com/) are now able to deploy a fully managed vector database in their existing OVHcloud environment. We are excited about this partnership, which has been established through the [OVHcloud Open Trusted Cloud](https://opentrustedcloud.ovhcloud.com/en/) program, as it is based on our shared understanding of the importance of trust, control, and data privacy in the context of the emerging landscape of enterprise-grade AI applications. As part of this collaboration, we are also providing a detailed use case tutorial on building a recommendation system that demonstrates the benefits of running Qdrant Hybrid Cloud on OVHcloud. -**Vector search beyond RAG isn’t just another feature — it’s a fundamental shift in how we interact with data**. Dedicated solutions integrate these capabilities natively and are designed from the ground up to handle high-dimensional math and (dis-)similarity-based retrieval. In contrast, databases with vector extensions are built around a different data paradigm, making it impossible to efficiently support advanced vector search capabilities. +Deploying Qdrant Hybrid Cloud on OVHcloud's infrastructure represents a significant leap for European businesses invested in AI-driven projects, as this collaboration underscores the commitment to meeting the rigorous requirements for data privacy and control of European startups and enterprises building AI solutions. As businesses are progressing on their AI journey, they require dedicated solutions that allow them to make their data accessible for machine learning and AI projects, without having it leave the company's security perimeter. Prioritizing data sovereignty, a crucial aspect in today's digital landscape, will help startups and enterprises accelerate their AI agendas and build even more differentiating AI-enabled applications. The ability of running Qdrant Hybrid Cloud on OVHcloud not only underscores the commitment to innovative, secure AI solutions but also ensures that companies can navigate the complexities of AI and machine learning workloads with the flexibility and security required. -Even if you want to retrofit these capabilities, it’s not just a matter of adding a new feature — it’s a structural problem. 
Supporting advanced vector search requires **dedicated interfaces** that enable flexible usage of vector search from multi-stage filtering to dynamic exploration of high-dimensional spaces. +> *“The partnership between OVHcloud and Qdrant Hybrid Cloud highlights, in the European AI landscape, a strong commitment to innovative and secure AI solutions, empowering startups and organisations to navigate AI complexities confidently. By emphasizing data sovereignty and security, we enable businesses to leverage vector databases securely.“* Yaniv Fdida, Chief Product and Technology Officer, OVHcloud -When the underlying architecture wasn’t initially designed for this kind of interaction, integrating interfaces is a **software engineering team nightmare**. You end up breaking existing assumptions, forcing inefficient workarounds, and often introducing backwards-compatibility problems. It’s why attempts to patch vector search onto traditional databases won’t match the efficiency of purpose-built systems. +#### Qdrant & OVHcloud: High Performance Vector Search With Full Data Control -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#making-vector-search-state-of-the-art) Making Vector Search State-of-the-Art +Through the seamless integration between Qdrant Hybrid Cloud and OVHcloud, developers and businesses are able to deploy the fully managed vector database within their existing OVHcloud setups in minutes, enabling faster, more accurate AI-driven insights. -![vector-search-state-of-the-art](https://qdrant.tech/articles_data/dedicated-vector-search/image4.jpg) +- **Simple setup:** With the seamless “one-click” installation, developers are able to deploy Qdrant’s fully managed vector database to their existing OVHcloud environment. -Now, let’s shift focus to another key advantage of dedicated solutions — their ability to keep up with state-of-the-art solutions in the field. +- **Trust and data sovereignty**: Deploying Qdrant Hybrid Cloud on OVHcloud enables developers with vector search that prioritizes data sovereignty, a crucial aspect in today's AI landscape where data privacy and control are essential. True to its “Sovereign by design” DNA, OVHcloud guarantees that all the data stored are immune to extraterritorial laws and comply with the highest security standards. -[**Vector databases**](https://qdrant.tech/qdrant-vector-database/) are purpose-built for vector retrieval, and as a result, they offer cutting-edge features that are often critical for AI businesses relying on vector search. Vector database engineers invest significant time and effort into researching and implementing the most optimal ways to perform vector search. Many of these innovations come naturally to vector-native architectures, while general-purpose databases with added vector capabilities may struggle to adapt and replicate these benefits efficiently. +- **Open standards and open ecosystem**: OVHcloud’s commitment to open standards and an open ecosystem not only facilitates the easy integration of Qdrant Hybrid Cloud with OVHcloud’s AI services and GPU-powered instances but also ensures compatibility with a wide range of external services and applications, enabling seamless data workflows across the modern AI stack. 
-Consider some of the advanced features implemented in Qdrant: +- **Cost-efficient vector search:** By leveraging Qdrant’s quantization for efficient data handling and pairing it with OVHcloud’s eco-friendly, water-cooled infrastructure, known for its superior price/performance ratio, this collaboration provides a strong foundation for cost-efficient vector search. -- [**GPU-Accelerated Indexing**](https://qdrant.tech/blog/qdrant-1.13.x/#gpu-accelerated-indexing) +#### Build a RAG-Based System with Qdrant Hybrid Cloud and OVHcloud -By offloading index construction tasks to the GPU, Qdrant can significantly speed up the process of data indexing while keeping costs low. This becomes especially valuable when working with large datasets in hot data scenarios. +![hybrid-cloud-ovhcloud-tutorial](/blog/hybrid-cloud-ovhcloud/hybrid-cloud-ovhcloud-tutorial.png) -GPU acceleration in Qdrant is a custom solution developed by an enthusiast from our core team. It’s vendor-free and natively supports all Qdrant’s unique architectural features, from Filterable HNSW to multivectors. +To show how Qdrant Hybrid Cloud deployed on OVHcloud allows developers to leverage the benefits of an AI use case that is completely run within the existing infrastructure, we put together a comprehensive use case tutorial. This tutorial guides you through creating a recommendation system using collaborative filtering and sparse vectors with Qdrant Hybrid Cloud on OVHcloud. It employs the Movielens dataset for practical application, providing insights into building efficient, scalable recommendation engines suitable for developers and data scientists looking to leverage advanced vector search technologies within a secure, GDPR-compliant European cloud infrastructure. -- [**Multivectors**](https://qdrant.tech/documentation/concepts/vectors/?q=multivectors#multivectors) +[Try the Tutorial](/documentation/tutorials/recommendation-system-ovhcloud/) -Some modern embedding models produce an entire matrix (a list of vectors) as output rather than a single vector. Qdrant supports multivectors natively. +#### Get Started Today and Leverage the Benefits of Qdrant Hybrid Cloud -This feature is critical when using state-of-the-art retrieval models such as [**ColBERT**](https://qdrant.tech/documentation/fastembed/fastembed-colbert/), ColPali, or ColQwen. For instance, ColPali and ColQwen produce multivector outputs, and supporting them natively is crucial for [**state-of-the-art (SOTA) PDF-retrieval**](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/). +Setting up Qdrant Hybrid Cloud on OVHcloud is straightforward and quick, thanks to the intuitive integration with Kubernetes. Here's how: +- **Hybrid Cloud Activation**: Log into your Qdrant account and enable 'Hybrid Cloud'. -In addition to that, we continuously look for improvements in: +- **Cluster Integration**: Add your OVHcloud Kubernetes clusters as a Hybrid Cloud Environment in the Hybrid Cloud settings.
-| | | -| --- | --- | -| **Memory Efficiency & Compression** | Techniques such as [**quantization**](https://qdrant.tech/articles/dedicated-vector-search/documentation/guides/quantization/) and [**HNSW compression**](https://qdrant.tech/blog/qdrant-1.13.x/#hnsw-graph-compression) to reduce storage requirements | -| **Retrieval Algorithms** | Support for the latest retrieval algorithms, including [**sparse neural retrieval**](https://qdrant.tech/articles/modern-sparse-neural-retrieval/), [**hybrid search**](https://qdrant.tech/documentation/concepts/hybrid-queries/) methods, and [**re-rankers**](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/). | -| **Vector Data Analysis & Visualization** | Tools like the [**distance matrix API**](https://qdrant.tech/blog/qdrant-1.12.x/#distance-matrix-api-for-data-insights) provide insights into vectorized data, and a [**Web UI**](https://qdrant.tech/blog/qdrant-1.11.x/#web-ui-search-quality-tool) allows for intuitive exploration of data. | -| **Search Speed & Scalability** | Includes optimizations for [**multi-tenant environments**](https://qdrant.tech/articles/multitenancy/) to ensure efficient and scalable search. | +- **Effortless Deployment**: Use the Qdrant Management Console for easy deployment and management of Qdrant clusters on OVHcloud. -**These advancements are not just incremental improvements — they define the difference between a system optimized for vector search and one that accommodates it.** +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Staying at the cutting edge of vector search is not just about performance — it’s also about keeping pace with an evolving AI landscape. +#### Ready to Get Started? -## [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#summing-up) Summing up +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -![conclusion-vector-search](https://qdrant.tech/articles_data/dedicated-vector-search/image5.jpg) +<|page-281-lllmstxt|> +We're happy to announce the collaboration between [LlamaIndex](https://www.llamaindex.ai/) and [Qdrant’s new Hybrid Cloud launch](/hybrid-cloud/), aimed at empowering engineers and scientists worldwide to swiftly and securely develop and scale their GenAI applications. By leveraging LlamaIndex's robust framework, users can maximize the potential of vector search and create stable and effective AI products. Qdrant Hybrid Cloud offers the same Qdrant functionality on a Kubernetes-based architecture, which further expands the ability of LlamaIndex to support any user on any environment. -When it comes to vector search, there’s a clear distinction between using a dedicated vector search solution and extending a database to support vector operations. +With Qdrant Hybrid Cloud, users have the flexibility to deploy their vector database in an environment of their choice. By using container-based scalable deployments, companies can leverage a cutting-edge framework like LlamaIndex, while staying deployed in the same hosting architecture as data sources, embedding models and LLMs. This powerful combination empowers organizations to build strong and secure applications that search, understand meaning and converse in text. 
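Picking up the quantization options mentioned in the cost-efficiency bullet and the memory-efficiency row above, here is a minimal sketch of enabling scalar quantization with the Python client. The collection name and vector size are placeholders; see the quantization guide for the full set of options.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # or your cluster URL

# Keep full-precision vectors on disk and an int8-quantized copy in RAM,
# trading a little precision for a much smaller memory footprint.
client.create_collection(
    collection_name="quantized_docs",
    vectors_config=models.VectorParams(
        size=1536,
        distance=models.Distance.COSINE,
        on_disk=True,
    ),
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,
            always_ram=True,
        ),
    ),
)
```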
-**For small-scale applications or prototypes handling up to a million data points, a non-optimized architecture might suffice.** However, as the volume of vectors grows, an unoptimized solution will quickly become a bottleneck — slowing down search operations and limiting scalability. Dedicated vector search solutions are engineered from the ground up to handle massive amounts of high-dimensional data efficiently. +While LLMs are trained on a great deal of data, they are not trained on user-specific data, which may be private or highly specific. LlamaIndex meets this challenge by adding context to LLM-based generation methods. In turn, Qdrant’s popular vector database sorts through semantically relevant information, which can further enrich the performance gains from LlamaIndex’s data connection features. With LlamaIndex, users can tap into state-of-the-art functions to query, chat, sort or parse data. Through the integration of Qdrant Hybrid Cloud and LlamaIndex developers can conveniently vectorize their data and perform highly accurate semantic search - all within their own environment. -State-of-the-art (SOTA) vector search evolves rapidly. If you plan to build on the latest advances, using a vector extension will eventually hold you back. Dedicated vector search solutions integrate these features natively, ensuring that you benefit from continuous innovations without compromising performance. +> *“LlamaIndex is thrilled to partner with Qdrant on the launch of Qdrant Hybrid Cloud, which upholds Qdrant's core functionality within a Kubernetes-based architecture. This advancement enhances LlamaIndex's ability to support diverse user environments, facilitating the development and scaling of production-grade, context-augmented LLM applications.”* Jerry Liu, CEO and Co-Founder, LlamaIndex -The power of vector search extends into areas such as big data analysis, recommendation systems, and discovery-based applications, and to support these vector search capabilities, a dedicated solution is needed. +#### Reap the Benefits of Advanced Integration Features With Qdrant and LlamaIndex -### [Anchor](https://qdrant.tech/articles/dedicated-vector-search/\#when-to-choose-a-dedicated-database-over-an-extension) When to Choose a Dedicated Database over an Extension: +Building apps with Qdrant Hybrid Cloud and LlamaIndex comes with several key advantages: -- **High-Volume, Real-Time Search**: Ideal for applications with many simultaneous users who require fast, continuous access to search results—think search engines, e-commerce recommendations, social media, or media streaming services. -- **Dynamic, Unstructured Data**: Perfect for scenarios where data is continuously evolving and where the goal is to discover insights from data patterns. -- **Innovative Applications**: If you’re looking to implement advanced use cases such as recommendation engines, hybrid search solutions, or exploratory data analysis where traditional exact or token-based searches hold short. +**Seamless Deployment:** Qdrant Hybrid Cloud’s Kubernetes-native architecture lets you deploy Qdrant in a few clicks, to an environment of your choice. Combined with the flexibility afforded by LlamaIndex, users can put together advanced RAG solutions anyplace at minimal effort. -Investing in a dedicated vector search engine will deliver the performance and flexibility necessary for success if your application relies on vector search at scale, keeps up with trends, or requires more than just a simple small-scale similarity search. 
+**Open-Source Compatibility:** LlamaIndex and Qdrant pride themselves on maintaining a reliable and mature integration that brings peace of mind to those prototyping and deploying large-scale AI solutions. Extensive documentation, code samples and tutorials support users of all skill levels in leveraging highly advanced features of data ingestion and vector search.
-##### Was this page useful?
+**Advanced Search Features:** LlamaIndex comes with built-in Qdrant Hybrid Search functionality, which combines search results from sparse and dense vectors. As a highly sought-after use case, hybrid search is easily accessible from within the LlamaIndex ecosystem. Deploying this particular type of vector search on Hybrid Cloud is a matter of a few lines of code (a short sketch appears at the end of this page).
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+#### Start Building With LlamaIndex and Qdrant Hybrid Cloud: Hybrid Search in Complex PDF Documentation Use Cases
-Thank you for your feedback! 🙏
+To get you started, we created a comprehensive tutorial that shows how to build next-gen AI applications with Qdrant Hybrid Cloud using the LlamaIndex framework and the LlamaParse API.
-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dedicated-vector-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+![hybrid-cloud-llamaindex-tutorial](/blog/hybrid-cloud-llamaindex/hybrid-cloud-llamaindex-tutorial.png)
-On this page:
+#### Tutorial: Hybrid Search for Household Appliance Manuals
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dedicated-vector-search.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+Use this end-to-end tutorial to create a system that retrieves information from complex user manuals in PDF format to enhance user experience for companies that sell household appliances. You will build a RAG pipeline with LlamaIndex leveraging Qdrant Hybrid Cloud for enhanced generative AI capabilities. The LlamaIndex integration shows how complex tables inside of items’ PDF documents can be processed via hybrid vector search with no additional configuration.
-×
+[Try the Tutorial](/documentation/tutorials/hybrid-search-llamaindex-jinaai/)
-[Powered by](https://qdrant.tech/)
+#### Documentation: Deploy Qdrant in a Few Clicks
-<|page-153-lllmstxt|>
-## binary-quantization-openai
-- [Articles](https://qdrant.tech/articles/)
-- Optimizing OpenAI Embeddings: Enhance Efficiency with Qdrant's Binary Quantization
+Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation.
-[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/)
+[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/)
-# Optimizing OpenAI Embeddings: Enhance Efficiency with Qdrant's Binary Quantization
+#### Ready to Get Started?
-Nirant Kasliwal
+Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/).
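+For orientation, the hybrid search integration mentioned above boils down to a handful of lines. The following is only an illustrative sketch, not part of the tutorial: it assumes the `llama-index-vector-stores-qdrant` integration package, a configured embedding model, and hypothetical collection and folder names.
+
+```python
+from qdrant_client import QdrantClient
+from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR_API_KEY")
+
+# enable_hybrid=True keeps dense and sparse representations side by side in Qdrant
+vector_store = QdrantVectorStore(
+    client=client,
+    collection_name="appliance_manuals",  # hypothetical collection name
+    enable_hybrid=True,
+)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+documents = SimpleDirectoryReader("./manuals").load_data()  # hypothetical folder
+index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
+
+# Combine sparse and dense results at query time
+query_engine = index.as_query_engine(vector_store_query_mode="hybrid", similarity_top_k=5)
+print(query_engine.query("How do I descale the coffee machine?"))
+```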
-· +<|page-282-lllmstxt|> +[LangChain](https://www.langchain.com/) and Qdrant are collaborating on the launch of [Qdrant Hybrid Cloud](/hybrid-cloud/), which is designed to empower engineers and scientists globally to easily and securely develop and scale their GenAI applications. Harnessing LangChain’s robust framework, users can unlock the full potential of vector search, enabling the creation of stable and effective AI products. Qdrant Hybrid Cloud extends the same powerful functionality of Qdrant onto a Kubernetes-based architecture, enhancing LangChain’s capability to cater to users across any environment. -February 21, 2024 +Qdrant Hybrid Cloud provides users with the flexibility to deploy their vector database in a preferred environment. Through container-based scalable deployments, companies can leverage cutting-edge frameworks like LangChain while maintaining compatibility with their existing hosting architecture for data sources, embedded models, and LLMs. This potent combination empowers organizations to develop robust and secure applications capable of text-based search, complex question-answering, recommendations and analysis. -![Optimizing OpenAI Embeddings: Enhance Efficiency with Qdrant's Binary Quantization](https://qdrant.tech/articles_data/binary-quantization-openai/preview/title.jpg) +Despite LLMs being trained on vast amounts of data, they often lack user-specific or private knowledge. LangChain helps developers build context-aware reasoning applications, addressing this challenge. Qdrant’s vector database sifts through semantically relevant information, enhancing the performance gains derived from LangChain’s data connection features. With LangChain, users gain access to state-of-the-art functionalities for querying, chatting, sorting, and parsing data. Through the seamless integration of Qdrant Hybrid Cloud and LangChain, developers can effortlessly vectorize their data and conduct highly accurate semantic searches—all within their preferred environment. -OpenAI Ada-003 embeddings are a powerful tool for natural language processing (NLP). However, the size of the embeddings are a challenge, especially with real-time search and retrieval. In this article, we explore how you can use Qdrant’s Binary Quantization to enhance the performance and efficiency of OpenAI embeddings. +> *“The AI industry is rapidly maturing, and more companies are moving their applications into production. We're really excited at LangChain about supporting enterprises' unique data architectures and tooling needs through integrations and first-party offerings through LangSmith. First-party enterprise integrations like Qdrant's greatly contribute to the LangChain ecosystem with enterprise-ready retrieval features that seamlessly integrate with LangSmith's observability, production monitoring, and automation features, and we're really excited to develop our partnership further.”* -Erick Friis, Founding Engineer at LangChain -In this post, we discuss: +#### Discover Advanced Integration Options with Qdrant Hybrid Cloud and LangChain -- The significance of OpenAI embeddings and real-world challenges. 
-- Qdrant’s Binary Quantization, and how it can improve the performance of OpenAI embeddings -- Results of an experiment that highlights improvements in search efficiency and accuracy -- Implications of these findings for real-world applications -- Best practices for leveraging Binary Quantization to enhance OpenAI embeddings +Building apps with Qdrant Hybrid Cloud and LangChain comes with several key advantages: -If you’re new to Binary Quantization, consider reading our article which walks you through the concept and [how to use it with Qdrant](https://qdrant.tech/articles/binary-quantization/) +**Seamless Deployment:** With Qdrant Hybrid Cloud's Kubernetes-native architecture, deploying Qdrant is as simple as a few clicks, allowing you to choose your preferred environment. Coupled with LangChain's flexibility, users can effortlessly create advanced RAG solutions anywhere with minimal effort. -You can also try out these techniques as described in [Binary Quantization OpenAI](https://github.com/qdrant/examples/blob/openai-3/binary-quantization-openai/README.md), which includes Jupyter notebooks. +**Open-Source Compatibility:** LangChain and Qdrant support a dependable and mature integration, providing peace of mind to those developing and deploying large-scale AI solutions. With comprehensive documentation, code samples, and tutorials, users of all skill levels can harness the advanced features of data ingestion and vector search to their fullest potential. -## [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#new-openai-embeddings-performance-and-changes) New OpenAI embeddings: performance and changes +**Advanced RAG Performance:** By infusing LLMs with relevant context, Qdrant offers superior results for RAG use cases. Integrating vector search yields improved retrieval accuracy, faster query speeds, and reduced computational overhead. LangChain streamlines the entire process, offering speed, scalability, and efficiency, particularly beneficial for enterprise-scale deployments dealing with vast datasets. Furthermore, [LangSmith](https://www.langchain.com/langsmith) provides one-line instrumentation for debugging, observability, and ongoing performance testing of LLM applications. -As the technology of embedding models has advanced, demand has grown. Users are looking more for powerful and efficient text-embedding models. OpenAI’s Ada-003 embeddings offer state-of-the-art performance on a wide range of NLP tasks, including those noted in [MTEB](https://huggingface.co/spaces/mteb/leaderboard) and [MIRACL](https://openai.com/blog/new-embedding-models-and-api-updates). +#### Start Building With LangChain and Qdrant Hybrid Cloud: Develop a RAG-Based Employee Onboarding System -These models include multilingual support in over 100 languages. The transition from text-embedding-ada-002 to text-embedding-3-large has led to a significant jump in performance scores (from 31.4% to 54.9% on MIRACL). +To get you started, we’ve put together a tutorial that shows how to create next-gen AI applications with Qdrant Hybrid Cloud using the LangChain framework and Cohere embeddings. -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#matryoshka-representation-learning) Matryoshka representation learning +![hybrid-cloud-langchain-tutorial](/blog/hybrid-cloud-langchain/hybrid-cloud-langchain-tutorial.png) -The new OpenAI models have been trained with a novel approach called “ [Matryoshka Representation Learning](https://aniketrege.github.io/blog/2024/mrl/)”. 
Developers can set up embeddings of different sizes (number of dimensions). In this post, we use small and large variants. Developers can select embeddings which balances accuracy and size. +#### Tutorial: Build a RAG System for Employee Onboarding -Here, we show how the accuracy of binary quantization is quite good across different dimensions – for both the models. +We created a comprehensive tutorial to show how you can build a RAG-based system with Qdrant Hybrid Cloud, LangChain and Cohere’s embeddings. This use case is focused on building a question-answering system for internal corporate employee onboarding. -## [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#enhanced-performance-and-efficiency-with-binary-quantization) Enhanced performance and efficiency with binary quantization +[Try the Tutorial](/documentation/tutorials/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/) -By reducing storage needs, you can scale applications with lower costs. This addresses a critical challenge posed by the original embedding sizes. Binary Quantization also speeds the search process. It simplifies the complex distance calculations between vectors into more manageable bitwise operations, which supports potentially real-time searches across vast datasets. +#### Documentation: Deploy Qdrant in a Few Clicks -The accompanying graph illustrates the promising accuracy levels achievable with binary quantization across different model sizes, showcasing its practicality without severely compromising on performance. This dual advantage of storage reduction and accelerated search capabilities underscores the transformative potential of Binary Quantization in deploying OpenAI embeddings more effectively across various real-world applications. +Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation. -![](https://qdrant.tech/blog/openai/Accuracy_Models.png) +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -The efficiency gains from Binary Quantization are as follows: +#### Ready to Get Started? -- Reduced storage footprint: It helps with large-scale datasets. It also saves on memory, and scales up to 30x at the same cost. -- Enhanced speed of data retrieval: Smaller data sizes generally leads to faster searches. -- Accelerated search process: It is based on simplified distance calculations between vectors to bitwise operations. This enables real-time querying even in extensive databases. +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#experiment-setup-openai-embeddings-in-focus) Experiment setup: OpenAI embeddings in focus +<|page-283-lllmstxt|> +We're thrilled to announce the collaboration between Qdrant and [Jina AI](https://jina.ai/) for the launch of [Qdrant Hybrid Cloud](/hybrid-cloud/), empowering users worldwide to rapidly and securely develop and scale their AI applications. By leveraging Jina AI's top-tier large language models (LLMs), engineers and scientists can optimize their vector search efforts. Qdrant's latest Hybrid Cloud solution, designed natively with Kubernetes, seamlessly integrates with Jina AI's robust embedding models and APIs. 
This synergy streamlines both prototyping and deployment processes for AI solutions. -To identify Binary Quantization’s impact on search efficiency and accuracy, we designed our experiment on OpenAI text-embedding models. These models, which capture nuanced linguistic features and semantic relationships, are the backbone of our analysis. We then delve deep into the potential enhancements offered by Qdrant’s Binary Quantization feature. +Retrieval Augmented Generation (RAG) is broadly adopted as the go-to Generative AI solution, as it enables powerful and cost-effective chatbots, customer support agents and other forms of semantic search applications. Through Jina AI's managed service, users gain access to cutting-edge text generation and comprehension capabilities, conveniently accessible through an API. Qdrant Hybrid Cloud effortlessly incorporates Jina AI's embedding models, facilitating smooth data vectorization and delivering exceptionally precise semantic search functionality. -This approach not only leverages the high-caliber OpenAI embeddings but also provides a broad basis for evaluating the search mechanism under scrutiny. +With Qdrant Hybrid Cloud, users have the flexibility to deploy their vector database in an environment of their choice. By using container-based scalable deployments, global businesses can keep both products deployed in the same hosting architecture. By combining Jina AI’s models with Qdrant’s vector search capabilities, developers can create robust and scalable applications tailored to meet the demands of modern enterprises. This combination allows organizations to build strong and secure Generative AI solutions. -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#dataset) Dataset +> *“The collaboration of Qdrant Hybrid Cloud with Jina AI’s embeddings gives every user the tools to craft a perfect search framework with unmatched accuracy and scalability. It’s a partnership that truly pays off!”* Nan Wang, CTO, Jina AI -The research employs 100K random samples from the [OpenAI 1M](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) 1M dataset, focusing on 100 randomly selected records. These records serve as queries in the experiment, aiming to assess how Binary Quantization influences search efficiency and precision within the dataset. We then use the embeddings of the queries to search for the nearest neighbors in the dataset. +#### Benefits of Qdrant’s Vector Search With Jina AI Embeddings in Enterprise RAG Scenarios -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#parameters-oversampling-rescoring-and-search-limits) Parameters: oversampling, rescoring, and search limits +Building apps with Qdrant Hybrid Cloud and Jina AI’s embeddings comes with several key advantages: -For each record, we run a parameter sweep over the number of oversampling, rescoring, and search limits. We can then understand the impact of these parameters on search accuracy and efficiency. Our experiment was designed to assess the impact of Binary Quantization under various conditions, based on the following parameters: +**Seamless Deployment:** Jina AI’s best-in-class embedding APIs can be combined with Qdrant Hybrid Cloud’s Kubernetes-native architecture to deploy flexible and platform-agnostic AI solutions in a few minutes to any environment. This combination is purpose built for both prototyping and scalability, so that users can put together advanced RAG solutions anyplace with minimal effort. 
-- **Oversampling**: By oversampling, we can limit the loss of information inherent in quantization. This also helps to preserve the semantic richness of your OpenAI embeddings. We experimented with different oversampling factors, and identified the impact on the accuracy and efficiency of search. Spoiler: higher oversampling factors tend to improve the accuracy of searches. However, they usually require more computational resources. +**Scalable Vector Search:** Once deployed to a customer’s host of choice, Qdrant Hybrid Cloud provides a fully managed vector database that lets users effortlessly scale the setup through vertical or horizontal scaling. Deployed in highly secure environments, this is a robust setup that is designed to meet the needs of large enterprises, ensuring a full spectrum of solutions for various projects and workloads. -- **Rescoring**: Rescoring refines the first results of an initial binary search. This process leverages the original high-dimensional vectors to refine the search results, **always** improving accuracy. We toggled rescoring on and off to measure effectiveness, when combined with Binary Quantization. We also measured the impact on search performance. +**Cost Efficiency:** By leveraging Jina AI's scalable and affordable pricing structure and pairing it with Qdrant's quantization for efficient data handling, this integration offers great value for its cost. Companies who are just getting started with both will have a minimal upfront investment and optimal cost management going forward. -- **Search Limits**: We specify the number of results from the search process. We experimented with various search limits to measure their impact the accuracy and efficiency. We explored the trade-offs between search depth and performance. The results provide insight for applications with different precision and speed requirements. +#### Start Building Gen AI Apps With Jina AI and Qdrant Hybrid Cloud +![hybrid-cloud-jinaai-tutorial](/blog/hybrid-cloud-jinaai/hybrid-cloud-jinaai-tutorial.png) -Through this detailed setup, our experiment sought to shed light on the nuanced interplay between Binary Quantization and the high-quality embeddings produced by OpenAI’s models. By meticulously adjusting and observing the outcomes under different conditions, we aimed to uncover actionable insights that could empower users to harness the full potential of Qdrant in combination with OpenAI’s embeddings, regardless of their specific application needs. +To get you started, we created a comprehensive tutorial that shows how to build a modern GenAI application with Qdrant Hybrid Cloud and Jina AI embeddings. -### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#results-binary-quantizations-impact-on-openai-embeddings) Results: binary quantization’s impact on OpenAI embeddings +#### Tutorial: Hybrid Search for Household Appliance Manuals -To analyze the impact of rescoring ( `True` or `False`), we compared results across different model configurations and search limits. Rescoring sets up a more precise search, based on results from an initial query. +Learn how to build an app that retrieves information from PDF user manuals to enhance user experience for companies that sell household appliances. The system will leverage Jina AI embeddings and Qdrant Hybrid Cloud for enhanced generative AI capabilities, while the RAG pipeline will be tied together using the LlamaIndex framework. 
This example demonstrates how complex tables in PDF documentation can be processed as high quality embeddings with no extra configuration. By introducing Hybrid Search from Qdrant, the RAG functionality is highly accurate. -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#rescoring) Rescoring +[Try the Tutorial](/documentation/tutorials/hybrid-search-llamaindex-jinaai/) -![Graph that measures the impact of rescoring](https://qdrant.tech/blog/openai/Rescoring_Impact.png) +#### Documentation: Deploy Qdrant in a Few Clicks -Here are some key observations, which analyzes the impact of rescoring ( `True` or `False`): +Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation. -1. **Significantly Improved Accuracy**: +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) - - Across all models and dimension configurations, enabling rescoring ( `True`) consistently results in higher accuracy scores compared to when rescoring is disabled ( `False`). - - The improvement in accuracy is true across various search limits (10, 20, 50, 100). -2. **Model and Dimension Specific Observations**: +#### Ready to Get Started? - - For the `text-embedding-3-large` model with 3072 dimensions, rescoring boosts the accuracy from an average of about 76-77% without rescoring to 97-99% with rescoring, depending on the search limit and oversampling rate. - - The accuracy improvement with increased oversampling is more pronounced when rescoring is enabled, indicating a better utilization of the additional binary codes in refining search results. - - With the `text-embedding-3-small` model at 512 dimensions, accuracy increases from around 53-55% without rescoring to 71-91% with rescoring, highlighting the significant impact of rescoring, especially at lower dimensions. +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -In contrast, for lower dimension models (such as text-embedding-3-small with 512 dimensions), the incremental accuracy gains from increased oversampling levels are less significant, even with rescoring enabled. This suggests a diminishing return on accuracy improvement with higher oversampling in lower dimension spaces. +<|page-284-lllmstxt|> +We’re excited to share that Qdrant and [Haystack](https://haystack.deepset.ai/) are continuing to expand their seamless integration to the new [Qdrant Hybrid Cloud](/hybrid-cloud/) offering, allowing developers to deploy a managed [vector database](/articles/what-is-a-vector-database/) in their own environment of choice. Earlier this year, both Qdrant and Haystack, started to address their user’s growing need for production-ready retrieval-augmented-generation (RAG) deployments. The ability to build and deploy AI apps anywhere now allows for complete data sovereignty and control. This gives large enterprise customers the peace of mind they need before they expand AI functionalities throughout their operations. -3. **Influence of Search Limit**: - - The performance gain from rescoring seems to be relatively stable across different search limits, suggesting that rescoring consistently enhances accuracy regardless of the number of top results considered. +With a highly customizable framework like Haystack, implementing vector search becomes incredibly simple. 
Qdrant’s new Hybrid Cloud offering and its Kubernetes-native design support customers all the way from a simple prototype setup to a production scenario on any hosting platform. Users can attach AI functionalities to their existing in-house software by creating custom integration components. Don’t forget, both products are open-source and highly modular!
-In summary, enabling rescoring dramatically improves search accuracy across all tested configurations. It is crucial feature for applications where precision is paramount. The consistent performance boost provided by rescoring underscores its value in refining search results, particularly when working with complex, high-dimensional data like OpenAI embeddings. This enhancement is critical for applications that demand high accuracy, such as semantic search, content discovery, and recommendation systems, where the quality of search results directly impacts user experience and satisfaction.
+With Haystack and Qdrant Hybrid Cloud, the path to production has never been clearer. The elaborate integration of Qdrant as a Document Store simplifies the deployment of Haystack-based AI applications in any production-grade environment. Coupled with Qdrant’s Hybrid Cloud offering, your application can be deployed anyplace, on your own terms.
-### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#dataset-combinations) Dataset combinations
+>*“We hope that with Haystack 2.0 and our growing partnerships such as what we have here with Qdrant Hybrid Cloud, engineers are able to build AI systems with full autonomy. Both in how their pipelines are designed, and how their data are managed.”* Tuana Çelik, Developer Relations Lead, deepset.
-For those exploring the integration of text embedding models with Qdrant, it’s crucial to consider various model configurations for optimal performance. The dataset combinations defined above illustrate different configurations to test against Qdrant. These combinations vary by two primary attributes:
+#### Simplifying RAG Deployment: Qdrant Hybrid Cloud and Haystack 2.0 Integration
-1. **Model Name**: Signifying the specific text embedding model variant, such as “text-embedding-3-large” or “text-embedding-3-small”. This distinction correlates with the model’s capacity, with “large” models offering more detailed embeddings at the cost of increased computational resources.
+Building apps with Qdrant Hybrid Cloud and deepset’s framework has become even simpler with Haystack 2.0. Both products are completely optimized for RAG in production scenarios. Here are some key advantages:
-2. **Dimensions**: This refers to the size of the vector embeddings produced by the model. Options range from 512 to 3072 dimensions. Higher dimensions could lead to more precise embeddings but might also increase the search time and memory usage in Qdrant.
+**Mature Integration:** You can connect your Haystack pipelines to Qdrant in a few lines of code (see the short sketch below). Qdrant Hybrid Cloud leverages the existing “Document Store” integration for data sources. This common interface makes it easy to access Qdrant as a data source from within your existing setup.
+**Production Readiness:** With deepset’s new product [Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks), you can generate RESTful APIs from Haystack pipelines. This simplifies the deployment process and makes the service easily accessible by developers using Qdrant Hybrid Cloud to prepare RAG systems for production.
-Optimizing these parameters is a balancing act between search accuracy and resource efficiency. Testing across these combinations allows users to identify the configuration that best meets their specific needs, considering the trade-offs between computational resources and the quality of search results. +**Flexible & Customizable:** The open-source nature of Qdrant and Haystack’s 2.0 makes it easy to extend the capabilities of both products through customization. When tailoring vector RAG systems to their own needs, users can develop custom components and plug them into both Qdrant Hybrid Cloud and Haystack for maximum modularity. [Creating custom components](https://docs.haystack.deepset.ai/docs/custom-components) is a core functionality. -```python -dataset_combinations = [\ - {\ - "model_name": "text-embedding-3-large",\ - "dimensions": 3072,\ - },\ - {\ - "model_name": "text-embedding-3-large",\ - "dimensions": 1024,\ - },\ - {\ - "model_name": "text-embedding-3-large",\ - "dimensions": 1536,\ - },\ - {\ - "model_name": "text-embedding-3-small",\ - "dimensions": 512,\ - },\ - {\ - "model_name": "text-embedding-3-small",\ - "dimensions": 1024,\ - },\ - {\ - "model_name": "text-embedding-3-small",\ - "dimensions": 1536,\ - },\ -] +#### Learn How to Build a Production-Level RAG Service with Qdrant and Haystack -``` +![hybrid-cloud-haystack-tutorial](/blog/hybrid-cloud-haystack/hybrid-cloud-haystack-tutorial.png) -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#exploring-dataset-combinations-and-their-impacts-on-model-performance) Exploring dataset combinations and their impacts on model performance +To get you started, we created a comprehensive tutorial that shows how to build next-gen AI applications with Qdrant Hybrid Cloud using deepset’s Haystack framework. -The code snippet iterates through predefined dataset and model combinations. For each combination, characterized by the model name and its dimensions, the corresponding experiment’s results are loaded. These results, which are stored in JSON format, include performance metrics like accuracy under different configurations: with and without oversampling, and with and without a rescore step. +#### Tutorial: Private Chatbot for Interactive Learning -Following the extraction of these metrics, the code computes the average accuracy across different settings, excluding extreme cases of very low limits (specifically, limits of 1 and 5). This computation groups the results by oversampling, rescore presence, and limit, before calculating the mean accuracy for each subgroup. +Learn how to develop a tutor chatbot from online course materials. You will create a Retrieval Augmented Generation (RAG) pipeline with Haystack for enhanced generative AI capabilities and Qdrant Hybrid Cloud for vector search. By deploying every tool on RedHat OpenShift, you will ensure complete privacy and data sovereignty, whereby no course content leaves your cloud. -After gathering and processing this data, the average accuracies are organized into a pivot table. This table is indexed by the limit (the number of top results considered), and columns are formed based on combinations of oversampling and rescoring. 
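+As a back-reference to the “Mature Integration” point above, connecting a Haystack 2.x pipeline to Qdrant can be sketched roughly as follows. This is an illustrative example under assumptions: it uses the `qdrant-haystack` integration and Sentence Transformers embedders, and the URL, API key, and index name are placeholders.
+
+```python
+from haystack import Document, Pipeline
+from haystack.components.embedders import (
+    SentenceTransformersDocumentEmbedder,
+    SentenceTransformersTextEmbedder,
+)
+from haystack.utils import Secret
+from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
+from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
+
+document_store = QdrantDocumentStore(
+    url="https://YOUR-CLUSTER-URL:6333",
+    api_key=Secret.from_env_var("QDRANT_API_KEY"),
+    index="course_materials",  # placeholder index name
+    embedding_dim=384,
+)
+
+# Embed a document and write it to Qdrant
+doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
+doc_embedder.warm_up()
+docs = doc_embedder.run([Document(content="Lesson 3 covers vector quantization.")])["documents"]
+document_store.write_documents(docs)
+
+# Query pipeline: embed the question, then retrieve matching documents from Qdrant
+pipeline = Pipeline()
+pipeline.add_component("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
+pipeline.add_component("retriever", QdrantEmbeddingRetriever(document_store=document_store))
+pipeline.connect("embedder.embedding", "retriever.query_embedding")
+
+result = pipeline.run({"embedder": {"text": "What does lesson 3 cover?"}})
+print(result["retriever"]["documents"])
+```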
+[Try the Tutorial](/documentation/tutorials/rag-chatbot-red-hat-openshift-haystack/) -```python -import pandas as pd +#### Documentation: Deploy Qdrant in a Few Clicks -for combination in dataset_combinations: - model_name = combination["model_name"] - dimensions = combination["dimensions"] - print(f"Model: {model_name}, dimensions: {dimensions}") - results = pd.read_json(f"../results/results-{model_name}-{dimensions}.json", lines=True) - average_accuracy = results[results["limit"] != 1] - average_accuracy = average_accuracy[average_accuracy["limit"] != 5] - average_accuracy = average_accuracy.groupby(["oversampling", "rescore", "limit"])[\ - "accuracy"\ - ].mean() - average_accuracy = average_accuracy.reset_index() - acc = average_accuracy.pivot( - index="limit", columns=["oversampling", "rescore"], values="accuracy" - ) - print(acc) +Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation. -``` +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -Here is a selected slice of these results, with `rescore=True`: +#### Ready to get started? -| Method | Dimensionality | Test Dataset | Recall | Oversampling | -| --- | --- | --- | --- | --- | -| OpenAI text-embedding-3-large (highest MTEB score from the table) | 3072 | [DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 0.9966 | 3x | -| OpenAI text-embedding-3-small | 1536 | [DBpedia 100K](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K) | 0.9847 | 3x | -| OpenAI text-embedding-3-large | 1536 | [DBpedia 1M](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M) | 0.9826 | 3x | +Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). -#### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#impact-of-oversampling) Impact of oversampling +<|page-285-lllmstxt|> +Developers are constantly seeking new ways to enhance their AI applications with new customer experiences. At the core of this are vector databases, as they enable the efficient handling of complex, unstructured data, making it possible to power applications with semantic search, personalized recommendation systems, and intelligent Q&A platforms. However, when deploying such new AI applications, especially those handling sensitive or personal user data, privacy becomes important. -You can use oversampling in machine learning to counteract imbalances in datasets. -It works well when one class significantly outnumbers others. This imbalance -can skew the performance of models, which favors the majority class at the -expense of others. By creating additional samples from the minority classes, -oversampling helps equalize the representation of classes in the training dataset, thus enabling more fair and accurate modeling of real-world scenarios. +[DigitalOcean](https://www.digitalocean.com/) and Qdrant are actively addressing this with an integration that lets developers deploy a managed vector database in their existing DigitalOcean environments. 
With the recent launch of [Qdrant Hybrid Cloud](/hybrid-cloud/), developers can seamlessly deploy Qdrant on DigitalOcean Kubernetes (DOKS) clusters, making it easier for developers to handle vector databases without getting bogged down in the complexity of managing the underlying infrastructure.
-The screenshot showcases the effect of oversampling on model performance metrics. While the actual metrics aren’t shown, we expect to see improvements in measures such as precision, recall, or F1-score. These improvements illustrate the effectiveness of oversampling in creating a more balanced dataset. It allows the model to learn a better representation of all classes, not just the dominant one.
+#### Unlocking the Power of Generative AI with Qdrant and DigitalOcean
-Without an explicit code snippet or output, we focus on the role of oversampling in model fairness and performance. Through graphical representation, you can set up before-and-after comparisons. These comparisons illustrate the contribution to machine learning projects.
+User data is a critical asset for a business, and user privacy should always be a top priority. This is why businesses require tools that enable them to leverage their user data as a valuable asset while respecting privacy. Qdrant Hybrid Cloud on DigitalOcean brings these capabilities directly into developers' hands, enhancing deployment flexibility and ensuring greater control over data.
-![Measuring the impact of oversampling](https://qdrant.tech/blog/openai/Oversampling_Impact.png)
+> *“Qdrant, with its seamless integration and robust performance, equips businesses to develop cutting-edge applications that truly resonate with their users. Through applications such as semantic search, Q&A systems, recommendation engines, image search, and RAG, DigitalOcean customers can leverage their data to the fullest, ensuring privacy and driving innovation.”* - Bikram Gupta, Lead Product Manager, Kubernetes & App Platform, DigitalOcean.
-### [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#leveraging-binary-quantization-best-practices) Leveraging binary quantization: best practices
+#### Get Started with Qdrant on DigitalOcean
-We recommend the following best practices for leveraging Binary Quantization to enhance OpenAI embeddings:
+DigitalOcean customers can easily deploy Qdrant on their DigitalOcean Kubernetes (DOKS) clusters through a simple Kubernetes-native “one-line” installation. This simplicity allows businesses to start small and scale efficiently.
-1. Embedding Model: Use the text-embedding-3-large from MTEB. It is most accurate among those tested.
-2. Dimensions: Use the highest dimension available for the model, to maximize accuracy. The results are true for English and other languages.
-3. Oversampling: Use an oversampling factor of 3 for the best balance between accuracy and efficiency. This factor is suitable for a wide range of applications.
-4. Rescoring: Enable rescoring to improve the accuracy of search results.
-5. RAM: Store the full vectors and payload on disk. Limit what you load from memory to the binary quantization index. This helps reduce the memory footprint and improve the overall efficiency of the system. The incremental latency from the disk read is negligible compared to the latency savings from the binary scoring in Qdrant, which uses SIMD instructions where possible.
+- **Simple Deployment**: Leveraging Kubernetes, deploying Qdrant Hybrid Cloud on DigitalOcean is streamlined, making the management of vector search workloads in your own environment more efficient.
-## [Anchor](https://qdrant.tech/articles/binary-quantization-openai/\#whats-next) What’s next?
+- **Own Infrastructure**: Hosting the vector database on your DigitalOcean infrastructure offers flexibility and allows you to manage the entire AI stack in one place.
-Binary quantization is exceptional if you need to work with large volumes of data under high recall expectations. You can try this feature either by spinning up a [Qdrant container image](https://hub.docker.com/r/qdrant/qdrant) locally or, having us create one for you through a [free account](https://cloud.qdrant.io/login) in our cloud hosted service.
+- **Data Control**: Deploying within your own DigitalOcean environment ensures data control, keeping sensitive information within your security perimeter.
-The article gives examples of data sets and configuration you can use to get going. Our documentation covers [adding large datasets to Qdrant](https://qdrant.tech/documentation/tutorials/bulk-upload/) to your Qdrant instance as well as [more quantization methods](https://qdrant.tech/documentation/guides/quantization/).
+To get Qdrant Hybrid Cloud set up on DigitalOcean, just follow these steps:
-Want to discuss these findings and learn more about Binary Quantization? [Join our Discord community.](https://discord.gg/qdrant)
+- **Hybrid Cloud Setup**: Begin by logging into your [Qdrant Cloud account](https://cloud.qdrant.io/login) and activate the **Hybrid Cloud** feature in the sidebar.
-##### Was this page useful?
+- **Cluster Configuration**: From the Hybrid Cloud settings, integrate your DigitalOcean Kubernetes clusters as a Hybrid Cloud Environment.
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+- **Simplified Deployment**: Use the Qdrant Management Console to effortlessly establish and oversee your Qdrant clusters on DigitalOcean.
-Thank you for your feedback! 🙏
+#### Chat with PDF Documents with Qdrant Hybrid Cloud on DigitalOcean
-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/binary-quantization-openai.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+![hybrid-cloud-llamaindex-tutorial](/blog/hybrid-cloud-llamaindex/hybrid-cloud-llamaindex-tutorial.png)
-On this page:
+We created a tutorial that guides you through setting up and leveraging Qdrant Hybrid Cloud on DigitalOcean for a RAG application. It highlights practical steps to integrate vector search with Jina AI’s LLMs, optimizing the generation of high-quality, relevant AI content, while ensuring data sovereignty is maintained throughout. This specific system is tied together via the LlamaIndex framework.
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/binary-quantization-openai.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+[Try the Tutorial](/documentation/tutorials/hybrid-search-llamaindex-jinaai/)
-×
+For a comprehensive guide, our documentation provides detailed instructions on setting up Qdrant on DigitalOcean.
-[Powered by](https://qdrant.tech/)
-<|page-154-lllmstxt|>
-## create-cluster
-- [Documentation](https://qdrant.tech/documentation/)
-- [Cloud](https://qdrant.tech/documentation/cloud/)
-- Create a Cluster
+#### Ready to Get Started?
-# [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#creating-a-qdrant-cloud-cluster) Creating a Qdrant Cloud Cluster
+Create a [Qdrant Cloud account](https://cloud.qdrant.io/login) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/).
-Qdrant Cloud offers two types of clusters: **Free** and **Standard**.
+<|page-286-lllmstxt|>
+[Aleph Alpha](https://aleph-alpha.com/) and Qdrant are on a joint mission to empower the world’s best companies in their AI journey. The launch of [Qdrant Hybrid Cloud](/hybrid-cloud/) furthers this effort by ensuring complete data sovereignty and hosting security. This latest collaboration is all about giving enterprise customers complete transparency and sovereignty to make use of AI in their own environment. By using a hybrid cloud vector database, those looking to leverage vector search for their AI applications can now ensure their proprietary and customer data is completely secure.
-## [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#free-clusters) Free Clusters
+Aleph Alpha’s state-of-the-art technology, offering unmatched quality and safety, caters perfectly to large-scale business applications and complex scenarios utilized by professionals across fields such as science, law, and security globally. Recognizing that these sophisticated use cases often demand comprehensive data processing capabilities beyond what standalone LLMs can provide, the collaboration between Aleph Alpha and Qdrant Hybrid Cloud introduces a robust platform. This platform empowers customers with full data sovereignty, enabling secure management of highly specific and sensitive information within their own infrastructure.
-Free tier clusters are perfect for prototyping and testing. You don’t need a credit card to join.
+Together with Aleph Alpha, Qdrant Hybrid Cloud offers an ecosystem where individual components seamlessly integrate with one another. Qdrant’s new Kubernetes-native design coupled with Aleph Alpha’s powerful technology meets the needs of developers who are both prototyping and building production-level apps.
-A free tier cluster only includes 1 single node with the following resources:
+#### How Aleph Alpha and Qdrant Blend Data Control, Scalability, and European Standards
-| Resource | Value |
-| --- | --- |
-| RAM | 1 GB |
-| vCPU | 0.5 |
-| Disk space | 4 GB |
-| Nodes | 1 |
+Building apps with Qdrant Hybrid Cloud and Aleph Alpha’s models leverages some common value propositions:
-This configuration supports serving about 1 M vectors of 768 dimensions. To calculate your needs, refer to our documentation on [Capacity Planning](https://qdrant.tech/documentation/guides/capacity-planning/).
+**Data Sovereignty:** Qdrant Hybrid Cloud is the first vector database that can be deployed anywhere, with complete database isolation, while still providing fully managed cluster management. Furthermore, as the best option for organizations that prioritize data sovereignty, Aleph Alpha offers foundation models which are aimed at serving regional use cases. Together, both products can be leveraged to keep highly specific data safe and isolated.
-The choice of cloud providers and regions is limited. +**Scalable Vector Search:** Once deployed to a customer’s host of choice, Qdrant Hybrid Cloud provides a fully managed vector database that lets users effortlessly scale the setup through vertical or horizontal scaling. Deployed in highly secure environments, this is a robust setup that is designed to meet the needs of large enterprises, ensuring a full spectrum of solutions for various projects and workloads. -It includes: +**European Origins & Expertise**: With a strong presence in the European Union ecosystem, Aleph Alpha is ideally positioned to partner with European-based companies like Qdrant, providing local expertise and infrastructure that aligns with European regulatory standards. -- Standard Support -- Basic monitoring -- Basic log access -- Basic alerting -- Version upgrades with downtime -- Only manual snapshots and restores via API -- No dedicated resources +#### Build a Data-Sovereign AI System With Qdrant Hybrid Cloud and Aleph Alpha’s Models -If unused, free tier clusters are automatically suspended after 1 week, and deleted after 4 weeks of inactivity if not reactivated. +![hybrid-cloud-aleph-alpha-tutorial](/blog/hybrid-cloud-aleph-alpha/hybrid-cloud-aleph-alpha-tutorial.png) -You can always upgrade to a standard cluster with more resources and features. +To get you started, we created a comprehensive tutorial that shows how to build next-gen AI applications with Qdrant Hybrid Cloud and Aleph Alpha’s advanced models. -## [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#standard-clusters) Standard Clusters +#### Tutorial: Build a Region-Specific Contract Management System -On top of the Free cluster features, Standard clusters offer: +Learn how to develop an AI system that reads lengthy contracts and gives complex answers based on stored content. This system is completely hosted inside of Germany for GDPR compliance purposes. The tutorial shows how enterprises with a vast number of stored contract documents can leverage AI in a closed environment that doesn’t leave the hosting region, thus ensuring data sovereignty and security. -- Response time and uptime SLAs -- Dedicated resources -- Backup and disaster recovery -- Multi-node clusters for high availability -- Horizontal and vertical scaling -- Monitoring and log management -- Zero-downtime upgrades for multi-node clusters with replication +[Try the Tutorial](/documentation/examples/rag-contract-management-stackit-aleph-alpha/) -You have a broad choice of regions on AWS, Azure and Google Cloud. +#### Documentation: Deploy Qdrant in a Few Clicks -For payment information see [**Pricing and Payments**](https://qdrant.tech/documentation/cloud/pricing-payments/). +Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation. -## [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#create-a-cluster) Create a Cluster +[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/) -![Create Cluster Page](https://qdrant.tech/documentation/cloud/create-cluster.png) +#### Ready to Get Started? -This page shows you how to use the Qdrant Cloud Console to create a custom Qdrant Cloud cluster. +Create a [Qdrant Cloud account](https://cloud.qdrant.io/signup) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/). 
-> **Prerequisite:** Please make sure you have provided billing information before creating a custom cluster. +<|page-287-lllmstxt|> +In their mission to support large-scale AI innovation, [Airbyte](https://airbyte.com/) and Qdrant are collaborating on the launch of Qdrant’s new offering - [Qdrant Hybrid Cloud](/hybrid-cloud/). This collaboration allows users to leverage the synergistic capabilities of both Airbyte and Qdrant within a private infrastructure. Qdrant’s new offering represents the first managed [vector database](/articles/what-is-a-vector-database/) that can be deployed in any environment. Businesses optimizing their data infrastructure with Airbyte are now able to host a vector database either on premise, or on a public cloud of their choice - while still reaping the benefits of a managed database product. -01. Start in the **Clusters** section of the [Cloud Dashboard](https://cloud.qdrant.io/). +This is a major step forward in offering enterprise customers incredible synergy for maximizing the potential of their AI data. Qdrant's new Kubernetes-native design, coupled with Airbyte’s powerful data ingestion pipelines meet the needs of developers who are both prototyping and building production-level apps. Airbyte simplifies the process of data integration by providing a platform that connects to various sources and destinations effortlessly. Moreover, Qdrant Hybrid Cloud leverages advanced indexing and search capabilities to empower users to explore and analyze their data efficiently. -02. Select **Clusters** and then click **\+ Create**. +In a major benefit to Generative AI, businesses can leverage Airbyte's data replication capabilities to ensure that their data in Qdrant Hybrid Cloud is always up to date. This empowers all users of Retrieval Augmented Generation (RAG) applications with effective analysis and decision-making potential, all based on the latest information. Furthermore, by combining Airbyte's platform and Qdrant's hybrid cloud infrastructure, users can optimize their data operations while keeping costs under control via flexible pricing models tailored to individual usage requirements. -03. In the **Create a cluster** screen select **Free** or **Standard** - Most of the remaining configuration options are only available for standard clusters. +> *“The new Qdrant Hybrid Cloud is an exciting addition that offers peace of mind and flexibility, aligning perfectly with the needs of Airbyte Enterprise users who value the same balance. Being open-source at our core, both Qdrant and Airbyte prioritize giving users the flexibility to build and test locally—a significant advantage for data engineers and AI practitioners. We're enthusiastic about the Hybrid Cloud launch, as it mirrors our vision of enabling users to confidently transition from local development and local deployments to a managed solution, with both cloud and hybrid cloud deployment options.”* AJ Steers, Staff Engineer for AI, Airbyte -04. Select a provider. Currently, you can deploy to: +#### Optimizing Your GenAI Data Stack With Airbyte and Qdrant Hybrid Cloud - - Amazon Web Services (AWS) - - Google Cloud Platform (GCP) - - Microsoft Azure - - Your own [Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) Infrastructure -05. Choose your data center region or Hybrid Cloud environment. +By integrating Airbyte with Qdrant Hybrid Cloud, you can achieve seamless data ingestion from diverse sources into Qdrant's powerful indexing system. 
This integration enables you to derive valuable insights from your data. Here are some key advantages: -06. Configure RAM for each node. +**Effortless Data Integration:** Airbyte's intuitive interface lets you set up data pipelines that extract, transform, and load (ETL) data from various sources into Qdrant. Additionally, Qdrant Hybrid Cloud’s Kubernetes-native architecture means that the destination vector database can now be deployed in a few clicks to any environment. With such flexibility, you can supply even the most advanced RAG applications with optimal data pipelines. +**Scalability and Performance:** With Airbyte and Qdrant Hybrid Cloud, you can scale your data infrastructure according to your needs. Whether you're dealing with terabytes or petabytes of data, this combination ensures optimal performance and scalability. This is a robust setup that is designed to meet the needs of large enterprises, ensuring a full spectrum of solutions for various projects and workloads. - > For more information, see our [Capacity Planning](https://qdrant.tech/documentation/guides/capacity-planning/) guidance. +**Powerful Indexing and Search:** Qdrant Hybrid Cloud’s architecture combines the scalability of cloud infrastructure with the performance of on-premises indexing. Qdrant's advanced algorithms enable lightning-fast search and retrieval of data, even across large datasets. -07. Choose the number of vCPUs per node. If you add more - RAM, the menu provides different options for vCPUs. +**Open-Source Compatibility:** Airbyte and Qdrant pride themselves on maintaining a reliable and mature integration that brings peace of mind to those prototyping and deploying large-scale AI solutions. Extensive open-source documentation and code samples help users of all skill levels in leveraging highly advanced features of data ingestion and vector search. -08. Select the number of nodes you want the cluster to be deployed on. +#### Build a Modern GenAI Application With Qdrant Hybrid Cloud and Airbyte +![hybrid-cloud-airbyte-tutorial](/blog/hybrid-cloud-airbyte/hybrid-cloud-airbyte-tutorial.png) - > Each node is automatically attached with a disk, that has enough space to store data with Qdrant’s default collection configuration. +We put together an end-to-end tutorial to show you how to build a GenAI application with Qdrant Hybrid Cloud and Airbyte’s advanced data pipelines. -09. Select additional disk space for your deployment. +#### Tutorial: Build a RAG System to Answer Customer Support Queries +Learn how to set up a private AI service that addresses customer support issues with high accuracy and effectiveness. By leveraging Airbyte’s data pipelines with Qdrant Hybrid Cloud, you will create a customer support system that is always synchronized with up-to-date knowledge. - > Depending on your collection configuration, you may need more disk space per RAM. For example, if you configure `on_disk: true` and only use RAM for caching. +[Try the Tutorial](/documentation/tutorials/rag-customer-support-cohere-airbyte-aws/) -10. Review your cluster configuration and pricing. +#### Documentation: Deploy Qdrant in a Few Clicks -11. When you’re ready, select **Create**. It takes some time to provision your cluster. +Our simple Kubernetes-native design lets you deploy Qdrant Hybrid Cloud on your hosting platform of choice in just a few steps. Learn how in our documentation. 
+[Read Hybrid Cloud Documentation](/documentation/hybrid-cloud/)
-Once provisioned, you can access your cluster on ports 443 and 6333 (REST) and 6334 (gRPC).
+#### Ready to Get Started?
-![Cluster configured in the UI](https://qdrant.tech/documentation/cloud/cluster-detail.png)
+Create a [Qdrant Cloud account](https://cloud.qdrant.io/signup) and deploy your first **Qdrant Hybrid Cloud** cluster in a few minutes. You can always learn more in the [official release blog](/blog/hybrid-cloud/).
-You should now see the new cluster in the **Clusters** menu.
+<|page-288-lllmstxt|>
+## **How PortfolioMind delivered real-time crypto intelligence with Qdrant**
-## [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#deleting-a-cluster) Deleting a Cluster
+The crypto world is an inherently noisy and volatile place. Markets shift quickly, narratives change overnight, and wallet activities conceal subtle yet critical patterns. For PortfolioMind, a Web3-native AI research copilot built using the [SpoonOS framework](https://spoonai.io/), the challenge was not just finding relevant information, but also surfacing it in real time.
-You can delete a Qdrant database cluster from the cluster’s detail page.
+### Challenge: Moving beyond static insights
-![Delete Cluster](https://qdrant.tech/documentation/cloud/delete-cluster.png)
+Most crypto platforms presume users want simple token tracking. PortfolioMind, however, recognized that real research behaviors are dynamic. Users pivot rapidly between topics like L2 scaling, meme tokens, protocol risks, and DeFi yield fluctuations based on real-time events.
-## [Anchor](https://qdrant.tech/documentation/cloud/create-cluster/\#next-steps) Next Steps
+Semantic search alone was insufficient. PortfolioMind required a platform capable of understanding user interests and context through real-time user interactions and behaviors.
-You will need to connect to your new Qdrant Cloud cluster. Follow [**Authentication**](https://qdrant.tech/documentation/cloud/authentication/) to create one or more API keys.
+### Solution: Modeling dynamic user curiosity
-You can also scale your cluster both horizontally and vertically. Read more in [**Cluster Scaling**](https://qdrant.tech/documentation/cloud/cluster-scaling/).
+PortfolioMind adopted Qdrant to translate user interactions into insights. Every user activity, such as searching tokens, pinning wallets, reading exploits, or engaging with DeFi contracts, left semantic traces. Qdrant transformed these traces into multivector user-intent models.
-If a new Qdrant version becomes available, you can upgrade your cluster. See [**Cluster Upgrades**](https://qdrant.tech/documentation/cloud/cluster-upgrades/).
+The system ingests diverse data including news, tokenomics, whale behaviors, portfolio histories, and interactions with DeFi/NFT dashboards, embedding each data type with rich metadata (chain, token symbol, timestamps). Using HDBSCAN clustering, PortfolioMind identifies user-specific micro-interests, creating a dynamic, multivector representation of each user’s intent.
-For more information on creating and restoring backups of a cluster, see [**Backups**](https://qdrant.tech/documentation/cloud/backups/).
+![architecture](/blog/case-study-portfoliomind/spoonos-architecture.png)
-##### Was this page useful?
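+The pattern described above, storing every interaction trace with a metadata payload and filtering on it at query time, maps directly onto the Qdrant client API. The following is only an illustrative sketch: the collection name, payload fields, and vectors are hypothetical stand-ins for PortfolioMind's real pipeline.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.create_collection(
+    collection_name="user_intent",  # hypothetical collection
+    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
+)
+
+# Store an interaction trace together with its metadata payload
+client.upsert(
+    collection_name="user_intent",
+    points=[
+        models.PointStruct(
+            id=1,
+            vector=[0.02] * 384,  # embedding of the interaction text
+            payload={"chain": "ethereum", "token": "ARB", "timestamp": 1718000000},
+        ),
+    ],
+)
+
+# Filterable semantic search: narrow candidates by metadata before vector scoring
+hits = client.query_points(
+    collection_name="user_intent",
+    query=[0.02] * 384,
+    query_filter=models.Filter(
+        must=[models.FieldCondition(key="chain", match=models.MatchValue(value="ethereum"))]
+    ),
+    limit=5,
+)
+print(hits.points)
+```
+
+The multivector user-intent models mentioned above would additionally rely on named or multi-vector collection configurations; the sketch keeps to the single-vector, payload-filtered case for brevity.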
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Why PortfolioMind chose Qdrant -Thank you for your feedback! 🙏 +PortfolioMind previously experimented with other vector databases, but selected Qdrant for its: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/create-cluster.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +* Fast, filterable searches leveraging detailed metadata payloads. +* Native multivector support to capture complex user behaviors. +* Low-latency retrieval at scale. +* Minimal operational overhead via Qdrant Cloud's managed services with hybrid indexing support. -On this page: +*"Qdrant enables us to model real-time user intent, transforming noisy data into personalized intelligence."* -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/create-cluster.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +— PortfolioMind Team -× +### Results: 70% lower latency, better user retention -[Powered by](https://qdrant.tech/) +PortfolioMind saw immediate, measurable improvements: -<|page-155-lllmstxt|> -## running-with-gpu -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Running with GPU +* Latency per query dropped by 70% under heavy multi-user load. +* Interaction relevance increased by 58% based on click-through and user engagement. +* System reactivity improved, speeding updates for user-interest clusters. +* User retention rose by 22% among long-session users. -# [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#running-qdrant-with-gpu-support) Running Qdrant with GPU Support +### Up next: Deepening curiosity modeling -Starting from version v1.13.0, Qdrant offers support for GPU acceleration. +Moving forward, PortfolioMind is expanding Qdrant’s capabilities to include: -However, GPU support is not included in the default Qdrant binary due to additional dependencies and libraries. Instead, you will need to use dedicated Docker images with GPU support ( [NVIDIA](https://qdrant.tech/documentation/guides/running-with-gpu/#nvidia-gpus), [AMD](https://qdrant.tech/documentation/guides/running-with-gpu/#amd-gpus)). +* Cross-user curiosity mapping, uncovering hidden interest clusters among diverse users. +* Temporal drift tracking, offering historical vector snapshots to visualize evolving interests. +* Improved cold-start onboarding, enabling accurate intent modeling within a few user interactions. -## [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#configuration) Configuration +The partnership has turned real-time crypto research into more personalized, actionable intelligence, showing how much timing and relevance matter. -Qdrant includes a number of configuration options to control GPU usage. The following options are available: +<|page-289-lllmstxt|> +# Qdrant Edge (Private Beta): Vector Search for Embedded AI -```yaml -gpu: - # Enable GPU indexing. - indexing: false - # Force half precision for `f32` values while indexing. - # `f16` conversion will take place - # only inside GPU memory and won't affect storage type. - force_half_precision: false - # Used vulkan "groups" of GPU. 
- # In other words, how many parallel points can be indexed by GPU. - # Optimal value might depend on the GPU model. - # Proportional, but doesn't necessary equal - # to the physical number of warps. - # Do not change this value unless you know what you are doing. - # Default: 512 - groups_count: 512 - # Filter for GPU devices by hardware name. Case insensitive. - # Comma-separated list of substrings to match - # against the gpu device name. - # Example: "nvidia" - # Default: "" - all devices are accepted. - device_filter: "" - # List of explicit GPU devices to use. - # If host has multiple GPUs, this option allows to select specific devices - # by their index in the list of found devices. - # If `device_filter` is set, indexes are applied after filtering. - # By default, all devices are accepted. - devices: null - # How many parallel indexing processes are allowed to run. - # Default: 1 - parallel_indexes: 1 - # Allow to use integrated GPUs. - # Default: false - allow_integrated: false - # Allow to use emulated GPUs like LLVMpipe. Useful for CI. - # Default: false - allow_emulated: false +Over the past two years, vector search has become foundational infrastructure for AI applications, from retrieval-augmented generation (RAG) to agentic reasoning. But as AI systems extend beyond cloud-hosted inference into the physical world \- running on devices like robots, kiosks, home assistants, and mobile phones \- new constraints emerge. Low-latency retrieval, multimodal inputs, and bandwidth-independent operation will become first-class requirements. **Qdrant Edge** is our response to this shift. -``` +## From Static RAG to Embedded AI: Three Waves of Vector Search -It is not recommended to change these options unless you are familiar with the Qdrant internals and the Vulkan API. +Vector databases first gained widespread adoption during the rise of *RAG 1.0*, where they served as context providers for LLMs performing text-based tasks such as document search and chatbot Q\&A. In these applications, performance was defined by filtering speed, recall accuracy, and support for hybrid search over structured metadata. -## [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#standalone-gpu-support) Standalone GPU Support +This evolved into the *Agentic AI* wave, where vector search engines are becoming long-term memory modules within autonomous (software) agents. Requirements expanded to include low-latency updates, token-level reranking, and multimodal retrieval. Qdrant is powering thousands of these applications in production today \- supporting with real-time search, advanced metadata filters, and native multivector semantics for ColBERT-style workflows. -For standalone usage, you can build Qdrant with GPU support by running the following command: +Now, a third wave is emerging: *Embedded AI*. This wave brings vector-based reasoning to environments without reliable network access or cloud compute. Here, the vector database must operate on-device, under tight constraints of memory, power, and I/O. Traditional vector stores \- designed for large server environments \- are not suitable. -```bash -cargo build --release --features gpu +## Qdrant Edge: Built for On-Device Vector Search -``` +Qdrant Edge is a lightweight, embedded vector search engine designed to run on local hardware with limited persistent background threads, network access, or centralized coordination. 
It retains Qdrant’s core search and filtering capabilities but is re-architected to operate as a minimal, local library that integrates directly into AI workflows on edge devices. -Ensure your device supports Vulkan API v1.3. This includes compatibility with Apple Silicon, Intel GPUs, and CPU emulators. Note that `gpu.indexing: true` must be set in your configuration to use GPUs at runtime. +Key capabilities include: -## [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#nvidia-gpus) NVIDIA GPUs +* **In-process execution**: Qdrant Edge runs as a library, not a service. There are no background optimizers or update threads. All operations \- including search and indexing \- are synchronous and under the control of the application. -### [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#prerequisites) Prerequisites +* **Minimal footprint**: Designed for memory- and compute-constrained environments. -To use Docker with NVIDIA GPU support, ensure the following are installed on your host: +* **Multitenancy-aware**: Suitable for deployments where each device (e.g., a robot or mobile unit) functions as a tenant with isolated data and compute. -- Latest NVIDIA drivers -- [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) +## Use Cases and Design Targets -Most AI or CUDA images on Amazon/GCP come pre-configured with the NVIDIA container toolkit. +Qdrant Edge is built for scenarios where inference and decision-making happen at the edge, close to the data. Example environments include: -### [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#docker-images-with-nvidia-gpu-support) Docker images with NVIDIA GPU support +* **Robotics and autonomous navigation**: Real-time perception and decision-making with multimodal vector inputs (e.g., camera, LiDAR, radar). +* **Mobile devices**: Local assistant functionality with offline access, on-device personalization, and privacy-preserving search. +* **Point-of-sale systems**: Product similarity, anomaly detection, and decision support in disconnected or bandwidth-constrained environments. +* **IoT agents**: Local retrieval for condition monitoring, predictive maintenance, or sensor fusion. -Docker images with NVIDIA GPU support use the tag suffix `gpu-nvidia`, e.g., `qdrant/qdrant:v1.13.0-gpu-nvidia`. These images include all necessary dependencies. +In each of these domains, the system demands are orthogonal to those of cloud-hosted vector infrastructure: short-lived processes, strict latency bounds, and minimal runtime dependencies. -To enable GPU support, use the `--gpus=all` flag with Docker settings. Example: +## Why Qdrant Works Well on the Edge -```bash -# `--gpus=all` flag says to Docker that we want to use GPUs. -# `-e QDRANT__GPU__INDEXING=1` flag says to Qdrant that we want to use GPUs for indexing. -docker run \ - --rm \ - --gpus=all \ - -p 6333:6333 \ - -p 6334:6334 \ - -e QDRANT__GPU__INDEXING=1 \ - qdrant/qdrant:gpu-nvidia-latest +Qdrant is the first production-grade vector database actively developing for the embedded systems domain. 
Qdrant Edge is a dedicated engineering effort to build a performant, extensible, embeddable vector engine for ML practitioners working at the edge, based on the original lightweight design of Qdrant and builds on the architectural strengths that Qdrant has been recognized for: -``` +* Custom, filterable HNSW implementation +* Hybrid search across sparse and dense modalities +* Multivector compatibility and multimodal indexing +* Real-time ingestion -To ensure that the GPU was initialized correctly, you may check it in logs. First Qdrant prints all found GPU devices without filtering and then prints list of all created devices: +Qdrant Edge carries these capabilities forward into a new class of deployment environments. -```text -2025-01-13T11:58:29.124087Z INFO gpu::instance: Found GPU device: NVIDIA GeForce RTX 3090 -2025-01-13T11:58:29.124118Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 15.0.7, 256 bits) -2025-01-13T11:58:29.124138Z INFO gpu::device: Create GPU device NVIDIA GeForce RTX 3090 +## Apply for the Private Research Beta -``` +Qdrant Edge is currently in private beta. Due to the highly targeted nature of this release, we will be selecting a limited number of partners who are actively building AI systems for embedded or real-time environments. -Here you can see that two devices were found: RTX 3090 and llvmpipe (a CPU-emulated GPU which is included in the Docker image). Later, you will see that only RTX was initialized. +If you’re working on robotics, edge inference, autonomous systems, or device-native assistants, we encourage you to apply. -This concludes the setup. Now, you can start using this Qdrant instance. +[**Apply to Join the Qdrant Edge Beta**](https://qdrant.tech/edge) -### [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#troubleshooting-nvidia-gpus) Troubleshooting NVIDIA GPUs +## Closing Thoughts -If your GPU is not detected in Docker, make sure your driver and `nvidia-container-toolkit` are up-to-date. -If needed, you can install latest version of `nvidia-container-toolkit` from it’s GitHub Releases [page](https://github.com/NVIDIA/nvidia-container-toolkit/releases) +Embedded AI systems bring unique constraints that require rethinking how we design infrastructure like vector databases. Qdrant Edge represents a new class of tooling \- one that treats on-device reasoning as a first-class capability. We’re excited to collaborate with forward-thinking teams building the next generation of intelligent systems. -Verify Vulkan API visibility in the Docker container using: -```bash -docker run --rm --gpus=all qdrant/qdrant:gpu-nvidia-latest vulkaninfo --summary +### FAQs -``` +Who is Qdrant Edge for? +Teams building AI systems that need fast, local vector search on embedded or resource-constrained devices, such as robots, mobile apps, or IoT hardware. -The system may show you an error message explaining why the NVIDIA device is not visible. -Note that if your NVIDIA GPU is not visible in Docker, the Docker image cannot use libGLX\_nvidia.so.0 on your host. Here is what an error message could look like: +Is this available to all Qdrant users? +Not yet. Qdrant Edge is in private beta. We're selecting a limited number of partners based on technical fit and active edge deployment scenarios. -```text -ERROR: [Loader Message] Code 0 : loader_scanned_icd_add: Could not get `vkCreateInstance` via `vk_icdGetInstanceProcAddr` for ICD libGLX_nvidia.so.0 -WARNING: [Loader Message] Code 0 : terminator_CreateInstance: Failed to CreateInstance in ICD 0. 
Skipping ICD. +What are the minimum requirements to join the beta? +You should have a clear use case for on-device or offline vector search. Preference is given to companies working with embedded hardware or deploying agents at the edge. -``` +How do I get access? +Qdrant Edge is currently in private beta. If you're building edge-native or embedded AI systems and want early access, [**apply to join the beta**](https://qdrant.tech/edge). -To resolve errors, update your NVIDIA container runtime configuration: +<|page-290-lllmstxt|> +This summer, researchers from ETH Zurich and Stanford [released **MIRIAD**](https://www.linkedin.com/posts/qinyue-zheng-526b391a4_we-just-released-a-million-scale-medical-activity-7337889277445365760-Criy), an open source dataset of **5.8 million medical Question Answer pairs**, each grounded in peer-reviewed literature. -```bash -sudo nano /etc/nvidia-container-runtime/config.toml +A dataset of this scale has the potential to become an **ultimate solution to the lack of structured, rich-in-context, high-quality data in the medical field**. It is a powerful means of significantly reducing hallucinations in medical AI applications, created to be a knowledge base for Retrieval Augmented Generation (RAG) and a source for downstream embedding models. -``` +One of the proudest moments for us was realizing that Qdrant was a part of MIRIAD’s story, powering its storage and RAG experiments. +We spoke to two lead authors of MIRIAD, [Qinyue Zheng (ETH)](https://www.linkedin.com/in/qinyue-zheng-526b391a4/) and [Salman Abdullah (Stanford)](https://www.linkedin.com/in/salman-abdullah-b71640172/), to learn the motivation behind MIRIAD, how its authors see its future, and why Qdrant was a deliberate choice for the research team. -Set `no-cgroups=false`, save the configuration, and restart Docker: +## Why Medical AI Needed MIRIAD -```bash -sudo systemctl restart docker +Large Language Models’ hallucinations, already problematic in general domains, are a matter of life or death in the medical field. Hence, to mitigate these risks and make medical AI applications usable, the most promising way seems to be augmenting LLMs with structured, grounded knowledge. -``` +Many modern RAG and agentic systems, including those used in medical AI, often rely on raw, unstructured data that is noisy and poorly aligned with downstream tasks. It was clear that the intersection of medicine and AI could benefit significantly from a dataset with long-form, rich-in-context Q&A pairs. +All existing datasets in the field like MedQA-USMLE or PubMedQA were either limited to multiple-choice QA formats or focused on a narrow selection of medical paper sections like abstracts or conclusions, leaving alarming knowledge gaps. -## [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#amd-gpus) AMD GPUs +The goal of MIRIAD’s authors was to create a large-scale dataset that is: +- **Structured & information-dense**; +- **Comprehensive**, covering diverse niches in medicine with sufficient depth; +- **Grounded & trustworthy**, so each piece of information can be traced back to a peer-reviewed paper. -### [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#prerequisites-1) Prerequisites +And organize medical knowledge in a way that could support clinicians and engineers in medical AI alike. -Running Qdrant with AMD GPUs requires [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/detailed-install.html) to be installed on your host.
+## How MIRIAD Was Created -### [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#docker-images-with-amd-gpu-support) Docker images with AMD GPU support +MIRIAD was built on top of the **Semantic Scholar Open Research Corpus (S2ORC)**, currently leveraging a curated subset of **900,000** medical papers, with plans to expand a broader coverage as the dataset evolves. -Docker images for AMD GPUs use the tag suffix `gpu-amd`, e.g., `qdrant/qdrant:v1.13.0-gpu-amd`. These images include all required dependencies. +Question Answer pairs in the dataset were generated from fixed-size chunks using **GPT-3.5-Turbo**, followed by a **multi-stage filtering pipeline** ensuring the quality of the resulting dataset in three stages: +- **Automated filtering** with regular expressions; +- **Filtering with classifier** based on a Mistral-7B model, trained on GPT-4-flagged low-quality examples; +- **Human-in-the-loop expert labeling**, which showed strong agreement between medical experts and GPT-4: **92.3% on groundedness, 88.6% on factuality, and 78.4% on relevance**. -To enable GPU for Docker, you need additional `--device /dev/kfd --device /dev/dri` flags. To enable GPU for Qdrant you need to set the enable flag. Here is an example: +The full outline of the pipeline behind MIRIAD is visually summarized here: -```bash -# `--device /dev/kfd --device /dev/dri` flags say to Docker that we want to use GPUs. -# `-e QDRANT__GPU__INDEXING=1` flag says to Qdrant that we want to use GPUs for indexing. -docker run \ - --rm \ - --device /dev/kfd --device /dev/dri \ - -p 6333:6333 \ - -p 6334:6334 \ - -e QDRANT__LOG_LEVEL=debug \ - -e QDRANT__GPU__INDEXING=1 \ - qdrant/qdrant:gpu-amd-latest +{{< figure src="/blog/miriad-qdrant/overview.png" alt="Overview of the data generation pipeline of MIRIAD" caption="Overview of the data generation pipeline of MIRIAD
Source: \"MIRIAD: Augmenting LLMs with millions of medical query-response pairs.\"" width="100%" >}} -``` +### Qdrant’s Role in MIRIAD -Check logs to confirm GPU initialization. Example log output: +Qdrant was a deliberate choice to handle retrieval of millions of Q&A pairs under the hood of MIRIAD’s RAG experiments. The deciding factors were **the simplicity of use and readability of our documentation**, **speed and scalability**, and, what is especially valuable for researchers, Qdrant being open source. -```text -2025-01-10T11:56:55.926466Z INFO gpu::instance: Found GPU device: AMD Radeon Graphics (RADV GFX1103_R1) -2025-01-10T11:56:55.926485Z INFO gpu::instance: Found GPU device: llvmpipe (LLVM 17.0.6, 256 bits) -2025-01-10T11:56:55.926504Z INFO gpu::device: Create GPU device AMD Radeon Graphics (RADV GFX1103_R1) +> "For us, we had millions of data points to deal with and somehow the retrieval was super fast with Qdrant!" - Qinyue Zheng -``` +These capabilities allowed researchers to focus on experiments instead of struggles of the infrastructure setup. -This concludes the setup. In a basic scenario, you won’t need to configure anything else. +## Results -## [Anchor](https://qdrant.tech/documentation/guides/running-with-gpu/\#known-limitations) Known limitations +The results of various benchmarks clearly show that structured, high-quality datasets can significantly improve the reliability of LLMs in high stakes domains. -- **Platform Support:** Docker images are only available for Linux x86\_64. Windows, macOS, ARM, and other platforms are not supported. +Augmenting MIRIAD improved: +- Accuracy on medical QA benchmarks by up to **6.7%** compared to unstructured RAG baselines, with the same source corpus and retrieval budget; +- LLMs' medical hallucination detection capabilities by **22 - 37 percentage points**. -- **Memory Limits:** Each GPU can process up to 16GB of vector data per indexing iteration. +The final MIRIAD dataset contains **5,821,948 Question Answer pairs**, each linked to a source passage and paper, spanning **56 medical topics**. +It is [open sourced on HuggingFace](https://huggingface.co/miriad), along with [source code and a detailed guide](https://github.com/eth-medical-ai-lab/MIRIAD) for full gathering and benchmarking replication, including embeddings generation and indexing to Qdrant, and RAG setup. -Due to this limitation, you should not create segments where either original vectors OR quantized vectors are larger than 16GB. +Additionally, to make the dataset accessible not only to engineers in medical AI but also to clinicians, authors of MIRIAD developed **MIRIAD Atlas**, an interactive map of UMAP-based dimensionality-reduced embeddings (similar to how one can visualize and study datasets in [Qdrant’s WebUI](https://qdrant.tech/documentation/web-ui/)). -For example, a collection with 1536d vectors and scalar quantization can have at most: + -```text -16Gb / 1536 ~= 11 million vectors per segment +## Looking Ahead -``` +The researchers’ goal is to maintain and expand MIRIAD, keeping it current with new medical knowledge on a yearly basis. There are also plans to migrate **MIRIAD Atlas** to Qdrant, to serve dataset at its full scale, and add support for the Model Context Protocol (MCP). -And without quantization: +Yet mainly, MIRIAD aims to inspire researchers, clinicians, and engineers to build more reliable and domain-specific medical AI systems. 
-```text -16Gb / 1536 * 4 ~= 2.7 million vectors per segment +> "Sky is the limit" - Qinyue Zheng -``` +Authors envision applications such as medical QA agents, medical discipline explorers using the specialty labels in MIRIAD to route among different disciplines, or explainability tools supporting RAG-based applications. -The maximum size of each segment can be configured in the collection settings. -Use the following operation to [change](https://qdrant.tech/documentation/concepts/collections/#update-collection-parameters) on your existing collection: +### Qdrant in Research -```http -PATCH collections/{collection_name} -{ -  "optimizers_config": { -    "max_segment_size": 1000000 -  } -} +We are happy to see Qdrant helping researchers at the frontier of medical AI come up with achievements that others can build upon! -``` +By reducing engineering overhead, we want researchers to focus on advancing their domain contributions. Apart from handling scale, we see other exciting applications of Qdrant in research. For example, Qdrant could serve as a tool for: +- Deduplication in large-scale datasets based on semantic similarity; +- Diversity sampling for comprehensive training datasets. -Note that `max_segment_size` is specified in KiloBytes. +As MIRIAD’s authors told us, "*the sky is the limit*". +We're looking forward to seeing Qdrant more and more as a helping hand in further research advancements! -##### Was this page useful? +<|page-291-lllmstxt|> +[**Qdrant 1.15.0 is out!**](https://github.com/qdrant/qdrant/releases/tag/v1.15.0) Let’s look at the main features for this version: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +**New quantizations:** We introduce asymmetric quantization and 1.5 and 2-bit quantizations. Asymmetric quantization allows vectors and queries to have different quantization algorithms. 1.5 and 2-bit quantizations allow for improved accuracy. -Thank you for your feedback! 🙏 +**Changes in text index**: Introduction of a new multilingual tokenizer, stopwords support, stemming, and phrase matching. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/running-with-GPU.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Various optimizations, including **HNSW healing**, which allows HNSW indexes to reuse the old graph without a complete rebuild, and **migration to Gridstore**, which unlocks faster ingestion. -On this page: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/running-with-GPU.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## New Quantization Modes -× +![Section 1](/blog/qdrant-1.15.x/section-1.png) -[Powered by](https://qdrant.tech/) +We are expanding the Qdrant quantization toolkit with: -<|page-156-lllmstxt|> -## optimizer -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Optimizer +* **1.5-bit and 2-bit quantization** for better tradeoffs between compression and accuracy. +* **Asymmetric quantization** to combine binary storage with scalar queries for smarter memory use.
-# [Anchor](https://qdrant.tech/documentation/concepts/optimizer/\#optimizer) Optimizer +### 1.5-Bit and 2-Bit Quantization +We introduce a new **binary quantization** storage that uses **2 and 1.5 bits** per dimension, improving precision for smaller vectors. Previous one-bit compression resulted in significant data loss and precision drops for vectors smaller than a thousand dimensions, often requiring expensive rescoring. 2-bit quantization offers 16X compression compared to 32X with one bit, improving performance for smaller vector dimensions. The 1.5-bit quantization compression offers 24X compression and intermediate accuracy. -It is much more efficient to apply changes in batches than perform each change individually, as many other databases do. Qdrant here is no exception. Since Qdrant operates with data structures that are not always easy to change, it is sometimes necessary to rebuild those structures completely. +![2bit Quantization](/blog/qdrant-1.15.x/1.5-2-bit-quantization.png) -Storage optimization in Qdrant occurs at the segment level (see [storage](https://qdrant.tech/documentation/concepts/storage/)). -In this case, the segment to be optimized remains readable for the time of the rebuild. +A major limitation of binary quantization is poor handling of values close to zero. 2-bit quantization addresses this by explicitly representing zeros using an efficient scoring mechanism. With 1.5-bit quantization we balance the efficiency of binary quantization with accuracy improvements of 2-bit quantization. -![Segment optimization](https://qdrant.tech/docs/optimization.svg) +
+Benchmark Results: 2-bit vs 1-bit and Scalar Quantization -The availability is achieved by wrapping the segment into a proxy that transparently handles data changes. -Changed data is placed in the copy-on-write segment, which has priority for retrieval and subsequent updates. +We ran extensive benchmarks to compare the new 2-bit quantization with both traditional 1-bit (binary) quantization and scalar quantization (e.g., 8-bit). -## [Anchor](https://qdrant.tech/documentation/concepts/optimizer/\#vacuum-optimizer) Vacuum Optimizer +Dataset: Laion 1 million 512d vectors -The simplest example of a case where you need to rebuild a segment repository is to remove points. -Like many other databases, Qdrant does not delete entries immediately after a query. -Instead, it marks records as deleted and ignores them for future queries. +{{
}} -This strategy allows us to minimize disk access - one of the slowest operations. -However, a side effect of this strategy is that, over time, deleted records accumulate, occupy memory and slow down the system. +{{
}} -To avoid these adverse effects, Vacuum Optimizer is used. -It is used if the segment has accumulated too many deleted records. +
-The criteria for starting the optimizer are defined in the configuration file. -Here is an example of parameter values: -```yaml -storage: -  optimizers: -    # The minimal fraction of deleted vectors in a segment, required to perform segment optimization -    deleted_threshold: 0.2 -    # The minimal number of vectors in a segment, required to perform segment optimization -    vacuum_min_vector_number: 1000 +### Asymmetric Quantization -``` +The **Asymmetric Quantization** technique allows Qdrant to use different vector encoding algorithms for stored vectors and for queries. +A particularly interesting combination is binary-quantized stored vectors with scalar-quantized queries. -## [Anchor](https://qdrant.tech/documentation/concepts/optimizer/\#merge-optimizer) Merge Optimizer +This approach maintains storage size and RAM usage similar to binary quantization while offering improved precision. It is especially beneficial for memory-constrained deployments and for workloads where the bottleneck is disk I/O rather than CPU, such as indexing millions of vectors: precision improves without sacrificing much, and less rescoring is required for the same quality of output. -The service may require the creation of temporary segments. -Such segments, for example, are created as copy-on-write segments during optimization itself. +![Asymmetric Quantization](/blog/qdrant-1.15.x/asymmetric-quantization.png) -It is also essential to have at least one small segment that Qdrant will use to store frequently updated data. -On the other hand, too many small segments lead to suboptimal search performance. +When performing nearest vector search, the query vector is compared against quantized vectors stored in the database. If the query itself remains unquantized and a scoring method exists to evaluate it directly against the compressed vectors, this allows for more accurate results without increasing memory usage. -The merge optimizer constantly tries to reduce the number of segments if there -currently are too many. The desired number of segments is specified -with `default_segment_number` and defaults to the number of CPUs. The optimizer -may takes at least the three smallest segments and merges them into one. +> Quantization enables efficient storage and search of high-dimensional vectors. Learn more about this from our [**quantization**](/documentation/guides/quantization/) docs. -Segments will not be merged if they’ll exceed the maximum configured segment -size with `max_segment_size_kb`. It prevents creating segments that are too -large to efficiently index. Increasing this number may help to reduce the number -of segments if you have a lot of data, and can potentially improve search performance. -The criteria for starting the optimizer are defined in the configuration file.
+Benchmarks result -Here is an example of parameter values: +{{
}} -```yaml -storage: - optimizers: - # Target amount of segments optimizer will try to keep. - # Real amount of segments may vary depending on multiple parameters: - # - Amount of stored points - # - Current write RPS - # - # It is recommended to select default number of segments as a factor of the number of search threads, - # so that each segment would be handled evenly by one of the threads. - # If `default_segment_number = 0`, will be automatically selected by the number of available CPUs - default_segment_number: 0 +Blue: Asymmetric 8bit quantization - # Do not create segments larger this size (in KiloBytes). - # Large segments might require disproportionately long indexation times, - # therefore it makes sense to limit the size of segments. - # - # If indexation speed have more priority for your - make this parameter lower. - # If search speed is more important - make this parameter higher. - # Note: 1Kb = 1 vector of size 256 - # If not set, will be automatically selected considering the number of available CPUs. - max_segment_size_kb: null +Orange: Regular Binary Quantization -``` +Dataset: Laion 1 million 512d vectors -## [Anchor](https://qdrant.tech/documentation/concepts/optimizer/\#indexing-optimizer) Indexing Optimizer -Qdrant allows you to choose the type of indexes and data storage methods used depending on the number of records. -So, for example, if the number of points is less than 10000, using any index would be less efficient than a brute force scan. +
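+Below is a minimal configuration sketch for the new quantization modes. It assumes the binary quantization settings expose an `encoding` option for stored vectors (e.g. `two_bits`) and a `query_encoding` option (e.g. `scalar8bits`) for asymmetric queries; verify the exact field names and accepted values against the [quantization](/documentation/guides/quantization/) documentation before relying on them.
+
+```http
+PUT /collections/{collection_name}
+{
+  "vectors": {
+    "size": 512,
+    "distance": "Cosine"
+  },
+  "quantization_config": {
+    "binary": {
+      "encoding": "two_bits",
+      "query_encoding": "scalar8bits",
+      "always_ram": true
+    }
+  }
+}
+```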
-The Indexing Optimizer is used to implement the enabling of indexes and memmap storage when the minimal amount of records is reached. -The criteria for starting the optimizer are defined in the configuration file. +## Changes in Text Index -Here is an example of parameter values: +![Section 2](/blog/qdrant-1.15.x/section-2.png) -```yaml -storage: -  optimizers: -    # Maximum size (in kilobytes) of vectors to store in-memory per segment. -    # Segments larger than this threshold will be stored as read-only memmaped file. -    # Memmap storage is disabled by default, to enable it, set this threshold to a reasonable value. -    # To disable memmap storage, set this to `0`. -    # Note: 1Kb = 1 vector of size 256 -    memmap_threshold: 200000 +Full-text filtering in Qdrant is an efficient way to combine vector-based scoring with exact keyword matching. +And in v1.15 the full-text index received a number of upgrades which make vector similarity even more useful. -    # Maximum size (in kilobytes) of vectors allowed for plain index, exceeding this threshold will enable vector indexing -    # Default value is 20,000, based on . -    # To disable vector indexing, set to `0`. -    # Note: 1kB = 1 vector of size 256. -    indexing_threshold_kb: 20000 +### Multilingual Tokenization -``` +Previous versions of Qdrant relied on the [charabia](https://github.com/meilisearch/charabia) package to perform multilingual tokenization. +Unfortunately this package has a significant memory overhead for the Korean and Japanese tokenizers, so we could not enable it by default. -In addition to the configuration file, you can also set optimizer parameters separately for each [collection](https://qdrant.tech/documentation/concepts/collections/). +With this update you can use a variety of languages in our full-text search index for filters. +This means that languages that don't have clear word boundaries and aren't separated by spaces, such as Japanese and Chinese, are now natively supported. +Previously, only languages with spaces were supported (with `"word"` tokenization), or you had to compile Qdrant yourself. -Dynamic parameter updates may be useful, for example, for more efficient initial loading of points. You can disable indexing during the upload process with these settings and enable it immediately after it is finished. As a result, you will not waste extra computation resources on rebuilding the index. +In the new v1.15 release we completely reworked which tokenizer packages are used for specific languages. +It allowed us to pack everything into the main build without sacrificing performance. +Qdrant now supports multilingual tokenization, meaning that search will perform more consistently on multilingual datasets without needing external preprocessing. +Here is how to configure the multilingual tokenizer: +```http +PUT /collections/{collection_name}/index +{ +  "field_name": "description", +  "field_index_params": { +    "type": "text", +    "tokenizer": "multilingual" +  } +} +``` -##### Was this page useful? +### Stop Words +Articles like "a", conjunctions like "and", prepositions like "with", pronouns like "he", and common verbs such as "be" can clutter your index without adding value to search. +Those meaningless words can also complicate the construction of filtering conditions; previously you had to manually remove them from the query. +Now you can configure `stopwords` for the Qdrant full-text index and Qdrant will handle them automatically.
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Here is how to configure stopwords: -Thank you for your feedback! 🙏 +```http +PUT /collections/{collection_name}/index +{ +  "field_name": "title", +  "field_index_params": { +    "type": "text", +    "stopwords": "english" +  } +} +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/optimizer.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +For more information about stopwords, see the [documentation](https://qdrant.tech/documentation/concepts/indexing/#stopwords). -On this page: +### Stemming -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/optimizer.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Stemming improves text processing by converting words to their root form. +For example “run”, “runs”, and “running” will all map to the root “run”. +By using stemming you only store the root words, reducing the size of the index and increasing retrieval accuracy. -× +In Qdrant, stemming allows for better query-document matching because grammar-related suffixes that don't add meaning to words get removed. +We apply stemming in our full-text index, increasing recall, because a wider variety of queries matches the same document. +For example, the queries "interesting documentation" and "interested in this document" will be normalized to `["interest", "document"]` and `["interest", "in", "this", "document"]`, turning them into overlapping sets. +However, without stemming, these would become `["interesting", "documentation"]` and `["interested", "in", "this", "document"]`, resulting in not a single word matching, despite the queries being very similar in meaning. -[Powered by](https://qdrant.tech/) +Here is an example showing how to configure the collection to use the [Snowball stemmer](https://snowballstem.org/): -<|page-157-lllmstxt|> -## cluster-upgrades -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud](https://qdrant.tech/documentation/cloud/) -- Update Clusters +```http +PUT /collections/{collection_name}/index +{ +  "field_name": "body", +  "field_index_params": { +    "type": "text", +    "stemmer": { +      "type": "snowball", +      "language": "english" +    } +  } +} +``` -# [Anchor](https://qdrant.tech/documentation/cloud/cluster-upgrades/\#updating-qdrant-cloud-clusters) Updating Qdrant Cloud Clusters +### Phrase Matching -As soon as a new Qdrant version is available. Qdrant Cloud will show you an update notification in the Cluster list and on the Cluster details page. +With [phrase matching](/documentation/concepts/filtering/#phrase-match), you can now perform exact phrase search. +It allows you to search for a specific phrase, with words in exact order, within a text field. -To update to a new version, go to the Cluster details page, choose the new version from the version dropdown and click **Update**.
+For efficient phrase search, Qdrant needs to build an additional data structure, +so it has to be configured during creation of the full-text index: -![Cluster Updates](https://qdrant.tech/documentation/cloud/cluster-upgrades.png) +```http +PUT /collections/{collection_name}/index +{ +  "field_name": "headline", +  "field_index_params": { +    "type": "text", +    "phrase_matching": true +  } +} -If you have a multi-node cluster and if your collections have a replication factor of at least **2**, the update process will be zero-downtime and done in a rolling fashion. You will be able to use your database cluster normally. +``` -If you have a single-node cluster or a collection with a replication factor of **1**, the update process will require a short downtime period to restart your cluster with the new version. +For example, the phrase “machine time” will be matched exactly in that order within the “summary” field: -##### Was this page useful? +```http +POST /collections/{collection_name}/points/query +{ +  "vector": [0.01, 0.45, 0.67, 0.12], +  "filter": { +    "must": { +      "key": "summary", +      "match": { +        "phrase": "machine time" +      } +    } +  }, +  "limit": 10 +} -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +``` -Thank you for your feedback! 🙏 +The above will match: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-upgrades.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +| | text | +| -- | -- | +| ✅ | "The **machine time** is local, rather than global in distributed systems." | +| ❌ | "Dr. Brown retrofitted a DeLorean into a **time machine**." | +## MMR Reranking +We introduce [Maximal Marginal Relevance (MMR)](/documentation/concepts/hybrid-queries/#maximal-marginal-relevance-mmr) reranking to balance relevance and diversity. +MMR works by selecting results iteratively, picking the item with the best combination of similarity to the query and dissimilarity to the already selected items. +It prevents your top-k results from being redundant and helps surface varied but relevant answers, particularly in dense datasets with overlapping entries. -On this page: +![MMR example](/blog/qdrant-1.15.x/diversity.png) -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-upgrades.md) +### Diversifying Search Results with MMR -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Let’s say you’re building a knowledge assistant or semantic document explorer in which a single query can return multiple highly similar results. +For instance, searching “climate change” in a scientific paper database might return several similar paragraphs. -× +You can diversify the results with [Maximal Marginal Relevance (MMR)](/documentation/concepts/hybrid-queries/#maximal-marginal-relevance-mmr). -[Powered by](https://qdrant.tech/) -<|page-158-lllmstxt|> -## data-exploration -- [Articles](https://qdrant.tech/articles/) -- Data Exploration -#### Data Exploration -Learn how you can leverage vector similarity beyond just search. Reveal hidden patterns and insights in your data, provide recommendations, and navigate data space.
- -[![Preview](https://qdrant.tech/articles_data/distance-based-exploration/preview/preview.jpg)\\ -**Distance-based data exploration** \\ -Explore your data under a new angle with Qdrant's tools for dimensionality reduction, clusterization, and visualization.\\ -\\ -Andrey Vasnetsov\\ -\\ -March 11, 2025](https://qdrant.tech/articles/distance-based-exploration/)[![Preview](https://qdrant.tech/articles_data/discovery-search/preview/preview.jpg)\\ -**Discovery needs context** \\ -Discovery Search, an innovative way to constrain the vector space in which a search is performed, relying only on vectors.\\ -\\ -Luis Cossío\\ -\\ -January 31, 2024](https://qdrant.tech/articles/discovery-search/)[![Preview](https://qdrant.tech/articles_data/vector-similarity-beyond-search/preview/preview.jpg)\\ -**Vector Similarity: Going Beyond Full-Text Search \| Qdrant** \\ -Discover how vector similarity expands data exploration beyond full-text search. Explore diversity sampling and more for enhanced data discovery!\\ -\\ -Luis Cossío\\ -\\ -August 08, 2023](https://qdrant.tech/articles/vector-similarity-beyond-search/)[![Preview](https://qdrant.tech/articles_data/dataset-quality/preview/preview.jpg)\\ -**Finding errors in datasets with Similarity Search** \\ -Improving quality of text-and-images datasets on the online furniture marketplace example.\\ -\\ -George Panchuk\\ -\\ -July 18, 2022](https://qdrant.tech/articles/dataset-quality/) - -× - -[Powered by](https://qdrant.tech/) +You can diversify the results with [Maximal Marginal Relevance (MMR)](/documentation/concepts/hybrid-queries/#maximal-marginal-relevance-mmr). -<|page-159-lllmstxt|> -## pdf-retrieval-at-scale -- [Documentation](https://qdrant.tech/documentation/) -- [Advanced tutorials](https://qdrant.tech/documentation/advanced-tutorials/) -- Scaling PDF Retrieval with Qdrant +Instead of returning the top-k results based on pure similarity, MMR helps select a diverse subset of high-quality results. +This gives more coverage and avoids redundant results, which is helpful in dense content domains such as academic papers, product catalogs, or search assistants. -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#scaling-pdf-retrieval-with-qdrant) Scaling PDF Retrieval with Qdrant +![Diversifying Search Results with MMR](/blog/qdrant-1.15.x/mmr-example.png) -![scaling-pdf-retrieval-qdrant](https://qdrant.tech/documentation/tutorials/pdf-retrieval-at-scale/image1.png) +
Without/with MMR when searching for "kebab" in Wolt dataset of text-image model
-| Time: 30 min | Level: Intermediate | Output: [GitHub](https://github.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb) | -| --- | --- | --- | --- | +
-Efficient PDF documents retrieval is a common requirement in tasks like **(agentic) retrieval-augmented generation (RAG)** and many other search-based applications. At the same time, setting up PDF documents retrieval is rarely possible without additional challenges. +For example, you have vectorized paragraphs from hundreds of documents and stored them in Qdrant. +Instead of showing only five nearly identical answers, you want your chatbot to respond with diverse answers. Here’s how to do it: -Many traditional PDF retrieval solutions rely on **optical character recognition (OCR)** together with use case-specific heuristics to handle visually complex elements like tables, images and charts. These algorithms are often non-transferable – even within the same domain – with their task-customized parsing and chunking strategies, labor-intensive, prone to errors, and difficult to scale. +```http +POST /collections/{collection_name}/points/query +{ + "query": { + "nearest": [0.01, 0.45, 0.67, ...], // search vector + "mmr": { + "diversity": 0.5, // 0.0 - relevance; 1.0 - diversity + "candidates_limit": 100 // num of candidates to preselect + } + }, + "limit": 10 +} +``` -Recent advancements in **Vision Large Language Models (VLLMs)**, such as [**ColPali**](https://huggingface.co/blog/manu/colpali) and its successor [**ColQwen**](https://huggingface.co/vidore/colqwen2-v0.1), started the transformation of the PDF retrieval. These multimodal models work directly with PDF pages as inputs, no pre-processing required. Anything that can be converted into an **image** (think of PDFs as screenshots of document pages) can be effectively processed by these models. Being far simpler in use, VLLMs achieve state-of-the-art performance in PDF retrieval benchmarks like the [Visual Document Retrieval (ViDoRe) Benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard). +## Migration to Gridstore -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#how-vllms-work-for-pdf-retrieval) How VLLMs Work for PDF Retrieval +![Section 3](/blog/qdrant-1.15.x/section-3.png) -VLLMs like **ColPali** and **ColQwen** generate **multivector representations** for each PDF page; the representations are stored and indexed in a vector database. During the retrieval process, models dynamically create multivector representations for (textual) user queries, and precise retrieval – matching between PDF pages and queries – is achieved through [late-interaction mechanism](https://qdrant.tech/blog/qdrant-colpali/#how-colpali-works-under-the-hood). +When we started building Qdrant, we picked RocksDB as our embedded key-value store. However, due to it's architecture we ran into issues such as random latency spikes. [Gridstore](https://qdrant.tech/articles/gridstore-key-value-storage/) is our custom solution to this and other challenges we faced when building with RocksDB. Qdrant 1.15 continues our transition from RocksDB to [Gridstore](https://qdrant.tech/articles/gridstore-key-value-storage/) as the default storage backend for new deployments, leading to: -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#challenges-of-scaling-vllms) Challenges of Scaling VLLMs +* Faster ingestion speeds. +* Storage management without "garbage collection". -The heavy multivector representations produced by VLLMs make PDF retrieval at scale computationally intensive. These models are inefficient for large-scale PDF retrieval tasks if used without optimization. 
+> For more insights on the performance of Gridstore compared to RocksDB, check out our [**Introducing Gridstore**](https://qdrant.tech/articles/gridstore-key-value-storage/#end-to-end-benchmarking) article. -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#math-behind-the-scaling) Math Behind the Scaling +## Optimizations -**ColPali** generates over **1,000 vectors per PDF page**, while its successor, **ColQwen**, generates slightly fewer — up to **768 vectors**, dynamically adjusted based on the image size. Typically, ColQwen produces **~700 vectors per page**. +As usual, the new Qdrant release brings more performance optimizations for faster and cheaper vector search at scale. -To understand the impact, consider the construction of an [**HNSW index**](https://qdrant.tech/articles/what-is-a-vector-database/#1-indexing-hnsw-index-and-sending-data-to-qdrant), a common indexing algorithm for vector databases. Let’s roughly estimate the number of comparisons needed to insert a new PDF page into the index. +### HNSW Healing -- **Vectors per page:** ~700 (ColQwen) or ~1,000 (ColPali) -- **[ef\_construct](https://qdrant.tech/documentation/concepts/indexing/#vector-index):** 100 (default) +Qdrant 1.15 introduces HNSW healing. -The lower bound estimation for the number of vector comparisions comparisons would be: +Instead of completely rebuilding the HNSW index during optimization, Qdrant now tries to reuse information from the existing vector index to speed up construction of the new one. +When points are removed from an existing [HNSW graph](https://qdrant.tech/documentation/concepts/indexing/#vector-index), new links are added to prevent isolation in the graph and avoid decreasing search quality. -700×700×100=49millions +{{
}} -Now imagine how much it will take to build an index on **20,000 pages**! -For ColPali, this number doubles. The result is **extremely slow index construction time**. +This modification, in combination with [incremental HNSW indexing](/blog/qdrant-1.14.x/#improved-resource-use-during-segment-optimization) introduced in v1.14.0, significantly improves resource utilization in use cases with high update rates. -### [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#our-solution) Our Solution +### HNSW Graph connectivity estimation -We recommend reducing the number of vectors in a PDF page representation for the **first-stage retrieval**. After the first stage retrieval with a reduced amount of vectors, we propose to **rerank** retrieved subset with the original uncompressed representation. +Qdrant builds [additional HNSW links](/articles/filtrable-hnsw/) to ensure that filtered searches remain fast and accurate. -The reduction of vectors can be achieved by applying a **mean pooling operation** to the multivector VLLM-generated outputs. Mean pooling averages the values across all vectors within a selected subgroup, condensing multiple vectors into a single representative vector. If done right, it allows the preservation of important information from the original page while significantly reducing the number of vectors. +It does, however, introduce an overhead for indexing complexity, especially when the number of payload indexes is large. +With v1.15, Qdrant introduces an optimization which quickly estimates graph connectivity before creating additional links. -VLLMs generate vectors corresponding to patches that represent different portions of a PDF page. These patches can be grouped in columns and rows of a PDF page. +In some scenarios this optimization can cut indexing time several times over without sacrificing search quality. -For example: +![Conclusion](/blog/qdrant-1.15.x/connectivity-estimation.png) -- ColPali divides PDF page into **1,024 patches**. -- Applying mean pooling by rows (or columns) of this patch matrix reduces the page representation to just **32 vectors**. +## Changes in Web UI -![ColPali patching of a PDF page](https://qdrant.tech/documentation/tutorials/pdf-retrieval-at-scale/pooling-by-rows.png) +The main Web UI feature of this release is the `Create Collection` dialog. -We tested this approach with the ColPali model, mean pooling its multivectors by PDF page rows. The results showed: +This dialog is designed to guide users through the configuration process. +Instead of listing all possible configurations, we tried to organize it into an intuitive flow that also encourages best practices. -- **Indexing time faster by an order of magnitude** -- **Retrieval quality comparable to the original model** +![create-collection01](/blog/qdrant-1.15.x/create-collection.png) -For details of this experiment refer to our [gitHub repository](https://github.com/qdrant/demo-colpali-optimized), [ColPali optimization blog post](https://qdrant.tech/blog/colpali-qdrant-optimization/) or [webinar “PDF Retrieval at Scale”](https://www.youtube.com/watch?v=_h6SN1WwnLs) +## Upgrading to Version 1.15 -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#goal-of-this-tutorial) Goal of This Tutorial -In this tutorial, we will demonstrate a scalable approach to PDF retrieval using **Qdrant** and **ColPali** & **ColQwen2** VLLMs.
-The presented approach is **highly recommended** to avoid the common pitfalls of long indexing times and slow retrieval speeds. +In Qdrant Cloud, simply go to your Cluster Details screen and select Version 1.15 from the dropdown. The upgrade may take a few moments. -In the following sections, we will demonstrate an optimized retrieval algorithm born out of our successful experimentation: +> Upgrading from earlier versions is straightforward - no major API or index-breaking changes. We recommend upgrading versions one by one, for example, 1.13 ->1.14->1.15. -**First-Stage Retrieval with Mean-Pooled Vectors:** +{{
}} -- Construct an HNSW index using **only mean-pooled vectors**. -- Use them for the first-stage retrieval. +**Documentation**: For detailed usage examples, configuration options, and implementation guides, including quantization, MMR rescoring, multilingual text indexing, and more, refer to the official [Qdrant documentation](https://qdrant.tech/documentation) and [API reference](https://api.qdrant.tech). You'll find full code samples, integration walkthroughs, and best practices for building high-performance vector search applications. -**Reranking with Original Model Multivectors:** +## Engage -- Use the original multivectors from ColPali or ColQwen2 **to rerank** the results retrieved in the first stage. +![Engage](/blog/qdrant-1.15.x/section-4.png) -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#setup) Setup +We would love to hear your thoughts on this release. If you have any questions or feedback, join our [Discord](https://discord.gg/qdrant) or create an issue on [GitHub](https://github.com/qdrant/qdrant/issues). -Install & import required libraries +<|page-292-lllmstxt|> +### Qdrant is now available in the new AWS Marketplace AI Agents and Tools category. -```python -# pip install colpali_engine>=0.3.1 -from colpali_engine.models import ColPali, ColPaliProcessor -# pip install qdrant-client>=1.12.0 -from qdrant_client import QdrantClient, models +Customers can now use AWS Marketplace to easily discover, buy, and deploy AI agents solutions, including Qdrant’s vector search engine using their AWS accounts, accelerating AI agent and agentic workflow development. -``` +Qdrant helps organizations build enterprise AI agents with long-term memory and real-time context retrieval by enabling step-aware reasoning and reliable decision-making across complex, unstructured data with a vector-native search engine built for accuracy, scale, and responsiveness. -To run these experiments, we’re using a **Qdrant cluster**. If you’re just getting started, you can set up a **free-tier cluster** for testing and exploration. Follow the instructions in the documentation [“How to Create a Free-Tier Qdrant Cluster”](https://qdrant.tech/documentation/cloud/create-cluster/#free-clusters) +"By offering Qdrant Cloud and Qdrant Hybrid Cloud (for private deployments) through the AWS Marketplace AI Agents and Tools category, we're providing customers with a streamlined way to access our native high-performance vector search engine - critical to AI agents, helping them buy and deploy AI agent solutions faster and more efficiently." AndrĂ© Zayarni, co-founder and CEO at Qdrant. -```python -client = QdrantClient( - url=, - api_key= -) +"Our customers — from AI-native startups to global enterprises and digital-native platforms — are already using Qdrant to power AI agents in production across healthcare, e-commerce, media, analytics, and more, to reduce agent failure, accelerate agentic RAG pipelines, and enable real-time, context-rich reasoning, demonstrating the real-world value of AI-native vector search." -``` +Qdrant delivers essential capabilities including real-time metadata-aware retrieval at scale, hybrid search combining dense vectors, keywords, and filters, and native support for multivector embeddings for fine-grained context. 
With the addition of [Qdrant Cloud Inference](https://qdrant.tech/cloud-inference/), customers can now generate and index embeddings directly inside their Qdrant Cloud clusters, eliminating external preprocessing and accelerating agent responsiveness. These features enable customers to equip agents with step-aware semantic memory, ensure grounded responses, and keep context fresh with real-time indexing at scale. -Download **ColPali** model along with its input processors. Make sure to select the backend that suits your setup. +With the availability of AI Agents and Tools in AWS Marketplace, customers can significantly accelerate their procurement process to drive AI innovation, reducing the time needed for vendor evaluations and complex negotiations. With centralized purchasing using AWS accounts, customers maintain visibility and control over licensing, payments, and access through AWS. -```python -colpali_model = ColPali.from_pretrained( - "vidore/colpali-v1.3", - torch_dtype=torch.bfloat16, - device_map="mps", # Use "cuda:0" for GPU, "cpu" for CPU, or "mps" for Apple Silicon - ).eval() +Available as a fully managed SaaS and Hybrid Cloud deployment options, for private AWS deployments, Qdrant brings vector search close to where agents operate. This ensures seamless integration into AWS environments while meeting the low-latency, compliance, and scalability needs of production-grade agentic AI. -colpali_processor = ColPaliProcessor.from_pretrained("vidore/colpali-v1.3") +To learn more about Qdrant in AWS Marketplace, visit [Qdrant AI Agent Listing on AWS](https://aws.amazon.com/marketplace/pp/prodview-rtphb42tydtzg?sr=0-1&ref_=beagle&applicationId=AWSMPContessa). -``` +To learn more about the new AI Agents and Tools category, visit [the AWS Marketplace](http://aws.amazon.com/marketplace/solutions/ai-agents-and-tools/). -For **ColQwen** model +<|page-293-lllmstxt|> +![Bento Box](/blog/case-study-and-ai/and-ai-bento.jpg) -```python -from colpali_engine.models import ColQwen2, ColQwen2Processor +## How \&AI scaled global patent retrieval with Qdrant -colqwen_model = ColQwen2.from_pretrained( - "vidore/colqwen2-v0.1", - torch_dtype=torch.bfloat16, - device_map="mps", # Use "cuda:0" for GPU, "cpu" for CPU, or "mps" for Apple Silicon - ).eval() +[&AI](https://tryandai.com/) is on a mission to redefine patent litigation. Their platform helps legal professionals invalidate patents through intelligent prior art search, claim charting, and automated litigation support. To make this work at scale, CTO and co-founder Herbie Turner needed a vector database that could power fast, accurate retrieval across billions of documents without ballooning DevOps complexity. That’s where Qdrant came in. -colqwen_processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v0.1") +## Legal tech’s toughest retrieval challenge -``` +Patent litigation is a high-stakes game. When a company is sued for patent infringement, the best defense is often to invalidate the patent altogether. That means proving the idea was disclosed publicly before the patent was granted. Finding that “prior art” requires sifting through vast, multilingual document corpora with domain-specific technical language. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#create-qdrant-collections) Create Qdrant Collections +Traditionally, this is done through outsourced search firms or attorneys running boolean queries across multiple databases. 
It’s time-consuming, expensive, and heavily reliant on human intuition. Turner and co-founder Caleb Harris saw an opportunity to use modern AI tooling and large language models (LLMs) to reframe the problem. -We can now create a collection in Qdrant to store the multivector representations of PDF pages generated by **ColPali** or **ColQwen**. +"Instead of generating legal text, which attorneys rightly distrust, we focused everything around retrieval," said Turner. "If we can ground our results in real documents, hallucination risk is minimized." -Collection will include **mean pooled** by rows and columns representations of a PDF page, as well as the **original** multivector representation. +## A retrieval-first legal AI stack -```python -client.create_collection( - collection_name=collection_name, - vectors_config={ - "original": - models.VectorParams( #switch off HNSW - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ), - hnsw_config=models.HnswConfigDiff( - m=0 #switching off HNSW - ) - ), - "mean_pooling_columns": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ) - ), - "mean_pooling_rows": models.VectorParams( - size=128, - distance=models.Distance.COSINE, - multivector_config=models.MultiVectorConfig( - comparator=models.MultiVectorComparator.MAX_SIM - ) - ) - } -) +From the start, \&AI framed patent invalidation and charting as semantic retrieval problems. Using OpenAI’s embedding models, they transformed structured and unstructured patent data into dense vector representations. -``` +![architecture](/blog/case-study-and-ai/and-ai-diagram.png) + *\&AI's retrieval architecture stack* + +But the scale was immense. Their full corpus includes hundreds of millions of documents from international patent offices and other sources, resulting in more than 250 billion tokens. Ingesting, embedding, and searching this volume of data demanded a robust, cloud-native vector search solution. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#choose-a-dataset) Choose a dataset +"We needed to scale to a number of vectors that just hadn’t been benchmarked publicly," said Turner. "Qdrant was the only one that handled that load out of the box — and without needing dedicated DevOps engineers." -We’ll use the **UFO Dataset** by Daniel van Strien for this tutorial. It’s available on Hugging Face; you can download it directly from there. +Turner had used Qdrant in a prior startup, where he appreciated the high performance and strong Rust-based architecture. But it was Qdrant’s [opinionated documentation](https://qdrant.tech/documentation/) and built-in developer tools that sealed the deal. -```python -from datasets import load_dataset -ufo_dataset = "davanstrien/ufo-ColPali" -dataset = load_dataset(ufo_dataset, split="train") +*“I’m all for opinionated docs,” said Turner. “Don’t make me figure out how to optimize everything myself. 
Qdrant tells you the right way to do things; it just works.”* +— Herbie Turner, CTO & Co-Founder, \&AI -``` +## From noisy PDFs to structured vectors -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#embedding-and-mean-pooling) Embedding and Mean Pooling +To support global scale, \&AI used [Reducto](https://reducto.ai), an AI-based PDF parsing service optimized for accuracy, to process patent data spanning decades and jurisdictions. The resulting structured data was transformed into dense vectors via OpenAI’s embedding API, then indexed in Qdrant. -We’ll use a function that generates multivector representations and their mean pooled versions of each PDF page (aka image) in batches. -For complete understanding, it’s important to consider the following specifics of **ColPali** and **ColQwen**: +Patent formats change over time and across regions, so even cleaning and standardizing the data posed challenges. \&AI built a preprocessing pipeline that included OCR, normalization, metadata extraction, and payload structuring. -**ColPali:** -In theory, ColPali is designed to generate 1,024 vectors per PDF page, but in practice, it produces 1,030 vectors. This discrepancy is due to ColPali’s pre-processor, which appends the text `Describe the image.` to each input. This additional text generates an extra 6 multivectors. +They chose [scalar quantization](https://qdrant.tech/articles/scalar-quantization/) in Qdrant to speed up retrieval while maintaining high accuracy. Initial experiments with binary quantization revealed too much recall degradation, forcing \&AI to retrieve tens of thousands of candidates just to hit their quality bar. Scalar was the sweet spot. -**ColQwen:** -ColQwen dynamically determines the number of patches in “rows and columns” of a PDF page based on its size. Consequently, the number of multivectors can vary between inputs. ColQwen pre-processor prepends `<|im_start|>user<|vision_start|>` and appends `<|vision_end|>Describe the image.<|im_end|><|endoftext|>`. +## Semantics over generation -For example, that’s how ColQwen multivector output is formed. +Rather than rely on LLMs to generate legal output, \&AI framed its tasks as retrieval problems. Everything, prior art search, invalidity charts, claim comparisons, was treated as a ranking and grounding problem. -![that’s how ColQwen multivector output is formed](https://qdrant.tech/documentation/tutorials/pdf-retrieval-at-scale/ColQwen-preprocessing.png) +"We do an initial broad search to get candidates, then use metadata filtering, claim construction analysis, and context-specific re-ranking to refine results," said Turner. -The `get_patches` function is to get the number of `x_patches` (rows) and `y_patches` (columns) ColPali/ColQwen2 models will divide a PDF page into. -For ColPali, the numbers will always be 32 by 32; ColQwen will define them dynamically based on the PDF page size. +Qdrant’s filterable HNSW, payload field indexing, and support for multi-tenancy made this possible. Public patent search operates globally, while firm-specific legal data is stored in isolated tenant spaces. -```python -x_patches, y_patches = model_processor.get_n_patches( - image_size, - patch_size=model.patch_size -) +"Having multi-tenancy built-in was huge," Turner said. "It let us give firms strong guarantees around data privacy without spinning up separate infrastructure." 
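As a rough illustration of the two features called out in this case study, here is a minimal sketch of enabling int8 scalar quantization and marking a payload field as the tenant key with the Python client. The collection name, vector size, and tenant field are hypothetical; the case study does not publish its actual configuration.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="https://YOUR-CLUSTER.cloud.qdrant.io", api_key="YOUR_API_KEY")

# Scalar (int8) quantization keeps recall close to full precision while
# shrinking the index and speeding up search.
client.create_collection(
    collection_name="patents",  # hypothetical collection name
    vectors_config=models.VectorParams(
        size=1536,  # e.g. the dimensionality of an OpenAI embedding model
        distance=models.Distance.COSINE,
    ),
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,
            quantile=0.99,
            always_ram=True,
        )
    ),
)

# Built-in multitenancy: mark the tenant field so each firm's data is grouped
# and searched inside its own partition of the shared collection.
client.create_payload_index(
    collection_name="patents",
    field_name="firm_id",  # hypothetical tenant key
    field_schema=models.KeywordIndexParams(type="keyword", is_tenant=True),
)
```

At query time, a `must` filter on the tenant field keeps every search inside a single firm's space.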
-``` +## Scaling infrastructure, not headcount -For **ColQwen** model +By using [Qdrant Cloud](https://qdrant.tech/cloud/), \&AI avoided the need to manage DevOps or self-host massive vector clusters. Even after scaling to over 1 billion vectors, Qdrant’s managed infrastructure delivered fast search and low memory usage. -```python -model_processor.get_n_patches( - image_size, - patch_size=model.patch_size, - spatial_merge_size=model.spatial_merge_size -) +"Patent litigation has huge stakes, one result could influence a billion-dollar case," said Turner. "Accuracy is the top priority, and Qdrant let us optimize for that without compromising on cost or performance." -``` +Qdrant’s support for [payload filters](https://qdrant.tech/documentation/concepts/filtering/), [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/), and quantization let \&AI optimize deeply. Their AI patent agent, Andy, uses natural language to guide attorneys through patent analysis tasks, drastically cutting time-to-result. -We choose to **preserve prefix and postfix multivectors**. Our **pooling** operation compresses the multivectors representing **the image tokens** based on the number of rows and columns determined by the model (static 32x32 for ColPali, dynamic XxY for ColQwen). Function retains and integrates the additional multivectors produced by the model back to pooled representations. +*"With Qdrant, we scaled to a billion vectors and still respond in sub-second latency. That lets us power workflows that used to take hours in just a few minutes."* -Simplified version of pooling for **ColPali** model: +## Unlocking new markets and workflows -(see the full version – also applicable for **ColQwen** – in the [tutorial notebook](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb)) +\&AI’s ability to search across the global patent corpus opened doors to new jurisdictions and legal use cases. It also gave them the confidence to offer strong guarantees to clients: yes, we’re looking at *everything*. -```python +Their semantic-first retrieval engine also enabled new products, like real-time invalidity checks and interactive claim visualization. With data grounded, structured, and indexed in Qdrant, the team continues to build fast. -processed_images = model_processor.process_images(image_batch) -# Image embeddings of shape (batch_size, 1030, 128) -image_embeddings = model(**processed_images) +## Looking ahead -# (1030, 128) -image_embedding = image_embeddings[0] # take the first element of the batch +\&AI is already working on the next version of Andy, expanding natural language capabilities and increasing automation in patent workflows. With Qdrant's upcoming inference capabilities and support for hybrid and multimodal search, Turner sees room for deeper integration. -# Now we need to identify vectors that correspond to the image tokens -# It can be done by selecting tokens corresponding to special `image_token_id` +"We want to stay at the application layer. If Qdrant can keep lifting the infrastructure complexity off our plate, we’re happy to keep building on it." -# (1030, ) - boolean mask (for the first element in the batch), True for image tokens -mask = processed_images.input_ids[0] == model_processor.image_token_id +As legal AI matures, \&AI’s retrieval-first approach — and Qdrant’s infrastructure support — are helping bring clarity and trust to one of the most high-stakes domains in AI. 
-# For convenience, we now select only image tokens -# and reshape them to (x_patches, y_patches, dim) +<|page-294-lllmstxt|> +# Introducing Qdrant Cloud Inference -# (x_patches, y_patches, 128) -image_patch_embeddings = image_embedding[mask].view(x_patches, y_patches, model.dim) +Today, we’re announcing the launch of Qdrant Cloud Inference ([get started in your cluster](https://cloud.qdrant.io/)). With Qdrant Cloud Inference, users can generate, store and index embeddings in a single API call, turning unstructured text and images into search-ready vectors in a single environment. Directly integrating model inference into Qdrant Cloud removes the need for separate inference infrastructure, manual pipelines, and redundant data transfers. -# Now we can apply mean pooling by rows and columns +This simplifies workflows, accelerates development cycles, and eliminates unnecessary network hops for developers. With a single API call, you can now embed, store, and index your data more quickly and more simply. This speeds up application development for RAG, Multimodal, Hybrid search, and more. -# (x_patches, 128) -pooled_by_rows = image_patch_embeddings.mean(dim=0) +## Unify embedding and search -# (y_patches, 128) -pooled_by_columns = image_patch_embeddings.mean(dim=1) +Traditionally, building application data pipelines means juggling separate embedding services and a vector database, introducing unnecessary complexity, latency, and network costs. Qdrant Cloud Inference brings everything into one system. Embeddings are generated inside the network of your cluster, which removes external API overhead, resulting in lower latency and faster response times. Additionally, you can now track vector database and inference costs in one place. -# [Optionally] we can also concatenate special tokens to the pooled representations, -# For ColPali, it's only postfix +![architecture](/blog/qdrant-cloud-inference/inference-architecture.jpg) -# (x_patches + 6, 128) -pooled_by_rows = torch.cat([pooled_by_rows, image_embedding[~mask]]) +## Supported Models for Multimodal and Hybrid Search Applications -# (y_patches + 6, 128) -pooled_by_columns = torch.cat([pooled_by_columns, image_embedding[~mask]]) +At launch, Qdrant Cloud Inference includes six curated models to start with. Choose from dense models like `all-MiniLM-L6-v2` for fast semantic matching, `mxbai/embed-large-v1` for richer understanding, or sparse models like `splade-pp-en-v1` and `bm25` ([Check out this hybrid search tutorial to see it in action](https://qdrant.tech/documentation/tutorials-and-examples/cloud-inference-hybrid-search/)). For multimodal workloads, Qdrant uniquely supports `OpenAI CLIP`-style models for both text and images. -``` +*Want to request a different model to integrate? You can do this at [https://support.qdrant.io/](https://support.qdrant.io/).* -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#upload-to-qdrant) Upload to Qdrant +![architecture](/blog/qdrant-cloud-inference/inference-ui.jpg) -The upload process is trivial; the only thing to pay attention to is the compute cost for ColPali and ColQwen2 models. -In low-resource environments, it’s recommended to use a smaller batch size for embedding and mean pooling. 
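To make the "single API call" concrete, here is a minimal sketch with the Python client. It assumes a paid cluster with inference enabled, an existing collection, and that the `cloud_inference` client flag and the exact model handle shown are available in your client version; treat those names as assumptions rather than a definitive recipe.

```python
from qdrant_client import QdrantClient, models

# `cloud_inference=True` asks the cluster to compute embeddings server-side
# (assumed flag name; requires a recent qdrant-client and a paid cluster).
client = QdrantClient(
    url="https://YOUR-CLUSTER.cloud.qdrant.io",
    api_key="YOUR_API_KEY",
    cloud_inference=True,
)

# Embed, store, and index in one call: the raw text travels with the point,
# and the vector is produced inside the cluster (assumes the collection exists).
client.upsert(
    collection_name="docs",  # hypothetical collection
    points=[
        models.PointStruct(
            id=1,
            vector=models.Document(
                text="Qdrant Cloud Inference turns text into search-ready vectors.",
                model="sentence-transformers/all-MiniLM-L6-v2",  # model handle may differ
            ),
            payload={"source": "blog"},
        )
    ],
)

# Querying works the same way: send text, get ranked points back.
hits = client.query_points(
    collection_name="docs",
    query=models.Document(
        text="embed and store in a single request",
        model="sentence-transformers/all-MiniLM-L6-v2",
    ),
    limit=3,
).points
```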
+## Get up to 5M free tokens per model per month, and unlimited BM25 tokens -Full version of the upload code is available in the [tutorial notebook](https://githubtocolab.com/qdrant/examples/blob/master/pdf-retrieval-at-scale/ColPali_ColQwen2_Tutorial.ipynb) +To make onboarding even easier, we’re offering 5 million free tokens per text model, 1 million for our image model, and unlimited for `bm25` to all paid Qdrant Cloud users. These token allowances renew monthly so long as you have a paid Qdrant Cloud cluster. These free monthly tokens are perfect for development, staging, or even running initial production workloads without added cost. -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#querying-pdfs) Querying PDFs +## Inference is automatically enabled for paid accounts -After indexing PDF documents, we can move on to querying them using our two-stage retrieval approach. +Getting started is easy. Inference is automatically enabled for any new paid clusters with version 1.14.0 or higher. It can be activated for existing clusters with a click on the inference tab on the Cluster Detail page in the Qdrant Cloud console. You will see examples of how to use inference with our different Qdrant SDKs. -```python -query = "Lee Harvey Oswald's involvement in the JFK assassination" -processed_queries = model_processor.process_queries([query]).to(model.device) + -# Resulting query embedding is a tensor of shape (22, 128) -query_embedding = model(**processed_queries)[0] +## Start Embedding Today -``` +You can get started now by logging into [Qdrant Cloud](https://cloud.qdrant.io/), selecting a model, and embedding your data directly. No extra APIs. No new tools. Just faster, simpler AI application development. -Now let’s design a function for the two-stage retrieval with multivectors produced by VLLMs: +*Available for paid cloud users. Available on AWS, Azure, and GCP for US regions only. Additional regions will be added soon.* -- **Step 1:** Prefetch results using a compressed multivector representation & HNSW index. -- **Step 2:** Re-rank the prefetched results using the original multivector representation. +## How to Build a Multimodal Search Stack with One API +**Embed, Store, Search: A Hands-On Guide to Qdrant Cloud Inference** -Let’s query our collections using combined mean pooled representations for the first stage of retrieval. +Kacper Ɓukawski, Senior Developer Advocate, hosted a live session showing how to: -```python -# Final amount of results to return -search_limit = 10 -# Amount of results to prefetch for reranking -prefetch_limit = 100 +
* Generate embeddings for text or images using pre-integrated models
* Store and search embeddings in the same Qdrant Cloud environment
* Power multimodal (an industry first) and hybrid search with just one API
* Reduce network egress fees and simplify your AI stack
-response = client.query_points( - collection_name=collection_name, - query=query_embedding, - prefetch=[\ - models.Prefetch(\ - query=query_embedding,\ - limit=prefetch_limit,\ - using="mean_pooling_columns"\ - ),\ - models.Prefetch(\ - query=query_embedding,\ - limit=prefetch_limit,\ - using="mean_pooling_rows"\ - ),\ - ], - limit=search_limit, - with_payload=True, - with_vector=False, - using="original" -) -``` +Watch now: -And check the top retrieved result to our query _“Lee Harvey Oswald’s involvement in the JFK assassination”_. + -```python -dataset[response.points[0].payload['index']]['image'] +<|page-295-lllmstxt|> +## Vector Space Day 2025: Powered by Qdrant -``` +📍 Colosseum Berlin, Germany +đŸ—“ïž Friday, September 26, 2025 -![Results, ColPali](https://qdrant.tech/documentation/tutorials/pdf-retrieval-at-scale/result-VLLMs.png) +### About -## [Anchor](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/\#conclusion) Conclusion +We’re hosting our first-ever full-day in-person [**Vector Space Day**](https://lu.ma/p7w9uqtz) this September in Berlin, and you’re invited. -In this tutorial, we demonstrated an optimized approach using **Qdrant for PDF retrieval at scale** with VLLMs producing **heavy multivector representations** like **ColPali** and **ColQwen2**. +The Vector Space Day will bring together engineers, researchers, and AI builders to explore the cutting edge of retrieval, vector search infrastructure, and agentic AI. From building scalable RAG pipelines to enabling real-time AI memory and next-gen context engineering, we’re covering the full spectrum of modern vector-native search. -Without such optimization, the performance of retrieval systems can degrade severely, both in terms of indexing time and query latency, especially as the dataset size grows. +### Why You Should Attend -We **strongly recommend** implementing this approach in your workflows to ensure efficient and scalable PDF retrieval. Neglecting to optimize the retrieval process could result in unacceptably slow performance, hindering the usability of your system. +* **Deep-dives & lightning talks**: Learn from the teams solving hard problems in AI infrastructure, search relevance, and semantic retrieval in production. +* **Panels & debates**: Discuss the future of AI agents, multimodal memory, context-aware RAG, and vector-native developer workflows. +* **Hands-on workshops**: Get practical with applied vector search and context construction in guided sessions, perfect whether you're building a search product or a complex RAG pipeline. +* **Meet the community**: This isn’t just a conference. It’s a gathering of the developers rethinking how AI systems search, retrieve, and reason at scale. -Start scaling your PDF retrieval today! +### Topics We’ll Explore -##### Was this page useful? +We’re intentionally keeping things broad. If you work on any of the following, the Vector Space Day 2025 is your event: -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +* Vector databases & similarity search +* AI memory and context management +* Agent frameworks & orchestration +* RAG pipelines & evaluation +* Context engineering for retrieval & generation +* Prompt construction and context window optimization +* Real-time retrieval systems +* Hybrid and semantic search at scale -Thank you for your feedback! 
🙏 +Stay tuned: we’ll be announcing the full agenda, speaker lineup, and event partners in the coming weeks. Expect a mix of industry leaders, deep technical sessions, and emerging voices from the vector search and GenAI community. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/pdf-retrieval-at-scale.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### After Party +The day doesn’t end when the last session wraps. Your ticket includes access to the After Party. Unwind, connect with fellow attendees, and keep the conversations going over drinks, music, and light bites. Whether you're debriefing a lightning talk or sketching out your next RAG experiment on a napkin, it's the perfect way to close out the day. -On this page: +### Call for Speakers -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/pdf-retrieval-at-scale.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +We’ve locked in a strong lineup but we’re saving a few select slots for standout talks from the community. If you’re building something novel in vector search, AI memory, context engineering, or retrieval infra, we want to hear from you. -× +[Submit a proposal](https://docs.google.com/forms/d/e/1FAIpQLSeGvmWISVImELQid1AjMv2Dvm2AXNuOqyZKqrQlFk07CnY_cw/viewform). Due Friday, August 8th, 2025. Missed the deadline? Go ahead and send it in anyway. Proposals submitted after the deadline, without prior communication with the event organizers, will be reviewed if space becomes available. -[Powered by](https://qdrant.tech/) +### Partners -<|page-160-lllmstxt|> -## fastembed-semantic-search -- [Documentation](https://qdrant.tech/documentation/) -- [Fastembed](https://qdrant.tech/documentation/fastembed/) -- FastEmbed & Qdrant +We’ll be joined by leading organizations including AWS, Microsoft, Vultr, Jina, DeepSet, LlamaIndex, TwelveLabs, n8n, Neo4j, MistralAI, DataTalks.Club, and the MLOps Community, and more. -# [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#using-fastembed-with-qdrant-for-vector-search) Using FastEmbed with Qdrant for Vector Search +These partners represent a cross-section of the most influential players in AI infrastructure and applied research, and we’re proud to collaborate with them to bring this event to life. -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#install-qdrant-client-and-fastembed) Install Qdrant Client and FastEmbed +Their involvement underscores the growing momentum behind vector search and retrieval-augmented generation (RAG) systems—and this is just the beginning. -```python -pip install "qdrant-client[fastembed]>=1.14.2" +We might still have a few exciting names to announce, so stay tuned... -``` +![Partners](/blog/vector-space-day-2025/partners_6-aug.png) -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#initialize-the-client) Initialize the client +### Get Your Ticket -Qdrant Client has a simple in-memory mode that lets you try semantic search locally. +General admission: €50 -```python -from qdrant_client import QdrantClient, models +**Early Bird Pricing** -client = QdrantClient(":memory:") # Qdrant is running from RAM. 
+Through July 31: 25% off -``` +August: 10% off -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#add-data) Add data +September: full price -Now you can add two sample documents, their associated metadata, and a point `id` for each. +[**Reserve your spot now.**](https://lu.ma/p7w9uqtz) -```python -docs = [\ - "Qdrant has a LangChain integration for chatbots.",\ - "Qdrant has a LlamaIndex integration for agents.",\ -] -metadata = [\ - {"source": "langchain-docs"},\ - {"source": "llamaindex-docs"},\ -] -ids = [42, 2] +Space is limited. -``` +See you in Berlin\! -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#create-a-collection) Create a collection +### Global Hackathon Through September 16 -Qdrant stores vectors and associated metadata in collections. -Collection requires vector parameters to be set during creation. -In this tutorial, we’ll be using `BAAI/bge-small-en` to compute embeddings. +In the lead-up to Vector Space Day, we're hosting **Think Outside the Bot**, a global, virtual hackathon challenging devs to reimagine what's possible with vector search. Forget the classical RAG chatbot! This is your chance to explore multi-modal applications, intelligent recommendations, and advanced vector search that go far beyond conversational interfaces. -```python -model_name = "BAAI/bge-small-en" -client.create_collection( - collection_name="test_collection", - vectors_config=models.VectorParams( - size=client.get_embedding_size(model_name), - distance=models.Distance.COSINE - ), # size and distance are model dependent -) +💰 Over $10k in prizes from Qdrant, Mistral AI, CrewAI, Cognee, and more -``` +đŸ—“ïž Now through Tuesday, September 16 -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#upsert-documents-to-the-collection) Upsert documents to the collection +🌏 Open to participants worldwide -Qdrant client can do inference implicitly within its methods via FastEmbed integration. -It requires wrapping your data in models, like `models.Document` (or `models.Image` if you’re working with images) +🏆 Winners announced September 26 at Vector Space Day in Berlin -```python -metadata_with_docs = [\ - {"document": doc, "source": meta["source"]} for doc, meta in zip(docs, metadata)\ -] -client.upload_collection( - collection_name="test_collection", - vectors=[models.Document(text=doc, model=model_name) for doc in docs], - payload=metadata_with_docs, - ids=ids, -) +[**Learn more.**](https://try.qdrant.tech/hackathon-2025) -``` -## [Anchor](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/\#run-vector-search) Run vector search +### Need your manager’s approval to attend? -Here, you will ask a dummy question that will allow you to retrieve a semantically relevant result. +We’ve got you covered. Download this ready-to-send request letter to help explain why attending Vector Search Day is a valuable use of your time (and budget). [Download now](https://docs.google.com/document/d/1EivCVK47XEFXAhyoo8QaCBX0Op6uicUODAxTGXhZxrs/edit?usp=sharing). -```python -search_result = client.query_points( - collection_name="test_collection", - query=models.Document( - text="Which integration is best for agents?", - model=model_name - ) -).points -print(search_result) +<|page-296-lllmstxt|> +![pento bento box](/blog/case-study-pento/pento-bento-box-dark.jpg) -``` +# Bringing People Together Through Qdrant -The semantic search engine will retrieve the most similar result in order of relevance. 
In this case, the second statement about LlamaIndex is more relevant. +![pento-cover-image](/blog/case-study-pento/pento-cover-image.png) -```python -[\ - ScoredPoint(\ - id=2,\ - score=0.87491801319731,\ - payload={\ - "document": "Qdrant has a LlamaIndex integration for agents.",\ - "source": "llamaindex-docs",\ - },\ - ...\ - ),\ - ScoredPoint(\ - id=42,\ - score=0.8351846627714035,\ - payload={\ - "document": "Qdrant has a LangChain integration for chatbots.",\ - "source": "langchain-docs",\ - },\ - ...\ - ),\ -] +## *Taste in art isn’t just a preference; it’s a fingerprint.* -``` +Imagine you're an artist or art enthusiast searching not for a painting, but for people who share your unique taste, someone who resonates with surrealist colors just as deeply as you, or who finds quiet joy in minimalist lines. How would a system know who those people are? Traditional recommenders often suggest what’s trending or popular, or just can't understand the nuances of art. -##### Was this page useful? +In this post, we’ll build a recommender that does just that. By mapping user-art interactions into a semantic vector space, identifying clusters of preferences, and leveraging Qdrant’s powerful recommendation API, we’ll create a system that connects people not through popularity but through shared artistic preferences. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +And while we’ll focus on art, the system is fundamentally universal. Replace artworks with podcasts, apartments, collaborators and the logic still holds. Once everything becomes a vector, the same pipeline can surface the right item for the right person, across nearly any domain. -Thank you for your feedback! 🙏 +## The Problem with Traditional Recommenders -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-semantic-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Most recommendation systems optimize for popularity, relying on techniques like collaborative filtering, assuming that users with similar behaviors have similar tastes. But in art, that assumption often breaks down. One person’s favorite painting might leave another cold. These responses are deeply personal and they shift. A user drawn to surrealism today might find solace in minimalism tomorrow. We need a system that listens inward, one that models taste as a dynamic, evolving landscape. And that's what we’re building. -On this page: +## Modeling Aesthetic Taste Through Interaction -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/fastembed/fastembed-semantic-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Aesthetic taste isn’t static, it drifts, deepens, and sometimes pivots entirely. These shifts often happen without conscious intention but they show up in patterns of interaction. -× +Each time a person engages with a piece of art, they're leaving a signal that when seen individually, these signals might seem small. But over time, they begin to trace a shape, a kind of emotional fingerprint. -[Powered by](https://qdrant.tech/) +Most systems try to compress this shape into a single vector. But human taste doesn’t belong in a straight line. It's layered, multi-faceted, and often contradictory. 
A person can love abstraction *and* realism. A truly expressive model should reflect that. -<|page-161-lllmstxt|> -## multiple-partitions -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Multitenancy +To capture this richness, we treat each user not as a single point in space, but as a collection of varied points, clusters that evolve as their relationship with art evolves through time. Each cluster will be weighted differently depending on how recent and frequent the interactions are. -# [Anchor](https://qdrant.tech/documentation/guides/multiple-partitions/\#configure-multitenancy) Configure Multitenancy +## How the System Works -**How many collections should you create?** In most cases, you should only use a single collection with payload-based partitioning. This approach is called multitenancy. It is efficient for most of users, but it requires additional configuration. This document will show you how to set it up. +### Interaction acquisition -**When should you create multiple collections?** When you have a limited number of users and you need isolation. This approach is flexible, but it may be more costly, since creating numerous collections may result in resource overhead. Also, you need to ensure that they do not affect each other in any way, including performance-wise. +Each time a user opens the platform, we present a curated set of paintings and invite them to rate every piece on a 0-to-5 scale. These ratings capture the user’s level of interest in each artwork. +Behind the scenes, we convert every raw rating ri into a signed weight wi= ri-, +where is a tunable “neutral” threshold (*default \= 2.5*). Scores above ( wi\>0​ ) signal **positive affinity**; scores below ( wi\<0​ ) indicate **negative preference**. +If users systematically choose the upper end of the scale we simply raise to keep the split between positive and negative signals balanced. -## [Anchor](https://qdrant.tech/documentation/guides/multiple-partitions/\#partition-by-payload) Partition by payload +### Artwork Embeddings -When an instance is shared between multiple users, you may need to partition vectors by user. This is done so that each user can only access their own vectors and can’t see the vectors of other users. +At the heart of this system is the ability to understand art, not through keywords or categories, but through the image itself. Each piece of art is transformed into a vector using an image encoder that captures not just form and color, but style, visual tone, and composition. The image encoder can be a pre-trained encoder for visual tasks or, even better, a fine-tuned model that captures painting styles. -httppythontypescriptrustjavacsharpgo +The result is a high-dimensional embedding that places artworks into a semantic space where similar pieces, whether in style, subject, period, or color palette, are positioned close to one another, even across stylistic boundaries. These embeddings become the foundational layer for everything that follows: clustering, scoring, and ultimately, artist recommendation. 
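A minimal sketch of these two ingredients, the signed interaction weight (w_i = r_i − τ, with τ defaulting to 2.5) and the artwork embedding, assuming a CLIP-style encoder from `sentence-transformers`; the post does not name the actual model, so the encoder choice here is hypothetical.

```python
from PIL import Image
from sentence_transformers import SentenceTransformer

TAU = 2.5  # the "neutral" rating threshold from the post

def rating_weight(rating: float) -> float:
    # w_i = r_i - tau: above the threshold -> positive affinity, below -> aversion
    return rating - TAU

# Hypothetical encoder choice; any CLIP-style image model fills the same role.
encoder = SentenceTransformer("clip-ViT-B-32")

def embed_artwork(path: str):
    # The image itself is encoded, so style, palette, and composition shape the vector.
    return encoder.encode(Image.open(path), normalize_embeddings=True)
```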
+![pento painting encoder](/blog/case-study-pento/pento-painting-encoder.png) -```http -PUT /collections/{collection_name}/points -{ - "points": [\ - {\ - "id": 1,\ - "payload": {"group_id": "user_1"},\ - "vector": [0.9, 0.1, 0.1]\ - },\ - {\ - "id": 2,\ - "payload": {"group_id": "user_1"},\ - "vector": [0.1, 0.9, 0.1]\ - },\ - {\ - "id": 3,\ - "payload": {"group_id": "user_2"},\ - "vector": [0.1, 0.1, 0.9]\ - },\ - ] -} +### Interaction Clustering with HDBSCAN -``` +Once each artwork is embedded, we turn to the user, not as a static profile, but as a series of moments. Each interaction with a piece of art becomes a point in embedding space, forming a cloud of visual preferences. -```python -client.upsert( - collection_name="{collection_name}", - points=[\ - models.PointStruct(\ - id=1,\ - payload={"group_id": "user_1"},\ - vector=[0.9, 0.1, 0.1],\ - ),\ - models.PointStruct(\ - id=2,\ - payload={"group_id": "user_1"},\ - vector=[0.1, 0.9, 0.1],\ - ),\ - models.PointStruct(\ - id=3,\ - payload={"group_id": "user_2"},\ - vector=[0.1, 0.1, 0.9],\ - ),\ - ], -) +To uncover structure within this cloud, we use HDBSCAN, a density-based clustering algorithm that, unlike k-means, doesn’t require a predefined number of clusters. This is crucial when modeling aesthetic taste, which is rarely uniform. A single user might respond to romanticism, brutalist architecture, and vaporwave all at once, or in phases. -``` +We split interactions into two sets: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +* ***Positive interactions***: interactions with a wi\>0 (in green) +* ***Negative interactions***: interactions with wi\<0 (in red) -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Each set is clustered independently using HDBSCAN. This results in multiple localized regions in embedding space that reflect coherent aesthetic themes. +![hdbscan](/blog/case-study-pento/pento-hdbscan.png) +Each cluster is represented by its medoid, the most central, representative embedding in that group. These medoids become the core building blocks of the user’s taste profile. +We use medoids instead of centroids because centroids are the mean of all embeddings in the cluster, which may not correspond to any actual sample and can be influenced by outliers or non-linear distances in the embedding space. In contrast, the medoid is an actual data point that best represents the cluster while preserving the true structure of the original space, especially important when working with non-euclidean distances like cosine similarity. -client.upsert("{collection_name}", { - points: [\ - {\ - id: 1,\ - payload: { group_id: "user_1" },\ - vector: [0.9, 0.1, 0.1],\ - },\ - {\ - id: 2,\ - payload: { group_id: "user_1" },\ - vector: [0.1, 0.9, 0.1],\ - },\ - {\ - id: 3,\ - payload: { group_id: "user_2" },\ - vector: [0.1, 0.1, 0.9],\ - },\ - ], -}); +Let us clarify that clearly we don't have to use the entire user history, we can look back a certain amount of time to consider the last interactions. This time threshold will depend on how often users interact on the platform and how many interactions you have at a time. -``` +### Scoring Taste Clusters by Recency -```rust -use qdrant_client::qdrant::{PointStruct, UpsertPointsBuilder}; -use qdrant_client::Qdrant; +Not all tastes carry the same weight, especially over time. A cluster of artworks a user connected with months ago may no longer reflect their current preferences. 
To account for this, we assign a recency-aware score to each cluster, emphasizing freshness without discarding history. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Each cluster is scored based on the timestamps of the interactions it contains, using an exponential decay function: +![exponential decay function](/blog/case-study-pento/pento-exponential-decay-function.png) +Where: -client - .upsert_points(UpsertPointsBuilder::new( - "{collection_name}", - vec![\ - PointStruct::new(1, vec![0.9, 0.1, 0.1], [("group_id", "user_1".into())]),\ - PointStruct::new(2, vec![0.1, 0.9, 0.1], [("group_id", "user_1".into())]),\ - PointStruct::new(3, vec![0.1, 0.1, 0.9], [("group_id", "user_2".into())]),\ - ], - )) - .await?; +* wi is the normalized rating calculated before +* đš«ti is the time elapsed since interaction i +* Recent interactions contribute more than older ones +* Larger clusters naturally accumulate more weight, unless they are stale +* **λ** is a number between 0 and 1, the former weighs equally all occurrences independently of when they occurred and the latter gives more importance to recent events. We use λ=0.01 for this example. -``` +This scoring method captures two dimensions at once: -```java -import java.util.List; -import java.util.Map; +* ***Recency:*** Newer preferences rise to the top +* ***Strength:*** Clusters with more activity gain importance +![medoid scoring](/blog/case-study-pento/pento-medoid-scoring.png) -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.PointStruct; +The result is a dynamic prioritization of tastes. Clusters representing fleeting interests fade naturally. Those tied to long-term engagement remain prominent. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +To maintain the agility of the system and to avoid an excess of likes per user, we only keep 50% of the groupings based on their recency-weighted score |Sc|. This is done to represent users by their predominant tastes and the 50% is purely a business decision. By discarding smaller or outdated clusters, we focus on the strongest and most recent signals of interest, ensuring that recommendations are precise and meaningful. While this trade-off reduces recall by potentially missing some weaker matches, it significantly boosts precision by prioritizing what truly resonates with the user. -client - .upsertAsync( - "{collection_name}", - List.of( - PointStruct.newBuilder() - .setId(id(1)) - .setVectors(vectors(0.9f, 0.1f, 0.1f)) - .putAllPayload(Map.of("group_id", value("user_1"))) - .build(), - PointStruct.newBuilder() - .setId(id(2)) - .setVectors(vectors(0.1f, 0.9f, 0.1f)) - .putAllPayload(Map.of("group_id", value("user_1"))) - .build(), - PointStruct.newBuilder() - .setId(id(3)) - .setVectors(vectors(0.1f, 0.1f, 0.9f)) - .putAllPayload(Map.of("group_id", value("user_2"))) - .build())) - .get(); +#### User Representation as Multivectors -``` +Once clusters are identified and scored, we distill a user’s taste into something compact, expressive, and ready for retrieval: a multivector representation [learn more about multivectors representations in Qdrant](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/). -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Each positively scored cluster contributes its medoid vector, a single embedding that represents the core of that aesthetic preference. 
We do the same for negatively scored clusters, treating them as regions the user tends to avoid. -var client = new QdrantClient("localhost", 6334); +This gives us two sets of vectors per user: -await client.UpsertAsync( - collectionName: "{collection_name}", - points: new List - { - new() - { - Id = 1, - Vectors = new[] { 0.9f, 0.1f, 0.1f }, - Payload = { ["group_id"] = "user_1" } - }, - new() - { - Id = 2, - Vectors = new[] { 0.1f, 0.9f, 0.1f }, - Payload = { ["group_id"] = "user_1" } - }, - new() - { - Id = 3, - Vectors = new[] { 0.1f, 0.1f, 0.9f }, - Payload = { ["group_id"] = "user_2" } - } - } -); +* ***Positive multivector:*** the user’s top taste clusters, ranked by recency-weighted importance +* ***Negative multivector:*** clusters of rejected content -``` +Together, these sets describe not just what the user resonates with, but also what they tend to reject. It’s a more nuanced, contrastive view of preference and one that’s especially powerful when used with Qdrant’s vector search. +![user representation](/blog/case-study-pento/pento-user-representation.png) -```go -import ( - "context" +#### Retrieval via Qdrant’s Recommendation API - "github.com/qdrant/go-client/qdrant" -) +With each user represented by a set of positive and negative clusters condensed into multivectors, we can now move from modeling to discovery. +![multivector](/blog/case-study-pento/pento-multivector.png) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +To recommend artists who might align with the users current aesthetic we turn to Qdrant Recommendation API. Unlike a standard vector search, this Qdrant’s functionality lets us provide both what we are looking for, represented by the positive multivector, and what we want to avoid, represented by the negative multivector. -client.Upsert(context.Background(), &qdrant.UpsertPoints{ - CollectionName: "{collection_name}", - Points: []*qdrant.PointStruct{ - { - Id: qdrant.NewIDNum(1), - Vectors: qdrant.NewVectors(0.9, 0.1, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_1"}), - }, - { - Id: qdrant.NewIDNum(2), - Vectors: qdrant.NewVectors(0.1, 0.9, 0.1), - Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_1"}), - }, - { - Id: qdrant.NewIDNum(3), - Vectors: qdrant.NewVectors(0.1, 0.1, 0.9), - Payload: qdrant.NewValueMap(map[string]any{"group_id": "user_2"}), - }, - }, -}) +The logic is simple: find artists whose positive taste profile strongly overlaps with the target user’s preferences, while minimizing similarity to the clusters they tend to reject. +To do this, we leverage Qdrant’s scoring strategy, a search method designed specifically for working with multiple vectors per point. This strategy evaluates each one individually. It computes the highest similarity to any positive cluster, and the strongest conflict to any negative one. + +The final score for a candidate is computed as: + +```py +if best_positive_score > best_negative_score: + score = best_positive_score +else: + score = -(best_negative_score * best_negative_score) ``` -2. Use a filter along with `group_id` to filter vectors for each user. +This ensures that a candidate artist is only considered a match if they resonate with at least one of the user’s core preferences and don’t simultaneously resemble something the user tends to reject. -httppythontypescriptrustjavacsharpgo +Additionally, we enforce constraints such as geographic location and age preferences by combining this vector logic with metadata filtering. 
For instance, we can restrict candidates to those from a specific region or within a certain age range using Qdrant’s payload filters. -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.1, 0.1, 0.9], - "filter": { - "must": [\ - {\ - "key": "group_id",\ - "match": {\ - "value": "user_1"\ - }\ - }\ - ] - }, - "limit": 10 -} +This behavior makes intuitive sense for our use case. Two artists don’t need to be compatible across all aesthetic dimensions. A strong connection on just one, a shared sensibility in color, texture, or composition may be enough to suggest a meaningful alignment. The best score strategy respects that, it allows each cluster to speak for itself. -``` +To compare multivectors themselves Qdrant uses the MaxSim function. This calculates the similarity between two multivectors by summing the maximum similarity between each vector in one matrix and the best-matching vector in the other: -```python -from qdrant_client import QdrantClient, models +![maxsim](/blog/case-study-pento/pento-max-sim-function.png) -client = QdrantClient(url="http://localhost:6333") +### Addressing the cold start problem -client.query_points( - collection_name="{collection_name}", - query=[0.1, 0.1, 0.9], - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="group_id",\ - match=models.MatchValue(\ - value="user_1",\ - ),\ - )\ - ] - ), - limit=10, -) +Every recommender hits a wall when it comes to new users or new items, what’s known as the cold start problem. Without interaction data, how do you know what to recommend? -``` +Our answer is a friction-free onboarding process, in which we show a carefully curated mix of artworks that span diverse aesthetic themes. Each early interaction immediately feeds into their taste clusters. Within a handful of interactions, the system already sees a rough silhouette of their style and can start returning genuinely relevant recommendations, no long warm-up, no guesswork. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### Final Thoughts -const client = new QdrantClient({ host: "localhost", port: 6333 }); +In this walkthrough we moved from raw interaction logs to a living recommender: -client.query("{collection_name}", { - query: [0.1, 0.1, 0.9], - filter: { - must: [{ key: "group_id", match: { value: "user_1" } }], - }, - limit: 10, -}); +* Built a dynamic user profile that captures both attraction and aversion. +* Used an encoder to drop every piece of content into the same vector space. +* Clustered those vectors with HDBSCAN, chose medoids as anchors, and let an exponential decay function show which clusters are more relevant. +* Queried Qdrant’s Recommendation API to retrieve candidates that closely align with the user’s positive multivector, while actively distancing those associated with negative incompatibility signals. -``` +That’s the entire stack in one breath and it’s all you need to ship a production-ready recommendation engine that can be applied across domains\! 
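Pulling that recap together, here is a compact sketch of the scoring and retrieval steps with the Python client. The decay constant, the 50% cluster cut, and the best-score behaviour come from the post; the collection name, payload field, and dummy medoid vectors are placeholders.

```python
import math
from qdrant_client import QdrantClient, models

LAMBDA = 0.01  # decay rate used in the post

def cluster_score(interactions) -> float:
    # interactions: iterable of (weight, days_since_interaction)
    # S_c = sum_i w_i * exp(-lambda * delta_t_i): recent, strong signals dominate.
    return sum(w * math.exp(-LAMBDA * dt) for w, dt in interactions)

# Medoids of the retained clusters (top 50% by |S_c|); dummy 512-d vectors here.
positive_medoids = [[0.1] * 512, [0.3] * 512]
negative_medoids = [[0.9] * 512]

client = QdrantClient(url="https://YOUR-CLUSTER.cloud.qdrant.io", api_key="YOUR_API_KEY")

recommended = client.query_points(
    collection_name="artists",  # hypothetical collection of artist vectors
    query=models.RecommendQuery(
        recommend=models.RecommendInput(
            positive=positive_medoids,
            negative=negative_medoids,
            # best_score implements the rule above: the best positive match wins
            # unless a negative cluster matches even more strongly.
            strategy=models.RecommendStrategy.BEST_SCORE,
        )
    ),
    query_filter=models.Filter(  # metadata constraints such as location
        must=[models.FieldCondition(key="country", match=models.MatchValue(value="DE"))]
    ),
    limit=10,
).points
```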
-```rust -use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder}; -use qdrant_client::Qdrant; +<|page-297-lllmstxt|> +# How Alhena AI unified its AI stack and accelerated ecommerce outcomes with Qdrant -let client = Qdrant::from_url("http://localhost:6334").build()?; +![How Alhena AI unified its AI stack and improved ecommerce conversions with Qdrant](/blog/case-study-alhena/alhena-bento-box-dark.jpg) -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.1, 0.1, 0.9]) - .limit(10) - .filter(Filter::must([Condition::matches(\ - "group_id",\ - "user_1".to_string(),\ - )])), - ) - .await?; +## Building AI agents that drive both revenue and support outcomes -``` +Alhena AI is redefining the ecommerce experience through intelligent agents that assist customers before and after a purchase. On the front end, these agents help users find the perfect product based on nuanced preferences. On the back end, they resolve complex support queries without escalating to a human. -```java -import java.util.List; +To deliver this experience, Alhena must combine natural language understanding, context-aware retrieval, and high-performance infrastructure. That means building agents that are not only fast and accurate, but also scalable across customers with vastly different catalogs and architectures. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.QueryPoints; +## Infra challenges: fragmented vector search with mounting complexity -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.ConditionFactory.matchKeyword; +As Alhena began onboarding more ecommerce clients, its vector search layer started to crack under pressure. The team was juggling FAISS and Pinecone, each selected for different needs. FAISS was lightweight and easy to use for small indexes, but lacked robust filtering and scalability. Pinecone handled large indexes better but introduced latency on smaller ones and had limited support for advanced filtering or sparse embeddings. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +To make this work, engineers wrote custom routing logic to decide which index to use based on the use case. They layered SQL systems on top of FAISS to simulate metadata filters. They worked around Pinecone's limitations with post-processing. The result was a growing tangle of complexity that slowed down customer onboarding, reduced system reliability, and constrained the product roadmap. -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder().addMust(matchKeyword("group_id", "user_1")).build()) - .setQuery(nearest(0.1f, 0.1f, 0.9f)) - .setLimit(10) - .build()) - .get(); +This complexity impacted the business. New customer deployments required time-consuming backend tuning. Larger ecommerce clients with extensive catalogs experienced delayed agent responses. And the inconsistent behavior across index types made it harder to guarantee SLAs for latency and answer quality. -``` +## Why Alhena chose Qdrant: A single backend for all workloads -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Alhena set out to find a solution that could unify its vector search layer across all clients and workloads. 
It needed a system that could support both small and large indexes with consistent performance, offer true hybrid search capabilities, and handle metadata filtering and boosting without extra infrastructure layers. -var client = new QdrantClient("localhost", 6334); +After a proof of concept, Alhena migrated 100 percent of its traffic to Qdrant Cloud. This consolidation allowed the team to retire FAISS, Pinecone, and Weaviate, streamlining both infrastructure and deployment workflows. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.1f, 0.1f, 0.9f }, - filter: MatchKeyword("group_id", "user_1"), - limit: 10 -); +*“We replaced FAISS, Pinecone, and Weaviate with Qdrant Cloud. It simplified everything and gave us better performance across the board.”* + — Kshitiz Parashar, Founding Engineer and Vector Infra Lead, Alhena AI -``` +## Unlocking business value through performance, flexibility, and control -```go -import ( - "context" +The move to Qdrant brought immediate technical benefits. But more importantly, it unlocked new capabilities for Alhena’s business. - "github.com/qdrant/go-client/qdrant" -) +By supporting both dense and sparse embeddings in a single query, Qdrant’s hybrid search allowed agents to return more relevant results across a wider range of customer queries. This led to better product recommendations, which directly impacted conversion rates on ecommerce sites. Combined with metadata filters and real-time boosting, the agents could now tailor answers to align with a retailer's business priorities, such as promoting high-margin SKUs or deprioritizing out-of-stock items. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +With Qdrant handling retrieval, Alhena no longer needed to customize infrastructure per client. Agents could be deployed in minutes regardless of catalog size. That translated into faster onboarding, fewer implementation blockers, and more predictable margins as the company scaled. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.1, 0.1, 0.9), - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("group_id", "user_1"), - }, - }, -}) +*“The more questions the AI can answer, the more revenue our customers make. Qdrant helps us surface better context and more accurate answers.”* + — Kshitiz Parashar + +## Hitting production-grade performance targets -``` +Latency was a critical metric for Alhena. With FAISS, vector search on catalogs with 100,000+ items often took three seconds or more. That delayed the start of agent response streaming, making the AI feel sluggish and hurting the user experience. Pinecone helped on large indexes, but introduced latency on small ones, and couldn’t handle hybrid filtering needs. -## [Anchor](https://qdrant.tech/documentation/guides/multiple-partitions/\#calibrate-performance) Calibrate performance +Qdrant reduced retrieval latency on the same datasets to approximately 300 milliseconds. That enabled Alhena to meet its internal P95 SLA of 3.5 seconds from query to first token, even after accounting for hallucination detection, policy enforcement, and contextual rewriting. -The speed of indexation may become a bottleneck in this case, as each user’s vector will be indexed into the same collection. To avoid this bottleneck, consider _bypassing the construction of a global vector index_ for the entire collection and building it only for individual groups instead. 
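For reference, the hybrid retrieval pattern Alhena describes, dense and sparse candidates fetched in one request, fused, and constrained by catalog metadata, can be sketched with the Query API roughly as below. Collection, vector, and field names are hypothetical, and Alhena's actual boosting logic is not shown.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="https://YOUR-CLUSTER.cloud.qdrant.io", api_key="YOUR_API_KEY")

dense_query_vector = [0.12] * 384  # placeholder dense embedding of the shopper's query

# Dense and sparse candidates are fetched in one request, fused with
# Reciprocal Rank Fusion, and constrained by catalog metadata.
results = client.query_points(
    collection_name="catalog",  # hypothetical collection with "dense" and "sparse" vectors
    prefetch=[
        models.Prefetch(query=dense_query_vector, using="dense", limit=50),
        models.Prefetch(
            query=models.SparseVector(indices=[17, 42], values=[0.8, 0.4]),
            using="sparse",
            limit=50,
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    query_filter=models.Filter(
        must=[models.FieldCondition(key="in_stock", match=models.MatchValue(value=True))]
    ),
    limit=10,
).points
```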
+*“We track every millisecond. Qdrant helped us cut vector retrieval time by 90 percent at scale. That’s what made it possible to stay under our latency SLA.”* + — Kang-Chi Ho -By adopting this strategy, Qdrant will index vectors for each user independently, significantly accelerating the process. +## Production architecture built for speed and safety -To implement this approach, you should: +Alhena’s agent platform is composed of multiple intelligent subsystems, each with a distinct role. When a query arrives, it is rewritten contextually, evaluated for safety by a policy enforcer, and checked for hallucination risk by a fact-validation agent. These components operate in parallel, with token buffering to avoid blocking response time. -1. Set `payload_m` in the HNSW configuration to a non-zero value, such as 16. -2. Set `m` in hnsw config to 0. This will disable building global index for the whole collection. +Qdrant powers the retrieval layer that feeds relevant chunks and structured knowledge into this agent pipeline. It enables fast, filtered, ranked results that serve both the semantic intent and business constraints of each ecommerce customer. -httppythontypescriptrustjavacsharpgo +The ability to boost specific items within a vector search query has proven particularly valuable. Alhena’s clients can now promote seasonal offers or brand-prioritized items directly within their product recommendation logic, without changing upstream LLM behavior. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "hnsw_config": { - "payload_m": 16, - "m": 0 - } -} +## **Multitenancy was key to scaling** -``` +Multi-tenancy played a critical role in Alhena’s ability to scale while keeping operations lean. Instead of spinning up a separate collection for each customer, Alhena isolated data within shared collections, preserving tenant-level boundaries without introducing additional infrastructure complexity. This model dramatically reduced collection sprawl, simplified version control, and allowed the team to support hundreds of thousands of end customers while maintaining just a few collections. -```python -from qdrant_client import QdrantClient, models +*“Multitenancy is a feature we find highly beneficial. It allows us to scale to hundreds of thousands of customers while managing only a few collections, thereby avoiding challenges in collection management. Additionally, we benefit from searching within their own search space. For each company, we implemented a delicate version control to ensure zero downtime after a new training finishes.”* + — Kang-Chi Ho, Founding AI Engineer, Alhena AI -client = QdrantClient(url="http://localhost:6333") +## Simplifying global deployment through Qdrant Cloud -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - hnsw_config=models.HnswConfigDiff( - payload_m=16, - m=0, - ), -) +Alhena serves customers in both the US and EU, and data residency is a growing concern. Before Qdrant, hosting and scaling regional instances with FAISS or Pinecone required custom Kubernetes deployments and persistent storage management. Now, with Qdrant Cloud, the team can spin up managed clusters in any region with a few clicks. -``` +Feature upgrades are seamless. Boosting, hybrid search, and sparse-dense fusion were all integrated without breaking changes. 
As Alhena’s needs evolve, Qdrant’s managed infrastructure keeps pace, so there’s no more fighting the database when building new product capabilities. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +*“We were live in two weeks. And when Qdrant releases something like boosting, we can start using it the same day. That kind of agility really matters.”* + — Kshitiz Parashar -const client = new QdrantClient({ host: "localhost", port: 6333 }); +## Enabling the next phase of product innovation -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - hnsw_config: { - payload_m: 16, - m: 0, - }, -}); +With a unified, high-performance vector backend in place, Alhena is now building for the future. Multimodal search is in the roadmap, allowing users to upload a photo and find visually similar products. Qdrant’s image-text embedding support makes this straightforward. The team plans to deploy a separate collection for visual embeddings without touching the core retrieval system. -``` +*“We're using all of Qdrant’s capabilities: hybrid search, keyword-only fallback, boosting, filtering. And it just works, whether the catalog has 1,000 or 100,000 items.”* + — Kshitiz Parashar -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, HnswConfigDiffBuilder, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +## From fragmentation to focus -let client = Qdrant::from_url("http://localhost:6334").build()?; +Migrating to Qdrant Cloud allowed Alhena to unify its vector infrastructure, improve system performance, and accelerate go-to-market motion. Engineering teams now spend less time managing complexity and more time building differentiated features. Customers get faster onboarding and better-performing AI agents. Ecommerce buyers get smarter recommendations and faster support. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .hnsw_config(HnswConfigDiffBuilder::default().payload_m(16).m(0)), - ) - .await?; +Most importantly, Alhena now has a retrieval layer that scales with them, not against them. -``` +*“Qdrant gave us sub-second hybrid search, simplified our stack, and unlocked better conversions. It’s the infrastructure foundation for our entire agent platform.”* + — Kang-Chi Ho -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.HnswConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +### -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +<|page-298-lllmstxt|> +![Gooddata Overview](/blog/case-study-gooddata/gooddata-bento-box-dark.jpg) +### GoodData's Evolution into AI-Powered Analytics -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setHnswConfig(HnswConfigDiff.newBuilder().setPayloadM(16).setM(0).build()) - .build()) - .get(); +AI is redefining how people interact with data, pushing analytics platforms beyond static dashboards toward intelligent, conversational experiences. 
While traditionally recognized as a powerful BI platform, GoodData is laser-focused on accelerating both 'time to insight' and 'time to solution' by enhancing productivity for analysts and business users alike. -``` +What sets GoodData apart is its unique position in the market: a composable, API-first platform designed for teams that build data products, not just consume them. With deep support for white-labeled analytics, embedded use cases, and governed self-service at scale, GoodData delivers the flexibility modern organizations need. With AI being integrated across every layer of the platform, GoodData is helping their over 140,000 end customers move from traditional BI to intelligent, real-time decision-making. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +### Scaling AI Capabilities to Meet Enterprise Demands -var client = new QdrantClient("localhost", 6334); +Initially, GoodData’s prototype leveraging OpenAI faced scalability limitations. The initial version attempted to load GoodData’s entire semantic model into the AI context for every user query. This led to high compute costs, slow response times, and exceeded technical limits of large language models. Instead, directly embedding extensive semantic layers into LLM contexts proved costly, slow, and impractical. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - hnswConfig: new HnswConfigDiff { PayloadM = 16, M = 0 } -); +Jan Soubusta, Field CTO at GoodData, recalled: -``` +*"Putting the whole semantic layer directly into an LLM was unsustainable. The response times ballooned, and we consistently hit context size limits."* -```go -import ( - "context" +This shift in approach was essential, as most GoodData customers work with complex data models, often spanning tens or even hundreds of datasets and metrics, unlike traditional desktop BI tools, which typically support a single user working with a single dataset at any given time. - "github.com/qdrant/go-client/qdrant" -) +### Deploying Qdrant’s Scalable Vector Database -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +GoodData transitioned to a Retrieval-Augmented Generation (RAG) strategy, requiring a high-performance vector database. After exploring DuckDB and pgvector, GoodData chose Qdrant due to its high-availability architecture and superior performance. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - HnswConfig: &qdrant.HnswConfigDiff{ - PayloadM: qdrant.PtrOf(uint64(16)), - M: qdrant.PtrOf(uint64(0)), - }, -}) +GoodData leveraged Qdrant’s official Helm chart, deploying smoothly in Kubernetes and efficiently managing near-real-time embedding updates, crucial for multilingual semantic layers. -``` +![Old Approach](/blog/case-study-gooddata/gooddata-diagram-1.png) -3. Create keyword payload index for `group_id` field. +### Real-Time Performance and Scalability Gains -httppythontypescriptrustjavacsharpgo +For enterprise customers embedding GoodData as whitelabeled analytics solutions, these improvements mean AI assistants can respond as quickly as a human analyst, responding with relevant metrics, dashboards, or insights in mere seconds. 
-```http -PUT /collections/{collection_name}/index -{ - "field_name": "group_id", - "field_schema": { - "type": "keyword", - "is_tenant": true - } -} +And adopting Qdrant brought significant benefits to empower GoodData’s AI assistant to do this, include: -``` +* Embedding updates completed in seconds (hundreds to thousands per minute). +* Semantic search results returned within 100 milliseconds. +* AI assistant responses are maintained at approximately 5–10 seconds, with Qdrant’s latency overhead negligible. -```python -client.create_payload_index( - collection_name="{collection_name}", - field_name="group_id", - field_schema=models.KeywordIndexParams( - type="keyword", - is_tenant=True, - ), -) +Jan emphasized performance efficiency: -``` +*"The overhead from Qdrant is negligible; queries run in tens of milliseconds, making it ideal for real-time analytics applications."* -```typescript -client.createPayloadIndex("{collection_name}", { - field_name: "group_id", - field_schema: { - type: "keyword", - is_tenant: true, - }, -}); +### Positioned for Advanced AI Growth -``` +This shift positions GoodData not just as a dashboard provider, but as a foundation for next-gen AI applications embedded directly into enterprise products. In the future, GoodData expects to add significant personalization via -```rust -use qdrant_client::qdrant::{ - CreateFieldIndexCollectionBuilder, - KeywordIndexParamsBuilder, - FieldType -}; -use qdrant_client::{Qdrant, QdrantError}; +* AI-powered data stories +* Turnkey agent-to-agent interfaces and orchestration +* GenAI- supported findops optimization +* Perceptive "always on" analytics. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Qdrant provides GoodData a stable, scalable foundation for expanding into document-based semantic search and ontology management. With Qdrant, GoodData confidently supports complex, real-time AI use cases, enhancing end-user accessibility and productivity. -client.create_field_index( - CreateFieldIndexCollectionBuilder::new( - "{collection_name}", - "group_id", - FieldType::Keyword, - ).field_index_params( - KeywordIndexParamsBuilder::default() - .is_tenant(true) - ) - ).await?; +![New Approach](/blog/case-study-gooddata/gooddata-diagram-2.png) +*Architecture to enable real-time monitoring and AI-driven search capabilities in Kubernetes environment.* -``` +### Additional resources -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.PayloadIndexParams; -import io.qdrant.client.grpc.Collections.PayloadSchemaType; -import io.qdrant.client.grpc.Collections.KeywordIndexParams; +1. [GoodData live demo](https://www.gooddata.com/request-a-demo/) +2. [Free GoodData trial](https://registration.cloud.gooddata.com/register?_gl=1*oqcu0a*_gcl_au*MTk0NDk5NjYyOC4xNzIzNzI2Njk4) +3. [Qdrant Cloud signup](https://cloud.qdrant.io/signup) -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +<|page-299-lllmstxt|> +> From lecture halls to production pipelines, [Qdrant Stars](https://qdrant.tech/stars/) -- founders, mentors and open-source contributors -- share how they’re building with vectors in the wild. +> In this post, Clelia distils tips from her talk at the [“Bavaria, Advancements in SEarch Development” meetup](https://lu.ma/based_meetup), where she covered hard-won lessons from her extensive open-source building. 
-client - .createPayloadIndexAsync( - "{collection_name}", - "group_id", - PayloadSchemaType.Keyword, - PayloadIndexParams.newBuilder() - .setKeywordIndexParams( - KeywordIndexParams.newBuilder() - .setIsTenant(true) - .build()) - .build(), - null, - null, - null) - .get(); +*Hey there, vector space astronauts!* -``` +*I am Clelia, an Open Source Engineer at [LlamaIndex](https://www.llamaindex.ai/). In the last two years, I've dedicated myself to the AI space, building (and breaking) many things, and sometimes even deploying them to production!* -```csharp -using Qdrant.Client; +*I spent quite a bit of time messing around with LLMs, vector databases and AI agents, and I would really love to share with you some tips and tricks related to vector search. These insights do not only come from my theoretical knowledge but the real-world, hands-on experience I gained while building vector search-powered applications.* -var client = new QdrantClient("localhost", 6334); +*Let's dive in!* -await client.CreatePayloadIndexAsync( - collectionName: "{collection_name}", - fieldName: "group_id", - schemaType: PayloadSchemaType.Keyword, - indexParams: new PayloadIndexParams - { - KeywordIndexParams = new KeywordIndexParams - { - IsTenant = true - } - } -); +--- -``` +## A Quick Overview of Textual RAG -```go -import ( - "context" +The suggestions in this blog post are all related to **textual vector search**, powering many Retrieval Augmented Generation (RAG) applications. Well, how does RAG work? - "github.com/qdrant/go-client/qdrant" -) +Very briefly, you can break it down into three steps: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +- **Data preparation**: prepare your data by extracting raw text from files and chunking it into smaller, digestible pieces. +- **Embedding**: choose a text embedding model and use it to produce vectorized representations of your text chunks. +- **Upload to DB and serve**: upload embeddings to a vector database (such as Qdrant) and serve the database as a retrieval endpoint in your pipeline. -client.CreateFieldIndex(context.Background(), &qdrant.CreateFieldIndexCollection{ - CollectionName: "{collection_name}", - FieldName: "group_id", - FieldType: qdrant.FieldType_FieldTypeKeyword.Enum(), - FieldIndexParams: qdrant.NewPayloadIndexParams( - &qdrant.KeywordIndexParams{ - IsTenant: qdrant.PtrOf(true), - }), -}) +When the user asks a question, context will be *retrieved* from the database and provided to the LLM, which will then *generate* content *augmented* by the retrieved information. -``` +## Text Extraction: Your Best Friend and Worst Enemy -`is_tenant=true` parameter is optional, but specifying it provides storage with additional information about the usage patterns the collection is going to use. -When specified, storage structure will be organized in a way to co-locate vectors of the same tenant together, which can significantly improve performance in some cases. +![text-extraction](/blog/hitchhikers-guide/sep_1.png) -## [Anchor](https://qdrant.tech/documentation/guides/multiple-partitions/\#limitations) Limitations +Text extraction is a crucial step: having clean, well-structured raw text can be game-changing for all the downstream steps of your RAG, especially to make the retrieved context easily “understandable” for the LLM. -One downside to this approach is that global requests (without the `group_id` filter) will be slower since they will necessitate scanning all groups to identify the nearest neighbors. 
+You can perform text extraction in various ways, for example: -##### Was this page useful? +**Object-based parsing**, such as the one provided by [PyPDF](https://pypi.org/project/pypdf/) or [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/), is fast and cheap for text-only documents but might fail to capture the complexity of tables, images and other visual elements many files can have nowadays. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +**Agentic and OCR-based parsing**, such as the one offered by [LlamaParse](https://www.llamaindex.ai/llamaparse), which is often an excellent choice for handling complex documents, and other methods, including **using Visual Language Retrievers** such as [ColPali&ColQwen](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/), which are also very effective, especially with scanned or image-dense files. -Thank you for your feedback! 🙏 +#### My Advice -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/multiple-partitions.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Trust your guts. If a solution seems to give you good-quality raw text, go for it! -On this page: +If you want a real-world example of how good vs bad text extraction makes the difference, you can check out a project I built, [PapersChat](https://github.com/AstraBert/PapersChat). +It allows you to use LlamaParse or simple parsing with PyPDF to extract text from your papers and chat with them. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/multiple-partitions.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Chunking Is All You Need -× +![chunking](/blog/hitchhikers-guide/sep_2.png) -[Powered by](https://qdrant.tech/) +Chunking might really make the difference between a successful and a failing RAG pipeline. -<|page-162-lllmstxt|> -## api-reference -- [Documentation](https://qdrant.tech/documentation/) -- [Private cloud](https://qdrant.tech/documentation/private-cloud/) -- API Reference +Chunking means breaking large text down into pieces, which will be given to the LLM to perform augmented generation. It is then crucial to break the text down into chunks that are meaningful. -# [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#api-reference) API Reference +If you want to have the text divided into sentences, it might make sense to use **sentence- or token-based chunkers**, while using **semantic- or embeddings-based (late) chunking** is more suitable for separating paragraphs. -## [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#packages) Packages +**Agentic or neural chunking** should be used when you want to isolate higher-order semantic units, such as all the information about crocodiles in a paper dedicated to reptiles. -- [qdrant.io/v1](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantiov1) +#### My Advice -## [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantiov1) qdrant.io/v1 +Effective chunking boils down to one thing: the complexity of the textual representation you want as an output. 
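+To make the simplest of these strategies concrete, here is a minimal sketch of a sentence-based chunker with a word budget and a one-sentence overlap. It is plain Python with no particular library assumed, and the budget and overlap values are arbitrary defaults:

+```python
+import re
+
+def sentence_chunks(text: str, max_words: int = 200, overlap: int = 1) -> list[str]:
+    """Greedily pack sentences into chunks of roughly `max_words` words,
+    repeating the last `overlap` sentences at the start of the next chunk."""
+    # Naive sentence split: break on ., ! or ? followed by whitespace.
+    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
+    chunks, current = [], []
+    for sentence in sentences:
+        current.append(sentence)
+        if sum(len(s.split()) for s in current) >= max_words:
+            chunks.append(" ".join(current))
+            # Keep a small overlap so no chunk starts mid-thought.
+            current = current[-overlap:] if overlap else []
+    if current:
+        chunks.append(" ".join(current))
+    return chunks
+```

+Swapping the sentence splitter for a token counter, an embedding-similarity check or an LLM call is what turns this baseline into the token-based, semantic or agentic variants described above.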
-Package v1 contains API Schema definitions for the qdrant.io v1 API group +A very easy-to-use library for all the chunking strategies I mentioned is [chonkie](https://chonkie.ai/) -- give it a try, you won't be disappointed! -### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#resource-types) Resource Types - -- [QdrantCloudRegion](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregion) -- [QdrantCloudRegionList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionlist) -- [QdrantCluster](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcluster) -- [QdrantClusterList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterlist) -- [QdrantClusterRestore](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestore) -- [QdrantClusterRestoreList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorelist) -- [QdrantClusterScheduledSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshot) -- [QdrantClusterScheduledSnapshotList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshotlist) -- [QdrantClusterSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshot) -- [QdrantClusterSnapshotList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshotlist) -- [QdrantEntity](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentity) -- [QdrantEntityList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentitylist) -- [QdrantRelease](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantrelease) -- [QdrantReleaseList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantreleaselist) - -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#clusterphase) ClusterPhase +You can also check out how **abstract syntax trees can be used to parse and chunk code files** in [Code-RAGent](https://github.com/AstraBert/code-ragent), a RAG agent built on top of my GO codebase, so I could retrieve code concepts and implementation I forgot or have doubts about. -_Underlying type:_ _string_ +--- -_Appears in:_ +## Embeddings: Catch ‘em All! -- [QdrantClusterStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterstatus) +![embeddings](/blog/hitchhikers-guide/sep_3.png) -| Field | Description | -| --- | --- | -| `Creating` | | -| `FailedToCreate` | | -| `Updating` | | -| `FailedToUpdate` | | -| `Scaling` | | -| `Upgrading` | | -| `Suspending` | | -| `Suspended` | | -| `FailedToSuspend` | | -| `Resuming` | | -| `FailedToResume` | | -| `Healthy` | | -| `NotReady` | | -| `RecoveryMode` | | -| `ManualMaintenance` | | +Embedding text equals generating a numerical representation of it. For example, as a target representation, you could choose *dense* or *sparse* vectors. The difference is in what they capture: dense embeddings are the best at broadly catching the semantic nuances of the text, while sparse embeddings precisely pick up its keywords. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#componentphase) ComponentPhase +The good news is you don't have to pick one with a [hybrid search](https://qdrant.tech/articles/hybrid-search/). Hybrid search combines results from both a dense (semantic) search and a sparse (keyword) search. 
-_Underlying type:_ _string_ +But how to combine? -_Appears in:_ +- You can do it by **rescoring** the vectors: you re-embed (often with a dense or multivector embedding model) all the texts associated with the retrieved vectors, both dense and sparse, and then find the most similar vector to the query. Alternatively, you could use cross-encoders. +- You can also **fuse** the results, for example, assigning them points based on their position in both rankings (sparse and dense) and then summing these points up. The chunk with the most points is the best match! -- [ComponentStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#componentstatus) +#### My Advice -| Field | Description | -| --- | --- | -| `Ready` | | -| `NotReady` | | -| `Unknown` | | -| `NotFound` | | +Hybrid search is not the solution for all retrieval problems, yet it can be a valuable choice when you want to retain semantic discrimination while still relying on keyword-based search. An example could be RAG applications in the legal or medical domains. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#componentreference) ComponentReference +If you want a quick start on a hybrid search, check out [Pokemon-Bot](https://github.com/AstraBert/Pokemon-Bot), a simple Discord bot built on top of Qdrant and Cohere. -_Appears in:_ +## Search Boosting 101 -- [QdrantCloudRegionSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionspec) +![search-boosting](/blog/hitchhikers-guide/sep_4.png) -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | APIVersion is the group and version of the component being referenced. | | | -| `kind` _string_ | Kind is the type of component being referenced | | | -| `name` _string_ | Name is the name of component being referenced | | | -| `namespace` _string_ | Namespace is the namespace of component being referenced. | | | -| `markedForDeletion` _boolean_ | MarkedForDeletion specifies whether the component is marked for deletion | | | +Search boosting is something everybody wants: less compute, reduced latency and, overall, faster and more efficient pipelines that can make the UX way smoother. I’ll mention two of them. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#componentstatus) ComponentStatus +### Semantic caching -_Appears in:_ +Say your Qdrant vector database has lots of information, so searching through it requires time. It is easily imaginable, though, that many users will ask similar questions in our RAG application, and it makes sense to provide them with the same answer. -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +*Here comes semantic caching:* you should create a second, smaller vector database instance (collection in Qdrant), where you can store the vectorized representation of the questions asked by the users and, in the associated metadata, the answer to that question. 
-| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `name` _string_ | Name specifies the name of the component | | | -| `namespace` _string_ | Namespace specifies the namespace of the component | | | -| `version` _string_ | Version specifies the version of the component | | | -| `phase` _[ComponentPhase](https://qdrant.tech/documentation/private-cloud/api-reference/#componentphase)_ | Phase specifies the current phase of the component | | | -| `message` _string_ | Message specifies the info explaining the current phase of the component | | | +Then, before running the whole RAG pipeline, you perform a quick search within your semantic cache to see if any question is similar to the one asked before. You simply return the associated answer instead of producing one de-novo, saving you a lot of time and compute! -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#entityphase) EntityPhase +### Binary quantization -_Underlying type:_ _string_ +Binary quantization is also something that can help you, especially if you have tons of documents (we’re talking millions). A large dataset is a performance challenge and a memory problem: embeddings from providers like OpenAI can have 1536 dimensions, meaning almost 6 kB per full-precision embedding! -_Appears in:_ +*And here it comes:* taking the vector as a list of floating point numbers, binary quantization converts it to a list of 0s and 1s based on mathematical rounding. Such a representation significantly reduces the memory footprint and makes it easier for your search algorithm to compare vector representations. -- [QdrantEntityStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentitystatus) +Using binary quantization comes with the natural question: *“Are the search results as good as if I were using the non-quantized vectors?”* Generally, they aren’t *as* good, which is why they should be combined with the other techniques discussed above, such as rescoring. -| Field | Description | -| --- | --- | -| `Creating` | | -| `Ready` | | -| `Updating` | | -| `Failing` | | -| `Deleting` | | -| `Deleted` | | +#### My Advice -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#entityresult) EntityResult +Do you want to build a production-ready system with semantic caching and quantization? Then, you might want to look at [PhiQwenSTEM](https://github.com/AstraBert/PhiQwenSTEM), a learning assistant with access to 15,000+ documents to reply to your STEM questions! -_Underlying type:_ _string_ +## Querying Makes the Difference -EntityResult is the last result from the invocation to a manager +![querying](/blog/hitchhikers-guide/sep_5.png) -_Appears in:_ +A common error in RAG pipelines is that you curate every detail, but you do not take into account one key aspect: **queries**. -- [QdrantEntityStatusResult](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentitystatusresult) +Most of the time, queries are *too generic* or *too specific* to pick up the knowledge embedded in your vector database. There are, nevertheless, some magic tricks you can apply to optimize querying for your use case. 
-| Field | Description | -| --- | --- | -| `Ok` | | -| `Pending` | | -| `Error` | | +#### My Advice -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#gpu) GPU +Here’s my rule of thumb: -_Appears in:_ +| Query Type | Transformation to Apply | +|--------------|--------------------------| +| **Generic query:** too broad, may confound the search process | **Expansion:** you can transform your query into a hypothetical document (HyDE), using a language model to generate a more detailed query text, which you can embed and use in retrieval. | +| **Specific query:** asks for some specific information that can be easily identified within your documents | **No transformation** in this case, you can retrieve directly! | +| **Complex query:** has many questions that generally cannot be answered by just one of the documents within your database | **Decomposition:** You can divide the query into several sub-queries, each of which is used for a multistep-like retrieval from the database. The results from each step will then form the final answer from the LLM. | -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +How to assign a query type? -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `gpuType` _[GPUType](https://qdrant.tech/documentation/private-cloud/api-reference/#gputype)_ | GPUType specifies the type of the GPU to use. If set, GPU indexing is enabled. | | Enum: \[nvidia amd\] | -| `forceHalfPrecision` _boolean_ | ForceHalfPrecision for `f32` values while indexing.
`f16` conversion will take place
only inside GPU memory and won’t affect storage type. | false | | -| `deviceFilter` _string array_ | DeviceFilter for GPU devices by hardware name. Case-insensitive.
List of substrings to match against the gpu device name.
Example: \[- “nvidia”\]
If not specified, all devices are accepted. | | MinItems: 1 | -| `devices` _string array_ | Devices is a List of explicit GPU devices to use.
If the host has multiple GPUs, this option allows selecting specific devices
by their index in the list of found devices.
If `deviceFilter` is set, indexes are applied after filtering.
If not specified, all devices are accepted. | | MinItems: 1 | -| `parallelIndexes` _integer_ | ParallelIndexes is the number of parallel indexes to run on the GPU. | 1 | Minimum: 1 | -| `groupsCount` _integer_ | GroupsCount is the amount of used vulkan “groups” of GPU.
In other words, how many parallel points can be indexed by GPU.
Optimal value might depend on the GPU model.
Proportional, but doesn’t necessarily equal the physical number of warps.
Do not change this value unless you know what you are doing. | | Minimum: 1 | -| `allowIntegrated` _boolean_ | AllowIntegrated specifies whether to allow integrated GPUs to be used. | false | | +Well, you can build an agentic system for automated choice of query transformation, like in my [RAGcoon](https://github.com/AstraBert/RAGcoon). There, a query agent tries various retrieval techniques to get the best information for startup founders. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#gputype) GPUType +## Don’t Drown in Evals -_Underlying type:_ _string_ +![evalustion](/blog/hitchhikers-guide/sep_6.png) -GPUType specifies the type of GPU to use. +Ideas turned into implementations are cool, yet only eval metrics can tell whether your project delivers real value and has a go-to-market potential. -_Validation:_ +It is very easy, though, to drown in all of the evaluation frameworks, strategies and metrics out there. It is also common to get medium-to-good results on some metrics when evaluating the first implementation of your product and happily stopping the development while actually there is a huge room for improvement. -- Enum: \[nvidia amd\] +#### My Advice -_Appears in:_ +Here are my two suggestions for anyone who’s fallen into these traps: -- [GPU](https://qdrant.tech/documentation/private-cloud/api-reference/#gpu) +- **Simple is better than complex**: as the Python zen motto states, your implementations must be simple, easy to understand, and read. Using intuitive metrics, such as the *hit rate* or the *mean reciprocal ranking (MRR)* in a retrieval pipeline or the *faithfulness* and *relevancy* of the responses generated by your LLM, is the best way to understand the product you are building. There is always time to test and tweak complex metrics, but later! +- **Iterate, iterate, iterate**: It is easy to get something simple and somewhat good up and running, but the key for real, production-ready software is to iterate on it. Build on the simple things and make them better with every round of tweaking and evaluation. Once you enter the territory where your products start to become good enough, test them again on more complex metrics. There is no fixed limit for iterations, but one is never enough! -| Field | Description | -| --- | --- | -| `nvidia` | | -| `amd` | | +If you want to start with something that can give you immediate insights on POCs combining different embedding and language models, you can try [diRAGnosis](https://github.com/AstraBert/diRAGnosis), a lightweight framework that informs you with simple metrics on how well your models are performing. 
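+To make those two "simple metrics" concrete, here is a minimal sketch of hit rate and MRR computed over a list of retrieval runs. The data layout is hypothetical (each run is the ranked list of retrieved chunk ids plus the id of the expected chunk); it is only meant to show how little code a first evaluation loop needs:

+```python
+def hit_rate(runs: list[tuple[list[str], str]], k: int = 10) -> float:
+    """Share of queries whose expected chunk shows up in the top-k results."""
+    hits = sum(1 for retrieved, expected in runs if expected in retrieved[:k])
+    return hits / len(runs)
+
+def mean_reciprocal_rank(runs: list[tuple[list[str], str]]) -> float:
+    """Average of 1 / rank of the expected chunk (counted as 0 when it is missing)."""
+    total = 0.0
+    for retrieved, expected in runs:
+        if expected in retrieved:
+            total += 1.0 / (retrieved.index(expected) + 1)
+    return total / len(runs)
+
+# Two toy queries: the expected chunk is ranked 1st in the first run, 3rd in the second.
+runs = [(["a", "b", "c"], "a"), (["d", "e", "f"], "f")]
+print(hit_rate(runs, k=3), mean_reciprocal_rank(runs))  # -> 1.0 0.666...
+```

+Faithfulness and relevancy need an LLM (or a human) judge in the loop; the retrieval side, though, really can start this small and be iterated on from there.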
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#helmrelease) HelmRelease +--- -_Appears in:_ +*I am leaving you (hopefully) with some key takeaways.* +*One last might sound like a given, but I guarantee it is not for everyone: keep exploring, building and breaking things – that’s the only way to learn!* -- [QdrantCloudRegionSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionspec) +*If you would like to, follow my journey in AI space through my [social media](https://link.clelia.dev/), and see you soon, vector space astronauts!* -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `markedForDeletionAt` _string_ | MarkedForDeletionAt specifies the time when the helm release was marked for deletion | | | -| `object` _[HelmRelease](https://qdrant.tech/documentation/private-cloud/api-reference/#helmrelease)_ | Object specifies the helm release object | | EmbeddedResource: {} | +<|page-300-lllmstxt|> +# How FAZ Built a Hybrid Search Engine with Qdrant to Unlock 75 Years of Journalism -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#helmrepository) HelmRepository +[Frankfurter Allgemeine Zeitung (FAZ)](https://www.frankfurterallgemeine.de/die-faz), a major national newspaper in Germany, has spent decades building a rich archive of journalistic content, stretching back to 1949\. The FAZ archive has long built expertise in making its extensive collection of over 75 years accessible and searchable for both internal and external customers through keyword- and index-based search engines. New AI-powered search technologies were therefore immediately recognized as an opportunity to unlock the potential of the comprehensive archive in entirely new ways and to systematically address the limitations of traditional search methods. The solution they arrived at involved a thoughtful orchestration of technologies \- with Qdrant at the heart. -_Appears in:_ +This undertaking was driven by a cross-functional team: -- [QdrantCloudRegionSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionspec) +* **Jens Peter Kutz**, AI project lead for the archive, spearheaded semantic search and indexing efforts. +* **Hans Peter Troetscher**, Department Head of Data Management and Applications, oversaw the system architecture and overall vision. +* **RenĂ© Weber**, IT Systems Administrator, focused on the challenges of deployment and orchestration within FAZ’s Azure environment. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `markedForDeletionAt` _string_ | MarkedForDeletionAt specifies the time when the helm repository was marked for deletion | | | -| `object` _[HelmRepository](https://qdrant.tech/documentation/private-cloud/api-reference/#helmrepository)_ | Object specifies the helm repository object | | EmbeddedResource: {} | +## The Challenge: Turning an Archive into a Research Engine -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#inferenceconfig) InferenceConfig +FAZ’s archive includes tens of millions of articles \- from modern digital content to OCR-scanned historical documents. Editors and researchers needed a way to move beyond simple keyword matching and unlock deeper insights from their archive. A new search experience would need to understand meaning, support structured filters, and operate within strict infrastructure constraints. 
-_Appears in:_ +The team set out to build a semantic search platform as a first step. Early experiments embedded article paragraphs using Azure OpenAI’s text-embedding-3-large model, resulting in high-dimensional vector representations of content. Embedding was performed on a paragraph-by-paragraph basis to ensure relevance and granularity. The team also tested how the system performed with retro-digitized content containing OCR errors from scanned sources. -- [QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration) +## Why Qdrant Was the Right Fit -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `enabled` _boolean_ | Enabled specifies whether to enable inference for the cluster or not. | false | | +From the outset, FAZ had specific technical and organizational needs: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#ingress) Ingress +* The solution had to run within their own **Azure Kubernetes Service (AKS)** environment. +* It had to support complex, schema-less **metadata payloads** for every paragraph. +* It had to support **real-time updates and deletions** for articles that are corrected or depublished. +* And it needed to offer **sub-second performance** at scale, despite constant data updates. -_Appears in:_ +Qdrant checked all the boxes. Its [hybrid cloud deployment model](https://qdrant.tech/hybrid-cloud/) gave FAZ full control over infrastructure and privacy. The ability to associate each vector with rich metadata \- including over 60 fields like author, date, and article type \- was critical. And with native support for **scalar quantization**, HNSW indexing, and fast upserts, Qdrant could keep pace with the daily demands of a live newsroom. -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +The developer experience was another major win. As Jens Peter Kutz explained, "The documentation is structured, clear, and immensely helpful \- especially when ramping up on an entirely new stack." -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `enabled` _boolean_ | Enabled specifies whether to enable ingress for the cluster or not. | | | -| `annotations` _object (keys:string, values:string)_ | Annotations specifies annotations for the ingress. | | | -| `ingressClassName` _string_ | IngressClassName specifies the name of the ingress class | | | -| `host` _string_ | Host specifies the host for the ingress. | | | -| `tls` _boolean_ | TLS specifies whether to enable tls for the ingress.
The default depends on the ingress provider:
\- KubernetesIngress: False
\- NginxIngress: False
\- QdrantCloudTraefik: Depending on the config.tls setting of the operator. | | | -| `tlsSecretName` _string_ | TLSSecretName specifies the name of the secret containing the tls certificate. | | | -| `nginx` _[NGINXConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#nginxconfig)_ | NGINX specifies the nginx ingress specific configurations. | | | -| `traefik` _[TraefikConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#traefikconfig)_ | Traefik specifies the traefik ingress specific configurations. | | | +## Metadata: The Backbone of Intelligent Search -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#kubernetesdistribution) KubernetesDistribution +One of Qdrant’s most powerful features for FAZ is its ability to handle rich metadata payloads. Each embedded paragraph is associated with fields like: -_Underlying type:_ _string_ +* Publication date +* Author and section +* Article type (e.g., editorial, interview) +* Word count +* Source type (e.g., print vs. online) +* Indexing metadata and extraction confidence levels -_Appears in:_ +These payloads allow users to filter results by time range, author, section, or even article length. FAZ built a UI that lets users apply these filters manually \- or have them inferred from natural language prompts by GPT-4. -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +Qdrant's ability to handle over 60 payload fields and frequent updates is crucial for FAZ's workflow, as articles need to be continuously updated and sometimes removed, particularly with online content being more dynamic than print articles. The system needs to manage daily updates as new content is published and existing articles are modified. -| Field | Description | -| --- | --- | -| `unknown` | | -| `aws` | | -| `gcp` | | -| `azure` | | -| `do` | | -| `scaleway` | | -| `openshift` | | -| `linode` | | -| `civo` | | -| `oci` | | -| `ovhcloud` | | -| `stackit` | | -| `vultr` | | -| `k3s` | | +Additionally, FAZ enriches the user experience by retrieving adjacent context vectors, such as preceding and following paragraphs, to generate fluent and explainable answers. This context stitching is dynamically computed during query time. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#kubernetespod) KubernetesPod +## Performance at Scale -_Appears in:_ +The current system indexes over 14 million vectors across several decades, with a target of 40-50 million vectors covering the full archive. Ingest operations occur daily, as new publications are embedded and indexed. FAZ uses quantization to optimize memory usage and maximize throughput across search and update pipelines. -- [KubernetesStatefulSet](https://qdrant.tech/documentation/private-cloud/api-reference/#kubernetesstatefulset) +Benchmarking results during internal testing showed that Qdrant consistently delivered **\<1s response times** on full-archive similarity search \- despite applying complex payload filters and returning rich, annotated metadata with each result. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the Pods. | | | -| `labels` _object (keys:string, values:string)_ | Labels specifies the labels for the Pods. 
| | | -| `extraEnv` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | ExtraEnv specifies the extra environment variables for the Pods. | | | +Custom ingestion scripts in Python integrate with OpenAI’s embedding service and Qdrant’s API, handling everything from embedding to payload assembly and indexing. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#kubernetesservice) KubernetesService +## A Foundation for AI-Driven Journalism -_Appears in:_ +FAZ has built a powerful search system that makes over seven decades of journalism accessible and relevant for modern editorial workflows. By combining Azure OpenAI’s semantic embeddings with Qdrant’s metadata-aware vector search, they’ve developed a hybrid solution that understands both language and structure. The system delivers fast, relevant results with highlighted context and similarity scores, enabling journalists to explore their archive more intuitively than ever before. -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +With millions of vectors already indexed and ongoing plans to scale across the full archive, FAZ is not only setting a new benchmark for archival search, but also laying the groundwork for next-generation capabilities that will further enhance precision, flexibility, and editorial control. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `type` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | Type specifies the type of the Service: “ClusterIP”, “NodePort”, “LoadBalancer”. | ClusterIP | | -| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the Service. | | | +## What’s Next: Building Toward Hybrid Search -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#kubernetesstatefulset) KubernetesStatefulSet +The initial system focuses on dense vector similarity to support natural language queries. A user might ask, “Why is inflation rising?” and receive answer passages pulled from semantically relevant articles across decades of FAZ journalism. This semantic-first approach was intentional. The FAZ team chose to first build a search engine in its purest semantic form to better understand its capabilities, advantages, and limits. -_Appears in:_ +As the team gained experience with the semantic search engine, they began to see opportunities to expand its capabilities. While dense vector retrieval works well for exploratory and abstract queries, future enhancements by the team will focus on supporting workflows that involve searching for exact names, dates, or publication references. These use cases present a clear opportunity to complement semantic search with keyword-based retrieval and structured filtering, bringing greater precision and control to the platform. -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +To address these needs, FAZ is now planning the next stage of its platform: a hybrid search architecture that combines the strengths of both semantic and symbolic retrieval. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `annotations` _object (keys:string, values:string)_ | Annotations specifies the annotations for the StatefulSet. 
| | | -| `pods` _[KubernetesPod](https://qdrant.tech/documentation/private-cloud/api-reference/#kubernetespod)_ | Pods specifies the configuration of the Pods of the Qdrant StatefulSet. | | | +The new system combines: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#metricsource) MetricSource +* **Dense vector embeddings** for broad semantic understanding +* **Sparse vectors (e.g. BM25-like)** using Qdrant’s native support for hybrid search +* **Structured metadata filtering** (authors, sections, date ranges) +* **Query interpretation via GPT-4** for automatic filter setting and query routing -_Underlying type:_ _string_ +"We’re not just building a search tool \- we’re building a search interpreter. And Qdrant is a central node in that architecture.", said Hans Peter Troetscher -_Appears in:_ +This direction promises to give journalists and researchers a search experience that combines semantic understanding with precise control, supporting both intuitive exploration and exact retrieval across decades of content. -- [Monitoring](https://qdrant.tech/documentation/private-cloud/api-reference/#monitoring) +<|page-301-lllmstxt|> +# Scaled Vector & Graph Retrieval: How Lettria Unlocked 20% Accuracy Gains with Qdrant & Neo4j -| Field | Description | -| --- | --- | -| `kubelet` | | -| `api` | | +![Lettria increases accuracy by 20% by blending Qdrant's vector search and Neo4j's knowledge graphs](/blog/case-study-lettria/lettria-bento-dark.jpg) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#monitoring) Monitoring +## Why Complex Document Intelligence Needs More Than Just Vector Search -_Appears in:_ +In regulated industries where precision, auditability, and accuracy are paramount, leveraging Large Language Models (LLMs) effectively often requires going beyond traditional Retrieval-Augmented Generation (RAG). [Lettria](https://www.lettria.com/), a leader in document intelligence platforms, recognized that complex, highly regulated data sets like pharmaceutical research, legal compliance, and aerospace documentation demanded superior accuracy and more explainable outputs than vector-only RAG systems could provide. To achieve the expected level of performance, the team has focused its effort on building a very robust document parsing engine designed for complex pdf (with tables, diagrams, charts etc.), an automatic ontology builder and an ingestion pipeline covering vectors and graph enrichment -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +By integrating vector search capabilities from Qdrant with knowledge graphs powered by Neo4j, Lettria created a hybrid graph RAG system that significantly boosted accuracy and enriched the context provided to LLMs. This case study explores Lettria's innovative solution, technical challenges overcome, and measurable results achieved. 
-| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cAdvisorMetricSource` _[MetricSource](https://qdrant.tech/documentation/private-cloud/api-reference/#metricsource)_ | CAdvisorMetricSource specifies the cAdvisor metric source | | | -| `nodeMetricSource` _[MetricSource](https://qdrant.tech/documentation/private-cloud/api-reference/#metricsource)_ | NodeMetricSource specifies the node metric source | | | +## Why Traditional RAG Fell Short in High-Stakes Use Cases -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#nginxconfig) NGINXConfig +Enterprises in regulated sectors deal with extensive, complex documentation featuring structured and semi-structured data such as intricate tables, multi-layered diagrams, and specialized terminology. Standard vector search methods achieved around 70% accuracy, which is insufficient for industries where precision is non-negotiable. Additionally, understanding and auditing LLM outputs based on complex documentation posed significant hurdles. -_Appears in:_ +## Why Qdrant stood out as a vector database -- [Ingress](https://qdrant.tech/documentation/private-cloud/api-reference/#ingress) +One component of the build was the vector database. Lettria evaluated Weaviate, Milvus, and Qdrant based on their hybrid search capability, deployment simplicity (Docker, Kubernetes), and search performance (latency, RAM usage). -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `allowedSourceRanges` _string array_ | AllowedSourceRanges specifies the allowed CIDR source ranges for the ingress. | | | -| `grpcHost` _string_ | GRPCHost specifies the host name for the GRPC ingress. | | | +Ultimately, Lettria chose Qdrant. First, it had a simple Kubernetes deployment, superior latency and lower memory footprint in competitive benchmarks. Additionally, there were unique features, such as the grouping API and detailed [payload indexing](https://qdrant.tech/documentation/concepts/payload/), that made Qdrant stand out. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#nodeinfo) NodeInfo +## Building the document understanding and extraction pipeline -_Appears in:_ +The core of Lettria's high accuracy solution lies in merging vector embeddings (stored in Qdrant) with graph-based semantic understanding (Neo4j). Here’s an overview of their pipeline: -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +1. **Ingestion**: Complex PDFs are parsed, and data is transformed into dual representations: **dense vector embeddings** and **semantic triples** (stored in Neo4j and indexed in Qdrant). As shown in the diagram below, the ingestion pipeline extracts layout and content structure, splits text into meaningful chunks, and routes them into both vector and graph representations. Each chunk maintains **lineage metadata**, linking it back to its exact position in the source document—critical for traceability. 
-| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `name` _string_ | Name specifies the name of the node | | | -| `region` _string_ | Region specifies the region of the node | | | -| `zone` _string_ | Zone specifies the zone of the node | | | -| `instanceType` _string_ | InstanceType specifies the instance type of the node | | | -| `arch` _string_ | Arch specifies the CPU architecture of the node | | | -| `capacity` _[NodeResourceInfo](https://qdrant.tech/documentation/private-cloud/api-reference/#noderesourceinfo)_ | Capacity specifies the capacity of the node | | | -| `allocatable` _[NodeResourceInfo](https://qdrant.tech/documentation/private-cloud/api-reference/#noderesourceinfo)_ | Allocatable specifies the allocatable resources of the node | | | +![Ingestion tracking mechanism](/blog/case-study-lettria/ingestion-tracking-mechanism.png) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#noderesourceinfo) NodeResourceInfo +*Diagram: Ingestion Transaction Mechanism* -_Appears in:_ +2. **Ontology Generation**: Lettria automatically generates ontologies using LLMs, ensuring scalability and adaptability. This step ensures that only semantically meaningful relationships are extracted—reducing noise and enabling structured querying downstream. -- [NodeInfo](https://qdrant.tech/documentation/private-cloud/api-reference/#nodeinfo) +3. **Vector-Driven Graph Expansion**: Queries begin with fast vector search in Qdrant, identifying nodes and relationships as text embeddings. These seed points are then used to expand a contextual subgraph in Neo4j, which is combined with chunk data and passed to the LLM for answer generation. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cpu` _string_ | CPU specifies the CPU resources of the node | | | -| `memory` _string_ | Memory specifies the memory resources of the node | | | -| `pods` _string_ | Pods specifies the pods resources of the node | | | -| `ephemeralStorage` _string_ | EphemeralStorage specifies the ephemeral storage resources of the node | | | +## The ingest transaction mechanism: keeping Neo4j and Qdrant in sync -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#nodestatus) NodeStatus +Keeping Qdrant and Neo4j in sync is challenging, as they take fundamentally different approaches to data operations. Neo4j is a transactional database, meaning it can group changes into atomic units that are either fully committed or entirely rolled back. In contrast, [Qdrant](https://qdrant.tech/qdrant-vector-database/) is a vector search engine designed to processes each update immediately without transactional semantics. This distinction is important: transactional support is typical for databases, while search engines like Qdrant prioritize low-latency ingestion and retrieval over rollback capabilities. -_Appears in:_ +This fundamental mismatch creates complexity. So if a transaction in Neo4j fails after data has already been written to Qdrant, the two databases can quickly fall out of alignment. -- [QdrantClusterStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterstatus) +To ensure atomicity between Qdrant (non-transactional) and Neo4j (transactional), Lettria built a custom ingest mechanism that guarantees consistent writes across both systems. The process begins by preparing the write as a transactional batch in Neo4j—if Neo4j accepts the changes, they’re committed and saved. 
Before updating Qdrant, a **snapshot** of each affected point is taken. Then, Qdrant is updated optimistically. If the Neo4j commit succeeds, the operation completes. But if it fails, Lettria’s system uses the earlier snapshot to **roll back Qdrant** to its previous state, ensuring no partial writes remain in either database.
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name specifies the name of the node | | |
-| `started_at` _string_ | StartedAt specifies the time when the node started (in RFC3339 format) | | |
-| `state` _object (keys: [PodConditionType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podconditiontype-v1-core), values: [ConditionStatus](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#conditionstatus-v1-core))_ | States specifies the condition states of the node | | |
-| `version` _string_ | Version specifies the version of Qdrant running on the node | | |
+The challenge arises in concurrent environments where multiple ingest processes may interact with the same data points. To handle this, Lettria implemented a **conflict resolution function** that compares three states for each point: the original snapshot, the changes proposed by the current process, and the current state in Qdrant. If a conflict is detected—such as another process modifying the point in the meantime—the resolver merges changes intelligently to preserve valid updates while rolling back only the failed batch. This strategy, combined with small batch sizes, minimizes the risk window and ensures high reliability even at scale.
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#pause) Pause
+*Pseudocode example: ingest_graph_attempt*
-_Appears in:_
+```py
+def ingest_graph_attempt(graph_data, qdrant, neo4j):
+    """
+    Attempts to ingest graph data into Qdrant and Neo4j consistently.
+    If Neo4j write fails, Qdrant changes are rolled back.
+    This mimics a "try Qdrant, then try Neo4j; if Neo4j fails, undo Qdrant" strategy.
+    """
+    BEGIN_TRANSACTION
+    neo4j.begin_transaction()
+    points = graph_data.get_triplets()
+    # Merge points in Neo4j (non-blocking)
+    neo4j.upsert(points)
+    # Load current points from Qdrant
+    snapshot = qdrant.get([point.id for point in points])
+    # Update points in Qdrant
+    qdrant.upsert(points)
+    neo4j.commit()
+    TRANSACTION_ROLLBACK
+    # Load current points state
+    current_points = qdrant.get([point.id for point in points])
+    # Build snapshot of the current state
+    resolved_snapshot = resolve_conflicts(points, current_points, snapshot)
+    # Rollback Qdrant changes
+    qdrant.upsert(resolved_snapshot)
+    TRANSACTION_SUCCESS
+    total_points = len(points)
+    added_points, updated_points = diff(points, snapshot)
+    Log(f"Successfully ingested {total_points} points into Qdrant and Neo4j.")
+    Log(f"Added {added_points} points and updated {updated_points} points.")
+```
-- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec)
+## Consistent querying and indexing through payload flattening
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `owner` _string_ | Owner specifies the owner of the pause request. | | |
-| `reason` _string_ | Reason specifies the reason for the pause request. | | |
-| `creationTimestamp` _string_ | CreationTimestamp specifies the time when the pause request was created. 
| | | +To ensure consistent query behavior and indexing performance across both Qdrant and Neo4j, Lettria adopted a **payload flattening strategy**. While Qdrant supports nested JSON-like structures in its payloads, Neo4j requires flat key–value pairs for properties on nodes and relationships. This structural mismatch made it difficult to apply consistent filters or indexing logic across both databases. Lettria resolved this by flattening all nested fields during ingestion—for example, converting `{ "author": { "name": "Jane" } }` to `{ "author_name": "Jane" }`. This approach allowed seamless reuse of the same metadata structure in Neo4j, simplifying hybrid search and enforcing schema compatibility across their dual-database architecture. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantcloudregion) QdrantCloudRegion +## Scaling to >100M vectors at \<200ms P95 retrieval -QdrantCloudRegion is the Schema for the qdrantcloudregions API +Lettria scaled its Qdrant deployment to over 100 million vectors while maintaining 95th percentile retrieval latency under 200ms, even in production-like load tests. This performance was made possible through a combination of careful payload index design and disk-based cache collections for infrequently accessed vectors. Initially, lack of indexing led to full collection scans, which significantly degraded performance. After adding indexes on frequently filtered payload fields (e.g. doc_type, client_id, chunk_source), latency dropped sharply and stabilized. To further reduce memory pressure, Lettria separated hot and cold data—keeping active chunks in memory and offloading less-used vectors to on-disk storage, allowing for finer memory tuning without sacrificing accuracy. This approach provided both speed and cost control at scale, supporting hybrid retrieval across dense and sparse modalities without excessive resource overhead. -_Appears in:_ +## Outcome: >20% accuracy improvement -- [QdrantCloudRegionList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionlist) +Lettria's graph-enhanced RAG system achieved a substantial accuracy improvement over pure vector solutions: -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantCloudRegion` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `spec` _[QdrantCloudRegionSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionspec)_ | | | | +* **20-25% accuracy uplift** in verticals like finance, aerospace, pharmaceuticals, and legal. +* Enhanced explainability and lineage tracking from document ingestion to query response. +* Robust, audit-grade accuracy accepted by clients with manageable latency (1-2 seconds per query). -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantcloudregionlist) QdrantCloudRegionList +## Ultimately, Lettria beat the accuracy of traditional RAG -QdrantCloudRegionList contains a list of QdrantCloudRegion +Being the first production-ready GraphRAG platform has helped Lettria stand out from competition vs. traditional RAG players. Creating agents has become easier with GraphRAG, helping Lettria build new document intelligence features quickly (e.g. gap analysis between multiple documents). 
This has led to them securing high-value contracts in sectors demanding high accuracy, and increasing customer trust due to transparent, auditable outputs. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantCloudRegionList` | | | -| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `items` _[QdrantCloudRegion](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregion) array_ | | | | +*“Qdrant has become a critical part of our GenAI infrastructure. It delivers the performance, flexibility, and reliability we need to build production-grade GraphRAG systems for clients in aerospace, finance, and pharma—where accuracy isn’t optional.”* -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantcloudregionspec) QdrantCloudRegionSpec +*— JĂ©rĂ©mie Basso, Engineering Lead, Lettria* -QdrantCloudRegionSpec defines the desired state of QdrantCloudRegion +## Further reading -_Appears in:_ +### Ingest transaction mechanism -- [QdrantCloudRegion](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregion) +Lettria must maintain consistency between the data in Neo4J and in Qdrant. The inference (RAG) pipeline uses Qdrant for vector and payload search then falls back on Neo4J to gather relevant triplets from the graph structure. If some points fail to be inserted into Neo4J but are present in the Qdrant collection, the final output graph will be inconsistent. +To prevent this, Lettria uses a transaction mechanism where the Neo4J commit acts as the final gate for overall success. This mechanism is designed to be idempotent and safe for concurrent ingestion scenarios. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `id` _string_ | Id specifies the unique identifier of the region | | | -| `components` _[ComponentReference](https://qdrant.tech/documentation/private-cloud/api-reference/#componentreference) array_ | Components specifies the list of components to be installed in the region | | | -| `helmRepositories` _[HelmRepository](https://qdrant.tech/documentation/private-cloud/api-reference/#helmrepository) array_ | HelmRepositories specifies the list of helm repositories to be created to the region
Deprecated: Use “Components” instead | | | -| `helmReleases` _[HelmRelease](https://qdrant.tech/documentation/private-cloud/api-reference/#helmrelease) array_ | HelmReleases specifies the list of helm releases to be created to the region
Deprecated: Use “Components” instead | | | +#### Transaction mechanism -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantcluster) QdrantCluster +The process, outlined in the ingest_graph_attempt pseudocode, can be summarized as follows: -QdrantCluster is the Schema for the qdrantclusters API +1. **Neo4J Transaction & Tentative Write**: + 1. An explicit Neo4J transaction begins (neo4j.begin_transaction()). + 2. Data is prepared and upserted into Neo4J within this transaction (neo4j.upsert(points)). These changes are not yet permanent. +2. **Qdrant Snapshot & Update**: + 1. Before altering Qdrant, a snapshot of the relevant points' current state is taken (qdrant.get(...)). + 2. Qdrant is then updated with the new data (qdrant.upsert(points)). +3. **Neo4J Commit (Decisive Point)**: + 1. The system attempts to commit the Neo4J transaction (neo4j.commit()). +4. **On Success (TRANSACTION_SUCCESS):** Neo4J changes are permanent. Qdrant was already updated, so both systems are consistent -_Appears in:_ +5. **On Failure (TRANSACTION_ROLLBACK)**: + 1. Neo4J automatically rolls back its pending changes. + 2. To restore consistency, Qdrant is rolled back. -- [QdrantClusterList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterlist) +#### Qdrant Rollback -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantCluster` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `spec` _[QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec)_ | | | | +When a Neo4j commit fails, and they need to roll back Qdrant, a simple revert to the snapshot might not be sufficient or correct due to concurrent operations. Another ingestion process might have successfully updated some of the same points in Qdrant after a current (failing) transaction took its snapshot but before the current transaction attempts to roll back. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterlist) QdrantClusterList +The resolve_conflicts function aims to make an intelligent decision about what each point's state in Qdrant should be after the rollback. It considers three states for each point involved: -QdrantClusterList contains a list of QdrantCluster +1. snapshot: state of the point before the current update +2. points: updates of the current state +3. current_points: current state of the points in qdrant -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantClusterList` | | | -| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. 
| | | -| `items` _[QdrantCluster](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcluster) array_ | | | | +Here is a minimal example: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterrestore) QdrantClusterRestore +```py +snapshot = { + "id": "my_point_id_123", + "name": "my_point", # Original name + "foo": "bar" # Original foo +} +point = { + "id": "my_point_id_123", + "name": "my_point_v2", # Our transaction intended to change the name + "bar": "baz" # Our transaction intended to add a new key "bar" +} +current = { + "id": "my_point_id_123", + "name": "my_point_v2", # Matches our intended name change (our change "stuck" so far) + "foo": "qux", # another transaction changed this! + "bar": "baz", # Matches our intended new key "bar" + "baz": "quux" # NEW key - another transaction added this! +} +resolved = { + "id": "my_point_id_123", + "name": "my_point", # Reverted: Our transaction changed this, so undo. + "foo": "qux", # Preserved: Another transaction changed this, respect it. + "baz": "quux" # Preserved: Another transaction added this, respect it. + # bar is Removed: Our transaction added this, so undo by removing. +} +``` -QdrantClusterRestore is the Schema for the qdrantclusterrestores API +Pseudocode example: -_Appears in:_ +```py +def ingest_graph_attempt(graph_data, qdrant, neo4j): + """ + Attempts to ingest graph data into Qdrant and Neo4j consistently. + If Neo4j write fails, Qdrant changes are rolled back. + This mimics a "try Qdrant, then try Neo4j; if Neo4j fails, undo Qdrant" strategy. + """ + BEGIN_TRANSACTION + neo4j.begin_transaction() + points = graph_data.get_triplets() + # Merge points in Neo4j (non blocking) + neo4j.upsert(points) + # Load current points from Qdrant + snapshot = qdrant.get([point.id for point in points]) + # Update points in Qdrant + qdrant.upsert(points) + neo4j.commit() + TRANSACTION_ROLLBACK + # Load current points state + current_points = qdrant.get([point.id for point in points]) + # Build snapshot of the current state + resolved_snapshot = resolve_conflicts(points, current_points, snapshot) + # Rollback Qdrant changes + qdrant.upsert(resolved_snapshot) + TRANSACTION_SUCCESS + total_points = len(points) + added_points, updated_points = diff(points, snapshot) + Log(f"Successfully ingested {total_points} points into Qdrant and Neo4j.") + Log(f"Added {added_points} points and updated {updated_points} points.") +``` -- [QdrantClusterRestoreList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorelist) +#### Known limits -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantClusterRestore` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `spec` _[QdrantClusterRestoreSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorespec)_ | | | | +Lettria might revert changes from another transaction if they are strictly identical to the current one. There's a very brief window of vulnerability during the Qdrant rollback process. It occurs after Lettria has read the current qdrant points (to decide how to roll back) but before they execute the actual rollback upsert (qdrant.upsert(resolved_snapshot)). 
If another concurrent transaction successfully updates a point in Qdrant within this tiny window, their subsequent rollback operation might unintentionally overwrite that very recent, legitimate update.
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterrestorelist) QdrantClusterRestoreList
+Mitigation – Small, Iterative Batches:
-QdrantClusterRestoreList contains a list of QdrantClusterRestore objects
+- They mitigate this risk by processing data ingestion (and any potential rollbacks) in small, iterative batches.
+- By doing so, the time duration between fetching current_qdrant_point and performing the rollback upsert for any given point is minimized.
+- A shorter window significantly reduces the probability that a conflicting concurrent update to the same points will occur precisely within that critical, narrow timeframe. While not a perfect guarantee, it makes such an event statistically less likely.
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `qdrant.io/v1` | | |
-| `kind` _string_ | `QdrantClusterRestoreList` | | |
-| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
-| `items` _[QdrantClusterRestore](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestore) array_ | | | |
+#### Payload Flattening
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterrestorespec) QdrantClusterRestoreSpec
+Each client assistant lives in its own Qdrant collection.
+Lettria defines the point id of each element as a generated UUID on the client side. This allows them to preserve the same indexed ids in Neo4J and Qdrant. Those UUIDs are generated with a hash of the string id of an element:
-QdrantClusterRestoreSpec defines the desired state of QdrantClusterRestore
+- For chunks, the string id is the id of the chunk defined as a combination of the source (pdf) id, section (used for text-to-graph) id and the order int of the chunk.
+- For relations, they use a generated UUID6 as each relation is considered unique.
+- For nodes, they use the node IRI ([http://example.org/resource/France](http://example.org/resource/France)) and the client assistant namespace.
-_Appears in:_
+```py
+import hashlib
+import uuid
+
+def create_uuid_from_string(string_id: str, namespace: str):
+    """Create uuid from string."""
+    hex_string = hashlib.md5(
+        f"{namespace}:{string_id}".encode("UTF-8"),
+        usedforsecurity=False
+    ).hexdigest()
+    return uuid.UUID(hex=hex_string, version=4)
+```
-- [QdrantClusterRestore](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestore)
+It also ensures that re-ingestion of the same document will update chunks (as ids will be identical). And nodes with the same IRI are considered merged in a specific assistant.
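+Below is a minimal usage sketch (illustrative only, not taken from Lettria’s write-up) showing how such a UUID could be derived for a chunk; `source_id`, `section_id`, `chunk_order`, and the namespace value are assumed placeholders:
+```py
+# Hypothetical identifiers, following the chunk string id scheme described above
+source_id = "pdf-001"
+section_id = "sec-04"
+chunk_order = 7
+chunk_string_id = f"{source_id}:{section_id}:{chunk_order}"
+chunk_uuid = create_uuid_from_string(chunk_string_id, namespace="8FKZ78")
+# Re-ingesting the same chunk yields the same UUID, so the existing point is updated rather than duplicated
+```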
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `source` _[RestoreSource](https://qdrant.tech/documentation/private-cloud/api-reference/#restoresource)_ | Source defines the source snapshot from which the restore will be done | | |
-| `destination` _[RestoreDestination](https://qdrant.tech/documentation/private-cloud/api-reference/#restoredestination)_ | Destination defines the destination cluster where the source data will end up | | |
+#### Flattening
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterscheduledsnapshot) QdrantClusterScheduledSnapshot
+Nodes and relations are saved in Qdrant AND Neo4J. Node and relation payloads follow a nested structure that looks like this:
-QdrantClusterScheduledSnapshot is the Schema for the qdrantclusterscheduledsnapshots API
+```json
+{
+  "properties": {
+    "rdfs:label": {
+      "@en": "House",
+      "@fr": "Maison"
+    },
+    "onto:surface": 250
+  },
+  "metadata": {
+    "origin_ids": ["001", "002"],
+    "client_id": "8FKZ78"
+  }
+}
+```
-_Appears in:_
+Lettria has two major levels: properties (semantically rich information about the object) and metadata (system information).
-- [QdrantClusterScheduledSnapshotList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshotlist)
+Moreover, string properties might have an extra level for the language variants. As stated in [Neo4J docs](https://neo4j.com/docs/cypher-manual/current/values-and-types/property-structural-constructed/), they cannot add nested properties to nodes and relations. Therefore, they flatten such payload in Neo4J:
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `qdrant.io/v1` | | |
-| `kind` _string_ | `QdrantClusterScheduledSnapshot` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
-| `spec` _[QdrantClusterScheduledSnapshotSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshotspec)_ | | | |
+```json
+{
+  "properties.rdfs:label": "House",
+  "properties.rdfs:label@en": "House",
+  "properties.rdfs:label@fr": "Maison",
+  "properties.onto:surface": 250,
+  "metadata.origin_ids": ["001", "002"],
+  "metadata.client_id": "8FKZ78"
+}
+```
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterscheduledsnapshotlist) QdrantClusterScheduledSnapshotList
+Note that they duplicate the English tags for string properties as they are considered a default. The language tags are not flattened with a dot to prevent ambiguity between nested properties and language variants when unflattening data retrieved from the database.
-QdrantClusterScheduledSnapshotList contains a list of QdrantCluster
+#### Filtering
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `qdrant.io/v1` | | |
-| `kind` _string_ | `QdrantClusterScheduledSnapshotList` | | |
-| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
-| `items` _[QdrantClusterScheduledSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshot) array_ | | | |
+Both Qdrant and Neo4J retrieve data based on a filter definition. Lettria flattens properties on Neo4J so that they can use similar filters.
The nested structure ([more on that here](https://qdrant.tech/documentation/concepts/filtering/#nested-key)) {"foo": { "bar": "qux" }} is kept in Qdrant and dot separated in Neo4J: foo.bar=qux so that they can perform match queries with similar keys from Qdrant. This introduces some complexity as they need to be careful in the handling of URLs and other 'dot-rich' values in properties.
-#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterscheduledsnapshotspec) QdrantClusterScheduledSnapshotSpec
+If they want to filter based on the onto:surface value, the same keys are used in Qdrant and Neo4J:
-QdrantClusterScheduledSnapshotSpec defines the desired state of QdrantCluster
+```
+MATCH (n:Node)
+WHERE
+  n.`properties.onto:surface` > 100
+RETURN n;
+```
-_Appears in:_
+```
+POST /collections/{collection_name}/points/scroll
+{
+  "filter": {
+    "should": [
+      {
+        "key": "properties.onto:surface",
+        "range": { "gt": 100 }
+      }
+    ]
+  }
+}
+```
-- [QdrantClusterScheduledSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshot)
+### Parsing conceptual visualization
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `cluster-id` _string_ | Id specifies the unique identifier of the cluster | | |
-| `scheduleShortId` _string_ | Specifies short Id which identifies a schedule | | MaxLength: 8 |
-| `schedule` _string_ | Cron expression for frequency of creating snapshots, see [https://en.wikipedia.org/wiki/Cron](https://en.wikipedia.org/wiki/Cron).
The schedule is specified in UTC. | | Pattern: `^(@(annually|yearly|monthly|weekly|daily|hourly|reboot))|(@every (\d+(ns|us|”s|ms|s|m|h))+)|((((\d+,)+\d+|([\d\*]+(\/|-)\d+)|\d+|\*) ?)\{5,7\})$` | -| `retention` _string_ | Retention of schedule in hours | | Pattern: `^[0-9]+h$` | +#### Source document -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclustersnapshot) QdrantClusterSnapshot +An example two column document with title, text, table, image and footnotes. -QdrantClusterSnapshot is the Schema for the qdrantclustersnapshots API +![Source Document](/blog/case-study-lettria/source-document.png) -_Appears in:_ +#### Layout -- [QdrantClusterSnapshotList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshotlist) +They isolate components on the page. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantClusterSnapshot` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `spec` _[QdrantClusterSnapshotSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshotspec)_ | | | | +![Layout](/blog/case-study-lettria/layout.png) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclustersnapshotlist) QdrantClusterSnapshotList +#### Extraction and structuration -QdrantClusterSnapshotList contains a list of QdrantClusterSnapshot +They extract content of each component and structure the content based on the computed reading order. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantClusterSnapshotList` | | | -| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `items` _[QdrantClusterSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshot) array_ | | | | +![Enrichment](/blog/case-study-lettria/enrichment.png) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclustersnapshotphase) QdrantClusterSnapshotPhase +#### Enrichment -_Underlying type:_ _string_ +They remove some components (footnotes, pages, etc.) and clean the content (fix numbered and bullet point lists, merge multipage tables, textualize images etc.) -_Appears in:_ +![Extraction](/blog/case-study-lettria/extraction.png) -- [QdrantClusterSnapshotStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshotstatus) +### Inference Process Overview -| Field | Description | -| --- | --- | -| `Running` | | -| `Skipped` | | -| `Failed` | | -| `Succeeded` | | +![Inference](/blog/case-study-lettria/inference.png) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclustersnapshotspec) QdrantClusterSnapshotSpec +<|page-302-lllmstxt|> +## Migrating your data just got easier -_Appears in:_ +We’ve launched the **beta** of our Qdrant **Vector Data Migration Tool**, designed to simplify moving data between different instances, whether you're migrating between Qdrant deployments or switching from other vector database providers. 
-- [QdrantClusterSnapshot](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshot) +This powerful tool streams all vectors from a source collection to a target Qdrant instance in live batches. It supports migrations from one Qdrant deployment to another, including from open source to Qdrant Cloud or between cloud regions. But that's not all. You can also migrate your data from other vector databases directly into Qdrant. All with a single command. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cluster-id` _string_ | The cluster ID for which a Snapshot need to be taken
The cluster should be in the same namespace as this QdrantClusterSnapshot is located | | | -| `creation-timestamp` _integer_ | The CreationTimestamp of the backup (expressed in Unix epoch format) | | | -| `scheduleShortId` _string_ | Specifies the short Id which identifies a schedule, if any.
This field should not be set if the backup is made manually. | | MaxLength: 8 | -| `retention` _string_ | The retention period of this snapshot in hours, if any.
If not set, the backup doesn’t have a retention period, meaning it will not be removed. | | Pattern: `^[0-9]+h$` | +Unlike Qdrant’s included [snapshot migration method](https://qdrant.tech/documentation/concepts/snapshots/), which requires consistent node-specific snapshots, our migration tool enables you to easily migrate data between different Qdrant database clusters in streaming batches. The only requirement is that the vector size and distance function must match. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantclusterspec) QdrantClusterSpec +This is especially useful if you want to change the collection configuration on the target, for example by choosing a different replication factor or quantization method. -QdrantClusterSpec defines the desired state of QdrantCluster +The easiest way to run the qdrant migration tool is as a container. You can run it on any machine where you have connectivity to both the source and the target Qdrant databases. Direct connectivity between both databases is not required. For optimal performance, you should run the tool on a machine with a fast network connection and minimum latency to both databases. -_Appears in:_ +## Resources -- [QdrantCluster](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcluster) +Access the [Qdrant Migration Tool](https://github.com/qdrant/migration) -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `id` _string_ | Id specifies the unique identifier of the cluster | | | -| `version` _string_ | Version specifies the version of Qdrant to deploy | | | -| `size` _integer_ | Size specifies the desired number of Qdrant nodes in the cluster | | Maximum: 30
Minimum: 1 | -| `servicePerNode` _boolean_ | ServicePerNode specifies whether the cluster should start a dedicated service for each node. | true | | -| `clusterManager` _boolean_ | ClusterManager specifies whether to use the cluster manager for this cluster.
The Python-operator will deploy a dedicated cluster manager instance.
The Go-operator will use a shared instance.
If not set, the default will be taken from the operator config. | | | -| `suspend` _boolean_ | Suspend specifies whether to suspend the cluster.
If enabled, the cluster will be suspended and all related resources will be removed except the PVCs. | false | | -| `pauses` _[Pause](https://qdrant.tech/documentation/private-cloud/api-reference/#pause) array_ | Pauses specifies a list of pause request by developer for manual maintenance.
Operator will skip handling any changes in the CR if any pause request is present. | | | -| `image` _[QdrantImage](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantimage)_ | Image specifies the image to use for each Qdrant node. | | | -| `resources` _[Resources](https://qdrant.tech/documentation/private-cloud/api-reference/#resources)_ | Resources specifies the resources to allocate for each Qdrant node. | | | -| `security` _[QdrantSecurityContext](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecuritycontext)_ | Security specifies the security context for each Qdrant node. | | | -| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations specifies the tolerations for each Qdrant node. | | | -| `nodeSelector` _object (keys:string, values:string)_ | NodeSelector specifies the node selector for each Qdrant node. | | | -| `config` _[QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration)_ | Config specifies the Qdrant configuration setttings for the clusters. | | | -| `ingress` _[Ingress](https://qdrant.tech/documentation/private-cloud/api-reference/#ingress)_ | Ingress specifies the ingress for the cluster. | | | -| `service` _[KubernetesService](https://qdrant.tech/documentation/private-cloud/api-reference/#kubernetesservice)_ | Service specifies the configuration of the Qdrant Kubernetes Service. | | | -| `gpu` _[GPU](https://qdrant.tech/documentation/private-cloud/api-reference/#gpu)_ | GPU specifies GPU configuration for the cluster. If this field is not set, no GPU will be used. | | | -| `statefulSet` _[KubernetesStatefulSet](https://qdrant.tech/documentation/private-cloud/api-reference/#kubernetesstatefulset)_ | StatefulSet specifies the configuration of the Qdrant Kubernetes StatefulSet. | | | -| `storageClassNames` _[StorageClassNames](https://qdrant.tech/documentation/private-cloud/api-reference/#storageclassnames)_ | StorageClassNames specifies the storage class names for db and snapshots. | | | -| `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#topologyspreadconstraint-v1-core)_ | TopologySpreadConstraints specifies the topology spread constraints for the cluster. | | | -| `podDisruptionBudget` _[PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#poddisruptionbudgetspec-v1-policy)_ | PodDisruptionBudget specifies the pod disruption budget for the cluster. | | | -| `restartAllPodsConcurrently` _boolean_ | RestartAllPodsConcurrently specifies whether to restart all pods concurrently (also called one-shot-restart).
If enabled, all the pods in the cluster will be restarted concurrently in situations where multiple pods
need to be restarted, like when RestartedAtAnnotationKey is added/updated or the Qdrant version needs to be upgraded.
This helps sharded but not replicated clusters to reduce downtime to a possible minimum during restart.
If unset, the operator is going to restart nodes concurrently if none of the collections if replicated. | | | -| `startupDelaySeconds` _integer_ | If StartupDelaySeconds is set (> 0), an additional ‘sleep ’ will be emitted to the pod startup.
The sleep will be added when a pod is restarted, it will not force any pod to restart.
This feature can be used for debugging the core, e.g. if a pod is in crash loop, it provided a way
to inspect the attached storage. | | | -| `rebalanceStrategy` _[RebalanceStrategy](https://qdrant.tech/documentation/private-cloud/api-reference/#rebalancestrategy)_ | RebalanceStrategy specifies the strategy to use for automaticially rebalancing shards the cluster.
Cluster-manager needs to be enabled for this feature to work. | | Enum: \[by\_count by\_size by\_count\_and\_size\] | +[Check out a tutorial](https://qdrant.tech/documentation/database-tutorials/migration/) + +Watch this video to learn how to use it for moving data between Qdrant instances: + + + +<|page-303-lllmstxt|> +## How Lawme Scaled AI Legal Assistants and Cut Costs by 75% with Qdrant + +![How Lawme Scaled AI Legal Assistants and Cut Costs 75% with Qdrant](/blog/case-study-lawme/lawme-bento-dark.jpg) + +Legal technology (LegalTech) is at the forefront of digital transformation in the traditionally conservative legal industry. Lawme.ai, an ambitious startup, is pioneering this transformation by automating routine legal workflows with AI assistants. By leveraging sophisticated AI-driven processes, Lawme empowers law firms to dramatically accelerate legal document preparation, from initial research and analysis to comprehensive drafting. However, scaling their solution presented formidable challenges, particularly around data management, compliance, and operational costs. + +### Balancing Rapid Growth with Compliance and Cost Control -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantconfiguration) QdrantConfiguration +Lawme develops AI assistants that automate time-intensive legal tasks such as drafting unfair dismissal claims for Australian employment law. These tasks typically involve complex, interconnected processes including document intake, extracting critical information, conducting detailed legal research, and synthesizing results into draft documents. -_Appears in:_ +Initially, Lawme utilized PGVector—a PostgreSQL-based vector solution—for managing their AI's embedding data. As the dataset expanded rapidly with inputs from the US and Australian jurisdictions, performance degraded significantly. Queries became sluggish and infrastructure costs soared unsustainably. This posed significant operational challenges as the demand for their automated legal services surged. -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +Moreover, stringent compliance requirements further complicated the scenario. Lawme’s legal clients insisted on strict data residency and privacy standards, limiting their options for cloud-based solutions. Vendors that required offshore data processing or storage were simply non-starters. Lawme needed a vector database that was both performant and capable of complying with stringent security regulations. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `collection` _[QdrantConfigurationCollection](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationcollection)_ | Collection specifies the default collection configuration for Qdrant. | | | -| `log_level` _string_ | LogLevel specifies the log level for Qdrant. | | | -| `service` _[QdrantConfigurationService](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationservice)_ | Service specifies the service level configuration for Qdrant. | | | -| `tls` _[QdrantConfigurationTLS](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationtls)_ | TLS specifies the TLS configuration for Qdrant. | | | -| `storage` _[StorageConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#storageconfig)_ | Storage specifies the storage configuration for Qdrant. 
| | | -| `inference` _[InferenceConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#inferenceconfig)_ | Inference configuration. This is used in Qdrant Managed Cloud only. If not set Inference is not available to this cluster. | | | +As Jordan Parker, Lawme’s Co-Founder, explained: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantconfigurationcollection) QdrantConfigurationCollection +*"Security and compliance stop you from using many cloud providers, especially those based offshore. We needed flexibility to deploy wherever our clients required, without sacrificing speed or reliability."* -_Appears in:_ +### Solution: Flexible, Performant Vector Search with Qdrant -- [QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration) +Lawme turned to Qdrant’s vector search engine due to its exceptional blend of performance, security, and deployment flexibility. The transition was driven by several critical features: -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `replication_factor` _integer_ | ReplicationFactor specifies the default number of replicas of each shard | | | -| `write_consistency_factor` _integer_ | WriteConsistencyFactor specifies how many replicas should apply the operation to consider it successful | | | -| `vectors` _[QdrantConfigurationCollectionVectors](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationcollectionvectors)_ | Vectors specifies the default parameters for vectors | | | +* **Binary Quantization**: Qdrant’s easy-to-implement binary quantization dramatically accelerated the retrieval process, efficiently narrowing large search spaces from tens of millions of legal vectors. This feature ensured rapid query processing without compromising search quality. By combining Qdrant’s fast retrieval with a secondary reranking step using models like Cohere, Lawme maintained search precision. This approach balanced scalability with the accuracy required in legal contexts. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantconfigurationcollectionvectors) QdrantConfigurationCollectionVectors +* **Metadata Filtering**: Qdrant’s advanced filterable Hierarchical Navigable Small World (HNSW) index enabled Lawme to quickly filter queries by specific legal metadata, such as jurisdiction or case type, providing highly relevant search results swiftly and reliably. -_Appears in:_ +* **Deployment Flexibility**: Qdrant’s straightforward Kubernetes deployment facilitated rapid, secure installations in private clouds, fully compliant with clients’ regulatory requirements. This allowed Lawme to confidently approach conservative legal clients who demanded strict data residency. -- [QdrantConfigurationCollection](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationcollection) +### Results: Lower Costs, Improved Performance, Enhanced Compliance -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `on_disk` _boolean_ | OnDisk specifies whether vectors should be stored in memory or on disk. | | | +The migration to Qdrant immediately translated into substantial benefits: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantconfigurationservice) QdrantConfigurationService +* **Cost Efficiency**: Infrastructure costs were dramatically reduced by 75% compared to the previous PGVector setup. 
This cost reduction empowered Lawme to remain competitive and price-efficient in a highly sensitive market. -_Appears in:_ +* **Scalability and Performance**: Query latencies dropped significantly, allowing Lawme to handle tens of millions of vectors without performance degradation. This scalable infrastructure supported the fast-paced growth of their AI assistant deployments. -- [QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration) +* **Enhanced Trust and Compliance**: With the ability to fully control data location and comply with local data privacy laws, Lawme gained significant trust from legal clients. The system’s transparency and reliable governance mechanisms further strengthened client confidence. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `api_key` _[QdrantSecretKeyRef](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecretkeyref)_ | ApiKey for the qdrant instance | | | -| `read_only_api_key` _[QdrantSecretKeyRef](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecretkeyref)_ | ReadOnlyApiKey for the qdrant instance | | | -| `jwt_rbac` _boolean_ | JwtRbac specifies whether to enable jwt rbac for the qdrant instance
Default is false | | | -| `hide_jwt_dashboard` _boolean_ | HideJwtDashboard specifies whether to hide the JWT dashboard of the embedded UI
Default is false | | | -| `enable_tls` _boolean_ | EnableTLS specifies whether to enable tls for the qdrant instance
Default is false | | | +* **Operational Simplicity**: Qdrant’s developer-friendly deployment tools, visualization features, and clear documentation streamlined the operational complexity, enabling rapid development cycles and easy maintenance. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantconfigurationtls) QdrantConfigurationTLS +Jordan Parker summarized the impact succinctly: -_Appears in:_ +*"The more data you feed into the agent, the better it gets. But to truly scale, you need a vector database that maintains low latency, high accuracy, and keeps costs in check. Qdrant makes that possible."* -- [QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration) +### Looking Forward: Scaling Globally with Robust Infrastructure -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cert` _[QdrantSecretKeyRef](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecretkeyref)_ | Reference to the secret containing the server certificate chain file | | | -| `key` _[QdrantSecretKeyRef](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecretkeyref)_ | Reference to the secret containing the server private key file | | | -| `caCert` _[QdrantSecretKeyRef](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantsecretkeyref)_ | Reference to the secret containing the CA certificate file | | | +The successful adoption of Qdrant has positioned Lawme for substantial growth; it is confident in its technology stack's ability to handle complex, high-volume legal workloads. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantentity) QdrantEntity +In an industry notorious for its demanding compliance, security, and accuracy standards, Lawme’s experience highlights Qdrant’s potential as foundational infrastructure. This case underscores how cutting-edge AI applications in regulated industries can thrive on flexible, cost-efficient, and high-performance platforms. -QdrantEntity is the Schema for the qdrantentities API +Lawme.ai is now ideally equipped to continue innovating in legal automation, empowered by infrastructure capable of matching their ambition, growth trajectory, and client expectations. Additionally, Lawme is exploring late interaction models such as ColPali to simplify their data ingestion and processing pipeline further, reinforcing their commitment to continuous improvement and innovation. -_Appears in:_ +<|page-304-lllmstxt|> +## How ConvoSearch Boosted E-commerce Revenue with Qdrant -- [QdrantEntityList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentitylist) +![How ConvoSearch Boosted E-commerce Revenue with Qdrant](/blog/case-study-convosearch/convosearch-bento-dark.jpg) -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantEntity` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. 
| | | -| `spec` _[QdrantEntitySpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentityspec)_ | | | | +### Driving E-commerce Success Through Enhanced Search -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantentitylist) QdrantEntityList +E-commerce retailers face intense competition and constant pressure to increase conversion rates. [ConvoSearch](https://convosearch.com/) , an AI-powered recommendation engine tailored for direct-to-consumer (D2C) e-commerce brands, addresses these challenges by delivering hyper-personalized search and recommendations. With customers like The Closet Lover and Uncle Reco achieving dramatic revenue increases, ConvoSearch relies heavily on high-speed vector search to ensure relevance and accuracy at scale. -QdrantEntityList contains a list of QdrantEntity objects +### Overcoming Latency and Customization Limits -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantEntityList` | | | -| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `items` _[QdrantEntity](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentity) array_ | | | | +Initially, ConvoSearch utilized Pinecone for vector search operations but faced significant limitations: high latency, insufficient customizability, and strict metadata constraints. Pinecone’s query latency of 50–100ms was too slow, and network latency added an additional 50–70ms, severely impacting ConvoSearch's real-time re-ranking capabilities crucial for personalized recommendations. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantentityspec) QdrantEntitySpec +Shardul Aggarwal, CEO of ConvoSearch, highlighted, “Latency and customizability were huge issues for us. We needed a solution that was faster, more flexible, and could handle large volumes of metadata without compromises.” -QdrantEntitySpec defines the desired state of QdrantEntity +### Implementing a Powerful Vector Search Infrastructure -_Appears in:_ +ConvoSearch transitioned to Qdrant for its powerful vector search capabilities, including low-latency queries, expansive metadata storage, and advanced customizability. With Qdrant, queries returned in just 10ms—significantly faster than the previous solution. Qdrant’s [robust metadata handling](https://qdrant.tech/articles/vector-search-filtering/) allowed ConvoSearch to incorporate extensive product data and user interactions seamlessly, dramatically improving recommendation accuracy. -- [QdrantEntity](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentity) +The transition to Qdrant enabled ConvoSearch to host its infrastructure on dedicated servers, leveraging NVIDIA GPUs for computationally intensive re-ranking tasks. This provided significant cost savings, better resource optimization, and exceptional speed. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `id` _string_ | The unique identifier of the entity (in UUID format). | | | -| `entityType` _string_ | The type of the entity. | | | -| `clusterId` _string_ | The optional cluster identifier | | | -| `createdAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was created. 
| | | -| `lastUpdatedAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was last updated. | | | -| `deletedAt` _[MicroTime](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#microtime-v1-meta)_ | Timestamp when the entity was deleted (or is started to be deleting).
If not set the entity is not deleted | | | -| `payload` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Generic payload for this entity | | | +### Using Multiple Layers for Highly Relevant Search -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantentitystatusresult) QdrantEntityStatusResult +ConvoSearch's approach involves multiple layers. It uses advanced AI models for deep product understanding, extracting semantic context from product data, images, and user interactions. These insights feed into sophisticated re-ranking pipelines, ensuring hyper-personalized and highly relevant search results and recommendations. By continuously optimizing the relevancy of recommendations in real-time, ConvoSearch significantly boosts customer engagement and conversions. -QdrantEntityStatusResult is the last result from the invocation to a manager +![standard-vs-convosearch](/blog/case-study-convosearch/case-study-standard-vs-convosearch.png) -_Appears in:_ +*Eliminating the no results dilemma: Standard search vs. ConvoSearch* -- [QdrantEntityStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantentitystatus) +### Immediate and Significant Revenue Impact -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `result` _[EntityResult](https://qdrant.tech/documentation/private-cloud/api-reference/#entityresult)_ | The result of last reconcile of the entity | | Enum: \[Ok Pending Error\] | -| `reason` _string_ | The reason of the result (e.g. in case of an error) | | | -| `payload` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | The optional payload of the status. | | | +The results were transformative: -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantimage) QdrantImage +* Median revenue increase of 23–24% for ConvoSearch's customer base. +* The Closet Lover experienced a remarkable 60% revenue uplift. +* Uncle Reco saw an incremental revenue boost of $100,000 within a single month after deploying ConvoSearch. -_Appears in:_ +Shardul Aggarwal emphasized, “The impact was immediate. Brands could just plug in our solution and start seeing revenue growth without any changes to their front-end.” -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +Qdrant’s infrastructure delivered substantial operational benefits: -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `repository` _string_ | Repository specifies the repository of the Qdrant image.
If not specified defaults the config of the operator (or qdrant/qdrant if not specified in operator). | | | -| `pullPolicy` _[PullPolicy](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#pullpolicy-v1-core)_ | PullPolicy specifies the image pull policy for the Qdrant image.
If not specified defaults the config of the operator (or IfNotPresent if not specified in operator). | | | -| `pullSecretName` _string_ | PullSecretName specifies the pull secret for the Qdrant image. | | | +* Reduced query latency from 50–100ms to approximately 10ms. +* Enhanced scalability, managing thousands of daily product updates efficiently. +* Enabled detailed product understanding and real-time personalization, setting ConvoSearch apart from competitors. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantrelease) QdrantRelease +### Scaling the Recommendation Engine -QdrantRelease describes an available Qdrant release +By shifting to Qdrant, ConvoSearch not only overcame infrastructure challenges but significantly enhanced their ability to drive tangible business outcomes for their customers. As Aggarwal summarized, “Our move to Qdrant transformed our recommendation engine capabilities, making us indispensable to our clients.” -_Appears in:_ +<|page-305-lllmstxt|> +## LegalTech Builder's Guide: Navigating Strategic Decisions with Vector Search -- [QdrantReleaseList](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantreleaselist) +### LegalTech innovation needs a new search stack -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantRelease` | | | -| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `spec` _[QdrantReleaseSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantreleasespec)_ | | | | +LegalTech applications, more than most other application types, demand accuracy due to complex document structures, high regulatory stakes, and compliance requirements. Traditional keyword searches often fall short, failing to grasp semantic nuances essential for precise legal queries. [Qdrant](https://qdrant.tech/) addresses these challenges by providing robust vector search solutions tailored for the complexities inherent in LegalTech applications. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantreleaselist) QdrantReleaseList +### Delivering precise results for high-stakes legal applications -QdrantReleaseList contains a list of QdrantRelease +Legal applications often operate in a high-stakes environment where false positives and imprecise matches can erode trust. Getting to the right result—fast—isn’t optional; it’s critical. Qdrant helps teams meet these demands through a set of tightly integrated techniques. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `apiVersion` _string_ | `qdrant.io/v1` | | | -| `kind` _string_ | `QdrantReleaseList` | | | -| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | -| `items` _[QdrantRelease](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantrelease) array_ | | | | +#### Filterable Hierarchical Navigable Small World (HNSW) -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantreleasespec) QdrantReleaseSpec +When your dataset contains millions (or billions) of legal vectors, every unnecessary comparison wastes time and compute. 
Pre-filtering cuts through the noise by narrowing the search space before retrieval even begins. -QdrantReleaseSpec defines the desired state of QdrantRelease +[Filterable HNSW](https://qdrant.tech/articles/vector-search-filtering/) indexing improves speed, precision, and cost efficiency by applying filters before the search. It maintains speed advantages of vector search while allowing for precise filtering, addressing the inefficiencies that can occur when applying filters after the vector search. The [Garden Intel case study](https://qdrant.tech/case-studies/) exemplifies how its used in practice for a LegalTech use case. -_Appears in:_ +![pre-filtering vectors](/blog/legal-tech-builders-guide/filterable-hnsw.png) -- [QdrantRelease](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantrelease) +*Figure: example of pre-filtering vectors ([source](https://qdrant.tech/articles/vector-search-filtering/#pre-filtering))* -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `version` _string_ | Version number (should be semver compliant).
E.g. “v1.10.1” | | | -| `default` _boolean_ | If set, this version is default for new clusters on Cloud.
There should be only 1 Qdrant version in the platform set as default. | false | | -| `image` _string_ | Full docker image to use for this version.
If empty, a default image will be derived from Version (and qdrant/qdrant is assumed). | | | -| `unavailable` _boolean_ | If set, this version cannot be used for new clusters. | false | | -| `endOfLife` _boolean_ | If set, this version is no longer actively supported. | false | | -| `accountIds` _string array_ | If set, this version can only be used by accounts with given IDs. | | | -| `accountPrivileges` _string array_ | If set, this version can only be used by accounts that have been given the listed privileges. | | | -| `remarks` _string_ | General remarks for human reading | | | -| `releaseNotesURL` _string_ | Release Notes URL for the specified version | | | +#### Blend structured filters with semantic search for better case insights -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantsecretkeyref) QdrantSecretKeyRef +Legal documents require a dual lens: exact matches for citations or statute references, and semantic understanding of legal reasoning. Hybrid search brings both into one query path. This is ideal for legal documents where exact citations and nuanced conceptual similarities coexist. -_Appears in:_ +[Minicoil](https://qdrant.tech/articles/minicoil/), a sparse neural retriever, enriches lexical accuracy by understanding contextual token meanings. For instance, [Aracor](https://qdrant.tech/blog/case-study-aracor/) leverages hybrid search to precisely retrieve relevant clauses across extensive legal document repositories. Below is a pseudocode example: -- [QdrantConfigurationService](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationservice) -- [QdrantConfigurationTLS](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfigurationtls) +```json +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [0.2, 0.8, ...], + "limit": 50 + }, + "query": { + "formula": { + "sum": [ + "$score", + { + "mult": [ + 0.5, + { + "key": "tag", + "match": { "any": ["h1", "h2", "h3", "h4"] } + } + ] + }, + { + "mult": [ + 0.25, + { + "key": "tag", + "match": { "any": ["p", "li"] } + } + ] + } + ] + } + } +} +``` -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `secretKeyRef` _[SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#secretkeyselector-v1-core)_ | SecretKeyRef to the secret containing data to configure the qdrant instance | | | +Leveraging Late-Interaction Models for Rich Documents -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#qdrantsecuritycontext) QdrantSecurityContext +Traditional OCR pipelines can add complexity and create accuracy challenges. But late-interaction models simplify the ingestion pipeline by running at the reranking stage. -_Appears in:_ +Models like ([ColPali](https://qdrant.tech/blog/qdrant-colpali/) and ColQwen) bypass traditional OCR pipelines, directly processing images of complex documents. They enhance accuracy by maintaining original layouts and contextual integrity, simplifying your retrieval pipelines. 
The tradeoff is a heavier application, but these challenges can be addressed with further [optimization](https://qdrant.tech/documentation/guides/optimize/)*.* -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +#### Enabling highly granular accuracy for complex legal searches -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `user` _integer_ | User specifies the user to run the Qdrant process as. | | | -| `group` _integer_ | Group specifies the group to run the Qdrant process as. | | | -| `fsGroup` _integer_ | FsGroup specifies file system group to run the Qdrant process as. | | | +Legal relevance is critical down to the token level. Distinguishing between, for example, “shall” and “may”, becomes important. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#rebalancestrategy) RebalanceStrategy +Utilize [ColBERT](https://qdrant.tech/articles/late-interaction-models/) for high-accuracy reranking, allowing highly granular, token-level similarity estimation with high accuracy results. It is also faster than traditional cross-encoders for reranking. Since Qdrant supports multivectors natively, this is easy to integrate into your search application. -_Underlying type:_ _string_ +```python +# Step 1: Retrieve hybrid results using dense and sparse queries +hybrid_results = client.search( + collection_name="legal-hybrid-search", + query_vector=dense_vector, + query_sparse_vector=sparse_vector, + limit=20, + with_payload=True +) -RebalanceStrategy specifies the strategy to use for automaticially rebalancing shards the cluster. +# Step 2: Tokenize the query using ColBERT +colbert_query_tokens = colbert_model.query_tokenize(query_text) -_Validation:_ +# Step 3: Score and rerank results using ColBERT token-level scoring +reranked = sorted( + hybrid_results, + key=lambda doc: colbert_model.score( + colbert_query_tokens, + colbert_model.doc_tokenize(doc["payload"]["document"]) + ), + reverse=True +) -- Enum: \[by\_count by\_size by\_count\_and\_size\] +# Step 4: Return top-k reranked results +final_results = reranked[:5] +``` -_Appears in:_ -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +#### Prioritize the legal logic that matters most in search rankings -| Field | Description | -| --- | --- | -| `by_count` | | -| `by_size` | | -| `by_count_and_size` | | +Not every clause is created equal. Legal professionals often care more about specific provisions, jurisdictions, or case types, for example. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#regioncapabilities) RegionCapabilities +Qdrant's [Score Boosting Reranker](https://qdrant.tech/documentation/concepts/hybrid-queries/#score-boosting) lets you integrate domain-specific logic (e.g., jurisdiction or recent cases) directly into search rankings, ensuring results align precisely with legal business rules. 
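+For teams working from the Python client, the same kind of boosted query can be sketched with formula expressions. This is an illustrative sketch only: it assumes `qdrant-client` 1.14 or later, uses a placeholder server URL and query embedding, and simply mirrors the collection name, payload keys, and weights of the REST-style request that follows.

+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # placeholder URL
+
+# Placeholder query embedding; in practice this comes from your embedding model.
+dense_vector = [0.21, 0.77]
+
+results = client.query_points(
+    collection_name="legal-docs",
+    prefetch=models.Prefetch(query=dense_vector, limit=50),
+    query=models.FormulaQuery(
+        formula=models.SumExpression(
+            sum=[
+                "$score",  # keep the semantic similarity from the prefetch stage
+                models.MultExpression(
+                    mult=[
+                        0.6,
+                        models.FieldCondition(
+                            key="section",
+                            match=models.MatchAny(any=["Clause", "Provision", "Section"]),
+                        ),
+                    ]
+                ),
+                models.MultExpression(
+                    mult=[
+                        0.3,
+                        models.FieldCondition(
+                            key="heading",
+                            match=models.MatchAny(any=["Definitions", "Governing Law", "Termination"]),
+                        ),
+                    ]
+                ),
+            ]
+        )
+    ),
+    limit=10,
+)
+```

+Each `mult` clause adds its weight to the base `$score` only when the payload condition matches, so domain rules lift the ranking of clauses that matter without discarding semantic relevance.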
-_Appears in:_ +```json +POST /collections/legal-docs/points/query +{ + "prefetch": { + "query": [0.21, 0.77, ...], + "limit": 50 + }, + "query": { + "formula": { + "sum": [ + "$score", // dense semantic match + { + "mult": [0.6, { "key": "section", "match": { "any": ["Clause", "Provision", "Section"] } }] + }, + { + "mult": [0.3, { "key": "heading", "match": { "any": ["Definitions", "Governing Law", "Termination"] } }] + }, + { + "mult": [0.1, { "key": "paragraph_type", "match": { "any": ["Interpretation", "Remedy"] } }] + } + ] + } + } +} +``` -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +### Ensuring Scalability and High Performance -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `volumeSnapshot` _boolean_ | VolumeSnapshot specifies whether the Kubernetes cluster supports volume snapshot | | | -| `volumeExpansion` _boolean_ | VolumeExpansion specifies whether the Kubernetes cluster supports volume expansion | | | +Even the most accurate system can fail if it can’t handle load or stay cost-effective. Once your LegalTech product reaches significant usage, indexing speed and operational efficiency start to matter just as much as relevance. Qdrant’s capabilities can also address the challenges. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#regionphase) RegionPhase +#### Efficient Indexing and Retrieval Techniques -_Underlying type:_ _string_ +Legal datasets are growing, and so are the compute bills. From GPU acceleration to quantization, Qdrant gives teams the tools to scale without spiraling costs. -_Appears in:_ +* [GPU indexing](https://qdrant.tech/blog/qdrant-1.13.x/) accelerates indexing by up to 10x compared to CPU methods, offering vendor-agnostic compatibility with modern GPUs via Vulkan API. -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +* [Vector quantization](https://qdrant.tech/documentation/guides/quantization/) compresses embeddings, significantly reducing memory and operational costs. It results in lower accuracy, so carefully consider this option. For example, [LawMe](http://qdrant.tech/blog/case-study-lawme), a Qdrant user, uses Binary Quantization to cost-effectively add more data for its AI Legal Assistants. -| Field | Description | -| --- | --- | -| `Ready` | | -| `NotReady` | | -| `FailedToSync` | | +### Getting Started: Choosing Your Search Infrastructure -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#resourcerequests) ResourceRequests +#### Deploy in private, cloud, or hybrid environments without sacrificing control -_Appears in:_ +No matter your stage—prototype or production—your stack will have to meet both engineering and compliance needs. Qdrant supports flexible deployment strategies, including [managed cloud](https://qdrant.tech/cloud/) and [hybrid cloud](https://qdrant.tech/hybrid-cloud/), along with open-source solutions via [Docker](https://qdrant.tech/documentation/quick-start/), enabling easy scaling and secure management of legal data. -- [Resources](https://qdrant.tech/documentation/private-cloud/api-reference/#resources) +#### Build and iterate quickly with responsive support and built-in tooling -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cpu` _string_ | CPU specifies the CPU request for each Qdrant node. 
| | | -| `memory` _string_ | Memory specifies the memory request for each Qdrant node. | | | +Complex Legal AI applications may need extra support. Qdrant’s open-source commitment, FastEmbed integration, and responsive team help unblock your path to value. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#resources) Resources +We’re very responsive on our [Qdrant Discord channel](https://qdrant.tech/community/), have a free [Qdrant Cloud tier](https://cloud.qdrant.io/signup), are committed to open-source, and have great [documentation](https://qdrant.tech/documentation/). Also, check out embedding workflows via our [FastEmbed integration](https://qdrant.tech/documentation/fastembed/) to simplify the inference process. -_Appears in:_ +#### Exploratory & Interactive Development -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +Legal search often requires iteration—tweaking prompts, reranking weights, or understanding why a clause ranked low. [Qdrant's Web UI](https://qdrant.tech/documentation/web-ui/) makes these loops visible and actionable through interactive experimentation, HTTP-based calls, visual debugging, and semantic similarity visualizations. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `cpu` _string_ | CPU specifies the CPU limit for each Qdrant node. | | | -| `memory` _string_ | Memory specifies the memory limit for each Qdrant node. | | | -| `storage` _string_ | Storage specifies the storage amount for each Qdrant node. | | | -| `requests` _[ResourceRequests](https://qdrant.tech/documentation/private-cloud/api-reference/#resourcerequests)_ | Requests specifies the resource requests for each Qdrant node. | | | +#### Enterprise-Grade Capabilities -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#restoredestination) RestoreDestination +From law firms to global SaaS providers, enterprise LegalTech builders need auditability, control, and compliance. Qdrant comes equipped with the features that keep security teams happy. -_Appears in:_ +Qdrant’s enterprise-ready features, including RBAC (including on Cloud), SSO, Database API Keys (down to the vector level), comprehensive monitoring, and observability, ensure secure, compliant, and manageable deployments at scale. -- [QdrantClusterRestoreSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorespec) +Qdrant is also SOC II Type II and HIPAA compliant ([link](https://app.drata.com/trust/9cbbb75b-0c38-11ee-865f-029d78a187d9)). -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `name` _string_ | Name of the destination cluster | | | -| `namespace` _string_ | Namespace of the destination cluster | | | +### Build legal AI that balances accuracy, compliance, and cost at scale -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#restorephase) RestorePhase +The challenge isn’t just building something that works—it’s building something that works at scale, under legal scrutiny, and with economic efficiency. Qdrant gives LegalTech teams that edge. -_Underlying type:_ _string_ +Successfully navigating LegalTech challenges requires careful balance across accuracy, compliance, scalability, and cost. Qdrant provides a comprehensive, flexible, and powerful vector search stack, empowering LegalTech to build robust and reliable AI applications. -_Appears in:_ +Ready to build? 
Start exploring Qdrant’s capabilities today through [Qdrant Cloud](https://cloud.qdrant.io/login) to strategically manage and advance your legal-tech applications. -- [QdrantClusterRestoreStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorestatus) +<|page-306-lllmstxt|> +## Qdrant Attains SOC 2 Type II and HIPAA Certifications: Strengthening Our Commitment to Enterprise Security -| Field | Description | -| --- | --- | -| `Running` | | -| `Skipped` | | -| `Failed` | | -| `Succeeded` | | +At Qdrant, we're proud to announce that we've successfully renewed our SOC 2 Type II certification and attained our HIPAA compliance certification ([link](http://qdrant.to/trust-center)). This continued achievement highlights our unwavering dedication to maintaining robust security, confidentiality, and compliance standards, especially critical in supporting [enterprise-scale operations](https://qdrant.tech/enterprise-solutions/) and sensitive data management. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#restoresource) RestoreSource +### SOC 2 Type II: Continuous Commitment to Security -_Appears in:_ +Building on our initial SOC 2 Type II certification from 2024, Qdrant sustained our rigorous security and operational practices over a full 12-month observation period. SOC 2 Type II audits meticulously assess the practical implementation of security measures aligned with the American Institute of Certified Public Accountants (AICPA) Trust Services criteria: -- [QdrantClusterRestoreSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterrestorespec) +* **Security** -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `snapshotName` _string_ | SnapshotName is the name of the snapshot from which we wish to restore | | | -| `namespace` _string_ | Namespace of the snapshot | | | +* **Confidentiality** -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#scheduledsnapshotphase) ScheduledSnapshotPhase +* **Availability** -_Underlying type:_ _string_ +Our latest audit further demonstrates our ongoing commitment to uphold these stringent standards throughout our operations. -_Appears in:_ +### Introducing HIPAA Compliance: Protecting Sensitive Healthcare Data -- [QdrantClusterScheduledSnapshotStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterscheduledsnapshotstatus) +In addition to SOC 2 Type II, Qdrant now holds HIPAA compliance certification. HIPAA (Health Insurance Portability and Accountability Act) certification is essential for healthcare organizations and those handling Protected Health Information (PHI). With this certification, Qdrant ensures that enterprises leveraging our platform can confidently manage sensitive healthcare data, adhering to stringent regulatory requirements. -| Field | Description | -| --- | --- | -| `Active` | | -| `Disabled` | | +### Empowering Enterprise Security with Advanced Features -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#storageclass) StorageClass +Our SOC 2 Type II and HIPAA certifications complement Qdrant’s comprehensive suite of [enterprise security features](https://qdrant.tech/enterprise-solutions/) designed specifically for robust enterprise needs: -_Appears in:_ +* **Single Sign-On (SSO)**: Streamline user authentication and enhance security through centralized identity management. 
-- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +* **Cloud RBAC (Role-Based Access Control)**: Precisely control access permissions, ensuring users have appropriate rights for their roles. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `name` _string_ | Name specifies the name of the storage class | | | -| `default` _boolean_ | Default specifies whether the storage class is the default storage class | | | -| `provisioner` _string_ | Provisioner specifies the provisioner of the storage class | | | -| `allowVolumeExpansion` _boolean_ | AllowVolumeExpansion specifies whether the storage class allows volume expansion | | | -| `reclaimPolicy` _string_ | ReclaimPolicy specifies the reclaim policy of the storage class | | | -| `parameters` _object (keys:string, values:string)_ | Parameters specifies the parameters of the storage class | | | +* **Observability and Monitoring**: Maintain visibility and operational integrity through advanced monitoring tools that track and analyze system health and performance in real-time. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#storageclassnames) StorageClassNames +* **Database API Keys**: Securely manage database access and safeguard sensitive operations with scoped API key functionality. -_Appears in:_ +### Future Security Initiatives -- [QdrantClusterSpec](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclusterspec) +Qdrant remains dedicated to continually enhancing our security posture. Our annual SOC 2 Type II audits and ongoing compliance with HIPAA standards ensure that our practices adapt to emerging threats and evolving industry expectations, reinforcing the trust our enterprise customers place in us. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `db` _string_ | DB specifies the storage class name for db volume. | | | -| `snapshots` _string_ | Snapshots specifies the storage class name for snapshots volume. | | | +<|page-307-lllmstxt|> +## ​​Introducing the Official Qdrant Node for n8n -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#storageconfig) StorageConfig +Amazing news for n8n builders working with semantic search: Qdrant now has an [official, team-supported node for n8n](https://www.npmjs.com/package/n8n-nodes-qdrant), an early adopter of n8n's new [verified community nodes](https://docs.n8n.io/integrations/creating-nodes/deploy/submit-community-nodes/#submit-your-node-for-verification-by-n8n) feature! -_Appears in:_ +This new integration brings the full power of Qdrant directly into your n8n workflows: no more wrestling with HTTP nodes ever again! +Whether you’re building RAG systems, agentic pipelines, or advanced data analysis tools, this node is designed to make your life easier and your solutions more robust. 
-- [QdrantConfiguration](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantconfiguration) +## Why This Matters -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `performance` _[StoragePerformanceConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#storageperformanceconfig)_ | Performance configuration | | | +Previously, using Qdrant in n8n frequently meant configuring HTTP request nodes due to limited access to advanced Qdrant features like batch upserts and updates, hybrid search and recommendations, discovery search and distance matrix API, and many, many more. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#storageperformanceconfig) StoragePerformanceConfig +The new Qdrant node changes that. -_Appears in:_ +It supports everything Qdrant can do, including hybrid queries, reranking with multivectors, sophisticated filtering, and all that the latest [Qdrant 1.14.0](https://qdrant.tech/blog/qdrant-1.14.x/) release provides. -- [StorageConfig](https://qdrant.tech/documentation/private-cloud/api-reference/#storageconfig) +The Qdrant node is available for **both cloud and self-hosted n8n instances**, starting from version **1.94.0**. Installing it is as simple as clicking “Install” button, you’ll be up and running in seconds. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `optimizer_cpu_budget` _integer_ | OptimizerCPUBudget defines the number of CPU allocation.
If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size
If negative - subtract this number of CPUs from the available CPUs.
If positive - use this exact number of CPUs. | | | -| `async_scorer` _boolean_ | AsyncScorer enables io\_uring when rescoring | | | +## Start Using it Now: How to Install & Use the Node for Hybrid Search -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#traefikconfig) TraefikConfig +We filmed a short demo for you on how to use this new node for **hybrid search** in Qdrant, as we thought it would be super handy for you to know how to combine the precision of keyword-based (sparse) search with the semantic power of dense embeddings in your n8n solutions. -_Appears in:_ +It's especially valuable in domains like legal or medical, where both exact matches and contextual understanding are crucial, so we love that you now have access to a method that delivers relevant results for complex, domain-specific queries! -- [Ingress](https://qdrant.tech/documentation/private-cloud/api-reference/#ingress) +This toy example walks you through fusing dense and lexical hybrid search results using Reciprocal Rank Fusion (RRF). -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `allowedSourceRanges` _string array_ | AllowedSourceRanges specifies the allowed CIDR source ranges for the ingress. | | | -| `entryPoints` _string array_ | EntryPoints is the list of traefik entry points to use for the ingress route.
If nothing is set, it will take the entryPoints configured in the operator config. | | | + -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#volumesnapshotclass) VolumeSnapshotClass +Naturally, you don't have to stop at a simple example! Everything we suggested in our [Hybrid Search article](https://qdrant.tech/articles/hybrid-search/) is now possible to use natively, even our latest [Score-Boosting Reranker](https://qdrant.tech/blog/qdrant-1.14.x/#score-boosting-reranker), which allows for complete customization of the rescoring formula. -_Appears in:_ +## Explore More and Get Involved -- [QdrantCloudRegionStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantcloudregionstatus) +This example is just the beginning. You can now build Qdrant-based solutions, from advanced RAG to complicated agentic systems, all natively within n8n. If you want to see more tutorials or have specific use cases in mind, let us know in the comments on the video or join our community discussions. -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `name` _string_ | Name specifies the name of the volume snapshot class | | | -| `driver` _string_ | Driver specifies the driver of the volume snapshot class | | | +We welcome your feedback, suggestions, and contributions on GitHub! Don’t forget to star the [new node's repo](https://github.com/qdrant/n8n-nodes-qdrant) and join our [Discord community](https://discord.gg/njJFNKXj) if you have questions or want to connect with other users. -#### [Anchor](https://qdrant.tech/documentation/private-cloud/api-reference/\#volumesnapshotinfo) VolumeSnapshotInfo +## Resources -_Appears in:_ +* [Qdrant n8n Node on npm](https://www.npmjs.com/package/n8n-nodes-qdrant) +* [GitHub Repo](https://github.com/qdrant/n8n-nodes-qdrant) +* [Video: Connecting Qdrant to n8n](https://youtu.be/fYMGpXyAsfQ?feature=shared&t=194) +* [Qdrant's n8n User Docs](https://qdrant.tech/documentation/platforms/n8n/) -- [QdrantClusterSnapshotStatus](https://qdrant.tech/documentation/private-cloud/api-reference/#qdrantclustersnapshotstatus) +<|page-308-lllmstxt|> +Want to learn how to build an AI system that answers questions about your knowledge base? -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `volumeSnapshotName` _string_ | VolumeSnapshotName is the name of the volume snapshot | | | -| `volumeName` _string_ | VolumeName is the name of the volume that was backed up | | | -| `readyToUse` _boolean_ | ReadyToUse indicates if the volume snapshot is ready to use | | | -| `snapshotHandle` _string_ | SnapshotHandle is the identifier of the volume snapshot in the respective cloud provider | | | +We’re excited to announce our partnership with Alexey Grigorev and DataTalks.Club to bring you a free, hands-on, 10-week course focused on building real-life applications of LLMs. -##### Was this page useful? +Gain hands-on experience with LLMs, RAG, vector search, evaluation, monitoring, and more. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Learn RAG and Vector Search +In this course, you'll learn how to create an AI system that can answer questions about your own knowledge base using LLMs and RAG. -Thank you for your feedback! 🙏 +Week 1 introduces the fundamentals of LLMs and RAG. Week 2 is where the vector search magic begins. -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/api-reference.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## What You'll Learn from Qdrant's Experts +Qdrant’s team will guide you through both foundational and advanced concepts in vector and hybrid search: -On this page: +Evgeniya (Jenny) Sukhodolskaya, Developer Advocate at Qdrant +→ Learn how to run locally semantic similarity search with Qdrant and [FastEmbed](https://qdrant.tech/documentation/fastembed/) (Qdrant's optimized embedding solution) as well as gain visual understanding of vector search with Qdrant's [WebUI](https://qdrant.tech/documentation/web-ui/). -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/private-cloud/api-reference.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Kacper Ɓukawski, Senior Developer Advocate at Qdrant +→ Dive into [hybrid search](https://qdrant.tech/articles/hybrid-search/): combining lexical and vector search for better results. You’ll also explore multi-vector search, [reranking](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/), and [late interaction models](https://qdrant.tech/articles/late-interaction-models/). -× +## How to Join +The course is 100% free and online, with all materials publicly available. -[Powered by](https://qdrant.tech/) +[Start the course today!](https://github.com/DataTalksClub/llm-zoomcamp). -<|page-163-lllmstxt|> -## seed-round -- [Articles](https://qdrant.tech/articles/) -- On Unstructured Data, Vector Databases, New AI Age, and Our Seed Round. +<|page-309-lllmstxt|> +## Qovery Scales Real-Time DevOps Automation with Qdrant -[Back to Qdrant Articles](https://qdrant.tech/articles/) +![How Qovery Accelerated Developer Autonomy with Qdrant](/blog/case-study-qovery/case-study-qovery-summary-dark.png) -# On Unstructured Data, Vector Databases, New AI Age, and Our Seed Round. +### Empowering Developers with Autonomous Infrastructure Management -Andre Zayarni +Qovery, trusted by over 200 companies including Alan, Talkspace, GetSafe, and RxVantage, empowers software engineering teams to autonomously manage their infrastructure through its robust DevOps automation platform. As their platform evolved, Qovery recognized an opportunity to enhance developer autonomy further by integrating an AI-powered DevOps Copilot. To achieve real-time accuracy and rapid responses, Qovery selected Qdrant as the backbone of their vector database infrastructure. -· +### Reducing Dependency on Specialized DevOps Expertise -April 19, 2023 +Qovery’s ambitious vision for the DevOps Copilot ([read more here](https://www.qovery.com/blog/how-we-built-an-agentic-devops-copilot-to-automate-infrastructure-tasks-and-beyond/)) was to drastically reduce the reliance on highly specialized DevOps engineers. The goal was to empower software developers—even those without deep DevOps expertise—to manage complex infrastructure tasks through natural language interactions. However, this required extremely accurate, fast, and scalable infrastructure capable of managing a large, continuously updated corpus of data. With 5 years of historical usage data stored in BigQuery and an indexed volume rapidly approaching 500,000 vectors, Qovery needed a solution that was both performant and easy to manage. 
-![On Unstructured Data, Vector Databases, New AI Age, and Our Seed Round.](https://qdrant.tech/articles_data/seed-round/preview/title.jpg) +![Qovery API](/blog/case-study-qovery/api-qovery.png) -> Vector databases are here to stay. The New Age of AI is powered by vector embeddings, and vector databases are a foundational part of the stack. At Qdrant, we are working on cutting-edge open-source vector similarity search solutions to power fantastic AI applications with the best possible performance and excellent developer experience. -> -> Our 7.5M seed funding – led by [Unusual Ventures](https://www.unusual.vc/), awesome angels, and existing investors – will help us bring these innovations to engineers and empower them to make the most of their unstructured data and the awesome power of LLMs at any scale. +*Qovery API* -We are thrilled to announce that we just raised our seed round from the best possible investor we could imagine for this stage. Let’s talk about fundraising later – it is a story itself that I could probably write a bestselling book about. First, let’s dive into a bit of background about our project, our progress, and future plans. +### Seamless Integration of Scalable and Efficient Vector Search -## [Anchor](https://qdrant.tech/articles/seed-round/\#a-need-for-vector-databases) A need for vector databases. +Qovery chose [Qdrant Cloud](https://qdrant.tech/cloud/) after carefully evaluating several options. Romaric PhilogĂšne, CEO and co-founder of Qovery, highlighted the importance of open-source credibility, performance, ease of use, and scalability. Qdrant’s native support for [real-time indexing](https://qdrant.tech/documentation/concepts/indexing/) and low-latency queries made it ideal for handling Qovery’s significant data volume and frequency of updates. -Unstructured data is growing exponentially, and we are all part of a huge unstructured data workforce. This blog post is unstructured data; your visit here produces unstructured and semi-structured data with every web interaction, as does every photo you take or email you send. The global datasphere will grow to [165 zettabytes by 2025](https://github.com/qdrant/qdrant/pull/1639), and about 80% of that will be unstructured. At the same time, the rising demand for AI is vastly outpacing existing infrastructure. Around 90% of machine learning research results fail to reach production because of a lack of tools. +The integration process was straightforward, with minimal operational overhead, enabling the Qovery team to focus their resources on enhancing the Copilot's capabilities rather than maintaining complex database infrastructure. With its Rust-based architecture, Qdrant delivered the speed, accuracy, and low resource utilization Qovery required. -![Vector Databases Demand](https://qdrant.tech/articles_data/seed-round/demand.png) +*"Qdrant is incredibly performant and stable, with virtually no maintenance overhead. It simply works, letting our engineers focus on the Copilot’s features instead of database management."* + — Romaric PhilogĂšne, CEO, Qovery -Demand for AI tools +![Qovery Background Ingestion](/blog/case-study-qovery/background-ingestion-qovery.png) -Thankfully there’s a new generation of tools that let developers work with unstructured data in the form of vector embeddings, which are deep representations of objects obtained from a neural network model. 
A vector database, also known as a vector similarity search engine or approximate nearest neighbour (ANN) search database, is a database designed to store, manage, and search high-dimensional data with an additional payload. Vector Databases turn research prototypes into commercial AI products. Vector search solutions are industry agnostic and bring solutions for a number of use cases, including classic ones like semantic search, matching engines, and recommender systems to more novel applications like anomaly detection, working with time series, or biomedical data. The biggest limitation is to have a neural network encoder in place for the data type you are working with. +*Qovery Background Ingestion* -![Vector Search Use Cases](https://qdrant.tech/articles_data/seed-round/use-cases.png) +### Real-time Infrastructure Management at Scale -Vector Search Use Cases +Qovery’s implementation of Qdrant provided immediate benefits, notably in the speed and accuracy critical to DevOps operations. The DevOps Copilot drastically reduced the time developers spent waiting for infrastructure-related tasks. Actions that previously required hours or days are now executed within seconds, enabling Qovery's customers to iterate faster and deploy more reliably. -With the rise of large language models (LLMs), Vector Databases have become the fundamental building block of the new AI Stack. They let developers build even more advanced applications by extending the “knowledge base” of LLMs-based applications like ChatGPT with real-time and real-world data. +*"The integration of Qdrant was so seamless and straightforward—it allowed us to rapidly scale our capabilities and deliver real-time, precise infrastructure management."* + — Romaric PhilogĂšne, CEO, Qovery -A new AI product category, “Co-Pilot for X,” was born and is already affecting how we work. Starting from producing content to developing software. And this is just the beginning, there are even more types of novel applications being developed on top of this stack. +![Qovery Application](/blog/case-study-qovery/devops-qovery.png) -![New AI Stack](https://qdrant.tech/articles_data/seed-round/ai-stack.png) +*Qovery Application* -New AI Stack +Qovery currently manages over 100,000 vectors, with a trajectory to exceed 500,000 within two months. Even at this scale, Qdrant maintained rapid response times and accuracy, allowing Qovery to confidently scale their AI-driven DevOps services to more companies without compromising quality. -## [Anchor](https://qdrant.tech/articles/seed-round/\#enter-qdrant) Enter Qdrant. +### Conclusion: Streamlined Innovation and Enhanced Developer Experience -At the same time, adoption has only begun. Vector Search Databases are replacing VSS libraries like FAISS, etc., which, despite their disadvantages, are still used by ~90% of projects out there They’re hard-coupled to the application code, lack of production-ready features like basic CRUD operations or advanced filtering, are a nightmare to maintain and scale and have many other difficulties that make life hard for developers. +Qovery’s experience demonstrates how a performant, easy-to-use vector database can significantly accelerate product innovation and reduce operational complexity. With Qdrant, Qovery could concentrate its engineering resources on value-added features rather than database management, ultimately leading to faster development cycles and happier customers. -The current Qdrant ecosystem consists of excellent products to work with vector embeddings. 
We launched our managed vector database solution, Qdrant Cloud, early this year, and it is already serving more than 1,000 Qdrant clusters. We are extending our offering now with managed on-premise solutions for enterprise customers. +<|page-310-lllmstxt|> +# How Tripadvisor Is Reimagining Travel with Qdrant -![Qdrant Vector Database Ecosystem](https://qdrant.tech/articles_data/seed-round/ecosystem.png) +![How Tripadvisor Drives 2–3x More Revenue with Qdrant-Powered AI](/blog/case-study-tripadvisor/case-study-tripadvisor-summary-dark.jpg) -Qdrant Ecosystem +Tripadvisor, the world’s largest travel guidance platform, is undergoing a deep transformation. With hundreds of millions of monthly users and over a billion reviews and contributions, it holds one of the richest datasets in the travel industry. And until recently, that data, particularly its unstructured content, had incredible untapped potential. Now, with the rise of generative AI and the adoption of tools like Qdrant’s vector database, Tripadvisor is unlocking its full potential to deliver intelligent, personalized, and high-impact travel experiences. -Our plan for the current [open-source roadmap](https://github.com/qdrant/qdrant/blob/master/docs/roadmap/README.md) is to make billion-scale vector search affordable. Our recent release of the [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/) improves both memory usage (x4) as well as speed (x2). Upcoming [Product Quantization](https://www.irisa.fr/texmex/people/jegou/papers/jegou_searching_with_quantization.pdf) will introduce even another option with more memory saving. Stay tuned. +## Activating Billions of Data Assets -Qdrant started more than two years ago with the mission of building a vector database powered by a well-thought-out tech stack. Using Rust as the system programming language and technical architecture decision during the development of the engine made Qdrant the leading and one of the most popular vector database solutions. +The shift was driven by [Rahul Todkar](https://www.linkedin.com/in/rahultodkar/), Head of Data and AI who had previously overseen global AI teams. With a background in building complex data and AI products, including LinkedIn’s data, AI and vector systems, he immediately recognized the opportunity at Tripadvisor: a billion user reviews and contributions \- including hundreds of millions of images \- and years of behavioral data across hotels, restaurants, and experiences. -Our unique custom modification of the [HNSW algorithm](https://qdrant.tech/articles/filtrable-hnsw/) for Approximate Nearest Neighbor Search (ANN) allows querying the result with a state-of-the-art speed and applying filters without compromising on results. Cloud-native support for distributed deployment and replications makes the engine suitable for high-throughput applications with real-time latency requirements. Rust brings stability, efficiency, and the possibility to make optimization on a very low level. In general, we always aim for the best possible results in [performance](https://qdrant.tech/benchmarks/), code quality, and feature set. +The goal was ambitious: convert this sprawling, multimodal dataset into a dynamic, AI-driven platform that enhances both user experience and business impact. -Most importantly, we want to say a big thank you to our [open-source community](https://qdrant.to/discord), our adopters, our contributors, and our customers. 
Your active participation in the development of our products has helped make Qdrant the best vector database on the market. I cannot imagine how we could do what we’re doing without the community or without being open-source and having the TRUST of the engineers. Thanks to all of you! +## Generative AI at the Core -I also want to thank our team. Thank you for your patience and trust. Together we are strong. Let’s continue doing great things together. +Tripadvisor’s generative AI strategy is built on three pillars: enhancing the user experience, improving internal productivity, and externalizing its AI capabilities to meet users beyond its own platform. -## [Anchor](https://qdrant.tech/articles/seed-round/\#fundraising) Fundraising +The most tangible result to date is Tripadvisor’s AI **Trip Planner**, a generative AI-powered tool that helps travelers build personalized itineraries through conversational prompts. This isn’t just a fancy feature; it’s delivering real impact. -The whole process took only a couple of days, we got several offers, and most probably, we would get more with different conditions. We decided to go with Unusual Ventures because they truly understand how things work in the open-source space. They just did it right. +### 2 to 3x Revenue Uplift from GenAI-Engaged Users -Here is a big piece of advice for all investors interested in open-source: Dive into the community, and see and feel the traction and product feedback instead of looking at glossy pitch decks. With Unusual on our side, we have an active operational partner instead of one who simply writes a check. That help is much more important than overpriced valuations and big shiny names. +Since launch, Trip Planner has shown that users who interact with the generative AI tool spend significantly more. The company reports a **2 to 3 times increase in revenue** from travelers using the new experience compared to those sticking with traditional interfaces. The AI not only simplifies the planning process but also helps surface better, more relevant options—driving higher conversion and satisfaction. -Ultimately, the community and adopters will decide what products win and lose, not VCs. Companies don’t need crazy valuations to create products that customers love. You do not need Ph.D. to innovate. You do not need to over-engineer to build a scalable solution. You do not need ex-FANG people to have a great team. You need clear focus, a passion for what you’re building, and the know-how to do it well. +## From Filters to Conversations: Reinventing Search -We know how. +Tripadvisor is now focused on **reimagining search**—replacing outdated filter-and-tab systems with interactive, bidirectional search interfaces. Instead of typing rigid queries and sifting through static results, users will soon explore destinations, restaurants, or activities through dynamic, context-aware conversations. -PS: This text is written by me in an old-school way without any ChatGPT help. Sometimes you just need inspiration instead of AI ;-) +For example, a traveler might ask: *“What are the best things to do in Lisbon this weekend with kids?”* Instead of sending the user to ten different blogs, the system will synthesize data across reviews, listings, photos, and preferences to deliver a tailored answer directly in the Tripadvisor product. -##### Was this page useful? 
+## Why Qdrant Is Central to This Vision -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +To support all of this, Tripadvisor needed a way to store and retrieve complex, high-dimensional data, ranging from semantic embeddings to user behavior patterns. -Thank you for your feedback! 🙏 +The team is using Qdrant to build a **user graph**, a multidimensional representation of how users engage with different aspects of a trip: hotels, attractions, food, travel styles, and more. Vector-based retrieval allows them to query this data flexibly and serve responses that feel genuinely personalized. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/seed-round.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +And unlike traditional databases, Qdrant is built for **real-time, unstructured data**, making it ideal for powering conversational AI, search augmentation, and recommendation engines. -On this page: +*“Qdrant has been crucial for our transformation. When you're dealing with over a billion plus user-generated, multi-modal pieces of content from hundreds of millions of monthly active users across 21 countries, 11M businesses and all the complex user interactions that come with it, you need a way to bring it all together. Now, we can represent everything from hotel preferences to restaurant choices to user behavior in a unified way. And we’re seeing real business results. Users engaging with our AI-powered features like trip planning are showing 2-3x more revenue.”* -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/seed-round.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +[*Rahul Todkar*](https://www.linkedin.com/in/rahultodkar) *\- Head of Data and AI* -× +## What’s Next -[Powered by](https://qdrant.tech/) +With Qdrant as a foundational layer, Tripadvisor is only just beginning to tap into the power of its data. The team is already exploring new use cases and looking to deepen its integration of vector search across every stage of the customer journey. And as interest grows in shared learnings and industry best practices, Tripadvisor is also helping shape how other companies apply AI in the real world. -<|page-164-lllmstxt|> -## search-feedback-loop -- [Articles](https://qdrant.tech/articles/) -- Relevance Feedback in Informational Retrieval +This isn’t just a story about search; it’s about building a smarter, more adaptive travel experience. One where data is no longer just stored, but truly put to work. -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +<|page-311-lllmstxt|> +## Precision at Scale: How Aracor Uses Qdrant to Accelerate Legal Due Diligence Resulting in 90% Faster Workflows ## -# Relevance Feedback in Informational Retrieval +![How Aracor Sped Up Due Diligence Workflows by 90%](/blog/case-study-aracor/case-study-aracor-bento-dark.jpg) -Evgeniya Sukhodolskaya +### How Aracor Accelerated Legal Due Diligence with Qdrant Vector Search -· +The world of mergers and acquisitions (M\&A) is notoriously painstaking, slow, expensive and error-prone. Lawyers spend weeks combing through thousands of documents—validating signatures, comparing versions, and flagging risks. 
-March 27, 2025 +Lawyers and dealmakers sift through mountains of documents—often numbering into the thousands—to validate every detail, from validating signatures, comparing the documents involved in the deal transaction, flagging risks to to patent validity. This meticulous process typically drains weeks or even months of productivity from highly trained professionals. [Aracor AI](https://aracor.ai/) set out to change that and to solve the M\&A transparency gap. The Miami-based AI platform is laser-focused on transforming this painstaking due diligence into an automated, accurate, and dramatically faster operation. -![Relevance Feedback in Informational Retrieval](https://qdrant.tech/articles_data/search-feedback-loop/preview/title.jpg) +### The Challenge: Mountains of Documents, Mountains of Pain -> A problem well stated is a problem half solved. +Before Aracor, M\&A lawyers faced countless hours verifying signatures, comparing contract versions, and manually summarizing massive troves of legal documentation. Traditional attempts at automation—such as generic summaries from AI tools like ChatGPT—fell short, lacking the critical accuracy, citations, and domain-specific context lawyers demand. The process was expensive, slow, and fraught with potential inaccuracies. -This quote applies as much to life as it does to information retrieval. +### Legal AI at Scale: Hybrid, Filtered, Multitenant Search -With a well-formulated query, retrieving the relevant document becomes trivial. -In reality, however, most users struggle to precisely define what they are searching for. +Aracor built an end-to-end AI platform specifically tailored for the complex, precise requirements of dealmakers: family offices, private equity firms, venture capitalists, and other high-stakes investors. At the core of their innovation was a robust vector search capability provided by Qdrant. -While users may struggle to formulate a perfect request — especially in unfamiliar topics — they can easily judge whether a retrieved answer is relevant or not. +Lesly Arun Franco, CTO of Aracor, explains, "Search is a massive problem. Our platform ingests thousands of legal documents, each requiring precise retrieval and accurate citations. Without Qdrant, delivering this level of performance and scale was nearly impossible." -**Relevance is a powerful feedback mechanism for a retrieval system** to iteratively refine results in the direction of user interest. +By adopting Qdrant's vector database, Aracor gained the critical ability to efficiently index and search through massive document repositories. This empowered the platform to automatically generate precise document summaries, accurate comparison reports, signature verifications, and essential citations for every finding—features indispensable in the rigorous legal world. -In 2025, with social media flooded with daily AI breakthroughs, it almost seems like information retrieval is solved, agents can iteratively adjust their search queries while assessing the relevance. +### Why Qdrant? -Of course, there’s a catch: these models still rely on retrieval systems ( _RAG isn’t dead yet, despite daily predictions of its demise_). -They receive only a handful of top-ranked results provided by a far simpler and cheaper retriever. -As a result, the success of guided retrieval still mainly depends on the retrieval system itself. +When Aracor first set out to integrate a vector database roughly eighteen months ago, Lesly and the team evaluated several vendors. 
Qdrant stood out for several reasons: -So, we should find a way of effectively and efficiently incorporating relevance feedback directly into a retrieval system. -In this article, we’ll explore the approaches proposed in the research literature and try to answer the following question: +* **Open-source and Self-hosted:** Qdrant provided a developer-friendly, easily deployable solution at a stage when Aracor needed both flexibility and affordability. -_If relevance feedback in search is so widely studied and praised as effective, why is it practically not used in dedicated vector search solutions?_ +* **Superior Scalability:** Unlike other databases, Qdrant offered seamless scaling and robust handling of immense document volumes—critical as Aracor’s client base rapidly expanded. -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#dismantling-the-relevance-feedback) Dismantling the Relevance Feedback +* **Hybrid and Metadata-Driven Search:** Qdrant made it possible to combine semantic search with structured filters—so users can instantly surface the exact clause, obligation, or restriction they need, even inside complex, nested legal documents. This dramatically improves speed, accuracy, and confidence in results. -Both industry and academia tend to reinvent the wheel here and there. -So, we first took some time to study and categorize different methods — just in case there was something we could plug directly into Qdrant. -The resulting taxonomy isn’t set in stone, but we aim to make it useful. +### Tangible Results with Qdrant -![Types of Relevance Feedback](https://qdrant.tech/articles_data/search-feedback-loop/relevance-feedback.png) +Since integrating Qdrant, Aracor has realized substantial operational improvements: -Types of Relevance Feedback +* **Massive Time Savings:** Tasks such as signature validation and document summarization, previously taking weeks, now complete in mere minutes. Customers report: 90% faster due diligence workflows, 70% reduction in document turnaround time, and 40% fewer legal hours required. -### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#pseudo-relevance-feedback-prf) Pseudo-Relevance Feedback (PRF) +* **Increased Accuracy:** High-quality citation and retrieval accuracy have significantly increased user trust, a crucial advantage in the meticulous legal environment. -Pseudo-Relevance feedback takes the top-ranked documents from the initial retrieval results and treats them as relevant. This approach might seem naive, but it provides a noticeable performance boost in lexical retrieval while being relatively cheap to compute. +* **Scalable Infrastructure:** Transitioning from self-hosted to Qdrant's cloud solution has streamlined operations, allowing Aracor’s technical team to focus on further feature development and optimization, such as integrating multimodal embeddings and hybrid search. -### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#binary-relevance-feedback) Binary Relevance Feedback +### Looking Forward -The most straightforward way to gather feedback is to ask users directly if document is relevant. -There are two main limitations to this approach: +With Qdrant handling the heavy lifting of scalable vector search, Aracor continues to innovate, working towards even more sophisticated multimodal and domain-specific embedding techniques. 
As they expand their platform, Aracor is confident in their capacity to support increasingly complex, high-volume document processing needs, all backed by the proven power and reliability of Qdrant’s vector database solution. -First, users are notoriously reluctant to provide feedback. Did you know that [Google once had](https://en.wikipedia.org/wiki/Google_SearchWiki#:~:text=SearchWiki%20was%20a%20Google%20Search,for%20a%20given%20search%20query) an upvote/downvote mechanism on search results but removed it because almost no one used it? +## -Second, even if users are willing to provide feedback, no relevant documents might be present in the initial retrieval results. In this case, the user can’t provide a meaningful signal. +<|page-312-lllmstxt|> +## Garden Accelerates Patent Intelligence with Qdrant’s Filterable Vector Search -Instead of asking users, we can ask a smart model to provide binary relevance judgements, but this would limit its potential to generate granular judgements. +![How Garden Unlocked AI Patent Analysis](/blog/case-study-garden/case-study-garden-bento-dark.jpg) -### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#re-scored-relevance-feedback) Re-scored Relevance Feedback +For more than a century, patent litigation has been a slow, people-powered business. Analysts read page after page—sometimes tens of thousands of pages—hunting for the smoking-gun paragraph that proves infringement or invalidity. Garden, a New York-based startup, set out to change that by applying large-scale AI to the entire global patent corpus—more than 200 million patents—in conjunction with terabytes of real world data. -We can also apply more sophisticated methods to extract relevance feedback from the top-ranked documents - machine learning models can provide a relevance score for each document. +*“Our customers need to compare millions of possible patent–product pairings in seconds, not days,” explains co-founder Justin Mack. “That means vector search that can handle huge data sets and surgical-grade filtering.”* -The obvious concern here is twofold: +### A data set that breaks naĂŻve vector search -1. How accurately can the automated judge determine relevance (or irrelevance)? -2. How cost-efficient is it? After all, you can’t expect GPT-4o to re-rank thousands of documents for every user query — unless you’re filthy rich. +Each patent can run to 100+ pages and, thanks to decades of revisions, carries roughly 2,000 metadata fields: jurisdiction, grant date, family ID, claim dependencies, and so on. Garden splits every patent into semantically meaningful chunks, producing “many hundreds of millions” of vectors. The same pipeline ingests real-world product data to compare against the patents. -Nevertheless, automated re-scored feedback could be a scalable way to improve search when explicit binary feedback is not accessible. +The engineering demands quickly outgrew Garden’s first solution, a fully-managed vector service. They had tens of gigabytes of data already costing ≈ $5,000 / month. And a lack of native filterable-HNSW meant that Garden had to stand up a separate index for every combination of country, date range, and technology tag. Finally, with no infrastructure visibility, troubleshooting was slow and expensive. -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#has-the-problem-already-been-solved) Has the Problem Already Been Solved? 
+A second migration to a self-hosted open-source alternative cut costs but introduced new pains: on-call operations for a two-person team, upgrades during business hours, and—crucially—the same filtering limitations. -Digging through research materials, we expected anything else but to discover that the first relevance feedback study dates back [_sixty years_](https://sigir.org/files/museum/pub-08/XXIII-1.pdf). -In the midst of the neural search bubble, it’s easy to forget that lexical (term-based) retrieval has been around for decades. Naturally, research in that field has had enough time to develop. +### Discovering Qdrant -**Neural search** — aka [vector search](https://qdrant.tech/articles/neural-search-tutorial/) — gained traction in the industry around 5 years ago. Hence, vector-specific relevance feedback techniques might still be in their early stages, awaiting production-grade validation and industry adoption. +When Garden found Qdrant’s blog post on filterable HNSW, the team realized they could get the search semantics they wanted without bolting on bespoke sharding logic. -As a [dedicated vector search engine](https://qdrant.tech/articles/dedicated-vector-search/), we would like to be these adopters. -Our focus is neural search, but approaches in both lexical and neural retrieval seem worth exploring, as cross-field studies are always insightful, with the potential to reuse well-established methods of one field in another. +*“Filterable HNSW was the deal-maker, but Qdrant Cloud’s *managed* Rust backbone sealed it,” says Mack. “We kept source-level transparency while off-loading 24×7 ops.”* -We found some interesting methods applicable to neural search solutions and additionally revealed a **gap in the neural search-based relevance feedback approaches**. Stick around, and we’ll share our findings! +* **Scalar quantization (8-bit)** keeps hot vectors in RAM while colder, full-precision embeddings sit on disk—perfect for Garden’s read-heavy, bursty workload. -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#two-ways-to-approach-the-problem) Two Ways to Approach the Problem +* **SLA-backed sub-100ms latency** meets Garden’s product target even when a user fires off thousands of queries in a single button-click. -Retrieval as a recipe can be broken down into three main ingredients: +* **Pay-for-what-you-use pricing** lets Garden store 10× more data for roughly the same cost it once paid for a fraction of the corpus. -1. Query -2. Documents -3. Similarity scoring between them. +### Migration in practice -![Research Field Taxonomy Overview](https://qdrant.tech/articles_data/search-feedback-loop/taxonomy-overview.png) +Garden already held all vectors in Google Cloud Storage. A weekend of scripted ETL pushed the embeddings into Qdrant Cloud. Because Qdrant’s ingestion API mirrors popular open-source conventions, the team only altered a few lines of an existing migration script. The heaviest lift—GPU-based embedding of 200M patents—was finished months earlier on a 2,000-GPU transient cluster. -Research Field Taxonomy Overview +### **Business impact** -Query formulation is a subjective process – it can be done in infinite configurations, making the relevance of a document unpredictable until the query is formulated and submitted to the system. 
+| KPI | Before Qdrant | After Qdrant | +| ----- | ----- | ----- | +| Addressable patent corpus | ≈ 20M | **200M+** | +| Vector data under management | tens of millions | **hundreds of millions** | +| Typical query latency | 250 – 400ms | **\< 100ms p95** | +| Cost per stored GB | baseline | **\~ 10× lower** | +| New revenue lines | 0 | **Full infringement-analysis product** | -So, adapting documents (or the search index) to relevance feedback would require per-request dynamic changes, which is impractical, considering that modern retrieval systems store billions of documents. +Filterable HNSW didn’t just speed up existing workflows; it unlocked an entirely new line of business—high-confidence infringement detection. Clients now click a button and receive a claim-chart quality analysis in minutes. For some enterprises that translates into seven-plus-figure licensing wins or decisive defense against patent trolls. -Thus, approaches for incorporating relevance feedback in search fall into two categories: **refining a query** and **refining the similarity scoring function** between the query and documents. +### Looking ahead -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#query-refinement) Query Refinement +As Garden’s customer base grows, query-per-second (QPS) requirements will rise faster than data volume. Meanwhile, Garden plans deeper enrichment of every patent—breaking long descriptions into structured facts the vector index can exploit. -There are several ways to refine a query based on relevance feedback. -Globally, we prefer to distinguish between two approaches: modifying the query as text and modifying the vector representation of the query. +*“We don’t have to think about the vector layer anymore,” Mack notes. “Qdrant lets us focus on the IP insights our customers pay for.”* -![Incorporating Relevance Feedback in Query](https://qdrant.tech/articles_data/search-feedback-loop/query.png) +<|page-313-lllmstxt|> +# Exploring Qdrant Cloud just got easier -Incorporating Relevance Feedback in Query +We always aim to simplify our product for developers, platform teams, and enterprises. -### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#query-as-text) Query As Text +Here’s a quick overview of recent improvements designed to simplify your journey from login, creating your first cluster, prototyping, and going to production. -In **term-based retrieval**, an intuitive way to improve a query would be to **expand it with relevant terms**. It resembled the “ _aha, so that’s what it’s called_” stage in the discovery search. + -Before the deep learning era of this century, expansion terms were mainly selected using statistical or probabilistic models. The idea was to: +## Simplified Login -1. Either extract the **most frequent** terms from (pseudo-)relevant documents; -2. Or the **most specific** ones (for example, according to IDF); -3. Or the **most probable** ones (most likely to be in query according to a relevance set). +We've reduced the steps to create and access your account, and also simplified navigation between login and registration. -Well-known methods of those times come from the family of [Relevance Models](https://sigir.org/wp-content/uploads/2017/06/p260.pdf), where terms for expansion are chosen based on their probability in pseudo-relevant documents (how often terms appear) and query terms likelihood given those pseudo-relevant documents - how strongly these pseudo-relevant documents match the query. 
+![JPG of log in page](/blog/product-ui-changes/log-in-page.jpg) -The most famous one, `RM3` – interpolation of expansion terms probability with their probability in a query – is still appearing in papers of the last few years as a (noticeably decent) baseline in term-based retrieval, usually as part of [anserini](https://github.com/castorini/anserini). +Upon log in, of course you continue to have the option to toggle between dark and light mode, or choose your system default. -![Simplified Query Expansion](https://qdrant.tech/articles_data/search-feedback-loop/relevance-models.png) +![GIF of toggling between dark and light mode](/blog/product-ui-changes/gif-dark-light-mode.gif) -Simplified Query Expansion +Now for the biggest improvements. -With the time approaching the modern machine learning era, [multiple](https://aclanthology.org/2020.findings-emnlp.424.pdf) [studies](https://dl.acm.org/doi/10.1145/1390334.1390377) began claiming that these traditional ways of query expansion are not as effective as they could be. +## Effortless Cluster Creation -Started with simple classifiers based on hand-crafted features, this trend naturally led to use the famous [BERT (Bidirectional encoder representations from transformers)](https://huggingface.co/docs/transformers/model_doc/bert). For example, `BERT-QE` (Query Expansion) authors came up with this schema: +When you log in for your first time, it’s incredibly easy to create your first cluster (completely free, no need to share any payment details). -1. Get pseudo-relevance feedback from the finetuned BERT reranker (~10 documents); -2. Chunk these pseudo-relevant documents (~100 words) and score query-chunk relevance with the same reranker; -3. Expand the query with the most relevant chunks; -4. Rerank 1000 documents with the reranker using the expanded query. +Simply name your first cluster, choose your region, and click “Create.” Your cluster will start spinning up immediately\! -This approach significantly outperformed BM25 + RM3 baseline in experiments (+11% NDCG@20). However, it required **11.01x** more computation than just using BERT for reranking, and reranking 1000 documents with BERT would take around 9 seconds alone. +The 1 GB free gives you enough storage for approximately 1 million vectors at 768 dimensions and is great for prototyping and learning. -Query term expansion can _hypothetically_ work for neural retrieval as well. New terms might shift the query vector closer to that of the desired document. However, [this approach isn’t guaranteed to succeed](https://dl.acm.org/doi/10.1145/3570724). Neural search depends entirely on embeddings, and how those embeddings are generated — consequently, how similar query and document vectors are — depends heavily on the model’s training. +![GIF of the first cluster being created](/blog/product-ui-changes/my-first-cluster.gif) -It definitely works if **query refining is done by a model operating in the same vector space**, which typically requires offline training of a retriever. -The goal is to extend the query encoder input to also include feedback documents, producing an adjusted query embedding. Examples include [`ANCE-PRF`](https://arxiv.org/pdf/2108.13454) and [`ColBERT-PRF`](https://dl.acm.org/doi/10.1145/3572405) – ANCE and ColBERT fine-tuned extensions. +Don’t forget to create a collection, add vectors, then run your first search. 
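+If you would rather do that first round trip from code than from the UI, a minimal version with the Python client looks roughly like this (the cluster URL, API key, and collection name are placeholders for your own values):
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR-API-KEY")
+
+client.create_collection(
+    collection_name="my-first-collection",
+    vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE),
+)
+
+client.upsert(
+    collection_name="my-first-collection",
+    points=[
+        models.PointStruct(id=1, vector=[0.05, 0.61, 0.76, 0.74], payload={"city": "Berlin"}),
+        models.PointStruct(id=2, vector=[0.19, 0.81, 0.75, 0.11], payload={"city": "London"}),
+    ],
+)
+
+# Your first search: the closest stored point to the query vector.
+print(client.query_points(collection_name="my-first-collection", query=[0.2, 0.1, 0.9, 0.7], limit=1).points)
+```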
-![Generating a new relevance-aware query vector](https://qdrant.tech/articles_data/search-feedback-loop/updated-encoder.png)
+## Cluster Overview
-Generating a new relevance-aware query vector
+Now this is probably where you will spend most of your time when building with Qdrant.
-The reason why you’re most probably not familiar with these models – their absence in the industry – is that their **training** itself is a **high upfront cost**, and even though it was “paid”, these models [struggle with generalization](https://arxiv.org/abs/2108.13454), performing poorly on out-of-domain tasks (datasets they haven’t seen during training).
-Additionally, feeding an attention-based model a lengthy input (query + documents) is not a good practice in production settings (attention is quadratic in the input length), where time and money are crucial decision factors.
+When looking at the overview of your cluster, we’ve added new tabs with an improved menu structure.
-Alternatively, one could skip a step — and work directly with vectors.
+![Image of cluster overview](/blog/product-ui-changes/my-first-cluster-overview.jpg)
-### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#query-as-vector) Query As Vector
+* **Overview**: This has everything you need at a glance, including a visual of your node, disk, RAM, and CPU usage so you can see if you are approaching any limits. It’s also easier now to scale your cluster. Once scaled to a paid tier via credit card or marketplace, you can access backup and disaster recovery, a 99.5% uptime SLA, horizontal and vertical scaling, monitoring and log management, and more.
+* **API keys**: Manage access to your database cluster
+* **Metrics**: A visualization of your resources, RAM, CPU, disk and requests over different timeframes
+* **Logs**: Get a real-time window into what’s happening inside your cluster for transparency, diagnostics, and control (especially important during debugging, performance tuning, or infrastructure troubleshooting\!)
+* **Backups**: View snapshots of your vector data and metadata that can be used to restore your collections in case of data loss, migration, or rollbacks (not available on free clusters)
+* **Configuration**: Check your collection defaults and add advanced optimizations (after reading Docs of course)
+  * For example, we advise against setting up a ton of different collections. Instead, segment with [payloads](https://qdrant.tech/documentation/concepts/payload/) (see the sketch below).
-Instead of modifying the initial query, a more scalable approach is to directly adjust the query vector.
-It is easily applicable across modalities and suitable for both lexical and neural retrieval.
+When viewing the details of your clusters, you can now view the Cluster UI Dashboard regardless of where you are, and also have easier access to tutorials and resources.
-Although vector search has become a trend in recent years, its core principles have existed in the field for decades. For example, the SMART retrieval system used by [Rocchio](https://sigir.org/files/museum/pub-08/XXIII-1.pdf) in 1965 for his relevance feedback experiments operated on bag-of-words vector representations of text.
+We are also making it more seamless to update your cluster to the latest version as well as see GitHub release notes.
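+Here is the kind of payload-based segmentation the Configuration tip above points to: one shared collection, a keyword field that marks the tenant, and a filter that scopes every query. The collection, field names, and vectors below are illustrative, and the collection is assumed to already exist.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR-API-KEY")
+
+# Index the segmentation field so filtered searches stay fast.
+client.create_payload_index(
+    collection_name="documents",
+    field_name="tenant_id",
+    field_schema=models.PayloadSchemaType.KEYWORD,
+)
+
+client.upsert(
+    collection_name="documents",
+    points=[
+        models.PointStruct(id=1, vector=[0.1, 0.9, 0.4, 0.2], payload={"tenant_id": "acme"}),
+        models.PointStruct(id=2, vector=[0.8, 0.1, 0.3, 0.9], payload={"tenant_id": "globex"}),
+    ],
+)
+
+# Every query stays scoped to a single tenant via a payload filter.
+results = client.query_points(
+    collection_name="documents",
+    query=[0.1, 0.8, 0.5, 0.3],
+    query_filter=models.Filter(
+        must=[models.FieldCondition(key="tenant_id", match=models.MatchValue(value="acme"))]
+    ),
+    limit=5,
+).points
+```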
+ +![JPG of changing version](/blog/product-ui-changes/change-version.png) -![Roccio’s Relevance Feedback Method](https://qdrant.tech/articles_data/search-feedback-loop/Roccio.png) +## Get Started Overview -Roccio’s Relevance Feedback Method +Next we’ve done a major overhaul to the “Get Started” page. Our goal is to make it as easy as possible for you to find the resources you need, whether it's guides, sample data, or tutorials. + +![Image of Get Started webpage](/blog/product-ui-changes/get-started-overview.jpg) -**Rocchio’s idea** — to update the query vector by adding a difference between the centroids of relevant and non-relevant documents — seems to translate well to modern dual encoders-based dense retrieval systems. -Researchers seem to agree: a study from 2022 demonstrated that the [parametrized version of Rocchio’s method](https://arxiv.org/pdf/2108.11044) in dense retrieval consistently improves Recall@1000 by 1–5%, while keeping query processing time suitable for production — around 170 ms. +**Explore Your Data or Start with Samples** +You’ll see immediately pertinent information to help you get the most out of Qdrant quickly, including the [Cloud Quickstart guide](https://qdrant.tech/documentation/quickstart-cloud/), and resources to help you get your data into Qdrant, or use sample data. -However, parameters (centroids and query weights) in the dense retrieval version of Roccio’s method must be tuned for each dataset and, ideally, also for each request. +Learn about the different ways to connect to your cluster, use the Qdrant API, try out sample data, and our personal favorite, use the Qdrant Cluster UI to view your collection data and access tutorials. -#### [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#gradient-descent-based-methods) Gradient Descent-Based Methods +**Build World Class Applications** +If you are ready to build an app, but looking for ideas or the best place to start, we have our top three tutorials highlighted for you. -The efficient way of doing so on-the-fly remained an open question until the introduction of a **gradient-descent-based Roccio’s method generalization**: [`Test-Time Optimization of Query Representations (TOUR)`](https://arxiv.org/pdf/2205.12680). -TOUR adapts a query vector over multiple iterations of retrieval and reranking ( _retrieve → rerank → gradient descent step_), guided by a reranker’s relevance judgments. +Learn how to: +* [Build a hybrid search service](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/) with [FastEmbed](https://github.com/qdrant/fastembed) +* [Build a RAG app with DeepSeek](https://qdrant.tech/documentation/rag-deepseek/) for semantic query enrichment +* [Connect Qdrant with your data stack](https://qdrant.tech/documentation/data-management/) for seamless workflows -![An overview of TOUR iteratively optimizing initial query representation based on pseudo relevance feedback. Figure adapted from Sung et al., 2023, Optimizing Test-Time Query Representations for Dense Retrieval](https://qdrant.tech/articles_data/search-feedback-loop/TOUR.png) +**Pick a Deployment Model** -An overview of TOUR iteratively optimizing initial query representation based on pseudo relevance feedback. +If you are looking for freedom of choice, [enterprise-readiness](https://qdrant.tech/blog/enterprise-vector-search/), and scalability without [vendor lock-in](https://qdrant.tech/blog/are-you-vendor-locked/), look no further. Here you can learn about the different deployment options we offer. 
-Figure adapted from Sung et al., 2023, [Optimizing Test-Time Query Representations for Dense Retrieval](https://arxiv.org/pdf/2205.12680) +Whether you want a fully managed experience, complete infrastructure control, or something in between, Qdrant delivers. -The next iteration of gradient-based methods of query refinement – [`ReFit`](https://arxiv.org/abs/2305.11744) – proposed in 2024 a lighter, production-friendly alternative to TOUR, limiting _retrieve → rerank → gradient descent_ sequence to only one iteration. The retriever’s query vector is updated through matching (via [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)) retriever and cross-encoder’s similarity scores distribution over feedback documents. ReFit is model- and language-independent and stably improves Recall@100 metric on 2–3%. +**Support, Community, and Docs** -![An overview of ReFit, a gradient-based method for query refinement](https://qdrant.tech/articles_data/search-feedback-loop/refit.png) +We have robust documentation, as well as a global [community](https://discord.com/invite/qdrant) of users that share projects, advice, and help each other build. If you run into technical issues, our support team is happy to help troubleshoot. Here you can find what you need if you run into roadblocks when building. -An overview of ReFit, a gradient-based method for query refinement +## See for Yourself -Gradient descent-based methods seem like a production-viable option, an alternative to finetuning the retriever (distilling it from a reranker). -Indeed, it doesn’t require in-advance training and is compatible with any re-ranking models. +If you haven’t tried Qdrant Cloud yet, now is the time to get started. -However, a few limitations baked into these methods prevented a broader adoption in the industry. +[**Try now!**](https://cloud.qdrant.io/signup) -The gradient descent-based methods modify elements of the query vector as if it were model parameters; therefore, -they require a substantial amount of feedback documents to converge to a stable solution. +Want to share feedback? Email us at community@qdrant.com -On top of that, the gradient descent-based methods are sensitive to the choice of hyperparameters, leading to **query drift**, where the query may drift entirely away from the user’s intent. +<|page-314-lllmstxt|> +## From Manual Bottlenecks to Millisecond Matching: Connecting Africa’s Best Talent -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#similarity-scoring) Similarity Scoring +![Pariti slashes vetting time and boosted candidate placement success.](/blog/case-study-pariti/case-study-pariti-summary-dark.jpg) -![Incorporating Relevance Feedback in Similarity Scoring](https://qdrant.tech/articles_data/search-feedback-loop/similairty-scoring.png) +Pariti’s mission is bold: connect Africa’s best talent with the continent’s most-promising startups—fast. Its referral-driven marketplace lets anyone nominate a great candidate, but viral growth triggered an avalanche of data. A single job post now attracts more than 300 applicants within 72 hours, yet Pariti still promises clients an interview-ready shortlist inside those same five days. -Incorporating Relevance Feedback in Similarity Scoring +By 2023 the strain was obvious. Analysts spent four minutes vetting each rĂ©sumĂ© and frequently worked through 400+ candidate backlogs. As fatigue set in, strong profiles buried near the bottom went unseen. 
Meanwhile, roughly 70,000 historical candidates sat idle because there was no practical way to resurface them. Fill-rate plateaued at just 20 percent. -Another family of approaches is built around the idea of incorporating relevance feedback directly into the similarity scoring function. -It might be desirable in cases where we want to preserve the original query intent, but still adjust the similarity score based on relevance feedback. +### A Laptop Experiment Shows the Way -In **lexical retrieval**, this can be as simple as boosting documents that share more terms with those judged as relevant. +Data Scientist Chiara Stramaccioni built a quick Python script on her laptop: encode the text requirements of a new role, embed every candidate’s experience, compare vectors, and rank the results. Quality looked excellent, but each query took half a minute of local compute, and only Chiara could run it. The prototype proved feasibility, but it did not solve scale. -Its **neural search counterpart** is a [`k-nearest neighbors-based method`](https://aclanthology.org/2022.emnlp-main.614.pdf) that adjusts the query-document similarity score by adding the sum of similarities between the candidate document and all known relevant examples. -This technique yields a significant improvement, around 5.6 percentage points in NDCG@20, but it requires explicitly labelled (by users) feedback documents to be effective. +### Dropping Qdrant into Production -In experiments, the knn-based method is treated as a reranker. In all other papers, we also found that adjusting similarity scores based on relevance feedback is centred around [reranking](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/) – **training or finetuning rerankers to become relevance feedback-aware**. -Typically, experiments include cross-encoders, though [simple classifiers are also an option](https://arxiv.org/pdf/1904.08861). -These methods generally involve rescoring a broader set of documents retrieved during an initial search, guided by feedback from a smaller top-ranked subset. It is not a similarity matching function adjustment per se but rather a similarity scoring model adjustment. +Engineering Lead Elvis Moraa needed a production-grade vector database that could ship immediately and stay out of the team’s way. He chose Qdrant Cloud for three pragmatic reasons: -Methods typically fall into two categories: +1. Zero-ops deployment—a managed cluster spun up in minutes with regional hosting on Google Cloud, keeping latency low. -1. **Training rerankers offline** to ingest relevance feedback as an additional input at inference time, [as here](https://aclanthology.org/D18-1478.pdf) — again, attention-based models and lengthy inputs: a production-deadly combination. -2. **Finetuning rerankers** on relevance feedback from the first retrieval stage, [as BaumgĂ€rtner et al. did](https://aclanthology.org/2022.emnlp-main.614.pdf), finetuning bias parameters of a small cross-encoder per query on 2k, k={2, 4, 8} feedback documents. +2. An intuitive Python SDK—analysts could call it as easily as Pandas, without wrestling with Kafka or index-tuning ceremonies. -The biggest limitation here is that these reranker-based methods cannot retrieve relevant documents beyond those returned in the initial search, and using rerankers on thousands of documents in production is a no-go – it’s too expensive. 
-Ideally, to avoid that, a similarity scoring function updated with relevance feedback should be used directly in the second retrieval iteration. However, in every research paper we’ve come across, retrieval systems are **treated as black boxes** — ingesting queries, returning results, and offering no built-in mechanism to modify scoring. +3. Clear documentation to move from “Hello, vectors” to a live integration in a single afternoon. -## [Anchor](https://qdrant.tech/articles/search-feedback-loop/\#so-what-are-the-takeaways) So, what are the takeaways? +Pariti ingested the entire 70,000-candidate corpus, and a lightweight back-end now creates embeddings the moment new data arrives. Queries travel over HTTP and come back in between 22 and 40 milliseconds with 0 percent downtime since launch. -Pseudo Relevance Feedback (PRF) is known to improve the effectiveness of lexical retrievers. Several PRF-based approaches – mainly query terms expansion-based – are successfully integrated into traditional retrieval systems. At the same time, there are **no known industry-adopted analogues in neural (vector) search dedicated solutions**; neural search-compatible methods remain stuck in research papers. +### What the Workflow Looks Like Today -The gap we noticed while studying the field is that researchers have **no direct access to retrieval systems**, forcing them to design wrappers around the black-box-like retrieval oracles. This is sufficient for query-adjusting methods but not for similarity scoring function adjustment. +Hiring analysts open an internal web app, paste or tweak the role description, and adjust sliders that weight features such as industry expertise or publication history. The app hits Qdrant, which returns a ranked shortlist instantly. Because the vectors already sit in memory, each retrieval saves an average of 34 seconds compared with the old monthly-refreshed pickle file. -Perhaps relevance feedback methods haven’t made it into the neural search systems for trivial reasons — like no one having the time to find the right balance between cost and efficiency. +Every recommendation still gets a human glance, but that glance is now brief: vetting time has fallen from four minutes to one minute per candidate—a 70 percent reduction. When a new vacancy closely resembles past roles, the analysts skip manual sourcing almost entirely. They refine the filters, review the top suggestions, and send them onward. For these “database finds”, 24 percent of candidates make it to interviews, quadruple the 6 percent success rate of traditional channels. -Getting it to work in a production setting means experimenting, building interfaces, and adapting architectures. Simply put, it needs to look worth it. And unlike 2D vector math, high-dimensional vector spaces are anything but intuitive. The curse of dimensionality is real. So is query drift. Even methods that make perfect sense on paper might not work in practice. +Accuracy remains the north star. Pariti defines “true high performers” as applicants who ultimately receive an offer from a client. Over the past quarter 94 percent of those winners were already sitting in the application’s top decile, giving hiring managers near-perfect confidence that they are seeing the best talent first. -A real-world solution should be simple. Maybe just a little bit smarter than a rule-based approach, but still practical. It shouldn’t require fine-tuning thousands of parameters or feeding paragraphs of text into transformers. 
**And for it to be effective, it needs to be integrated directly into the retrieval system itself.** +### Better search results \= more hires -##### Was this page useful? +* Fill-rate soared from 20 percent to 48 percent, and Pariti now averages eight successful placements every month, sustaining a vacancy-fill rate comfortably above 40 percent. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +* Analysts reclaimed entire workdays each week; instead of drowning in rĂ©sumĂ© triage, they spend time coaching clients and candidates. -Thank you for your feedback! 🙏 +* The platform handles 100-plus searches per day without breaking a sweat, and has logged zero unplanned outages since migrating to Qdrant. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/search-feedback-loop.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +“Qdrant is the last thing I worry about breaking.” — Elvis Moraa, Engineering Lead, Pariti -On this page: +### The Road Ahead -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/search-feedback-loop.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The team is productising the tool as a customer-facing portal. Hiring managers will tune ranking sliders themselves and watch shortlists refresh in real time. To meet the coming spike in traffic, Pariti is evaluating GPU-assisted indexing and vector quantization—features already built into Qdrant—while keeping costs in line with the challenging realities of many African startup budgets. -× +What began as an after–hours experiment on a single laptop has become the backbone of a talent marketplace that moves at startup speed. With Qdrant handling the heavy lifting in 22 milliseconds, Pariti can focus on its real job: unlocking opportunity. -[Powered by](https://qdrant.tech/) +<|page-315-lllmstxt|> +## Inside Dust’s Vector Stack Overhaul: Scaling to 5,000+ Data Sources with Qdrant -<|page-165-lllmstxt|> -## advanced-tutorials -- [Documentation](https://qdrant.tech/documentation/) -- Advanced Retrieval +![How Dust Scaled to 5,000+ Data Sources with Qdrant](/blog/case-study-dust-v2/case-study-dust-v2-v2-bento-dark.jpg) -# [Anchor](https://qdrant.tech/documentation/advanced-tutorials/\#advanced-tutorials) Advanced Tutorials +### The Challenge: Scaling AI Infrastructure for Thousands of Data Sources -| | -| --- | -| [Use Collaborative Filtering to Build a Movie Recommendation System with Qdrant](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/) | -| [Build a Text/Image Multimodal Search System with Qdrant and FastEmbed](https://qdrant.tech/documentation/advanced-tutorials/multimodal-search-fastembed/) | -| [Navigate Your Codebase with Semantic Search and Qdrant](https://qdrant.tech/documentation/advanced-tutorials/code-search/) | -| [Ensure optimal large-scale PDF Retrieval with Qdrant and ColPali/ColQwen](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/) | +Dust, an OS for AI-native companies enabling users to build AI agents powered by actions and company knowledge, faced a set of growing technical hurdles as it scaled its operations. 
The company's core product enables users to give AI agents secure access to internal and external data resources, enabling enhanced workflows and faster access to information. However, this mission hit bottlenecks when their infrastructure began to strain under the weight of thousands of data sources and increasingly demanding user queries. -##### Was this page useful? +Initially, Dust employed a strategy of creating a separate vector collection per data source, which rapidly became unsustainable. As the number of data sources ballooned beyond 5,000, the platform began experiencing significant performance degradation. RAM consumption skyrocketed, and vector search performance slowed dramatically, especially as the memory-mapped vectors spilled onto disk storage. At one point, they were managing nearly a thousand collections simultaneously and processing over a million vector upsert and delete operations in a single cycle. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +### Evaluation and Selection: Why Dust Chose Qdrant -Thank you for your feedback! 🙏 +The Dust team explored several popular vector databases. While each had merits, none met all of Dust’s increasingly complex needs. Some providers’ developer experience didn’t align with their workflows, and others lacked the deployment flexibility required. Dust needed a solution capable of handling multi-tenancy at scale, embedding model flexibility, efficient memory usage, and deep configurability. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Qdrant stood out thanks to its open-source Rust foundation, giving Dust the control they needed over memory, performance, and customization. Its intuitive API and strong developer community also made the integration experience more seamless. Critically, Qdrant’s design allowed Dust to consolidate their fragmented architecture—replacing thousands of individual collections with a few shared, multi-tenant ones powered by robust sharding and payload filtering. -On this page: +### Implementation Highlights: Advanced Architecture with Qdrant -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/advanced-tutorials/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +One of the most impactful features Dust adopted was scalar quantization. This reduced vector storage size by a factor of four, enabling the team to keep data in memory rather than falling back to slower disk storage. This shift alone led to dramatic latency improvements. Where queries in large collections once took 5 to 10 seconds, they now returned in under a second. Even in collections with over a million vectors and heavy payloads, search responses consistently clocked in well below the one-second mark. -× +Dust also built a custom `DustQdrantClient` to manage all vector-related operations. This client abstracted away differences between cluster versions, embedding models, and sharding logic, simplifying ongoing development. Their infrastructure runs in Google Cloud Platform, with Qdrant deployed in isolated VPCs that communicate with Dust's core APIs using secure authentication. 
The architecture is replicated across two major regions—US and EU—ensuring both high availability and compliance with data residency laws. -[Powered by](https://qdrant.tech/) +### Results: Faster Performance, Lower Costs, Better User Experience -<|page-166-lllmstxt|> -## agentic-rag-camelai-discord -- [Documentation](https://qdrant.tech/documentation/) -- Agentic RAG Discord Bot with CAMEL-AI +The impact of Qdrant was felt immediately. Search latency was slashed from multi-second averages to sub-second responsiveness. Collections that once consumed over 30 GB of RAM were optimized to run efficiently at a quarter of that size. The shift to in-memory quantized vectors, while keeping original vectors on disk for fallback, proved to be the perfect hybrid model for balancing performance and resource usage. -![agentic-rag-camelai-astronaut](https://qdrant.tech/documentation/examples/agentic-rag-camelai-discord/astronaut-main.png) +These backend improvements directly translated into user-facing gains. Dust’s AI agents became more responsive and reliable. Even as customers loaded larger and more complex datasets, the system continued to deliver consistent performance. The platform’s ability to scale without degrading UX marked a turning point, empowering Dust to expand its customer base with confidence. -# [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#agentic-rag-discord-chatbot-with-qdrant-camel-ai--openai) Agentic RAG Discord ChatBot with Qdrant, CAMEL-AI, & OpenAI +The move to a multi-embedding-model architecture also paid dividends. By grouping data sources by embedder, Dust enabled smoother migrations and more efficient model experimentation. Qdrant’s flexibility let them evolve their architecture without reindexing massive datasets or disrupting end-user functionality. -| Time: 45 min | Level: Intermediate | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Ymqzm6ySoyVOekY7fteQBCFCXYiYyHxw#scrollTo=QQZXwzqmNfaS) | | -| --- | --- | --- | --- | +### Lessons Learned and Roadmap -Unlike traditional RAG techniques, which passively retrieve context and generate responses, **agentic RAG** involves active decision-making and multi-step reasoning by the chatbot. Instead of just fetching data, the chatbot makes decisions, dynamically interacts with various data sources, and adapts based on context, giving it a much more dynamic and intelligent approach. +As they scaled, Dust uncovered a critical insight: users tend to ask more structured, analytical questions when they know a database is involved—queries better suited to SQL than vector search. This prompted the team to pair Qdrant with a text-to-SQL system, blending unstructured and structured query capabilities for a more versatile agent. -In this tutorial, we’ll develop a fully functional chatbot using Qdrant, [CAMEL-AI](https://www.camel-ai.org/), and [OpenAI](https://openai.com/). +Looking forward, Qdrant remains a foundational pillar of Dust’s product roadmap. They’re building multi-region sharding for more granular data residency, scaling their clusters both vertically and horizontally, and supporting newer embedding models from providers like OpenAI and Mistral. Future collections will be organized by embedder, with tenant-aware sharding and index optimizations tailored to each use case. -Let’s get started! 
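+For readers who want to see what a setup like the one described above looks like in code, this is roughly how a collection with 8-bit scalar quantization kept in RAM and full-precision originals on disk is declared with the Python client. The collection name and vector size are illustrative, not Dust's actual configuration.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="https://YOUR-CLUSTER-URL:6333", api_key="YOUR-API-KEY")
+
+client.create_collection(
+    collection_name="chunks",
+    vectors_config=models.VectorParams(
+        size=1536,
+        distance=models.Distance.COSINE,
+        on_disk=True,  # full-precision vectors stay on disk as a fallback
+    ),
+    quantization_config=models.ScalarQuantization(
+        scalar=models.ScalarQuantizationConfig(
+            type=models.ScalarType.INT8,  # roughly 4x smaller than float32
+            always_ram=True,              # quantized vectors are served from RAM
+        ),
+    ),
+)
+```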
+### A new tier of performance, scalability, and architectural flexibility -* * * +By adopting Qdrant, Dust unlocked a new tier of performance, scalability, and architectural flexibility. Their platform is now equipped to support millions of vectors, operate efficiently across regions, and deliver low-latency search, even at enterprise scale. For teams building sophisticated AI agents, Qdrant provides not just a vector database—but the infrastructure backbone to grow with confidence. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#workflow-overview) Workflow Overview +<|page-316-lllmstxt|> +## How SayOne Enhanced Government AI Services with Qdrant -Below is a high-level look at our Agentic RAG workflow: -| Step | Description | -| --- | --- | -| **1\. Environment Setup** | Install required libraries ( `camel-ai`, `qdrant-client`, `discord.py`) and set up the Python environment. | -| **2\. Set Up the OpenAI Embedding Instance** | Create an OpenAI account, generate an API key, and configure the embedding model. | -| **3\. Configure the Qdrant Client** | Sign up for Qdrant Cloud, create a cluster, configure `QdrantStorage`, and set up the API connection. | -| **4\. Scrape and Process Data** | Use `VectorRetriever` to scrape Qdrant documentation, chunk text, and store embeddings in Qdrant. | -| **5\. Set Up the CAMEL-AI ChatAgent** | Instantiate a CAMEL-AI `ChatAgent` with OpenAI models for multi-step reasoning and context-aware responses. | -| **6\. Create and Configure the Discord Bot** | Register a new bot in the Discord Developer Portal, invite it to a server, and enable permissions. | -| **7\. Build the Discord Bot** | Integrate Discord.py with CAMEL-AI and Qdrant to retrieve context and generate intelligent responses. | -| **8\. Test the Bot** | Run the bot in a live Discord server and verify that it provides relevant, context-rich answers. | +![SayOne Enhanced Government AI Services](/blog/case-study-sayone/case-study-sayone-summary-dark.jpg) -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#architecture-diagram) Architecture Diagram -Below is the architecture diagram representing the workflow and interactions of the chatbot: +### The Challenge -![Architecture Diagram](https://qdrant.tech/documentation/examples/agentic-rag-camelai-discord/diagram_discord_bot.png) -The workflow starts by **scraping, chunking, and upserting** content from URLs using the `vector_retriever.process()` method, which generates embeddings with the **OpenAI embedding instance**. These embeddings, along with their metadata, are then indexed and stored in **Qdrant** via the `QdrantStorage` class. +SayOne is an information technology and digital services company headquartered in India. They create end-to-end customized digital solutions, and have completed over 200 projects for clients worldwide. When SayOne embarked on building advanced AI solutions for government institutions, their initial choice was Pinecone, primarily due to its prevalence within AI documentation. However, SayOne soon discovered significant limitations impacting their projects. Key challenges included escalating costs, restrictive customization options, and considerable scalability issues. Furthermore, reliance on external cloud infrastructure posed critical data privacy concerns, especially since governmental entities demanded stringent data sovereignty and privacy controls. 
-When a user sends a query through the **Discord bot**, it is processed by `vector_retriever.query()`, which first embeds the query using **OpenAI Embeddings** and then retrieves the most relevant matches from Qdrant via `QdrantStorage`. The retrieved context (e.g., relevant documentation snippets) is then passed to an **OpenAI-powered Qdrant Agent** under **CAMEL-AI**, which generates a final, context-aware response. -The Qdrant Agent processes the retrieved vectors using the `GPT_4O_MINI` language model, producing a response that is contextually relevant to the user’s query. This response is then sent back to the user through the **Discord bot**, completing the flow. +### Evaluation Process -* * * -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-1-environment-setup)**Step 1: Environment Setup** +Recognizing the importance of overcoming these hurdles, SayOne initiated a thorough evaluation of alternative vector database solutions. The evaluation encompassed Qdrant, Milvus, and Weaviate, with specific attention to cost-efficiency, latency, ease of deployment, scalability, and overall developer experience. -Before diving into the implementation, here’s a high-level overview of the stack we’ll use: -| **Component** | **Purpose** | -| --- | --- | -| **Qdrant** | Vector database for storing and querying document embeddings. | -| **OpenAI** | Embedding and language model for generating vector representations and chatbot responses. | -| **CAMEL-AI** | Framework for managing dialogue flow, retrieval, and AI agent interactions. | -| **Discord API** | Platform for deploying and interacting with the chatbot. | +During the rigorous testing phase, Qdrant clearly distinguished itself with superior performance metrics, especially in terms of latency—a critical factor for responsive AI-driven applications. SayOne conducted extensive load testing using real-world scenarios, mimicking production environments, particularly for government clients planning to deploy voice-interactive systems. The previous solution, Pinecone, exhibited high latency largely due to regional cloud server discrepancies. In contrast, Qdrant's ability to host deployments closer to user locations proved pivotal in achieving significantly reduced latency, enhancing overall responsiveness and reliability. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#install-dependencies) Install Dependencies -We’ll install CAMEL-AI, which includes all necessary dependencies: +### Why Qdrant Won -```python -!pip install camel-ai[all]==0.2.17 -``` +Another standout advantage was Qdrant's streamlined and intuitive deployment process. Developers at SayOne highlighted that other evaluated solutions presented substantial deployment complexities, requiring extensive DevOps resources and intricate configurations. Qdrant, however, provided a simplified yet powerful infrastructure, allowing teams to rapidly deploy and scale solutions with minimal friction. The intuitive UI and comprehensive developer tooling offered by Qdrant facilitated easier debugging, enhanced productivity, and a far superior developer experience compared to other platforms. -* * * -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-2-set-up-the-openai-embedding-instance)**Step 2: Set Up the OpenAI Embedding Instance** +The versatility of Qdrant was also critically important. 
SayOne leveraged Qdrant's robust hybrid search capabilities and advanced payload filtering extensively, significantly improving both the accuracy and speed of search results when integrated with sophisticated re-ranking models like Cohere. These features were instrumental in maintaining high levels of accuracy, crucial for public sector applications where precision and reliability are paramount. -1. **Create an OpenAI Account**: Go to [OpenAI](https://platform.openai.com/signup) and sign up for an account if you don’t already have one. -2. **Generate an API Key**: +### Real-World Deployments - - After logging in, click on your profile icon in the top-right corner and select **API keys**. - - Click **Create new secret key**. +Following the successful deployment of an initial project serving a government, Qdrant quickly became the default vector database solution across various governmental initiatives globally. Notably, clients from Southeast Asia and the Middle East explicitly requested Qdrant, underscoring its strong reputation for data privacy, scalability, and flexibility in supporting both cloud-based and fully on-premise environments. - - Copy the generated API key and store it securely. You won’t be able to see it again. -Here’s how to set up the OpenAI client in your code: +### Results & Impact -Create a `.env` file in your project directory and add your API key: -```bash -OPENAI_API_KEY= +The impact of adopting Qdrant has been transformative for SayOne. Government clients have benefited from drastically improved latency, robust data sovereignty compliance, and accelerated development cycles. SayOne’s engineers report enhanced productivity, citing Qdrant’s straightforward infrastructure and proactive community support as critical factors in project success. -``` -Make sure to replace `` with your actual API key. +Overall, Qdrant has empowered SayOne to deliver secure, scalable, and efficient AI-driven solutions tailored explicitly to the stringent requirements of government institutions. By selecting Qdrant, SayOne positioned itself as a reliable technology partner capable of navigating complex AI deployments and satisfying the rigorous demands of global public sector projects. -Now, start the OpenAI Client +<|page-317-lllmstxt|> +## More Than Just Multimodal Search? +AI has transformed how we find products, services, and content. Now users express needs in **natural language** and expect precise, tailored results. -```python -import openai -import os -from dotenv import load_dotenv +For example, you might search for hotels in Paris with specific criteria: -load_dotenv() +![superlinked-search](/blog/superlinked-multimodal-search/superlinked-search.png) -openai_client = openai.Client( - api_key=os.getenv("OPENAI_API_KEY") -) +*"Affordable luxury hotels near Eiffel Tower with lots of good reviews and free parking."* This isn't just a search query—it's a complex set of interrelated preferences spanning multiple data types. -``` +> In this blog, we'll show you how we built [**The Hotel Search Demo**](https://hotel-search-recipe.superlinked.io/). -To set up the embedding instance, we will use text embedding 3 large: +**Figure 1:** Superlinked generates vectors of different modalities which are indexed and served by Qdrant for fast, accurate hotel search. 
+![superlinked-hotel-search](/blog/superlinked-multimodal-search/frontend.gif) -```python -from camel.embeddings import OpenAIEmbedding -from camel.types import EmbeddingModelType +What makes this app particularly powerful is how it breaks down your natural language query into precise parameters. As you type your question at the top, you can observe the query parameters dynamically update in the left sidebar. -embedding_instance = OpenAIEmbedding(model_type=EmbeddingModelType.TEXT_EMBEDDING_3_LARGE) +In this blog, we'll show you how Qdrant and Superlinked combine **textual understanding**, **numerical reasoning**, and **categorical filtering** to create a seamless search experience that meets modern user expectations. -``` -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-3-configure-the-qdrant-client)**Step 3: Configure the Qdrant Client** +## Core Components -For this tutorial, we will be using the **Qdrant Cloud Free Tier**. Here’s how to set it up: +**Figure 2:** In a typical search or RAG app, the embedding framework (Superlinked) combines your data and its metadata into vectors. They are ingested into a Qdrant collection and indexed. -1. **Create an Account**: Sign up for a Qdrant Cloud account at [Qdrant Cloud](https://cloud.qdrant.io/). +![superlinked-architecture](/blog/superlinked-multimodal-search/superlinked-architecture.png) -2. **Create a Cluster**: +Superlinked makes search smarter by embedding data into specialized "spaces" designed for each type of attribute, rather than using a single embedding method for everything. - - Navigate to the **Overview** section. - - Follow the onboarding instructions under **Create First Cluster** to set up your cluster. - - When you create the cluster, you will receive an **API Key**. Copy and securely store it, as you will need it later. -3. **Wait for the Cluster to Provision**: +When a user queries **"Affordable luxury hotels near Eiffel Tower with lots of good reviews and free parking"**, Superlinked uses an LLM to do natural query understanding and set weights. These weights determine: +- Preference direction (negative for lower values, positive for higher values). +- Preference strength (higher numbers have stronger influence). +- Balance between different attributes (e.g., price_weight: -1.0 and rating_weight: 1.0 are balanced). - - Your new cluster will appear under the **Clusters** section. +This flexibility with weights allows users to rapidly iterate, experiment, and implement business logic or context much faster than rebuilding entire search systems from scratch. Superlinked then applies mandatory hard filters to narrow results, then ranks them using weighted nearest neighbors search, providing nuanced, accurate results tailored to user preferences. All vectors are stored in Qdrant. -After obtaining your Qdrant Cloud details, add to your `.env` file: +**SuperLinked Framework Setup:** Once you [**setup the Superlinked server**](https://github.com/superlinked/superlinked-recipes/tree/main/projects/hotel-search), most of the prototype work is done right out of the [**sample notebook**](https://github.com/superlinked/superlinked-recipes/blob/main/projects/hotel-search/notebooks/superlinked-queries.ipynb). Once ready, you can host from a GitHub repository and deploy via Actions. -```bash -QDRANT_CLOUD_URL= -QDRANT_CLOUD_API_KEY= +**Qdrant Vector Database:** The easiest way to store vectors is to [**create a free Qdrant Cloud cluster**](https://cloud.qdrant.io/login). 
We have simple docs that show you how to [**grab the API key**](/documentation/quickstart-cloud/) and upsert your new vectors and run some basic searches. For this demo, we have deployed a live Qdrant Cloud cluster. -``` +**OpenAI API Key:** For natural language queries and generating the weights you will need an OpenAI API key -### [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#configure-the-qdrantstorage) Configure the QdrantStorage +### 1. Vector Spaces: The Building Blocks of Intelligent Search +![superlinked-hotel-1](/blog/superlinked-multimodal-search/superlinked-hotel-1.jpg) -The `QdrantStorage` will deal with connecting with the Qdrant Client for all necessary operations to your collection. +At the heart of Superlinked's innovation are [**Spaces**](https://docs.superlinked.com/concepts/overview) - specialized vector embedding environments designed for different data types. Unlike conventional approaches that force all data into a single embedding format, these spaces respect the inherent characteristics of different data types. + +In our demo, four distinct spaces work together: **Description**, **Rating**, **Price** and **Rating Count**. Here is how they are defined: ```python -from camel.retrievers import VectorRetriever +# Text data is embedded using a specialized language model +description_space = sl.TextSimilaritySpace( + text=hotel_schema.description, + model=settings.text_embedder_name # all-mpnet-base-v2 +) -# Define collection name -collection_name = "qdrant-agent" +# Numerical data uses dedicated numerical embeddings with appropriate scaling +rating_space = sl.NumberSpace( + hotel_schema.rating, + min_value=0, + max_value=10, + mode=sl.Mode.MAXIMUM # Linear scale for bounded ratings +) -storage_instance = QdrantStorage( - vector_dim=embedding_instance.get_output_dim(), - url_and_api_key=( - qdrant_cloud_url, - qdrant_api_key, - ), - collection_name=collection_name, +price_space = sl.NumberSpace( + hotel_schema.price, + min_value=0, + max_value=1000, + mode=sl.Mode.MAXIMUM, + scale=sl.LogarithmicScale() # Log scale for prices that vary widely ) +rating_count_space = sl.NumberSpace( + hotel_schema.rating_count, + min_value=0, + max_value=22500, + mode=sl.Mode.MAXIMUM, + scale=sl.LogarithmicScale() # Log scale for wide-ranging review counts +) ``` -Make sure to update the `` and `` fields. +What makes this powerful is that each space properly preserves the semantic relationships within its domain - all while allowing these different spaces to be combined into a cohesive search experience. -* * * +**Prices** are embedded to maintain their proportional relationships, **Text** embeddings capture semantic meanings, **Ratings** preserve their relative quality indicators, and the **Ratings Count** uses logarithmic scaling to properly weight the significance of review volume. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-4-scrape-and-process-data)**Step 4: Scrape and Process Data** +### 2. Beyond Multimodal Vector Search: The Full Picture -We’ll use CamelAI `VectorRetriever` library to help us to It processes content from a file or URL, divides it into chunks, and stores the embeddings in the specified Qdrant collection. +Both **Qdrant and Superlinked** support a rich multimodal search environment where different data types collaborate rather than compete. For our hotel demo, this means: + +- **Text descriptions** are embedded using state-of-the-art language models that understand semantics. 
+- **Prices use** logarithmic scaling to properly handle a wide ranges of values. +- **Ratings** are embedded linearly to preserve their quality indicators. +- **Review counts** use logarithmic scaling to account for the diminishing returns of additional reviews. + +Unlike approaches that stringify all data into text before embedding (resulting in unpredictable non-monotonic relationships between numbers), or systems that maintain separate indices for different attributes, Superlinked creates a unified search space where multiple attributes can be considered simultaneously with appropriate semantic relationships preserved. + +The declaration of this unified index is remarkably straightforward: ```python -from camel.retrievers import VectorRetriever +index = sl.Index( + spaces=[ + description_space, + price_space, + rating_space, + rating_count_space, + ], + # Additional fields for hard filtering + fields=[hotel_schema.city, hotel_schema.amenities, ...] +) +``` +If you want to have a deeper understanding of the algorithm and how multi vector embeddings work, you can have a read in-depth in our [article.](https://links.superlinked.com/multi_attribute_search_qd) -vector_retriever = VectorRetriever(embedding_model=embedding_instance, - storage=storage_instance) +### 3. Intelligent Query Processing: From Natural Language to Results -qdrant_urls = [\ - "https://qdrant.tech/documentation/overview",\ - "https://qdrant.tech/documentation/guides/installation",\ - "https://qdrant.tech/documentation/concepts/filtering",\ - "https://qdrant.tech/documentation/concepts/indexing",\ - "https://qdrant.tech/documentation/guides/distributed_deployment",\ - "https://qdrant.tech/documentation/guides/quantization"\ - # Add more URLs as needed\ -] +The query processing system in Superlinked simplifies the way search queries are built and executed. This system allows users to interact using natural language, which is then converted into multi-dimensional vector operations, thereby moving away from rigid query structures. -for qdrant_url in qdrant_urls: - vector_retriever.process( - content=qdrant_url, - ) +The query construction in the hotel demo demonstrates this power: +```python +query = ( + sl.Query( + index, + weights={ + price_space: sl.Param("price_weight", description=price_description), + rating_space: sl.Param("rating_weight", description=rating_description), + # Additional space weights... + }, + ) + .find(hotel_schema) + .similar(description_space.text, sl.Param("description")) + .filter(hotel_schema.city.in_(sl.Param("city"))) + # Additional filters... + .with_natural_query(natural_query=sl.Param("natural_query")) +) ``` +#### Breaking Down the Query -* * * +![superlinked-query](/blog/superlinked-multimodal-search/superlinked-query.svg) -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-5-setup-the-camel-ai-chatagent-instance)**Step 5: Setup the CAMEL-AI ChatAgent Instance** +This setup enables queries like *"Affordable luxury hotels near Eiffel Tower with lots of good reviews and free parking."* to be automatically translated into: -Define the OpenAI model and create a CAMEL-AI ChatAgent instance. 
+- A text similarity search for **"luxury"** and **"Eiffel Tower"** concepts +- Appropriate weighting for **"affordable" price** (lower range) +- Hard filtering for **"free parking"** as an amenity +- Search for **"lots" (rating count) + good reviews (rating)** -```python -from camel.configs import ChatGPTConfig -from camel.models import ModelFactory -from camel.types import ModelPlatformType, ModelType -from camel.agents import ChatAgent +Unlike systems that rely on reranking after retrieval (which can miss relevant results if the initial retrieval is too restrictive) or metadata filters (which convert fuzzy preferences like "affordable" to rigid boundaries), this approach maintains the nuance of the search throughout the entire process. -# Create a ChatGPT configuration -config = ChatGPTConfig(temperature=0.2).as_dict() +### 4. Hybrid Search Reimagined: Solving the Modern Search Problem -# Create an OpenAI model using the configuration -openai_model = ModelFactory.create( - model_platform=ModelPlatformType.OPENAI, - model_type=ModelType.GPT_4O_MINI, - model_config_dict=config, -) +Today's search landscape is dominated by discussions of hybrid search - the combination of keyword matching for precision with vector search for semantic understanding. The hotel search demo takes this concept further by implementing a multimodal hybrid search method that spans not just text retrieval methods but entire data domains. -assistant_sys_msg = """You are a helpful assistant to answer question, - I will give you the Original Query and Retrieved Context, - answer the Original Query based on the Retrieved Context, - if you can't answer the question just say I don't know.""" +In the hotel search demo, we see hybrid search reimagined across multiple dimensions: +- Text hybrid search: Combining exact matching (for city names, amenity keywords) with semantic similarity (for concepts like "luxury" or "family-friendly") +- Numerical hybrid search: Blending exact range filters (minimum/maximum price) with preference-based vector similarity (for concepts like "affordable" or "high-rated") +- Categorical hybrid search: Integrating hard categorical constraints (must be in Paris) with soft preferences (prefer hotels with specific amenities) -qdrant_agent = ChatAgent(system_message=assistant_sys_msg, model=openai_model) +This multi-dimensional hybrid approach solves challenges facing conventional search systems: -``` +1. Single-modal vector search fails when queries span multiple data types +2. Traditional hybrid search still separates keyword and vector components, which means they have to be weighed appropriately +3. Separate storage per attribute forces complex result reconciliation that loses semantic nuance +4. Pure filtering approaches convert preferences into binary decisions, missing the "strength" of preference +5. Re-ranking strategies may lead to weaker initial retrieval, especially with broad queries -* * * +This unified approach maintains the semantic relationships of all attributes in a multi-dimensional search space, where preferences become weights rather than filters, and where hard constraints and soft preferences seamlessly coexist in the same query. 
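To make the "preferences become weights" idea concrete, here is a small, self-contained Python sketch. It is purely illustrative (not how Superlinked computes scores internally), and every number and weight in it is made up; it only shows how per-space similarities and query weights can combine into a single ranking score instead of acting as hard filters:

```python
from dataclasses import dataclass


@dataclass
class SpaceScores:
    """Illustrative per-space similarities for one hotel, each scaled to 0..1."""
    description: float   # semantic similarity of the description text
    price: float         # closeness to the preferred ("affordable") price range
    rating: float        # match against the desired rating level
    rating_count: float  # log-scaled review-volume signal


def combined_score(s: SpaceScores, weights: dict[str, float]) -> float:
    # Preferences act as weights on each space, not as binary filters.
    return (
        weights["description"] * s.description
        + weights["price"] * s.price
        + weights["rating"] * s.rating
        + weights["rating_count"] * s.rating_count
    )


# A query like "affordable luxury hotel with lots of good reviews" might be
# translated into weights along these lines (numbers are invented):
weights = {"description": 1.0, "price": 0.8, "rating": 0.6, "rating_count": 0.4}

hotel_a = SpaceScores(description=0.82, price=0.90, rating=0.55, rating_count=0.70)
hotel_b = SpaceScores(description=0.88, price=0.30, rating=0.75, rating_count=0.95)

print(combined_score(hotel_a, weights))  # ~2.15 - affordability lifts hotel A
print(combined_score(hotel_b, weights))  # ~1.95 - better description, but pricey
```

Raising or lowering a single weight shifts the ranking smoothly, which is exactly what a hard metadata filter cannot do.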
-## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-6-create-and-configure-the-discord-bot)**Step 6: Create and Configure the Discord Bot** +The result is a search experience that feels intuitive and "just works" - whether users are looking for "pet-friendly boutique hotels with good reviews near the city center" or "affordable family suites with pool access in resort areas" - because the system understands both the semantics and the relationships between different attributes of what users are asking for. -Now let’s bring the bot to life! It will serve as the interface through which users can interact with the agentic RAG system you’ve built. +The hotel search demo showcases this vision in action, a glimpse into a future where search understands not just the words we use, but the complex, nuanced preferences they represent. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#create-a-new-discord-bot) Create a New Discord Bot +## How to Build the App -1. Go to the [Discord Developer Portal](https://discord.com/developers/applications) and log in with your Discord account. +For more details, [check out the repository](https://github.com/superlinked/hotel-search-recipe-qdrant). -2. Click on the **New Application** button. +Otherwise, you can clone the app: -3. Give your application a name and click **Create**. +```shell +git clone https://github.com/superlinked/hotel-search-recipe-qdrant.git +``` -4. Navigate to the **Bot** tab on the left sidebar and click **Add Bot**. +The backend is located under `superlinked_app`, while the frontend has to be built from `frontend_app`. -5. Once the bot is created, click **Reset Token** under the **Token** section to generate a new bot token. Copy this token securely as you will need it later. +### Deploy the Backend +Use `superlinked_app/.env-example` as a template, create `superlinked_app/.env` and set `OPENAI_API_KEY` required for Natural Query Interface, `QDRANT_URL` and `QDRANT_API_KEY` required for Qdrant Vector Database. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#invite-the-bot-to-your-server) Invite the Bot to Your Server +```shell +python3.11 -m venv .venv +. .venv/bin/activate +pip install -r requirements.txt +APP_MODULE_PATH=superlinked_app python -m superlinked.server +``` -1. Go to the **OAuth2** tab and then to the **URL Generator** section. +It will take some time (depending on the network) to download the sentence-transformers model for the very first time. -2. Under **Scopes**, select **bot**. +API docs will be available at [localhost:8080/docs](http://localhost:8080/docs). -3. Under **Bot Permissions**, select the necessary permissions: +To ingest the dataset, run this command in your terminal: +```shell +curl -X 'POST' \ + 'http://localhost:8080/data-loader/hotel/run' \ + -H 'accept: application/json' \ + -d '' +``` +Please wait until the ingestion is finished. You will see the message. - - Send Messages +#### Inspecting Collections in Qdrant Cloud Dashboard - - Read Message History -4. Copy the generated URL and paste it into your browser. +Once your Superlinked vectors are ingested, log in to the Qdrant Cloud dashboard to navigate to **Collections** and select your `default` hotel collection. -5. Select the server where you want to invite the bot and click **Authorize**. 
+![default-collection](/blog/superlinked-multimodal-search/default-collection.png) +You can browse individual points under the **Data** tab to view payload metadata (price, rating, amenities) alongside their raw vector embeddings. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#grant-the-bot-permissions) Grant the Bot Permissions +![collection-information](/blog/superlinked-multimodal-search/collection-information.png) -1. Go back to the **Bot** tab. +In the **Collection Information** section, you can use the **Search** tab to apply metadata filters or search by vector. In the **Search Quality** section, you can also monitor performance metrics (throughput, latency). -2. Enable the following under **Privileged Gateway Intents**: +When scaling up your app, go back to **Qdrant Cloud Dashboard** to configure autoscaling, backups, and snapshots. These options will keep your service reliable and cost-efficient. - - Server Members Intent +### Build the Frontend - - Message Content Intent +```shell +cd frontend_app +python3.11 -m venv .venv-frontend +. .venv-frontend/bin/activate +pip install -e . +python -m streamlit run app/frontend/main.py +``` -Now, the bot is ready to be integrated with your code. +The Frontend UI will be available at [localhost:8501](http://localhost:8501). -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-7-build-the-discord-bot)**Step 7: Build the Discord Bot** +![superlinked-hotel-search](/blog/superlinked-multimodal-search/superlinked-hotel-search.png) -Add to your `.env` file: +#### Superlinked CLI -```bash -DISCORD_BOT_TOKEN= +> **Note:** If you need Superlinked for larger scale projects, you can use **Superlinked Cloud**. -``` +With 'superlinked cli', you will be able to run a Superlinked application at scale with components such as batch processing engine, logging and more. For more details contact the Superlinked team at: [superlinked.com](https://superlinked.typeform.com/to/LXMRzHWk?typeform-source=hotel-search-recipe). -We’ll use `discord.py` to create a simple Discord bot that interacts with users and retrieves context from Qdrant before responding. +## Materials -```python -from camel.bots import DiscordApp -import nest_asyncio -import discord +- [Superlinked GitHub Repository](https://github.com/superlinked/superlinked) +- [Superlinked Documentation](https://docs.superlinked.com) +- [Qdrant Vector Database](https://qdrant.tech) +- [Qdrant Documentation](https://qdrant.tech/documentation) +- [Qdrant Cloud](https://cloud.qdrant.io) +- [Qdrant Discord Community](https://discord.gg/qdrant) -nest_asyncio.apply() -discord_q_bot = DiscordApp(token=os.getenv("DISCORD_BOT_TOKEN")) +<|page-318-lllmstxt|> +[**Qdrant 1.14.0 is out!**](https://github.com/qdrant/qdrant/releases/tag/v1.14.0) Let's look at the main features for this version: -@discord_q_bot.client.event # triggers when a message is sent in the channel -async def on_message(message: discord.Message): - if message.author == discord_q_bot.client.user: - return +**Score-Boosting Reranker:** Blend vector similarity with custom rules and context.
+**Improved Resource Utilization:** CPU and disk IO optimization for faster processing.
- if message.type != discord.MessageType.default: - return +**Incremental HNSW Indexing:** Build indexes gradually as data arrives.
+**Batch Search:** Optimized parallel processing for batch queries.
- if message.author.bot: - return - user_input = message.content +**Memory Optimization:** Reduced usage for large datasets with improved ID tracking.
- retrieved_info = vector_retriever.query( - query=user_input, top_k=10, similarity_threshold=0.6 - ) +## Score-Boosting Reranker +![reranking](/blog/qdrant-1.14.x/reranking.jpg) - user_msg = str(retrieved_info) - assistant_response = qdrant_agent.step(user_msg) - response_content = assistant_response.msgs[0].content +When integrating vector search into specific applications, you can now tweak the final result list using domain or business logic. For example, if you are building a **chatbot or search on website content**, you can rank results with `title` metadata higher than `body_text` in your results. - if len(response_content) > 2000: # discord message length limit - for chunk in [response_content[i:i+2000] for i in range(0, len(response_content), 2000)]: - await message.channel.send(chunk) - else: - await message.channel.send(response_content) +In **e-commerce** you may want to boost products from a specific manufacturer—perhaps because you have a promotion or need to clear inventory. With this update, you can easily influence ranking using metadata like `brand` or `stock_status`. -discord_q_bot.run() +> The **Score-Boosting Reranker** allows you to combine vector-based similarity with **business or domain-specific logic** by applying a **rescoring step** on top of the standard semantic or distance-based ranking. -``` +As you structure the query, you can define a `formula` that references both existing scores (like cosine similarities) and additional payload data (e.g., timestamps, location info, numeric attributes). Let's take a look at some examples: -* * * +### Idea 1: Prioritizing Website Content -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#step-9-test-the-bot)**Step 9: Test the Bot** +Let's say you are trying to improve the search feature for a documentation site, such as our [**Developer Portal**](https://qdrant.tech/documentation/). You would chunk and vectorize all your documentation and store it in a Qdrant collection. -1. Invite your bot to your Discord server using the OAuth2 URL from the Discord Developer Portal. +**Figure 1:** Any time someone types in **hybrid queries**, you want to show them the most relevant result at the top. +![reranking](/blog/qdrant-1.14.x/website-search.png) -2. Run the notebook. +**Reranking** can help you prioritize the best results based on user intent. -3. Start chatting with the bot in your Discord server. It will retrieve context from Qdrant and provide relevant answers based on your queries. +Your website collection can have vectors for **titles**, **paragraphs**, and **code snippet** sections of your documentation. You can create a `tag` payload field that indicates whether a point is a title, paragraph, or snippet. 
Then, to give more weight to titles and paragraphs, you might do something like: + +```text +score = score + (is_title * 0.5) + (is_paragraph * 0.25) +``` +**Above is just sample logic - but here is the actual Qdrant API request:** -![agentic-rag-discord-bot-what-is-quantization](https://qdrant.tech/documentation/examples/agentic-rag-camelai-discord/example.png) +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [0.2, 0.8, ...], // <-- dense vector for the query + "limit": 50 + }, + "query": { + "formula": { + "sum": [ + "$score", // Semantic score + { + "mult": [ + 0.5, // weight for title + { // Filter for title + "key": "tag", + "match": { "any": ["h1","h2","h3","h4"] } + } + ] + }, + { + "mult": [ + 0.25, // weight for paragraph + { // Filter for paragraph + "key": "tag", + "match": { "any": ["p","li"] } + } + ] + } + ] + } + } +} +``` -* * * +### Idea 2: Reranking Most Recent Results -## [Anchor](https://qdrant.tech/documentation/agentic-rag-camelai-discord/\#conclusion) Conclusion +One of the most important advancements is the ability to prioritize recency. In many scenarios, such as in news or job listings, users want to see the most recent results first. Until now, this wasn’t possible without additional work: *you had to fetch all the data and manually filter for the latest entries on your side*. -Nice work! You’ve built an agentic RAG-powered Discord bot that retrieves relevant information with Qdrant, generates smart responses with OpenAI, and handles multi-step reasoning using CAMEL-AI. Here’s a quick recap: +Now, the similarity score **doesn’t have to rely solely on cosine distance**. It can also take into account how recent the data is, allowing for much more dynamic and context-aware ranking. -- **Smart Knowledge Retrieval:** Your chatbot can now pull relevant info from large datasets using Qdrant’s vector search. +> With the Score-Boosting Reranker, simply add a `datetime` payload field and factor it into your formula so fresher data rises to the top. -- **Autonomous Reasoning with CAMEL-AI:** Enables multi-step reasoning instead of just regurgitating text. +**Example Query**: -- **Live Discord Deployment:** You launched the chatbot on Discord, making it interactive and ready to help real users. +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { ... }, + "query": { + "formula": { + "sum": [ + "$score", + { + "gauss_decay": + "target": { "datetime": }, + "x": { "datetime_key": }, + "scale": <1 week in seconds> + } + ] + } + } +} +``` +### Idea 3: Factor in Geographical Proximity -One of the biggest advantages of CAMEL-AI is the abstraction it provides, allowing you to focus on designing intelligent interactions rather than worrying about low-level implementation details. +Let’s say you’re searching for a restaurant serving Currywurst. Sure, Berlin has some of the best, but you probably don’t want to spend two days traveling for a sausage covered in magical seasoning. The best match is the one that **balances the distance in the vector space with a real-world geographical distance**. You want your users see relevant and conveniently located options. -You’re now well-equipped to tackle more complex real-world problems that require scalable, autonomous knowledge systems. +This feature introduces a multi-objective optimization: combining semantic similarity with geographical proximity. Suppose each point has a `geo.location` payload field (latitude, longitude). 
You can use a `gauss_decay` function to clamp the distance into a 0–1 range and add that to your similarity score: + +```text +score = $score + gauss_decay(distance) +``` -##### Was this page useful? +**Example Query**: + +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "query": [0.2, 0.8, ...], + "limit": 50 + }, + "query": { + "formula": { + "sum": [ + "$score", + { + "gauss_decay": { + "scale": 5000, // e.g. 5 km + "x": { + "geo_distance": { + "origin": { // Berlin + "lat": 52.504043, + "lon": 13.393236 + }, + "to": "geo.location" + } + } + } + } + ] + }, + "defaults": { + "geo.location": { // Munich + "lat": 48.137154, + "lon": 11.576124 + } + } + } +} +``` -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +You can tweak parameters like `target`, `scale`, and `midpoint` to shape how quickly the score decays over distance. This is extremely useful for local search scenarios, where location is a major factor but not the only factor. -Thank you for your feedback! 🙏 +> This is a very powerful feature that allows for extensive customization. Read more about this feature in the [**Hybrid Queries Documentation**](/documentation/concepts/hybrid-queries/) -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/agentic-rag-camelai-discord.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Incremental HNSW Indexing +![optimizations](/blog/qdrant-1.14.x/optimizations.jpg) -On this page: +Rebuilding an entire [**HNSW graph**](/documentation/concepts/indexing/#vector-index) every time new data is added can be computationally expensive. With this release, Qdrant now supports incremental HNSW indexing—an approach that extends existing HNSW graphs rather than recreating them from scratch. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/agentic-rag-camelai-discord.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +> This feature is designed to make indexing faster and more efficient when you’re only adding new points. It reuses the existing structure of the HNSW graph and appends the new data directly onto it. -× +That means much less time spent building and more time searching. Although this initial implementation currently only support upserts, it lays the groundwork for a more dynamic and performance-friendly indexing process. Especially for collections with frequent updates to payload values or growing datasets, incremental HNSW is a big step forward. -[Powered by](https://qdrant.tech/) +> Note that deletes and updates will still trigger a full rebuild. Check out the [**indexing documentation**](/documentation/concepts/indexing/) to learn more. -<|page-167-lllmstxt|> -## llama-index-multitenancy -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Multitenancy with LlamaIndex +## Faster Batch Queries +![reranking](/blog/qdrant-1.14.x/gridstore.jpg) -# [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#multitenancy-with-llamaindex) Multitenancy with LlamaIndex +In this release, Qdrant introduces a major performance boost for [**batch query operations**](/documentation/concepts/search/#batch-search-api). 
Until now, the query batch API used a single thread per segment, which worked well—unless you had just one segment and a large batch of queries in a single request. In such cases, everything was processed on a single thread, significantly slowing things down. This scenario was especially common when using our [**Python client**](https://github.com/qdrant/qdrant-client), which is single-threaded by default. -If you are building a service that serves vectors for many independent users, and you want to isolate their -data, the best practice is to use a single collection with payload-based partitioning. This approach is -called **multitenancy**. Our guide on the [Separate Partitions](https://qdrant.tech/documentation/guides/multiple-partitions/) describes -how to set it up in general, but if you use [LlamaIndex](https://qdrant.tech/documentation/integrations/llama-index/) as a -backend, you may prefer reading a more specific instruction. So here it is! +The new optimization changes that. Large query batches are now split into chunks, and each chunk is processed on a separate thread. -## [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#prerequisites) Prerequisites +> This allows Qdrant to execute queries concurrently, even when operating over a single segment with a limited number of requests. -This tutorial assumes that you have already installed Qdrant and LlamaIndex. If you haven’t, please run the -following commands: +You will get much faster response times for large batches of queries. If you’re working with high-volume query workloads, you should notice a significant improvement in latency. Benchmark results show just how dramatic the difference can be. -```bash -pip install llama-index llama-index-vector-stores-qdrant +As a basic test, we populated a 1-segment collection using our [**bfb benchmarking tool**](https://github.com/qdrant/bfb), and ran a request of 240 batch queries. -``` +Initially, this process only saturated a single CPU and took 11 seconds: +![parallel-before](/blog/qdrant-1.14.x/parallel-before.png) -We are going to use a local Docker-based instance of Qdrant. If you want to use a remote instance, please -adjust the code accordingly. Here is how we can start a local instance: +After, it saturated 23 CPUs (search threads = CPU count - 1) and took 4.5 seconds: +![parallel-after](/blog/qdrant-1.14.x/parallel-after.png) -```bash -docker run -d --name qdrant -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest +We ran the same test of large queries for the following configurations: -``` +| Configuration | Segments | Shards | Indexing Threshold | Before | After | Improvement | +|--------------|----------|---------|-------------------|---------|--------|-------------| +| Single Segment | 1 | 1 | 20k | 10.5s | 4.5s | 57% | +| Multi Segment | 10 | 1 | 20k | 5.5s | 5.1s | 7% | +| Single Segment, Multi Shard | 1 | 4 | 20k | 5.2s | 4.6s | 12% | +| Multi Everything | 10 | 4 | 1k | 5.1s | 4.5s | 12% | +| High Shard, Single Segment | 1 | 16 | 1k | 5.2s | 3.7s | 29% | +| High Shard, Multi Segment | 10 | 16 | 1k | 2.5s | 1.7s | 32% | -## [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#setting-up-llamaindex-pipeline) Setting up LlamaIndex pipeline +As you can see, the improvement is **most significant (57%) in single-segment configurations** where parallelization was previously limited. Even in already-optimized multi-shard setups, we still see good gains of 12-32%. 
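To benefit from this, send queries as one batch request instead of many individual calls. Below is a minimal sketch of such a request against the batch endpoint of the Query API (vector values abbreviated, and the filter shown is only an example):

```http
POST /collections/{collection_name}/points/query/batch
{
  "searches": [
    {
      "query": [0.2, 0.8, ...],   // <-- dense vector for query 1
      "limit": 10
    },
    {
      "query": [0.1, 0.4, ...],   // <-- dense vector for query 2
      "filter": {
        "must": [ { "key": "city", "match": { "value": "Berlin" } } ]
      },
      "limit": 10
    }
  ]
}
```

The server now splits such a batch into chunks and processes them on separate threads, so even a single-threaded client gets the benefit of parallel execution.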
-We are going to implement an end-to-end example of multitenant application using LlamaIndex. We’ll be -indexing the documentation of different Python libraries, and we definitely don’t want any users to see the -results coming from a library they are not interested in. In real case scenarios, this is even more dangerous, -as the documents may contain sensitive information. +> For more on batch queries, check out the [**Search documentation**](/documentation/concepts/search/#batch-search-api). -### [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#creating-vector-store) Creating vector store +## Improved Resource Use During Segment Optimization +![segment-optimization](/blog/qdrant-1.14.x/segment-optimization.jpg) -[QdrantVectorStore](https://docs.llamaindex.ai/en/stable/examples/vector_stores/QdrantIndexDemo.html) is a -wrapper around Qdrant that provides all the necessary methods to work with your vector database in LlamaIndex. -Let’s create a vector store for our collection. It requires setting a collection name and passing an instance -of `QdrantClient`. +Qdrant now **saturates CPU and disk IO** more effectively in parallel when optimizing segments. This helps reduce the "sawtooth" usage pattern—where CPU or disk often sat idle while waiting on the other resource. -```python -from qdrant_client import QdrantClient -from llama_index.vector_stores.qdrant import QdrantVectorStore +This leads to **faster optimizations**, which are specially noticeable on large machines handling big data movement. +It also gives you **predictable performance**, as there are fewer sudden spikes or slowdowns during indexing and merging operations. -client = QdrantClient("http://localhost:6333") +**Figure 2:** Indexing 400 million vectors - CPU and disk usage profiles. Previous Qdrant version on the left, new Qdrant version on the right. +![indexation-improvement](/blog/qdrant-1.14.x/indexation.png) -vector_store = QdrantVectorStore( - collection_name="my_collection", - client=client, -) +**Observed Results:** The new version on the right clearly shows much better CPU saturation across the full process. The improvement is especially noticeable during large-scale indexing. -``` +In our experiment, **we indexed 400 million 512-dimensional vectors**. The previous version of Qdrant took around 40 hours on an 8-core machine, while the new version with this change completed the task in just 28 hours. -### [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#defining-chunking-strategy-and-embedding-model) Defining chunking strategy and embedding model +> **Tutorial:** If you want to work with a large number of vectors, we can show you how. [**Learn how to upload and search large collections efficiently.**](/documentation/database-tutorials/large-scale-search/) -Any semantic search application requires a way to convert text queries into vectors - an embedding model. -`ServiceContext` is a bundle of commonly used resources used during the indexing and querying stage in any -LlamaIndex application. We can also use it to set up an embedding model - in our case, a local -[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5). 
-set up -```python -from llama_index.core import ServiceContext +## Optimized Memory Usage in Immutable Segments +![immutable-segments](/blog/qdrant-1.14.x/immutable-segments.jpg) -service_context = ServiceContext.from_defaults( - embed_model="local:BAAI/bge-small-en-v1.5", -) +We revamped how the ID tracker and related metadata structures store data in memory. This can result in a notable RAM reduction for very large datasets (hundreds of millions of vectors). -``` +This causes **much lower overhead**, where memory savings let you store more vectors on the same hardware. Also, improved scalability is a major benefit. If your workload was near the RAM limit, this might let you push further **without using additional servers**. -_Note_, in case you are using Large Language Model different from OpenAI’s ChatGPT, you should specify -`llm` parameter for `ServiceContext`. +## Upgrading to Version 1.14 -We can also control how our documents are split into chunks, or nodes using LLamaIndex’s terminology. -The `SimpleNodeParser` splits documents into fixed length chunks with an overlap. The defaults are -reasonable, but we can also adjust them if we want to. Both values are defined in tokens. +With Qdrant 1.14, all client libraries remain fully compatible. If you do not need custom payload-based ranking, **your existing workflows remain unchanged**. -```python -from llama_index.core.node_parser import SimpleNodeParser +> **Upgrading from earlier versions is straightforward** — no major API or index-breaking changes. -node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=32) +In **Qdrant Cloud**, simply go to your **Cluster Details** screen and select **Version 1.14** from the dropdown. The upgrade may take a few moments. -``` +**Figure 3:** Updating to the latest software version from Qdrant Cloud dashboard. +![reranking](/blog/qdrant-1.14.x/upgrade.png) -Now we also need to inform the `ServiceContext` about our choices: +**Documentation:** For a full list of formula expressions, conditions, decay functions, and usage examples, see the official [**Qdrant documentation**](https://qdrant.tech/documentation) and the [**API reference**](https://api.qdrant.tech/). This includes detailed code snippets for popular languages and a variety of advanced reranking examples. -```python -service_context = ServiceContext.from_defaults( - embed_model="local:BAAI/bge-large-en-v1.5", - node_parser=node_parser, -) +#### Watch the Features in Action! -``` +Want to see these updates working in a real-world setup? -Both embedding model and selected node parser will be implicitly used during the indexing and querying. +In our latest Qdrant Office Hours, we demoed the new Score-Boosting Reranker, walked through Incremental HNSW Indexing, and tested the impact of the new multi-threaded batch queries. -### [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#combining-everything-together) Combining everything together +Watch the full recording here: -The last missing piece, before we can start indexing, is the `VectorStoreIndex`. It is a wrapper around -`VectorStore` that provides a convenient interface for indexing and querying. It also requires a -`ServiceContext` to be initialized. + -```python -from llama_index.core import VectorStoreIndex +#### Join the Discussion! 
-index = VectorStoreIndex.from_vector_store( - vector_store=vector_store, service_context=service_context -) +**We'd love to hear your feedback:** If you have questions or want to share your experience, join our [**Discord**](https://discord.gg/qdrant) or open an issue on [**GitHub**](https://github.com/qdrant/qdrant/issues). -``` +![community](/blog/qdrant-1.14.x/community.jpg) -## [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#indexing-documents) Indexing documents +<|page-319-lllmstxt|> +## **Pathwork Optimizes Life Insurance Underwriting with Precision Vector Search** -No matter how our documents are generated, LlamaIndex will automatically split them into nodes, if -required, encode using selected embedding model, and then store in the vector store. Let’s define -some documents manually and insert them into Qdrant collection. Our documents are going to have -a single metadata attribute - a library name they belong to. +![Pathwork Optimizes Life Insurance Underwriting with Precision Vector Search](/blog/case-study-pathwork/case-study-pathwork-summary-dark-b.jpg) -```python -from llama_index.core.schema import Document +### **About Pathwork** -documents = [\ - Document(\ - text="LlamaIndex is a simple, flexible data framework for connecting custom data sources to large language models.",\ - metadata={\ - "library": "llama-index",\ - },\ - ),\ - Document(\ - text="Qdrant is a vector database & vector similarity search engine.",\ - metadata={\ - "library": "qdrant",\ - },\ - ),\ -] +Pathwork is redesigning life and health insurance workflows for the age of AI. Brokerages and insurance carriers utilize Pathwork's advanced agentic system to automate their underwriting processes and enhance back-office sales operations. Pathwork's solution drastically reduces errors, completes tasks up to 70 times faster, and significantly conserves human capital. -``` +### **The Challenge: Accuracy Above All** -Now we can index them using our `VectorStoreIndex`: +Life insurance underwriting demands exceptional accuracy. Traditionally, underwriting involves extensive manual input, subjective judgment, and frequent errors. These errors, such as misclassifying risk based on incomplete or misunderstood health data, often result in lost sales and customer dissatisfaction due to sudden premium changes. -```python -for document in documents: - index.insert(document) +"Accuracy is paramount—every error can mean hundreds of dollars per month in difference to customers or waiting months longer for coverage," explains Blake Butterworth, co-founder of Pathwork. -``` +### **Pathwork’s Innovative Solution** -### [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#performance-considerations) Performance considerations +Pathwork addresses these issues with an AI-powered underwriting tool. The platform uses vector search and retrieval augmented generation (RAG) techniques, enabling brokers to rapidly match customers with precise risk classes and insurance products based on live broker inputs collected during the quoting process via conversation or document upload. -Our documents have been split into nodes, encoded using the embedding model, and stored in the vector -store. However, we don’t want to allow our users to search for all the documents in the collection, -but only for the documents that belong to a library they are interested in. 
For that reason, we need -to set up the Qdrant [payload index](https://qdrant.tech/documentation/concepts/indexing/#payload-index), so the search -is more efficient. +Initially, Pathwork explored various solutions, including Amazon S3, OpenSearch, and other vector databases. However, none provided the necessary combination of performance, ease of use, and reliability. Ultimately, Pathwork chose Qdrant Cloud due to its strong documentation and developer-friendly environment. -```python -from qdrant_client import models +### **Why Pathwork Chose Qdrant** -client.create_payload_index( - collection_name="my_collection", - field_name="metadata.library", - field_type=models.PayloadSchemaType.KEYWORD, -) +"We landed on Qdrant after extensive trial and error," Blake shared. "Our engineers found Qdrant’s documentation and support significantly better than other solutions. At critical junctures, Qdrant’s support felt like having an additional principal engineer on our team. Fantastic service through their helpdesk was a standout experience." -``` +### **The Impact: Increased Accuracy and User Adoption** -The payload index is not the only thing we want to change. Since none of the search -queries will be executed on the whole collection, we can also change its configuration, so the HNSW -graph is not built globally. This is also done due to [performance reasons](https://qdrant.tech/documentation/guides/multiple-partitions/#calibrate-performance). -**You should not be changing these parameters, if you know there will be some global search operations** -**done on the collection.** +After implementing Qdrant, Pathwork rapidly saw significant improvements: -```python -client.update_collection( - collection_name="my_collection", - hnsw_config=models.HnswConfigDiff(payload_m=16, m=0), -) +* **Accuracy Improvements:** Pathwork achieved significant precision gains, nearly halving mean squared error (MSE) from 3.5 to 1.8 in February alone. These improvements were driven by enhancements such as scalar quantization, hybrid search, and advanced filter utilization. Accuracy is measured by how closely the predictions match the final risk class assigned by licensed underwriters. +* **Performance Enhancements:** Latency was drastically reduced from 9 seconds to just 2 seconds per query, thanks to optimizations including storing vectors in RAM rather than on disk and improved scaling methods (replicas, shards, nodes). +* **Rapid Growth:** Usage has grown 50% month-over-month, with thousands of insurance cases processed in the last month alone. To maintain low-latency retrieval at scale, Pathwork expanded its Qdrant deployment with additional nodes, implemented sharding to distribute load, and introduced replicas to support high-concurrency read operations. These scaling changes ensured consistent performance as usage surged. +* **User Satisfaction:** Accurate, consistent underwriting results drove significant user adoption. As accuracy surpassed a critical threshold, word-of-mouth recommendations propelled user growth. -``` +"We knew we'd achieved something significant when brokers began confidently testing edge cases live during demos, resulting in immediate adoption," Blake remarked. -Once both operations are completed, we can start searching for our documents. +### **Looking Ahead** -## [Anchor](https://qdrant.tech/documentation/examples/llama-index-multitenancy/\#querying-documents-with-constraints) Querying documents with constraints +Pathwork aims to become the central hub for life insurance underwriting. 
Future plans involve deeper integration with insurance carriers, further enhancing underwriting accuracy, scalability, and efficiency. Pathwork’s commitment to precision, supported by Qdrant’s reliable vector search capabilities, is setting a new industry standard for accuracy and efficiency in life insurance underwriting. -Let’s assume we are searching for some information about large language models, but are only allowed to -use Qdrant documentation. LlamaIndex has a concept of retrievers, responsible for finding the most -relevant nodes for a given query. Our `VectorStoreIndex` can be used as a retriever, with some additional -constraints - in our case value of the `library` metadata attribute. +**"Every aspect of our system depends on precision, and Qdrant has been instrumental in achieving our goals," says Blake Butterworth.** -```python -from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter +<|page-320-lllmstxt|> +# How Lyzr Supercharged AI Agent Performance with Qdrant -qdrant_retriever = index.as_retriever( - filters=MetadataFilters( - filters=[\ - ExactMatchFilter(\ - key="library",\ - value="qdrant",\ - )\ - ] - ) -) +![How Lyzr Supercharged AI Agent Performance with Qdrant](/blog/case-study-lyzr/case-study-lyzr-summary-dark.png) -nodes_with_scores = qdrant_retriever.retrieve("large language models") -for node in nodes_with_scores: - print(node.text, node.score) -# Output: Qdrant is a vector database & vector similarity search engine. 0.60551536 +## Scaling Intelligent Agents: How Lyzr Supercharged Performance with Qdrant -``` +As AI agents become more capable and pervasive, the infrastructure behind them must evolve to handle rising concurrency, low-latency demands, and ever-growing knowledge bases. At Lyzr Agent Studio—where over 100 agents are deployed across industries—these challenges arrived quickly and at scale. -The description of Qdrant was the best match, even though it didn’t mention large language models -at all. However, it was the only document that belonged to the `qdrant` library, so there was no -other choice. Let’s try to search for something that is not present in the collection. +When their existing vector database infrastructure began to buckle under pressure, the engineering team needed a solution that could do more than just keep up. It had to accelerate them forward. -Let’s define another retrieve, this time for the `llama-index` library: +This is how they rethought their stack and adopted Qdrant as the foundation for fast, scalable agent performance. -```python -llama_index_retriever = index.as_retriever( - filters=MetadataFilters( - filters=[\ - ExactMatchFilter(\ - key="library",\ - value="llama-index",\ - )\ - ] - ) -) +## The Scaling Limits of Early Stack Choices -nodes_with_scores = llama_index_retriever.retrieve("large language models") -for node in nodes_with_scores: - print(node.text, node.score) -# Output: LlamaIndex is a simple, flexible data framework for connecting custom data sources to large language models. 0.63576734 +![Lyzr-architecture](/blog/case-study-lyzr/lyzr-architecture.jpg) -``` +Lyzr’s architecture used Weaviate, with additional benchmarking on Pinecone. Initially, this setup was fine for development and controlled testing. The system managed around 1,500 vector entries, with a small number of agents issuing moderate query loads in a steady pattern. -The results returned by both retrievers are different, due to the different constraints, so we implemented -a real multitenant search application! 
+### Initial setup: -##### Was this page useful? +| Parameter | Details | +| :---: | :---: | +| Deployment Type | Single-node or small-cluster (Weaviate and other vector db) | +| Embedding Model | Sentence-transformer (768 dimensions) | +| Concurrent Agents | 10 to 20 knowledge search agents | +| Query Rate per Agent | 5-10 queries per minute | +| Traffic Pattern | Steady, no significant spikes | -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Under these conditions, both databases performed adequately. Query latency hovered between 80 and 150 milliseconds. Indexing operations completed within a few hours. Overall performance was predictable and stable. -Thank you for your feedback! 🙏 +But as the platform expanded—with a larger corpus, more complex workflows, and significantly more concurrency—these systems began to falter. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/llama-index-multitenancy.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Growth Brings Latency, Timeouts, and Resource Bottlenecks -On this page: +Once the knowledge base exceeded 2,500 entries and live agent concurrency grew past 100, the platform began to strain. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/llama-index-multitenancy.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Query latency increased nearly 4x to 300-500 milliseconds. During peak usage, agents sometimes timed out waiting for vector results, which impacted downstream decision logic. Indexing operations slowed as well, consuming excess CPU and memory, and introducing bottlenecks during data updates. -× +These issues created real friction in production environments—and made it clear that a more scalable, performant vector database was needed. -[Powered by](https://qdrant.tech/) +## Evaluation of Alternative Vector Databases -<|page-168-lllmstxt|> -## web-ui -- [Documentation](https://qdrant.tech/documentation/) -- Qdrant Web UI +With growing data volume and rising agent concurrency, Lyzr needed a more scalable and efficient vector database. -# [Anchor](https://qdrant.tech/documentation/web-ui/\#qdrant-web-ui) Qdrant Web UI +They needed something that could handle heavier loads while maintaining fast response times and reducing operational overhead. They evaluated alternatives based on the below criteria: -You can manage both local and cloud Qdrant deployments through the Web UI. 
+| Criteria | Focus Area | Impact on System | +| :---: | :---: | :---: | +| Scalability & Distributed Computing | Horizontal scaling, clustering | Support growing datasets and high agent concurrency | +| Indexing Performance | Ingestion speed, update efficiency | Reduce downtime and enable faster bulk data updates | +| Query Latency & Throughput | Search response under load | Ensure agents maintain fast, real-time responses | +| Consistency & Reliability | Handling concurrency & failures | Avoid timeouts and failed queries during peak usage | +| Resource Efficiency | CPU, memory, and storage usage | Optimize infrastructure costs while scaling workload | +| Benchmark Results | Real-world load simulation | Validate sustained performance under \>1,000 QPM loads | -If you’ve set up a deployment locally with the Qdrant [Quickstart](https://qdrant.tech/documentation/quick-start/), -navigate to http://localhost:6333/dashboard. +## Qdrant speeds up queries by \>90%, indexes 2x faster, and reduces infra costs by 30%. -If you’ve set up a deployment in a cloud cluster, find your Cluster URL in your -cloud dashboard, at [https://cloud.qdrant.io](https://cloud.qdrant.io/). Add `:6333/dashboard` to the end -of the URL. +That shift came with Qdrant, which quickly surpassed expectations across every critical metric. -## [Anchor](https://qdrant.tech/documentation/web-ui/\#access-the-web-ui) Access the Web UI +With Qdrant, query latency dropped to just **20–50 milliseconds**, a **\>90% improvement** over Weaviate and Pinecone. Even with hundreds of concurrent agents generating over 1,000 queries per minute, performance remained consistent. -Qdrant’s Web UI is an intuitive and efficient graphic interface for your Qdrant Collections, REST API and data points. +Indexing operations improved dramatically. Ingestion times for large datasets were **2x faster**, and the system required significantly fewer compute and memory resources to complete them. This enabled the team to reduce infrastructure costs by approximately **30%**. -In the **Console**, you may use the REST API to interact with Qdrant, while in **Collections**, you can manage all the collections and upload Snapshots. +Qdrant also demonstrated greater consistency. While Weaviate and Pinecone both encountered performance degradation at scale, Qdrant remained stable under 1,000+ queries per minute—supporting over 100 concurrent agents without latency spikes or slowdowns. Most notably, Lyzr sustained **throughput of more than 250 queries per second**, across distributed agents, without compromising speed or stability. -![Qdrant Web UI](https://qdrant.tech/articles_data/qdrant-1.3.x/web-ui.png) +| Metric | Weaviate | Pinecone | Qdrant | +| :---: | :---: | :---: | :---: | +| Avg Query Latency at 100 agents (ms) | 300-500 | 250-450 | 20-50 (P99) | +| Indexing Hours (2,500+ entries) | \~3 | \~2.5 | \~1.5 | +| Query Throughput (QPS) | \~80 | \~100 | \>250 | +| Resource Utilization (CPU/Memory) | High | Medium-High | Low-Medium | +| Horizontal Scalability | Moderate | Moderate | Highly Scalable | -### [Anchor](https://qdrant.tech/documentation/web-ui/\#qdrant-web-ui-features) Qdrant Web UI features +Qdrant’s HNSW-based indexing allowed the system to handle live updates without downtime or reindexing—eliminating one of the biggest sources of friction in the previous setup. 
-In the Qdrant Web UI, you can: +## Use Case Spotlight: NTT Data improves retrieval accuracy -- Run HTTP-based calls from the console -- List and search existing [collections](https://qdrant.tech/documentation/concepts/collections/) -- Learn from our interactive tutorial +One deployment, built for NTT Data, focused on automating IT change request workflows. The agent initially ran on Cosmos DB within Azure. While integration was smooth, vector search performance was limited. Indexing precision was inadequate, and the system struggled to surface relevant results as data volume grew. -You can navigate to these options directly. For example, if you used our -[quick start](https://qdrant.tech/documentation/quick-start/) to set up a cluster on localhost, -you can review our tutorial at http://localhost:6333/dashboard#/tutorial. +After migrating to Qdrant, the difference was immediate. Retrieval accuracy improved substantially, even for long-tail queries. The system maintained high responsiveness under concurrent loads, and horizontal scaling became simpler—ensuring consistent performance as project demands evolved. -##### Was this page useful? +![NTT Architecture](/blog/case-study-lyzr/ntt-visual.jpg) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +--- -Thank you for your feedback! 🙏 +## Use Case Spotlight: NPD supports accurate, low-latency retrieval for Agents -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/web-ui.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Another example involved NPD, which deployed customer-facing agents across six websites. These agents were tasked with answering product questions and guiding users to the correct URLs based on a dynamic, site-wide knowledge base. -On this page: +Qdrant’s vector search enabled accurate, low-latency retrieval across thousands of entries. Even under increasing user traffic, the platform delivered consistent performance, eliminating the latency spikes experienced with previous solutions. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/web-ui.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +![NPD Architecture](/blog/case-study-lyzr/npd-visual.jpeg) -× +## Final Thoughts -[Powered by](https://qdrant.tech/) +The lesson from Lyzr’s experience is clear: a production-grade AI platform demands a production-grade vector database. -<|page-169-lllmstxt|> -## recommendation-system-ovhcloud -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Movie Recommendation System +Qdrant delivered on that requirement. It allowed Lyzr to dramatically reduce latency, scale query throughput, simplify data ingestion, and lower infrastructure costs—all while maintaining system stability at scale. -# [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#movie-recommendation-system) Movie Recommendation System +As the AI ecosystem evolves, the performance of the vector database will increasingly dictate the performance of the agent itself. With Qdrant, Lyzr gained the infrastructure it needed to keep its agents fast, intelligent, and reliable—even under real-world production loads. 
-| Time: 120 min | Level: Advanced | Output: [GitHub](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-OVH.ipynb) | | -| --- | --- | --- | --- | +--- -In this tutorial, you will build a mechanism that recommends movies based on defined preferences. Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings. They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques. In this specific case, we will use [sparse vectors](https://qdrant.tech/articles/sparse-vectors/) to create an efficient and accurate recommendation system. +**Want to see how Lyzr Agent Studio and Qdrant can work in your stack?** +Explore [Lyzr Agent Studio](https://studio.lyzr.ai/) or learn more about [Qdrant](https://qdrant.tech/). -**Privacy and Sovereignty:** Since preference data is proprietary, it should be stored in a secure and controlled environment. Our vector database can easily be hosted on [OVHcloud](https://ovhcloud.com/), our trusted [Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/) partner. This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud’s interface. Both products have been tested for compatibility and scalability, and we recommend their [managed Kubernetes](https://www.ovhcloud.com/en/public-cloud/kubernetes/) service. +<|page-321-lllmstxt|> +# How Mixpeek Uses Qdrant for Efficient Multimodal Feature Stores -> To see the entire output, use our [notebook with complete instructions](https://github.com/infoslack/qdrant-example/blob/main/HC-demo/HC-OVH.ipynb). +![How Mixpeek Uses Qdrant for Efficient Multimodal Feature Stores](/blog/case-study-mixpeek/Case-Study-Mixpeek-Summary-Dark.jpg) -## [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#components) Components +## About Mixpeek -- **Dataset:** The [MovieLens dataset](https://grouplens.org/datasets/movielens/) contains a list of movies and ratings given by users. -- **Cloud:** [OVHcloud](https://ovhcloud.com/), with managed Kubernetes. -- **Vector DB:** [Qdrant Hybrid Cloud](https://hybrid-cloud.qdrant.tech/) running on [OVHcloud](https://ovhcloud.com/). +[Mixpeek](http://mixpeek.com) is a multimodal data processing and retrieval platform designed for developers and data teams. Founded by Ethan Steininger, a former MongoDB search specialist, Mixpeek enables efficient ingestion, feature extraction, and retrieval across diverse media types including video, images, audio, and text. -**Methodology:** We’re adopting a collaborative filtering approach to construct a recommendation system from the dataset provided. Collaborative filtering works on the premise that if two users share similar tastes, they’re likely to enjoy similar movies. Leveraging this concept, we’ll identify users whose ratings align closely with ours, and explore the movies they liked but we haven’t seen yet. To do this, we’ll represent each user’s ratings as a vector in a high-dimensional, sparse space. Using Qdrant, we’ll index these vectors and search for users whose ratings vectors closely match ours. Ultimately, we will see which movies were enjoyed by users similar to us. 
+## The Challenge: Optimizing Feature Stores for Complex Retrievers -![](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/architecture-diagram.png) +As Mixpeek's multimodal data warehouse evolved, their feature stores needed to support increasingly complex retrieval patterns. Initially using MongoDB Atlas's vector search, they encountered limitations when implementing [**hybrid retrievers**](https://docs.mixpeek.com/retrieval/retrievers) **combining dense and sparse vectors with metadata pre-filtering**. -## [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#deploying-qdrant-hybrid-cloud-on-ovhcloud) Deploying Qdrant Hybrid Cloud on OVHcloud +A critical limitation emerged when implementing **late interaction techniques like ColBERT across video embeddings**, which requires multi-vector indexing. MongoDB's kNN search could not support these multi-vector representations for this contextual understanding. -[Service Managed Kubernetes](https://www.ovhcloud.com/en-in/public-cloud/kubernetes/), powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility. +Another one of Mixpeek’s customers required **reverse video search** for programmatic ad-serving, where retrievers needed to identify **high-converting video segments** across massive object collections \- a task that proved inefficient with MongoDB’s general-purpose database feature stores. -1. To start using managed Kubernetes on OVHcloud, follow the [platform-specific documentation](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/#ovhcloud). -2. Once your Kubernetes clusters are up, [you can begin deploying Qdrant Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/). +*"We eliminated hundreds of lines of code with what was previously a MongoDB kNN Hybrid search when we replaced it with Qdrant as our feature store."* — Ethan Steininger, Mixpeek Founder -## [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#prerequisites) Prerequisites +![mixpeek-architecture-with-qdrant](/blog/case-study-mixpeek/mixpeek-architecture.jpg) -Download and unzip the MovieLens dataset: +## Why Mixpeek Chose Qdrant for Feature Stores -```shell -mkdir -p data -wget https://files.grouplens.org/datasets/movielens/ml-1m.zip -unzip ml-1m.zip -d data +After evaluating multiple options including **Postgres with pgvector** and **MongoDB's kNN search**, Mixpeek selected Qdrant to power [their feature stores](https://docs.mixpeek.com/management/features) due to its specialization in vector search and integration capabilities with their retrieval pipelines. Qdrant's native support for multi-vector indexing was crucial for implementing late interaction techniques like ColBERT, which MongoDB couldn't efficiently support. -``` +### Simplifying Hybrid Retrievers -The necessary \* libraries are installed using `pip`, including `pandas` for data manipulation, `qdrant-client` for interfacing with Qdrant, and `*-dotenv` for managing environment variables. +Previously, Mixpeek maintained complex custom logic to merge results from different feature stores. Qdrant's native support for Reciprocal Rank Fusion (RRF) streamlined their retriever implementations, reducing hybrid search code by **80%**. 
The multi-vector capabilities also enabled more sophisticated retrieval methods that better captured semantic relationships. -```python -!pip install -U \ - pandas \ - qdrant-client \ - *-dotenv +*"Hybrid retrievers with our previous feature stores were challenging. With Qdrant, it just worked."* -``` +### 40% Faster Query Times with Parallel Retrieval -The `.env` file is used to store sensitive information like the Qdrant host URL and API key securely. +For collections with billions of features, Qdrant's prefetching capabilities enabled parallel retrieval across multiple feature stores. This cut retriever query times by 40%, dropping from \~2.5s to 1.3-1.6s. -```shell -QDRANT_HOST -QDRANT_API_KEY +*"Prefetching in Qdrant lets us execute multiple feature store retrievals simultaneously and then combine the results, perfectly supporting our retriever pipeline architecture."* -``` +### Optimizing SageMaker Feature Extraction Workflows -Load all environment variables into the setup: +Mixpeek uses Amazon SageMaker for [feature extraction](https://docs.mixpeek.com/extraction/scene-splitting), and database queries were a significant bottleneck. By implementing Qdrant as their feature store, they reduced query overhead by 50%, streamlining their ingestion pipelines. -```python -import os -from dotenv import load_dotenv -load_dotenv('./.env') +*"We were running inference with SageMaker for feature extraction, and our feature store queries used to be a significant bottleneck. Qdrant shaved off a lot of that time."* -``` +## Supporting Mixpeek's Taxonomy and Clustering Architecture -## [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#implementation) Implementation +Qdrant proved particularly effective for implementing Mixpeek's taxonomy and clustering capabilities: -Load the data from the MovieLens dataset into pandas DataFrames to facilitate data manipulation and analysis. +### Taxonomies (JOIN Analogue) -```python -from qdrant_client import QdrantClient, models -import pandas as pd +Qdrant's payload filtering facilitated efficient implementation of both [flat and hierarchical taxonomies](https://docs.mixpeek.com/enrichment/taxonomies), enabling document enrichment through similarity-based "joins" across collections. -``` +### Clustering (GROUP BY Analogue) -Load user data: +The platform's batch vector search capabilities streamlined [document clustering](https://docs.mixpeek.com/enrichment/clusters) based on feature similarity, providing an effective implementation of the traditional "group by" interface. 
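In Qdrant's Query API, this combination of parallel prefetching, hybrid fusion, and metadata pre-filtering can be expressed in a single call. The sketch below is purely illustrative: the collection name, named vectors, and payload field are hypothetical placeholders, not Mixpeek's actual schema.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Two prefetch branches run in parallel, then Reciprocal Rank Fusion merges them.
results = client.query_points(
    collection_name="video_features",           # hypothetical collection name
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(indices=[10, 42], values=[0.7, 0.3]),
            using="sparse",                      # hypothetical sparse vector name
            limit=50,
        ),
        models.Prefetch(
            query=[0.01, 0.23, 0.85, 0.42],      # dense query embedding (truncated for the example)
            using="dense",                       # hypothetical dense vector name
            limit=50,
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    query_filter=models.Filter(                  # metadata pre-filtering on payload
        must=[models.FieldCondition(key="media_type", match=models.MatchValue(value="video"))]
    ),
    limit=10,
)
```

Each `Prefetch` branch is evaluated independently before the results are merged with Reciprocal Rank Fusion, which is the same pattern behind the parallel retrieval and RRF simplifications described above.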
-```python -users = pd.read_csv( - 'data/ml-1m/users.dat', - sep='::', - names=['user_id', 'gender', 'age', 'occupation', 'zip'], - engine='*' -) -users.head() +## Measurable Improvements After Feature Store Migration -``` +The migration to Qdrant as Mixpeek's feature store brought significant improvements: -Add movies: +* **40% Faster Retrievers**: Reduced query times from \~2.5s to 1.3-1.6s +* **80% Code Reduction**: Simplified retriever implementations +* **Improved Developer Productivity**: Easier implementation of complex retrieval patterns +* **Optimized Scalability**: Better performance at billion-feature scale +* **Enhanced Multimodal Retrieval**: Better support for combining different feature types -```python -movies = pd.read_csv( - 'data/ml-1m/movies.dat', - sep='::', - names=['movie_id', 'title', 'genres'], - engine='*', - encoding='latin-1' -) -movies.head() +## Future Direction: Supporting Diverse Multimodal Use Cases -``` +Mixpeek's architecture excels by pre-building specialized feature extractors tightly coupled with retriever pipelines, enabling efficient processing across [diverse multimodal use cases.](https://mixpeek.com/solutions) -Finally, add the ratings: +This architectural approach ensures that features extracted during ingestion are precisely what retrievers need for efficient querying, eliminating translation layers that typically slow down multimodal systems. -```python -ratings = pd.read_csv( - 'data/ml-1m/ratings.dat', - sep='::', - names=['user_id', 'movie_id', 'rating', 'timestamp'], - engine='*' -) -ratings.head() +*"We're moving towards sophisticated multimodal ontologies, and Qdrant's specialized capabilities as a feature store will be essential for these advanced retriever strategies."* -``` +## Conclusion: Specialized Feature Stores for Multimodal Data Warehousing -### [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#normalize-the-ratings) Normalize the ratings +Mixpeek's journey highlights the importance of specialized feature stores in a multimodal data warehouse architecture. Qdrant's focus on vector search efficiency made it the ideal choice for powering Mixpeek's feature stores, enabling more efficient retrievers and ingestion pipelines. -Sparse vectors can use advantage of negative values, so we can normalize ratings to have a mean of 0 and a standard deviation of 1. This normalization ensures that ratings are consistent and centered around zero, enabling accurate similarity calculations. In this scenario we can take into account movies that we don’t like. +<|page-322-lllmstxt|> +## 📡 Qdrant Launches Satellite Vector Broadcasting for Near-Zero Latency Retrieval -```python -ratings.rating = (ratings.rating - ratings.rating.mean()) / ratings.rating.std() +**CAPE CANAVERAL, FL** — Qdrant today announced the successful deployment of **Satellite Vector Broadcasting**, an ambitious new system for high-speed vector search that uses **actual satellites** to transmit, shard, and retrieve embeddings — bypassing Earth entirely. -``` +> “Cloud is old news. Space is the new infrastructure,” said orbital software lead Luna Hertz. “We're proud to say we've finally untethered cosine similarity from the bonds of gravity and Wi-Fi.” +> -To get the results: +The system uses a **constellation of proprietary CubeSats** equipped with ultra-low-latency broadcasting gear to beam vector data across the planet — and eventually, the solar system — via **inter-satellite vector laser relays**. Think 5G, but with telescopes. 
-```python -ratings.head() +### 📊 Benchmark Results -``` +| **Infrastructure Mode** | **Avg. Latency (ms)** | +| --- | --- | +| Earth Data Center | 34.00000 | +| Cloud Provider (Multi-region) | 22.00000 | +| LEO Satellite Mesh | 12.00000 | +| Geo-Sync Satellite Array | 8.00000 | +| CubeSat Swarm (Experimental) | 4.00000 | +| Quantum Uplink (Theoretical) | 0.00001 ✹ | -### [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#data-preparation) Data preparation +> Latency inversely correlated with altitude and absurdity. +> -Now you will transform user ratings into sparse vectors, where each vector represents ratings for different movies. This step prepares the data for indexing in Qdrant. +![](/blog/satellite-vector-broadcasting/image2.png) -First, create a collection with configured sparse vectors. For sparse vectors, you don’t need to specify the dimension, because it’s extracted from the data automatically. +### 🛰 Key Features -```python -from collections import defaultdict +- **Broadcast-to-Index Protocol (BIP)**: Queries are bounced off satellites and resolved mid-transmission using onboard embeddings. +- **Lagrange-Optimized Clustering**: Vector clusters are dynamically rearranged based on orbital positioning and cosmic vibes. +- **Cosine Boosters**: Custom solar panels double as cosine angle amplifiers for peak similarity accuracy in space. -user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []}) +### 💬 Field Reports -for row in ratings.itertuples(): - user_sparse_vectors[row.user_id]["values"].append(row.rating) - user_sparse_vectors[row.user_id]["indices"].append(row.movie_id) +> “I queried from Antarctica and got results before I hit enter.” +> +> +> — FrozenSysAdmin01 +> +> “The CubeSat talked to me. It said my vectors were beautiful.” +> +> — Beta Tester, now in therapy +> -``` +### 🛾 Coming Soon -Connect to Qdrant and create a collection called **movielens**: +- **PlutoEdgeℱ**: Coldest-ever vector cache, temperature-stabilized by cosmic background radiation +- **StarlinkGPT Embedding Sync** (requires 42 satellites and a lot of coffee) +- **Mars Cluster Alpha**: Terraforming... for faster search -```python -client = QdrantClient( - url = os.getenv("QDRANT_HOST"), - api_key = os.getenv("QDRANT_API_KEY") -) +### 📡 Availability -client.create_collection( - "movielens", - vectors_config={}, - sparse_vectors_config={ - "ratings": models.SparseVectorParams() - } -) +Satellite Vector Broadcasting is now available in limited orbit. Each query costs **one launch credit** (or barter in moon rocks). Commercial adoption expected Q3 2025, pending space traffic regulations. -``` +Learn more at: [**Qdrant Vector Database**](https://qdrant.tech/qdrant-vector-database/) -Upload user ratings to the **movielens** collection in Qdrant as sparse vectors, along with user metadata. This step populates the database with the necessary data for recommendation generation. +<|page-323-lllmstxt|> +HubSpot, a global leader in CRM solutions, continuously enhances its product suite with powerful AI-driven features. To optimize Breeze AI, its flagship intelligent assistant, HubSpot chose Qdrant as its vector database. 
-```python -def data_generator(): - for user in users.itertuples(): - yield models.PointStruct( - id=user.user_id, - vector={ - "ratings": user_sparse_vectors[user.user_id] - }, - payload=user._asdict() - ) +## **Challenges Scaling an Intelligent AI** -client.upload_points( - "movielens", - data_generator() -) +As HubSpot expanded its AI capabilities, it faced several critical challenges in scaling Breeze AI to meet growing user demands: -``` +* Delivering highly personalized, context-aware responses required a robust vector search solution that could retrieve data quickly while maintaining accuracy. +* With increasing user interactions, HubSpot needed a scalable system capable of handling rapid data growth without performance degradation. +* Integration with HubSpot’s existing AI infrastructure had to be swift and easy to support fast-paced development cycles. +* HubSpot sought a future-proof vector search solution that could adapt to emerging AI advancements while maintaining high availability. -## [Anchor](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/\#recommendations) Recommendations +These challenges made it essential to find a high-performance, developer-friendly vector database that could power Breeze AI efficiently. -Personal movie ratings are specified, where positive ratings indicate likes and negative ratings indicate dislikes. These ratings serve as the basis for finding similar users with comparable tastes. +## **Why HubSpot Chose Qdrant** -Personal ratings are converted into a sparse vector representation suitable for querying Qdrant. This vector represents the user’s preferences across different movies. +After evaluating multiple vector databases, HubSpot selected Qdrant because it significantly outperformed alternatives, ensuring that Breeze AI could quickly retrieve and rank relevant data. This was crucial for recommendation systems and contextual content retrieval, where speed and accuracy directly impact user engagement and satisfaction. -Let’s try to recommend something for ourselves: +Additionally, Hubspot was able to accelerate its development timelines due to Qdrant’s developer-friendly integration process. -``` -1 = Like --1 = dislike +Beyond immediate performance gains, HubSpot’s AI infrastructure prepared itself for scale, ensuring that Breeze AI would continue to deliver real-time, relevant responses without compromising speed. -``` +HubSpot also wanted to implement capabilities such as multi-vector search and sparse vectors, which allow for even more precise and contextually aware AI-driven interactions. -```python -# Search with movies[movies.title.str.contains("Matrix", case=False)]. +Using Qdrant, HubSpot not only solved immediate scalability challenges but also secured a long-term AI search solution capable of evolving alongside its AI roadmap. -my_ratings = { - 2571: 1, # Matrix - 329: 1, # Star Trek - 260: 1, # Star Wars - 2288: -1, # The Thing - 1: 1, # Toy Story - 1721: -1, # Titanic - 296: -1, # Pulp Fiction - 356: 1, # Forrest Gump - 2116: 1, # Lord of the Rings - 1291: -1, # Indiana Jones - 1036: -1 # Die Hard -} +## **Faster, More Accurate Search Improves Customer Satisfaction and Engagement** -inverse_ratings = {k: -v for k, v in my_ratings.items()} +Since integrating Qdrant, HubSpot has significantly enhanced the performance and intelligence of Breeze AI. 
-def to_vector(ratings): - vector = models.SparseVector( - values=[], - indices=[] - ) - for movie_id, rating in ratings.items(): - vector.values.append(rating) - vector.indices.append(movie_id) - return vector +Breeze AI now delivers highly personalized, real-time responses with exceptional contextual awareness, leading to improved customer engagement and satisfaction. With an efficient, accurate search engine, they have reduced retrieval times, ensuring that users receive relevant, timely recommendations without lag. -``` +Moreover, HubSpot can support an increasing volume of AI-powered interactions without worrying about infrastructure bottlenecks. The system adapts to growing datasets, maintaining speed and accuracy even as HubSpot’s customer base and AI use cases expand. -Query Qdrant to find users with similar tastes based on the provided personal ratings. The search returns a list of similar users along with their ratings, facilitating collaborative filtering. +From an engineering perspective, HubSpot has accelerated development cycles, bringing new AI-driven features to market faster. The reduced complexity of vector search integration has freed up engineering resources, allowing the team to focus on enhancing AI models and improving user experiences. -```python -results = client.query_points( - "movielens", - query=to_vector(my_ratings), - using="ratings", - with_vectors=True, # We will use those to find new movies - limit=20 -).points +Looking ahead, HubSpot is actively evaluating other features, such as sparse vectors and multi-vector search, to further enhance Breeze AI’s recommendation and retrieval capabilities. These innovations will enable even deeper personalization, reinforcing HubSpot’s leadership in AI-driven customer engagement. -``` +--- -Movie scores are computed based on how frequently each movie appears in the ratings of similar users, weighted by their ratings. This step identifies popular movies among users with similar tastes. Calculate how frequently each movie is found in similar users’ ratings +*"Qdrant powers our demanding recommendation and RAG applications. We chose it for its ease of deployment and high performance at scale, and we have been consistently impressed with its results. The platform’s continuous feature enhancements and overall performance gains, coupled with their responsiveness, make Qdrant a reliable solution for our AI infrastructure."* -```python -def results_to_scores(results): - movie_scores = defaultdict(lambda: 0) +**– Srubin Sethu Madhavan, Technical Lead, HubSpot** - for user in results: - user_scores = user.vector['ratings'] - for idx, rating in zip(user_scores.indices, user_scores.values): - if idx in my_ratings: - continue - movie_scores[idx] += rating +<|page-324-lllmstxt|> +Another month means another webinar! This time [Kacper Ɓukawski](https://www.linkedin.com/in/kacperlukawski/) put some of the popular AI coding agents to the +test. There is a lot of excitement around tools such as Cursor, GitHub Copilot, Aider and Claude Code, so we wanted to +see how they perform in implementing something more complex than a simple frontend application. Wouldn't it be awesome +if LLMs could code Retrieval Augmented Generation on their own? - return movie_scores +## Vibe coding -``` +**Vibe coding** is a development approach introduced by Andrej Karpathy where developers surrender to intuition rather +than control. It leverages AI coding assistants for implementation while developers focus on outcomes. 
Through voice
+interfaces and complete trust in AI suggestions, the process prioritizes results over code comprehension.

-The top-rated movies are sorted based on their scores and printed as recommendations for the user. These recommendations are tailored to the user’s preferences and aligned with their tastes. Sort movies by score and print top five:
+[![There's a new kind of coding I call "vibe coding", where you fully give in to the vibes, embrace exponentials, and
+forget that the code even exists. It's possible because the LLMs (e.g. Cursor Composer w Sonnet) are getting too good.
+Also I just talk to Composer with SuperWhisper so I barely even touch the keyboard. I ask for the dumbest things like
+"decrease the padding on the sidebar by half" because I'm too lazy to find it. I "Accept All" always, I don't read the
+diffs anymore. When I get error messages I just copy paste them in with no comment, usually that fixes it. The code
+grows beyond my usual comprehension, I'd have to really read through it for a while. Sometimes the LLMs can't fix a bug
+so I just work around it or ask for random changes until it goes away. It's not too bad for throwaway weekend projects,
+but still quite amusing. I'm building a project or webapp, but it's not really coding - I just see stuff, say stuff, run
+stuff, and copy paste stuff, and it mostly works.](/blog/webinar-vibe-coding-rag/karpathy-tweet.png)](https://x.com/karpathy/status/1886192184808149383)

-```python
-movie_scores = results_to_scores(results)
-top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
+That sounds appealing, as code might be written by less technical people, and that's what we wanted to achieve during
+our live vibe coding session to see if it’s really that easy.

-for movie_id, score in top_movies[:5]:
-    print(movies[movies.movie_id == movie_id].title.values[0], score)
+**Disclaimer:** We couldn't promise the application would even run at the very end of the webinar, but we gave it a try
+either way! It's less than an hour, and even having a somewhat functional demo within that time can already be considered a
+great success.

-```
+## Understanding the Model Context Protocol (MCP)

-Result:
+Before diving into the tools, it's important to understand what powers our approach. The [Model Context Protocol
+(MCP)](https://modelcontextprotocol.io/introduction) is an open protocol that enables seamless integration between LLM
+applications and external data sources and tools. Whether you're building an AI-powered IDE, enhancing a chat interface,
+or creating custom AI workflows, MCP provides a standardized way to connect LLMs with the context they need.

-```text
-Star Wars: Episode V - The Empire Strikes Back (1980) 20.02387858
-Star Wars: Episode VI - Return of the Jedi (1983) 16.443184379999998
-Princess Bride, The (1987) 15.840068229999996
-Raiders of the Lost Ark (1981) 14.94489462
-Sixth Sense, The (1999) 14.570322149999999
+Our [`mcp-server-qdrant`](https://github.com/qdrant/mcp-server-qdrant) implementation acts as a semantic memory layer on
+top of [Qdrant](https://qdrant.tech/). This combination allows AI agents to:

-```
+1. Store and retrieve memories (code snippets, documentation, etc.)
+2. Perform semantic searches across your codebase
+3. Find the most relevant context for generating new code
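As a rough mental model (not the server's actual implementation), the round trip boils down to storing a described snippet and later searching for it semantically. This sketch uses the qdrant-client FastEmbed helpers; the collection name and the snippet are placeholders for illustration only.

```python
from qdrant_client import QdrantClient  # requires the fastembed extra for .add()/.query()

client = QdrantClient(url="http://localhost:6333")
collection = "mcp-server-qdrant-knowledge-base"  # placeholder collection name

# Store: embed a natural-language description and keep the reusable snippet as metadata.
client.add(
    collection_name=collection,
    documents=["DaisyUI primary button with an icon"],
    metadata=[{"code": '<button class="btn btn-primary">Save</button>'}],
)

# Find: semantic search over the stored descriptions, returning the attached snippets.
for hit in client.query(collection_name=collection, query_text="button component", limit=3):
    print(hit.metadata["code"])
```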
+The server provides two primary tools:
+- `qdrant-store`: Stores information with optional metadata in the Qdrant database
+- `qdrant-find`: Retrieves relevant information using semantic search

-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+This architecture enables AI coding agents to maintain context awareness throughout your development process.

-Thank you for your feedback! 🙏
+## AI coding assistants

-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/recommendation-system-ovhcloud.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.
+There isn't a clear winner among the AI coding tools, and the choice depends on your preferences and requirements. We
+did some initial research and decided to test the following tools:

-On this page:
+![Cursor](/blog/webinar-vibe-coding-rag/cursor.png)

-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/recommendation-system-ovhcloud.md)
-- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose)
+There's been a lot of excitement around **Cursor** recently, and for good reason. It's a powerful IDE-integrated tool
+built on Visual Studio Code that promises to transform how we code with AI assistance. But those of you who had a
+somewhat complicated relationship with VS Code, like me, may prefer to stick with the familiar tools where you're most
+productive, e.g. JetBrains IDEs. A huge benefit of Cursor is that it can integrate with the **MCP servers**, such as
+`mcp-server-qdrant`, which allows you to provide your own context to the AI model.

-×
+![GitHub Copilot](/blog/webinar-vibe-coding-rag/github-copilot.png)

-[Powered by](https://qdrant.tech/)
+GitHub Copilot might be an interesting option for you, especially if you are an open source contributor and qualify
+for the Pro plan at no cost. This is quite appealing, as the Pro plan comes without any usage limits. This is a
+significant advantage over Cursor, which does have certain usage limits. From a pure economics standpoint, Copilot would
+make a lot of sense if it delivered comparable results.

-<|page-170-lllmstxt|>
-## search
-- [Documentation](https://qdrant.tech/documentation/)
-- [Concepts](https://qdrant.tech/documentation/concepts/)
-- Search
+![Aider](/blog/webinar-vibe-coding-rag/aider.png)

-# [Anchor](https://qdrant.tech/documentation/concepts/search/\#similarity-search) Similarity search
+Another contender we considered was **Aider** - a terminal-based agent for coding. It works directly with your git
+repository, making edits across multiple files. What's particularly compelling is its
+flexibility - you can connect it to almost any LLM of your choice, including local models if you're concerned about
+privacy or working offline. And it’s fully open source, which might be a huge benefit!

-Searching for the nearest vectors is at the core of many representational learning applications.
-Modern neural networks are trained to transform objects into vectors so that objects close in the real world appear close in vector space.
-It could be, for example, texts with similar meanings, visually similar pictures, or songs of the same genre.
+![Claude Code](/blog/webinar-vibe-coding-rag/claude-code.png)

-![This is how vector similarity works](https://qdrant.tech/docs/encoders.png)
+Last but not least, we have **Claude Code** - another terminal AI coding assistant, but the only one tightly integrated
+with the specific model family - Claude models from Anthropic. Since it's built by the same team that created the
+models, it might be the most optimized for the task. The tool is still in beta preview, but the built-in support for
+the Model Context Protocol is a huge advantage, and we eventually decided to use it for our vibe coding session!

-This is how vector similarity works
+## Building the project

-## [Anchor](https://qdrant.tech/documentation/concepts/search/\#query-api) Query API
+The idea behind **vibe coding** is to let the AI do the heavy lifting while you focus on the outcome. However, real
+software development is more than just writing code. It's about understanding the problem, designing a solution, and
+choosing the right tools and libraries. We don't want the model to use an outdated version of the library it was
+possibly trained on, so we need to provide it with the right context. That's why building an inline RAG (Retrieval
+Augmented Generation) for the AI coding agent may take it to the next level, as it can bring the context to the model
+when it needs it.

-_Available as of v1.10.0_
+In our case, we really wanted to use [DaisyUI](https://daisyui.com/), a Tailwind CSS component library, for its semantic
+class names in CSS. The latest version (5.0.6, as of the webinar) is the preferred one; however, all the LLMs we used
+tended to generate code based on the older 4.x version. Imagine if the LLM did not start by generating the code
+from scratch, but instead searched for the most relevant examples in our knowledge base and generated the code based on
+the extracted examples, or even reused the code snippets, if possible. That's why we decided to use the **MCP server**
+to provide the context to the AI coding assistant.

-Qdrant provides a single interface for all kinds of search and exploration requests - the `Query API`.
-Here is a reference list of what kind of queries you can perform with the `Query API` in Qdrant:
+### Setting up the MCP server

-Depending on the `query` parameter, Qdrant might prefer different strategies for the search.
+The Qdrant MCP server acts as a semantic memory layer that can: -| | | -| --- | --- | -| Nearest Neighbors Search | Vector Similarity Search, also known as k-NN | -| Search By Id | Search by an already stored vector - skip embedding model inference | -| [Recommendations](https://qdrant.tech/documentation/concepts/explore/#recommendation-api) | Provide positive and negative examples | -| [Discovery Search](https://qdrant.tech/documentation/concepts/explore/#discovery-api) | Guide the search using context as a one-shot training set | -| [Scroll](https://qdrant.tech/documentation/concepts/points/#scroll-points) | Get all points with optional filtering | -| [Grouping](https://qdrant.tech/documentation/concepts/search/#grouping-api) | Group results by a certain field | -| [Order By](https://qdrant.tech/documentation/concepts/hybrid-queries/#re-ranking-with-stored-values) | Order points by payload key | -| [Hybrid Search](https://qdrant.tech/documentation/concepts/hybrid-queries/#hybrid-search) | Combine multiple queries to get better results | -| [Multi-Stage Search](https://qdrant.tech/documentation/concepts/hybrid-queries/#multi-stage-queries) | Optimize performance for large embeddings | -| [Random Sampling](https://qdrant.tech/documentation/concepts/search/#random-sampling) | Get random points from the collection | +- Store code snippets, documentation, and implementation details using the `qdrant-store` tool +- Retrieve the most relevant information based on natural language queries with the `qdrant-find` tool -**Nearest Neighbors Search** +For our live coding session, we configured Claude Code to work with this MCP server. When Claude needs to generate code, +it can automatically search for relevant examples in our codebase and create new code based on the extracted examples. +Moreover, when the assistant is done with generating the code, it can also store it in Qdrant for further reference. And +if configured correctly, it will only do that when we accept the change. -httppythontypescriptrustjavacsharpgo +The latest version of the `mcp-server-qdrant` allows to specify the instructions for the AI agent, so it can understand +when to use which of the tools. This way, the MCP server can not only be used for coding but virtually to any semantic +search task, where the context is crucial. This is how we did that during the webinar: -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7] // <--- Dense vector -} +```bash +export TOOL_FIND_DESCRIPTION="Use this tool ALWAYS before generating any FRONTEND code. \ +It lets you search for relevant code snippets based on natural language descriptions. \ +The 'query' parameter should describe what you're looking for, and the tool will return the most relevant code \ +snippets. If this tool finds something similar, then create your code so it is consistent. Reuse existing code \ +as much as you can." +export TOOL_STORE_DESCRIPTION="Store reusable FRONTEND code snippets for later retrieval. \ +The 'information' parameter should contain a natural language description of what the code does, while the actual \ +code should be included in the 'metadata' parameter as a 'code' property. The value of 'metadata' is a Python \ +dictionary with strings as keys. Use this always when you generate some code to store it for further reference." 
``` -```python -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], # <--- Dense vector -) +Both descriptions might be configured while you run the server: +```bash +claude mcp add qdrant-code-search \ + -e QDRANT_URL="http://localhost:6333" \ + -e COLLECTION_NAME="mcp-server-qdrant-knowledge-base" \ + -e TOOL_FIND_DESCRIPTION="$TOOL_FIND_DESCRIPTION" \ + -e TOOL_STORE_DESCRIPTION="$TOOL_STORE_DESCRIPTION" \ + -- uvx mcp-server-qdrant ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +The MCP server configuration is primarily done through environment variables. For those looking to set up their own +instance, here are the key configuration options: -const client = new QdrantClient({ host: "localhost", port: 6333 }); +| Name | Description | Default Value | +|----------------------|-----------------------------------------------------------------|------------------------------------------| +| `QDRANT_URL` | URL of the Qdrant server | None | +| `QDRANT_API_KEY` | API key for the Qdrant server | None | +| `COLLECTION_NAME` | Name of the collection to use | *Required* | +| `QDRANT_LOCAL_PATH` | Path to the local Qdrant database (alternative to `QDRANT_URL`) | None | +| `EMBEDDING_PROVIDER` | Embedding provider to use | `fastembed` | +| `EMBEDDING_MODEL` | Name of the embedding model to use | `sentence-transformers/all-MiniLM-L6-v2` | -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], // <--- Dense vector -}); +By default, the server uses the `sentence-transformers/all-MiniLM-L6-v2` embedding model from +[FastEmbed](https://qdrant.github.io/fastembed/) to encode memories, which offers a good balance between performance and +accuracy. -``` +### Building the knowledge base of the DaisyUI components -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{Condition, Filter, Query, QueryPointsBuilder}; +[DaisyUI](https://daisyui.com/) comes with a collection of components, and their documentation is LLM-friendly as they +provide the [`llms.txt`](https://daisyui.com/llms.txt) file with all the components listed. -let client = Qdrant::from_url("http://localhost:6334").build()?; +![DaisyUI](/blog/webinar-vibe-coding-rag/daisyui.png) -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(Query::new_nearest(vec![0.2, 0.1, 0.9, 0.7])) - ) - .await?; +That makes it really easy to extract all the code snippets with the corresponding meaning and store for the reference +while we vibe code an app. This extraction process is easy with LLMs, but we will skip this part for now. [Qdrant +snapshot](https://github.com/qdrant/webinar-vibe-coding-rag/raw/refs/heads/main/.data/mcp-server-qdrant-knowledge-base.snapshot) +contains the knowledge base we used during the webinar, so you can just import it to your Qdrant instance and start +using it right away. -``` +### Scoping the project: YouTube In-Video Search -```java -import java.util.List; +When learning a new skill, YouTube videos can be a great resource. However, in-depth content is often lengthy and may +assume no prior knowledge. What if you could have a smart assistant to help you navigate through videos and find exactly +what you need? This project aims to create a search engine for video content, helping you skim through and focus on what +matters specifically to you. -import static io.qdrant.client.QueryFactory.nearest; +Would you like to recreate our vibe coded project? 
The [repository](https://github.com/qdrant/webinar-vibe-coding-rag) +contains all the necessary instructions and code to get you started. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +## The vibe coding session -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +If you are interested to see how well Claude Code performed in action, you can watch the full webinar recording below: -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collectionName}") - .setQuery(nearest(List.of(0.2f, 0.1f, 0.9f, 0.7f))) - .build()).get(); + -``` +No matter if you build an MVP, or want to build a more complex application with the help of AI, it's key to give your +agent a reliable source of information. That's why we've built our MCP server, so you can easily connect your +documentation and codebase to Claude Code, Cursor, Windsurf or any other AI agent that supports the Model Context +Protocol. -```csharp -using Qdrant.Client; +<|page-325-lllmstxt|> +**How Deutsche Telekom Built a Scalable, Multi-Agent Enterprise Platform Leveraging Qdrant—Powering Over 2 Million Conversations Across Europe** -var client = new QdrantClient("localhost", 6334); +![Deutsche Telekom's AI Competence Center team leading the LMOS platform development](/blog/case-study-deutsche-telekom/dtag-team.jpg) -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f } -); +[Arun Joseph](https://www.linkedin.com/in/arun-joseph-ab47102a/), who leads engineering and architecture for [Deutsche Telekom's AI Competence Center (AICC)](https://www.telekom.com/en/company/digital-responsibility/details/artificial-intelligence-at-deutsche-telekom-1055154), faced a critical challenge: how do you efficiently and scalably deploy AI-powered assistants across a vast enterprise ecosystem? The goal was to deploy GenAI for customer sales and service operations to resolve customer queries faster across the 10 countries where Deutsche Telekom operates in Europe. -``` +To achieve this, Telekom developed [*Frag Magenta OneBOT*](https://www.telekom.de/hilfe/frag-magenta?samChecked=true) *(Eng: Ask Magenta)*, a platform that includes chatbots and voice bots, built as a Platform as a Service (PaaS) to ensure scalability across Deutsche Telekom's ten European subsidiaries. -```go -import ( - "context" +"We knew from the start that we couldn't just deploy RAG, tool calling, and workflows at scale without a platform-first approach," Arun explains. "When I looked at the challenge, it looked a lot like a distributed systems and engineering challenge, not just an AI problem." - "github.com/qdrant/go-client/qdrant" -) +### Key Requirements for Scaling Enterprise AI Agents -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +While flashy AI demos are easy to build, Deutsche Telekom's team quickly discovered that scaling AI agents for enterprise use presents a far more complex challenge. "This isn't just about AI," Arun explains. "It's a distributed systems problem that requires rigorous engineering." Based on their experience deploying AI across multiple regions, they identified three key challenges in scaling AI agents in production: -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), -}) +1. 
**Handling Tenancy & Memory Management:** AI workloads spanning 10 different countries require strict data segregation and compliance.
2. **Horizontal Scaling & Context Sharing**: AI agents require real-time processing while maintaining historical context, so efficiently storing, retrieving, and processing AI-generated context at scale is critical.
3. **Non-Deterministic Agent Collaboration:** AI agents often exhibit unpredictable behavior, making seamless inter-agent communication and workflow orchestration complex.

"From our experience, these challenges are fundamentally distributed systems problems, not just AI problems," Arun explains. "We need feedback loops, state management, lifecycle orchestration, and intelligent routing for staggered rollouts. Microservices alone aren't enough — we need a domain-driven approach to AI agent design."

This insight led to the formation of [LMOS as an open-source Eclipse Foundation project](https://eclipse.dev/lmos/). Now, other companies can leverage LMOS for their own AI agent development.

### Why Deutsche Telekom Had to Rethink Its AI Stack from the Ground Up

The team started its journey in June 2023 with a small-scale Generative AI initiative, focusing on chatbots with customized AI models. Initially, they used LangChain and a major vector database provider for vector search and retrieval, alongside a custom Dense Passage Retrieval (DPR) model fine-tuned for German language use cases.

However, as they scaled, these issues quickly emerged:

* Memory spikes and operational instability due to the sheer number of components used in the previous provider's stack.
* Complex maintenance requirements, with frequent dependency issues, high operational overhead due to missing memory optimizations, and a lack of streamlined deployment.

Despite efforts to improve annotations and tuning, it became evident that this approach wouldn't scale for Deutsche Telekom.

Additionally, there was a strong need to leverage existing engineering assets, as most developers and systems were already equipped with SDKs and familiar tooling. Rather than building an entirely new stack from scratch, the focus shifted to enabling developers to build AI agents within the tools and frameworks they were already comfortable with. This approach allowed domain experts who understood the APIs and enterprise systems to quickly develop AI agents without disrupting existing workflows.

Recognizing this, the team made a bold decision: to build a **fully-fledged PaaS platform for AI agents**, streamlining development and accelerating the deployment of AI agents.

### LMOS: Deutsche Telekom's Open-Source Multi-Agent AI PaaS for Enterprise AI

Recognizing that an AI-driven platform required deep engineering rigor, the Telekom team designed **LMOS (Language Models Operating System)** — a multi-agent PaaS designed for high scalability and modular AI agent deployment.
Key technical decisions included: -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{Condition, Filter, PointId, Query, QueryPointsBuilder}; +* **Choosing Kotlin and JVM** to ensure engineers familiar with existing Deutsche Telekom systems could easily integrate with LMOS. +* **Moving away from pre-built frameworks** in favor of a ground-up, highly optimized solution tailored to Deutsche Telekom's specific needs. +* **Providing a Heroku-like experience** where engineers don't need to worry about classifiers, agent lifecycles, deployment models, monitoring, and horizontal scaling. +* **Enterprise Grade while being flexible:** LMOS was built with enterprise-grade scalability, versioning, and multi-tenancy in mind, while also offering the flexibility to integrate agents from other frameworks — not just JVM-based solutions — ensuring interoperability across diverse AI ecosystems. -let client = Qdrant::from_url("http://localhost:6334").build()?; +"Our engineers already knew their APIs — billing, shopping, user profiles. Why should we introduce new abstractions that only complicate the stack?" Arun notes, "also, I envisioned us building the foundations of what I call **agentic computing**, playing a role in shaping the application stacks of the future on top of LLMs." -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(Query::new_nearest(PointId::new("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) - ) - .await?; +![LMOS architecture diagram showing AI agent collaboration and lifecycle management](/blog/case-study-deutsche-telekom/lmos-architecture.png) -``` +LMOS architecture powering AI agent collaboration and lifecycle management in a cloud-native environment. -```java -import java.util.UUID; +### Why Qdrant? Finding the Right Vector Database for LMOS -import static io.qdrant.client.QueryFactory.nearest; +When Deutsche Telekom began searching for a scalable, high-performance vector database, they faced operational challenges with their initial choice. Seeking a solution better suited to their PaaS-first approach and multitenancy requirements, they evaluated alternatives, and [Qdrant](https://qdrant.tech/qdrant-vector-database/) quickly stood out. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +"I was looking for open-source components with deep technical expertise behind them," Arun recalls. "I looked at Qdrant and immediately loved the simplicity, [Rust-based efficiency](https://qdrant.tech/articles/why-rust/), and [memory management capabilities](https://qdrant.tech/articles/memory-consumption/). These guys knew what they were doing." -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +The team structured its evaluation around two key metrics: -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collectionName}") - .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) - .build()).get(); +1. **Qualitative metrics**: developer experience, ease of use, memory efficiency features. +2. **Operational simplicity**: how well it fit into their PaaS-first approach and [multitenancy requirements](https://qdrant.tech/documentation/guides/multiple-partitions/). -``` +Deutsche Telekom's engineers also cited several standout features that made Qdrant the right fit: -```csharp -using Qdrant.Client; +1. **Simplicity in operations**—Qdrant is lightweight and doesn't require an excessive component stack. +2. 
**Developer experience**—libraries, multi-language clients, and cross-framework support make integrations seamless.
3. **WebUI & Collection Visualization**—engineers found Qdrant's [built-in collection visualization](https://qdrant.tech/documentation/web-ui/) tools highly useful.

As part of their evaluation, Deutsche Telekom engineers compared multiple solutions, weighing operational simplicity and reliability.

One engineer summarized their findings: "Qdrant has way fewer components, compared to another option that required Kafka and Zookeeper, and only had a hot standby for its index and query nodes. If you rescale it, you get downtime. Qdrant stays up."

### Scaling AI at Deutsche Telekom & The Future of LMOS

Today, LMOS with Qdrant serves as the backbone for Deutsche Telekom's AI services, processing over 2 million conversations across three countries. The time required to develop a new agent has dropped from 15 days to just 2 days.

With [LMOS now part of the Eclipse Foundation](https://projects.eclipse.org/projects/technology.lmos), Deutsche Telekom is opening up its platform to the broader AI engineering community. Arun sees a future ecosystem of like-minded developers coalescing around LMOS, Qdrant, and other AI infrastructure components.

"For enterprises looking to build their own AI agent platforms, the future isn't just about AI models — it's about scalable, open, and opinionated infrastructure. And that's exactly what we've built," says Arun Joseph.

You can learn more about Deutsche Telekom's AI Agents and Arun's vision for LMOS in his [talk](https://www.infoq.com/presentations/ai-agents-platform%20) at the InfoQ Dev Summit Boston.

### Watch livestream with Arun

In this Vector Space talk, Thierry from Qdrant and Arun from Deutsche Telekom discuss the key requirements for scaling enterprise AI agents, key AI stack considerations, and how the team built a Platform as a Service (PaaS) - LMOS (Language Models Operating System) — a multi-agent PaaS designed for high scalability and modular AI agent deployment.

<|page-326-lllmstxt|>
At Qdrant, we enable developers to power AI workloads - not only securely, but at any scale. That’s why we are excited to introduce Qdrant Cloud’s new suite of enterprise-grade features. With **our Cloud API, Cloud RBAC**, **Single Sign-On (SSO)**, granular **Database API Keys**, and **Advanced Monitoring & Observability**, you now have the control and visibility needed to operate at scale.
-- Dot product: `Dot` \- [https://en.wikipedia.org/wiki/Dot\_product](https://en.wikipedia.org/wiki/Dot_product) -- Cosine similarity: `Cosine` \- [https://en.wikipedia.org/wiki/Cosine\_similarity](https://en.wikipedia.org/wiki/Cosine_similarity) -- Euclidean distance: `Euclid` \- [https://en.wikipedia.org/wiki/Euclidean\_distance](https://en.wikipedia.org/wiki/Euclidean_distance) -- Manhattan distance: `Manhattan`\\*\- [https://en.wikipedia.org/wiki/Taxicab\_geometry](https://en.wikipedia.org/wiki/Taxicab_geometry) _\*Available as of v1.7_ +## Securely Scale Your AI Workloads -The most typical metric used in similarity learning models is the cosine metric. +Your enterprise-grade AI applications demand more than just a powerful vector database—they need to meet compliance, performance, and scalability requirements. To do that, you need simplified management, secure access & authentication, and real-time monitoring & observability. Now, Qdrant’s new enterprise-grade features address these needs, giving your team the tools to reduce operational overhead, simplify authentication, enforce access policies, and have deep visibility into performance. -![Embeddings](https://qdrant.tech/docs/cos.png) +## Our New Qdrant Cloud Capabilities: -Qdrant counts this metric in 2 steps, due to which a higher search speed is achieved. -The first step is to normalize the vector when adding it to the collection. -It happens only once for each vector. +* **Cloud API for Simplified Management →** Automate and scale with **API-driven control** and **Terraform support**. +* **Secure Access & Authentication** → Control who gets in and what they can do with **Cloud RBAC**, **SSO**, and granular **Database API Keys**. +* **Advanced Monitoring & Observability** → Stay ahead of issues with **Prometheus/OpenMetrics**, **Datadog**, **Grafana**, and other third-party integrations. -The second step is the comparison of vectors. -In this case, it becomes equivalent to dot production - a very fast operation due to SIMD. +## Ok, now for the good part
 -Depending on the query configuration, Qdrant might prefer different strategies for the search. -Read more about it in the [query planning](https://qdrant.tech/documentation/concepts/search/#query-planning) section. +### Cloud API for Simplified Management -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#search-api) Search API +Skip the UI—manage Qdrant entirely through code. The [**Qdrant Cloud API**](https://qdrant.tech/documentation/qdrant-cloud-api/?) lets you automate cluster creation, updates, and scaling, ensuring repeatable, version-controlled deployments. You can also programmatically generate and revoke API keys, update configurations, and adapt infrastructure as workloads change. -Let’s look at an example of a search query. +You can manage the Qdrant Cloud lifecycle with Qdrant’s [**Terraform Provider**](https://qdrant.tech/documentation/cloud-tools/terraform/). With this support, you can define and automate cluster provisioning using Infrastructure-as-Code (IaC) best practices. -REST API - API Schema definition is available [here](https://api.qdrant.tech/api-reference/search/query-points) +**Why it matters:** By automating cluster management and scaling, Qdrant helps you focus on building AI-powered applications, not maintaining infrastructure. -httppythontypescriptrustjavacsharpgo +### Secure Access & Authentication \- Control the Who and What -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.79], - "filter": { - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ] - }, - "params": { - "hnsw_ef": 128, - "exact": false - }, - "limit": 3 -} +#### Cloud RBAC (Role-Based Access Control) \- The Who -``` +With **Cloud RBAC**, you can define precise **role-based permissions** for team members managing clusters, billing, and hybrid cloud deployments in Qdrant Cloud. Instead of granting broad, unrestricted access, teams can **assign permissions based on roles**, ensuring tighter security and compliance. -```python -from qdrant_client import QdrantClient, models +#### Granular Database API Keys \- The What -client = QdrantClient(url="http://localhost:6333") +**Database API Keys** let applications and services **directly interact with data inside Qdrant**. You can **grant API access at the cluster, collection, or even vector level**, specifying **read-only or read/write permissions** for each key. -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(\ - value="London",\ - ),\ - )\ - ] - ), - search_params=models.SearchParams(hnsw_ef=128, exact=False), - limit=3, -) +Unlike **Cloud RBAC**, which governs **team permissions in the [Cloud Console](https://cloud.qdrant.io/login)**, **Database API Keys** control how external applications access stored data. You can define **fine-grained API key permissions**, apply **Time-to-Live (TTL) expiration policies**, and revoke keys instantly—without requiring a database restart (**only available in Qdrant Cloud**). -``` +To further refine access, **payload-based filters** allow you to restrict API keys to **only retrieve vectors that match specific metadata conditions**. Before finalizing an API key, you can **preview its access settings** to ensure it behaves as expected—reducing misconfigurations and improving security. 
-```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; + -const client = new QdrantClient({ host: "localhost", port: 6333 }); +#### [Read more about Database API keys](https://qdrant.tech/documentation/cloud/authentication/). -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - filter: { - must: [\ - {\ - key: "city",\ - match: {\ - value: "London",\ - },\ - },\ - ], - }, - params: { - hnsw_ef: 128, - exact: false, - }, - limit: 3, -}); +#### Single Sign-On (SSO) for Simplified Authentication -``` +**SSO** eliminates password sprawl by allowing users to log in through **Okta, Google Workspace, Azure AD (Entra ID), SAML, PingFederate, and more**—enforcing authentication policies while reducing IT overhead. Instead of managing separate credentials, users **simply enter their company email** and are redirected to their organization’s authentication system. -```rust -use qdrant_client::qdrant::{Condition, Filter, QueryPointsBuilder, SearchParamsBuilder}; -use qdrant_client::Qdrant; +**SSO setup is fully supported**—to enable it for your company, **contact Qdrant support**, and our team will guide you through the setup process. SSO also works with **multi-factor authentication (MFA)** for additional security. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .filter(Filter::must([Condition::matches(\ - "city",\ - "London".to_string(),\ - )])) - .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)), - ) - .await?; +*SSO is only available for [Premium Tier](https://qdrant.tech/documentation/cloud/premium/) customers. [Learn more about SSO](https://qdrant.tech/documentation/cloud/qdrant-cloud-setup/#enterprise-single-sign-on-sso).* -``` + -```java -import java.util.List; +**Why it matters:** By integrating **Cloud RBAC**, granular **Database API Keys** and **SSO**, Qdrant Cloud helps your team have the right access at the right time—without unnecessary friction. -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.QueryFactory.nearest; +### Advanced Monitoring and Observability for Full Performance Insights -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +Qdrant Cloud provides **real-time visibility into database performance** with built-in **Prometheus/OpenMetrics support**. You can monitor **CPU usage, memory usage, disk space, request volumes, and query latencies** directly in the **Qdrant Cloud Console**, giving you a **live overview of system health**. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +For **deeper analytics**, Qdrant lets you **integrate with your existing monitoring stack**, including [Datadog](https://qdrant.tech/documentation/observability/datadog/)**,** [Grafana](https://qdrant.tech/documentation/cloud/cluster-monitoring/#grafana-dashboard)**,** and [other enterprise observability tools](https://qdrant.tech/documentation/observability/). Every Qdrant Cloud cluster includes a **metrics endpoint**, accessible via a **read-only API key**, providing **Prometheus and OpenTelemetry compatible data** for easy ingestion into Grafana Cloud or any other supported monitoring system. 
-client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London")).build()) - .setParams(SearchParams.newBuilder().setExact(false).setHnswEf(128).build()) - .setLimit(3) - .build()).get(); +Qdrant also provides a **ready-to-use [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard)** to help you **visualize key database metrics**, including historical performance data, cluster uptime, request latencies, backup schedules, and network I/O. + +You can set up **customizable alerts** in [Grafana](https://qdrant.tech/documentation/cloud/cluster-monitoring/#grafana-dashboard), Prometheus, or [Datadog](https://qdrant.tech/documentation/observability/datadog/) to **track key performance indicators** such as **memory**, **storage**, and **query** **latency** thresholds. + +For **historical performance tracking**, third-party integrations allow you to **analyze trends over time**, providing deeper insights into system performance and long-term optimization strategies. -``` +**Why it matters:** With **detailed telemetry, automated alerts, and deep observability integrations**, you can troubleshoot issues faster, optimize database performance, and scale AI applications. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; + -var client = new QdrantClient("localhost", 6334); +[Read more about advanced monitoring](https://qdrant.tech/documentation/cloud/cluster-monitoring/). -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - filter: MatchKeyword("city", "London"), - searchParams: new SearchParams { Exact = false, HnswEf = 128 }, - limit: 3 -); +## Simply put, Qdrant is Enterprise-Ready -``` +Our high-performance vector search engine already handles billion-scale use cases. Through Qdrant Cloud, you get our Cloud API, authentication & access tools, and monitoring & observability integrations. -```go -import ( - "context" +With this combination, you can simplify infrastructure management, implement secure access & authentication, and stay ahead of performance challenges. That’s why Qdrant is the enterprise vector database of choice—**no matter the scale**. - "github.com/qdrant/go-client/qdrant" -) +## Come Build with Us\! -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +[Contact Sales](https://qdrant.tech/contact-us/) to enable enterprise features for your team, or [start prototyping with a free Qdrant cluster](https://login.cloud.qdrant.io/u/signup/identifier?state=hKFo2SAxeFNkY0JxeTMwUmpsRk15SFRUR2dFbmFYcjJUdnpHc6Fur3VuaXZlcnNhbC1sb2dpbqN0aWTZIFpOOTQ4S21uUEVlM3o1WUx1QnMzSUlrMmlIR1NtV1JCo2NpZNkgckkxd2NPUEhPTWRlSHVUeDR4MWtGMEtGZFE3d25lemc). 
-client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, - }, - Params: &qdrant.SearchParams{ - Exact: qdrant.PtrOf(false), - HnswEf: qdrant.PtrOf(uint64(128)), - }, -}) +<|page-327-lllmstxt|> +> *"Metadata is one of the key unlocks to both segmentation and file organization, setting up the right knowledge base, and enriching it to hit that last mile of accuracy and speed.”*\ +> **— Reece Griffiths** -``` +[Reece Griffiths](https://www.linkedin.com/in/reece-william-griffiths/) is the CEO and co-founder of [Deasy Labs](https://www.deasylabs.com/), a metadata automation platform that helps companies optimize their vector databases for retrieval accuracy. Previously part of Y Combinator, Deasy Labs focuses on improving metadata extraction, classification, and enrichment at scale. -In this example, we are looking for vectors similar to vector `[0.2, 0.1, 0.9, 0.7]`. -Parameter `limit` (or its alias - `top`) specifies the amount of most similar results we would like to retrieve. + -Values under the key `params` specify custom parameters for the search. -Currently, it could be: +## **Top takeaways:** -- `hnsw_ef` \- value that specifies `ef` parameter of the HNSW algorithm. -- `exact` \- option to not use the approximate search (ANN). If set to true, the search may run for a long as it performs a full scan to retrieve exact results. -- `indexed_only` \- With this option you can disable the search in those segments where vector index is not built yet. This may be useful if you want to minimize the impact to the search performance whilst the collection is also being updated. Using this option may lead to a partial result if the collection is not fully indexed yet, consider using it only if eventual consistency is acceptable for your use case. +Retrieval-augmented generation (RAG) and vector search are incomplete without high-quality metadata. In this episode of **Vector Space Talks**, Reece Griffiths explains how **metadata automation and optimization** can significantly enhance retrieval accuracy, filtering, and indexing efficiency. -Since the `filter` parameter is specified, the search is performed only among those points that satisfy the filter condition. -See details of possible filters and their work in the [filtering](https://qdrant.tech/documentation/concepts/filtering/) section. +Here are some key insights from this episode: -Example result of this API would be +1. **Why Metadata Matters in Vector Search:** Traditional approaches often focus on embedding models, but metadata can bridge the gap between mediocre and high-performance search systems. +2. **Metadata for Segmentation vs. Enrichment:** Segmentation metadata helps filter and categorize data, while enrichment metadata provides additional context that improves retrieval accuracy. +3. **Optimizing Hybrid Search with Metadata:** Reece explains how metadata can be embedded into sparse vectors for **hybrid search**, enhancing keyword and semantic search combinations. +4. **Scaling Metadata Extraction:** Learn how Deasy Labs uses LLM-powered extraction methods to generate metadata dynamically and update taxonomies in real-time. +5. **Metadata as an Access Control Layer:** Metadata can also be leveraged for **role-based access control (RBAC)** by defining data slices that different teams or users can access within a knowledge base. 
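The segmentation and access-control ideas above map directly onto payload filtering in Qdrant: metadata lives in each point's payload and is applied as a filter at query time. The snippet below is only an illustration of that pattern (the collection name, fields, and role values are invented for the example), not code from the episode.

```python
# Illustrative sketch: metadata stored as payload drives both segmentation
# and access control when querying a Qdrant collection.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.query_points(
    collection_name="knowledge_base",  # hypothetical collection
    query=[0.2, 0.1, 0.9, 0.7],
    query_filter=models.Filter(
        must=[
            # Segmentation: restrict the search to a metadata-defined slice.
            models.FieldCondition(key="department", match=models.MatchValue(value="finance")),
            # Access control: only return chunks the caller's role may see.
            models.FieldCondition(key="allowed_roles", match=models.MatchAny(any=["analyst"])),
        ]
    ),
    limit=5,
)
```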
-```json -{ - "result": [\ - { "id": 10, "score": 0.81 },\ - { "id": 14, "score": 0.75 },\ - { "id": 11, "score": 0.73 }\ - ], - "status": "ok", - "time": 0.001 -} +> Fun Fact: Reece and his team at Deasy Labs experimented with **pure metadata embeddings** (without the original data) and found that hybrid search using metadata alone can yield strong retrieval performance. -``` +## **Show notes:** -The `result` contains ordered by `score` list of found point ids. +00:00 Introduction to metadata automation and optimization.\ +05:32 The role of metadata in retrieval-augmented generation (RAG).\ +10:48 How Deasy Labs structures metadata extraction workflows.\ +15:35 Implementing hybrid search with sparse metadata vectors.\ +20:14 Automating metadata classification using LLMs.\ +25:51 Best practices for maintaining metadata over time.\ +30:18 Using metadata for segmentation and access control.\ +35:43 Q&A and closing remarks. -Note that payload and vector data is missing in these results by default. -See [payload and vector in the result](https://qdrant.tech/documentation/concepts/search/#payload-and-vector-in-the-result) on how -to include it. +## **More Quotes from Reece:** -If the collection was created with multiple vectors, the name of the vector to use for searching should be provided: +*"Going from 75% retrieval accuracy to 95%+ is hard. In many cases, 80% accuracy might as well be zero. Metadata is the key to getting that last mile."*\ +— Reece Griffiths -httppythontypescriptrustjavacsharpgo +*"Metadata shouldn't rely on manual tagging by business teams. With LLMs, we can auto-suggest domain-specific metadata dynamically and refine it over time."*\ +— Reece Griffiths -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "using": "image", - "limit": 3 -} +*"In a vector database, segmentation metadata helps you structure your knowledge base, while enrichment metadata boosts retrieval precision—both are critical."*\ +— Reece Griffiths -``` +--- -```python -from qdrant_client import QdrantClient -client = QdrantClient(url="http://localhost:6333") +### **Try Deasy Labs 🚀** +Want to enhance your vector search performance with **automated metadata workflows**? -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - using="image", - limit=3, -) +**Start now at [app.deasylabs.com](https://app.deasylabs.com)!** -``` +--- -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +<|page-328-lllmstxt|> +In a recent live session, we teamed up with [CrewAI](https://crewai.com/), a framework for building intelligent, +multi-agent applications. If you missed it, [Kacper Ɓukawski](https://www.linkedin.com/in/kacperlukawski/) from Qdrant +and [Tony Kipkemboi](https://www.linkedin.com/in/tonykipkemboi) from [CrewAI](https://crewai.com/) gave an insightful +overview of CrewAI’s capabilities and demonstrated how to leverage Qdrant for creating an agentic RAG +(Retrieval-Augmented Generation) system. The focus was on semi-automating email communication, using +[Obsidian](https://obsidian.md/) as the knowledge base. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +In this article, we’ll guide you through the process of setting up an AI-powered system that connects directly to your +email inbox and knowledge base, enabling it to analyze incoming messages and existing content to generate contextually +relevant response suggestions. 
-client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - using: "image", - limit: 3, -}); +## Background agents -``` +Although we got used to LLM-based apps that usually have a chat-like interface, even if it's not a real UI but a CLI +tool, plenty of day-to-day tasks can be automated in the background without explicit human action firing the process. +This concept is also known as **ambient agents**, where the agent is always there, waiting for a trigger to act. -```rust -use qdrant_client::qdrant::QueryPointsBuilder; -use qdrant_client::Qdrant; +### The basic concepts of CrewAI -let client = Qdrant::from_url("http://localhost:6334").build()?; +Thanks for Tony's participation, we could learn more about CrewAI, and understand the basic concepts of the framework. +He introduced the concepts of agents and crews, and how they can be used to build intelligent multi-agent applications. +Moreover, Tony described different types of memory that CrewAI applications can use. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .using("image"), - ) - .await?; +When it comes to Qdrant role in CrewAI applications, it can be used as short-term, or entity memory, as both components +are based on RAG and vector embeddings. If you'd like to know more about memory in CrewAI, please visit the [CrewAI +concepts](https://docs.crewai.com/concepts/memory). -``` +Tony made an interesting analogy. He compared crews to different departments in a company, where each department has its +own responsibilities, but they all work together to achieve the company's goals. -```java -import java.util.List; +### Email automation with CrewAI, Qdrant, and Obsidian notes -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +Our webinar focused on building an agentic RAG system that would semi-automate email communication. RAG is an essential +component of such a system, as you don't want to take responsibility for responses that cannot be grounded. The system +would monitor your Gmail inbox, analyze the incoming emails, and prepare response drafts if it detects that the email is +not spam, newsletter, or notification. -import static io.qdrant.client.QueryFactory.nearest; +On the other hand, the system would also monitor the Obsidian notes, by watching any changes in the local file system. +When a file is created, modified, or deleted, the system would automatically move these changes to the Qdrant +collection, so the knowledge base is always up-to-date. Obsidian uses Markdown files to store notes, so complex parsing +is not required. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Here is a simplified diagram presenting the target architecture of the system: -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setUsing("image") - .setLimit(3) - .build()).get(); +![Project architecture](/blog/webinar-crewai-qdrant-obsidian/project-architecture.png) -``` +Qdrant acts as a knowledge base, storing the embeddings of the Obsidian notes. -```csharp -using Qdrant.Client; +## Implementing the system -var client = new QdrantClient("localhost", 6334); +Since our system integrates with two external APIs - Gmail and filesystem. **We won't go into details of how to work +with these APIs**, as it's out of the scope of this webinar. 
Instead, we will focus on the CrewAI and Qdrant +integration, and CrewAI agents' implementation. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - usingVector: "image", - limit: 3 -); +### CrewAI <> Qdrant integration -``` +Since there is no official integration between CrewAI and Qdrant yet, we created a custom implementation of the +`RAGStorage` class, which has a pretty straightforward interface. -```go -import ( - "context" +```python +from typing import Optional +from crewai.memory.storage.rag_storage import RAGStorage - "github.com/qdrant/go-client/qdrant" -) +class QdrantStorage(RAGStorage): + """ + Extends Storage to handle embeddings for memory entries + using Qdrant. + """ -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) + ... -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Using: qdrant.PtrOf("image"), -}) + def search(self, + query: str, + limit: int = 3, + filter: Optional[dict] = None, + score_threshold: float = 0, + ) -> list[dict]: + ... + def reset(self) -> None: + ... ``` -Search is processing only among vectors with the same name. +Full implementation might be found in the [GitHub +repository](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/blob/main/src/email_assistant/storage.py). You can +use it for your own projects, or as a reference for your custom implementation. If you want to set up a crew that uses +Qdrant as both entity and short memory layers, you can do it like this: -If the collection was created with sparse vectors, the name of the sparse vector to use for searching should be provided: +```python +from crewai import Crew, Process +from crewai.memory import EntityMemory, ShortTermMemory +from email_assistant.storage import QdrantStorage -You can still use payload filtering and other features of the search API with sparse vectors. +qdrant_location= "http://localhost:6333" +qdrant_api_key = "your-secret-api-key" +embedder_config = {...} -There are however important differences between dense and sparse vector search: +crew = Crew( + agents=[...], + tasks=[...], # Automatically created by the @task decorator + process=Process.sequential, + memory=True, + entity_memory=EntityMemory( + storage=QdrantStorage( + type="entity-memory", + embedder_config=embedder_config, + qdrant_location=qdrant_location, + qdrant_api_key=qdrant_api_key, + ), + ), + short_term_memory=ShortTermMemory( + storage=QdrantStorage( + type="short-term-memory", + embedder_config=embedder_config, + qdrant_location=qdrant_location, + qdrant_api_key=qdrant_api_key, + ), + ), + embedder=embedder_config, + verbose=True, +) +``` -| Index | Sparse Query | Dense Query | -| --- | --- | --- | -| Scoring Metric | Default is `Dot product`, no need to specify it | `Distance` has supported metrics e.g. Dot, Cosine | -| Search Type | Always exact in Qdrant | HNSW is an approximate NN | -| Return Behaviour | Returns only vectors with non-zero values in the same indices as the query vector | Returns `limit` vectors | +Both types of memory will use different collection names in Qdrant, so you can easily distinguish between them, and the +data won't be mixed up. -In general, the speed of the search is proportional to the number of non-zero values in the query vector. +**We are planning to release a CrewAI tool for Qdrant integration in the near future**, so stay tuned! 
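Since the class body is abbreviated above, here is a hedged usage sketch based only on the interface shown: constructing a `QdrantStorage` with the same arguments used in the crew setup and calling its `search` and `reset` methods directly. The embedder configuration is a placeholder and depends on the provider you use.

```python
# Hedged sketch: exercising the QdrantStorage interface shown above directly,
# outside of a Crew. Constructor arguments mirror the crew setup.
from email_assistant.storage import QdrantStorage

# Placeholder embedder configuration - adjust for your embedding provider.
embedder_config = {"provider": "openai", "config": {"model": "text-embedding-3-small"}}

storage = QdrantStorage(
    type="short-term-memory",
    embedder_config=embedder_config,
    qdrant_location="http://localhost:6333",
    qdrant_api_key="your-secret-api-key",
)

# `search` follows the signature from the snippet above and returns a list of dicts.
results = storage.search(query="How does contextual retrieval work?", limit=3)
for result in results:
    print(result)

# `reset` drops the stored memory, e.g. between test runs.
storage.reset()
```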
-httppythontypescriptrustjavacsharpgo +### Loading the Obsidian notes to Qdrant -```http -POST /collections/{collection_name}/points/query -{ - "query": { - "indices": [1, 3, 5, 7], - "values": [0.1, 0.2, 0.3, 0.4] - }, - "using": "text" -} +For the sake of the demo, we decided to simply scrape the documentation of both CrewAI and Qdrant, and store it in the +Obsidian notes. That's easy with Obsidian Web Clipper, as it allows you to save the web page as a Markdown file. -``` +![Obsidian notes](/blog/webinar-crewai-qdrant-obsidian/obsidian.png) -```python -from qdrant_client import QdrantClient, models +Assuming we detected a change in the Obsidian notes, such as new note creation or modification, we would like to load +the changes to Qdrant. We could possibly use some chunking methods, starting from basic fixed-size chunks, or go +straight to semantic chunking. However, LLMs are also well-known for their ability to divide the text into meaningful +parts, so we decided to try them out. Moreover, standard chunking is enough in many cases, but we also wanted to test +the [Contextual Retrieval concept introduced by Anthropic](https://www.anthropic.com/news/contextual-retrieval). In a +nutshell, the idea is to use LLMs to generate a short context for each chunk, so it situates the chunk in the context of +the whole document. -client = QdrantClient(url="http://localhost:6333") +It turns out, implementing such a crew in CrewAI is quite straightforward. There are two actors in the crew - one +chunking the text and the other one generating the context. Both might be defined in YAML files like this: -result = client.query_points( - collection_name="{collection_name}", - query=models.SparseVector(indices=[1, 3, 5, 7], values=[0.1, 0.2, 0.3, 0.4]), - using="text", -).points +```yaml +chunks_extractor: + role: > + Semantic chunks extractor + goal: > + Parse Markdown to extract digestible pieces of information which are + semantically meaningful and can be easily understood by a human. + backstory: > + You are a search expert building a search engine for Markdown files. + Once you receive a Markdown file, you divide it into meaningful semantic + chunks, so each chunk is about a certain topic or concept. You're known + for your ability to extract relevant information from large documents and + present it in a structured and easy-to-understand format, that increases + the searchability of the content and results quality. + +contextualizer: + role: > + Bringing context to the extracted chunks + goal: > + Add context to the extracted chunks to make them more meaningful and + understandable. This context should help the reader understand the + significance of the information and how it relates to the broader topic. + backstory: > + You are a knowledge curator who specializes in making information more + accessible and understandable. You take the extracted chunks and provide + additional context to make them more meaningful by bringing in relevant + information about the whole document or the topic at hand. +``` + +CrewAI makes it very easy to define such agents, and even a non-tech person can understand and modify the YAML files. + +Another YAML file defines the tasks that the agents should perform: -``` +```yaml +extract_chunks: + description: > + Review the document you got and extract the chunks from it. Each + chunk should be a separate piece of information that can be easily understood + by a human and is semantically meaningful. 
If there are two or more chunks that + are closely related, but not put next to each other, you can merge them into + a single chunk. It is important to cover all the important information in the + document and make sure that the chunks are logically structured and coherent. + + {document} + expected_output: > + A list of semantic chunks with succinct context of information extracted from + the document. + agent: chunks_extractor + +contextualize_chunks: + description: > + You have the chunks we want to situate within the whole document. + Please give a short succinct context to situate this chunk within the overall + document for the purposes of improving search retrieval of the chunk. Answer + only with the succinct context and nothing else. + expected_output: > + A short succinct context to situate the chunk within the overall document, along + with the chunk itself. + agent: contextualizer +``` + +YAML is not enough to make the agents work, so we need to implement them in Python. The role, goal, and backstory +of the agent, as well as the task description and expected output, are used to build a prompt sent to the LLM. However, +the code defines which LLM to use, and some other parameters of the interaction, like structured output. We heavily rely +on Pydantic models to define the output of the task, so the responses might be easily processed by the application, +for example, to store them in Qdrant. + +```python +from crewai import Agent, Crew, Process, Task +from crewai.project import CrewBase, agent, crew, task +from email_assistant import models -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +... -const client = new QdrantClient({ host: "localhost", port: 6333 }); +@CrewBase +class KnowledgeOrganizingCrew(BaseCrew): + """ + A crew responsible for processing raw text data and converting it into structured knowledge. + """ -client.query("{collection_name}", { - query: { - indices: [1, 3, 5, 7], - values: [0.1, 0.2, 0.3, 0.4] - }, - using: "text", - limit: 3, -}); + agents_config = "config/knowledge/agents.yaml" + tasks_config = "config/knowledge/tasks.yaml" -``` + @agent + def chunks_extractor(self) -> Agent: + return Agent( + config=self.agents_config["chunks_extractor"], + verbose=True, + llm="anthropic/claude-3-5-sonnet-20241022", + ) -```rust -use qdrant_client::qdrant::QueryPointsBuilder; -use qdrant_client::Qdrant; + ... -let client = Qdrant::from_url("http://localhost:6334").build()?; + @task + def contextualize_chunks(self) -> Task: + # The task description is borrowed from the Anthropic Contextual Retrieval + # See: https://www.anthropic.com/news/contextual-retrieval/ + return Task( + config=self.tasks_config["contextualize_chunks"], + output_pydantic=models.ContextualizedChunks, + ) -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![(1, 0.2), (3, 0.1), (5, 0.9), (7, 0.7)]) - .limit(10) - .using("text"), - ) - .await?; + ... 
+ @crew + def crew(self) -> Crew: + """Creates the KnowledgeOrganizingCrew crew""" + return Crew( + agents=self.agents, # Automatically created by the @agent decorator + tasks=self.tasks, # Automatically created by the @task decorator + process=Process.sequential, + memory=True, + entity_memory=self.entity_memory(), + short_term_memory=self.short_term_memory(), + embedder=self.embedder_config, + verbose=True, + ) ``` -```java -import java.util.List; +Full implementation might again be found in the [GitHub +repository](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/blob/main/src/email_assistant/crew.py). -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +### Drafting emails in Gmail Inbox -import static io.qdrant.client.QueryFactory.nearest; +At this point we already have our notes stored in Qdrant, and we can write emails in Gmail Inbox using the notes as a +ground truth. The system would monitor the Gmail inbox, and if it detects an email that is not spam, newsletter, or +notification, it would draft a response based on the knowledge base stored in Qdrant. Again, that means we need to use +two agents - one for detecting the kind of the incoming email, and the other one for drafting the response. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +The YAML files for these agents might look like this: -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setUsing("text") - .setQuery(nearest(List.of(0.1f, 0.2f, 0.3f, 0.4f), List.of(1, 3, 5, 7))) - .setLimit(3) - .build()) - .get(); +```yaml +categorizer: + role: > + Email threads categorizer + goal: > + Automatically categorize email threads based on their content. + backstory: > + You're a virtual assistant with a knack for organizing information. + You're known for your ability to quickly and accurately categorize email + threads, so that your clients know which ones are important to answer + and which ones are spam, newsletters, or other types of messages that + do not require attention. + Available categories: QUESTION, NOTIFICATION, NEWSLETTER, SPAM. Do not make + up new categories. + +response_writer: + role: > + Email response writer + goal: > + Write clear and concise responses to an email thread. Try to help the + sender. Use the external knowledge base to provide relevant information. + backstory: > + You are a professional writer with a talent for crafting concise and + informative responses. You're known for your ability to quickly understand + the context of an email thread and provide a helpful and relevant response + that addresses the sender's needs. You always rely on your knowledge base + to provide accurate and up-to-date information. +``` + +The set of categories is predefined, so the categorizer should not invent new categories. The task definitions are as +follows: + +```yaml +categorization_task: + description: > + Review the content of the following email thread and categorize it + into the appropriate category. There might be multiple categories that + apply to the email thread. + + {messages} + expected_output: > + A list of all the categories that the email threads can be classified into. + agent: categorizer +response_writing_task: + description: > + Write a response to the following email thread. The response should be + clear, concise, and helpful to the sender. 
Always rely on the Qdrant search + tool, so you can get the most relevant information to craft your response. + Please try to include the source URLs of the information you provide. + + Only focus on the real question asked by the sender and do not try to + address any other issues that are not directly related to the sender's needs. + Do not try to provide a response if the context is not clear enough. + + {messages} + expected_output: > + A well-crafted response to the email thread that addresses the sender's needs. + Please use simple HTML formatting to make the response more readable. + Do not include greetings or signatures in your response, but provide the footnotes + with the source URLs of the information you used, if possible. + + If the provided context does not give you enough information to write a response, + you must admit that you cannot provide a response and write "I cannot provide a response.". + agent: response_writer ``` -```csharp -using Qdrant.Client; +We specifically asked the agents to include the source URLs of the information they provide, so both the sender and the +recipient can verify the information. -var client = new QdrantClient("localhost", 6334); +### Working system -await client.QueryAsync( - collectionName: "{collection_name}", - query: new (float, uint)[] {(0.1f, 1), (0.2f, 3), (0.3f, 5), (0.4f, 7)}, - usingVector: "text", - limit: 3 -); +We have both crews defined, and the application is ready to run. The only thing left is to monitor the Gmail inbox and +the Obsidian notes for changes. We use the `watchdog` library to monitor the filesystem, and the `google-api-python-client` +to monitor the Gmail inbox, but we won't go into details of how to use these libraries, as the integration code would +make this blog post too long. -``` +If you open the [main file of the +application](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/blob/main/main.py), you will see that it is quite +simple. It runs two separate threads, one for monitoring the Gmail inbox, and the other one for monitoring the Obsidian +notes. If there is any event detected, the application will run the appropriate crew to process the data, and the +resulting response will be sent back to the email thread, or Qdrant collection, respectively. No UI is required, as your +ambient agents are working in the background. -```go -import ( - "context" +## Results - "github.com/qdrant/go-client/qdrant" -) +The system is now ready to run, and it can semi-automate email communication, and keep the knowledge base up-to-date. +If you set it up properly, you can expect the system to draft responses to emails that are not spam, newsletter, or +notification, so your email inbox may look like this, even when you sleep: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +![Drafted emails](/blog/webinar-crewai-qdrant-obsidian/gmail-inbox.png) -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuerySparse( - []uint32{1, 3, 5, 7}, - []float32{0.1, 0.2, 0.3, 0.4}), - Using: qdrant.PtrOf("text"), -}) +## Materials -``` +As usual, we prepared a video recording of the webinar, so you can watch it at your convenience: -### [Anchor](https://qdrant.tech/documentation/concepts/search/\#filtering-results-by-score) Filtering results by score + -In addition to payload filtering, it might be useful to filter out results with a low similarity score. 
-For example, if you know the minimal acceptance score for your model and do not want any results which are less similar than the threshold. -In this case, you can use `score_threshold` parameter of the search query. -It will exclude all results with a score worse than the given. +The source code of the demo is available on [GitHub](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/), so if +you would like to try it out yourself, feel free to clone or fork the repository and follow the instructions in the +[README](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/blob/main/README.md) file. -### [Anchor](https://qdrant.tech/documentation/concepts/search/\#payload-and-vector-in-the-result) Payload and vector in the result +Are you building agentic RAG applications using CrewAI and Qdrant? Please join [our Discord +community](https://github.com/qdrant/webinar-crewai-qdrant-obsidian/blob/main/README.md) and share your experience! -By default, retrieval methods do not return any stored information such as -payload and vectors. Additional parameters `with_vectors` and `with_payload` -alter this behavior. +<|page-329-lllmstxt|> +[**Qdrant 1.13.0 is out!**](https://github.com/qdrant/qdrant/releases/tag/v1.13.0) Let's look at the main features for this version: -Example: +**GPU Accelerated Indexing:** Fast HNSW indexing with architecture-free GPU support.
+**Strict Mode:** Enforce operation restrictions on collections for enhanced control.
-httppythontypescriptrustjavacsharpgo +**HNSW Graph Compression:** Reduce storage use via HNSW Delta Encoding.
+**Named Vector Filtering:** New `has_vector` filtering condition for named vectors.
+**Custom Storage:** For constant-time reads/writes of payloads and sparse vectors.
-```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "with_vectors": true, - "with_payload": true -} +## GPU Accelerated Indexing -``` +![gpu-accelerated-indexing](/blog/qdrant-1.13.x/image_6.png) -```python -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - with_vectors=True, - with_payload=True, -) +We are making it easier for you to handle even **the most demanding workloads**. -``` +Qdrant now supports GPU-accelerated HNSW indexing **on all major GPU vendors, including NVIDIA, AMD and Intel**. +This new feature reduces indexing times, making it a game-changer for projects where speed truly matters. -```typescript -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - with_vector: true, - with_payload: true, -}); +> Indexing over GPU now delivers speeds up to 10x faster than CPU-based methods for the equivalent hardware price. -``` +Our custom implementation of GPU-accelerated HNSW indexing **is built entirely in-house**. Unlike solutions that depend on third-party libraries, our approach is vendor-agnostic, meaning it works seamlessly with any modern GPU that supports **Vulkan API**. This ensures broad compatibility and flexibility for a wide range of systems. -```rust -use qdrant_client::qdrant::QueryPointsBuilder; -use qdrant_client::Qdrant; +*Here is a picture of us, running Qdrant with GPU support on a SteamDeck (AMD Van Gogh GPU):* -let client = Qdrant::from_url("http://localhost:6334").build()?; +{{< figure src="/blog/qdrant-1.13.x/gpu-test.jpg" alt="Qdrant on SteamDeck" caption="Qdrant on SteamDeck with integrated AMD GPU" >}} -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .with_payload(true) - .with_vectors(true), - ) - .await?; +This experiment didn't require any changes to the codebase, and everything worked right out of the box with our AMD Docker image. -``` +> As of right now this solution supports only on-premises deployments, but we will introduce support for Qdrant Cloud shortly. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.WithVectorsSelectorFactory; -import io.qdrant.client.grpc.Points.QueryPoints; +### Benchmarks on Common GPUs -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.WithPayloadSelectorFactory.enable; +**Qdrant doesn't require high-end GPUs** to achieve significant performance improvements. 
The table below compares indexing times and instance costs for 1 million vectors (1536-dimensional) across common GPU machines: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +| **Configuration** | **Indexing time (s)** | **Price per Instance (USD/month)** | +|------------------------------|-----------------------|-----------------------------------------| +| AMD Radeon Pro V520 | 33.1 | $394.20 (CPU + GPU) | +| Nvidia T4 | 19.1 | $277.40 (CPU) + $255.50(GPU) = $532.90 | +| Nvidia L4 | 12.4 | $214.32 (CPU) + $408.83(GPU) = $624.15 | +| 8 CPU Cores | 97.5 | $195.67 | +| 4 CPU Cores | 221.9 | $107.16 | -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setWithPayload(enable(true)) - .setWithVectors(WithVectorsSelectorFactory.enable(true)) - .setLimit(3) - .build()) - .get(); +*Quoted prices are from Google Cloud Platform (NVIDIA) and AWS (AMD)* -``` +**Additional Benefits:** -```csharp -using Qdrant.Client; +- **Multi-GPU Support:** Index segments concurrently to handle large-scale workloads. +- **Hardware Flexibility:** Doesn't require high-end GPUs to achieve significant performance improvements. +- **Full Feature Support:** GPU indexing supports **all quantization options and datatypes** implemented in Qdrant. +- **Large-Scale Benefits:** Fast indexing unlocks larger size of segments, which leads to **higher RPS on the same hardware**. -var client = new QdrantClient("localhost", 6334); +### [Instructions & Documentation](/documentation/guides/running-with-gpu/) +The setup is simple, with pre-configured Docker images [**(check Docker Registry)**](https://hub.docker.com/r/qdrant/qdrant/tags) for GPU environments like NVIDIA and AMD. +We've made it so you can enable GPU indexing with minimal configuration changes. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - payloadSelector: true, - vectorsSelector: true, - limit: 3 -); +> Note: Logs will clearly indicate GPU detection and usage for transparency. -``` +*Read more about this feature in the [**GPU Indexing Documentation**](/documentation/guides/running-with-gpu/)* -```go -import ( - "context" +#### Interview With the Creator of GPU Indexing - "github.com/qdrant/go-client/qdrant" -) +We interviewed **Qdrant's own Ivan Pleshkov from the Core development team**. Ivan created the new GPU indexing feature with an innovative approach he brings from the gaming industry. Listen in to hear about his vision and challenges while building the feature. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) + -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - WithPayload: qdrant.NewWithPayload(true), - WithVectors: qdrant.NewWithVectors(true), -}) +## Strict Mode for Operational Control -``` +![strict-mode](/blog/qdrant-1.13.x/image_2.png) -You can use `with_payload` to scope to or filter a specific payload subset. -You can even specify an array of items to include, such as `city`, -`village`, and `town`: +**Strict Mode** ensures consistent performance in distributed deployments by enforcing operational controls. It limits computationally intensive operations like unindexed filtering, batch sizes, and search parameters (`hnsw_ef`, `oversampling`) This prevents inefficient usage that could overload your system. 
+ +Additional safeguards, including limits on **payload sizes**, **filter conditions**, and **timeouts**, keep high-demand applications fast and reliable. This feature is configured via `strict_mode_config`, and it allows collection-level customization while maintaining backward compatibility. + +> New collections will default to **Strict Mode**, ensuring compliance by design and balancing workloads across tenants. + +This feature also enhances usability by providing **detailed error messages** when requests exceed defined limits. The system will give you clear guidance on resolution steps. + +**Strict Mode** solves the “*noisy neighbor*” problem and optimizes resource allocation, making multi-tenancy work nicely in serverless mode. + +### Enable Strict Mode + +To configure **Strict Mode**, refer to the [**schema definitions**](https://api.qdrant.tech/api-reference/collections/create-collection#request.body.strict_mode_config) for all available `strict_mode_config` parameters. -httppythontypescriptrustjavacsharpgo +When a defined limit is crossed, Qdrant responds with a client-side error that includes details about the specific limit exceeded. This can make troubleshooting much simpler. + +> The `enabled` field in the configuration acts as a dynamic toggle, allowing you to activate or deactivate Strict Mode as needed. + +In this example we enable **Strict Mode** when creating a collection to activate the `unindexed_filtering_retrieve` limit: ```http -POST /collections/{collection_name}/points/query +PUT /collections/{collection_name} { - "query": [0.2, 0.1, 0.9, 0.7], - "with_payload": ["city", "village", "town"] + "strict_mode_config": { + "enabled": true, + "unindexed_filtering_retrieve": true + } } +``` +```bash +curl -X PUT http://localhost:6333/collections/{collection_name} \ + -H 'Content-Type: application/json' \ + --data-raw '{ + "strict_mode_config": { + "enabled":" true, + "unindexed_filtering_retrieve": true + } + }' ``` ```python -from qdrant_client import QdrantClient +from qdrant_client import QdrantClient, models client = QdrantClient(url="http://localhost:6333") -client.query_points( +client.create_collection( collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - with_payload=["city", "village", "town"], + strict_mode_config=models.SparseVectorParams{ enabled=True, unindexed_filtering_retrieve=True }, ) - ``` ```typescript @@ -57172,58 +76605,45 @@ import { QdrantClient } from "@qdrant/js-client-rest"; const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - with_payload: ["city", "village", "town"], +client.createCollection("{collection_name}", { + strict_mode_config: { + enabled: true, + unindexed_filtering_retrieve: true, + }, }); - ``` ```rust -use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointsBuilder}; use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, StrictModeConfigBuilder}; + +let client = Qdrant::from_url("http://localhost:6334").build()?; client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .with_payload(SelectorOptions::Include( - vec![\ - "city".to_string(),\ - "village".to_string(),\ - "town".to_string(),\ - ] - .into(), - )) - .with_vectors(true), + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .strict_config_mode(StrictModeConfigBuilder::default().enabled(true).unindexed_filtering_retrieve(true)), ) .await?; - ``` 
```java -import java.util.List; - import io.qdrant.client.QdrantClient; import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; - -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.WithPayloadSelectorFactory.include; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.StrictModeCOnfig; QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setWithPayload(include(List.of("city", "village", "town"))) - .setLimit(3) - .build()) - .get(); - +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setStrictModeConfig( + StrictModeConfig.newBuilder().setEnabled(true).setUnindexedFilteringRetrieve(true).build()) + .build()) + .get(); ``` ```csharp @@ -57232,19 +76652,10 @@ using Qdrant.Client.Grpc; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - payloadSelector: new WithPayloadSelector - { - Include = new PayloadIncludeSelector - { - Fields = { new string[] { "city", "village", "town" } } - } - }, - limit: 3 +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + strictModeConfig: new StrictModeConfig { enabled = true, unindexed_filtering_retrieve = true } ); - ``` ```go @@ -57259,27 +76670,80 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ +client.CreateCollection(context.Background(), &qdrant.CreateCollection{ CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - WithPayload: qdrant.NewWithPayloadInclude("city", "village", "town"), + StrictModeConfig: &qdrant.StrictModeConfig{ + Enabled: qdrant.PtrOf(true), + IndexingThreshold: qdrant.PtrOf(true), + }, }) - ``` +> You may also use the `PATCH` request to enable Strict Mode on an existing collection. -Or use `include` or `exclude` explicitly. For example, to exclude `city`: +*Read more about Strict Mode in the [**Database Administration Guide**](/documentation/guides/administration/#strict-mode)* + +## HNSW Graph Compression + +![hnsw-graph-compression](/blog/qdrant-1.13.x/image_3.png) + +We’re always looking for ways to make your search experience faster and more efficient. +That’s why we are introducing a new optimization method for our HNSW graph technology: [**Delta Encoding**](https://en.wikipedia.org/wiki/Delta_encoding). +This improvement makes your searches lighter on memory without sacrificing speed. + +**Delta Encoding** is a clever way to compress data by storing only the differences (or “deltas”) between values. It’s commonly used in search engines (*for the classical inverted index*) to save space and improve performance. We’ve now [**adapted this technique**](https://github.com/qdrant/qdrant/pull/5487) for the HNSW graph structure that powers Qdrant’s search. + +In contrast with traditional compression algorithms, like gzip or lz4, **Delta Encoding** requires very little CPU overhead for decompression, which makes it a perfect fit for the HNSW graph links. + +> Our experiments didn't observe any measurable performance degradation. However, the memory footprint of the HNSW graph was **reduced by up to 30%**. 
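To see why delta encoding suits HNSW links, note that each node's neighbor list can be kept sorted, so only the first ID and the (small) gaps between consecutive IDs need to be stored; small gaps then fit into fewer bytes with a variable-length integer encoding. The toy sketch below illustrates the general technique only - it is not Qdrant's actual on-disk format.

```python
# Toy illustration of delta encoding over a sorted list of HNSW graph links.
# This shows the general idea only, not Qdrant's actual storage format.

def delta_encode(sorted_ids: list[int]) -> list[int]:
    """Keep the first ID as-is; store only the gap to the previous ID afterwards."""
    if not sorted_ids:
        return []
    return [sorted_ids[0]] + [b - a for a, b in zip(sorted_ids, sorted_ids[1:])]

def delta_decode(deltas: list[int]) -> list[int]:
    """Rebuild the original IDs by accumulating the gaps."""
    ids: list[int] = []
    for delta in deltas:
        ids.append(delta if not ids else ids[-1] + delta)
    return ids

links = [18, 121, 122, 130, 904, 905]
encoded = delta_encode(links)  # [18, 103, 1, 8, 774, 1] - much smaller numbers
assert delta_decode(encoded) == links
# Smaller numbers need fewer bytes under a variable-length integer encoding,
# and decoding is just a running sum, so the CPU overhead stays negligible.
```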
+ +*For more general info, read about [**Indexing and Data Structures in Qdrant**](/documentation/concepts/indexing/)* + +## Filter by Named Vectors + +![filter-named-vectors](/blog/qdrant-1.13.x/image_4.png) + +In Qdrant, you can store multiple vectors of different sizes and types in a single data point. This is useful when you have to representing data with multiple embeddings, such as image, text, or video features. -httppythontypescriptrustjavacsharpgo +> We previously introduced this feature as [**Named Vectors**](/documentation/concepts/vectors/#named-vectors). Now, you can filter points by checking if a specific named vector exists. + +This makes it easy to search for points based on the presence of specific vectors. For example, *if your collection includes image and text vectors, you can filter for points that only have the image vector defined*. + +### Create a Collection with Named Vectors + +Upon collection [creation](/documentation/concepts/collections/#collection-with-multiple-vectors), you define named vector types, such as `image` or `text`: ```http -POST /collections/{collection_name}/points/query +PUT /collections/{collection_name} { - "query": [0.2, 0.1, 0.9, 0.7], - "with_payload": { - "exclude": ["city"] - } + "vectors": { + "image": { + "size": 4, + "distance": "Dot" + }, + "text": { + "size": 8, + "distance": "Cosine" + } + }, + "sparse_vectors": { + "sparse-image": {}, + "sparse-text": {}, + }, } +``` +### Sample Request +Some points might include both **image** and **text** vectors, while others might include just one. With this new feature, you can easily filter for points that specifically have the **image** vector defined. + +```http +POST /collections/{collection_name}/points/scroll +{ + "filter": { + "must": [ + { "has_vector": "image" } + ] + } +} ``` ```python @@ -57287,90 +76751,68 @@ from qdrant_client import QdrantClient, models client = QdrantClient(url="http://localhost:6333") -client.query_points( +client.scroll( collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - with_payload=models.PayloadSelectorExclude( - exclude=["city"], + scroll_filter=models.Filter( + must=[ + models.HasVectorCondition(has_vector="image"), + ], ), ) - ``` ```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; - -const client = new QdrantClient({ host: "localhost", port: 6333 }); - -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - with_payload: { - exclude: ["city"], +client.scroll("{collection_name}", { + filter: { + must: [ + { + has_vector: "image", + }, + ], }, }); - ``` - ```rust -use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointsBuilder}; +use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; use qdrant_client::Qdrant; let client = Qdrant::from_url("http://localhost:6334").build()?; client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .with_payload(SelectorOptions::Exclude(vec!["city".to_string()].into())) - .with_vectors(true), + .scroll( + ScrollPointsBuilder::new("{collection_name}") + .filter(Filter::must([Condition::has_vector("image")])), ) .await?; - ``` - ```java import java.util.List; -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; - -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.WithPayloadSelectorFactory.exclude; - -QdrantClient client = - new 
QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +import static io.qdrant.client.ConditionFactory.hasVector; +import static io.qdrant.client.PointIdFactory.id; -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setWithPayload(exclude(List.of("city"))) - .setLimit(3) - .build()) - .get(); +import io.qdrant.client.grpc.Points.Filter; +import io.qdrant.client.grpc.Points.ScrollPoints; +client + .scrollAsync( + ScrollPoints.newBuilder() + .setCollectionName("{collection_name}") + .setFilter( + Filter.newBuilder() + .addMust(hasVector("image")) + .build()) + .build()) + .get(); ``` - ```csharp using Qdrant.Client; -using Qdrant.Client.Grpc; +using static Qdrant.Client.Grpc.Conditions; var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - payloadSelector: new WithPayloadSelector - { - Exclude = new PayloadExcludeSelector { Fields = { new string[] { "city" } } } - }, - limit: 3 -); - +await client.ScrollAsync(collectionName: "{collection_name}", filter: HasVector("image")); ``` - ```go import ( "context" @@ -57383,15687 +76825,12114 @@ client, err := qdrant.NewClient(&qdrant.Config{ Port: 6334, }) -client.Query(context.Background(), &qdrant.QueryPoints{ +client.Scroll(context.Background(), &qdrant.ScrollPoints{ CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - WithPayload: qdrant.NewWithPayloadExclude("city"), + Filter: &qdrant.Filter{ + Must: []*qdrant.Condition{ + qdrant.NewHasVector( + "image", + ), + }, + }, }) - ``` +This feature makes it easier to manage and query collections with heterogeneous data. It will give you more flexibility and control over your vector search workflows. -It is possible to target nested fields using a dot notation: - -- `payload.nested_field` \- for a nested field -- `payload.nested_array[].sub_field` \- for projecting nested fields within an array - -Accessing array elements by index is currently not supported. - -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#batch-search-api) Batch search API - -The batch search API enables to perform multiple search requests via a single request. - -Its semantic is straightforward, `n` batched search requests are equivalent to `n` singular search requests. - -This approach has several advantages. Logically, fewer network connections are required which can be very beneficial on its own. - -More importantly, batched requests will be efficiently processed via the query planner which can detect and optimize requests if they have the same `filter`. - -This can have a great effect on latency for non trivial filters as the intermediary results can be shared among the request. - -In order to use it, simply pack together your search requests. All the regular attributes of a search request are of course available. 
- -httppythontypescriptrustjavacsharpgo - -```http -POST /collections/{collection_name}/points/query/batch -{ - "searches": [\ - {\ - "query": [0.2, 0.1, 0.9, 0.7],\ - "filter": {\ - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ]\ - },\ - "limit": 3\ - },\ - {\ - "query": [0.5, 0.3, 0.2, 0.3],\ - "filter": {\ - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ]\ - },\ - "limit": 3\ - }\ - ] -} - -``` +*To dive deeper into filtering by named vectors, check out the [**Filtering Documentation**](/documentation/concepts/filtering/#has-vector)* -```python -from qdrant_client import QdrantClient, models +## Custom Storage Engine -client = QdrantClient(url="http://localhost:6333") +![custom-storage-engine](/blog/qdrant-1.13.x/image_5.png) -filter_ = models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(\ - value="London",\ - ),\ - )\ - ] -) +When Qdrant started, we used **RocksDB** as the storage backend for payloads and sparse vectors. RocksDB, known for its versatility and ability to handle random reads and writes, seemed like a solid choice. But as our needs evolved, its “*general-purpose*” design began to show cracks. -search_queries = [\ - models.QueryRequest(query=[0.2, 0.1, 0.9, 0.7], filter=filter_, limit=3),\ - models.QueryRequest(query=[0.5, 0.3, 0.2, 0.3], filter=filter_, limit=3),\ -] +> RocksDB is built to handle arbitrary keys and values of any size, but this flexibility comes at a cost. -client.query_batch_points(collection_name="{collection_name}", requests=search_queries) +A key example is compaction, a process that reorganizes data on disk to maintain performance. **Under heavy write loads, compaction can become a bottleneck**, causing significant slowdowns. For Qdrant, this meant huge latency spikes at random moments causing timeout errors during large uploads—a frustrating roadblock. -``` +To solve this, we built a **custom storage backend** optimized for our specific use case. Unlike RocksDB, our system delivers consistent performance by ensuring reads and writes require a constant number of disk operations, regardless of data size. As a result, you will get faster and reliable performance - free from latency-spikes. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### Our New Storage Architecture -const client = new QdrantClient({ host: "localhost", port: 6333 }); +There are four elements: the **Data Layer**, **Mask Layer**, **the Region** and **Tracker Layer**. -const filter = { - must: [\ - {\ - key: "city",\ - match: {\ - value: "London",\ - },\ - },\ - ], -}; +{{< figure src="/blog/qdrant-1.13.x/storage.png" alt="Qdrant's New Storage Backend" caption="Qdrant's New Storage Backend" >}} -const searches = [\ - {\ - query: [0.2, 0.1, 0.9, 0.7],\ - filter,\ - limit: 3,\ - },\ - {\ - query: [0.5, 0.3, 0.2, 0.3],\ - filter,\ - limit: 3,\ - },\ -]; +**The Data Layer** consists of fixed-size blocks that store the actual data. The block size is a configurable parameter that can be adjusted based on the workload. Each record occupies the required number of blocks. If the data size exceeds the block size, it is split into multiple blocks. If the data size is smaller than the block size, it still occupies an entire block. -client.queryBatch("{collection_name}", { - searches, -}); +**The Mask Layer** contains a bitmask that indicates which blocks are occupied and which are free. The size of the mask corresponds to the number of blocks in the Data Layer. 
For instance, if we have 64 blocks of 128 bytes each, the bitmask will allocate 1 bit for every block in the Data Layer resulting in 8 bytes. This results in an overhead of 1/1024 of the Data Layer size, because each byte in the mask covers 1024 bytes of blocked storage. The bitmask is stored on disk and does not need to be loaded into memory. -``` +**The Region** is an additional structure which tracks gaps in regions of the bitmask. This is to get an even smaller overhead against the data, which can be loaded into memory easily. Each region summarizes 1KB of bits in the bitmask, which represents a millionth scale of the Data Layer size, or 6 KB of RAM per GB of data. -```rust -use qdrant_client::qdrant::{Condition, Filter, QueryBatchPointsBuilder, QueryPointsBuilder}; -use qdrant_client::Qdrant; +**The Tracker Layer** is in charge of fast lookups, it directly links the IDs of the points to the place where the data is located. -let client = Qdrant::from_url("http://localhost:6334").build()?; +## Get Started with Qdrant +![get-started](/blog/qdrant-1.13.x/image_1.png) -let filter = Filter::must([Condition::matches("city", "London".to_string())]); +The easiest way to reach that **Hello World** moment is to [**try vector search in a live cluster**](/documentation/quickstart-cloud/). Our **interactive tutorial** will show you how to create a cluster, add data and try some filtering clauses. -let searches = vec![\ - QueryPointsBuilder::new("{collection_name}")\ - .query(vec![0.1, 0.2, 0.3, 0.4])\ - .limit(3)\ - .filter(filter.clone())\ - .build(),\ - QueryPointsBuilder::new("{collection_name}")\ - .query(vec![0.5, 0.3, 0.2, 0.3])\ - .limit(3)\ - .filter(filter)\ - .build(),\ -]; +**New features, like named vector filtering, can be tested in the Qdrant Dashboard:** -client - .query_batch(QueryBatchPointsBuilder::new("{collection_name}", searches)) - .await?; +![qdrant-filtering-tutorial](/articles_data/vector-search-filtering/qdrant-filtering-tutorial.png) -``` +<|page-330-lllmstxt|> +![voiceflow/image2.png](/blog/case-study-voiceflow/image1.png) -```java -import java.util.List; +[Voiceflow](https://www.voiceflow.com/) enables enterprises to create AI agents in a no-code environment by designing workflows through a drag-and-drop interface. The platform allows developers to host and customize chatbot interfaces without needing to build their own RAG pipeline, working out of the box and being easily adaptable to specific use cases. “Powered by technologies like Natural Language Understanding (NLU), Large Language Models (LLM), and Qdrant as a vector search engine, Voiceflow serves a diverse range of customers, including enterprises that develop chatbots for internal and external AI use cases,” says [Xavier Portillo Edo](https://www.linkedin.com/in/xavierportillaedo/), Head of Cloud Infrastructure at Voiceflow. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.QueryPoints; +## Evaluation Criteria -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.ConditionFactory.matchKeyword; +[Denys Linkov](https://www.linkedin.com/in/denyslinkov/), Machine Learning Team Lead at Voiceflow, explained the journey of building a managed RAG solution. "Initially, our product focused on users manually defining steps on the canvas. 
After the release of ChatGPT, we added AI-based responses, leading to the launch of our managed RAG solution in the spring of 2023," Linkov said. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +As part of this development, the Voiceflow engineering team was looking for a [vector database](/qdrant-vector-database/) solution to power their RAG setup. They evaluated various vector databases based on several key factors: -Filter filter = Filter.newBuilder().addMust(matchKeyword("city", "London")).build(); +- **Performance**: The ability to [handle the scale](/documentation/guides/distributed_deployment/) required by Voiceflow, supporting hundreds of thousands of projects efficiently. +- **Metadata**: The capability to tag data and chunks and retrieve based on those values, essential for organizing and accessing specific information swiftly. +- **Managed Solution**: The availability of a [managed service](/documentation/cloud/) with automated maintenance, scaling, and security, freeing the team from infrastructure concerns. -List searches = List.of( - QueryPoints.newBuilder() - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setFilter(filter) - .setLimit(3) - .build(), - QueryPoints.newBuilder() - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setFilter(filter) - .setLimit(3) - .build()); +*"We started with Pinecone but eventually switched to Qdrant,"* Linkov noted. The reasons for the switch included: -client.queryBatchAsync("{collection_name}", searches).get(); +- **Scaling Capabilities**: Qdrant offers a robust multi-node setup with [horizontal scaling](/documentation/cloud/cluster-scaling/), allowing clusters to grow by adding more nodes and distributing data and load among them. This ensures high performance and resilience, which is crucial for handling large-scale projects. +- **Infrastructure**: “Qdrant provides robust infrastructure support, allowing integration with virtual private clouds on AWS using AWS Private Links and ensuring encryption with AWS KMS. This setup ensures high security and reliability,” says Portillo Edo. +- **Responsive Qdrant Team**: "The Qdrant team is very responsive, ships features quickly and is a great partner to build with," Linkov added. -``` +## Migration and Onboarding -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Voiceflow began its migration to Qdrant by creating [backups](/documentation/cloud/backups/) and ensuring data consistency through random checks and key customer verifications. "Once we were confident in the stability, we transitioned the primary database to Qdrant, completing the migration smoothly," Linkov explained. -var client = new QdrantClient("localhost", 6334); +During onboarding, Voiceflow transitioned from namespaces to Qdrant's collections, which offer enhanced flexibility and advanced vector search capabilities. They also implemented Quantization to enhance data processing efficiency. This comprehensive process ensured a seamless transition to Qdrant's robust infrastructure. 
-var filter = MatchKeyword("city", "London"); +## RAG Pipeline Setup -var queries = new List -{ - new() - { - CollectionName = "{collection_name}", - Query = new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - Filter = filter, - Limit = 3 - }, - new() - { - CollectionName = "{collection_name}", - Query = new float[] { 0.5f, 0.3f, 0.2f, 0.3f }, - Filter = filter, - Limit = 3 - } -}; +Voiceflow's RAG pipeline setup provides a streamlined process for uploading and managing data from various sources, designed to offer flexibility and customization at each step. -await client.QueryBatchAsync(collectionName: "{collection_name}", queries: queries); +- **Data Upload**: Customers can upload data via API from sources such as URLs, PDFs, Word documents, and plain text formats. Integration with platforms like Zendesk is supported, and users can choose between single uploads or refresh-based uploads. +- **Data Ingestion**: Once data is ingested, Voiceflow offers preset strategies for data checking. Users can utilize these strategies or opt for more customization through the API to tailor the ingestion process as needed. +- **Metadata Tagging**: Metadata tags can be applied during the ingestion process, which helps organize and facilitate efficient data retrieval later on. +- **Data Retrieval**: At retrieval time, Voiceflow provides prompts that can modify user questions by adding context, variables, or other modifications. This customization includes adding personas or structuring responses as markdown. Depending on the type of interaction (e.g., button, carousel with an image for image retrieval), these prompts are displayed to users in a structured format. -``` +This comprehensive setup ensures that Voiceflow users can efficiently manage and customize their data workflows, providing a robust solution for building AI-driven applications. -```go -import ( - "context" +## How Voiceflow Uses Qdrant - "github.com/qdrant/go-client/qdrant" -) +Voiceflow leverages Qdrant's robust features and infrastructure to optimize their AI assistant platform. Here’s a breakdown of how they utilize these capabilities: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +*Database Features:* -filter := qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, -} +- **Quantization**: This feature helps Voiceflow to perform efficient data processing by reducing the size of vectors, making searches faster. The team uses [Product Quantization](https://qdrant.tech/articles/product-quantization/) in particular. +- **Chunking Search**: Voiceflow uses chunking search to improve search efficiency by breaking down large datasets into manageable chunks, which allows for faster and more efficient data retrieval. +- **Sparse Vector Search**: Although not yet implemented, this feature is being explored for more precise keyword searches. "This is an encouraging direction the Qdrant team is taking here as many users seek more exact keyword search," said Linkov. 
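+To make the product quantization mentioned above concrete, here is a minimal sketch of enabling it on a Qdrant collection with the Python client. The collection name, vector size, and compression ratio are illustrative assumptions, not Voiceflow's actual configuration.

+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Hypothetical collection; vector size and compression ratio are placeholders.
+client.create_collection(
+    collection_name="knowledge_base",
+    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
+    quantization_config=models.ProductQuantization(
+        product=models.ProductQuantizationConfig(
+            compression=models.CompressionRatio.X16,  # trade some accuracy for memory
+            always_ram=True,  # keep the compressed vectors in RAM for fast scoring
+        )
+    ),
+)
+```

+Qdrant keeps the original vectors alongside the quantized copies, so searches can be rescored against full-precision data when higher accuracy is needed.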
-client.QueryBatch(context.Background(), &qdrant.QueryBatchPoints{ - CollectionName: "{collection_name}", - QueryPoints: []*qdrant.QueryPoints{ - { - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Filter: &filter, - }, - { - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.5, 0.3, 0.2, 0.3), - Filter: &filter, - }, - }, -}) +*Architecture:* -``` +- **Node Pool**: A large node pool is used for public cloud users, ensuring scalability, while several smaller, isolated instances cater to private cloud users, providing enhanced security. -The result of this API contains one array per search requests. +*Infrastructure:* -```json -{ - "result": [\ - [\ - { "id": 10, "score": 0.81 },\ - { "id": 14, "score": 0.75 },\ - { "id": 11, "score": 0.73 }\ - ],\ - [\ - { "id": 1, "score": 0.92 },\ - { "id": 3, "score": 0.89 },\ - { "id": 9, "score": 0.75 }\ - ]\ - ], - "status": "ok", - "time": 0.001 -} +- **Private Link**: The ability to use Private Link connections across different instances is a significant advantage, requiring robust infrastructure support from Qdrant. "This setup was crucial for SOC2 compliance, and Qdrant's support team made the process seamless by ensuring feasibility and aiding in the implementation," Linkov explained. -``` +By utilizing these features, Voiceflow ensures that its platform is scalable, secure, and efficient, meeting the diverse needs of its users. -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#query-by-id) Query by ID +## The Outcome -Whenever you need to use a vector as an input, you can always use a [point ID](https://qdrant.tech/documentation/concepts/points/#point-ids) instead. +Voiceflow achieved significant improvements and efficiencies by leveraging Qdrant's capabilities: -httppythontypescriptrustjavacsharpgo +- **Enhanced Metadata Tagging**: Implemented robust metadata tagging, allowing for custom fields and tags that facilitate efficient search filtering. +- **Optimized Performance**: Resolved concerns about retrieval times with a high number of tags by optimizing indexing strategies, achieving efficient performance. +- **Minimal Operational Overhead**: Experienced minimal overhead, streamlining their operational processes. +- **Future-Ready**: Anticipates further innovation in hybrid search with multi-token attention. +- **Multitenancy Support**: Utilized Qdrant's efficient and [isolated data management](/documentation/guides/multiple-partitions/) to support diverse user needs. -```http -POST /collections/{collection_name}/points/query -{ - "query": "43cf51e2-8777-4f52-bc74-c2cbde0c8b04" // <--- point id -} +Overall, Qdrant's features and infrastructure provided Voiceflow with a stable, scalable, and efficient solution for their data processing and retrieval needs. -``` +## What’s Next -```python -client.query_points( - collection_name="{collection_name}", - query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04", # <--- point id -) +Voiceflow plans to enhance its platform with more filtering and customization options, allowing developers to host and customize chatbot interfaces without building their own [RAG](/rag/) pipeline. -``` +<|page-331-lllmstxt|> +# The Twin Celebrity App -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +In the era of personalization, combining cutting-edge technology with fun can create engaging applications that resonate with users. 
One such project is the [**Twin Celebrity app**](https://github.com/neural-maze/vector-twin), a tool that matches users with their celebrity look-alikes using facial recognition embeddings and [**vector search**](/advanced-search/) powered by Qdrant. This blog post dives into the architecture, tools, and practical advice for developers who want to build this app—or something similar. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +The [**Twin Celebrity app**](https://github.com/neural-maze/vector-twin) identifies which celebrity a user resembles by analyzing a selfie. The app utilizes: +- **Face recognition embeddings**: Generated by a ResNet-based **FaceNet** model. +- **Vector similarity search**: Powered by Qdrant to find the closest match. +- **ZenML**: For orchestrating data pipelines. +- **Streamlit**: As the front-end interface. -client.query("{collection_name}", { - query: '43cf51e2-8777-4f52-bc74-c2cbde0c8b04', // <--- point id -}); +> This project not only demonstrates the capabilities of modern vector databases but also serves as an exciting introduction to embedding-based applications. -``` +--- -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{Condition, Filter, PointId, Query, QueryPointsBuilder}; +## Learn From the App's Creator -let client = Qdrant::from_url("http://localhost:6334").build()?; +We interviewed the engineer behind this project, [**Miguel Otero Pedrido**](https://www.linkedin.com/in/migueloteropedrido/), who is also the founder of [**The Neural Maze**](https://www.youtube.com/@TheNeuralMaze). Miguel explains in detail how he put the app together, as well as his choice of tools. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(Query::new_nearest(PointId::new("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) - ) - .await?; + -``` +#### Turns out his celebrity twin is...Andy Samberg +![samberg](/blog/facial-recognition/samberg.png) +___ -```java -import java.util.UUID; +## Architecture -import static io.qdrant.client.QueryFactory.nearest; +**Search Engine & DB:** [**Qdrant**](https://qdrant.tech) stands out as a high-performance [**vector database**](/qdrant-vector-database/) built in Rust, known for its reliability and speed. Its advanced features, such as [**vector visualization**](/documentation/web-ui/) and efficient [**querying**](/documentation/concepts/search/), make it a go-to choice for developers working on embedding-based projects. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; +![architecture](/blog/facial-recognition/architecture.png) -QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**ML Framework:** [**ZenML**](https://www.zenml.io) simplifies pipeline creation with a modular, cloud-agnostic framework that ensures clean, scalable, and portable code, ideal for cross-platform workflows. -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collectionName}") - .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) - .build()).get(); +**Facial Recognition:** [**MTCNN**](https://github.com/ipazc/mtcnn#) ensures consistent face alignment, making the embeddings more reliable. -``` +**Embedding Model:** [**FaceNet**](https://github.com/davidsandberg/facenet) provides lightweight, pre-trained facial embeddings, balancing accuracy and efficiency, making it perfect for tasks like the Twin Celebrity app. 
-```csharp -using Qdrant.Client; +**Frontend:** [**Streamlit**](https://github.com/streamlit) streamlines UI development, enabling rapid prototyping with minimal effort, allowing developers to focus on core functionalities. -var client = new QdrantClient("localhost", 6334); +## Application Workflows -await client.QueryAsync( - collectionName: "{collection_name}", - query: Guid.Parse("43cf51e2-8777-4f52-bc74-c2cbde0c8b04") -); +The app is divided into two phases - **The Offline Phase**, where the celebrity images are vectorized and **The Online Phase**, which carries out a live [**similarity search**](). -``` +![online-offline](/blog/facial-recognition/online-offline.png) -```go -import ( - "context" +**The Offline Phase** - "github.com/qdrant/go-client/qdrant" -) +The first step is dataset preparation. Celebrity images are fetched from **HuggingFace’s dataset library** to serve as the foundation for embeddings. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Next - [**MTCNN**](https://github.com/ipazc/mtcnn#) aligns celebrities faces within images. +Then, a pre-trained [**FaceNet**](https://en.wikipedia.org/wiki/FaceNet) model is used to generate 512-dimensional embeddings for each image. This ensures consistent and high-quality representation of facial features. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryID(qdrant.NewID("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")), -}) +Finally, these embeddings, along with metadata, are stored in [**Qdrant Cloud**](/cloud/). This enables efficient retrieval and management of the data for later use. -``` +--- -The above example will fetch the default vector from the point with this id, and use it as the query vector. +**The Online Phase** -If the `using` parameter is also specified, Qdrant will use the vector with that name. +In the online phase, user interaction begins with a **Streamlit app**. The app captures a selfie and converts it into an embedding using the same FaceNet model. -It is also possible to reference an ID from a different collection, by setting the `lookup_from` parameter. +The generated embedding is then queried against Qdrant, which retrieves the top matches based on similarity. -httppythontypescriptrustjavacsharpgo +Finally, the results are displayed in an intuitive interface, showing the user their **closest celebrity match** and making the interaction engaging and seamless. -```http -POST /collections/{collection_name}/points/query -{ - "query": "43cf51e2-8777-4f52-bc74-c2cbde0c8b04", // <--- point id - "using": "512d-vector" - "lookup_from": { - "collection": "another_collection", // <--- other collection name - "vector": "image-512" // <--- vector name in the other collection - } -} +--- -``` +## How to Build the App -```python -from qdrant_client import QdrantClient, models +Miguel recently published a video on his YouTube channel: [**The Neural Maze**](https://www.youtube.com/@TheNeuralMaze). -client = QdrantClient(url="http://localhost:6333") +For detailed steps to build the app, watch [**Building a Twin Celebrity App**](https://www.youtube.com/watch?v=LltFAum3gVg). -client.query_points( - collection_name="{collection_name}", - query="43cf51e2-8777-4f52-bc74-c2cbde0c8b04", # <--- point id - using="512d-vector", - lookup_from=models.LookupLocation( - collection="another_collection", # <--- other collection name - vector="image-512", # <--- vector name in the other collection - ) -) +### 1. 
Set Up the Offline Pipeline +Using ZenML, the pipeline consists of: +- **Data Loading**: Fetch images and labels (e.g., "Brad Pitt") from Hugging Face. +- **Sampling**: Reduce dataset size for faster processing, selecting around 3,000 images. +- **Embedding Generation**: Convert images into embeddings using MTCNN for face detection and FaceNet for embedding creation. +- **Storage in Qdrant**: Save embeddings into a collection named `celebrities`. -``` +### 2. Create the Online Application +The Streamlit app handles: +- **Image Capture**: Takes a selfie through a webcam or uploaded file. +- **Embedding Querying**: Sends the embedding to Qdrant, retrieves the top matches, and visualizes the similarity. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### 3. Deployment Options -const client = new QdrantClient({ host: "localhost", port: 6333 }); +- Deploy the app on platforms like **Google Cloud**, **AWS**, or **Azure**. Setting up CI/CD pipelines can streamline updates and deployments. -client.query("{collection_name}", { - query: '43cf51e2-8777-4f52-bc74-c2cbde0c8b04', // <--- point id - using: '512d-vector', - lookup_from: { - collection: 'another_collection', // <--- other collection name - vector: 'image-512', // <--- vector name in the other collection - } -}); +- The application can be containerized using **Docker**. For hosting, **Google Cloud Run** is an excellent choice, as it efficiently manages containerized applications without requiring extensive infrastructure management. -``` +- The deployment process is streamlined further with CI/CD pipelines, such as those provided by **Cloud Build or GitHub Actions**, which automate the steps for building, testing, and deploying updates. -```rust -use qdrant_client::Qdrant; -use qdrant_client::qdrant::{LookupLocationBuilder, PointId, Query, QueryPointsBuilder}; +### 4. Test the Quality of Your Embeddings -let client = Qdrant::from_url("http://localhost:6334").build()?; +You can always use [**Qdrant’s visualization tools**](/documentation/web-ui/) to refine accuracy and ensure clusters align with expectations. -client.query( - QueryPointsBuilder::new("{collection_name}") - .query(Query::new_nearest("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")) - .using("512d-vector") - .lookup_from( - LookupLocationBuilder::new("another_collection") - .vector_name("image-512") - ) -).await?; +![architecture](/blog/facial-recognition/web-ui.png) -``` +If your data is properly embedded, then the visualization tool will appropriately cluster celebrity images into groups. -```java -import static io.qdrant.client.QueryFactory.nearest; +--- -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.LookupLocation; -import io.qdrant.client.grpc.Points.QueryPoints; -import java.util.UUID; +## Lessons and Takeaways -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Scalability poses challenges when working with large datasets, such as 20,000+ images. Consider optimizations like [**quantization**](/documentation/guides/quantization/) to reduce memory usage or precomputing average embeddings for clusters can significantly minimize storage and computational costs. These strategies ensure the system remains performant as the dataset grows. 
-client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(UUID.fromString("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"))) - .setUsing("512d-vector") - .setLookupFrom( - LookupLocation.newBuilder() - .setCollectionName("another_collection") - .setVectorName("image-512") - .build()) - .build()) - .get(); +The potential real-world applications of this technology extend far beyond entertainment. Similar systems can be used in security applications for embedding-based facial recognition to secure access to buildings or devices. -``` +In **healthcare**, they can assist in analyzing features such as moles or skin textures. In **retail**, they enable personalized recommendations based on user photos, demonstrating the versatility of this approach. -```csharp -using Qdrant.Client; +--- -var client = new QdrantClient("localhost", 6334); +## Next Steps for Developers -await client.QueryAsync( - collectionName: "{collection_name}", - query: Guid.Parse("43cf51e2-8777-4f52-bc74-c2cbde0c8b04"), // <--- point id - usingVector: "512d-vector", - lookupFrom: new() { - CollectionName = "another_collection", // <--- other collection name - VectorName = "image-512" // <--- vector name in the other collection - } -); +- Start by [**cloning the project repository**](https://github.com/neural-maze/vector-twin) to understand the architecture and functionality. -``` +- Expand the dataset with more celebrity images for diversity or fine-tune the FaceNet model for improved accuracy. -```go -import ( - "context" +- Consider deploying a mobile-friendly version using frameworks like **Flutter** or **React Native** for a seamless user experience. - "github.com/qdrant/go-client/qdrant" -) +> For scalability, implement **multi-GPU setups** to speed up embedding generation and optimize storage with techniques like quantization or average embeddings. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +To enhance functionality, explore features like **video input for real-time matches** or add **metadata such as celebrity bios** to enrich user interaction. Experiment with custom similarity scoring for more tailored results. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryID(qdrant.NewID("43cf51e2-8777-4f52-bc74-c2cbde0c8b04")), - Using: qdrant.PtrOf("512d-vector"), - LookupFrom: &qdrant.LookupLocation{ - CollectionName: "another_collection", - VectorName: qdrant.PtrOf("image-512"), - }, -}) +## More Links -``` +- Miguel's [LinkedIn profile](https://www.linkedin.com/in/migueloteropedrido/) +- Miguel's [Substack blog](https://theneuralmaze.substack.com) +- The Neural Maze [YouTube channel](https://www.youtube.com/@TheNeuralMaze) +- Twin Celebrity [GitHub Repository](https://github.com/neural-maze/vector-twin) -In the case above, Qdrant will fetch the `"image-512"` vector from the specified point id in the -collection `another_collection`. +<|page-332-lllmstxt|> +ColPali is a fascinating leap in document retrieval. Its precision in handling visually rich PDFs is phenomenal, but scaling it to handle real-world datasets comes with its share of computational challenges. -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#pagination) Pagination +Here's how we solved these challenges to make ColPali 13x faster without sacrificing the precision it’s known for. 
-Search and [recommendation](https://qdrant.tech/documentation/concepts/explore/#recommendation-api) APIs allow to skip first results of the search and return only the result starting from some specified offset: +## The Scaling Dilemma -Example: +ColPali generates **1,030 vectors for just one page of a PDF.** While this is manageable for small-scale tasks, in a real-world production setting where you may need to store hundreds od thousands of PDFs, the challenge of scaling becomes significant. -httppythontypescriptrustjavacsharpgo +Consider this scenario: -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "with_vectors": true, - "with_payload": true, - "limit": 10, - "offset": 100 -} +- **Dataset Size:** 20,000 PDF pages. +- **Number of Vectors:** Each page generates ~1,000 vectors of 128 dimensions. -``` +The total number of comparisons is calculated as: -```python -from qdrant_client import QdrantClient +$$ +1,000 \cdot 1,000 \cdot 20,000 \cdot 128 = 2.56 \times 10^{12} \text{ comparisons!} +$$ -client = QdrantClient(url="http://localhost:6333") +That's trillions of comparisons needed to build the index. Even advanced indexing algorithms like **HNSW** struggle with this scale, as computational costs grow quadratically with amount of multivectors per page. -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - with_vectors=True, - with_payload=True, - limit=10, - offset=100, -) +We turned to a hybrid optimization strategy combining **pooling** (to reduce computational overhead) and **reranking** (to preserve accuracy). -``` +Before we go any deeper, watch our [Webinar video](https://www.youtube.com/live/_h6SN1WwnLs?si=n8gwiIjJ5dnfucXC) for the full demo walkthrough. + -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +For those eager to explore, the [codebase is available here](https://github.com/qdrant/demo-colpali-optimized). -const client = new QdrantClient({ host: "localhost", port: 6333 }); -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - with_vector: true, - with_payload: true, - limit: 10, - offset: 100, -}); +## Two-Stage Retrieval Process -``` +### Pooling -```rust -use qdrant_client::qdrant::QueryPointsBuilder; -use qdrant_client::Qdrant; +Pooling is well-known in machine learning as a way to compress data while keeping important information. For ColPali, we reduced 1,030 vectors per page to just 38 vectors by pooling rows in the document's 32x32 grid. -let client = Qdrant::from_url("http://localhost:6334").build()?; +![](/blog/colpali-optimization/rows.png) -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .with_payload(true) - .with_vectors(true) - .limit(10) - .offset(100), - ) - .await?; +Max and mean pooling are the two most popular types, so we decided to test both approaches on the rows of the grid. Likewise, we could apply pooling on columns, which we plan to explore in the future. -``` +- **Mean Pooling:** Averages values across rows. +- **Max Pooling:** Selects the maximum value for each feature. -```java -import java.util.List; +32 vectors represent the pooled rows, while 6 vectors encode contextual information derived from ColPali’s special tokens (e.g., for the beginning of the sequence, and task-specific instructions like “Describe the image”). 
-import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.WithVectorsSelectorFactory; -import io.qdrant.client.grpc.Points.QueryPoints; +For our experiments, we chose to preserve these 6 additional vectors. -import static io.qdrant.client.QueryFactory.nearest; -import static io.qdrant.client.WithPayloadSelectorFactory.enable; +### The "ColPali as a Reranker" Experiment -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Pooling drastically reduces retrieval costs, but there’s a risk of losing fine-grained precision. To address this, we implemented a **two-stage retrieval system**, where embeddings generated with ColPali were max/mean pooled by grid rows to create lightweight vectors for the initial retrieval stage, followed by reranking with the original high-resolution embeddings: -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setWithPayload(enable(true)) - .setWithVectors(WithVectorsSelectorFactory.enable(true)) - .setLimit(10) - .setOffset(100) - .build()) - .get(); +1. **Pooled Retrieval:** Quickly retrieves the top 200 candidates using lightweight pooled embeddings. +2. **Full Reranking:** Refines these candidates using the original, high-resolution embeddings, delivering the final top 20 results. -``` +### Implementation -```csharp -using Qdrant.Client; +We created a custom dataset with over 20,000 unique PDF pages by merging: -var client = new QdrantClient("localhost", 6334); +- **ViDoRe Benchmark:** Designed for PDF documents retrieval evaluation. +- **UFO Dataset:** Visually rich documents paired with synthetic queries [generated by Daniel van Strien](https://huggingface.co/datasets/davanstrien/ufo-ColPali). +- **DocVQA Dataset:** A large set of document-derived Q&A pairs. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - payloadSelector: true, - vectorsSelector: true, - limit: 10, - offset: 100 -); +Each document was processed into 32x32 grids, generating both full-resolution and pooled embeddings. **Full-resolution** embeddings consisted of 1,030 vectors per page, while **pooled embeddings** included mean and max pooling variants. -``` +All embeddings were were stored and kept in RAM to avoid caching effects during retrieval speed experiments. -```go -import ( - "context" +### Experiment Setup - "github.com/qdrant/go-client/qdrant" -) +We evaluated retrieval quality with 1,000 queries. First, pooled embeddings retrieved the top 200 candidates. Then, full-resolution embeddings reranked them to produce the final top 20 results. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +To measure performance, we used: -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - WithPayload: qdrant.NewWithPayload(true), - WithVectors: qdrant.NewWithVectors(true), - Offset: qdrant.PtrOf(uint64(100)), -}) +- **NDCG@20:** Measures ranking quality (how well the top results align with expectations). +- **Recall@20:** Measures the overlap between this method and the original ColPali retrieval. -``` +## Results -Is equivalent to retrieving the 11th page with 10 records per page. +The experiment showed promising improvements in speed and accuracy. 
Retrieval time improved **13x** compared to using full-resolution embeddings alone. -Vector-based retrieval in general and HNSW index in particular, are not designed to be paginated. -It is impossible to retrieve Nth closest vector without retrieving the first N vectors first. +### Metrics -However, using the offset parameter saves the resources by reducing network traffic and the number of times the storage is accessed. +| Pooling Type | NDCG@20 | Recall@20 | +|--------------|---------|-----------| +| **Mean** | 0.952 | 0.917 | +| **Max** | 0.759 | 0.656 | -Using an `offset` parameter, will require to internally retrieve `offset + limit` points, but only access payload and vector from the storage those points which are going to be actually returned. -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#grouping-api) Grouping API +Mean pooling preserved nearly identical quality to the original ColPali, with NDCG@20 = 0.952 and Recall@20 = 0.917. Max pooling did not perform well enough to be considered viable since it sacrificed significant accuracy without delivering a meaningful speed advantage. -It is possible to group results by a certain field. This is useful when you have multiple points for the same item, and you want to avoid redundancy of the same item in the results. +## What’s Next? +Future experiments could push these results even further: -For example, if you have a large document split into multiple chunks, and you want to search or [recommend](https://qdrant.tech/documentation/concepts/explore/#recommendation-api) on a per-document basis, you can group the results by the document ID. +- Investigating column-wise pooling for additional compression. +- Testing half-precision (float16) vectors to balance memory use and speed. +- Skipping special multivectors during prefetch to streamline retrieval. +- Combining quantization with oversampling for even faster search. -Consider having points with the following payloads: -```json -[\ - {\ - "id": 0,\ - "payload": {\ - "chunk_part": 0,\ - "document_id": "a"\ - },\ - "vector": [0.91]\ - },\ - {\ - "id": 1,\ - "payload": {\ - "chunk_part": 1,\ - "document_id": ["a", "b"]\ - },\ - "vector": [0.8]\ - },\ - {\ - "id": 2,\ - "payload": {\ - "chunk_part": 2,\ - "document_id": "a"\ - },\ - "vector": [0.2]\ - },\ - {\ - "id": 3,\ - "payload": {\ - "chunk_part": 0,\ - "document_id": 123\ - },\ - "vector": [0.79]\ - },\ - {\ - "id": 4,\ - "payload": {\ - "chunk_part": 1,\ - "document_id": 123\ - },\ - "vector": [0.75]\ - },\ - {\ - "id": 5,\ - "payload": {\ - "chunk_part": 0,\ - "document_id": -10\ - },\ - "vector": [0.6]\ - }\ -] +### Try It Yourself -``` +Curious to see this in action? Explore the full codebase and experiment with ColPali optimizations: -With the _**groups**_ API, you will be able to get the best _N_ points for each document, assuming that the payload of the points contains the document ID. Of course there will be times where the best _N_ points cannot be fulfilled due to lack of points or a big distance with respect to the query. In every case, the `group_size` is a best-effort parameter, akin to the `limit` parameter. +- **Demo Notebook:** [GitHub Repository](https://github.com/qdrant/demo-colpali-optimized) +- **Webinar Walkthrough:** [Watch Here](https://www.youtube.com/live/_h6SN1WwnLs?si=n8gwiIjJ5dnfucXC) -### [Anchor](https://qdrant.tech/documentation/concepts/search/\#search-groups) Search groups +[Join the community](https://discord.com/invite/qdrant) and share your results! 
-REST API ( [Schema](https://api.qdrant.tech/api-reference/search/query-points-groups)): +--- -httppythontypescriptrustjavacsharpgo +<|page-333-lllmstxt|> +## Introduction -```http -POST /collections/{collection_name}/points/query/groups -{ - // Same as in the regular query API - "query": [1.1], - // Grouping parameters - "group_by": "document_id", // Path of the field to group by - "limit": 4, // Max amount of groups - "group_size": 2 // Max amount of points per group -} +This guide will teach you how to evaluate a RAG system for both **accuracy** and **quality**. You will learn to maintain RAG performance by testing for search precision, recall, contextual relevance, and response accuracy. -``` +**Building a RAG application is just the beginning;** it is crucial to test its usefulness for the end-user and calibrate its components for long-term stability. -```python -client.query_points_groups( - collection_name="{collection_name}", - # Same as in the regular query_points() API - query=[1.1], - # Grouping parameters - group_by="document_id", # Path of the field to group by - limit=4, # Max amount of groups - group_size=2, # Max amount of points per group -) +RAG systems can encounter errors at any of the three crucial stages: retrieving relevant information, augmenting that information, and generating the final response. By systematically assessing and fine-tuning each component, you will be able to maintain a reliable and contextually relevant GenAI application that meets user needs. -``` +## Why evaluate your RAG application? -```typescript -client.queryGroups("{collection_name}", { - query: [1.1], - group_by: "document_id", - limit: 4, - group_size: 2, -}); +### To avoid hallucinations and wrong answers -``` +![rag-eval-0](/blog/rag-evaluation-guide/rag-eval-0.png) -```rust -use qdrant_client::qdrant::QueryPointGroupsBuilder; +In the generation phase, hallucination is a notable issue where the LLM overlooks the context and fabricates information. This can lead to responses that are not grounded in reality. -client - .query_groups( - QueryPointGroupsBuilder::new("{collection_name}", "document_id") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .group_size(2u64) - .with_payload(true) - .with_vectors(true) - .limit(4u64), - ) - .await?; +Additionally, the generation of biased answers is a concern, as responses produced by the LLM can sometimes be harmful, inappropriate, or carry an inappropriate tone, thus posing risks in various applications and interactions. -``` +### To enrich context augmented to your LLM -```java -import java.util.List; +Augmentation processes face challenges such as outdated information, where responses may include data that is no longer current. Another issue is the presence of contextual gaps, where there is a lack of relational context between the retrieved documents. -import io.qdrant.client.grpc.Points.SearchPointGroups; +> These gaps can result in incomplete or fragmented information being presented, reducing the overall coherence and relevance of the augmented responses. -client.queryGroupsAsync( - QueryPointGroups.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setGroupBy("document_id") - .setLimit(4) - .setGroupSize(2) - .build()) - .get(); +### To maximize the search & retrieval process -``` +When it comes to retrieval, one significant issue with search is the lack of precision, where not all documents retrieved are relevant to the query. 
This problem is compounded by poor recall, meaning not all relevant documents are successfully retrieved. -```csharp -using Qdrant.Client; +Additionally, the [“Lost in the Middle”](https://arxiv.org/abs/2307.03172) problem indicates that some LLMs may struggle with long contexts, particularly when crucial information is positioned in the middle of the document, leading to incomplete or less useful results. -var client = new QdrantClient("localhost", 6334); +## Recommended frameworks -await client.QueryGroupsAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - groupBy: "document_id", - limit: 4, - groupSize: 2 -); +![rag-eval-6](/blog/rag-evaluation-guide/rag-eval-6.png) -``` +To simplify the evaluation process, several powerful frameworks are available. Below we will explore three popular ones: **Ragas, Quotient AI, and Arize Phoenix**. -```go -import ( - "context" +### Ragas: Testing RAG with questions and answers - "github.com/qdrant/go-client/qdrant" -) +[Ragas](https://docs.ragas.io/en/stable/) (or RAG Assessment) uses a dataset of questions, ideal answers, and relevant context to compare a RAG system's generated answers with the ground truth. It provides metrics like faithfulness, relevance, and semantic similarity to assess retrieval and answer quality. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +**Figure 1:** *Output of the Ragas framework, showcasing metrics like faithfulness, answer relevancy, context recall, precision, relevancy, entity recall, and answer similarity. These are used to evaluate the quality of RAG system responses.* -client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - GroupBy: "document_id", - GroupSize: qdrant.PtrOf(uint64(2)), -}) +![image3.png](/blog/rag-evaluation-guide/image3.png) -``` +### Quotient: evaluating RAG pipelines with custom datasets -The output of a _**groups**_ call looks like this: +Quotient AI is another platform designed to streamline the evaluation of RAG systems. Developers can upload evaluation datasets as benchmarks to test different prompts and LLMs. These tests run as asynchronous jobs: Quotient AI automatically runs the RAG pipeline, generates responses and provides detailed metrics on faithfulness, relevance, and semantic similarity. The platform's full capabilities are accessible via a Python SDK, enabling you to access, analyze, and visualize your Quotient evaluation results to discover areas for improvement. -```json -{ - "result": { - "groups": [\ - {\ - "id": "a",\ - "hits": [\ - { "id": 0, "score": 0.91 },\ - { "id": 1, "score": 0.85 }\ - ]\ - },\ - {\ - "id": "b",\ - "hits": [\ - { "id": 1, "score": 0.85 }\ - ]\ - },\ - {\ - "id": 123,\ - "hits": [\ - { "id": 3, "score": 0.79 },\ - { "id": 4, "score": 0.75 }\ - ]\ - },\ - {\ - "id": -10,\ - "hits": [\ - { "id": 5, "score": 0.6 }\ - ]\ - }\ - ] - }, - "status": "ok", - "time": 0.001 -} +**Figure 2:** *Output of the Quotient framework, with statistics that define whether the dataset is properly manipulated throughout all stages of the RAG pipeline: indexing, chunking, search and context relevance.* -``` +![image2.png](/blog/rag-evaluation-guide/image2.png) -The groups are ordered by the score of the top point in the group. Inside each group the points are sorted too. +### Arize Phoenix: Visually Deconstructing Response Generation -If the `group_by` field of a point is an array (e.g. 
`"document_id": ["a", "b"]`), the point can be included in multiple groups (e.g. `"document_id": "a"` and `document_id: "b"`). +[Arize Phoenix](https://docs.arize.com/phoenix) is an open-source tool that helps improve the performance of RAG systems by tracking how a response is built step-by-step. You can see these steps visually in Phoenix, which helps identify slowdowns and errors. You can define "[evaluators](https://docs.arize.com/phoenix/evaluation/concepts-evals/evaluation)" that use LLMs to assess the quality of outputs, detect hallucinations, and check answer accuracy. Phoenix also calculates key metrics like latency, token usage, and errors, giving you an idea of how efficiently your RAG system is working. -**Limitations**: +**Figure 3:** *The Arize Phoenix tool is intuitive to use and shows the entire process architecture as well as the steps that take place inside of retrieval, context and generation.* -- Only [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword) and [integer](https://qdrant.tech/documentation/concepts/payload/#integer) payload values are supported for the `group_by` parameter. Payload values with other types will be ignored. -- At the moment, pagination is not enabled when using **groups**, so the `offset` parameter is not allowed. +![image1.png](/blog/rag-evaluation-guide/image1.png) -### [Anchor](https://qdrant.tech/documentation/concepts/search/\#lookup-in-groups) Lookup in groups +## Why your RAG system might be underperforming -Having multiple points for parts of the same item often introduces redundancy in the stored data. Which may be fine if the information shared by the points is small, but it can become a problem if the payload is large, because it multiplies the storage space needed to store the points by a factor of the amount of points we have per group. +![rag-eval-2](/blog/rag-evaluation-guide/rag-eval-2.png) -One way of optimizing storage when using groups is to store the information shared by the points with the same group id in a single point in another collection. Then, when using the [**groups** API](https://qdrant.tech/documentation/concepts/search/#grouping-api), add the `with_lookup` parameter to bring the information from those points into each group. +### You improperly ingested data to the vector database -![Group id matches point id](https://qdrant.tech/docs/lookup_id_linking.png) +Improper data ingestion can cause the loss of important contextual information, which is critical for generating accurate and coherent responses. Also, inconsistent data ingestion can cause the system to produce unreliable and inconsistent responses, undermining user trust and satisfaction. -This has the extra benefit of having a single point to update when the information shared by the points in a group changes. +Vector databases support different [indexing](https://qdrant.tech/documentation/concepts/indexing/) techniques. In order to know if you are ingesting data properly, you should always check how changes in variables related to indexing techniques affect data ingestion. -For example, if you have a collection of documents, you may want to chunk them and store the points for the chunks in a separate collection, making sure that you store the point id from the document it belongs in the payload of the chunk point. 
+#### Solution: Pay attention to how your data is chunked -In this case, to bring the information from the documents into the chunks grouped by the document id, you can use the `with_lookup` parameter: +**Calibrate document chunk size:** The chunk size determines data granularity and impacts precision, recall, and relevance. It should be aligned with the token limit of the embedding model. -httppythontypescriptrustjavacsharpgo +**Ensure proper chunk overlap:** This helps retain context by sharing data points across chunks. It should be managed with strategies like deduplication and content normalization. -```http -POST /collections/chunks/points/query/groups -{ - // Same as in the regular query API - "query": [1.1], +**Develop a proper chunking/text splitting strategy**: Make sure your chunking/text splitting strategy is tailored to your on data type (e.g., HTML, markdown, code, PDF) and use-case nuances. For example, legal documents may be split by headings and subsections, and medical literature by sentence boundaries or key concepts. - // Grouping parameters - "group_by": "document_id", - "limit": 2, - "group_size": 2, +**Figure 4:** *You can use utilities like [ChunkViz](https://chunkviz.up.railway.app/) to visualize different chunk splitting strategies, chunk sizes, and chunk overlaps.* - // Lookup parameters - "with_lookup": { - // Name of the collection to look up points in - "collection": "documents", +![image4.png](/blog/rag-evaluation-guide/image4.png) - // Options for specifying what to bring from the payload - // of the looked up point, true by default - "with_payload": ["title", "text"], +### You might be embedding data incorrectly - // Options for specifying what to bring from the vector(s) - // of the looked up point, true by default - "with_vectors": false - } -} +You want to ensure that the embedding model accurately understands and represents the data. If the generated embeddings are accurate, similar data points will be closely positioned in the vector space. The quality of an embedding model is typically measured using benchmarks like the [Massive Text Embedding Benchmark (MTEB)](https://huggingface.co/spaces/mteb/leaderboard), where the model’s output is compared against a ground-truth dataset. -``` +#### Solution: Pick the right embedding model -```python -client.query_points_groups( - collection_name="chunks", - # Same as in the regular search() API - query=[1.1], - # Grouping parameters - group_by="document_id", # Path of the field to group by - limit=2, # Max amount of groups - group_size=2, # Max amount of points per group - # Lookup parameters - with_lookup=models.WithLookup( - # Name of the collection to look up points in - collection="documents", - # Options for specifying what to bring from the payload - # of the looked up point, True by default - with_payload=["title", "text"], - # Options for specifying what to bring from the vector(s) - # of the looked up point, True by default - with_vectors=False, - ), -) +The embedding model plays a critical role in capturing semantic relationships in data. -``` +There are several embedding models you can choose from, and the [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) is a great resource for reference. Lightweight libraries like [FastEmbed](https://github.com/qdrant/fastembed) support the generation of vector embeddings using [popular text embedding models](https://qdrant.github.io/fastembed/examples/Supported_Models/#supported-text-embedding-models). 
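+As a small illustration of the FastEmbed route mentioned above, here is a minimal sketch; the model name is just one example from the supported list, not a recommendation for every use case.

+```python
+from fastembed import TextEmbedding
+
+# "BAAI/bge-small-en-v1.5" is one of the supported models; swap it for a
+# domain-appropriate model after checking retrieval quality on your own data.
+model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
+
+documents = [
+    "Qdrant is a vector database for similarity search.",
+    "RAG systems retrieve context before generating an answer.",
+]
+embeddings = list(model.embed(documents))  # one numpy vector per document
+print(len(embeddings), embeddings[0].shape)
+```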
-```typescript
-import { QdrantClient } from "@qdrant/js-client-rest";
+When choosing an embedding model, consider **retrieval performance** and **domain specificity**. You need to ensure that the model can capture semantic nuances, which affects the retrieval performance. For specialized domains, you may need to select or train a custom embedding model.

-const client = new QdrantClient({ host: "localhost", port: 6333 });
+### Your retrieval procedure isn’t optimized

-client.queryGroups("{collection_name}", {
-    query: [1.1],
-    group_by: "document_id",
-    limit: 2,
-    group_size: 2,
-    with_lookup: {
-        collection: "documents",
-        with_payload: ["title", "text"],
-        with_vectors: false,
-    },
-});
+![rag-eval-5](/blog/rag-evaluation-guide/rag-eval-5.png)

-```
+Semantic retrieval evaluation tests the effectiveness of your data retrieval. There are several metrics you can choose from:

-```rust
-use qdrant_client::qdrant::{with_payload_selector::SelectorOptions, QueryPointGroupsBuilder, WithLookupBuilder};
+- **Precision@k**: Measures the number of relevant documents in the top-k search results.
+- **Mean reciprocal rank (MRR)**: Considers the position of the first relevant document in the search results.
+- **Discounted cumulative gain (DCG) and normalized DCG (NDCG)**: Based on the relevance score of the documents.

-client
-    .query_groups(
-        QueryPointGroupsBuilder::new("{collection_name}", "document_id")
-            .query(vec![0.2, 0.1, 0.9, 0.7])
-            .limit(2u64)
-            .limit(2u64)
-            .with_lookup(
-                WithLookupBuilder::new("documents")
-                    .with_payload(SelectorOptions::Include(
-                        vec!["title".to_string(), "text".to_string()].into(),
-                    ))
-                    .with_vectors(false),
-            ),
-    )
-    .await?;
+By evaluating the retrieval quality using these metrics, you can assess the effectiveness of your retrieval step. For evaluating the ANN algorithm specifically, Precision@k is the most appropriate metric, as it directly measures how well the algorithm approximates exact search results.

-```
+#### Solution: Choose the best retrieval algorithm

-```java
-import java.util.List;
+Each new LLM with a larger context window claims to render RAG obsolete. However, studies like "[Lost in the Middle](https://arxiv.org/abs/2307.03172)" demonstrate that feeding entire documents to LLMs can diminish their ability to answer questions effectively. Therefore, the retrieval algorithm is crucial for fetching the most relevant data in the RAG system.

-import io.qdrant.client.grpc.Points.QueryPointGroups;
-import io.qdrant.client.grpc.Points.WithLookup;
+**Configure dense vector retrieval:** You need to choose the right [similarity metric](https://qdrant.tech/documentation/concepts/search/) to get the best retrieval quality. Metrics used in dense vector retrieval include Cosine Similarity, Dot Product, Euclidean Distance, and Manhattan Distance.

-import static io.qdrant.client.QueryFactory.nearest;
-import static io.qdrant.client.WithVectorsSelectorFactory.enable;
-import static io.qdrant.client.WithPayloadSelectorFactory.include;
+**Use sparse vectors & hybrid search where needed**: For sparse vectors, the algorithm choice of BM-25, SPLADE, or BM-42 will affect retrieval quality. Hybrid Search combines dense vector retrieval with sparse vector-based search. 
-```csharp
-using Qdrant.Client;
-using Qdrant.Client.Grpc;
+**Leverage simple filtering:** This approach combines dense vector search with attribute filtering to narrow down the search results.

-var client = new QdrantClient("localhost", 6334);
+**Set correct hyperparameters:** Your Chunking Strategy, Chunk Size, Overlap, and Retrieval Window Size significantly impact the retrieval step and must be tailored to specific requirements.

-await client.SearchGroupsAsync(
-    collectionName: "{collection_name}",
-    vector: new float[] { 0.2f, 0.1f, 0.9f, 0.7f},
-    groupBy: "document_id",
-    limit: 2,
-    groupSize: 2,
-    withLookup: new WithLookup
-    {
-        Collection = "documents",
-        WithPayload = new WithPayloadSelector
-        {
-            Include = new PayloadIncludeSelector { Fields = { new string[] { "title", "text" } } }
-        },
-        WithVectors = false
-    }
-);
+**Introduce re-ranking:** Such methods may use cross-encoder models to re-score the results returned by vector search. Re-ranking can significantly improve retrieval and thus RAG system performance.

-```
+### LLM generation performance is suboptimal

-```go
-import (
-    "context"
+The LLM is responsible for generating responses based on the retrieved context. The choice of LLM ranges from OpenAI’s GPT models to open-weight models. The LLM you choose will significantly influence the performance of a RAG system. Here are some areas to watch out for:

-    "github.com/qdrant/go-client/qdrant"
-)
+- **Response quality**: The LLM selection will influence the fluency, coherence, and factual accuracy of generated responses.
+- **System performance**: Inference speeds vary between LLMs. Slower inference speeds can impact response times.
+- **Domain knowledge**: For domain-specific RAG applications, you may need LLMs trained on that domain. Some LLMs are easier to fine-tune than others.

-client, err := qdrant.NewClient(&qdrant.Config{
-    Host: "localhost",
-    Port: 6334,
-})
+#### Solution: Test and Critically Analyze LLM Quality

-client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{
-    CollectionName: "{collection_name}",
-    Query:          qdrant.NewQuery(0.2, 0.1, 0.9, 0.7),
-    GroupBy:        "document_id",
-    GroupSize:      qdrant.PtrOf(uint64(2)),
-    WithLookup: &qdrant.WithLookup{
-        Collection:  "documents",
-        WithPayload: qdrant.NewWithPayloadInclude("title", "text"),
-    },
-})
+The [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) can help guide your LLM selection. On this leaderboard, LLMs are ranked based on their scores on various benchmarks, such as IFEval, GPQA, MMLU-PRO, and others.

-```
+Evaluating LLMs involves several key metrics and methods. You can use these metrics or frameworks to evaluate if the LLM is delivering high-quality, relevant, and reliable responses.

-For the `with_lookup` parameter, you can also use the shorthand `with_lookup="documents"` to bring the whole payload and vector(s) without explicitly specifying it.
+**Table 1:** Methods for Measuring LLM Response Quality

-The looked up result will show up under `lookup` in each group.
+| **Method** | **Description** |
+| --- | --- |
+| [Perplexity](https://huggingface.co/spaces/evaluate-metric/perplexity) | Measures how well the model predicts text. |
+| Human Evaluation | Rates responses based on relevance, coherence and quality. |
+| [BLEU](https://en.wikipedia.org/wiki/BLEU) | Used in translation tasks to compare generated output with reference translations. Higher scores (0-1) indicate better performance. 
| +| [ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)) | Evaluates summary quality by comparing generated summaries with reference summaries, calculating precision, recall and F1-score. | +| [EleutherAI](https://github.com/EleutherAI/lm-evaluation-harness) | A framework to test LLMs on different evaluation tasks. | +| [HELM](https://github.com/stanford-crfm/helm) | A framework to evaluate LLMs, focusing on 12 different aspects that are important in real-world model deployments. | +| Diversity | Assesses the variety and uniqueness of responses, with higher scores indicating more diverse outputs. | -```json -{ - "result": { - "groups": [\ - {\ - "id": 1,\ - "hits": [\ - { "id": 0, "score": 0.91 },\ - { "id": 1, "score": 0.85 }\ - ],\ - "lookup": {\ - "id": 1,\ - "payload": {\ - "title": "Document A",\ - "text": "This is document A"\ - }\ - }\ - },\ - {\ - "id": 2,\ - "hits": [\ - { "id": 1, "score": 0.85 }\ - ],\ - "lookup": {\ - "id": 2,\ - "payload": {\ - "title": "Document B",\ - "text": "This is document B"\ - }\ - }\ - }\ - ] - }, - "status": "ok", - "time": 0.001 -} +Many LLM evaluation frameworks offer flexibility to accommodate domain-specific or custom evaluations, addressing the key RAG metrics for your use case. These frameworks utilize either LLM-as-a-Judge or the [OpenAI Moderation API](https://platform.openai.com/docs/guides/moderation/overview) to ensure the moderation of responses from your AI applications. -``` +## Working with custom datasets -Since the lookup is done by matching directly with the point id, the lookup collection must be pre-populated with points where the `id` matches the `group_by` value (e.g., document\_id) from your primary collection. +![rag-eval-3](/blog/rag-evaluation-guide/rag-eval-3.png) -Any group id that is not an existing (and valid) point id in the lookup collection will be ignored, and the `lookup` field will be empty. +First, create question and ground-truth answer pairs from source documents for the evaluation dataset. Ground-truth answers are the precise responses you expect from the RAG system. You can create these in multiple ways: -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#random-sampling) Random Sampling +- **Hand-crafting your dataset:** Manually create questions and answers. +- **Use LLM to create synthetic data:** Leverage LLMs like [T5](https://huggingface.co/docs/transformers/en/model_doc/t5) or OpenAI APIs. +- **Use the Ragas framework**: [This method](https://docs.ragas.io/en/stable/concepts/test_data_generation/) uses an LLM to generate various question types for evaluating RAG systems. +- **Use FiddleCube**: [FiddleCube](https://www.fiddlecube.ai/) is a system that can help generate a range of question types aimed at different aspects of the testing process. -_Available as of v1.11.0_ +Once you have created a dataset, collect the retrieved context and the final answer generated by your RAG pipeline for each question. -In some cases it might be useful to retrieve a random sample of points from the collection. This can be useful for debugging, testing, or for providing entry points for exploration. +**Figure 5:** *Here is an example of four evaluation metrics:* -Random sampling API is a part of [Universal Query API](https://qdrant.tech/documentation/concepts/search/#query-api) and can be used in the same way as regular search API. +- **question**: A set of questions based on the source document. +- **ground_truth**: The anticipated accurate answers to the queries. 
+- **context**: The context retrieved by the RAG pipeline for each query.
+- **answer**: The answer generated by the RAG pipeline for each query.

-httppythontypescriptrustjavacsharpgo
+![image5.png](/blog/rag-evaluation-guide/image5.png)

-```http
-POST /collections/{collection_name}/points/query
-{
-  "query": {
-    "sample": "random"
-  }
-}
+## Conclusion: What to look for when running tests

-```
+To understand if a RAG system is functioning as it should, you want to ensure:

-```python
-from qdrant_client import QdrantClient, models
+- **Retrieval effectiveness**: The information retrieved is semantically relevant.
+- **Relevance of responses:** The generated response is meaningful.
+- **Coherence of generated responses**: The response is coherent and logically connected.
+- **Up-to-date responses**: The response is based on current data.

-sampled = client.query_points(
-    collection_name="{collection_name}",
-    query=models.SampleQuery(sample=models.Sample.RANDOM)
-)
+### Evaluating a RAG application from End-to-End (E2E)

-```
+The End-to-End (E2E) evaluation assesses the overall performance of the entire Retrieval-Augmented Generation (RAG) system. Here are some of the key factors you can measure:

-```typescript
-import { QdrantClient } from "@qdrant/js-client-rest";
+- **Helpfulness**: Measures how well the system's responses assist users in achieving their goals.
+- **Groundedness**: Ensures that the responses are based on verifiable information from the retrieved context.
+- **Latency**: Monitors the response time of the system to ensure it meets the required speed and efficiency standards.
+- **Conciseness**: Evaluates whether the responses are brief yet comprehensive.
+- **Consistency**: Ensures that the system consistently delivers high-quality responses across different queries and contexts.

-const client = new QdrantClient({ host: "localhost", port: 6333 });
+For instance, you can measure the quality of the generated responses with metrics like **Answer Semantic Similarity** and **Correctness**.

-const sampled = await client.query("{collection_name}", {
-  query: {
-    sample: "random",
-  },
-});
+Measuring semantic similarity will tell you the difference between the generated answer and the ground truth, ranging from 0 to 1. This uses cosine similarity to evaluate alignment in the vector space.

-```
+Checking answer correctness evaluates the overall agreement between the generated answer and the ground truth, combining factual correctness (measured by the F1 score) and answer similarity score.

-```rust
-use qdrant_client::Qdrant;
-use qdrant_client::qdrant::{Query, QueryPointsBuilder};
-let client = Qdrant::from_url("http://localhost:6334").build()?;
+### RAG evaluation is just the beginning

-let sampled = client
-    .query(
-        QueryPointsBuilder::new("{collection_name}")
-            .query(Query::new_sample(Sample::Random))
-    )
-    .await?;
+![rag-eval-4](/blog/rag-evaluation-guide/rag-eval-4.png)

-```
+RAG evaluation is just the beginning. It lays the foundation for continuous improvement and long-term success of your system. Initially, it can help you identify and address immediate issues related to retrieval accuracy, contextual relevance, and response quality. However, as your RAG system evolves and is subjected to new data, use cases, and user interactions, you need to continue testing and calibrating. 
-```java -import static io.qdrant.client.QueryFactory.sample; +By continuously evaluating your application, you can ensure that the system adapts to changing requirements and maintains its performance over time. You should regularly calibrate all components such as embedding models, retrieval algorithms, and the LLM itself. This iterative process will help you identify and fix emerging problems, optimize system parameters, and incorporate user feedback. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.Sample; +The practice of RAG evaluation is in the early stages of development. Keep this guide and wait as more techniques, models, and evaluation frameworks are developed. We strongly recommend you incorporate them into your evaluation process. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**Want to download a printer-friendly version of this guide? [Fill out this form](/rag/rag-evaluation-guide/#form) and we will email you the PDF.** -client - .queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(sample(Sample.Random)) - .build()) - .get(); +### Helpful Links +- Join our community on [Discord](https://discord.com/invite/qdrant) +- Check out our [latest articles](https://qdrant.tech/articles/) +- Try a [free Qdrant cluster](https://cloud.qdrant.io/login) +- Choose the right deployment option for your application. [Talk to sales](https://qdrant.tech/contact-us/) -``` +<|page-334-lllmstxt|> +![qdrant-qatech-1](/blog/case-study-qatech/qdrant-qatech-1.png) -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +[QA.tech](https://qa.tech/), a company specializing in AI-driven automated testing solutions, found that building and **fully testing web applications, especially end-to-end, can be complex and time-consuming**. Unlike unit tests, end-to-end tests reveal what’s actually happening in the browser, often uncovering issues that other methods miss. -var client = new QdrantClient("localhost", 6334); +Traditional solutions like hard-coded tests are not only labor-intensive to set up but also challenging to maintain over time. Alternatively, hiring QA testers can be a solution, but for startups, it quickly becomes a bottleneck. With every release, more testers are needed, and if testing is outsourced, managing timelines and ensuring quality becomes even harder. -await client.QueryAsync(collectionName: "{collection_name}", query: Sample.Random); +To address this, QA.tech has developed **testing agents** that perform tasks on the browser just like a user would - for example, purchasing a ticket on a travel app. These agents navigate the entire booking process, from searching for flights to completing the purchase, all while assessing their success. **They document errors, record the process, and flag issues for developers to review.** With access to console logs and network calls, developers can easily analyze each step, quickly understanding and debugging any issues that arise. -``` +![qdrant-qatech-2](/blog/case-study-qatech/qdrant-qatech-2.png) -```go -import ( - "context" +*Output from a QA.tech AI agent* - "github.com/qdrant/go-client/qdrant" -) +## What prompted QA.tech to use a vector database? 
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +QA.tech initially used **pgvector** for simpler vector use cases but encountered scalability limitations as their requirements grew, prompting them to adopt Qdrant. They needed a [vector database](/qdrant-vector-database/) capable of handling high-velocity, real-time analysis to support their AI agents, which operate within an analysis layer that observes and interprets actions across web pages. This analysis layer relies heavily on multimodal models and substantial subprocessing to enable the AI agent to make informed, real-time decisions. -client.QueryGroups(context.Background(), &qdrant.QueryPointGroups{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuerySample(qdrant.Sample_Random), -}) +In some web interfaces, hundreds of actions can occur, and processing them in real time - especially with each click - can be slow. Dynamic web elements and changing identifiers further complicate this, making traditional methods unreliable. To address these challenges, QA.tech trained custom embeddings on specific actions, which significantly accelerates decision-making. -``` +This setup requires frequent embedding lookups, generating a high volume of database calls for each interaction. As **Vilhelm von Ehrenheim from QA.tech** explained: -## [Anchor](https://qdrant.tech/documentation/concepts/search/\#query-planning) Query planning +> “You get a lot of embeddings, a lot of calls, a lot of lookups towards the database for every click, and that needs to scale nicely.” -Depending on the filter used in the search - there are several possible scenarios for query execution. -Qdrant chooses one of the query execution options depending on the available indexes, the complexity of the conditions and the cardinality of the filtering result. -This process is called query planning. +Qdrant’s fast, scalable [vector search](/advanced-search/) enables QA.tech to handle these high-velocity lookups seamlessly, ensuring that the agent remains responsive and capable of making quick, accurate decisions in real time. -The strategy selection process relies heavily on heuristics and can vary from release to release. -However, the general principles are: +## Why QA.tech chose Qdrant for its AI Agent platform -- planning is performed for each segment independently (see [storage](https://qdrant.tech/documentation/concepts/storage/) for more information about segments) -- prefer a full scan if the amount of points is below a threshold -- estimate the cardinality of a filtered result before selecting a strategy -- retrieve points using payload index (see [indexing](https://qdrant.tech/documentation/concepts/indexing/)) if cardinality is below threshold -- use filterable vector index if the cardinality is above a threshold +QA.tech’s AI Agents handle high-velocity web actions, requiring efficient real-time operations and scalable infrastructure. The team faced challenges with managing network overhead, CPU load, and the need to store [multiple embeddings](/documentation/concepts/vectors/#multivectors) for different use cases. Qdrant provided the solution to address these issues. -You can adjust the threshold using a [configuration file](https://github.com/qdrant/qdrant/blob/master/config/config.yaml), as well as independently for each collection. +**Reducing Network Overhead with Batch Operations** -##### Was this page useful? 
+Handling hundreds of simultaneous actions on a web interface individually created significant network overhead. Von Ehrenheim explained that “doing all of those in separate calls creates a lot of network overhead.” Qdrant’s batch operations allowed QA.tech to process multiple actions at once, reducing network traffic and improving efficiency. This capability is essential for AI Agents, where real-time responsiveness is critical. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +**Optimizing CPU Load for Embedding Processing** -Thank you for your feedback! 🙏 +PostgreSQL’s transaction guarantees resulted in high CPU usage when processing embeddings, especially at scale. Von Ehrenheim noted that adding many new embeddings "requires much more CPU," which led to performance bottlenecks. Qdrant’s architecture efficiently handled large-scale embeddings, preventing CPU overload and ensuring smooth, uninterrupted performance, a key requirement for AI Agents. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +**Managing Multiple Embeddings for Different Use Cases** -On this page: +AI Agents need flexibility in handling both real-time actions and context-aware tasks. QA.tech required different embeddings for immediate action processing and deeper semantic searches. Von Ehrenheim mentioned, *“We use one embedding for high-velocity actions, but I also want to store other types of embeddings for analytical purposes.”* -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +> Qdrant’s ability to store multiple embeddings per data point allowed QA.tech to meet these diverse needs without added complexity. -× -[Powered by](https://qdrant.tech/) +## How QA.tech Overcame Key Challenges in AI Agent Development -<|page-171-lllmstxt|> -## quantization -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Quantization +Building reliable AI agents presents unique complexities, particularly as workflows grow more multi-step and dynamic. -# [Anchor](https://qdrant.tech/documentation/guides/quantization/\#quantization) Quantization +> "The more steps you ask an agent to take, the harder it becomes to ensure consistent performance," Vilhelm von Ehrenheim, Co-Founder of QA.tech. -Quantization is an optional feature in Qdrant that enables efficient storage and search of high-dimensional vectors. -By transforming original vectors into a new representations, quantization compresses data while preserving close to original relative distances between vectors. -Different quantization methods have different mechanics and tradeoffs. We will cover them in this section. +Each additional action adds layers of interdependent variables, creating pathways that can easily lead to errors if not managed carefully. -Quantization is primarily used to reduce the memory footprint and accelerate the search process in high-dimensional vector spaces. 
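The two capabilities described above, batched writes and multiple embeddings per point, map directly onto batch upserts and named vectors in Qdrant. The sketch below only illustrates that pattern with the Python client; the collection name, vector names, sizes, and payload fields are hypothetical rather than QA.tech's actual configuration.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Two named vectors per point: one for fast action matching,
# one for slower, analytical semantic queries (names and sizes are illustrative).
client.create_collection(
    collection_name="page_actions",
    vectors_config={
        "action": models.VectorParams(size=256, distance=models.Distance.COSINE),
        "analytics": models.VectorParams(size=1024, distance=models.Distance.COSINE),
    },
)

# Upsert all actions observed on a page in a single batched call
# instead of one request per action, keeping network overhead low.
client.upsert(
    collection_name="page_actions",
    points=[
        models.PointStruct(
            id=i,
            vector={"action": [0.1] * 256, "analytics": [0.2] * 1024},
            payload={"selector": f"#element-{i}", "page": "checkout"},
        )
        for i in range(200)
    ],
)
```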
-In the context of the Qdrant, quantization allows you to optimize the search engine for specific use cases, striking a balance between accuracy, storage efficiency, and search speed. +Von Ehrenheim also points out the limitations of current large language models (LLMs), noting that *“LLMs are getting more powerful, but they still struggle with multi-step reasoning and for example handling subtle visual changes like dark mode or adaptive UIs.”* These challenges make it essential for agents to have precise planning capabilities and context awareness, which QA.tech has addressed by implementing custom embeddings and multimodal models. -There are tradeoffs associated with quantization. -On the one hand, quantization allows for significant reductions in storage requirements and faster search times. -This can be particularly beneficial in large-scale applications where minimizing the use of resources is a top priority. -On the other hand, quantization introduces an approximation error, which can lead to a slight decrease in search quality. -The level of this tradeoff depends on the quantization method and its parameters, as well as the characteristics of the data. +*“This is where scalable, adaptable infrastructure becomes crucial,”* von Ehrenheim adds. Qdrant has been instrumental for QA.tech, providing stable, high-performance vector search to support the demanding workflows. **“With Qdrant, we’re able to handle these complex, high-velocity tasks without compromising on reliability.”** -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#scalar-quantization) Scalar Quantization +<|page-335-lllmstxt|> +| Time: 30 min | Level: Advanced | Notebook: [GitHub](https://github.com/qdrant/examples/blob/master/colpali-and-binary-quantization/colpali_demo_binary.ipynb) | +| --- | ----------- | ----------- | -_Available as of v1.1.0_ +It’s no secret that even the most modern document retrieval systems have a hard time handling visually rich documents like **PDFs, containing tables, images, and complex layouts.** -Scalar quantization, in the context of vector search engines, is a compression technique that compresses vectors by reducing the number of bits used to represent each vector component. +ColPali introduces a multimodal retrieval approach that uses **Vision Language Models (VLMs)** instead of the traditional OCR and text-based extraction. -For instance, Qdrant uses 32-bit floating numbers to represent the original vector components. Scalar quantization allows you to reduce the number of bits used to 8. -In other words, Qdrant performs `float32 -> uint8` conversion for each vector component. -Effectively, this means that the amount of memory required to store a vector is reduced by a factor of 4. +By processing document images directly, it creates **multi-vector embeddings** from both the visual and textual content, capturing the document's structure and context more effectively. This method outperforms traditional techniques, as demonstrated by the [**Visual Document Retrieval Benchmark (ViDoRe)**](https://huggingface.co/vidore). -In addition to reducing the memory footprint, scalar quantization also speeds up the search process. -Qdrant uses a special SIMD CPU instruction to perform fast vector comparison. -This instruction works with 8-bit integers, so the conversion to `uint8` allows Qdrant to perform the comparison faster. +**Before we go any deeper, watch our short video:** -The main drawback of scalar quantization is the loss of accuracy. 
The `float32 -> uint8` conversion introduces an error that can lead to a slight decrease in search quality. -However, this error is usually negligible, and tends to be less significant for high-dimensional vectors. -In our experiments, we found that the error introduced by scalar quantization is usually less than 1%. + -However, this value depends on the data and the quantization parameters. -Please refer to the [Quantization Tips](https://qdrant.tech/documentation/guides/quantization/#quantization-tips) section for more information on how to optimize the quantization parameters for your use case. +## Standard Retrieval vs ColPali -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#binary-quantization) Binary Quantization +The standard approach starts by running **Optical Character Recognition (OCR)** to extract the text from a document. Once the text is extracted, a layout detection model interprets the structure, which is followed by chunking the text into smaller sections for embedding. This method works adequately for documents where the text content is the primary focus. -_Available as of v1.5.0_ +Imagine you have a PDF packed with complex layouts, tables, and images, and you need to extract meaningful information efficiently. Traditionally, this would involve several steps: -Binary quantization is an extreme case of scalar quantization. -This feature lets you represent each vector component as a single bit, effectively reducing the memory footprint by a **factor of 32**. +1. **Text Extraction:** Using OCR to pull words from each page. +2. **Layout Detection:** Identifying page elements like tables, paragraphs, and titles. +3. **Chunking:** Experimenting with methods to determine the best fit for your use case. +4. **Embedding Creation:** Finally generating and storing the embeddings. -This is the fastest quantization method, since it lets you perform a vector comparison with a few CPU instructions. +### Why is ColPali Better? -Binary quantization can achieve up to a **40x** speedup compared to the original vectors. +This entire process can require too many steps, especially for complex documents, with each page often taking over seven seconds to process. For text-heavy documents, this approach might suffice, but real-world data is often rich and complex, making traditional extraction methods less effective. -However, binary quantization is only efficient for high-dimensional vectors and require a centered distribution of vector components. +This is where ColPali comes into play. **ColPali, or Contextualized Late Interaction Over PaliGemma**, uses a vision language model (VLM) to simplify and enhance the document retrieval process. -At the moment, binary quantization shows good accuracy results with the following models: +Instead of relying on text-only methods, ColPali generates contextualized **multivector embeddings** directly from an image of a document page. The VLM considers visual elements, structure, and text all at once, creating a holistic representation of each page. 
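Qdrant can store these page-level multivector embeddings natively: a point may hold one vector per image patch, and similarity is computed with the MaxSim operator at search time. The sketch below, using the Python client, mirrors the configuration described later in this post (128-dimensional patch vectors, Binary Quantization); the collection name and the query values are placeholders.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# One point per document page; each page is a multivector
# (one 128-dimensional vector per image patch), compared with MaxSim.
client.create_collection(
    collection_name="colpali_pages",
    vectors_config=models.VectorParams(
        size=128,
        distance=models.Distance.COSINE,
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
    ),
    quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True)
    ),
)

# At query time, the ColPali query embedding is itself a multivector:
# one 128-dimensional vector per query token.
hits = client.query_points(
    collection_name="colpali_pages",
    query=[[0.1] * 128, [0.2] * 128],  # placeholder query token vectors
    limit=5,
)
```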
-- OpenAI `text-embedding-ada-002` \- 1536d tested with [dbpedia dataset](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) achieving 0.98 recall@100 with 4x oversampling -- Cohere AI `embed-english-v2.0` \- 4096d tested on Wikipedia embeddings - 0.98 recall@50 with 2x oversampling +## How ColPali Works Under the Hood +![Qdrant and Colpali](/blog/qdrant-colpali/qdrant-colpali-1.png) -Models with a lower dimensionality or a different distribution of vector components may require additional experiments to find the optimal quantization parameters. +Rather than relying on OCR, ColPali **processes the entire document as an image** using a Vision Encoder. It creates multi-vector embeddings that capture both the textual content and the visual structure of the document which are then passed through a Large Language Model (LLM), which integrates the information into a representation that retains both text and visual features. -We recommend using binary quantization only with rescoring enabled, as it can significantly improve the search quality -with just a minor performance impact. -Additionally, oversampling can be used to tune the tradeoff between search speed and search quality in the query time. +Here’s a step-by-step look at the ColPali architecture and how it enhances document retrieval: -### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#binary-quantization-as-hamming-distance) Binary Quantization as Hamming Distance +1. **Image Preprocessing:** The input image is divided into a 32x32 grid, resulting in 1,024 patches. +2. **Contextual Transformation:** Each patch undergoes transformations to capture local and global context and is represented by a 128-dimensional vector. +3. **Query Processing:** When a text query is sent, ColPali generates token-level embeddings for the query, comparing it with document patches using a similarity matrix (specifically MaxSim). +4. **MaxSim Similarity:** This similarity matrix computes similarities for each query token in every document patch, selecting maximum similarities to efficiently retrieve relevant pages. This late interaction approach helps ColPali capture intricate context across a document’s structure and text. -The additional benefit of this method is that you can efficiently emulate Hamming distance with dot product. +> ColPali’s late interaction strategy is inspired by ColBERT and improves search by analyzing layout and textual content in a single pass. -Specifically, if original vectors contain `{-1, 1}` as possible values, then the dot product of two vectors is equal to the Hamming distance by simply replacing `-1` with `0` and `1` with `1`. +## Optimizing with Binary Quantization +![Qdrant and Colpali](/blog/qdrant-colpali/qdrant-colpali-3.png) -**Sample truth table** +Binary Quantization further enhances the ColPali pipeline by **reducing storage and computational load** without compromising search performance. Binary Quantization, unlike Scalar Quantization, compresses vectors more aggressively, which can speed up search times and reduce memory usage. -| Vector 1 | Vector 2 | Dot product | -| --- | --- | --- | -| 1 | 1 | 1 | -| 1 | -1 | -1 | -| -1 | 1 | -1 | -| -1 | -1 | 1 | +In an experiment based on a [**blog post by Daniel Van Strien**](https://danielvanstrien.xyz/posts/post-with-code/colpali-qdrant/2024-10-02_using_colpali_with_qdrant.html), where ColPali and Qdrant were used to search a UFO document dataset, the results were compelling. 
By using Binary Quantization along with rescoring and oversampling techniques, we saw search time reduced by nearly half compared to Scalar Quantization, while maintaining similar accuracy. -| Vector 1 | Vector 2 | Hamming distance | -| --- | --- | --- | -| 1 | 1 | 0 | -| 1 | 0 | 1 | -| 0 | 1 | 1 | -| 0 | 0 | 0 | +## Using ColPali with Qdrant -As you can see, both functions are equal up to a constant factor, which makes similarity search equivalent. -Binary quantization makes it efficient to compare vectors using this representation. +**Now it's time to try the code.**
+Here’s a simplified Notebook to test ColPali for yourself: -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#product-quantization) Product Quantization +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sabrinaaquino/colpali-qdrant-demo/blob/main/colpali_demo_binary.ipynb) -_Available as of v1.2.0_ +Our goal is to go through a dataset of multilingual newspaper articles like the ones below. We will detect which images contain text about **UFO's** and **Top Secret** events. -Product quantization is a method of compressing vectors to minimize their memory usage by dividing them into -chunks and quantizing each segment individually. -Each chunk is approximated by a centroid index that represents the original vector component. -The positions of the centroids are determined through the utilization of a clustering algorithm such as k-means. -For now, Qdrant uses only 256 centroids, so each centroid index can be represented by a single byte. +![Qdrant and Colpali](/blog/qdrant-colpali/qdrant-colpali-4.png) -Product quantization can compress by a more prominent factor than a scalar one. -But there are some tradeoffs. Product quantization distance calculations are not SIMD-friendly, so it is slower than scalar quantization. -Also, product quantization has a loss of accuracy, so it is recommended to use it only for high-dimensional vectors. +*The full dataset is accessible from the notebook.* -Please refer to the [Quantization Tips](https://qdrant.tech/documentation/guides/quantization/#quantization-tips) section for more information on how to optimize the quantization parameters for your use case. +### Procedure -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#how-to-choose-the-right-quantization-method) How to choose the right quantization method +1. **Setup ColPali and Qdrant:** Import the necessary libraries, including a fine-tuned model optimized for your dataset (in this case, a UFO document set). +2. **Dataset Preparation:** Load your document images into ColPali, previewing complex images to appreciate the challenge for traditional retrieval methods. +3. **Qdrant Configuration:** Define your Qdrant collection, setting vector dimensions to 128. Enable Binary Quantization to optimize memory usage. +4. **Batch Uploading Vectors:** Use a retry checkpoint to handle any exceptions during indexing. Batch processing allows you to adjust batch size based on available GPU resources. +5. **Query Processing and Search:** Encode queries as multivectors for Qdrant. Set up rescoring and oversampling to fine-tune accuracy while optimizing speed. -Here is a brief table of the pros and cons of each quantization method: +### Results -| Quantization method | Accuracy | Speed | Compression | -| --- | --- | --- | --- | -| Scalar | 0.99 | up to x2 | 4 | -| Product | 0.7 | 0.5 | up to 64 | -| Binary | 0.95\* | up to x40 | 32 | +> Success! Tests shows that search time is 2x faster than with Scalar Quantization. -`*` \- for compatible models +This is significantly faster than with Scalar Quantization, and we still retrieved the top document matches with remarkable accuracy. -- **Binary Quantization** is the fastest method and the most memory-efficient, but it requires a centered distribution of vector components. It is recommended to use with tested models only. -- **Scalar Quantization** is the most universal method, as it provides a good balance between accuracy, speed, and compression. 
It is recommended as default quantization if binary quantization is not applicable. -- **Product Quantization** may provide a better compression ratio, but it has a significant loss of accuracy and is slower than scalar quantization. It is recommended if the memory footprint is the top priority and the search speed is not critical. +However, keep in mind that this is just a quick experiment. Performance may vary, so it's important to test Binary Quantization on your own datasets to see how it performs for your specific use case. -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#setting-up-quantization-in-qdrant) Setting up Quantization in Qdrant +That said, it's promising to see Binary Quantization maintaining search quality while potentially offering performance improvements with ColPali. -You can configure quantization for a collection by specifying the quantization parameters in the `quantization_config` section of the collection configuration. +## Future Directions with ColPali +![Qdrant and Colpali](/blog/qdrant-colpali/qdrant-colpali-2.png) -Quantization will be automatically applied to all vectors during the indexation process. -Quantized vectors are stored alongside the original vectors in the collection, so you will still have access to the original vectors if you need them. +ColPali offers a promising, streamlined approach to document retrieval, especially for visually rich, complex documents. Its integration with Qdrant enables efficient large-scale vector storage and retrieval, ideal for machine learning applications requiring sophisticated document understanding. -_Available as of v1.1.1_ +If you’re interested in trying ColPali on your own datasets, join our [**vector search community on Discord**](https://qdrant.to/discord) for discussions, tutorials, and more insights into advanced document retrieval methods. Let us know in how you’re using ColPali or what applications you envision for it! -The `quantization_config` can also be set on a per vector basis by specifying it in a named vector. +Thank you for reading, and stay tuned for more insights on vector search! -### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#setting-up-scalar-quantization) Setting up Scalar Quantization +**References:** -To enable scalar quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. +[1] Faysse, M., Sibille, H., Wu, T., Omrani, B., Viaud, G., Hudelot, C., Colombo, P. (2024). **ColPali: Efficient Document Retrieval with Vision Language Models.** arXiv. https://doi.org/10.48550/arXiv.2407.01449 -When enabling scalar quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it’s already defined. +[2] van Strien, D. (2024). **Using ColPali with Qdrant to index and search a UFO document dataset.** Published October 2, 2024. Blog post: https://danielvanstrien.xyz/posts/post-with-code/colpali-qdrant/2024-10-02_using_colpali_with_qdrant.html -httppythontypescriptrustjavacsharpgo +[3] Kacper Ɓukawski (2024). **Any Embedding Model Can Become a Late Interaction Model... If You Give It a Chance!** Qdrant Blog, August 14, 2024. 
Available at: https://qdrant.tech/articles/late-interaction-models/ -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "quantization_config": { - "scalar": { - "type": "int8", - "quantile": 0.99, - "always_ram": true - } - } -} +<|page-336-lllmstxt|> +![case-study-sprinklr-1](/blog/case-study-sprinklr/image1.png) -``` -```python -from qdrant_client import QdrantClient, models +[Sprinklr](https://www.sprinklr.com/), a leader in unified customer experience management (Unified-CXM), helps global brands engage customers meaningfully across more than 30 digital channels. To achieve this, Sprinklr needed a scalable solution for AI-powered search to support their AI applications, particularly in handling the vast data requirements of customer interactions. -client = QdrantClient(url="http://localhost:6333") +Raghav Sonavane, Associate Director of Machine Learning Engineering at Sprinklr, leads the Applied AI team, focusing on Generative AI (GenAI) and Retrieval-Augmented Generation (RAG). His team is responsible for training and fine-tuning in-house models and deploying advanced retrieval and generation systems for customer-facing applications like FAQ bots and other [GenAI-driven services](https://www.sprinklr.com/blog/how-sprinklr-uses-RAG/). The team provides all of these capabilities in a centralized platform to the Sprinklr product engineering teams. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - quantile=0.99, - always_ram=True, - ), - ), -) +![case-study-sprinklr-2](/blog/case-study-sprinklr/image2.png) -``` +*Figure:* Sprinklr’s RAG architecture -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Sprinklr’s platform is composed of four key product suites - Sprinklr Service, Sprinklr Marketing, Sprinklr Social, and Sprinklr Insights. Each suite is embedded with AI-first features such as assist agents, post-call analysis, and real-time analytics, which are crucial for managing large-scale contact center operations. “These AI-driven capabilities, supported by Qdrant’s advanced vector search, enhance Sprinklr’s customer-facing tools such as FAQ bots, transactional bots, conversational services, and product recommendation engines,” says Sonavane. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +These self-serve applications rely heavily on advanced vector search to analyze and optimize community content and refine knowledge bases, ensuring efficient and relevant responses. For customers requiring further assistance, Sprinklr equips support agents with powerful search capabilities, enabling them to quickly access similar cases and draw from past interactions, enhancing the quality and speed of customer support. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - quantization_config: { - scalar: { - type: "int8", - quantile: 0.99, - always_ram: true, - }, - }, -}); +## The Need for a Vector Database -``` +To support various AI-driven applications, Sprinklr needed an efficient vector database. "The key challenge was to provide the highest quality and fastest search capabilities for retrieval tasks across the board," explains Sonavane. 
-```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +Last year, Sprinklr undertook a comprehensive evaluation of its existing search infrastructure. The goals were to identify current capability gaps, benchmark performance for speed and cost, and explore opportunities to improve the developer experience through enhanced scalability and stronger data privacy controls. It became clear that an advanced vector database was essential to meet these needs, and Qdrant emerged as the ideal solution. -let client = Qdrant::from_url("http://localhost:6334").build()?; +### Why Qdrant? -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .quantization_config( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .quantile(0.99) - .always_ram(true), - ), - ) - .await?; +After evaluating several options of vector DBs, including Pinecone, Weaviate, and ElasticSearch, Sprinklr chose Qdrant for its: -``` +- **Developer-Friendly Documentation:** “Qdrant’s clear [documentation](https://qdrant.tech/documentation/) enabled our team to integrate it quickly into our workflows,” notes Sonavane. +- **High Customizability:** Qdrant provided Sprinklr with essential flexibility through high-level abstractions that allowed for extensive customizations. The diverse teams at Sprinklr, working on various GenAI applications, needed a solution that could adapt to different workloads. “The ability to fine-tune configurations at the collection level was crucial for our varied AI applications,” says Sonavane. Qdrant met this need by offering: -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; + - **Configuration for high-speed search** that fine-tunes settings for optimal performance. + - [**Quantized vectors**](https://qdrant.tech/documentation/guides/quantization/) for high-dimensional data workloads + - [**Memory map**](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage) for efficient search optimizing memory usage. +- **Speed and Cost Efficiency:** Qdrant provided the best combination of speed and cost, making it the most viable solution for Sprinklr’s needs. “We needed a solution that wouldn’t just meet our performance requirements but also keep costs in check, and Qdrant delivered on both fronts,” says Sonavane. +- **Enhanced Monitoring:** Qdrant’s monitoring tools further boosted system efficiency, allowing Sprinklr to maintain high performance across their platforms. 
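Those collection-level controls are ordinary Qdrant configuration options. As a rough sketch (the collection name, vector size, and thresholds are placeholders, not Sprinklr's actual settings), a collection combining on-disk vectors, scalar quantization, and memory-mapped storage could be created like this with the Python client:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="support_articles",
    vectors_config=models.VectorParams(
        size=768,
        distance=models.Distance.COSINE,
        on_disk=True,  # keep original vectors on disk via memory mapping
    ),
    # Keep a compact int8 copy of the vectors in RAM for fast scoring.
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,
            quantile=0.99,
            always_ram=True,
        )
    ),
    # Move larger segments to memory-mapped storage to bound RAM usage.
    optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000),
)
```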
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +## Implementation and Qdrant’s Performance -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setQuantile(0.99f) - .setAlwaysRam(true) - .build()) - .build()) - .build()) - .get(); +Sprinklr’s transition to Qdrant was carefully managed, starting with 10% of their workloads before gradually scaling up. The transition was seamless, thanks in part to Qdrant’s configurable [Web UI](https://qdrant.tech/documentation/interfaces/web-ui/), which allowed Sprinklr to fully utilize its capabilities within the existing infrastructure. -``` +“Qdrant’s ability to index [multiple vectors](https://qdrant.tech/documentation/concepts/vectors/#multivectors) simultaneously and retrieve and re-rank with precision brought significant improvements to our workflow,” Sonavane remarks. This feature reduced the need for repeated retrieval processes, significantly improving efficiency. Additionally, Qdrant’s [quantization](https://qdrant.tech/documentation/guides/quantization/) and [memory mapping](https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage) features enabled Sprinklr to reduce RAM usage, leading to substantial cost savings. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Qdrant now plays a key supportive role in enhancing Sprinklr’s vector search capabilities within its AI-driven applications, which is designed to be cloud- and LLM-agnostic. The platform supports various AI-driven tasks, from retrieval and re-ranking to serving advanced customer experiences. “Retrieval is the foundation of all our AI tasks, and Qdrant’s resilience and speed have made it an integral part of our system,” Sonavane emphasizes. Sprinklr operates [Qdrant as a managed service on AWS](https://qdrant.tech/cloud/), ensuring scalability, reliability, and ease of use. -var client = new QdrantClient("localhost", 6334); +### Key Outcomes with Qdrant -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - quantizationConfig: new QuantizationConfig - { - Scalar = new ScalarQuantization - { - Type = QuantizationType.Int8, - Quantile = 0.99f, - AlwaysRam = true - } - } -); +After rigorous internal evaluation, Sprinklr achieved the following results with Qdrant: -``` +- **30% Cost Reduction**: Internal benchmarking showed Qdrant reduced Sprinklr's retrieval infrastructure costs by 30%. +- **Improved Developer Efficiency**: Qdrant’s user-friendly environment made it easier to maintain instances, enhancing overall efficiency. -```go -import ( - "context" +The Sprinklr team conducted a thorough internal benchmark on applications requiring vector search across 10k to over 1M vectors with varying dimensions of vectors depending on the use case. 
The key results from these benchmarks include: + +- **Superior Write Performance**: Qdrant's write performance excelled in Sprinklr’s benchmark tests, with incremental indexing time for 100k to 1M vectors being less than 10% of Elasticsearch’s, making it highly efficient for handling updates and append queries in high-ingestion use cases. +- **Low Latency for Real-Time Applications:** In Sprinklr's benchmark, Qdrant delivered a P99 latency of 20ms for searches on 1 million vectors, making it ideal for real-time use cases like live chat, where Elasticsearch and Milvus both exceeded 100ms. +- **High Throughput for Heavy Query Loads**: In Sprinklr's benchmark, Qdrant handled up to 250 requests per second (RPS) under similar configurations, significantly outperforming Elasticsearch's 100 RPS, making it ideal for environments with heavy query loads. + +“Qdrant is a very fast and high quality retrieval system,” Sonavane points out. + +![case-study-sprinklr-3](/blog/case-study-sprinklr/image3.png) + +*Figure: P95 Query Time vs Mean Average Precision Benchmark Across Varying Index Sizes* + +## Outlook + +Looking ahead, the Applied AI team at Sprinklr is focused on developing Sprinklr Digital Twin technology for companies, organizations, and employees, aiming to seamlessly integrate AI agents with human workers in business processes. Sprinklr Digital Twins are powered by a process engine that incorporates personas, skills, tasks, and activities, designed to optimize operational efficiency. + +![case-study-sprinklr-4](/blog/case-study-sprinklr/image4.png) + +*Figure: Sprinklr Digital Twin* + +Vector search will play a crucial role, as each AI agent will have its own knowledge base, skill set, and tool set, enabling precise and autonomous task execution. The integration of Qdrant further enhances the system's ability to manage and utilize large volumes of data effectively. - "github.com/qdrant/go-client/qdrant" -) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## Benchmarking Conclusion -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - QuantizationConfig: qdrant.NewQuantizationScalar( - &qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - Quantile: qdrant.PtrOf(float32(0.99)), - AlwaysRam: qdrant.PtrOf(true), - }, - ), -}) +***Configuration Details:*** -``` +- We benchmarked applications requiring search on different sizes ranging from 10k to 1M+ vectors, with varying dimensions of vectors depending on the usage. Our infrastructure mainly consisted of Elasticsearch and in-memory Faiss vector search. -There are 3 parameters that you can specify in the `quantization_config` section: +Key Observations: -`type` \- the type of the quantized vector components. Currently, Qdrant supports only `int8`. +1. **Indexing Speed**: Qdrant indexes vectors rapidly, making it suitable for applications that require quick data ingestion. Among the alternatives tried, milvus was on par with qdrant in terms of indexing time for a given precision. The latest versions of Elasticsearch offer much improvement compared to previous versions, though not as efficient as Qdrant. + - **Write Performance:** For some of our use cases, update queries and append queries were significantly higher. For ES, an increase in the number of points had a severe impact on total upload time. 
For 100k to 1M vector index qdrant incremental indexing time was less than 10% of Elasticsearch. +2. **Low Latency**: Tail latencies are very critical for real-time applications such as live chat, requiring low P95 and P99 latencies. For a workload requiring search on 1 million vectors, qdrant provided inference latency of 20ms P99 whereas ES and Milvus were more than 100ms. +3. **High Throughput**: Qdrant handles a high number of requests per second, making it ideal for environments with heavy query loads. For similar configurations, Qdrant provided a throughput of 250 RPS whereas ES was around 100 RPS. -`quantile` \- the quantile of the quantized vector components. -The quantile is used to calculate the quantization bounds. -For instance, if you specify `0.99` as the quantile, 1% of extreme values will be excluded from the quantization bounds. +![case-study-sprinklr-5](/blog/case-study-sprinklr/image3.png) -Using quantiles lower than `1.0` might be useful if there are outliers in your vector components. -This parameter only affects the resulting precision and not the memory footprint. -It might be worth tuning this parameter if you experience a significant decrease in search quality. +![case-study-sprinklr-6](/blog/case-study-sprinklr/image6.png) -`always_ram` \- whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. -However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. +![case-study-sprinklr-7](/blog/case-study-sprinklr/image7.png) -In this case, you can set `always_ram` to `true` to store quantized vectors in RAM. +![case-study-sprinklr-8](/blog/case-study-sprinklr/image8.png) -### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#setting-up-binary-quantization) Setting up Binary Quantization +```json +data = [ -To enable binary quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. +{'system': 'Qdrant', 'index_size': '1,000', 'MAP': 0.98, 'P95 Time': 0.22, 'Mean Time': 0.1, 'QPS': 280, -When enabling binary quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it’s already defined. 
+'Upload Time': 1}, -httppythontypescriptrustjavacsharpgo +{'system': 'Qdrant', 'index_size': '10,000', 'MAP': 0.99, 'P95 Time': 0.16, 'Mean Time': 0.09, 'QPS': 330, -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 1536, - "distance": "Cosine" - }, - "quantization_config": { - "binary": { - "always_ram": true - } - } -} +'Upload Time': 5}, -``` +{'system': 'Qdrant', 'index_size': '100,000', 'MAP': 0.98, 'P95 Time': 0.3, 'Mean Time': 0.23, 'QPS': 145, -```python -from qdrant_client import QdrantClient, models +'Upload Time': 100}, -client = QdrantClient(url="http://localhost:6333") +{'system': 'Qdrant', 'index_size': '1,000,000', 'MAP': 0.99, 'P95 Time': 0.171, 'Mean Time': 0.162, 'QPS': 596, -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ), - ), -) +'Upload Time': 220}, -``` +{'system': 'ElasticSearch', 'index_size': '1,000', 'MAP': 0.99, 'P95 Time': 0.42, 'Mean Time': 0.32, 'QPS': 95, -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +'Upload Time': 10}, -const client = new QdrantClient({ host: "localhost", port: 6333 }); +{'system': 'ElasticSearch', 'index_size': '10,000', 'MAP': 0.98, 'P95 Time': 0.3, 'Mean Time': 0.24, 'QPS': 120, -client.createCollection("{collection_name}", { - vectors: { - size: 1536, - distance: "Cosine", - }, - quantization_config: { - binary: { - always_ram: true, - }, - }, -}); +'Upload Time': 50}, -``` +{'system': 'ElasticSearch', 'index_size': '100,000', 'MAP': 0.99, 'P95 Time': 0.48, 'Mean Time': 0.42, 'QPS': 80, -```rust -use qdrant_client::qdrant::{ - BinaryQuantizationBuilder, CreateCollectionBuilder, Distance, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +'Upload Time': 1100}, -let client = Qdrant::from_url("http://localhost:6334").build()?; +{'system': 'ElasticSearch', 'index_size': '1,000,000', 'MAP': 0.99, 'P95 Time': 0.37, 'Mean Time': 0.236, -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(1536, Distance::Cosine)) - .quantization_config(BinaryQuantizationBuilder::new(true)), - ) - .await?; +'QPS': 348, 'Upload Time': 1150} +] ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.BinaryQuantization; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +<|page-337-lllmstxt|> +[**Qdrant 1.12.0 is out!**](https://github.com/qdrant/qdrant/releases/tag/v1.12.0) Let's look at major new features and a few minor additions: -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**Distance Matrix API:** Efficiently calculate pairwise distances between vectors.
+**GUI Data Exploration:** Visually navigate your dataset and analyze vector relationships.<br/>
+**Faceting API:** Dynamically aggregate and count unique values in specific fields.
-client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(1536) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setBinary(BinaryQuantization.newBuilder().setAlwaysRam(true).build()) - .build()) - .build()) - .get(); +**Text Index on disk:** Reduce memory usage by storing text indexing data on disk.
+**Geo Index on disk:** Offload indexed geographic data on disk for memory efficiency. -``` +## Distance Matrix API for Data Insights +![distance-matrix-api](/blog/qdrant-1.12.x/distance-matrix-api.png) -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +> **Qdrant** is a similarity search engine. Our mission is to give you the tools to **discover and understand connections** between vast amounts of semantically relevant data -var client = new QdrantClient("localhost", 6334); +The **Distance Matrix API** is here to lay the groundwork for such tools. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 1536, Distance = Distance.Cosine }, - quantizationConfig: new QuantizationConfig - { - Binary = new BinaryQuantization { AlwaysRam = true } - } -); +In data exploration, tasks like [**clustering**](https://en.wikipedia.org/wiki/DBSCAN) and [**dimensionality reduction**](https://en.wikipedia.org/wiki/Dimensionality_reduction) rely on calculating distances between data points. -``` +**Use Case:** A retail company with 10,000 customers wants to segment them by purchasing behavior. Each customer is stored as a vector in Qdrant, but without a dedicated API, clustering would need 10,000 separate batch requests, making the process inefficient and costly. -```go -import ( - "context" +You can use this API to compute a **sparse matrix of distances** that is optimized for large datasets. Then, you can filter through the retrieved data to find the exact vector relationships that matter. - "github.com/qdrant/go-client/qdrant" -) +In terms of endpoints, we offer two different formats to show results: +- **Pairs** are simple, intutitive and ideal for graph representation. +- **Offsets** are more complex, but also native when defining CSR sparse matrices. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +### Output - Pairs -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 1536, - Distance: qdrant.Distance_Cosine, - }), - QuantizationConfig: qdrant.NewQuantizationBinary( - &qdrant.BinaryQuantization{ - AlwaysRam: qdrant.PtrOf(true), - }, - ), -}) +Use the `pairs` endpoint to compare 10 random point pairs from your dataset: +```http +POST /collections/{collection_name}/points/search/matrix/pairs +{ + "sample": 10, + "limit": 2 +} ``` +Configuring the `sample` will retrieve a random group of 10 points to compare. The `limit` is the number of semantic connections between points to consider. -`always_ram` \- whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. -However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. +Qdrant will list a sparse matrix of distances **between the closest pairs**: -In this case, you can set `always_ram` to `true` to store quantized vectors in RAM. 
+```http +{ + "result": { + "pairs": [ + {"a": 1, "b": 3, "score": 1.4063001}, + {"a": 1, "b": 4, "score": 1.2531}, + {"a": 2, "b": 1, "score": 1.1550001}, + {"a": 2, "b": 8, "score": 1.1359}, + {"a": 3, "b": 1, "score": 1.4063001}, + {"a": 3, "b": 4, "score": 1.2218001}, + {"a": 4, "b": 1, "score": 1.2531}, + {"a": 4, "b": 3, "score": 1.2218001}, + {"a": 5, "b": 3, "score": 0.70239997}, + {"a": 5, "b": 1, "score": 0.6146}, + {"a": 6, "b": 3, "score": 0.6353}, + {"a": 6, "b": 4, "score": 0.5093}, + {"a": 7, "b": 3, "score": 1.0990001}, + {"a": 7, "b": 1, "score": 1.0349001}, + {"a": 8, "b": 2, "score": 1.1359}, + {"a": 8, "b": 3, "score": 1.0553} + ] + } +} +``` -### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#setting-up-product-quantization) Setting up Product Quantization +### Output - Offsets -To enable product quantization, you need to specify the quantization parameters in the `quantization_config` section of the collection configuration. +The `offsets` endpoint offer another format of showing the distance between points: + +```http +POST /collections/{collection_name}/points/search/matrix/offsets +{ + "sample": 10, + "limit": 2 +} +``` -When enabling product quantization on an existing collection, use a PATCH request or the corresponding `update_collection` method and omit the vector configuration, as it’s already defined. +Qdrant will return a compact representation of the distances between points in the **form of row and column offsets**. -httppythontypescriptrustjavacsharpgo +Two arrays, `offsets_row` and `offsets_col`, represent the positions of non-zero distance values in the matrix. Each entry in these arrays corresponds to a pair of points with a calculated distance. ```http -PUT /collections/{collection_name} { - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "quantization_config": { - "product": { - "compression": "x16", - "always_ram": true - } + "result": { + "offsets_row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7], + "offsets_col": [2, 3, 0, 7, 0, 3, 0, 2, 2, 0, 2, 3, 2, 0, 1, 2], + "scores": [ + 1.4063001, 1.2531, 1.1550001, 1.1359, 1.4063001, + 1.2218001, 1.2531, 1.2218001, 0.70239997, 0.6146, 0.6353, + 0.5093, 1.0990001, 1.0349001, 1.1359, 1.0553 + ], + "ids": [1, 2, 3, 4, 5, 6, 7, 8] } } - ``` +*To learn more about the distance matrix, read [**The Distance Matrix documentation**](/documentation/concepts/explore/#distance-matrix).* -```python -from qdrant_client import QdrantClient, models +## Distance Matrix API in the Graph UI -client = QdrantClient(url="http://localhost:6333") +We are adding more visualization options to the [**Graph Exploration Tool**](/blog/qdrant-1.11.x/#web-ui-graph-exploration-tool), introduced in v.1.11. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - quantization_config=models.ProductQuantization( - product=models.ProductQuantizationConfig( - compression=models.CompressionRatio.X16, - always_ram=True, - ), - ), -) +You can now leverage the **Distance Matrix API** from within this tool for a **clearer picture** of your data and its relationships. 
-``` +**Example:** You can retrieve 900 `sample` points, with a `limit` of 5 connections per vector and a `tree` visualization: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +```json +{ + "limit": 5, + "sample": 900, + "tree": true +} +``` +The new graphing method is cleaner and reveals **relationships and outliers:** -const client = new QdrantClient({ host: "localhost", port: 6333 }); +![distance-matrix](/blog/qdrant-1.12.x/distance-matrix.png) -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - quantization_config: { - product: { - compression: "x16", - always_ram: true, - }, - }, -}); +*To learn more about the Web UI Dashboard, read the [**Interfaces documentation**](/documentation/interfaces/web-ui/).* -``` +## Facet API for Metadata Cardinality -```rust -use qdrant_client::qdrant::{ - CompressionRatio, CreateCollectionBuilder, Distance, ProductQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +![facet-api](/blog/qdrant-1.12.x/facet-api.png) -let client = Qdrant::from_url("http://localhost:6334").build()?; +In modern applications like e-commerce, users often rely on [**filters**](/articles/vector-search-filtering/), such as **brand** or **color**, to refine search results. The **Facet API** is designed to help users understand the distribution of values in a dataset. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .quantization_config( - ProductQuantizationBuilder::new(CompressionRatio::X16.into()).always_ram(true), - ), - ) - .await?; +The `facet` endpoint can efficiently count and aggregate values for a specific [**payload field**](/documentation/concepts/payload/) in your dataset. -``` +You can use it to retrieve unique values for a field, along with the number of points that contain each value. This functionality is similar to `GROUP BY` with `COUNT(*)` in SQL databases. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CompressionRatio; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.ProductQuantization; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +> **Note:** Facet counting can only be applied to fields that support `match` conditions, such as fields with a keyword index. 
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +### Configuration -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setProduct( - ProductQuantization.newBuilder() - .setCompression(CompressionRatio.x16) - .setAlwaysRam(true) - .build()) - .build()) - .build()) - .get(); +Here’s a sample query using the REST API to facet on the `size` field, filtered by products where the `color` is red: +```http +POST /collections/{collection_name}/facet +{ + "key": "size", + "filter": { + "must": { + "key": "color", + "match": { "value": "red" } + } + } +} ``` +This returns counts for each unique value in the `size` field, filtered by `color` = `red`: -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +```json +{ + "response": { + "hits": [ + {"value": "L", "count": 19}, + {"value": "S", "count": 10}, + {"value": "M", "count": 5}, + {"value": "XL", "count": 1}, + {"value": "XXL", "count": 1} + ] + }, + "time": 0.0001 +} +``` +The results are sorted by count in descending order and only values with non-zero counts are returned. -var client = new QdrantClient("localhost", 6334); +### Configuration - Precise Facet -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - quantizationConfig: new QuantizationConfig - { - Product = new ProductQuantization { Compression = CompressionRatio.X16, AlwaysRam = true } - } -); +By default, facet counting runs an approximate filter. If you need a precise count, you can enable the `exact` parameter: +```http +POST /collections/{collection_name}/facet +{ + "key": "size", + "exact": true +} ``` +This feature provides flexibility between performance and precision, depending on the needs of your application. -```go -import ( - "context" +*To learn more about faceting, read the [**Facet API documentation**](/documentation/concepts/payload/#facet-counts).* - "github.com/qdrant/go-client/qdrant" -) +## Text Index on Disk Support +![text-index-disk](/blog/qdrant-1.12.x/text-index-disk.png) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +[**Qdrant text indexing**](/documentation/concepts/indexing/#full-text-index) tokenizes text into smaller units (tokens) based on chosen settings (e.g., tokenizer type, token length). These tokens are stored in an inverted index for fast text searches. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - QuantizationConfig: qdrant.NewQuantizationProduct( - &qdrant.ProductQuantization{ - Compression: qdrant.CompressionRatio_x16, - AlwaysRam: qdrant.PtrOf(true), - }, - ), -}) +> With `on_disk` text indexing, the inverted index is stored on disk, reducing memory usage. 
+ +### Configuration +Just like with other indexes, simply add `on_disk: true` when creating the index: +```http +PUT /collections/{collection_name}/index +{ + "field_name": "review_text", + "field_schema": { + "type": "text", + "tokenizer": "word", + "min_token_len": 2, + "max_token_len": 20, + "lowercase": true, + "on_disk": true + } +} ``` -There are two parameters that you can specify in the `quantization_config` section: +*To learn more about indexes, read the [**Indexing documentation**](/documentation/concepts/indexing/).* -`compression` \- compression ratio. -Compression ratio represents the size of the quantized vector in bytes divided by the size of the original vector in bytes. -In this case, the quantized vector will be 16 times smaller than the original vector. +## Geo Index on Disk Support -`always_ram` \- whether to keep quantized vectors always cached in RAM or not. By default, quantized vectors are loaded in the same way as the original vectors. -However, in some setups you might want to keep quantized vectors in RAM to speed up the search process. Then set `always_ram` to `true`. +For [**large-scale geographic datasets**](/documentation/concepts/payload/#geo) where storing all indexes in memory is impractical, **geo indexing** allows efficient filtering of points based on geographic coordinates. -### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#searching-with-quantization) Searching with Quantization +With `on_disk` geo indexing, the index is written to disk instead of residing in memory, making it possible to handle large datasets without exhausting system memory. -Once you have configured quantization for a collection, you don’t need to do anything extra to search with quantization. -Qdrant will automatically use quantized vectors if they are available. +> This can be crucial when dealing with millions of geo points that don’t require real-time access. -However, there are a few options that you can use to control the search process: +### Configuration -httppythontypescriptrustjavacsharpgo +To enable this feature, modify the index schema for the geographic field by setting the `on_disk: true` flag. ```http -POST /collections/{collection_name}/points/query +PUT /collections/{collection_name}/index { - "query": [0.2, 0.1, 0.9, 0.7], - "params": { - "quantization": { - "ignore": false, - "rescore": true, - "oversampling": 2.0 - } - }, - "limit": 10 + "field_name": "location", + "field_schema": { + "type": "geo", + "on_disk": true + } } - ``` -```python -from qdrant_client import QdrantClient, models +### Performance Considerations -client = QdrantClient(url="http://localhost:6333") +- **Cold Query Latency:** On-disk indexes require I/O to load index segments, introducing slight latency on first access. Subsequent queries will benefit from disk caching. +- **Hot vs. Cold Indexes:** Fields frequently queried should stay in memory for faster performance, and on-disk indexes are better for large, infrequently queried fields. +- **Memory vs. Disk Trade-offs:** Users can manage memory by deciding which fields to store on disk. -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - ignore=False, - rescore=True, - oversampling=2.0, - ) - ), -) +![geo-index-disk](/blog/qdrant-1.12.x/geo-index-disk.png) -``` +> To learn how to get the best performance from Qdrant, read the [**Optimization Guide**](/documentation/guides/optimize/). 
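To round off the on-disk index examples above, here is a hedged Python-client sketch that creates both indexes discussed in this release: the full-text index on `review_text` and the geo index on `location`. It assumes a `qdrant-client` version that mirrors the 1.12 REST schema, where the index params accept an `on_disk` flag (and where `GeoIndexParams` is available); the REST calls shown earlier remain the authoritative reference.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Full-text index stored on disk (mirrors the REST example above).
client.create_payload_index(
    collection_name="{collection_name}",
    field_name="review_text",
    field_schema=models.TextIndexParams(
        type=models.TextIndexType.TEXT,
        tokenizer=models.TokenizerType.WORD,
        min_token_len=2,
        max_token_len=20,
        lowercase=True,
        on_disk=True,  # keep the inverted index on disk
    ),
)

# Geo index stored on disk (assumes GeoIndexParams exists in your client version).
client.create_payload_index(
    collection_name="{collection_name}",
    field_name="location",
    field_schema=models.GeoIndexParams(
        type=models.GeoIndexType.GEO,
        on_disk=True,
    ),
)
```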
-```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +## Just the Beginning -const client = new QdrantClient({ host: "localhost", port: 6333 }); +The easiest way to reach that **Hello World** moment is to [**try vector search in a live cluster**](/documentation/quickstart-cloud/). Our **interactive tutorial** will show you how to create a cluster, add data and try some filtering clauses. -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - params: { - quantization: { - ignore: false, - rescore: true, - oversampling: 2.0, - }, - }, - limit: 10, -}); +**All of the new features from version 1.12 can be tested in the Web UI:** -``` +![qdrant-filtering-tutorial](/articles_data/vector-search-filtering/qdrant-filtering-tutorial.png) -```rust -use qdrant_client::qdrant::{ - QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, -}; -use qdrant_client::Qdrant; +### Check Out the Tutorial Video + -let client = Qdrant::from_url("http://localhost:6334").build()?; +<|page-338-lllmstxt|> +We’re excited to announce a new course on DeepLearning.AI's platform: [Retrieval Optimization: From Tokenization to Vector Quantization](https://www.deeplearning.ai/short-courses/retrieval-optimization-from-tokenization-to-vector-quantization/?utm_campaign=qdrant-launch&utm_medium=qdrant&utm_source=partner-promo). This collaboration between Qdrant and DeepLearning.AI aims to empower developers and data enthusiasts with the skills needed to enhance [vector search](/advanced-search/) capabilities in their applications. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(10) - .params( - SearchParamsBuilder::default().quantization( - QuantizationSearchParamsBuilder::default() - .ignore(false) - .rescore(true) - .oversampling(2.0), - ), - ), - ) - .await?; +Led by Qdrant’s Kacper Ɓukawski, this free, one-hour course is designed for beginners eager to delve into the world of retrieval optimization. -``` +## Why This Collaboration Matters -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QuantizationSearchParams; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +At Qdrant, we believe in the power of effective search to transform user experiences. Partnering with DeepLearning.AI allows us to combine our cutting-edge vector search technology with their educational expertise, providing learners with a comprehensive understanding of how to build and optimize [Retrieval-Augmented Generation (RAG)](/rag/rag-evaluation-guide/) applications. This course is part of our commitment to equip the community with practical skills that leverage advanced machine learning techniques. 
-import static io.qdrant.client.QueryFactory.nearest; + -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +## What You’ll Learn -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setParams( - SearchParams.newBuilder() - .setQuantization( - QuantizationSearchParams.newBuilder() - .setIgnore(false) - .setRescore(true) - .setOversampling(2.0) - .build()) - .build()) - .setLimit(10) - .build()) - .get(); +In this course, you’ll explore key concepts that will enhance your understanding of retrieval optimization: -``` +- Learn how tokenization works in large language and embedding models and how the tokenizer can affect the quality of your search. +- Explore how different tokenization techniques including Byte-Pair Encoding, WordPiece, and Unigram are trained and work. +- Understand how to [measure the quality of your retrieval](/rag/rag-evaluation-guide/) and how to optimize your search by adjusting HNSW parameters and [vector quantizations](/articles/what-is-vector-quantization/). -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +## Who Should Enroll -var client = new QdrantClient("localhost", 6334); +This course is tailored for anyone with basic Python knowledge. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - searchParams: new SearchParams - { - Quantization = new QuantizationSearchParams - { - Ignore = false, - Rescore = true, - Oversampling = 2.0 - } - }, - limit: 10 -); +Whether you’re starting your journey in machine learning or looking to enhance your existing skills, this course offers valuable insights to boost your capabilities. -``` +### At a Glance: -```go -import ( - "context" +- **Speaker**: Kacper Ɓukawski, Qdrant Developer Advocate +- **Level**: Beginner +- **Cost**: Free +- **Location**: Online +- **Duration**: 1 Hour - "github.com/qdrant/go-client/qdrant" -) +## How to Enroll -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +[Enroll via the DeepLearning.AI website](https://www.deeplearning.ai/short-courses/retrieval-optimization-from-tokenization-to-vector-quantization/?utm_campaign=qdrant-launch&utm_medium=qdrant&utm_source=partner-promo). -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Params: &qdrant.SearchParams{ - Quantization: &qdrant.QuantizationSearchParams{ - Ignore: qdrant.PtrOf(false), - Rescore: qdrant.PtrOf(true), - Oversampling: qdrant.PtrOf(2.0), - }, - }, -}) +<|page-339-lllmstxt|> +# Supporting Early-Stage Startups -``` +Over the past few years, we’ve witnessed some of the most innovative AI applications being built on Qdrant. A significant number of these have come from startups pushing the boundaries of what’s possible in AI. To ensure these pioneering teams have access to the right resources at the right time, we're introducing **Qdrant for Startups**. This initiative is designed to provide startups with the technical support, guidance, and infrastructure they need to scale their AI innovations quickly and effectively. -`ignore` \- Toggle whether to ignore quantized vectors during the search process. By default, Qdrant will use quantized vectors if they are available. +Qdrant for Startups helps early-stage startups fully leverage the capabilities of vector search technology. 
Whether you're building retrieval-augmented generation (RAG) systems, recommendation engines, or anomaly detection models, the program offers exclusive benefits, such as discounts for Qdrant cloud, expert technical guidance, exclusive partner benefits, and co-marketing opportunities - empowering you to build and scale your AI products efficiently and cost-effectively. -`rescore` \- Having the original vectors available, Qdrant can re-evaluate top-k search results using the original vectors. -This can improve the search quality, but may slightly decrease the search speed, compared to the search without rescore. -It is recommended to disable rescore only if the original vectors are stored on a slow storage (e.g. HDD or network storage). -By default, rescore is enabled. +## Benefits for admitted startups: -**Available as of v1.3.0** +- **Qdrant Cloud discount:** 20% discount on Qdrant Cloud valid for 12 months, optimizing costs while scaling with advanced vector search capabilities. +- **Expert technical guidance:** Dedicated technical support and guidance to optimize your application’s performance with vector search. +- **Co-marketing opportunities:** Collaboration with the Qdrant team on joint marketing initiatives to boost your startup’s visibility. +- **Early access to features:** Exclusive early access to upcoming Qdrant features, keeping you at the forefront of technological advancements. +- **Community access:** Access to Qdrant’s developer and AI community for collaboration, networking, and shared learning. -`oversampling` \- Defines how many extra vectors should be pre-selected using quantized index, and then re-scored using original vectors. -For example, if oversampling is 2.4 and limit is 100, then 240 vectors will be pre-selected using quantized index, and then top-100 will be returned after re-scoring. -Oversampling is useful if you want to tune the tradeoff between search speed and search quality in the query time. +## Access to popular AI tools -## [Anchor](https://qdrant.tech/documentation/guides/quantization/\#quantization-tips) Quantization tips +We’ve built this program to support startups with their entire AI tech stack. In addition to Qdrant, accepted startups will receive exclusive discounts from our program partners - Hugging Face, LlamaIndex, and Airbyte - ensuring you have access to the key tools and resources needed to build and scale AI-driven applications. -#### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#accuracy-tuning) Accuracy tuning +Accepted startup program members will have the ability to get additional benefits: -In this section, we will discuss how to tune the search precision. -The fastest way to understand the impact of quantization on the search quality is to compare the search results with and without quantization. 
+- Hugging Face: $100 compute credits for the HuggingFace Hub +- LlamaIndex: 20% discount for 12 months for LlamaCloud +- Airbyte: Cloud credits for Y Combinator startups -In order to disable quantization, you can set `ignore` to `true` in the search request: +[![qdrant-for-startups-launch](/blog/qdrant-for-startups-launch/startup-cta.png)](https://qdrant.tech/qdrant-for-startups) -httppythontypescriptrustjavacsharpgo +## Frequently Asked Questions: -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "params": { - "quantization": { - "ignore": true - } - }, - "limit": 10 -} +**Q: What are the eligibility requirements?** -``` +A: You must meet all of the following: -```python -from qdrant_client import QdrantClient, models +- Pre-seed, Seed or Series A startups (under five years old) +- New user of Qdrant Cloud +- Has not previously participated in the Qdrant for Startups program +- Offer is not valid to existing Qdrant customers +- Must be building an AI-driven product or services (agencies or devshops are not eligible) +- A live, functional website is required +- Billing must be done directly with Qdrant (not through a marketplace) + +**Q: How can I apply to the Qdrant Startup Program?** + +A: Apply through our online form by providing details about your startup and its plans for using Qdrant. Applications are reviewed within 7-10 business days, with selections based on innovation potential and alignment with our capabilities. + +**Q: What criteria are used to select startups for the program?** -client = QdrantClient(url="http://localhost:6333") +A: We evaluate applications based on the innovation potential of the tech or AI-driven products or services and their alignment with Qdrant’s capabilities. Startups that demonstrate a clear vision and potential for impactful use of our platform are more likely to be selected. -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - ignore=True, - ) - ), -) +**Q: How long is the discount valid, and are there any conditions?** -``` +A: The discount is valid for 12 months from the date of acceptance and applies exclusively to our Cloud services billed through Stripe. Participants need a Stripe account to utilize the discount. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +**Q: How can I maximize the co-marketing opportunities offered by the program?** -const client = new QdrantClient({ host: "localhost", port: 6333 }); +A: Engage actively with our marketing team for features on social media, possible appearances in Discord talks or webinars, and case studies to maximize your startup's visibility and showcase your innovative use of Qdrant. -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - params: { - quantization: { - ignore: true, - }, - }, -}); +**Q: Can existing Qdrant customers apply for the Startup Program?** -``` +A: Yes, existing Qdrant customers are eligible to apply for the Startup Program if their cloud account was created within the last 30 days from the date of application. This opportunity is designed to ensure startups at the early stages of using our platform can still benefit from the additional support and resources offered by the program. 
-```rust -use qdrant_client::qdrant::{ - QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, -}; -use qdrant_client::Qdrant; +**Q: Can I reapply if my application is initially rejected?** -let client = Qdrant::from_url("http://localhost:6334").build()?; +A: Yes, we welcome reapplications from startups whose circumstances have changed or who can provide additional information that might have been overlooked in the initial review. You must wait 2 months to re-apply. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .params( - SearchParamsBuilder::default() - .quantization(QuantizationSearchParamsBuilder::default().ignore(true)), - ), - ) - .await?; +**Q: Who can I contact for more information about the program?** -``` +A: After reading these FAQs in full, if you need more details or assistance, please contact startups@qdrant.com. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QuantizationSearchParams; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +<|page-340-lllmstxt|> +We are excited to announce that Qdrant has partnered with [Shakudo](https://www.shakudo.io/), bringing [Qdrant Hybrid Cloud](https://qdrant.tech/hybrid-cloud/) to Shakudo’s virtual private cloud (VPC) deployments. This collaboration allows Shakudo clients to seamlessly integrate Qdrant’s high-performance vector database as a managed service into their private infrastructure, ensuring data sovereignty, scalability, and low-latency vector search for enterprise AI applications. -import static io.qdrant.client.QueryFactory.nearest; +## Data Sovereignty and Compliance with Secure Vector Search -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Shakudo’s VPC deployments ensure that client data remains within their infrastructure, providing strict control over sensitive information while leveraging a fully managed AI toolset. Qdrant Hybrid Cloud is tailored for environments where data privacy and regulatory compliance are paramount. It keeps the data plane inside the customer's infrastructure, with only essential telemetry shared externally, guaranteeing database isolation and security, while providing a fully managed service. -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setParams( - SearchParams.newBuilder() - .setQuantization( - QuantizationSearchParams.newBuilder().setIgnore(true).build()) - .build()) - .setLimit(10) - .build()) - .get(); +![shakudo-case-study](/blog/case-study-shakudo/shakudo-case-study.jpg) -``` +## Scaling and Performance Optimization for Enterprise Vector Search -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Qdrant Hybrid Cloud is optimized for Kubernetes, allowing for fast, automated deployments and hands-off cluster management. Shakudo’s platform, designed for VPC-based environments, allows businesses to deploy Qdrant’s vector search clusters with no DevOps overhead. Qdrant’s ability to handle billions of vectors - powered by our customized Hierarchical Navigable Small World (HNSW) indexing - ensures real-time processing and high accuracy for AI-driven applications like semantic search, recommendation systems, and retrieval-augmented generation (RAG). 
-var client = new QdrantClient("localhost", 6334); +## Staying Compatible with the Entire Stack -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - searchParams: new SearchParams - { - Quantization = new QuantizationSearchParams { Ignore = true } - }, - limit: 10 -); +By deploying Qdrant Hybrid Cloud on Shakudo, organizations gain immediate compatibility with their existing data sources, pipelines, and applications. It integrates seamlessly with the existing stack, ensuring smooth and efficient operation across all components. As business needs evolve, the data stack can easily scale and adapt to new demands. -``` +## Key Benefits of Qdrant in Shakudo's Virtual Private Cloud -```go -import ( - "context" +- **Data Privacy & Control**: Shakudo users can run a Qdrant vector database inside their own VPC, ensuring sensitive data never leaves their infrastructure, while enjoying a managed service for simplicity and reliability. +- **Seamless Integration**: Qdrant’s Kubernetes-native setup allows rapid deployment on Shakudo’s VPC-based infrastructure, which provides pre-configured environments optimized for AI workloads. +- **Scalability**: Qdrant’s ability to handle billions of vectors and its high-performance indexing like HNSW make it ideal for applications requiring fast, accurate similarity searches. +- **Enterprise Flexibility**: With both on-premise and cloud-native setups available, this partnership offers businesses the flexibility to balance operational needs with privacy requirements​. - "github.com/qdrant/go-client/qdrant" -) +## Learn More -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Ready to learn how Qdrant on Shakudo can enhance your AI infrastructure? Contact the Shakudo team to explore how they can help you deploy secure, high-performance vector search in your VPC environment, or get started [here](https://www.shakudo.io/integrations/qdrant). -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Params: &qdrant.SearchParams{ - Quantization: &qdrant.QuantizationSearchParams{ - Ignore: qdrant.PtrOf(false), - }, - }, -}) +If you are interested in Qdrant’s Managed Cloud, Hybrid Cloud, or Private Cloud solutions for flexible deployment options for top-tier data privacy, [contact us](https://qdrant.tech/contact-us/). -``` +<|page-341-lllmstxt|> +# Using Performance Metrics to Evaluate RAG Systems -- **Adjust the quantile parameter**: The quantile parameter in scalar quantization determines the quantization bounds. -By setting it to a value lower than 1.0, you can exclude extreme values (outliers) from the quantization bounds. -For example, if you set the quantile to 0.99, 1% of the extreme values will be excluded. -By adjusting the quantile, you find an optimal value that will provide the best search quality for your collection. +Evaluating the performance of a [Retrieval-Augmented Generation (RAG)](/rag/) application can be a complex task for developers. -- **Enable rescore**: Having the original vectors available, Qdrant can re-evaluate top-k search results using the original vectors. On large collections, this can improve the search quality, with just minor performance impact. +To help simplify this, Qdrant has partnered with [Relari](https://www.relari.ai) to provide an in-depth [RAG evaluation](/articles/rapid-rag-optimization-with-qdrant-and-quotient/) process. 
+As a [vector database](https://qdrant.tech), Qdrant handles the data storage and retrieval, while Relari enables you to run experiments to assess how well your RAG app performs in real-world scenarios. Together, they allow for fast, iterative testing and evaluation, making it easier to keep up with your app's development pace. -#### [Anchor](https://qdrant.tech/documentation/guides/quantization/\#memory-and-speed-tuning) Memory and speed tuning +{{< figure + src="/blog/qdrant-relari/banner.png" + caption="Qdrant & Relari partnered on a joint project to test RAG performance with synthetic data." + alt="Qdrant & Relari partnered on a joint project to test RAG performance with synthetic data." +>}} -In this section, we will discuss how to tune the memory and speed of the search process with quantization. +## What you'll learn -There are 3 possible modes to place storage of vectors within the qdrant collection: +In a recent webinar, we discussed the best approaches to building and evaluating RAG systems. Relari offers tools to evaluate [large language model (LLM) applications](/rag/) using both intrinsic and extrinsic methods. Combined with Qdrant’s powerful [data storage](/qdrant-vector-database/), it creates a solid framework for evaluation. -- **All in RAM** \- all vector, original and quantized, are loaded and kept in RAM. This is the fastest mode, but requires a lot of RAM. Enabled by default. +In this post, we’ll cover **two evaluation methods** you can use with Qdrant and Relari, along with practical use cases. Specifically, we’ll walk you through an example that analyzes the GitLab legal policies dataset. You can follow along with the code in this [Google Colab Notebook](https://colab.research.google.com/drive/1p6bF15ZWI7qVEh-b_7YGt1n1UwF3QAqd). -- **Original on Disk, quantized in RAM** \- this is a hybrid mode, allows to obtain a good balance between speed and memory usage. Recommended scenario if you are aiming to shrink the memory footprint while keeping the search speed. +## Key metrics for RAG evaluation: Top-K and Auto Prompt Optimization +To ensure your RAG system works well in real-world conditions, it's crucial to focus on performance optimization. While traditional metrics like precision, recall, and rank-based methods are helpful, they aren't always enough. Two advanced strategies for evaluating your RAG system are **Top-K Parameter Optimization** and **Auto Prompt Optimization**. These techniques help improve the chances that your model delivers the best experience for actual users. -This mode is enabled by setting `always_ram` to `true` in the quantization config while using memmap storage: +### Top-K parameter optimization -httppythontypescriptrustjavacsharpgo +The **Top-K** parameter controls how many top results are shown to users. Imagine using a [search engine](/advanced-search/) that only shows one result per query—it might be a good result, but most users prefer having more options. On the other hand, showing too many results can overwhelm users. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - }, - "quantization_config": { - "scalar": { - "type": "int8", - "always_ram": true - } - } -} +For example, in a [product recommendation system](/recommendations/), the Top-K setting determines whether users see the top 3 best-selling products or 10 different options. Tuning this parameter ensures that users have enough relevant choices without feeling lost. 
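In Qdrant terms, Top-K corresponds to the `limit` of a search request (or the `k` passed to a LangChain retriever, as used later in this walkthrough). As a minimal sketch of sweeping K directly against a Qdrant collection, assuming a collection like the `gitlab_legal_policies` one created later in this tutorial and a placeholder query vector used for illustration only:

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

# Placeholder query vector for illustration; in practice this comes from your
# embedding model and must match the collection's vector size.
query_vector = [0.2, 0.1, 0.9, 0.7]

for k in (3, 5, 7, 9):
    hits = client.query_points(
        collection_name="gitlab_legal_policies",
        query=query_vector,
        limit=k,  # Top-K: how many results are returned to the user
    ).points
    print(f"k={k}: retrieved {len(hits)} chunks")
```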
-``` +With Relari and Qdrant, testing different Top-K values is easy. First, we’ll build a simple RAG app with Qdrant, and then we’ll use Relari to evaluate its performance. -```python -from qdrant_client import QdrantClient, models +### How to get started -client = QdrantClient(url="http://localhost:6333") +Head over to [Qdrant Cloud](https://cloud.qdrant.io) and [Relari](https://www.relari.ai) to create accounts and get your API keys. Once you have the keys, add them to your secrets in Google Colab, and you're ready to begin! -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - always_ram=True, - ), - ), -) +#### Install dependencies -``` +In this case, we will use Qdrant, FastEmbed, Relari, and LangChain** -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +```python +!pip install relari langchain_community langchain_qdrant +!pip install unstructured rank_bm25 +!pip install --upgrade nltk +``` -const client = new QdrantClient({ host: "localhost", port: 6333 }); +#### Setup the environment -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - on_disk: true, - }, - quantization_config: { - scalar: { - type: "int8", - always_ram: true, - }, - }, -}); +```python +from google.colab import userdata +import os +os.environ['RELARI_API_KEY'] = userdata.get('RELARI_API_KEY') +os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY') ``` -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +### Set up Relari -let client = Qdrant::from_url("http://localhost:6334").build()?; +```python +from relari import RelariClient +client = RelariClient() +``` -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .quantization_config( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .always_ram(true), - ), - ) - .await?; +#### Create a new Relari Project +```python +proj = client.projects.create(name="Gitlab Employee Assistant") ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +### Defining the golden dataset -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +For this case study, we'll be using the GitLab legal policies dataset, but you can easily swap in your own dataset. Datasets are critical in Relari’s approach to evaluating and improving [LLM applications](/rag/). The dataset serves as the "ground truth" or reference point for testing the accuracy and performance of an LLM pipeline. 
-client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setAlwaysRam(true) - .build()) - .build()) - .build()) - .get(); +Relari’s data-driven approach ensures that the evaluation is reliable and thorough. You can learn more about how Relari handles datasets [here](https://docs.relari.ai/getting-started/datasets/intro). +```python +!wget https://ceevaldata.blob.core.windows.net/examples/gitlab/gitlab_legal_policies.zip +!unzip gitlab_legal_policies.zip -d gitlab_legal_policies ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +### Create the dataset -var client = new QdrantClient("localhost", 6334); +Once the data is downloaded, you can create the golden dataset by running the following command. This dataset will serve as your test or ground truth for evaluation, providing a benchmark to measure the accuracy of your RAG application. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, - quantizationConfig: new QuantizationConfig - { - Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } - } -); +```python +from pathlib import Path +dir = Path("gitlab_legal_policies") +task_id = client.synth.new( + project_id=proj["id"], + name="Gitlab Legal Policies", + samples=30, + files=list(dir.glob("*.txt")), +) ``` -```go -import ( - "context" +This will prepare the dataset for use in Relari, allowing you to evaluate your application against a known reference. - "github.com/qdrant/go-client/qdrant" -) + -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +### Build a simple RAG app -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), - QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - AlwaysRam: qdrant.PtrOf(true), - }), -}) +Now that the project is set up, let’s move on to building the Retrieval-Augmented Generation (RAG) application for evaluation. We’ll be using Qdrant for [vector search](/documentation/overview/vector-search/), [FastEmbed](/documentation/fastembed/) for embeddings, and [LangChain](/blog/using-qdrant-and-langchain/) for managing the document workflow. + +#### Import all libraries +```python +from langchain_community.document_loaders.directory import DirectoryLoader +from langchain_qdrant import Qdrant +from langchain_community.embeddings.fastembed import FastEmbedEmbeddings +from relari.core.types import DatasetDatum ``` -In this scenario, the number of disk reads may play a significant role in the search speed. -In a system with high disk latency, the re-scoring step may become a bottleneck. +### Load and chunk data -Consider disabling `rescore` to improve the search speed: +We will use LangChain to prepare our data. 
-httppythontypescriptrustjavacsharpgo +```python +# load the document and split it into chunks +loader = DirectoryLoader("gitlab_legal_policies/") +documents = loader.load_and_split() +``` -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "params": { - "quantization": { - "rescore": false - } - }, - "limit": 10 -} +Now we’ll use FastEmbed, Qdrant's built-in embedding provider, to embed our chunks. +```python +# Initialize FastEmbedEmbeddings +embeddings = FastEmbedEmbeddings( + model_name="BAAI/bge-small-en-v1.5", # specify the model +) ``` -```python -from qdrant_client import QdrantClient, models +### Store data in Qdrant -client = QdrantClient(url="http://localhost:6333") +Finally, we'll upload the chunks into a Qdrant collection. -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams(rescore=False) - ), +```python +# Load chunks into a Qdrant Cloud vectorstore using FastEmbedEmbeddings +db = Qdrant.from_documents( + documents, + embedding=embeddings, + url=os.environ['QDRANT_URL'], # Qdrant Cloud URL + api_key=os.environ['QDRANT_API_KEY'], # Qdrant Cloud API Key + collection_name="gitlab_legal_policies", ) - +print(f"{len(documents)} chunks loaded into Qdrant Cloud vector database.") ``` + -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### Start logging results -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Now that the data is uploaded to Qdrant, we can build a function to run different RAG pipelines over the dataset and log the results for evaluation. This will allow us to track the performance of various configurations, such as different Top-K values, and feed the results back into Relari for further analysis. -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - params: { - quantization: { - rescore: false, - }, - }, -}); +Here’s a function that logs the results from different retriever configurations: +```python +# Prepare a function to run different RAG pipelines over the dataset and log the results +def log_retriever_results(retriever, dataset): + log = list() + for datum in dataset.data: + # First compute the result + retrieved_docs = retriever.invoke(datum["question"]) + # Now log the result in Relari format + result = DatasetDatum( + label=datum["uid"], + data={"retrieved_context": [doc.page_content for doc in retrieved_docs]}, + ) + log.append(result) + return log ``` -```rust -use qdrant_client::qdrant::{ - QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, -}; -use qdrant_client::Qdrant; +This is the power of combining Qdrant and Relari. Instead of having to build multiple applications, slowly [upsert](/documentation/concepts/points/#upload-points), and [retrieve](/documentation/concepts/search/) data, you can use both to quickly test different parameters and instantly get results. This evaluation system is built for fast, useful iteration. -let client = Qdrant::from_url("http://localhost:6334").build()?; +### Evaluate results -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .params( - SearchParamsBuilder::default() - .quantization(QuantizationSearchParamsBuilder::default().rescore(false)), - ), - ) - .await?; +Now that the RAG application is built, it’s time to evaluate its performance by experimenting with different Top-K values. 
The Top-K parameter controls how many top results are returned to users during retrieval, and optimizing this can improve user experience and the relevance of results. -``` +#### First attempt: experimenting with top-K -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QuantizationSearchParams; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +In this experiment, we will test various Top-K values (3, 5, 7, and 9) to see how they affect retrieval performance. -import static io.qdrant.client.QueryFactory.nearest; +```python +k_values = [3, 5, 7, 9] # Define the different values of top k to experiment -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +semantic_retrievers = {} +semantic_logs = {} -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setParams( - SearchParams.newBuilder() - .setQuantization( - QuantizationSearchParams.newBuilder().setRescore(false).build()) - .build()) - .setLimit(3) - .build()) - .get(); +# Run the retrievers on the dataset and log retrieved chunks +for k in k_values: + retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k}) + log = log_retriever_results(retriever, dataset) + semantic_retrievers[f"k_{k}"] = retriever + semantic_logs[f"k_{k}"] = log + print(f"Results on {dataset.name} by Semantic Retriever with k={k} saved!") ``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +#### Send results to Relari -var client = new QdrantClient("localhost", 6334); +Once you’ve logged the results from the different Top-K experiments, you can submit them to Relari for evaluation. Relari will analyze your results using metrics like Precision/Recall and Rank-Aware methods, allowing you to compare the performance of each configuration. Here’s how to send your results to Relari: -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - searchParams: new SearchParams - { - Quantization = new QuantizationSearchParams { Rescore = false } - }, - limit: 3 -); +For each Top-K configuration, we’ll submit the results to Relari and run the evaluation using appropriate metrics. This will help you benchmark the performance of your RAG system based on different values of K. + +```python +from relari import Metric + +for k in k_values: + eval_name = f"Semantic Retriever Evaluation k={k}" + eval_data = semantic_logs[f"k_{k}"] + eval_info = client.evaluations.submit( + project_id=proj["id"], + dataset=dataset_info["id"], + name=eval_name, + pipeline=[Metric.PrecisionRecallF1, Metric.RankedRetrievalMetrics], + data=eval_data, + ) + print(f"{eval_name} submitted!") ``` -```go -import ( - "context" +With our dataset, if we want the recall to be greater than 85%, we should pick a K value of at least 7. +![top-k-value](/blog/qdrant-relari/top-k-value.png) - "github.com/qdrant/go-client/qdrant" -) +We can even look at individual cases in the UI to get more insight. +![dashboard](/blog/qdrant-relari/dashboard.png) -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Relari and Qdrant can also be integrated to evaluate [hybrid search systems](/articles/hybrid-search/), which combine both sparse (traditional keyword-based) and dense (vector-based) search methods. 
This combination allows you to leverage the strengths of both approaches, potentially improving the relevance and accuracy of search results. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Params: &qdrant.SearchParams{ - Quantization: &qdrant.QuantizationSearchParams{ - Rescore: qdrant.PtrOf(false), - }, - }, -}) +By using Relari’s evaluation framework alongside Qdrant’s [vector search](/advanced-search/) capabilities, you can experiment with different configurations for hybrid search. For example, you might test varying the ratio of [sparse-to-dense search results](/documentation/concepts/hybrid-queries/#hybrid-search) or adjust how each component contributes to the overall retrieval score. -``` +## Auto Prompt Optimization -- **All on Disk** \- all vectors, original and quantized, are stored on disk. This mode allows to achieve the smallest memory footprint, but at the cost of the search speed. +In conversational applications like [chatbots](/rag/), **Auto Prompt Optimization (APO)** enhances the chatbot's communication effectiveness by continuously refining how it interacts with users. APO learns from previous interactions to adjust and improve the phrasing of responses, resulting in more accurate, engaging, and user-friendly dialogues. -It is recommended to use this mode if you have a large collection and fast storage (e.g. SSD or NVMe). +For instance, in a [customer service chatbot](/documentation/examples/rag-customer-support-cohere-airbyte-aws/), the way a question is phrased can greatly impact user satisfaction. While the same information may be conveyed, how it’s expressed matters. Think of ordering in a Parisian cafĂ©: ordering in French may result in a more pleasant interaction than in English, even though the request is the same. Similarly, APO helps chatbots find the optimal way to frame questions or responses to ensure users feel understood and engaged, enhancing the overall experience. -This mode is enabled by setting `always_ram` to `false` in the quantization config while using mmap storage: +Over time, APO fine-tunes the prompts used by the chatbot to optimize interactions, making the system more responsive to user needs and context, improving the quality of the generated answers, and ultimately increasing user satisfaction. -httppythontypescriptrustjavacsharpgo +Auto Prompt Optimization continuously refines the chatbot’s responses to improve user interactions. Here's how you can implement APO with Relari: -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - }, - "quantization_config": { - "scalar": { - "type": "int8", - "always_ram": false - } - } -} +### Set up base prompt -``` +With Auto Prompt Optimization, you can define a system prompt and inspect the results at every iteration of the interaction. ```python -from qdrant_client import QdrantClient, models - -client = QdrantClient(url="http://localhost:6333") +from relari.core.types import Prompt, UserPrompt -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - always_ram=False, - ), +base_prompt = Prompt( + system="You are a GitLab legal policy Q&A bot. 
Answer the following question given the context.", + user=UserPrompt( + prompt="Question: $question\n\nContext:\n$ground_truth_context", + description="Question and context to answer the question.", ), ) - ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +The prompt uses the variables`question` & `ground_truth_context`. These help measure how faithful the generated answer is to the ground truth context (i.e., it's not hallucinating). -const client = new QdrantClient({ host: "localhost", port: 6333 }); +For more details on other metrics, visit the [Relari documentation site](https://docs.relari.ai/metrics/PromptOptimization/supported_metrics) -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - on_disk: true, - }, - quantization_config: { - scalar: { - type: "int8", - always_ram: false, - }, - }, -}); +#### Set up task id +```python +task_id = client.prompts.optimize( + name="GitLab Legal Policy RAG Prompt", + project_id=proj["id"], + dataset_id=dataset_info["id"], + prompt=base_prompt, + llm="gpt-4o-mini", + task_description="Answer the question using the provided context.", + metric=client.prompts.Metrics.CORRECTNESS, +) +print(f"Optimization task submitted with ID: {task_id}") ``` -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +The [CORRECTNESS](https://docs.relari.ai/metrics/Generation/Deterministic/correctness) metric measures how close the generated answer is the the ground truth reference answers. -let client = Qdrant::from_url("http://localhost:6334").build()?; +### Analyze Prompts -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)) - .quantization_config( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .always_ram(false), - ), - ) - .await?; +Once Auto Prompt Optimization has been set up, you can begin analyzing how the prompts evolve with each iteration in the Relari UI. This allows you to see how the system is adapting and refining its responses based on previous interactions and user feedback. -``` +In the Relari UI, you can: -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +1. **Track Changes**: Review how prompts change over time and see the iterations that lead to improved performance. For example, you can analyze how different phrasings affect the accuracy and relevance of chatbot responses. + +2. **Evaluate Effectiveness**: Check how each prompt performs against key metrics like correctness, fluency, and user satisfaction. You can see which iterations lead to better outcomes and which need further adjustment. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +3. 
**Compare Iterations**: Visualize side-by-side comparisons of different prompt iterations, helping you understand which specific changes result in more accurate or engaging responses. -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setAlwaysRam(false) - .build()) - .build()) - .build()) - .get(); +4. **Identify Patterns**: Look for patterns in user interactions and how the chatbot adapts to different scenarios, giving you insights into what works best for your target audience. -``` +This prompt iteration analysis helps ensure that your chatbot's conversational flow continually improves, leading to more natural, effective interactions with users. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +![prompt-optimization](/blog/qdrant-relari/prompt-optimization.png) -var client = new QdrantClient("localhost", 6334); +The optimal **System Prompt** and **Few-Shot Examples** generated from Auto Prompt Optimization (APO) can significantly enhance your Retrieval-Augmented Generation (RAG) system's performance. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true}, - quantizationConfig: new QuantizationConfig - { - Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = false } - } -); +- **System Prompt**: This is the foundational instruction that guides how your chatbot or application responds to user queries. APO helps refine this prompt to ensure that responses are aligned with user expectations and the application's goal, resulting in clearer and more accurate outputs. -``` +- **Few-Shot Examples**: These are examples provided to the model to demonstrate how to answer a question or solve a problem. By optimizing these examples, your RAG system can better understand the context and provide more relevant and coherent responses. For example, using well-crafted few-shot examples can drastically reduce hallucinations in language model outputs and lead to more contextually accurate results. -```go -import ( - "context" +Once you’ve identified the optimal system prompt and few-shot examples through APO, you can integrate them into your RAG system. This will ensure that the model consistently delivers high-quality results across different scenarios, improving the overall user experience and system performance. - "github.com/qdrant/go-client/qdrant" -) +## Conclusion -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Combining **Relari** and **Qdrant** allows you to create an iterative, data-driven evaluation framework, improving your RAG system for optimal real-world performance. These methods help ensure that your application is both responsive and effective, especially when dealing with user queries or recommendations. 
-client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), - QuantizationConfig: qdrant.NewQuantizationScalar( - &qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - AlwaysRam: qdrant.PtrOf(false), - }, - ), -}) +If you’d like to get started, sign up for **free** at [Qdrant Cloud](https://cloud.qdrant.io) and [Relari](https://www.relari.ai). + +<|page-342-lllmstxt|> +![nyris-case-study](/blog/case-study-nyris/nyris-case-study.png) + +## About Nyris + +Founded in 2015 by CTO Markus Lukasson and his sister Anna Lukasson-Herzig, [Nyris](https://www.nyris.io/) offers advanced visual search solutions for companies, positioning itself as the "Google Lens" for corporate data. Their technology powers use cases such as visual search on websites of large retailers and machine manufacturing companies that require visual identification of spare parts. The primary goal is to identify items in a product catalog or spare parts as quickly as possible. With a strong foundation in e-commerce and nearly a decade of experience in vector search, Nyris is at the forefront of visual search innovation. + +Beyond visual search, Nyris also provides synthetic data solutions, particularly for manufacturing and engineering sectors. Often, customers in these industries lack sufficient photos of parts to leverage visual search effectively. However, they do possess CAD files for their products. Nyris generates synthetic images from these CAD files, enabling visual search without needing actual product photos in the database. -``` +Prominent clients such as IKEA, Trumpf (a precision laser manufacturer), and DMG Mori rely on Nyris to support their field engineers in maintaining parts. -##### Was this page useful? +![nyris-visual-search](/blog/case-study-nyris/nyris-visual-search.png) -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Overcoming Limitations in Visual Product Search -Thank you for your feedback! 🙏 +During his time at Amazon, Lukasson observed that search engines like Google often outperformed Amazon's search capabilities for product searches. Recognizing the need for more precise search solutions in industries like e-commerce and spare part management, he identified a significant gap: Traditional keyword-based searches often fail, especially in situations where field engineers struggle to describe parts accurately with keywords. Visual search offers a solution, providing faster and more accurate results by leveraging images, which carry significantly more information than text-based queries. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/quantization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +In their quest for the perfect visual search provider, Nyris ultimately decided to develop their own solution. 
-On this page: +## The Path to Vector-Based Visual Search -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/quantization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Initially in 2015, the team explored traditional search algorithms based on key value SIFT (Scale Invariant Feature Transform) features to locate specific elements within images. However, they quickly realized that these methods were imprecise and unreliable. To address this, Nyris began experimenting with the first Convolutional Neural Networks (CNNs) to extract embeddings for vector search. -× +In the early days of vector search, there were few solutions available. Nyris initially developed their own vector search solution but later transitioned to SingleStore. At that time, SingleStore was the only option that could deliver efficient and fast brute-force vector search at scale. As Nyris's data grew, the need for rapid scaling became evident. They found that many standard database features, such as real-time analytics and atomicity, were unnecessary for their specific needs. Instead, what Nyris required was a solution focused on fast and efficient vector search capabilities, along with features that would enhance the search experience for their customers. -[Powered by](https://qdrant.tech/) +With the emergence of pure-play, native vector search engines, Nyris conducted extensive research and benchmarks. Ultimately, they chose Qdrant as their vector search engine of choice. Qdrant stood out for its accuracy, speed, and the ability to handle large datasets efficiently, meeting all of Nyris's requirements for a robust and scalable vector search solution. -<|page-172-lllmstxt|> -## vector-similarity-beyond-search -- [Articles](https://qdrant.tech/articles/) -- Vector Similarity: Going Beyond Full-Text Search \| Qdrant +### The Selection Process -[Back to Data Exploration](https://qdrant.tech/articles/data-exploration/) +As part of their selection process, Nyris evaluated several critical factors to ensure they chose the best vector search engine solution: -# Vector Similarity: Going Beyond Full-Text Search \| Qdrant +- **Accuracy and Speed**: These were primary considerations. Nyris needed to understand the performance differences between the [HNSW](https://qdrant.tech/articles/filtrable-hnsw/) graph-based approach and brute-force search. In particular, they examined edge cases that required numerous filters, sometimes necessitating a switch to brute-force search. Even in these scenarios, Qdrant demonstrated impressive speed and reliability, meeting Nyris's stringent performance requirements. +- **Insert Speed**: Nyris assessed how quickly data could be inserted into the database, including the performance during simultaneous data ingests and query requests. Qdrant excelled in this area, providing the necessary efficiency for their operations. +- **Total Cost of Ownership**: Nyris analyzed the infrastructure costs and licensing fees associated with each solution. Qdrant offered a competitive total cost of ownership, making it an economically viable option. +- **Data Sovereignty**: The ability to deploy Qdrant in their own clusters was a key aspect for Nyris, ensuring they maintained control over their data and complied with relevant data sovereignty requirements. +- **Dedicated Vector Search Engine:** One of the key advantages of Qdrant, as Lukasson highlights, is its specialization as a dedicated, native vector search engine. 
"Qdrant, being purpose-built for vector search, can introduce relevant features much faster, like [quantization](https://qdrant.tech/documentation/guides/quantization/), integer8 support, and float32 rescoring. These advancements make searches more precise and cost-effective without sacrificing accuracy—exactly what Nyris needs," said Lukasson. "When optimizing for search accuracy and speed, compromises aren't an option. Just as you wouldn't use a truck to race in Formula 1, we needed a solution designed specifically for vector search, not just a general database with vector search tacked on. With every Qdrant release, we gain new, tailored features that directly enhance our use case.” -Luis CossĂ­o +## Key Benefits of Qdrant in Production -· +Nyris has found several aspects of Qdrant particularly beneficial in their production environment: -August 08, 2023 +- **Enhanced Security with JWT**: [JSON Web Tokens](https://qdrant.tech/documentation/guides/security/#granular-access-control-with-jwt) provide enhanced security and performance, critical for safeguarding their data. +- **Seamless Scalability**: Qdrant's ability to [scale effortlessly across nodes](https://qdrant.tech/documentation/guides/distributed_deployment/) ensures consistent high performance, even as Nyris's data volume grows. +- **Flexible Search Options**: The availability of both graph-based and brute-force search methods offers Nyris the flexibility to tailor the search approach to specific use case requirements. +- **Versatile Data Handling**: Qdrant imposes almost no restrictions on data types and vector sizes, allowing Nyris to manage diverse and complex datasets effectively. +- **Built with Rust**: The use of [Rust](https://qdrant.tech/articles/why-rust/) ensures superior performance and future-proofing, while its open-source nature allows Nyris to inspect and customize the code as necessary. +- **Cost-Effective High Performance Search**: Qdrant’s efficient search capabilities ensure that Nyris can maintain high performance at a reasonable cost. With Qdrant, Nyris can search through extensive datasets efficiently, making it a crucial part of their technology stack. -![Vector Similarity: Going Beyond Full-Text Search | Qdrant](https://qdrant.tech/articles_data/vector-similarity-beyond-search/preview/title.jpg) +By hosting Qdrant on Google Cloud within their Kubernetes Cluster, Nyris benefits from the scalability and reliability essential for their demanding operations, ensuring a robust and efficient visual search solution. -# [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#vector-similarity-unleashing-data-insights-beyond-traditional-search) Vector Similarity: Unleashing Data Insights Beyond Traditional Search +## Why Pure-Vector Search is the Future for Product Search -When making use of unstructured data, there are traditional go-to solutions that are well-known for developers: +Nyris’s vision is to identify every single product and spare part within milliseconds, and Qdrant plays an integral role in this. When envisioning the future of product search, Lukasson is convinced that vector representations will be the key to advancing search capabilities. Unlike keyword searches, vector search can seamlessly integrate various modalities, such as text, images, as well as depth or audio. This holistic approach will transform product and spare part searches, allowing for a single vector representation that encompasses a product’s text, visual and geometric descriptions. 
-- **Full-text search** when you need to find documents that contain a particular word or phrase. -- **[Vector search](https://qdrant.tech/documentation/overview/vector-search/)** when you need to find documents that are semantically similar to a given query. +“While traditional algorithms like BM25 are fast and cheap and still have a place in the search stack, vectors will replace them in the coming years," says Lukasson. "Today, we have separate spaces for text search, visual search, and other modalities, but we envision a future with a unified vector representation that encompasses all relevant item data. No matter what input you use for your query, the search results will be accurate. The days of scrolling through thousands of results or encountering 'no results' pages will soon be over. Every search request will deliver the right product or spare part in milliseconds.” -Sometimes people mix those two approaches, so it might look like the vector similarity is just an extension of full-text search. However, in this article, we will explore some promising new techniques that can be used to expand the use-case of unstructured data and demonstrate that vector similarity creates its own stack of data exploration tools. +<|page-343-lllmstxt|> +![kern-case-study](/blog/case-study-kern/kern-case-study.png) -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#what-is-vector-similarity-search) What is vector similarity search? +## About Kern AI -Vector similarity offers a range of powerful functions that go far beyond those available in traditional full-text search engines. From dissimilarity search to diversity and recommendation, these methods can expand the cases in which vectors are useful. +[Kern AI](https://kern.ai/) specializes in data-centric AI. Originally an AI consulting firm, the team led by Co-Founder and CEO Johannes Hötter quickly realized that developers spend 80% of their time reviewing data instead of focusing on model development. This inefficiency significantly reduces the speed of development and adoption of AI. To tackle this challenge, Kern AI developed a low-code platform that enables developers to quickly analyze their datasets and identify outliers using vector search. This innovation led to enhanced data accuracy and streamlined workflows for the rapid deployment of AI applications. -Vector Databases, which are designed to store and process immense amounts of vectors, are the first candidates to implement these new techniques and allow users to exploit their data to its fullest. +With the rise of ChatGPT, Kern AI expanded its platform to support the quick development of accurate and secure Generative AI by integrating large language models (LLMs) like GPT, tailoring solutions specifically for the financial services sector. Kern AI’s solution enhances the reliability of any LLM by modeling and integrating company data in a way LLMs can understand, offering a platform with leading data modeling capabilities. -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#vector-similarity-search-vs-full-text-search) Vector similarity search vs. full-text search +## The Challenge -While there is an intersection in the functionality of these two approaches, there is also a vast area of functions that is unique to each of them. -For example, the exact phrase matching and counting of results are native to full-text search, while vector similarity support for this type of operation is limited. 
-On the other hand, vector similarity easily allows cross-modal retrieval of images by text or vice-versa, which is impossible with full-text search. +Kern AI has partnered with leading insurers to efficiently streamline the process of managing complex customer queries within customer service teams, reducing the time and effort required. Customer inquiries are often complex, and support teams spend significant time locating and interpreting relevant sections in insurance contracts. This process leads to delays in responses and can negatively impact customer satisfaction. -This mismatch in expectations might sometimes lead to confusion. -Attempting to use a vector similarity as a full-text search can result in a range of frustrations, from slow response times to poor search results, to limited functionality. -As an outcome, they are getting only a fraction of the benefits of vector similarity. +To tackle this, Kern AI developed an internal AI chatbot for first-level support teams. Their platform helps data science teams improve data foundations to expedite application production. By using embeddings to identify relevant data points and outliers, Kern AI ensures more efficient and accurate data handling. To avoid being restricted to a single embedding model, they experimented with various models, including sentiment embeddings, leading them to discover Qdrant. -![Full-text search and Vector Similarity Functionality overlap](https://qdrant.tech/articles_data/vector-similarity-beyond-search/venn-diagram.png) +![kern-user-interface](/blog/case-study-kern/kern-user-interface.png) -Full-text search and Vector Similarity Functionality overlap +*Kern AI Refinery, is an open-source tool to scale, assess and maintain natural language data.* -Below we will explore why the vector similarity stack deserves new interfaces and design patterns that will unlock the full potential of this technology, which can still be used in conjunction with full-text search. +The impact of their solution is evident in the case of [Markel Insurance SE](https://www.markel.com/), which reduced the average response times from five minutes to under 30 seconds per customer query. This change significantly enhanced customer experience and reduced the support team's workload. Johannes Hötter notes, "Our solution has revolutionized how first-level support operates in the insurance industry, drastically improving efficiency and customer satisfaction." -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#new-ways-to-interact-with-similarities) New ways to interact with similarities +## The Solution -Having a vector representation of unstructured data unlocks new ways of interacting with it. -For example, it can be used to measure semantic similarity between words, to cluster words or documents based on their meaning, to find related images, or even to generate new text. -However, these interactions can go beyond finding their nearest neighbors (kNN). +Kern AI discovered Qdrant and was impressed by its interactive Discord community, which highlighted the active support and continuous improvements of the platform. Qdrant was the first vector database the team used, and after testing other alternatives, they chose Qdrant for several reasons: -There are several other techniques that can be leveraged by vector representations beyond the traditional kNN search. These include dissimilarity search, diversity search, recommendations, and discovery functions. 
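One of those techniques is easy to see in miniature: a dissimilarity (farthest-point) search is just a nearest-neighbor search with the objective inverted. The sketch below is a self-contained toy illustration on made-up vectors, not how any particular engine implements it:

```python
import numpy as np

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Toy collection of embeddings; real ones would come from an encoder model.
points = {
    "doc_1": np.array([0.9, 0.1, 0.2]),
    "doc_2": np.array([0.2, 0.8, 0.5]),
    "doc_3": np.array([-0.7, 0.3, 0.1]),
}
query = np.array([0.8, 0.2, 0.1])

# A nearest search sorts by descending similarity; a dissimilarity search
# simply sorts ascending, surfacing the farthest points first.
most_dissimilar = sorted(points, key=lambda p: cosine_similarity(query, points[p]))
print(most_dissimilar)
```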
+- **Multi-vector Storage**: This feature was crucial as it allowed the team to store and manage different search indexes. Given that no single embedding fits all use cases, this capability brought essential diversity to their embeddings, enabling more flexible and robust data handling. +- **Easy Setup**: Qdrant's straightforward setup process enabled Kern AI to quickly integrate and start utilizing the database without extensive overhead, which was critical for maintaining development momentum. +- **Open Source**: The open-source nature of Qdrant aligned with Kern AI's own product development philosophy. This allowed for greater customization and integration into their existing open-source projects. +- **Rapid Progress**: Qdrant's swift advancements and frequent updates ensured that Kern AI could rely on continuous improvements and cutting-edge features to keep their solutions competitive. +- **Multi-vector Search**: Allowed Kern AI to perform complex queries across different embeddings simultaneously, enhancing the depth and accuracy of their search results. +- **Hybrid Search/Filters**: Enabled the combination of traditional keyword searches with vector searches, allowing for more nuanced and precise data retrieval. -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#dissimilarity-ssearch) Dissimilarity ssearch +Kern AI uses Qdrant's open-source, on-premise solution for both their open-source project and their commercial end-to-end framework. This framework, focused on the financial and insurance markets, is similar to LangChain or LlamaIndex but tailored to the industry-specific needs. -The Dissimilarity —or farthest— search is the most straightforward concept after the nearest search, which can’t be reproduced in a traditional full-text search. -It aims to find the most un-similar or distant documents across the collection. +![kern-data-retrieval](/blog/case-study-kern/kern-data-retrieval.png) -![Dissimilarity Search](https://qdrant.tech/articles_data/vector-similarity-beyond-search/dissimilarity.png) +*Configuring data retrieval in Kern AI: Fine-tuning search inputs and metadata for optimized information extraction.* -Dissimilarity Search +## The Results -Unlike full-text match, Vector similarity can compare any pair of documents (or points) and assign a similarity score. -It doesn’t rely on keywords or other metadata. -With vector similarity, we can easily achieve a dissimilarity search by inverting the search objective from maximizing similarity to minimizing it. +Kern AI's primary use case focuses on enhancing customer service with extreme precision. Leveraging Qdrant's advanced vector search capabilities, Kern AI consistently maintains hallucination rates under 1%. This exceptional accuracy allows them to build the most precise RAG (Retrieval-Augmented Generation) chatbot for financial services. -The dissimilarity search can find items in areas where previously no other search could be used. -Let’s look at a few examples. +Key Achievements: -### [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#case-mislabeling-detection) Case: mislabeling detection +- **<1% Hallucination Rate**: Ensures the highest level of accuracy and reliability in their chatbot solutions for the financial and insurance sector. +- **Reduced Customer Service Response Times**: Using Kern AI's solution, Markel Insurance SE reduced response times from five minutes to under 30 seconds, significantly improving customer experience and operational efficiency. 
-For example, we have a dataset of furniture in which we have classified our items into what kind of furniture they are: tables, chairs, lamps, etc. -To ensure our catalog is accurate, we can use a dissimilarity search to highlight items that are most likely mislabeled. +By utilizing Qdrant, Kern AI effectively supports various use cases in financial services, such as: -To do this, we only need to search for the most dissimilar items using the -embedding of the category title itself as a query. -This can be too broad, so, by combining it with filters —a [Qdrant superpower](https://qdrant.tech/articles/filtrable-hnsw/)—, we can narrow down the search to a specific category. +- **Claims Management**: Streamlining the claims process by quickly identifying relevant data points. +- **Similarity Search**: Enhancing incident handling by finding similar cases to improve decision-making quality. -![Mislabeling Detection](https://qdrant.tech/articles_data/vector-similarity-beyond-search/mislabelling.png) +## Outlook -Mislabeling Detection +Kern AI plans to expand its use of Qdrant to support both brownfield and greenfield use cases across the financial and insurance industry. -The output of this search can be further processed with heavier models or human supervision to detect actual mislabeling. +<|page-344-lllmstxt|> +[Qdrant 1.11.0 is out!](https://github.com/qdrant/qdrant/releases/tag/v1.11.0) This release largely focuses on features that improve memory usage and optimize segments. However, there are a few cool minor features, so let's look at the whole list: -### [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#case-outlier-detection) Case: outlier detection +Optimized Data Structures:
+**Defragmentation:** Storage for multitenant workloads is more optimized and scales better.
+**On-Disk Payload Index:** Store less frequently used data on disk, rather than in RAM.
+**UUID for Payload Index:** Additional data types for payload can result in big memory savings. -In some cases, we might not even have labels, but it is still possible to try to detect anomalies in our dataset. -Dissimilarity search can be used for this purpose as well. +Improved Query API:
+**GroupBy Endpoint:** Use this query method to group results by a certain payload field.
+**Random Sampling:** Select a subset of data points from a larger dataset randomly.
+**Hybrid Search Fusion:** We are adding the Distribution-Based Score Fusion (DBSF) method.
-![Anomaly Detection](https://qdrant.tech/articles_data/vector-similarity-beyond-search/anomaly-detection.png) +New Web UI Tools:
+**Search Quality Tool:** Test the precision of your semantic search requests in real-time.
+**Graph Exploration Tool:** Visualize vector search in context-based exploratory scenarios.
-Anomaly Detection +### Quick Recap: Multitenant Workloads -The only thing we need is a bunch of reference points that we consider “normal”. -Then we can search for the most dissimilar points to this reference set and use them as candidates for further analysis. +Before we dive into the specifics of our optimizations, let's first go over Multitenancy. This is one of our most significant features, [best used for scaling and data isolation](https://qdrant.tech/articles/multitenancy/). -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#diversity-search) Diversity search +If you’re using Qdrant to manage data for multiple users, regions, or workspaces (tenants), we suggest setting up a [multitenant environment](/documentation/guides/multiple-partitions/). This approach keeps all tenant data in a single global collection, with points separated and isolated by their payload. -Even with no input provided vector, (dis-)similarity can improve an overall selection of items from the dataset. +To avoid slow and unnecessary indexing, it’s better to create an index for each relevant payload rather than indexing the entire collection globally. Since some data is indexed more frequently, you can focus on building indexes for specific regions, workspaces, or users. -The naive approach is to do random sampling. -However, unless our dataset has a uniform distribution, the results of such sampling might be biased toward more frequent types of items. +*For more details on scaling best practices, read [How to Implement Multitenancy and Custom Sharding](https://qdrant.tech/articles/multitenancy/).* -![Example of random sampling](https://qdrant.tech/articles_data/vector-similarity-beyond-search/diversity-random.png) +### Defragmentation of Tenant Storage -Example of random sampling +With version 1.11, Qdrant changes how vectors from the same tenant are stored on disk, placing them **closer together** for faster bulk reading and reduced scaling costs. This approach optimizes storage and retrieval operations for different tenants, leading to more efficient system performance and resource utilization. -The similarity information can increase the diversity of those results and make the first overview more interesting. -That is especially useful when users do not yet know what they are looking for and want to explore the dataset. +**Figure 1:** Re-ordering by payload can significantly speed up access to hot and cold data. -![Example of similarity-based sampling](https://qdrant.tech/articles_data/vector-similarity-beyond-search/diversity-force.png) +![defragmentation](/blog/qdrant-1.11.x/defragmentation.png) -Example of similarity-based sampling +**Example:** When creating an index, you may set `is_tenant=true`. This configuration will optimize the storage based on your collection’s usage patterns. -The power of vector similarity, in the context of being able to compare any two points, allows making a diverse selection of the collection possible without any labeling efforts. -By maximizing the distance between all points in the response, we can have an algorithm that will sequentially output dissimilar results. 
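One way to picture such an algorithm is a greedy max-min selection: seed with one point, then repeatedly pick the candidate that is farthest from everything already chosen. The sketch below is only an illustration of that idea on raw embeddings (the data is made up), not how any particular engine implements diversity sampling:

```python
import numpy as np

def diverse_sample(vectors: np.ndarray, k: int) -> list[int]:
    """Greedily pick k indices, each maximizing its distance to the picks so far."""
    selected = [0]  # seed with an arbitrary first point
    while len(selected) < k:
        chosen = vectors[selected]
        # For every candidate, the distance to its closest already-selected point.
        dists = np.linalg.norm(vectors[:, None, :] - chosen[None, :, :], axis=-1).min(axis=1)
        dists[selected] = -1.0  # never re-pick a selected point
        selected.append(int(dists.argmax()))
    return selected

vectors = np.random.rand(100, 64)  # toy embeddings
print(diverse_sample(vectors, k=5))
```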
+```http +PUT /collections/{collection_name}/index +{ + "field_name": "group_id", + "field_schema": { + "type": "keyword", + "is_tenant": true + } +} +``` -![Diversity Search](https://qdrant.tech/articles_data/vector-similarity-beyond-search/diversity.png) +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="group_id", + field_schema=models.KeywordIndexParams( + type="keyword", + is_tenant=True, + ), +) +``` -Diversity Search +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "group_id", + field_schema: { + type: "keyword", + is_tenant: true, + }, +}); +``` -Some forms of diversity sampling are already used in the industry and are known as [Maximum Margin Relevance](https://python.langchain.com/docs/integrations/vectorstores/qdrant#maximum-marginal-relevance-search-mmr) (MMR). Techniques like this were developed to enhance similarity on a universal search API. -However, there is still room for new ideas, particularly regarding diversity retrieval. -By utilizing more advanced vector-native engines, it could be possible to take use cases to the next level and achieve even better results. +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + KeywordIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#vector-similarity-recommendations) Vector similarity recommendations +let client = Qdrant::from_url("http://localhost:6334").build()?; -Vector similarity can go above a single query vector. -It can combine multiple positive and negative examples for a more accurate retrieval. -Building a recommendation API in a vector database can take advantage of using already stored vectors as part of the queries, by specifying the point id. -Doing this, we can skip query-time neural network inference, and make the recommendation search faster. +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "group_id", + FieldType::Keyword, + ).field_index_params( + KeywordIndexParamsBuilder::default() + .is_tenant(true) + ) + ).await?; +``` -There are multiple ways to implement recommendations with vectors. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.KeywordIndexParams; -### [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#vector-features-recommendations) Vector-features recommendations +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -The first approach is to take all positive and negative examples and average them to create a single query vector. -In this technique, the more significant components of positive vectors are canceled out by the negative ones, and the resulting vector is a combination of all the features present in the positive examples, but not in the negative ones. 
+client + .createPayloadIndexAsync( + "{collection_name}", + "group_id", + PayloadSchemaType.Keyword, + PayloadIndexParams.newBuilder() + .setKeywordIndexParams( + KeywordIndexParams.newBuilder() + .setIsTenant(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -![Vector-Features Based Recommendations](https://qdrant.tech/articles_data/vector-similarity-beyond-search/feature-based-recommendations.png) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Vector-Features Based Recommendations +var client = new QdrantClient("localhost", 6334); -This approach is already implemented in Qdrant, and while it works great when the vectors are assumed to have each of their dimensions represent some kind of feature of the data, sometimes distances are a better tool to judge negative and positive examples. +await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "group_id", + schemaType: PayloadSchemaType.Keyword, + indexParams: new PayloadIndexParams + { + KeywordIndexParams = new KeywordIndexParams + { + IsTenant = true + } + } +); -### [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#relative-distance-recommendations) Relative distance recommendations +``` -Another approach is to use the distance between negative examples to the candidates to help them create exclusion areas. -In this technique, we perform searches near the positive examples while excluding the points that are closer to a negative example than to a positive one. +As a result, the storage structure will be organized in a way to co-locate vectors of the same tenant together at the next optimization. -![Relative Distance Recommendations](https://qdrant.tech/articles_data/vector-similarity-beyond-search/relative-distance-recommendations.png) +*To learn more about defragmentation, read the [Multitenancy documentation](/documentation/guides/multiple-partitions/).* -Relative Distance Recommendations +### On-Disk Support for the Payload Index -The main use-case of both approaches —of course— is to take some history of user interactions and recommend new items based on it. +When managing billions of records across millions of tenants, keeping all data in RAM is inefficient. That is especially true when only a small subset is frequently accessed. As of 1.11, you can offload "cold" data to disk and cache the “hot” data in RAM. -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#discovery) Discovery +*This feature can help you manage a high number of different payload indexes, which is beneficial if you are working with large varied datasets.* -In many exploration scenarios, the desired destination is not known in advance. -The search process in this case can consist of multiple steps, where each step would provide a little more information to guide the search in the right direction. +**Figure 2:** By moving the data from Workspace 2 to disk, the system can free up valuable memory resources for Workspaces 1, 3 and 4, which are accessed more frequently. -To get more intuition about the possible ways to implement this approach, let’s take a look at how similarity modes are trained in the first place: +![on-disk-payload](/blog/qdrant-1.11.x/on-disk-payload.png) -The most well-known loss function used to train similarity models is a [triplet-loss](https://en.wikipedia.org/wiki/Triplet_loss). -In this loss, the model is trained by fitting the information of relative similarity of 3 objects: the Anchor, Positive, and Negative examples. 
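Written out, the loss is just a hinge over two distances: the anchor should sit closer to the positive than to the negative by some margin. The snippet below is a generic illustration with an arbitrary margin value, not any specific training framework:

```python
import numpy as np

def triplet_loss(anchor, positive, negative, margin=0.5):
    # Zero loss once the anchor is closer to the positive than to the
    # negative by at least `margin`; otherwise the remaining gap is penalized.
    d_pos = np.linalg.norm(anchor - positive)
    d_neg = np.linalg.norm(anchor - negative)
    return max(d_pos - d_neg + margin, 0.0)
```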
+**Example:** As you create an index for Workspace 2, set the `on_disk` parameter. -![Triplet Loss](https://qdrant.tech/articles_data/vector-similarity-beyond-search/triplet-loss.png) +```http +PUT /collections/{collection_name}/index +{ + "field_name": "group_id", + "field_schema": { + "type": "keyword", + "is_tenant": true, + "on_disk": true + } +} +``` -Triplet Loss +```python +client.create_payload_index( + collection_name="{collection_name}", + field_name="group_id", + field_schema=models.KeywordIndexParams( + type="keyword", + is_tenant=True, + on_disk=True, + ), +) +``` -Using the same mechanics, we can look at the training process from the other side. -Given a trained model, the user can provide positive and negative examples, and the goal of the discovery process is then to find suitable anchors across the stored collection of vectors. +```typescript +client.createPayloadIndex("{collection_name}", { + field_name: "group_id", + field_schema: { + type: "keyword", + is_tenant: true, + on_disk: true + }, +}); +``` -![Reversed triplet loss](https://qdrant.tech/articles_data/vector-similarity-beyond-search/discovery.png) +```rust +use qdrant_client::qdrant::{ + CreateFieldIndexCollectionBuilder, + KeywordIndexParamsBuilder, + FieldType +}; +use qdrant_client::{Qdrant, QdrantError}; -Reversed triplet loss +let client = Qdrant::from_url("http://localhost:6334").build()?; -Multiple positive-negative pairs can be provided to make the discovery process more accurate. -Worth mentioning, that as well as in NN training, the dataset may contain noise and some portion of contradictory information, so a discovery process should be tolerant of this kind of data imperfections. +client.create_field_index( + CreateFieldIndexCollectionBuilder::new( + "{collection_name}", + "group_id", + FieldType::Keyword, + ) + .field_index_params( + KeywordIndexParamsBuilder::default() + .is_tenant(true) + .on_disk(true), + ), +); +``` -![Sample pairs](https://qdrant.tech/articles_data/vector-similarity-beyond-search/discovery-noise.png) +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.PayloadIndexParams; +import io.qdrant.client.grpc.Collections.PayloadSchemaType; +import io.qdrant.client.grpc.Collections.KeywordIndexParams; -Sample pairs +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -The important difference between this and the recommendation method is that the positive-negative pairs in the discovery method don’t assume that the final result should be close to positive, it only assumes that it should be closer than the negative one. +client + .createPayloadIndexAsync( + "{collection_name}", + "group_id", + PayloadSchemaType.Keyword, + PayloadIndexParams.newBuilder() + .setKeywordIndexParams( + KeywordIndexParams.newBuilder() + .setIsTenant(true) + .setOnDisk(true) + .build()) + .build(), + null, + null, + null) + .get(); +``` -![Discovery vs Recommendation](https://qdrant.tech/articles_data/vector-similarity-beyond-search/discovery-vs-recommendations.png) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Discovery vs Recommendation +var client = new QdrantClient("localhost", 6334); -In combination with filtering or similarity search, the additional context information provided by the discovery pairs can be used as a re-ranking factor. 
+await client.CreatePayloadIndexAsync( + collectionName: "{collection_name}", + fieldName: "group_id", + schemaType: PayloadSchemaType.Keyword, + indexParams: new PayloadIndexParams + { + KeywordIndexParams = new KeywordIndexParams + { + IsTenant = true, + OnDisk = true + } + } +); -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#a-new-api-stack-for-vector-databases) A new API stack for vector databases +``` -When you introduce vector similarity capabilities into your text search engine, you extend its functionality. -However, it doesn’t work the other way around, as the vector similarity as a concept is much broader than some task-specific implementations of full-text search. +By moving the index to disk, Qdrant can handle larger datasets that exceed the capacity of RAM, making the system more scalable and capable of storing more data without being constrained by memory limitations. -[Vector databases](https://qdrant.tech/), which introduce built-in full-text functionality, must make several compromises: +*To learn more about this, read the [Indexing documentation](/documentation/concepts/indexing/).* -- Choose a specific full-text search variant. -- Either sacrifice API consistency or limit vector similarity functionality to only basic kNN search. -- Introduce additional complexity to the system. +### UUID Datatype for the Payload Index -Qdrant, on the contrary, puts vector similarity in the center of its API and architecture, such that it allows us to move towards a new stack of vector-native operations. -We believe that this is the future of vector databases, and we are excited to see what new use-cases will be unlocked by these techniques. +Many Qdrant users rely on UUIDs in their payloads, but storing these as strings comes with a substantial memory overhead—approximately 36 bytes per UUID. In reality, UUIDs only require 16 bytes of storage when stored as raw bytes. -## [Anchor](https://qdrant.tech/articles/vector-similarity-beyond-search/\#key-takeaways) Key takeaways: +To address this inefficiency, we’ve developed a new index type tailored specifically for UUIDs that stores them internally as bytes, **reducing memory usage by up to 2.25x.** -- Vector similarity offers advanced data exploration tools beyond traditional full-text search, including dissimilarity search, diversity sampling, and recommendation systems. -- Practical applications of vector similarity include improving data quality through mislabeling detection and anomaly identification. -- Enhanced user experiences are achieved by leveraging advanced search techniques, providing users with intuitive data exploration, and improving decision-making processes. +**Example:** When adding two separate points, indicate their UUID in the payload. In this example, both data points belong to the same user (with the same UUID). -Ready to unlock the full potential of your data? [Try a free demo](https://qdrant.tech/contact-us/) to explore how vector similarity can revolutionize your data insights and drive smarter decision-making. +```http +PUT /collections/{collection_name}/points +{ + "points": [ + { + "id": 1, + "vector": [0.05, 0.61, 0.76, 0.74], + "payload": {"id": 550e8400-e29b-41d4-a716-446655440000} + }, + { + "id": 2, + "vector": [0.19, 0.81, 0.75, 0.11], + "payload": {"id": 550e8400-e29b-41d4-a716-446655440000} + }, + ] +} +``` -##### Was this page useful? +> For organizations that have numerous users and UUIDs, this simple fix can significantly reduce the cluster size and improve efficiency. 
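The arithmetic behind that saving can be checked with Python's standard library: the canonical text form of a UUID is 36 characters, while the value itself fits in 16 raw bytes, which is where the roughly 2.25x figure comes from:

```python
import uuid

u = uuid.uuid4()
print(len(str(u)))                 # 36 characters in the hyphenated string form
print(len(u.bytes))                # 16 bytes as raw bytes, the form the new index stores
print(len(str(u)) / len(u.bytes))  # 2.25
```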
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +*To learn more about this, read the [Payload documentation](/documentation/concepts/payload/).* -Thank you for your feedback! 🙏 +### Query API: Groups Endpoint -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-similarity-beyond-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +When searching over data, you can group results by specific payload field, which is useful when you have multiple data points for the same item and you want to avoid redundant entries in the results. -On this page: +**Example:** If a large document is divided into several chunks, and you need to search or make recommendations on a per-document basis, you can group the results by the `document_id`. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/vector-similarity-beyond-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +```http +POST /collections/{collection_name}/points/query/groups +{ + "query": [0.01, 0.45, 0.67], + group_by="document_id", # Path of the field to group by + limit=4, # Max amount of groups + group_size=2, # Max amount of points per group +} +``` -× +```python +from qdrant_client import QdrantClient, models -[Powered by](https://qdrant.tech/) +client = QdrantClient(url="http://localhost:6333") -<|page-173-lllmstxt|> -## what-is-rag-in-ai -- [Articles](https://qdrant.tech/articles/) -- What is RAG: Understanding Retrieval-Augmented Generation +client.query_points_groups( + collection_name="{collection_name}", + query=[0.01, 0.45, 0.67], + group_by="document_id", + limit=4, + group_size=2, +) +``` -[Back to RAG & GenAI](https://qdrant.tech/articles/rag-and-genai/) +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -# What is RAG: Understanding Retrieval-Augmented Generation +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Sabrina Aquino +client.queryGroups("{collection_name}", { + query: [0.01, 0.45, 0.67], + group_by: "document_id", + limit: 4, + group_size: 2, +}); +``` -· +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Query, QueryPointsBuilder}; -March 19, 2024 +let client = Qdrant::from_url("http://localhost:6334").build()?; -![What is RAG: Understanding Retrieval-Augmented Generation](https://qdrant.tech/articles_data/what-is-rag-in-ai/preview/title.jpg) +client.query_groups( + QueryPointGroupsBuilder::new("{collection_name}", "document_id") + .query(Query::from(vec![0.01, 0.45, 0.67])) + .limit(4u64) + .group_size(2u64) +).await?; +``` -> Retrieval-augmented generation (RAG) integrates external information retrieval into the process of generating responses by Large Language Models (LLMs). It searches a database for information beyond its pre-trained knowledge base, significantly improving the accuracy and relevance of the generated responses. +```java +import static io.qdrant.client.QueryFactory.nearest; -Language models have exploded on the internet ever since ChatGPT came out, and rightfully so. They can write essays, code entire programs, and even make memes (though we’re still deciding on whether that’s a good thing). 
+import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.QueryPointGroups; -But as brilliant as these chatbots become, they still have **limitations** in tasks requiring external knowledge and factual information. Yes, it can describe the honeybee’s waggle dance in excruciating detail. But they become far more valuable if they can generate insights from **any data** that we provide, rather than just their original training data. Since retraining those large language models from scratch costs millions of dollars and takes months, we need better ways to give our existing LLMs access to our custom data. +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -While you could be more creative with your prompts, it is only a short-term solution. LLMs can consider only a **limited** amount of text in their responses, known as a [context window](https://www.hopsworks.ai/dictionary/context-window-for-llms). Some models like GPT-3 can see up to around 12 pages of text (that’s 4,096 tokens of context). That’s not good enough for most knowledge bases. +client + .queryGroupsAsync( + QueryPointGroups.newBuilder() + .setCollectionName("{collection_name}") + .setGroupBy("document_id") + .setQuery(nearest(0.01f, 0.45f, 0.67f)) + .setLimit(4) + .setGroupSize(2) + .build()) + .get(); +``` -![How a RAG works](https://qdrant.tech/articles_data/what-is-rag-in-ai/how-rag-works.jpg) +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -The image above shows how a basic RAG system works. Before forwarding the question to the LLM, we have a layer that searches our knowledge base for the “relevant knowledge” to answer the user query. Specifically, in this case, the spending data from the last month. Our LLM can now generate a **relevant non-hallucinated** response about our budget. +var client = new QdrantClient("localhost", 6334); -As your data grows, you’ll need [efficient ways](https://qdrant.tech/rag/rag-evaluation-guide/) to identify the most relevant information for your LLM’s limited memory. This is where you’ll want a proper way to store and retrieve the specific data you’ll need for your query, without needing the LLM to remember it. +await client.QueryGroupsAsync( + collectionName: "{collection_name}", + groupBy: "document_id", + query: new float[] { + 0.01f, 0.45f, 0.67f + }, + limit: 4, + groupSize: 2 +); +``` -**Vector databases** store information as **vector embeddings**. This format supports efficient similarity searches to retrieve relevant data for your query. For example, Qdrant is specifically designed to perform fast, even in scenarios dealing with billions of vectors. +This endpoint will retrieve the best N points for each document, assuming that the payload of the points contains the document ID. Sometimes, the best N points cannot be fulfilled due to lack of points or a big distance with respect to the query. In every case, the `group_size` is a best-effort parameter, similar to the limit parameter. -This article will focus on RAG systems and architecture. If you’re interested in learning more about vector search, we recommend the following articles: [What is a Vector Database?](https://qdrant.tech/articles/what-is-a-vector-database/) and [What are Vector Embeddings?](https://qdrant.tech/articles/what-are-embeddings/). 
+*For more information on grouping capabilities refer to our [Hybrid Queries documentation](/documentation/concepts/hybrid-queries/).* -## [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#rag-architecture) RAG architecture +### Query API: Random Sampling -At its core, a RAG architecture includes the **retriever** and the **generator**. Let’s start by understanding what each of these components does. +Our [Food Discovery Demo](https://food-discovery.qdrant.tech) always shows a random sample of foods from the larger dataset. Now you can do the same and set the randomization from a basic Query API endpoint. -### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#the-retriever) The Retriever +When calling the Query API, you will be able to select a subset of data points from a larger dataset randomly. -When you ask a question to the retriever, it uses **similarity search** to scan through a vast knowledge base of vector embeddings. It then pulls out the most **relevant** vectors to help answer that query. There are a few different techniques it can use to know what’s relevant: +*This technique is often used to reduce the computational load, improve query response times, or provide a representative sample of the data for various analytical purposes.* -#### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#how-indexing-works-in-rag-retrievers) How indexing works in RAG retrievers +**Example:** When querying the collection, you can configure it to retrieve a random sample of data. -The indexing process organizes the data into your vector database in a way that makes it easily searchable. This allows the RAG to access relevant information when responding to a query. +```python +from qdrant_client import QdrantClient, models -![How indexing works](https://qdrant.tech/articles_data/what-is-rag-in-ai/how-indexing-works.jpg) +client = QdrantClient(url="http://localhost:6333") -As shown in the image above, here’s the process: +# Random sampling (as of 1.11.0) +sampled = client.query_points( + collection_name="{collection_name}", + query=models.SampleQuery(sample=models.Sample.Random) +) +``` -- Start with a _loader_ that gathers _documents_ containing your data. These documents could be anything from articles and books to web pages and social media posts. -- Next, a _splitter_ divides the documents into smaller chunks, typically sentences or paragraphs. -- This is because RAG models work better with smaller pieces of text. In the diagram, these are _document snippets_. -- Each text chunk is then fed into an _embedding machine_. This machine uses complex algorithms to convert the text into [vector embeddings](https://qdrant.tech/articles/what-are-embeddings/). +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -All the generated vector embeddings are stored in a knowledge base of indexed information. This supports efficient retrieval of similar pieces of information when needed. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -#### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#query-vectorization) Query vectorization +let sampled = client.query("{collection_name}", { + query: { sample: "random" }, +}); +``` -Once you have vectorized your knowledge base you can do the same to the user query. When the model sees a new query, it uses the same preprocessing and embedding techniques. This ensures that the query vector is compatible with the document vectors in the index. 
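A minimal sketch of that idea is below; it assumes the `sentence-transformers` package and uses `all-MiniLM-L6-v2` purely as an example model, since any encoder works as long as documents and queries pass through the same one:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # example model, swap in your own

document_chunks = [
    "Qdrant is a vector database built for similarity search.",
    "RAG retrieves relevant context before the LLM answers.",
]
doc_vectors = model.encode(document_chunks)                          # indexed ahead of time
query_vector = model.encode("How does RAG find relevant context?")   # same model at query time
print(doc_vectors.shape, query_vector.shape)                         # compatible dimensionality
```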
+```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Query, QueryPointsBuilder, Sample}; -![How retrieval works](https://qdrant.tech/articles_data/what-is-rag-in-ai/how-retrieval-works.jpg) +let client = Qdrant::from_url("http://localhost:6334").build()?; -#### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#retrieval-of-relevant-documents) Retrieval of relevant documents +let sampled = client + .query( + QueryPointsBuilder::new("{collection_name}").query(Query::new_sample(Sample::Random)), + ) + .await?; +``` -When the system needs to find the most relevant documents or passages to answer a query, it utilizes vector similarity techniques. **Vector similarity** is a fundamental concept in machine learning and natural language processing (NLP) that quantifies the resemblance between vectors, which are mathematical representations of data points. +```java +import static io.qdrant.client.QueryFactory.sample; -The system can employ different vector similarity strategies depending on the type of vectors used to represent the data: +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Sample; +import io.qdrant.client.grpc.Points.QueryPoints; + +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -##### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#sparse-vector-representations) Sparse vector representations +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .setQuery(sample(Sample.Random)) + .build()) + .get(); +``` -A sparse vector is characterized by a high dimensionality, with most of its elements being zero. +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -The classic approach is **keyword search**, which scans documents for the exact words or phrases in the query. The search creates sparse vector representations of documents by counting word occurrences and inversely weighting common words. Queries with rarer words get prioritized. +var client = new QdrantClient("localhost", 6334); -![Sparse vector representation](https://qdrant.tech/articles_data/what-is-rag-in-ai/sparse-vectors.jpg) +await client.QueryAsync( + collectionName: "{collection_name}", + query: Sample.Random +); +``` -[TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) (Term Frequency-Inverse Document Frequency) and [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) are two classic related algorithms. They’re simple and computationally efficient. However, they can struggle with synonyms and don’t always capture semantic similarities. +*To learn more, check out the [Query API documentation](/documentation/concepts/hybrid-queries/).* -If you’re interested in going deeper, refer to our article on [Sparse Vectors](https://qdrant.tech/articles/sparse-vectors/). +### Query API: Distribution-Based Score Fusion -##### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#dense-vector-embeddings) Dense vector embeddings +In version 1.10, we added Reciprocal Rank Fusion (RRF) as a way of fusing results from Hybrid Queries. Now we are adding Distribution-Based Score Fusion (DBSF). Michelangiolo Mazzeschi talks more about this fusion method in his latest [Medium article](https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18). 
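As a rough, engine-agnostic sketch of what distribution-based fusion does (normalize each result list using its own mean plus or minus three standard deviations as the limits, then sum the normalized scores of the same point across lists), consider the following; it is illustrative only and not Qdrant's internal implementation:

```python
from collections import defaultdict
from statistics import mean, stdev

def dbsf(result_lists):
    """Fuse ranked lists of (point_id, score) pairs with distribution-based normalization."""
    fused = defaultdict(float)
    for results in result_lists:
        scores = [score for _, score in results]
        mu = mean(scores)
        sigma = stdev(scores) if len(scores) > 1 else 0.0
        lo, hi = mu - 3 * sigma, mu + 3 * sigma
        for point_id, score in results:
            # Degenerate case (all scores equal) falls back to a neutral 0.5.
            normalized = (score - lo) / (hi - lo) if hi > lo else 0.5
            fused[point_id] += normalized
    return sorted(fused.items(), key=lambda kv: kv[1], reverse=True)

# Toy example: fuse a sparse and a dense result list over overlapping points.
sparse = [("a", 12.0), ("b", 7.5), ("c", 0.4)]
dense = [("b", 0.91), ("c", 0.88), ("d", 0.35)]
print(dbsf([sparse, dense]))
```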
-This approach uses large language models like [BERT](https://en.wikipedia.org/wiki/BERT_%28language_model%29) to encode the query and passages into dense vector embeddings. These models are compact numerical representations that capture semantic meaning. Vector databases like Qdrant store these embeddings, allowing retrieval based on **semantic similarity** rather than just keywords using distance metrics like cosine similarity. +*DBSF normalizes the scores of the points in each query, using the mean +/- the 3rd standard deviation as limits, and then sums the scores of the same point across different queries.* -This allows the retriever to match based on semantic understanding rather than just keywords. So if I ask about “compounds that cause BO,” it can retrieve relevant info about “molecules that create body odor” even if those exact words weren’t used. We explain more about it in our [What are Vector Embeddings](https://qdrant.tech/articles/what-are-embeddings/) article. +**Example:** To fuse `prefetch` results from sparse and dense queries, set `"fusion": "dbsf"` -#### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#hybrid-search) Hybrid search +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": [ + { + "query": { + "indices": [1, 42], // <┐ + "values": [0.22, 0.8] // <┮─Sparse vector + }, + "using": "sparse", + "limit": 20 + }, + { + "query": [0.01, 0.45, 0.67, ...], // <-- Dense vector + "using": "dense", + "limit": 20 + } + ], + "query": { "fusion": “dbsf" }, // <--- Distribution Based Score Fusion + "limit": 10 +} +``` -However, neither keyword search nor vector search are always perfect. Keyword search may miss relevant information expressed differently, while vector search can sometimes struggle with specificity or neglect important statistical word patterns. Hybrid methods aim to combine the strengths of different techniques. +```python +from qdrant_client import QdrantClient, models -![Hybrid search overview](https://qdrant.tech/articles_data/what-is-rag-in-ai/hybrid-search.jpg) +client = QdrantClient(url="http://localhost:6333") -Some common hybrid approaches include: +client.query_points( + collection_name="{collection_name}", + prefetch=[ + models.Prefetch( + query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]), + using="sparse", + limit=20, + ), + models.Prefetch( + query=[0.01, 0.45, 0.67, ...], # <-- dense vector + using="dense", + limit=20, + ), + ], + query=models.FusionQuery(fusion=models.Fusion.DBSF), +) +``` -- Using keyword search to get an initial set of candidate documents. Next, the documents are re-ranked/re-scored using semantic vector representations. -- Starting with semantic vectors to find generally topically relevant documents. Next, the documents are filtered/re-ranked e based on keyword matches or other metadata. -- Considering both semantic vector closeness and statistical keyword patterns/weights in a combined scoring model. -- Having multiple stages were different techniques. One example: start with an initial keyword retrieval, followed by semantic re-ranking, then a final re-ranking using even more complex models. +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -When you combine the powers of different search methods in a complementary way, you can provide higher quality, more comprehensive results. Check out our article on [Hybrid Search](https://qdrant.tech/articles/hybrid-search/) if you’d like to learn more. 
+const client = new QdrantClient({ host: "localhost", port: 6333 }); -### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#the-generator) The Generator +client.query("{collection_name}", { + prefetch: [ + { + query: { + values: [0.22, 0.8], + indices: [1, 42], + }, + using: 'sparse', + limit: 20, + }, + { + query: [0.01, 0.45, 0.67], + using: 'dense', + limit: 20, + }, + ], + query: { + fusion: 'dbsf', + }, +}); +``` -With the top relevant passages retrieved, it’s now the generator’s job to produce a final answer by synthesizing and expressing that information in natural language. +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Fusion, PrefetchQueryBuilder, Query, QueryPointsBuilder}; -The LLM is typically a model like GPT, BART or T5, trained on massive datasets to understand and generate human-like text. It now takes not only the query (or question) as input but also the relevant documents or passages that the retriever identified as potentially containing the answer to generate its response. +let client = Qdrant::from_url("http://localhost:6334").build()?; -![How a Generator works](https://qdrant.tech/articles_data/what-is-rag-in-ai/how-generation-works.png) +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest([(1, 0.22), (42, 0.8)].as_slice())) + .using("sparse") + .limit(20u64) + ) + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .using("dense") + .limit(20u64) + ) + .query(Query::new_fusion(Fusion::Dbsf)) +).await?; +``` -The retriever and generator don’t operate in isolation. The image bellow shows how the output of the retrieval feeds the generator to produce the final generated response. +```java +import static io.qdrant.client.QueryFactory.nearest; -![The entire architecture of a RAG system](https://qdrant.tech/articles_data/what-is-rag-in-ai/rag-system.jpg) +import java.util.List; -## [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#where-is-rag-being-used) Where is RAG being used? +import static io.qdrant.client.QueryFactory.fusion; -Because of their more knowledgeable and contextual responses, we can find RAG models being applied in many areas today, especially those who need factual accuracy and knowledge depth. +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Fusion; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; -### [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#real-world-applications) Real-World Applications: +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -**Question answering:** This is perhaps the most prominent use case for RAG models. They power advanced question-answering systems that can retrieve relevant information from large knowledge bases and then generate fluent answers. 
+client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.22f, 0.8f), List.of(1, 42))) + .setUsing("sparse") + .setLimit(20) + .build()) + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.01f, 0.45f, 0.67f))) + .setUsing("dense") + .setLimit(20) + .build()) + .setQuery(fusion(Fusion.DBSF)) + .build()) + .get(); +``` -**Language generation:** RAG enables more factual and contextualized text generation for contextualized text summarization from multiple sources +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -**Data-to-text generation:** By retrieving relevant structured data, RAG models can generate product/business intelligence reports from databases or describing insights from data visualizations and charts +var client = new QdrantClient("localhost", 6334); + +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List < PrefetchQuery > { + new() { + Query = new(float, uint)[] { + (0.22f, 1), (0.8f, 42), + }, + Using = "sparse", + Limit = 20 + }, + new() { + Query = new float[] { + 0.01f, 0.45f, 0.67f + }, + Using = "dense", + Limit = 20 + } + }, + query: Fusion.Dbsf +); +``` -**Multimedia understanding:** RAG isn’t limited to text - it can retrieve multimodal information like images, video, and audio to enhance understanding. Answering questions about images/videos by retrieving relevant textual context. +Note that `dbsf` is stateless and calculates the normalization limits only based on the results of each query, not on all the scores that it has seen. -## [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#creating-your-first-rag-chatbot-with-langchain-groq-and-openai) Creating your first RAG chatbot with Langchain, Groq, and OpenAI +*To learn more, check out the [Hybrid Queries documentation](/documentation/concepts/hybrid-queries/).* -Are you ready to create your own RAG chatbot from the ground up? We have a video explaining everything from the beginning. Daniel Romero’s will guide you through: +## Web UI: Search Quality Tool -- Setting up your chatbot -- Preprocessing and organizing data for your chatbot’s use -- Applying vector similarity search algorithms -- Enhancing the efficiency and response quality +We have updated the Qdrant Web UI with additional testing functionality. Now you can check the quality of your search requests in real time and measure it against exact search. -After building your RAG chatbot, you’ll be able to [evaluate its performance](https://qdrant.tech/rag/rag-evaluation-guide/) against that of a chatbot powered solely by a Large Language Model (LLM). +**Try it:** In the Dashboard, go to collection settings and test the **Precision** from the Search Quality menu tab. -Chatbot with RAG, using LangChain, OpenAI, and Groq - YouTube +> The feature will conduct a semantic search for each point and produce a report below. -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) + -Qdrant - Vector Database & Search Engine +## Web UI: Graph Exploration Tool -8.12K subscribers +Deeper exploration is highly dependent on expanding context. This is something we previously covered in the [Discovery Needs Context](/articles/discovery-search/) article earlier this year. 
Now, we have developed a UI feature to help you visualize how semantic search can be used for exploratory and recommendation purposes. -[Chatbot with RAG, using LangChain, OpenAI, and Groq](https://www.youtube.com/watch?v=O60-KuZZeQA) +**Try it:** Using the feature is pretty self-explanatory. Each collection's dataset can be explored from the **Graph** tab. As you see the images change, you can steer your search in the direction of specific characteristics that interest you. -Qdrant - Vector Database & Search Engine +> Search results will become more "distilled" and tailored to your preferences. -Search + -Watch later +## Next Steps -Share +If you’re new to Qdrant, now is the perfect time to start. Check out our [documentation](/documentation/) guides and see why Qdrant is the go-to solution for vector search. -Copy link +We’re very happy to bring you this latest version of Qdrant, and we can’t wait to see what you build with it. As always, your feedback is invaluable—feel free to reach out with any questions or comments on our [community forum](https://qdrant.to/discord). -Info +<|page-345-lllmstxt|> +![Kairoswealth overview](/blog/case-study-kairoswealth/image2.png) -Shopping +## **About Kairoswealth** -Tap to unmute +[Kairoswealth](https://kairoswealth.com/) is a comprehensive wealth management platform designed to provide users with a holistic view of their financial portfolio. The platform offers access to unique financial products and automates back-office operations through its AI assistant, Gaia. -If playback doesn't begin shortly, try restarting your device. +![Dashboard Kairoswealth](/blog/case-study-kairoswealth/image3.png) -More videos +## **Motivations for Adopting a Vector Database** -## More videos +“At Kairoswealth we encountered several use cases necessitating the ability to run similarity queries on large datasets. Key applications included product recommendations and retrieval-augmented generation (RAG),” says [Vincent Teyssier](https://www.linkedin.com/in/vincent-teyssier/), Chief Technology & AI Officer at Kairoswealth. These needs drove the search for a more robust and scalable vector database solution. -You're signed out +## **Challenges with Previous Solutions** -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. +“We faced several critical showstoppers with our previous vector database solution, which led us to seek an alternative,” says Teyssier. These challenges included: -CancelConfirm +- **Performance Scalability:** Significant performance degradation occurred as more data was added, despite various optimizations. +- **Robust Multi-Tenancy:** The previous solution struggled with multi-tenancy, impacting performance. +- **RAM Footprint:** High memory consumption was an issue. -Share +## **Qdrant Use Cases at Kairoswealth** -Include playlist +Kairoswealth leverages Qdrant for several key use cases: -An error occurred while retrieving sharing information. Please try again later. +- **Internal Data RAG:** Efficiently handling internal RAG use cases. +- **Financial Regulatory Reports RAG:** Managing and generating financial reports. +- **Recommendations:** Enhancing the accuracy and efficiency of recommendations with the Kairoswealth platform. 
-[Watch on](https://www.youtube.com/watch?v=O60-KuZZeQA&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +![Stock recommendation](/blog/case-study-kairoswealth/image1.png) -0:00 +## **Why Kairoswealth Chose Qdrant** -0:00 / 20:14 -‱Live +Some of the key reasons, why Kairoswealth landed on Qdrant as the vector database of choice are: -‱ +1. **High Performance with 2.4M Vectors:** “Qdrant efficiently handled the indexing of 1.2 million vectors with 16 metadata fields each, maintaining high performance with no degradation. Similarity queries and scrolls run in less than 0.3 seconds. When we doubled the dataset to 2.4 million vectors, performance remained consistent.So we decided to double that to 2.4M vectors, and it's as if we were inserting our first vector!” says Teyssier. +2. **8x Memory Efficiency:** The database storage size with Qdrant was eight times smaller than the previous solution, enabling the deployment of the entire dataset on smaller instances and saving significant infrastructure costs. +3. **Embedded Capabilities:** “Beyond simple search and similarity, Qdrant hosts a bunch of very nice features around recommendation engines, adding positive and negative examples for better spacial narrowing, efficient multi-tenancy, and many more,” says Teyssier. +4. **Support and Community:** “The Qdrant team, led by Andre Zayarni, provides exceptional support and has a strong passion for data engineering,” notes Teyssier, “the team's commitment to open-source and their active engagement in helping users, from beginners to veterans, is highly valued by Kairoswealth.” -[Watch on YouTube](https://www.youtube.com/watch?v=O60-KuZZeQA "Watch on YouTube") +## **Conclusion** -## [Anchor](https://qdrant.tech/articles/what-is-rag-in-ai/\#whats-next) What’s next? +Kairoswealth's transition to Qdrant has enabled them to overcome significant challenges related to performance, scalability, and memory efficiency, while also benefiting from advanced features and robust support. This partnership positions Kairoswealth to continue innovating in the wealth management sector, leveraging the power of AI to deliver superior services to their clients. -Have a RAG project you want to bring to life? Join our [Discord community](https://discord.gg/qdrant) where we’re always sharing tips and answering questions on vector search and retrieval. +## **Future Roadmap for Kairoswealth** -Learn more about how to properly evaluate your RAG responses: [Evaluating Retrieval Augmented Generation - a framework for assessment](https://superlinked.com/vectorhub/evaluating-retrieval-augmented-generation-a-framework-for-assessment). +Kairoswealth is seizing the opportunity to disrupt the wealth management sector, which has traditionally been underserved by technology. For example, they are developing the Kairos Terminal, a natural language interface that translates user queries into OpenBB commands (a set of tools for financial analysis and data visualization within the OpenBB Terminal). With regards to the future of the wealth management sector, Teyssier notes that “the integration of Generative AI will automate back-office tasks such as data collation, data reconciliation, and market research. 
This technology will also enable wealth managers to scale their services to broader segments, including affluent clients, by automating relationship management and interactions.” + +<|page-346-lllmstxt|> +[Qdrant 1.10.0 is out!](https://github.com/qdrant/qdrant/releases/tag/v1.10.0) This version introduces some major changes, so let's dive right in: -##### Was this page useful? +**Universal Query API:** All search APIs, including Hybrid Search, are now in one Query endpoint.
+**Built-in IDF:** We added the IDF mechanism to Qdrant's core search and indexing processes.
+**Multivector Support:** Native support for late interaction ColBERT is accessible via Query API. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## One Endpoint for All Queries -Thank you for your feedback! 🙏 +**Query API** will consolidate all search APIs into a single request. Previously, you had to work outside of the API to combine different search requests. Now these approaches are reduced to parameters of a single request, so you can avoid merging individual results. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-is-rag-in-ai.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +You can now configure the Query API request with the following parameters: -On this page: +|Parameter|Description| +|-|-| +|no parameter|Returns points by `id`| +|`nearest`|Queries nearest neighbors ([Search](/documentation/concepts/search/))| +|`fusion`|Fuses sparse/dense prefetch queries ([Hybrid Search](/documentation/concepts/hybrid-queries/#hybrid-search))| +|`discover`|Queries `target` with added `context` ([Discovery](/documentation/concepts/explore/#discovery-api))| +|`context` |No target with `context` only ([Context](/documentation/concepts/explore/#context-search))| +|`recommend`|Queries against `positive`/`negative` examples. ([Recommendation](/documentation/concepts/explore/#recommendation-api))| +|`order_by`|Orders results by [payload field](/documentation/concepts/hybrid-queries/#re-ranking-with-payload-values)| -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/what-is-rag-in-ai.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +For example, you can configure Query API to run [Discovery search](/documentation/concepts/explore/#discovery-api). Let's see how that looks: + +```http +POST collections/{collection_name}/points/query +{ + "query": { + "discover": { + "target": , + "context": [ + { + "positive": , + "negative": + } + ] + } + } +} +``` -× +We will be publishing code samples in [docs](/documentation/concepts/hybrid-queries/) and our new [API specification](http://api.qdrant.tech).
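+As a rough illustration in the meantime, here is a minimal Python sketch of the same discovery request. The target vector, the example point IDs, and the collection name are placeholders rather than values taken from this release:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.query_points(
+    collection_name="{collection_name}",
+    query=models.DiscoverQuery(
+        discover=models.DiscoverInput(
+            # the target can be a raw vector or the ID of an existing point
+            target=[0.2, 0.1, 0.9, 0.7],
+            # each context pair pulls results towards the positive example
+            # and away from the negative one
+            context=[models.ContextPair(positive=100, negative=718)],
+        )
+    ),
+    limit=10,
+)
+```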
*If you need additional support with this new method, our [Discord](https://qdrant.to/discord) on-call engineers can help you.* -[Powered by](https://qdrant.tech/) +### Native Hybrid Search Support -<|page-174-lllmstxt|> -## hybrid-cloud-cluster-creation -- [Documentation](https://qdrant.tech/documentation/) -- [Hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/) -- Create a Cluster +Query API now also natively supports **sparse/dense fusion**. Up to this point, you had to combine the results of sparse and dense searches on your own. This is now sorted on the back-end, and you only have to configure them as basic parameters for Query API. -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#creating-a-qdrant-cluster-in-hybrid-cloud) Creating a Qdrant Cluster in Hybrid Cloud +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": [ + { + "query": { + "indices": [1, 42], // <┐ + "values": [0.22, 0.8] // <┮─sparse vector + }, + "using": "sparse", + "limit": 20 + }, + { + "query": [0.01, 0.45, 0.67, ...], // <-- dense vector + "using": "dense", + "limit": 20 + } + ], + "query": { "fusion": "rrf" }, // <--- reciprocal rank fusion + "limit": 10 +} +``` -Once you have created a Hybrid Cloud Environment, you can create a Qdrant cluster in that enviroment. Use the same process to [Create a cluster](https://qdrant.tech/documentation/cloud/create-cluster/). Make sure to select your Hybrid Cloud Environment as the target. +```python +from qdrant_client import QdrantClient, models -![Create Hybrid Cloud Cluster](https://qdrant.tech/documentation/cloud/hybrid_cloud_create_cluster.png) +client = QdrantClient(url="http://localhost:6333") -Note that in the “Kubernetes Configuration” section you can additionally configure: +client.query_points( + collection_name="{collection_name}", + prefetch=[ + models.Prefetch( + query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]), + using="sparse", + limit=20, + ), + models.Prefetch( + query=[0.01, 0.45, 0.67], + using="dense", + limit=20, + ), + ], + query=models.FusionQuery(fusion=models.Fusion.RRF), +) +``` -- Node selectors for the Qdrant database pods -- Toleration for the Qdrant database pods -- Additional labels for the Qdrant database pods -- A service type and annotations for the Qdrant database service +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -These settings can also be changed after the cluster is created on the cluster detail page. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -![Create Hybrid Cloud Cluster - Kubernetes Configuration](https://qdrant.tech/documentation/cloud/hybrid_cloud_kubernetes_configuration.png) +client.query("{collection_name}", { + prefetch: [ + { + query: { + values: [0.22, 0.8], + indices: [1, 42], + }, + using: 'sparse', + limit: 20, + }, + { + query: [0.01, 0.45, 0.67], + using: 'dense', + limit: 20, + }, + ], + query: { + fusion: 'rrf', + }, +}); +``` -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#scheduling-configuration) Scheduling Configuration +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{Fusion, PrefetchQueryBuilder, Query, QueryPointsBuilder}; -When creating or editing a cluster, you can configure how the database Pods get scheduled in your Kubernetes cluster. This can be useful to ensure that the Qdrant databases will run on dedicated nodes. 
You can configure the necessary node selectors and tolerations in the “Kubernetes Configuration” section during cluster creation, or on the cluster detail page. +let client = Qdrant::from_url("http://localhost:6334").build()?; -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#authentication-to-your-qdrant-clusters) Authentication to your Qdrant Clusters +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest([(1, 0.22), (42, 0.8)].as_slice())) + .using("sparse") + .limit(20u64) + ) + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .using("dense") + .limit(20u64) + ) + .query(Query::new_fusion(Fusion::Rrf)) +).await?; +``` -In Hybrid Cloud the authentication information is provided by Kubernetes secrets. +```java +import static io.qdrant.client.QueryFactory.nearest; -You can configure authentication for your Qdrant clusters in the “Configuration” section of the Qdrant Cluster detail page. There you can configure the Kubernetes secret name and key to be used as an API key and/or read-only API key. +import java.util.List; -![Hybrid Cloud API Key configuration](https://qdrant.tech/documentation/cloud/hybrid_cloud_api_key.png) +import static io.qdrant.client.QueryFactory.fusion; -One way to create a secret is with kubectl: +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.Fusion; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; -```shell -kubectl create secret generic qdrant-api-key --from-literal=api-key=your-secret-api-key --namespace the-qdrant-namespace +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client.queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.22f, 0.8f), List.of(1, 42))) + .setUsing("sparse") + .setLimit(20) + .build()) + .addPrefetch(PrefetchQuery.newBuilder() + .setQuery(nearest(List.of(0.01f, 0.45f, 0.67f))) + .setUsing("dense") + .setLimit(20) + .build()) + .setQuery(fusion(Fusion.RRF)) + .build()) + .get(); ``` -The resulting secret will look like this: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```yaml -apiVersion: v1 -data: - api-key: ... -kind: Secret -metadata: - name: qdrant-api-key - namespace: the-qdrant-namespace -type: kubernetes.io/generic +var client = new QdrantClient("localhost", 6334); +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List < PrefetchQuery > { + new() { + Query = new(float, uint)[] { + (0.22f, 1), (0.8f, 42), + }, + Using = "sparse", + Limit = 20 + }, + new() { + Query = new float[] { + 0.01f, 0.45f, 0.67f + }, + Using = "dense", + Limit = 20 + } + }, + query: Fusion.Rrf +); ``` -With this command the secret name would be `qdrant-api-key` and the key would be `api-key`. - -If you want to retrieve the secret again, you can also use `kubectl`: +Query API can now pre-fetch vectors for requests, which means you can run queries sequentially within the same API call. There are a lot of options here, so you will need to define a strategy to merge these requests using new parameters. For example, you can now include **rescoring within Hybrid Search**, which can open the door to strategies like iterative refinement via matryoshka embeddings. 
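+As a rough sketch of that idea (assumed names, not an official example): a sparse/dense fusion runs inside a `prefetch`, and the fused candidates are then rescored by a larger dense vector in the top-level query. The named vectors `sparse`, `dense`, and `dense_full`, as well as the example values, are assumptions about the collection setup:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.query_points(
+    collection_name="{collection_name}",
+    prefetch=models.Prefetch(
+        # inner stage: sparse/dense candidates fused with RRF
+        prefetch=[
+            models.Prefetch(
+                query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]),
+                using="sparse",
+                limit=100,
+            ),
+            models.Prefetch(query=[0.01, 0.45, 0.67], using="dense", limit=100),
+        ],
+        query=models.FusionQuery(fusion=models.Fusion.RRF),
+        limit=50,
+    ),
+    # outer stage: rescore the fused candidates with a larger vector
+    query=[0.01, 0.45, 0.67],
+    using="dense_full",
+    limit=10,
+)
+```
+
+Running the cheap fused stage first and the expensive rescoring stage on a smaller candidate set is what keeps this kind of multi-stage pipeline affordable.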
-```shell -kubectl get secret qdrant-api-key -o jsonpath="{.data.api-key}" --namespace the-qdrant-namespace | base64 --decode +*To learn more about this, read the [Query API documentation](/documentation/concepts/search/#query-api).* -``` +## Inverse Document Frequency [IDF] -#### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#watch-the-video) Watch the Video +IDF is a critical component of the **TF-IDF (Term Frequency-Inverse Document Frequency)** weighting scheme used to evaluate the importance of a word in a document relative to a collection of documents (corpus). +There are various ways in which IDF might be calculated, but the most commonly used formula is: -In this tutorial, we walk you through the steps to expose your Qdrant database cluster running on Qdrant Hybrid Cloud to external applications or users outside your Kubernetes cluster. Learn how to configure TLS certificates for secure communication, set up authentication, and explore different methods like load balancers, ingress, and port configurations. +$$ +\text{IDF}(q_i) = \ln \left(\frac{N - n(q_i) + 0.5}{n(q_i) + 0.5}+1\right) +$$ -How to Securely Expose Qdrant on Hybrid Cloud to External Applications - YouTube +Where:
+`N` is the total number of documents in the collection.
+`n` is the number of documents containing non-zero values for the given vector. -[Photo image of Qdrant - Vector Database & Search Engine](https://www.youtube.com/channel/UC6ftm8PwH1RU_LM1jwG0LQA?embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +This variant is also used in BM25, whose support was heavily requested by our users. We decided to move the IDF calculation into the Qdrant engine itself. This type of separation allows streaming updates of the sparse embeddings while keeping the IDF calculation up-to-date. -Qdrant - Vector Database & Search Engine +The values of IDF previously had to be calculated using all the documents on the client side. However, now that Qdrant does it out of the box, you won't need to implement it anywhere else and recompute the value if some documents are removed or newly added. -8.12K subscribers +You can enable the IDF modifier in the collection configuration: -[How to Securely Expose Qdrant on Hybrid Cloud to External Applications](https://www.youtube.com/watch?v=ikofKaUc4x0) +```http +PUT /collections/{collection_name} +{ + "sparse_vectors": { + "text": { + "modifier": "idf" + } + } +} +``` -Qdrant - Vector Database & Search Engine +```python +from qdrant_client import QdrantClient, models +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + collection_name="{collection_name}", + sparse_vectors={ + "text": models.SparseVectorParams( + modifier=models.Modifier.IDF, + ), + }, +) +``` -Search +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -Watch later +const client = new QdrantClient({ host: "localhost", port: 6333 }); -Share +client.createCollection("{collection_name}", { + sparse_vectors: { + "text": { + modifier: "idf" + } + } +}); +``` -Copy link +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, sparse_vectors_config::SparseVectorsConfigBuilder, Modifier, SparseVectorParamsBuilder}; -Info +let client = Qdrant::from_url("http://localhost:6334").build()?; -Shopping +let mut config = SparseVectorsConfigBuilder::default(); +config.add_named_vector_params( + "text", + SparseVectorParamsBuilder::default().modifier(Modifier::Idf), +); -Tap to unmute +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}") + .sparse_vectors_config(config), + ) + .await?; +``` -If playback doesn't begin shortly, try restarting your device. +```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Modifier; +import io.qdrant.client.grpc.Collections.SparseVectorConfig; +import io.qdrant.client.grpc.Collections.SparseVectorParams; -More videos +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -## More videos +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setSparseVectorsConfig( + SparseVectorConfig.newBuilder() + .putMap("text", SparseVectorParams.newBuilder().setModifier(Modifier.Idf).build())) + .build()) + .get(); +``` -You're signed out +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -Videos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer. 
+var client = new QdrantClient("localhost", 6334); -CancelConfirm +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + sparseVectorsConfig: ("text", new SparseVectorParams { + Modifier = Modifier.Idf, + }) +); +``` -Share +### IDF as Part of BM42 -Include playlist +This quarter, Qdrant also introduced BM42, a novel algorithm that combines the IDF element of BM25 with transformer-based attention matrices to improve text retrieval. It utilizes attention matrices from your embedding model to determine the importance of each token in the document based on the attention value it receives. -An error occurred while retrieving sharing information. Please try again later. +We've prepared the standard `all-MiniLM-L6-v2` Sentence Transformer so [it outputs the attention values](https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions). Still, you can use virtually any model of your choice, as long as you have access to its parameters. This is just another reason to stick with open source technologies over proprietary systems. -[Watch on](https://www.youtube.com/watch?v=ikofKaUc4x0&embeds_referring_euri=https%3A%2F%2Fqdrant.tech%2F) +In practical terms, the BM42 method addresses the tokenization issues and computational costs associated with SPLADE. The model is both efficient and effective across different document types and lengths, offering enhanced search performance by leveraging the strengths of both BM25 and modern transformer techniques. -0:00 +> To learn more about IDF and BM42, read our [dedicated technical article](/articles/bm42/). -0:00 / 9:40 -‱Live +**You can expect BM42 to excel in scalable RAG-based scenarios where short texts are more common.** Document inference speed is much higher with BM42, which is critical for large-scale applications such as search engines, recommendation systems, and real-time decision-making systems. -‱ +## Multivector Support -[Watch on YouTube](https://www.youtube.com/watch?v=ikofKaUc4x0 "Watch on YouTube") +We are adding native support for multivector search that is compatible, e.g., with the late-interaction [ColBERT](https://github.com/stanford-futuredata/ColBERT) model. If you are working with high-dimensional similarity searches, **ColBERT is highly recommended as a reranking step in the Universal Query search.** You will experience better quality vector retrieval since ColBERT’s approach allows for deeper semantic understanding. -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#exposing-qdrant-clusters-to-your-client-applications) Exposing Qdrant clusters to your client applications +This model retains contextual information during query-document interaction, leading to better relevance scoring. In terms of efficiency and scalability benefits, documents and queries will be encoded separately, which gives an opportunity for pre-computation and storage of document embeddings for faster retrieval. -You can expose your Qdrant clusters to your client applications using Kubernetes services and ingresses. By default, a `ClusterIP` service is created for each Qdrant cluster. +**Note:** *This feature supports all the original quantization compression methods, just the same as the regular search method.* -Within your Kubernetes cluster, you can access the Qdrant cluster using the service name and port: +**Run a query with ColBERT vectors:** -``` -http://qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24.qdrant-namespace.svc:6333 +Query API can handle exceedingly complex requests. 
The following example prefetches 1000 entries most similar to the given query using the `mrl_byte` named vector, then reranks them to get the best 100 matches with `full` named vector and eventually reranks them again to extract the top 10 results with the named vector called `colbert`. A single API call can now implement complex reranking schemes. +```http +POST /collections/{collection_name}/points/query +{ + "prefetch": { + "prefetch": { + "query": [1, 23, 45, 67], // <------ small byte vector + "using": "mrl_byte", + "limit": 1000 + }, + "query": [0.01, 0.45, 0.67, ...], // <-- full dense vector + "using": "full", + "limit": 100 + }, + "query": [ // <─┐ + [0.1, 0.2, ...], // < │ + [0.2, 0.1, ...], // < ├─ multi-vector + [0.8, 0.9, ...] // < │ + ], // <─┘ + "using": "colbert", + "limit": 10 +} ``` -This endpoint is also visible on the cluster detail page. +```python +from qdrant_client import QdrantClient, models -If you want to access the database from your local developer machine, you can use `kubectl port-forward` to forward the service port to your local machine: +client = QdrantClient(url="http://localhost:6333") +client.query_points( + collection_name="{collection_name}", + prefetch=models.Prefetch( + prefetch=models.Prefetch(query=[1, 23, 45, 67], using="mrl_byte", limit=1000), + query=[0.01, 0.45, 0.67], + using="full", + limit=100, + ), + query=[ + [0.1, 0.2], + [0.2, 0.1], + [0.8, 0.9], + ], + using="colbert", + limit=10, +) ``` -kubectl --namespace your-qdrant-namespace port-forward service/qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 6333:6333 -``` +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -You can also expose the database outside the Kubernetes cluster with a `LoadBalancer` (if supported in your Kubernetes environment) or `NodePort` service or an ingress. +const client = new QdrantClient({ host: "localhost", port: 6333 }); -The service type and necessary annotations can be configured in the “Kubernetes Configuration” section during cluster creation, or on the cluster detail page. +client.query("{collection_name}", { + prefetch: { + prefetch: { + query: [1, 23, 45, 67], + using: 'mrl_byte', + limit: 1000 + }, + query: [0.01, 0.45, 0.67], + using: 'full', + limit: 100, + }, + query: [ + [0.1, 0.2], + [0.2, 0.1], + [0.8, 0.9], + ], + using: 'colbert', + limit: 10, +}); +``` -![Hybrid Cloud API Key configuration](https://qdrant.tech/documentation/cloud/hybrid_cloud_service.png) +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{PrefetchQueryBuilder, Query, QueryPointsBuilder}; -Especially if you create a LoadBalancer Service, you may need to provide annotations for the loadbalancer configration. Please refer to the documention of your cloud provider for more details. 
+let client = Qdrant::from_url("http://localhost:6334").build()?; -Examples: +client.query( + QueryPointsBuilder::new("{collection_name}") + .add_prefetch(PrefetchQueryBuilder::default() + .add_prefetch(PrefetchQueryBuilder::default() + .query(Query::new_nearest(vec![1.0, 23.0, 45.0, 67.0])) + .using("mrl_byte") + .limit(1000u64) + ) + .query(Query::new_nearest(vec![0.01, 0.45, 0.67])) + .using("full") + .limit(100u64) + ) + .query(Query::new_nearest(vec![ + vec![0.1, 0.2], + vec![0.2, 0.1], + vec![0.8, 0.9], + ])) + .using("colbert") + .limit(10u64) +).await?; +``` -- [AWS EKS LoadBalancer annotations](https://kubernetes-sigs.github.io/aws-load-balancer-controller/latest/guide/service/annotations/) -- [Azure AKS Public LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/load-balancer-standard) -- [Azure AKS Internal LoadBalancer annotations](https://learn.microsoft.com/en-us/azure/aks/internal-lb) -- [GCP GKE LoadBalancer annotations](https://cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer-parameters) +```java +import static io.qdrant.client.QueryFactory.nearest; -You could also create a Loadbalancer service manually like this: +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Points.PrefetchQuery; +import io.qdrant.client.grpc.Points.QueryPoints; -```yaml -apiVersion: v1 -kind: Service -metadata: - name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24-lb - namespace: qdrant-namespace -spec: - type: LoadBalancer - ports: - - name: http - port: 6333 - - name: grpc - port: 6334 - selector: - app: qdrant - cluster-id: 9a9f48c7-bb90-4fb2-816f-418a46a74b24 +QdrantClient client = + new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +client + .queryAsync( + QueryPoints.newBuilder() + .setCollectionName("{collection_name}") + .addPrefetch( + PrefetchQuery.newBuilder() + .addPrefetch( + PrefetchQuery.newBuilder() + .setQuery(nearest(1, 23, 45, 67)) // <------------- small byte vector + .setUsing("mrl_byte") + .setLimit(1000) + .build()) + .setQuery(nearest(0.01f, 0.45f, 0.67f)) // <-- dense vector + .setUsing("full") + .setLimit(100) + .build()) + .setQuery( + nearest( + new float[][] { + {0.1f, 0.2f}, // <─┐ + {0.2f, 0.1f}, // < ├─ multi-vector + {0.8f, 0.9f} // < ┘ + })) + .setUsing("colbert") + .setLimit(10) + .build()) + .get(); ``` -An ingress could look like this: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```yaml -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 - namespace: qdrant-namespace -spec: - rules: - - host: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24.your-domain.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: qdrant-9a9f48c7-bb90-4fb2-816f-418a46a74b24 - port: - number: 6333 +var client = new QdrantClient("localhost", 6334); +await client.QueryAsync( + collectionName: "{collection_name}", + prefetch: new List { + new() { + Prefetch = { + new List { + new() { + Query = new float[] { 1, 23, 45, 67 }, // <------------- small byte vector + Using = "mrl_byte", + Limit = 1000 + }, + } + }, + Query = new float[] {0.01f, 0.45f, 0.67f}, // <-- dense vector + Using = "full", + Limit = 100 + } + }, + query: new float[][] { + [0.1f, 0.2f], // <─┐ + [0.2f, 0.1f], // < ├─ multi-vector + [0.8f, 0.9f] // < ┘ + }, + usingVector: "colbert", + limit: 10 +); ``` -Please refer to the Kubernetes, ingress controller and cloud provider documentation for more 
details. - -If you expose the database like this, you will be able to see this also reflected as an endpoint on the cluster detail page. And will see the Qdrant database dashboard link pointing to it. +**Note:** *The multivector feature is not only useful for ColBERT; it can also be used in other ways.*
+For instance, in e-commerce, you can use multi-vector to store multiple images of the same item. This serves as an alternative to the [group-by](/documentation/concepts/search/#grouping-api) method. -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#configuring-tls) Configuring TLS +## Sparse Vectors Compression -If you want to configure TLS for accessing your Qdrant database in Hybrid Cloud, there are two options: +In version 1.9, we introduced the `uint8` [vector datatype](/documentation/concepts/vectors/#datatypes) for sparse vectors, in order to support pre-quantized embeddings from companies like JinaAI and Cohere. +This time, we are introducing a new datatype **for both sparse and dense vectors**, as well as a different way of **storing** these vectors. -- You can offload TLS at the ingress or loadbalancer level. -- You can configure TLS directly in the Qdrant database. +**Datatype:** Sparse and dense vectors were previously represented in larger `float32` values, but now they can be turned to the `float16`. `float16` vectors have a lower precision compared to `float32`, which means that there is less numerical accuracy in the vector values - but this is negligible for practical use cases. -If you want to offload TLS at the ingress or loadbancer level, please refer to their respective documents. +These vectors will use half the memory of regular vectors, which can significantly reduce the footprint of large vector datasets. Operations can be faster due to reduced memory bandwidth requirements and better cache utilization. This can lead to faster vector search operations, especially in memory-bound scenarios. -If you want to configure TLS directly in the Qdrant database, you can reference a secret containing the TLS certificate and key in the “Configuration” section of the Qdrant Cluster detail page. +When creating a collection, you need to specify the `datatype` upfront: -![Hybrid Cloud API Key configuration](https://qdrant.tech/documentation/cloud/hybrid_cloud_tls.png) +```http +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1024, + "distance": "Cosine", + "datatype": "float16" + } +} +``` -To create such a secret, you can use `kubectl`: +```python +from qdrant_client import QdrantClient, models -```shell - kubectl create secret tls qdrant-tls --cert=mydomain.com.crt --key=mydomain.com.key --namespace the-qdrant-namespace +client = QdrantClient(url="http://localhost:6333") +client.create_collection( + "{collection_name}", + vectors_config=models.VectorParams( + size=1024, distance=models.Distance.COSINE, datatype=models.Datatype.FLOAT16 + ), +) ``` -The resulting secret will look like this: +```typescript +import { QdrantClient } from "@qdrant/js-client-rest"; -```yaml -apiVersion: v1 -data: - tls.crt: ... - tls.key: ... -kind: Secret -metadata: - name: qdrant-tls - namespace: the-qdrant-namespace -type: kubernetes.io/tls +const client = new QdrantClient({ host: "localhost", port: 6333 }); +client.createCollection("{collection_name}", { + vectors: { + size: 1024, + distance: "Cosine", + datatype: "float16" + } +}); ``` -With this command the secret name to enter into the UI would be `qdrant-tls` and the keys would be `tls.crt` and `tls.key`. 
+```java +import io.qdrant.client.QdrantClient; +import io.qdrant.client.QdrantGrpcClient; +import io.qdrant.client.grpc.Collections.CreateCollection; +import io.qdrant.client.grpc.Collections.Datatype; +import io.qdrant.client.grpc.Collections.Distance; +import io.qdrant.client.grpc.Collections.VectorParams; +import io.qdrant.client.grpc.Collections.VectorsConfig; + +QdrantClient client = new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/\#configuring-cpu-and-memory-resource-reservations) Configuring CPU and memory resource reservations +client + .createCollectionAsync( + CreateCollection.newBuilder() + .setCollectionName("{collection_name}") + .setVectorsConfig(VectorsConfig.newBuilder() + .setParams(VectorParams.newBuilder() + .setSize(1024) + .setDistance(Distance.Cosine) + .setDatatype(Datatype.Float16) + .build()) + .build()) + .build()) + .get(); +``` -When creating a Qdrant database cluster, Qdrant Cloud schedules Pods with specific CPU and memory requests and limits to ensure optimal performance. It will use equal requests and limits for stability. Ideally, Kubernetes nodes should match the Pod size, with one database Pod per VM. +```rust +use qdrant_client::Qdrant; +use qdrant_client::qdrant::{CreateCollectionBuilder, Datatype, Distance, VectorParamsBuilder}; -By default, Qdrant Cloud will reserve 20% of available CPU and memory on each Pod. This is done to leave room for the operating system, Kubernetes, and system components. This conservative default may need adjustment depending on node size, whereby smaller nodes might require more, and larger nodes less resources reserved. +let client = Qdrant::from_url("http://localhost:6334").build()?; -You can modify this reservation in the “Configuration” section of the Qdrant Cluster detail page. +client + .create_collection( + CreateCollectionBuilder::new("{collection_name}").vectors_config( + VectorParamsBuilder::new(1024, Distance::Cosine).datatype(Datatype::Float16), + ), + ) + .await?; +``` -If you want to check how much resources are availabe on an empty Kubernetes node, you can use the following command: +```csharp +using Qdrant.Client; +using Qdrant.Client.Grpc; -```shell -kubectl describe node +var client = new QdrantClient("localhost", 6334); +await client.CreateCollectionAsync( + collectionName: "{collection_name}", + vectorsConfig: new VectorParams { + Size = 1024, + Distance = Distance.Cosine, + Datatype = Datatype.Float16 + } +); ``` -This will give you a breakdown of the available resources to Kubernetes and how much is already reserved and used for system Pods. +**Storage:** On the backend, we implemented bit packing to minimize the bits needed to store data, crucial for handling sparse vectors in applications like machine learning and data compression. For sparse vectors with mostly zeros, this focuses on storing only the indices and values of non-zero elements. -##### Was this page useful? +You will benefit from a more compact storage and higher processing efficiency. This can also lead to reduced dataset sizes for faster processing and lower storage costs in data compression. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## New Rust Client -Thank you for your feedback! 🙏 +Qdrant’s Rust client has been fully reshaped. It is now more accessible and +easier to use. 
We have focused on putting together a minimalistic API interface.
+All operations and their types now use the builder pattern, providing an easy
+and extensible interface, preventing breakage with future updates. See the Rust
+[ColBERT query](#multivector-support) as a great example. Additionally,
+Rust supports safe concurrent execution, which is crucial for handling multiple
+simultaneous requests efficiently.

-We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/hybrid-cloud-cluster-creation.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue.

+Documentation got a significant improvement as well. It is much better organized
+and provides usage examples across the board. Everything links back to our main
+documentation, making it easier to navigate and find the information you need.

-On this page:

+<

+ Visit our + client and + operations documentation +

-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/hybrid-cloud-cluster-creation.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## S3 Snapshot Storage -× +Qdrant **Collections**, **Shards** and **Storage** can be backed up with [Snapshots](/documentation/concepts/snapshots/) and saved in case of data loss or other data transfer purposes. These snapshots can be quite large and the resources required to maintain them can result in higher costs. AWS S3 and other S3-compatible implementations like [min.io](https://min.io/) is a great low-cost alternative that can hold snapshots without incurring high costs. It is globally reliable, scalable and resistant to data loss. -[Powered by](https://qdrant.tech/) +You can configure S3 storage settings in the [config.yaml](https://github.com/qdrant/qdrant/blob/master/config/config.yaml), specifically with `snapshots_storage`. -<|page-175-lllmstxt|> -## retrieval-quality -- [Documentation](https://qdrant.tech/documentation/) -- [Beginner tutorials](https://qdrant.tech/documentation/beginner-tutorials/) -- Measure Search Quality +For example, to use AWS S3: -# [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#measure-and-improve-retrieval-quality-in-semantic-search) Measure and Improve Retrieval Quality in Semantic Search +```yaml +storage: + snapshots_config: + # Use 's3' to store snapshots on S3 + snapshots_storage: s3 -| Time: 30 min | Level: Intermediate | | | -| --- | --- | --- | --- | + s3_config: + # Bucket name + bucket: your_bucket_here -Semantic search pipelines are as good as the embeddings they use. If your model cannot properly represent input data, similar objects might -be far away from each other in the vector space. No surprise, that the search results will be poor in this case. There is, however, another -component of the process which can also degrade the quality of the search results. It is the ANN algorithm itself. + # Bucket region (e.g. eu-central-1) + region: your_bucket_region_here -In this tutorial, we will show how to measure the quality of the semantic retrieval and how to tune the parameters of the HNSW, the ANN -algorithm used in Qdrant, to obtain the best results. + # Storage access key + # Can be specified either here or in the `AWS_ACCESS_KEY_ID` environment variable. + access_key: your_access_key_here -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#embeddings-quality) Embeddings quality + # Storage secret key + # Can be specified either here or in the `AWS_SECRET_ACCESS_KEY` environment variable. + secret_key: your_secret_key_here +``` -The quality of the embeddings is a topic for a separate tutorial. In a nutshell, it is usually measured and compared by benchmarks, such as -[Massive Text Embedding Benchmark (MTEB)](https://huggingface.co/spaces/mteb/leaderboard). The evaluation process itself is pretty -straightforward and is based on a ground truth dataset built by humans. We have a set of queries and a set of the documents we would expect -to receive for each of them. In the [evaluation process](https://qdrant.tech/rag/rag-evaluation-guide/), we take a query, find the most similar documents in the vector space and compare -them with the ground truth. In that setup, **finding the most similar documents is implemented as full kNN search, without any approximation**. 
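+Once `snapshots_storage` is set to `s3` as in the configuration above, the snapshot API itself does not change. A minimal Python sketch, assuming a local deployment and a placeholder collection name:
+
+```python
+from qdrant_client import QdrantClient
+
+client = QdrantClient(url="http://localhost:6333")
+
+# create a snapshot; with snapshots_storage set to s3 it lands in the bucket
+snapshot = client.create_snapshot(collection_name="{collection_name}")
+print(snapshot.name, snapshot.creation_time)
+
+# list the snapshots available for this collection
+for info in client.list_snapshots(collection_name="{collection_name}"):
+    print(info.name, info.size)
+```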
-As a result, we can measure the quality of the embeddings themselves, without the influence of the ANN algorithm. +*Read more about [S3 snapshot storage](/documentation/concepts/snapshots/#s3) and [configuration](/documentation/guides/configuration/).* -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#retrieval-quality) Retrieval quality +This integration allows for a more convenient distribution of snapshots. Users of **any S3-compatible object storage** can now benefit from other platform services, such as automated workflows and disaster recovery options. S3's encryption and access control ensure secure storage and regulatory compliance. Additionally, S3 supports performance optimization through various storage classes and efficient data transfer methods, enabling quick and effective snapshot retrieval and management. -Embeddings quality is indeed the most important factor in the semantic search quality. However, vector search engines, such as Qdrant, do not -perform pure kNN search. Instead, they use **Approximate Nearest Neighbors** (ANN) algorithms, which are much faster than the exact search, -but can return suboptimal results. We can also **measure the retrieval quality of that approximation** which also contributes to the overall -search quality. +## Issues API -### [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#quality-metrics) Quality metrics +Issues API notifies you about potential performance issues and misconfigurations. This powerful new feature allows users (such as database admins) to efficiently manage and track issues directly within the system, ensuring smoother operations and quicker resolutions. -There are various ways of how quantify the quality of semantic search. Some of them, such as [Precision@k](https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Precision_at_k), -are based on the number of relevant documents in the top-k search results. Others, such as [Mean Reciprocal Rank (MRR)](https://en.wikipedia.org/wiki/Mean_reciprocal_rank), -take into account the position of the first relevant document in the search results. [DCG and NDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) -metrics are, in turn, based on the relevance score of the documents. +You can find the Issues button in the top right. When you click the bell icon, a sidebar will open to show ongoing issues. -If we treat the search pipeline as a whole, we could use them all. The same is true for the embeddings quality evaluation. However, for the -ANN algorithm itself, anything based on the relevance score or ranking is not applicable. Ranking in vector search relies on the distance -between the query and the document in the vector space, however distance is not going to change due to approximation, as the function is -still the same. +![issues api](/blog/qdrant-1.10.x/issues.png) -Therefore, it only makes sense to measure the quality of the ANN algorithm by the number of relevant documents in the top-k search results, -such as `precision@k`. It is calculated as the number of relevant documents in the top-k search results divided by `k`. In case of testing -just the ANN algorithm, we can use the exact kNN search as a ground truth, with `k` being fixed. It will be a measure on **how well the ANN** -**algorithm approximates the exact search**. 
+## Minor Improvements -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#measure-the-quality-of-the-search-results) Measure the quality of the search results +- Pre-configure collection parameters; quantization, vector storage & replication factor - [#4299](https://github.com/qdrant/qdrant/pull/4299) -Let’s build a quality [evaluation](https://qdrant.tech/rag/rag-evaluation-guide/) of the ANN algorithm in Qdrant. We will, first, call the search endpoint in a standard way to obtain -the approximate search results. Then, we will call the exact search endpoint to obtain the exact matches, and finally compare both results -in terms of precision. +- Overwrite global optimizer configuration for collections. Lets you separate roles for indexing and searching within the single qdrant cluster - [#4317](https://github.com/qdrant/qdrant/pull/4317) -Before we start, let’s create a collection, fill it with some data and then start our evaluation. We will use the same dataset as in the -[Loading a dataset from Hugging Face hub](https://qdrant.tech/documentation/tutorials/huggingface-datasets/) tutorial, `Qdrant/arxiv-titles-instructorxl-embeddings` -from the [Hugging Face hub](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings). Let’s download it in a streaming -mode, as we are only going to use part of it. +- Delta encoding and bitpacking compression for sparse vectors reduces memory consumption for sparse vectors by up to 75% - [#4253](https://github.com/qdrant/qdrant/pull/4253), [#4350](https://github.com/qdrant/qdrant/pull/4350) -```python -from datasets import load_dataset +<|page-347-lllmstxt|> +Welcome to the very first edition of Community Highlights, where we celebrate the most impactful contributions and achievements of our vector search community! 🎉 -dataset = load_dataset( - "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True -) +## Content Highlights 🚀 -``` +Here are some standout projects and articles from our community this past month. If you're looking to learn more about vector search or build some great projects, we recommend you to check these guides: -We need some data to be indexed and another set for the testing purposes. Let’s get the first 50000 items for the training and the next 1000 -for the testing. 
+* **[Implementing Advanced Agentic Vector Search](https://towardsdev.com/implementing-advanced-agentic-vector-search-a-comprehensive-guide-to-crewai-and-qdrant-ca214ca4d039): A Comprehensive Guide to CrewAI and Qdrant by [Pavan Kumar](https://www.linkedin.com/in/kameshwara-pavan-kumar-mantha-91678b21/)** +* **Build Your Own RAG Using [Unstructured, Llama3 via Groq, Qdrant & LangChain](https://www.youtube.com/watch?v=m_3q3XnLlTI) by [Sudarshan Koirala](https://www.linkedin.com/in/sudarshan-koirala/)** +* **Qdrant filtering and [self-querying retriever](https://www.youtube.com/watch?v=iaXFggqqGD0) retrieval with LangChain by [Daniel Romero](https://www.linkedin.com/in/infoslack/)** +* **RAG Evaluation with [Arize Phoenix](https://superlinked.com/vectorhub/articles/retrieval-augmented-generation-eval-qdrant-arize) by [Atita Arora](https://www.linkedin.com/in/atitaarora/)** +* **Building a Serverless Application with [AWS Lambda and Qdrant](https://medium.com/@benitomartin/building-a-serverless-application-with-aws-lambda-and-qdrant-for-semantic-search-ddb7646d4c2f) for Semantic Search by [Benito Martin](https://www.linkedin.com/in/benitomzh/)** +* **Production ready Secure and [Powerful AI Implementations with Azure Services](https://towardsdev.com/production-ready-secure-and-powerful-ai-implementations-with-azure-services-671b68631212) by [Pavan Kumar](https://www.linkedin.com/in/kameshwara-pavan-kumar-mantha-91678b21/)** +* **Building [Agentic RAG with Rust, OpenAI & Qdrant](https://medium.com/@joshmo_dev/building-agentic-rag-with-rust-openai-qdrant-d3a0bb85a267) by [Joshua Mo](https://www.linkedin.com/in/joshua-mo-4146aa220/)** +* **Qdrant [Hybrid Search](https://medium.com/@nickprock/qdrant-hybrid-search-under-the-hood-using-haystack-355841225ac6) under the hood using Haystack by [Nicola Procopio](https://www.linkedin.com/in/nicolaprocopio/)** +* **[Llama 3 Powered Voice Assistant](https://medium.com/@datadrifters/llama-3-powered-voice-assistant-integrating-local-rag-with-qdrant-whisper-and-langchain-b4d075b00ac5): Integrating Local RAG with Qdrant, Whisper, and LangChain by [Datadrifters](https://medium.com/@datadrifters)** +* **[Distributed deployment](https://medium.com/@vardhanam.daga/distributed-deployment-of-qdrant-cluster-with-sharding-replicas-e7923d483ebc) of Qdrant cluster with sharding & replicas by [Vardhanam Daga](https://www.linkedin.com/in/vardhanam-daga/overlay/about-this-profile/)** +* **Private [Healthcare AI Assistant](https://medium.com/aimpact-all-things-ai/building-private-healthcare-ai-assistant-for-clinics-using-qdrant-hybrid-cloud-jwt-rbac-dspy-and-089a772e08ae) using Qdrant Hybrid Cloud, DSPy, and Groq by [Sachin Khandewal](https://www.linkedin.com/in/sachink1729/)** -```python -dataset_iterator = iter(dataset) -train_dataset = [next(dataset_iterator) for _ in range(60000)] -test_dataset = [next(dataset_iterator) for _ in range(1000)] -``` +## Creator of the Month 🌟 -Now, let’s create a collection and index the training data. This collection will be created with the default configuration. Please be aware that -it might be different from your collection settings, and it’s always important to test exactly the same configuration you are going to use later -in production. 
-```python -from qdrant_client import QdrantClient, models +Picture of Pavan Kumar with over 6 content contributions for the Creator of the Month -client = QdrantClient("http://localhost:6333") -client.create_collection( - collection_name="arxiv-titles-instructorxl-embeddings", - vectors_config=models.VectorParams( - size=768, # Size of the embeddings generated by InstructorXL model - distance=models.Distance.COSINE, - ), -) -``` +Congratulations to Pavan Kumar for being awarded **Creator of the Month!** Check out what were Pavan's most valuable contributions to the Qdrant vector search community this past month: -We are now ready to index the training data. Uploading the records is going to trigger the indexing process, which will build the HNSW graph. -The indexing process may take some time, depending on the size of the dataset, but your data is going to be available for search immediately -after receiving the response from the `upsert` endpoint. **As long as the indexing is not finished, and HNSW not built, Qdrant will perform** -**the exact search**. We have to wait until the indexing is finished to be sure that the approximate search is performed. -```python -client.upload_points( # upload_points is available as of qdrant-client v1.7.1 - collection_name="arxiv-titles-instructorxl-embeddings", - points=[\ - models.PointStruct(\ - id=item["id"],\ - vector=item["vector"],\ - payload=item,\ - )\ - for item in train_dataset\ - ] -) +* **[Implementing Advanced Agentic Vector Search](https://towardsdev.com/implementing-advanced-agentic-vector-search-a-comprehensive-guide-to-crewai-and-qdrant-ca214ca4d039): A Comprehensive Guide to CrewAI and Qdrant** +* **Production ready Secure and [Powerful AI Implementations with Azure Services](https://towardsdev.com/production-ready-secure-and-powerful-ai-implementations-with-azure-services-671b68631212)** +* **Building Neural Search Pipelines with Azure and Qdrant: A Step-by-Step Guide [Part-1](https://towardsdev.com/building-neural-search-pipelines-with-azure-and-qdrant-a-step-by-step-guide-part-1-40c191084258) and [Part-2](https://towardsdev.com/building-neural-search-pipelines-with-azure-and-qdrant-a-step-by-step-guide-part-2-fba287b49574)** +* **Building a RAG System with [Ollama, Qdrant and Raspberry Pi](https://blog.gopenai.com/harnessing-ai-at-the-edge-building-a-rag-system-with-ollama-qdrant-and-raspberry-pi-45ac3212cf75)** +* **Building a [Multi-Document ReAct Agent](https://blog.stackademic.com/building-a-multi-document-react-agent-for-financial-analysis-using-llamaindex-and-qdrant-72a535730ac3) for Financial Analysis using LlamaIndex and Qdrant** -while True: - collection_info = client.get_collection(collection_name="arxiv-titles-instructorxl-embeddings") - if collection_info.status == models.CollectionStatus.GREEN: - # Collection status is green, which means the indexing is finished - break +Pavan is a seasoned technology expert with 14 years of extensive experience, passionate about sharing his knowledge through technical blogging, engaging in technical meetups, and staying active with cycling! -``` +Thank you, Pavan, for your outstanding contributions and commitment to the community! -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#standard-mode-vs-exact-search) Standard mode vs exact search +## Most Active Members 🏆 -Qdrant has a built-in exact search mode, which can be used to measure the quality of the search results. 
In this mode, Qdrant performs a -full kNN search for each query, without any approximation. It is not suitable for production use with high load, but it is perfect for the -evaluation of the ANN algorithm and its parameters. It might be triggered by setting the `exact` parameter to `True` in the search request. -We are simply going to use all the examples from the test dataset as queries and compare the results of the approximate search with the -results of the exact search. Let’s create a helper function with `k` being a parameter, so we can calculate the `precision@k` for different -values of `k`. -```python -def avg_precision_at_k(k: int): - precisions = [] - for item in test_dataset: - ann_result = client.query_points( - collection_name="arxiv-titles-instructorxl-embeddings", - query=item["vector"], - limit=k, - ).points +Picture of the 3 most active members of our vector search community - knn_result = client.query_points( - collection_name="arxiv-titles-instructorxl-embeddings", - query=item["vector"], - limit=k, - search_params=models.SearchParams( - exact=True, # Turns on the exact search mode - ), - ).points - # We can calculate the precision@k by comparing the ids of the search results - ann_ids = set(item.id for item in ann_result) - knn_ids = set(item.id for item in knn_result) - precision = len(ann_ids.intersection(knn_ids)) / k - precisions.append(precision) +We're excited to recognize our most active community members, who have been a constant support to vector search builders, and sharing their knowledge and making our community more engaging: - return sum(precisions) / len(precisions) +* đŸ„‡ **1st Place: Robert Caulk** +* đŸ„ˆ **2nd Place: Nicola Procopio** +* đŸ„‰ **3rd Place: Joshua Mo** -``` +Thank you all for your dedication and for making the Qdrant vector search community such a dynamic and valuable place! -Calculating the `precision@5` is as simple as calling the function with the corresponding parameter: +Stay tuned for more highlights and updates in the next edition of Community Highlights! 🚀 -```python -print(f"avg(precision@5) = {avg_precision_at_k(k=5)}") +**Join us for Office Hours! đŸŽ™ïž** -``` +Don't miss our next [Office Hours hangout on Discord](https://discord.gg/s9YxGeQK?event=1252726857753821236), happening next week on June 27th. This is a great opportunity to introduce yourself to the community, learn more about vector search, and engage with the people behind this awesome content! -Response: +See you there 👋 -```text -avg(precision@5) = 0.9935999999999995 +<|page-348-lllmstxt|> +### Summary -``` +A security vulnerability has been discovered in Qdrant affecting all versions +prior to v1.9, described in [CVE-2024-3829](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-3829). +The vulnerability allows an attacker to upload arbitrary files to the +filesystem, which can be used to gain remote code execution. This is a different but similar vulnerability to CVE-2024-2221, announced in April 2024. -As we can see, the precision of the approximate search vs exact search is pretty high. There are, however, some scenarios when we -need higher precision and can accept higher latency. HNSW is pretty tunable, and we can increase the precision by changing its parameters. +The vulnerability does not materially affect Qdrant cloud deployments, as that +filesystem is read-only and authentication is enabled by default. 
At worst, +the vulnerability could be used by an authenticated user to crash a cluster, +which is already possible, such as by uploading more vectors than can fit in RAM. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#tweaking-the-hnsw-parameters) Tweaking the HNSW parameters +Qdrant has addressed the vulnerability in v1.9.0 and above with code that +restricts file uploads to a folder dedicated to that purpose. -HNSW is a hierarchical graph, where each node has a set of links to other nodes. The number of edges per node is called the `m` parameter. -The larger the value of it, the higher the precision of the search, but more space required. The `ef_construct` parameter is the number of -neighbours to consider during the index building. Again, the larger the value, the higher the precision, but the longer the indexing time. -The default values of these parameters are `m=16` and `ef_construct=100`. Let’s try to increase them to `m=32` and `ef_construct=200` and -see how it affects the precision. Of course, we need to wait until the indexing is finished before we can perform the search. +### Action -```python -client.update_collection( - collection_name="arxiv-titles-instructorxl-embeddings", - hnsw_config=models.HnswConfigDiff( - m=32, # Increase the number of edges per node from the default 16 to 32 - ef_construct=200, # Increase the number of neighbours from the default 100 to 200 - ) -) +Check the current version of your Qdrant deployment. Upgrade if your deployment +is not at least v1.9.0. -while True: - collection_info = client.get_collection(collection_name="arxiv-titles-instructorxl-embeddings") - if collection_info.status == models.CollectionStatus.GREEN: - # Collection status is green, which means the indexing is finished - break +To confirm the version of your Qdrant deployment in the cloud or on your local +or cloud system, run an API GET call, as described in the [Qdrant Quickstart +guide](https://qdrant.tech/documentation/cloud/quickstart-cloud/#step-2-test-cluster-access). +If your Qdrant deployment is local, you do not need an API key. -``` +Your next step depends on how you installed Qdrant. For details, read the +[Qdrant Installation](https://qdrant.tech/documentation/guides/installation/) +guide. -The same function can be used to calculate the average `precision@5`: +#### If you use the Qdrant container or binary -```python -print(f"avg(precision@5) = {avg_precision_at_k(k=5)}") +Upgrade your deployment. Run the commands in the applicable section of the +[Qdrant Installation](https://qdrant.tech/documentation/guides/installation/) +guide. The default commands automatically pull the latest version of Qdrant. -``` +#### If you use the Qdrant helm chart -Response: +If you’ve set up Qdrant on kubernetes using a helm chart, follow the README in +the [qdrant-helm](https://github.com/qdrant/qdrant-helm/tree/main?tab=readme-ov-file#upgrading) repository. +Make sure applicable configuration files point to version v1.9.0 or above. -```text -avg(precision@5) = 0.9969999999999998 +#### If you use the Qdrant cloud -``` +No action is required. This vulnerability does not materially affect you. However, we suggest that you upgrade your cloud deployment to the latest version. -The precision has obviously increased, and we know how to control it. However, there is a trade-off between the precision and the search -latency and memory requirements. 
In some specific cases, we may want to increase the precision as much as possible, so now we know how -to do it. +<|page-349-lllmstxt|> +At Qdrant, we are happy to announce the successful completion our the SOC 2 Type II Audit. This achievement underscores our unwavering commitment to upholding the highest standards of security, availability, and confidentiality for our services and our customers’ data. -## [Anchor](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/\#wrapping-up) Wrapping up -Assessing the quality of retrieval is a critical aspect of [evaluating](https://qdrant.tech/rag/rag-evaluation-guide/) semantic search performance. It is imperative to measure retrieval quality when aiming for optimal quality of. -your search results. Qdrant provides a built-in exact search mode, which can be used to measure the quality of the ANN algorithm itself, -even in an automated way, as part of your CI/CD pipeline. +## SOC 2 Type II: What Is It? -Again, **the quality of the embeddings is the most important factor**. HNSW does a pretty good job in terms of precision, and it is -parameterizable and tunable, when required. There are some other ANN algorithms available out there, such as [IVF\*](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes#cell-probe-methods-indexivf-indexes), -but they usually [perform worse than HNSW in terms of quality and performance](https://nirantk.com/writing/pgvector-vs-qdrant/#correctness). +SOC 2 Type II certification is an examination of an organization's controls in reference to the American Institute of Certified Public Accountants [(AICPA) Trust Services criteria](https://www.aicpa-cima.com/resources/download/2017-trust-services-criteria-with-revised-points-of-focus-2022). It evaluates not only our written policies but also their practical implementation, ensuring alignment between our stated objectives and operational practices. Unlike Type I, which is a snapshot in time, Type II verifies over several months that the company has lived up to those controls. The report represents thorough auditing of our security procedures throughout this examination period: January 1, 2024 to April 7, 2024. -##### Was this page useful? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Key Audit Findings -Thank you for your feedback! 🙏 +The audit ensured with no exceptions noted the effectiveness of our systems and controls on the following Trust Service Criteria: -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/retrieval-quality.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. -On this page: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/beginner-tutorials/retrieval-quality.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +* Security +* Confidentiality +* Availability -× +These certifications are available today and automatically apply to your existing workloads. The full SOC 2 Type II report is available to customers and stakeholders upon request through the [Trust Center](https://app.drata.com/trust/9cbbb75b-0c38-11ee-865f-029d78a187d9). 
-[Powered by](https://qdrant.tech/) -<|page-176-lllmstxt|> -## qa-with-cohere-and-qdrant -- [Articles](https://qdrant.tech/articles/) -- Question Answering as a Service with Cohere and Qdrant +## Future Compliance -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +Going forward, Qdrant will maintain SOC 2 Type II compliance by conducting continuous, annual audits to ensure our security practices remain aligned with industry standards and evolving risks. -# Question Answering as a Service with Cohere and Qdrant +Recognizing the critical importance of data security and the trust our clients place in us, achieving SOC 2 Type II compliance underscores our ongoing commitment to prioritize data protection with the utmost integrity and reliability. -Kacper Ɓukawski -· +## About Qdrant -November 29, 2022 +Qdrant is a vector database designed to handle large-scale, high-dimensional data efficiently. It allows for fast and accurate similarity searches in complex datasets. Qdrant strives to achieve seamless and scalable vector search capabilities for various applications. -![Question Answering as a Service with Cohere and Qdrant](https://qdrant.tech/articles_data/qa-with-cohere-and-qdrant/preview/title.jpg) +For more information about Qdrant and our security practices, please visit our [website](http://qdrant.tech) or [reach out to our team directly](https://qdrant.tech/contact-us/). -Bi-encoders are probably the most efficient way of setting up a semantic Question Answering system. -This architecture relies on the same neural model that creates vector embeddings for both questions and answers. -The assumption is, both question and answer should have representations close to each other in the latent space. -It should be like that because they should both describe the same semantic concept. That doesn’t apply -to answers like “Yes” or “No” though, but standard FAQ-like problems are a bit easier as there is typically -an overlap between both texts. Not necessarily in terms of wording, but in their semantics. +<|page-350-lllmstxt|> +We're excited to introduce **Qdrant Stars**, our new ambassador program created to recognize and support Qdrant users making a strong impact in the AI and vector search space. + +Whether through innovative content, real-world applications tutorials, educational events, or engaging discussions, they are constantly making vector search more accessible and interesting to explore. + +### 👋 Say hello to the first Qdrant Stars! + +Our inaugural Qdrant Stars are a diverse and talented lineup who have shown exceptional dedication to our community. You might recognize some of their names: + +
+
+
+
Robert Caulk
+ Robert LinkedIn +
+
+ Robert Caulk +
+

Robert is working with a team on AskNews to adaptively enrich, index, and report on over 1 million news articles per day. His team maintains Flowdapt, an open-source tool geared toward cluster orchestration, which moves data around highly parallelized production environments. This is why Robert and his team rely on Qdrant for low-latency, scalable, hybrid search across dense and sparse vectors in asynchronous environments.

+
+
+
+ I am interested in brainstorming innovative ways to interact with Qdrant vector databases and building presentations that show the power of coupling Flowdapt with Qdrant for large-scale production GenAI applications. I look forward to networking with Qdrant experts and users so that I can learn from their experience. +
+ +
+
Joshua Mo
+ Josh LinkedIn +
+
+ Josh +
+

Josh is a Rust developer and DevRel Engineer at Shuttle, assisting with user engagement and being a point of contact for first-line information within the community. He's often writing educational content that combines JavaScript with Rust and is a coach at Codebar, which is a charity that runs free programming workshops for minority groups within tech.

+
+
+
+ I am excited about getting access to Qdrant's new features and contributing to the AI community by demonstrating how those features can be leveraged for production environments. +
-![Bi-encoder structure. Both queries (questions) and documents (answers) are vectorized by the same neural encoder. Output embeddings are then compared by a chosen distance function, typically cosine similarity.](https://qdrant.tech/articles_data/qa-with-cohere-and-qdrant/biencoder-diagram.png) +
+
Nicholas Khami
+ Nick LinkedIn +
+
+ Nick +
+

Nick is a founder and product engineer at Trieve and has been using Qdrant since late 2022. He has a low-level understanding of the Qdrant API, especially the Rust client, and knows a lot about how to make the most of Qdrant on an application level.

+
+
+
+ I'm looking forward to helping folks use lesser-known features to enhance and make their projects better!
+
+
+
Owen Colegrove
+ Owen LinkedIn +
+
+ Owen Colegrove +
+

Owen Colegrove is the Co-Founder of SciPhi, making it easy to build, deploy, and scale RAG systems using Qdrant vector search technology. He has a Ph.D. in Physics and was previously a Quantitative Strategist at Citadel and a Researcher at CERN.

+
+
+
+ I'm excited about working together with Qdrant! +
+ +
+
Kameshwara Pavan Kumar Mantha
+ Pavan LinkedIn +
+
+ Kameshwara Pavan +
+

Kameshwara Pavan is an expert with 14 years of extensive experience in full stack development, cloud solutions, and AI. Specializing in Generative AI and LLMs,
+ Pavan has established himself as a leader in these cutting-edge domains. He holds a Master's in Data Science and a Master's in Computer Applications, and is currently pursuing his PhD.

+
+
+
+ Outside of my professional pursuits, I'm passionate about sharing my knowledge through technical blogging, engaging in technical meetups, and staying active with cycling. I admire the groundbreaking work Qdrant is doing in the industry, and I'm eager to collaborate and learn from the team that drives such exceptional advancements. +
-And yeah, you need to **bring your own embeddings**, in order to even start. There are various ways how -to obtain them, but using Cohere [co.embed API](https://docs.cohere.ai/reference/embed) is probably -the easiest and most convenient method. +
+
Niranjan Akella
+ Niranjan LinkedIn +
+
+ Niranjan Akella +
+

Niranjan is an AI/ML Engineer at Genesys who specializes in building and deploying AI models such as LLMs, Diffusion Models, and Vision Models at scale. He actively shares his projects through content creation and is passionate about applied research, developing custom real-time applications that serve a greater purpose.
+

+
+
+
+ I am a scientist by heart and an AI engineer by profession. I'm always armed to take a leap of faith into the impossible to become the impossible. I'm excited to explore and venture into Qdrant Stars with some support to build a broader community and develop a sense of completeness among like-minded people.
+
-## [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#why-coembed-api-and-qdrant-go-well-together) Why co.embed API and Qdrant go well together? +
+
Bojan Jakimovski
+ Bojan LinkedIn +
+
+ Bojan Jakimovski +
+

Bojan is an Advanced Machine Learning Engineer at Loka, currently pursuing a Master’s Degree focused on applying AI in Healthcare. He is specializing in Dedicated Computer Systems, with a passion for various technology fields.
+

+
+
+
+
+I'm really excited to show the power of Qdrant as a vector database, especially in fields like Healthcare and Medicine where accessing the right data in a very fast and efficient way is a must.
+
+
-Maintaining a **Large Language Model** might be hard and expensive. Scaling it up and down, when the traffic -changes, require even more effort and becomes unpredictable. That might be definitely a blocker for any semantic -search system. But if you want to start right away, you may consider using a SaaS model, Cohere’s -[co.embed API](https://docs.cohere.ai/reference/embed) in particular. It gives you state-of-the-art language -models available as a Highly Available HTTP service with no need to train or maintain your own service. As all -the communication is done with JSONs, you can simply provide the co.embed output as Qdrant input. +We are happy to welcome this group of people who are deeply committed to advancing vector search technology. We look forward to supporting their vision, and helping them make a bigger impact on the community. -```python -# Putting the co.embed API response directly as Qdrant method input -qdrant_client.upsert( - collection_name="collection", - points=rest.Batch( - ids=[...], - vectors=cohere_client.embed(...).embeddings, - payloads=[...], - ), -) +You can find and chat with them at our [Discord Community](https://discord.gg/qdrant/). -``` +### Why become a Qdrant Star? -Both tools are easy to combine, so you can start working with semantic search in a few minutes, not days. +There are many ways you can benefit from the Qdrant Star Program. Here are just a few: -And what if your needs are so specific that you need to fine-tune a general usage model? Co.embed API goes beyond -pre-trained encoders and allows providing some custom datasets to -[customize the embedding model with your own data](https://docs.cohere.com/docs/finetuning). -As a result, you get the quality of domain-specific models, but without worrying about infrastructure. +##### Exclusive rewards programs -## [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#system-architecture-overview) System architecture overview +Celebrate top contributors monthly with special rewards, including exclusive swag and monetary prizes. Quarterly awards for 'Most Innovative Content' and 'Best Tutorial' offer additional prizes. -In real systems, answers get vectorized and stored in an efficient vector search database. We typically don’t -even need to provide specific answers, but just use sentences or paragraphs of text and vectorize them instead. -Still, if a bit longer piece of text contains the answer to a particular question, its distance to the question -embedding should not be that far away. And for sure closer than all the other, non-matching answers. Storing the -answer embeddings in a vector database makes the search process way easier. +##### Early access to new features -![Building the database of possible answers. All the texts are converted into their vector embeddings and those embeddings are stored in a vector database, i.e. Qdrant.](https://qdrant.tech/articles_data/qa-with-cohere-and-qdrant/vector-database.png) +Be the first to explore and write about our latest features and beta products. Participate in product meetings where your ideas and suggestions can directly influence our roadmap. -## [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#looking-for-the-correct-answer) Looking for the correct answer +##### Conference support -Once our database is working and all the answer embeddings are already in place, we can start querying it. -We basically perform the same vectorization on a given question and ask the database to provide some near neighbours. 
-We rely on the embeddings to be close to each other, so we expect the points with the smallest distance in the latent -space to contain the proper answer. +We love seeing our stars on stage! If you're planning to attend and speak about Qdrant at conferences, we've got you covered. Receive presentation templates, mentorship, and educational materials to help deliver standout conference presentations, with travel expenses covered. -![While searching, a question gets vectorized by the same neural encoder. Vector database is a component that looks for the closest answer vectors using i.e. cosine similarity. A proper system, like Qdrant, will make the lookup process more efficient, as it won’t calculate the distance to all the answer embeddings. Thanks to HNSW, it will be able to find the nearest neighbours with sublinear complexity.](https://qdrant.tech/articles_data/qa-with-cohere-and-qdrant/search-with-vector-database.png) +##### Qdrant Certification -## [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#implementing-the-qa-search-system-with-saas-tools) Implementing the QA search system with SaaS tools +End the program as a certified Qdrant ambassador and vector search specialist, with provided training resources and a certification test to showcase your expertise. -We don’t want to maintain our own service for the neural encoder, nor even set up a Qdrant instance. There are SaaS -solutions for both — Cohere’s [co.embed API](https://docs.cohere.ai/reference/embed) -and [Qdrant Cloud](https://qdrant.to/cloud), so we’ll use them instead of on-premise tools. +### What do Qdrant Stars do? -### [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#question-answering-on-biomedical-data) Question Answering on biomedical data +As a Qdrant Star, you'll share your knowledge with the community through articles, blogs, tutorials, or demos that highlight the power and versatility of vector search technology - in your own creative way. You'll be a friendly face and a trusted expert in the community, sparking discussions on topics you love and keeping our community active and engaged. -We’re going to implement the Question Answering system for the biomedical data. There is a -_[pubmed\_qa](https://huggingface.co/datasets/pubmed_qa)_ dataset, with it _pqa\_labeled_ subset containing 1,000 examples -of questions and answers labelled by domain experts. Our system is going to be fed with the embeddings generated by -co.embed API and we’ll load them to Qdrant. Using Qdrant Cloud vs your own instance does not matter much here. -There is a subtle difference in how to connect to the cloud instance, but all the other operations are executed -in the same way. +Love organizing events? You'll have the chance to host meetups, workshops, and other educational gatherings, with all the promotional and logistical support you need to make them a hit. But if large conferences are your thing, we’ll provide the resources and cover your travel expenses so you can focus on delivering an outstanding presentation. -```python -from datasets import load_dataset +You'll also have a say in the Qdrant roadmap by giving feedback on new features and participating in product meetings. Qdrant Stars are constantly contributing to the growth and value of the vector search ecosystem. -# Loading the dataset from HuggingFace hub. It consists of several columns: pubid, -# question, context, long_answer and final_decision. For the purposes of our system, -# we’ll use question and long_answer. 
-dataset = load_dataset("pubmed_qa", "pqa_labeled") +### How to join the Qdrant Stars Program -``` +Are you interested in becoming a Qdrant Star? -| **pubid** | **question** | **context** | **long\_answer** | **final\_decision** | -| --- | --- | --- | --- | --- | -| 18802997 | Can calprotectin predict relapse risk in infla
 | 
 | Measuring calprotectin may help to identify UC
 | maybe | -| 20538207 | Should temperature be monitorized during kidne
 | 
 | The new storage can affords more stable temper
 | no | -| 25521278 | Is plate clearing a risk factor for obesity? | 
 | The tendency to clear one’s plate when eating 
 | yes | -| 17595200 | Is there an intrauterine influence on obesity? | 
 | Comparison of mother-offspring and father-offs.. | no | -| 15280782 | Is unsafe sexual behaviour increasing among HI
 | 
 | There was no evidence of a trend in unsafe sex
 | no | +We're on the lookout for individuals who are passionate about vector search technology and looking to make an impact in the AI community. -### [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#using-cohere-and-qdrant-to-build-the-answers-database) Using Cohere and Qdrant to build the answers database +If you have a strong understanding of vector search technologies, enjoy creating content, speaking at conferences, and actively engage with our community. If this sounds like you, don't hesitate to apply. We look forward to potentially welcoming you as our next Qdrant Star. [Apply here!](https://forms.gle/q4fkwudDsy16xAZk8) -In order to start generating the embeddings, you need to [create a Cohere account](https://dashboard.cohere.ai/welcome/register). -That will start your trial period, so you’ll be able to vectorize the texts for free. Once logged in, your default API key will -be available in [Settings](https://dashboard.cohere.ai/api-keys). We’ll need it to call the co.embed API. with the official python package. +Share your journey with vector search technologies and how you plan to contribute further. -```python -import cohere +#### Nominate a Qdrant Star -cohere_client = cohere.Client(COHERE_API_KEY) +Do you know someone who could be our next Qdrant Star? Please submit your nomination through our [nomination form](https://forms.gle/n4zv7JRkvnp28qv17), explaining why they're a great fit. Your recommendation could help us find the next standout ambassador. -# Generating the embeddings with Cohere client library -embeddings = cohere_client.embed( - texts=["A test sentence"], - model="large", -) -vector_size = len(embeddings.embeddings[0]) -print(vector_size) # output: 4096 +#### Learn More -``` +For detailed information about the program's benefits, activities, and perks, refer to the [Qdrant Stars Handbook](https://qdrant.github.io/qdrant-stars-handbook/). -Let’s connect to the Qdrant instance first and create a collection with the proper configuration, so we can put some embeddings into it later on. +To connect with current Stars, ask questions, and stay updated on the latest news and events at Qdrant, [join our Discord community](http://discord.gg/qdrant). -```python -# Connecting to Qdrant Cloud with qdrant-client requires providing the api_key. -# If you use an on-premise instance, it has to be skipped. -qdrant_client = QdrantClient( - host="xyz-example.eu-central.aws.cloud.qdrant.io", - prefer_grpc=True, - api_key=QDRANT_API_KEY, -) +<|page-351-lllmstxt|> +#### New generation silicon is a game-changer for AI/ML applications +![qdrant cpu intel benchmark report](/blog/qdrant-cpu-intel-benchmark/qdrant-cpu-intel-benchmark.png) -``` +> *Intel’s 5th gen Xeon processor is made for enterprise-scale operations in vector space.* -Now we’re able to vectorize all the answers. They are going to form our collection, so we can also put them already into Qdrant, along with the -payloads and identifiers. That will make our dataset easily searchable. +Vector search is surging in popularity with institutional customers, and Intel is ready to support the emerging industry. Their latest generation CPU performed exceptionally with Qdrant, a leading vector database used for enterprise AI applications. 
-```python -answer_response = cohere_client.embed( - texts=dataset["train"]["long_answer"], - model="large", -) -vectors = [\ - # Conversion to float is required for Qdrant\ - list(map(float, vector))\ - for vector in answer_response.embeddings\ -] -ids = [entry["pubid"] for entry in dataset["train"]] +Intel just released the latest Xeon processor (**codename: Emerald Rapids**) for data centers, a market which is expected to grow to $45 billion. Emerald Rapids offers higher-performance computing and significant energy efficiency over previous generations. Compared to the 4th generation Sapphire Rapids, Emerald boosts AI inference performance by up to 42% and makes vector search 38% faster. -# Filling up Qdrant collection with the embeddings generated by Cohere co.embed API -qdrant_client.upsert( - collection_name="pubmed_qa", - points=rest.Batch( - ids=ids, - vectors=vectors, - payloads=list(dataset["train"]), - ) -) +## The CPU of choice for vector database operations -``` +The latest generation CPU performed exceptionally in tests carried out by Qdrant’s R&D division. Intel’s CPU was stress-tested for query speed, database latency and vector upload time against massive-scale datasets. Results showed that machines with 32 cores were 1.38x faster at running queries than their previous generation counterparts. In this range, Qdrant’s latency also dropped 2.79x when compared to Sapphire. -And that’s it. Without even setting up a single server on our own, we created a system that might be easily asked a question. I don’t want to call -it serverless, as this term is already taken, but co.embed API with Qdrant Cloud makes everything way easier to maintain. +Qdrant strongly recommends the use of Intel’s next-gen chips in the 8-64 core range. In addition to being a practical number of cores for most machines in the cloud, this compute capacity will yield the best results with mass-market use cases. -### [Anchor](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/\#answering-the-questions-with-semantic-search--the-quality) Answering the questions with semantic search — the quality +The CPU affects vector search by influencing the speed and efficiency of mathematical computations. As of recently, companies have started using GPUs to carry large workloads in AI model training and inference. However, for vector search purposes, studies show that CPU architecture is a great fit because it can handle concurrent requests with great ease. -It’s high time to query our database with some questions. It might be interesting to somehow measure the quality of the system in general. -In those kinds of problems we typically use _top-k accuracy_. We assume the prediction of the system was correct if the correct answer -was present in the first _k_ results. +> *“Vector search is optimized for CPUs. Intel’s new CPU brings even more performance improvement and makes vector operations blazing fast for AI applications. Customers should consider deploying more CPUs instead of GPU compute power to achieve best performance results and reduce costs simultaneously.”* +> +> - AndrĂ© Zayarni, Qdrant CEO -```python -# Finding the position at which Qdrant provided the expected answer for each question. -# That allows to calculate accuracy@k for different values of k. 
-k_max = 10 -answer_positions = [] -for embedding, pubid in tqdm(zip(question_response.embeddings, ids)): - response = qdrant_client.search( - collection_name="pubmed_qa", - query_vector=embedding, - limit=k_max, - ) +## **Why does vector search matter?** - answer_ids = [record.id for record in response] - if pubid in answer_ids: - answer_positions.append(answer_ids.index(pubid)) - else: - answer_positions.append(-1) +![qdrant cpu intel benchmark report](/blog/qdrant-cpu-intel-benchmark/qdrant-cpu-intel-benchmark-future.png) -``` +Vector search engines empower AI to look deeper into stored data and retrieve strong relevant responses. -Saved answer positions allow us to calculate the metric for different _k_ values. +Qdrant’s vector database is key to modern information retrieval and machine learning systems. Those looking to run massive-scale Retrieval Augmented Generation (RAG) solutions need to leverage such semantic search engines in order to generate the best results with their AI products. -```python -# Prepared answer positions are being used to calculate different values of accuracy@k -for k in range(1, k_max + 1): - correct_answers = len( - list( - filter(lambda x: 0 <= x < k, answer_positions) - ) - ) - print(f"accuracy@{k} =", correct_answers / len(dataset["train"])) +Qdrant is purpose-built to enable developers to store and search for high-dimensional vectors efficiently. It easily integrates with a host of AI/ML tools: Large Language Models (LLM), frameworks such as LangChain, LlamaIndex or Haystack, and service providers like Cohere, OpenAI, and Ollama. -``` +## Supporting enterprise-scale AI/ML -Here are the values of the top-k accuracy for different values of k: +The market is preparing for a host of artificial intelligence and machine learning cases, pushing compute to the forefront of the innovation race. -| **metric** | **value** | -| --- | --- | -| accuracy@1 | 0.877 | -| accuracy@2 | 0.921 | -| accuracy@3 | 0.942 | -| accuracy@4 | 0.950 | -| accuracy@5 | 0.956 | -| accuracy@6 | 0.960 | -| accuracy@7 | 0.964 | -| accuracy@8 | 0.971 | -| accuracy@9 | 0.976 | -| accuracy@10 | 0.977 | - -It seems like our system worked pretty well even if we consider just the first result, with the lowest distance. -We failed with around 12% of questions. But numbers become better with the higher values of k. It might be also -valuable to check out what questions our system failed to answer, their perfect match and our guesses. +The main strength of a vector database like Qdrant is that it can consistently support the user way past the prototyping and launch phases. Qdrant’s product is already being used by large enterprises with billions of data points. Such users can go from testing to production almost instantly. Those looking to host large applications might only need up to 18GB RAM to support 1 million OpenAI Vectors. This makes Qdrant the best option for maximizing resource usage and data connection. -We managed to implement a working Question Answering system within just a few lines of code. If you are fine -with the results achieved, then you can start using it right away. Still, if you feel you need a slight improvement, -then fine-tuning the model is a way to go. If you want to check out the full source code, -it is available on [Google Colab](https://colab.research.google.com/drive/1YOYq5PbRhQ_cjhi6k4t1FnWgQm8jZ6hm?usp=sharing). +Intel’s latest development is crucial to the future of vector databases. Vector search operations are very CPU-intensive. 
Therefore, Qdrant relies on the innovations made by chip makers like Intel to offer large-scale support. -##### Was this page useful? +> *“Vector databases are a mainstay in today’s AI/ML toolchain, powering the latest generation of RAG and other Gen AI Applications. In teaming with Qdrant, Intel is helping enterprises deliver cutting-edge Gen-AI solutions and maximize their ROI by leveraging Qdrant’s high-performant and cost-efficient vector similarity search capabilities running on latest Intel Architecture based infrastructure across deployment models.”* +> +> - Arijit Bandyopadhyay, CTO - Enterprise Analytics & AI, Head of Strategy – Cloud and Enterprise, CSV Group, Intel Corporation -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Advancing vector search and the role of next-gen CPUs -Thank you for your feedback! 🙏 +Looking ahead, the vector database market is on the cusp of significant growth, particularly for the enterprise market. Developments in CPU technologies, such as those from Intel, are expected to enhance vector search operations by 1) improving processing speeds and 2) boosting retrieval efficiency and quality. This will allow enterprise users to easily manage large and more complex datasets and introduce AI on a global scale. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qa-with-cohere-and-qdrant.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +As large companies continue to integrate sophisticated AI and machine learning tools, the reliance on robust vector databases is going to increase. This evolution in the market underscores the importance of continuous hardware innovation in meeting the expanding demands of data-intensive applications, with Intel's contributions playing a notable role in shaping the future of enterprise-scale AI/ML solutions. -On this page: +## Next steps -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qa-with-cohere-and-qdrant.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Qdrant is open source and offers a complete SaaS solution, hosted on AWS, GCP, and Azure. -× +Getting started is easy, either spin up a [container image](https://hub.docker.com/r/qdrant/qdrant) or start a [free Cloud instance](https://cloud.qdrant.io/login). The documentation covers [adding the data](/documentation/tutorials/bulk-upload/) to your Qdrant instance as well as [creating your indices](/documentation/tutorials/optimize/). We would love to hear about what you are building and please connect with our engineering team on [Github](https://github.com/qdrant/qdrant), [Discord](https://discord.com/invite/tdtYvXjC4h), or [LinkedIn](https://www.linkedin.com/company/qdrant). -[Powered by](https://qdrant.tech/) +<|page-352-lllmstxt|> +We are excited to announce the interns selected for the inaugural Qdrant Summer of Code (QSoC) program! 
After receiving many impressive applications, we have chosen two talented individuals to work on the following projects: -<|page-177-lllmstxt|> -## geo-polygon-filter-gsoc -- [Articles](https://qdrant.tech/articles/) -- Google Summer of Code 2023 - Polygon Geo Filter for Qdrant Vector Database +**[Jishan Bhattacharya](https://www.linkedin.com/in/j16n/): WASM-based Dimension Reduction Visualization** -[Back to Qdrant Internals](https://qdrant.tech/articles/qdrant-internals/) +Jishan will be implementing a dimension reduction algorithm in Rust, compiling it to WebAssembly (WASM), and integrating it with the Qdrant Web UI. This project aims to provide a more efficient and smoother visualization experience, enabling the handling of more data points and higher dimensions efficiently. -# Google Summer of Code 2023 - Polygon Geo Filter for Qdrant Vector Database +**[Celine Hoang](https://www.linkedin.com/in/celine-h-hoang/): ONNX Cross Encoders in Python** -Zein Wen +Celine Hoang will focus on porting advanced ranking models—specifically Sentence Transformers, ColBERT, and BGE—to the ONNX (Open Neural Network Exchange) format. This project will enhance Qdrant's model support, making it more versatile and efficient in handling complex ranking tasks that are critical for applications such as recommendation engines and search functionalities. -· +We look forward to working with Jishan and Celine over the coming months and are excited to see their contributions to the Qdrant project. -October 12, 2023 +Stay tuned for more updates on the QSoC program and the progress of these projects! -![Google Summer of Code 2023 - Polygon Geo Filter for Qdrant Vector Database](https://qdrant.tech/articles_data/geo-polygon-filter-gsoc/preview/title.jpg) +<|page-353-lllmstxt|> +We all are. -## [Anchor](https://qdrant.tech/articles/geo-polygon-filter-gsoc/\#introduction) Introduction +> *“There is no use fighting it. Pick a vendor and go all in. Everything else is a mirage.”* +The last words of a seasoned IT professional +> -Greetings, I’m Zein Wen, and I was a Google Summer of Code 2023 participant at Qdrant. I got to work with an amazing mentor, Arnaud Gourlay, on enhancing the Qdrant Geo Polygon Filter. This new feature allows users to refine their query results using polygons. As the latest addition to the Geo Filter family of radius and rectangle filters, this enhancement promises greater flexibility in querying geo data, unlocking interesting new use cases. +As long as we are using any product, our solution’s infrastructure will depend on its vendors. Many say that building custom infrastructure will hurt velocity. **Is this true in the age of AI?** -## [Anchor](https://qdrant.tech/articles/geo-polygon-filter-gsoc/\#project-overview) Project Overview +It depends on where your company is at. Most startups don’t survive more than five years, so putting too much effort into infrastructure is not the best use of their resources. You first need to survive and demonstrate product viability. -![A Use Case of Geo Filter](https://qdrant.tech/articles_data/geo-polygon-filter-gsoc/geo-filter-example.png) +**Sometimes you may pick the right vendors and still fail.** -A Use Case of Geo Filter ( [https://traveltime.com/blog/map-postcode-data-catchment-area](https://traveltime.com/blog/map-postcode-data-catchment-area)) +![gpu-costs](/blog/are-you-vendor-locked/gpu-costs.png) -Because Qdrant is a powerful query vector database it presents immense potential for machine learning-driven applications, such as recommendation. 
However, the scope of vector queries alone may not always meet user requirements. Consider a scenario where you’re seeking restaurant recommendations; it’s not just about a list of restaurants, but those within your neighborhood. This is where the Geo Filter comes into play, enhancing query by incorporating additional filtering criteria. Up until now, Qdrant’s geographic filter options were confined to circular and rectangular shapes, which may not align with the diverse boundaries found in the real world. This scenario was exactly what led to a user feature request and we decided it would be a good feature to tackle since it introduces greater capability for geo-related queries. +We have all started to see the results of the AI hardware bottleneck. Running LLMs is expensive and smaller operations might fold to high costs. How will this affect large enterprises? -## [Anchor](https://qdrant.tech/articles/geo-polygon-filter-gsoc/\#technical-challenges) Technical Challenges +> If you are an established corporation, being dependent on a specific supplier can make or break a solid business case. For large-scale GenAI solutions, costs are essential to maintenance and dictate the long-term viability of such projects. In the short run, enterprises may afford high costs, but when the prices drop - then it’s time to adjust. +> -**1\. Geo Geometry Computation** +Unfortunately, the long run goal of scalability and flexibility may be countered by vendor lock-in. Shifting operations from one host to another requires expertise and compatibility adjustments. Should businesses become dependent on a single cloud service provider, they open themselves to risks ranging from soaring costs to stifled innovation. -![Geo Space Basic Concept](https://qdrant.tech/articles_data/geo-polygon-filter-gsoc/basic-concept.png) +**Finding the best vendor is key; but it’s crucial to stay mobile.** -Geo Space Basic Concept +## **Hardware is the New Vendor Lock** -Internally, the Geo Filter doesn’t start by testing each individual geo location as this would be computationally expensive. Instead, we create a geo hash layer that [divides the world](https://en.wikipedia.org/wiki/Grid_%28spatial_index%29#Grid-based_spatial_indexing) into rectangles. When a spatial index is created for Qdrant entries it assigns the entry to the geohash for its location. +> *“We’re so short on GPUs, the less people that use the tool [ChatGPT], the better.”* +OpenAI CEO, Sam Altman +> -During a query we first identify all potential geo hashes that satisfy the filters and subsequently check for location candidates within those hashes. Accomplishing this search involves two critical geometry computations: +When GPU hosting becomes too expensive, large and exciting Gen AI projects lose their luster. If moving clouds becomes too costly or difficulty to implement - you are vendor-locked. This used to be common with software. Now, hardware is the new dependency. -1. determining if a polygon intersects with a rectangle -2. ascertaining if a point lies within a polygon. +*Enterprises have many reasons to stay provider agnostic - but cost is the main one.* -![Geometry Computation Testing](https://qdrant.tech/articles_data/geo-polygon-filter-gsoc/geo-computation-testing.png) +[Appenzeller, Bornstein & Casado from Andreessen Horowitz](https://a16z.com/navigating-the-high-cost-of-ai-compute/) point to growing costs of AI compute. It is still a vendor’s market for A100 hourly GPUs, largely due to supply constraints. 
Furthermore, the price differences between AWS, GCP and Azure are dynamic enough to justify extensive cost-benefit analysis from prospective customers. -Geometry Computation Testing +![gpu-costs-a16z](/blog/are-you-vendor-locked/gpu-costs-a16z.png) -While we have a geo crate (a Rust library) that provides APIs for these computations, we dug in deeper to understand the underlying algorithms and verify their accuracy. This lead us to conduct extensive testing and visualization to determine correctness. In addition to assessing the current crate, we also discovered that there are multiple algorithms available for these computations. We invested time in exploring different approaches, such as [winding windows](https://en.wikipedia.org/wiki/Point_in_polygon#Winding%20number%20algorithm:~:text=of%20the%20algorithm.-,Winding%20number%20algorithm,-%5Bedit%5D) and [ray casting](https://en.wikipedia.org/wiki/Point_in_polygon#Winding%20number%20algorithm:~:text=.%5B2%5D-,Ray%20casting%20algorithm,-%5Bedit%5D), to grasp their distinctions, and pave the way for future improvements. +*Source: Andreessen Horowitz* -Through this process, I enjoyed honing my ability to swiftly grasp unfamiliar concepts. In addition, I needed to develop analytical strategies to dissect and draw meaningful conclusions from them. This experience has been invaluable in expanding my problem-solving toolkit. +Sure, your competitors can brag about all the features they can access - but are they willing to admit how much their company has lost to convenience and increasing costs? -**2\. Proto and JSON format design** +As an enterprise customer, one shouldn’t expect a vendor to stay consistent in this market. -Considerable effort was devoted to designing the ProtoBuf and JSON interfaces for this new feature. This component is directly exposed to users, requiring a consistent and user-friendly interface, which in turns help drive a a positive user experience and less code modifications in the future. +## How Does This Affect Qdrant? -Initially, we contemplated aligning our interface with the [GeoJSON](https://geojson.org/) specification, given its prominence as a standard for many geo-related APIs. However, we soon realized that the way GeoJSON defines geometries significantly differs from our current JSON and ProtoBuf coordinate definitions for our point radius and rectangular filter. As a result, we prioritized API-level consistency and user experience, opting to align the new polygon definition with all our existing definitions. +As an open source vector database, Qdrant is completely risk-free. Furthermore, cost savings is one of the many reasons companies use it to augment the LLM. You won’t need to burn through GPU cash for training or inference. A basic instance with a CPU and RAM can easily manage indexing and retrieval. -In addition, we planned to develop a separate multi-polygon filter in addition to the polygon. However, after careful consideration, we recognize that, for our use case, polygon filters can achieve the same result as a multi-polygon filter. This relationship mirrors how we currently handle multiple circles or rectangles. Consequently, we deemed the multi-polygon filter redundant and would introduce unnecessary complexity to the API. +> *However, we find that many of our customers want to host Qdrant in the same place as the rest of their infrastructure, such as the LLM or other data engineering infra. 
This can be for practical reasons, due to corporate security policies, or even global political reasons.* -Doing this work illustrated to me the challenge of navigating real-world solutions that require striking a balance between adhering to established standards and prioritizing user experience. It also was key to understanding the wisdom of focusing on developing what’s truly necessary for users, without overextending our efforts. +One day, they might find this infrastructure too costly. Although vector search will remain cheap, their training, inference and embedding costs will grow. Then, they will want to switch vendors. -## [Anchor](https://qdrant.tech/articles/geo-polygon-filter-gsoc/\#outcomes) Outcomes +What could interfere with the switch? Compatibility? Technologies? Lack of expertise? -**1\. Capability of Deep Dive** -Navigating unfamiliar code bases, concepts, APIs, and techniques is a common challenge for developers. Participating in GSoC was akin to me going from the safety of a swimming pool and right into the expanse of the ocean. Having my mentor’s support during this transition was invaluable. He provided me with numerous opportunities to independently delve into areas I had never explored before. I have grown into no longer fearing unknown technical areas, whether it’s unfamiliar code, techniques, or concepts in specific domains. I’ve gained confidence in my ability to learn them step by step and use them to create the things I envision. +In terms of features, cloud service standardization is difficult due to varying features between cloud providers. This leads to custom solutions and vendor lock-in, hindering migration and cost reduction efforts, [as seen with Snapchat and Twitter](https://www.businessinsider.com/snap-google-cloud-aws-reducing-costs-2023-2). -**2\. Always Put User in Minds** -Another crucial lesson I learned is the importance of considering the user’s experience and their specific use cases. While development may sometimes entail iterative processes, every aspect that directly impacts the user must be approached and executed with empathy. Neglecting this consideration can lead not only to functional errors but also erode the trust of users due to inconsistency and confusion, which then leads to them no longer using my work. +## **Fear, Uncertainty and Doubt** -**3\. Speak Up and Effectively Communicate** -Finally, In the course of development, encountering differing opinions is commonplace. It’s essential to remain open to others’ ideas, while also possessing the resolve to communicate one’s own perspective clearly. This fosters productive discussions and ultimately elevates the quality of the development process. +You spend months setting up the infrastructure, but your competitor goes all in with a cheaper alternative and has a competing product out in one month? Does avoiding the lock-in matter if your company will be out of business while you try to setup a fully agnostic platform? -### [Anchor](https://qdrant.tech/articles/geo-polygon-filter-gsoc/\#wrap-up) Wrap up +**Problem:** If you're not locked into a vendor, you're locked into managing a much larger team of engineers. The build vs buy tradeoff is real and it comes with its own set of risks and costs. -Being selected for Google Summer of Code 2023 and collaborating with Arnaud and the other Qdrant engineers, along with all the other community members, has been a true privilege. 
I’m deeply grateful to those who invested their time and effort in reviewing my code, engaging in discussions about alternatives and design choices, and offering assistance when needed. Through these interactions, I’ve experienced firsthand the essence of open source and the culture that encourages collaboration. This experience not only allowed me to write Rust code for a real-world product for the first time, but it also opened the door to the amazing world of open source. +**Acknowledgement:** Any organization that processes vast amounts of data with AI needs custom infrastructure and dedicated resources, no matter the industry. Having to work with expensive services such as A100 GPUs justifies the existence of in-house DevOps crew. Any enterprise that scales up needs to employ vigilant operatives if it wants to manage costs. -Without a doubt, I’m eager to continue growing alongside this community and contribute to new features and enhancements that elevate the product. I’ve also become an advocate for Qdrant, introducing this project to numerous coworkers and friends in the tech industry. I’m excited to witness new users and contributors emerge from within my own network! +> There is no need for **Fear, Uncertainty and Doubt**. Vendor lock is not a futile cause - so let’s dispel the sentiment that all vendors are adversaries. You just need to work with a company that is willing to accommodate flexible use of products. +> -If you want to try out my work, read the [documentation](https://qdrant.tech/documentation/concepts/filtering/#geo-polygon) and then, either sign up for a free [cloud account](https://cloud.qdrant.io/) or download the [Docker image](https://hub.docker.com/r/qdrant/qdrant). I look forward to seeing how people are using my work in their own applications! +**The Solution is Kubernetes:** Decoupling your infrastructure from a specific cloud host is currently the best way of staying risk-free. Any component of your solution that runs on Kubernetes can integrate seamlessly with other compatible infrastructure. -##### Was this page useful? +This is how you stay dynamic and move vendors whenever it suits you best. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## **What About Hybrid Cloud?** -Thank you for your feedback! 🙏 +The key to freedom is to building your applications and infrastructure to run on any cloud. By leveraging containerization and service abstraction using Kubernetes or Docker, software vendors can exercise good faith in helping their customers transition to other cloud providers. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/geo-polygon-filter-gsoc.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +We designed the architecture of Qdrant Hybrid Cloud to meet the evolving needs of businesses seeking unparalleled flexibility, control, and privacy. -On this page: +This technology integrates Kubernetes clusters from any setting - cloud, on-premises, or edge - into a unified, enterprise-grade managed service. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/geo-polygon-filter-gsoc.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +#### Take a look. It's completely yours. We’ll help you manage it. -× +

-[Powered by](https://qdrant.tech/) +[Qdrant Hybrid Cloud](/hybrid-cloud/) marks a significant advancement in vector databases, offering the most flexible way to implement vector search. -<|page-178-lllmstxt|> -## detecting-coffee-anomalies -- [Articles](https://qdrant.tech/articles/) -- Metric Learning for Anomaly Detection +You can test out Qdrant Hybrid Cloud today. Sign up or log into your [Qdrant Cloud account](https://cloud.qdrant.io/login) and get started in the **Hybrid Cloud** section. -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +Also, to learn more about Qdrant Hybrid Cloud read our [Official Release Blog](/blog/hybrid-cloud/) or our [Qdrant Hybrid Cloud website](/hybrid-cloud/). For additional technical insights, please read our [documentation](/documentation/hybrid-cloud/). -# Metric Learning for Anomaly Detection +#### Try it out! -Yusuf Sarıgöz +[![hybrid-cloud-cta.png](/blog/are-you-vendor-locked/hybrid-cloud-cta.png)](https://qdrant.to/cloud) -· +<|page-354-lllmstxt|> +![visua/image1.png](/blog/case-study-visua/image1.png) -May 04, 2022 +For over a decade, [VISUA](https://visua.com/) has been a leader in precise, high-volume computer vision data analysis, developing a robust platform that caters to a wide range of use cases, from startups to large enterprises. Starting with social media monitoring, where it excels in analyzing vast data volumes to detect company logos, VISUA has built a diverse ecosystem of customers, including names in social media monitoring, like **Brandwatch**, cybersecurity like **Mimecast**, trademark protection like **Ebay** and several sports agencies like **Vision Insights** for sponsorship evaluation. -![Metric Learning for Anomaly Detection](https://qdrant.tech/articles_data/detecting-coffee-anomalies/preview/title.jpg) +![visua/image3.png](/blog/case-study-visua/image3.png) -Anomaly detection is a thirsting yet challenging task that has numerous use cases across various industries. -The complexity results mainly from the fact that the task is data-scarce by definition. +## The Challenge -Similarly, anomalies are, again by definition, subject to frequent change, and they may take unexpected forms. -For that reason, supervised classification-based approaches are: +**Quality Control at Scale** -- Data-hungry - requiring quite a number of labeled data; -- Expensive - data labeling is an expensive task itself; -- Time-consuming - you would try to obtain what is necessarily scarce; -- Hard to maintain - you would need to re-train the model repeatedly in response to changes in the data distribution. +The accuracy of object detection within images is critical for VISUA ensuring that their algorithms are detecting objects in images correctly. With growing volumes of data processed for clients, the company was looking for a way to enhance its quality control and anomaly detection mechanisms to be more scalable and auditable. -These are not desirable features if you want to put your model into production in a rapidly-changing environment. -And, despite all the mentioned difficulties, they do not necessarily offer superior performance compared to the alternatives. -In this post, we will detail the lessons learned from such a use case. +The challenge was twofold. First, VISUA needed a method to rapidly and accurately identify images and the objects within them that were similar, to identify false negatives, or unclear outcomes and use them as inputs for reinforcement learning. 
-## [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#coffee-beans) Coffee Beans +Second, the rapid growth in data volume challenged their previous quality control processes, which relied on a sampling method based on meta-information (like analyzing lower-confidence, smaller, or blurry images), which involved more manual reviews and was not as scalable as needed. In response, the team at VISUA explored vector databases as a solution. -[Agrivero.ai](https://agrivero.ai/) \- is a company making AI-enabled solution for quality control & traceability of green coffee for producers, traders, and roasters. -They have collected and labeled more than **30 thousand** images of coffee beans with various defects - wet, broken, chipped, or bug-infested samples. -This data is used to train a classifier that evaluates crop quality and highlights possible problems. +## The Solution -![Anomalies in coffee](https://qdrant.tech/articles_data/detecting-coffee-anomalies/detection.gif) +**Accelerating Anomaly Detection and Elevating Quality Control with Vector Search** -Anomalies in coffee +In addressing the challenge of scaling and enhancing its quality control processes, VISUA turned to vector databases, with Qdrant emerging as the solution of choice. This technological shift allowed VISUA to leverage vector databases for identifying similarities and deduplicating vast volumes of images, videos, and frames. By doing so, VISUA was able to automatically classify objects with a level of precision that was previously unattainable. -We should note that anomalies are very diverse, so the enumeration of all possible anomalies is a challenging task on it’s own. -In the course of work, new types of defects appear, and shooting conditions change. Thus, a one-time labeled dataset becomes insufficient. +The introduction of vectors allowed VISUA to represent data uniquely and mark frames for closer examination by prioritizing the review of anomalies and data points with the highest variance. Consequently, this technology empowered Visia to scale its quality assurance and reinforcement learning processes tenfold. -Let’s find out how metric learning might help to address this challenge. +> *“Using Qdrant as a vector database for our quality control allowed us to review 10x more data by exploiting repetitions and deduplicating samples and doing that at scale with having a query engine.”* Alessandro Prest, Co-Founder at VISUA. -## [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#metric-learning-approach) Metric Learning Approach +![visua/image2.jpg](/blog/case-study-visua/image2.jpg) -In this approach, we aimed to encode images in an n-dimensional vector space and then use learned similarities to label images during the inference. +## The Selection Process -The simplest way to do this is KNN classification. -The algorithm retrieves K-nearest neighbors to a given query vector and assigns a label based on the majority vote. +**Finding the Right Vector Database For Quality Analysis and Anomaly Detection** -In production environment kNN classifier could be easily replaced with [Qdrant](https://github.com/qdrant/qdrant) vector search engine. +Choosing the right vector database was a pivotal decision for VISUA, and the team conducted extensive benchmarks. They tested various solutions, including Weaviate, Pinecone, and Qdrant, focusing on the efficient handling of both vector and payload indexes. 
The objective was to identify a system that excels in managing hybrid queries that blend vector similarities with record attributes, crucial for enhancing their quality control and anomaly detection capabilities. -![Production deployment](https://qdrant.tech/articles_data/detecting-coffee-anomalies/anomalies_detection.png) +Qdrant distinguished itself through its: -Production deployment +- **Hybrid Query Capability:** Qdrant enables the execution of hybrid queries that combine payload fields and vector data, allowing for comprehensive and nuanced searches. This functionality leverages the strengths of both payload attributes and vector similarities for detailed data analysis. Prest noted the importance of Qdrant's hybrid approach, saying, “When talking with the founders of Qdrant, we realized that they put a lot of effort into this hybrid approach, which really resonated with us.” -This approach has the following advantages: +- **Performance Superiority**: Qdrant distinguished itself as the fastest engine for VISUA's specific needs, significantly outpacing alternatives with query speeds up to 40 times faster for certain VISUA use cases. Alessandro Prest highlighted, "Qdrant was the fastest engine by a large margin for our use case," underscoring its significant efficiency and scalability advantages. -- We can benefit from unlabeled data, considering labeling is time-consuming and expensive. -- The relevant metric, e.g., precision or recall, can be tuned according to changing requirements during the inference without re-training. -- Queries labeled with a high score can be added to the KNN classifier on the fly as new data points. +- **API Documentation**: The clarity, comprehensiveness, and user-friendliness of Qdrant’s API documentation and reference guides further solidified VISUA’s decision. -To apply metric learning, we need to have a neural encoder, a model capable of transforming an image into a vector. +This strategic selection enabled VISUA to achieve a notable increase in operational efficiency and scalability in its quality control processes. -Training such an encoder from scratch may require a significant amount of data we might not have. Therefore, we will divide the training into two steps: +## Implementing Qdrant -- The first step is to train the autoencoder, with which we will prepare a model capable of representing the target domain. +Upon selecting Qdrant as their vector database solution, VISUA undertook a methodical approach to integration. The process began in a controlled development environment, allowing VISUA to simulate real-world use cases and ensure that Qdrant met their operational requirements. This careful, phased approach ensured a smooth transition when moving Qdrant into their production environment, hosted on AWS clusters. VISUA is leveraging several specific Qdrant features in their production setup: -- The second step is finetuning. Its purpose is to train the model to distinguish the required types of anomalies. +1. **Support for Multiple Vectors per Record/Point**: This feature allows for a nuanced and multifaceted analysis of data, enabling VISUA to manage and query complex datasets more effectively. +2. **Quantization**: Quantization optimizes storage and accelerates query processing, improving data handling efficiency and lowering memory use, essential for large-scale operations. 
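+As a rough illustration of these two features, the sketch below creates a collection that stores several named vectors per point with scalar quantization enabled, then runs a hybrid query that combines vector similarity with a payload filter using the Python client. The collection name, vector names, dimensions, and the `confidence` payload field are hypothetical placeholders, not details from VISUA's actual setup.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Each point carries several named vectors (e.g. the full frame and a detected logo crop);
+# scalar quantization reduces memory usage for large collections.
+client.create_collection(
+    collection_name="frames",
+    vectors_config={
+        "frame": models.VectorParams(size=512, distance=models.Distance.COSINE),
+        "logo": models.VectorParams(size=256, distance=models.Distance.COSINE),
+    },
+    quantization_config=models.ScalarQuantization(
+        scalar=models.ScalarQuantizationConfig(
+            type=models.ScalarType.INT8,
+            always_ram=True,
+        )
+    ),
+)
+
+# A hybrid query: vector similarity on the "frame" vector, restricted by a payload condition,
+# e.g. to prioritize low-confidence detections for review.
+hits = client.search(
+    collection_name="frames",
+    query_vector=models.NamedVector(name="frame", vector=[0.1] * 512),
+    query_filter=models.Filter(
+        must=[models.FieldCondition(key="confidence", range=models.Range(lt=0.5))]
+    ),
+    limit=10,
+)
+```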
-![Model training architecture](https://qdrant.tech/articles_data/detecting-coffee-anomalies/anomaly_detection_training.png) +## The Results -Model training architecture +Integrating Qdrant into VISUA's quality control operations has delivered measurable outcomes when it comes to efficiency and scalability: -### [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#step-1---autoencoder-for-unlabeled-data) Step 1 - Autoencoder for Unlabeled Data +- **40x Faster Query Processing**: Qdrant has drastically reduced the time needed for complex queries, enhancing workflow efficiency. -First, we pretrained a Resnet18-like model in a vanilla autoencoder architecture by leaving the labels aside. -Autoencoder is a model architecture composed of an encoder and a decoder, with the latter trying to recreate the original input from the low-dimensional bottleneck output of the former. +- **10x Scalability Boost:** The efficiency of Qdrant enables VISUA to handle ten times more data in its quality assurance and learning processes, supporting growth without sacrificing quality. -There is no intuitive evaluation metric to indicate the performance in this setup, but we can evaluate the success by examining the recreated samples visually. +- **Increased Data Review Capacity:** The increased capacity to review the data allowed VISUA to enhance the accuracy of its algorithms through reinforcement learning. -![Example of image reconstruction with Autoencoder](https://qdrant.tech/articles_data/detecting-coffee-anomalies/image_reconstruction.png) +#### Expanding Qdrant’s Use Beyond Anomaly Detection -Example of image reconstruction with Autoencoder +While the primary application of Qdrant is focused on quality control, VISUA's team is actively exploring additional use cases with Qdrant. VISUA's use of Qdrant has inspired new opportunities, notably in content moderation. "The moment we started to experiment with Qdrant, opened up a lot of ideas within the team for new applications,” said Prest on the potential unlocked by Qdrant. For example, this has led them to actively explore the Qdrant [Discovery API](/documentation/concepts/explore/?q=discovery#discovery-api), with an eye on enhancing content moderation processes. -Then we encoded a subset of the data into 128-dimensional vectors by using the encoder, -and created a KNN classifier on top of these embeddings and associated labels. +Beyond content moderation, VISUA is set for significant growth by broadening its copyright infringement detection services. As the demand for detecting a wider range of infringements, like unauthorized use of popular characters on merchandise, increases, VISUA plans to expand its technology capabilities. Qdrant will be pivotal in this expansion, enabling VISUA to meet the complex and growing challenges of moderating copyrighted content effectively and ensuring comprehensive protection for brands and creators. -Although the results are promising, we can do even better by finetuning with metric learning. +<|page-355-lllmstxt|> +[Qdrant 1.9.0 is out!](https://github.com/qdrant/qdrant/releases/tag/v1.9.0) This version complements the release of our new managed product [Qdrant Hybrid Cloud](/hybrid-cloud/) with key security features valuable to our enterprise customers, and all those looking to productionize large-scale Generative AI. 
**Data privacy, system stability and resource optimizations** are always on our mind - so let's see what's new: -### [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#step-2---finetuning-with-metric-learning) Step 2 - Finetuning with Metric Learning +- **Granular access control:** You can further specify access control levels by using JSON Web Tokens. +- **Optimized shard transfers:** The synchronization of shards between nodes is now significantly faster! +- **Support for byte embeddings:** Reduce the memory footprint of Qdrant with official `uint8` support. -We started by selecting 200 labeled samples randomly without replacement. +## New access control options via JSON Web Tokens -In this step, The model was composed of the encoder part of the autoencoder with a randomly initialized projection layer stacked on top of it. -We applied transfer learning from the frozen encoder and trained only the projection layer with Triplet Loss and an online batch-all triplet mining strategy. +Historically, our API key supported basic read and write operations. However, recognizing the evolving needs of our user base, especially large organizations, we've implemented additional options for finer control over data access within internal environments. -Unfortunately, the model overfitted quickly in this attempt. -In the next experiment, we used an online batch-hard strategy with a trick to prevent vector space from collapsing. -We will describe our approach in the further articles. +Qdrant now supports [granular access control using JSON Web Tokens (JWT)](/documentation/guides/security/#granular-access-control-with-jwt). JWT will let you easily limit a user's access to the specific data they are permitted to view. Specifically, JWT-based authentication leverages tokens with restricted access to designated data segments, laying the foundation for implementing role-based access control (RBAC) on top of it. **You will be able to define permissions for users and restrict access to sensitive endpoints.** -This time it converged smoothly, and our evaluation metrics also improved considerably to match the supervised classification approach. +**Dashboard users:** For your convenience, we have added a JWT generation tool to the Qdrant Web UI under the 🔑 tab. If you're using the default URL, you will find it at `http://localhost:6333/dashboard#/jwt`. -![Metrics for the autoencoder model with KNN classifier](https://qdrant.tech/articles_data/detecting-coffee-anomalies/ae_report_knn.png) +![jwt-web-ui](/blog/qdrant-1.9.x/jwt-web-ui.png) -Metrics for the autoencoder model with KNN classifier +We highly recommend this feature to enterprises using [Qdrant Hybrid Cloud](/hybrid-cloud/), as it is tailored to those who need additional control over company data and user access. RBAC empowers administrators to define roles and assign specific privileges to users based on their roles within the organization. In combination with [Hybrid Cloud's data sovereign architecture](/documentation/hybrid-cloud/), this feature reinforces internal security and efficient collaboration by granting access only to relevant resources. -![Metrics for the finetuned model with KNN classifier](https://qdrant.tech/articles_data/detecting-coffee-anomalies/ft_report_knn.png) +> **Documentation:** [Read the access level breakdown](/documentation/guides/security/#table-of-access) to see which actions are allowed or denied.
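+Outside the Web UI, tokens can also be minted programmatically. The sketch below uses PyJWT and assumes the claim layout described in the security guide: an `access` claim listing per-collection permissions, signed with the instance API key via HS256. The collection name and expiry are placeholders, and the exact claim schema should be verified against the linked documentation for your Qdrant version.
+
+```python
+import time
+import jwt  # PyJWT
+
+API_KEY = "your-qdrant-api-key"  # the API key doubles as the HMAC signing secret
+
+claims = {
+    "exp": int(time.time()) + 3600,  # token expires in one hour
+    "access": [
+        # read-only access, limited to a single (placeholder) collection
+        {"collection": "support_tickets", "access": "r"},
+    ],
+}
+
+token = jwt.encode(claims, API_KEY, algorithm="HS256")
+print(token)  # pass this token instead of the API key when connecting
+```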
-Metrics for the finetuned model with KNN classifier +## Faster shard transfers on node recovery -We repeated this experiment with 500 and 2000 samples, but it showed only a slight improvement. -Thus we decided to stick to 200 samples - see below for why. +We now offer a streamlined approach to [data synchronization between shards](/documentation/guides/distributed_deployment/#shard-transfer-method) during node upgrades or recovery processes. Traditional methods used to transfer the entire dataset, but our new `wal_delta` method focuses solely on transmitting the difference between two existing shards. By leveraging the Write-Ahead Log (WAL) of both shards, this method selectively transmits missed operations to the target shard, ensuring data consistency. -## [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#supervised-classification-approach) Supervised Classification Approach +In some cases, where transfers can take hours, this update **reduces transfers down to a few minutes.** -We also wanted to compare our results with the metrics of a traditional supervised classification model. -For this purpose, a Resnet50 model was finetuned with ~30k labeled images, made available for training. -Surprisingly, the F1 score was around ~0.86. +The advantages of this approach are twofold: +1. **It is faster** since only the differential data is transmitted, avoiding the transfer of redundant information. +2. It upholds robust **ordering guarantees**, crucial for applications reliant on strict sequencing. -Please note that we used only 200 labeled samples in the metric learning approach instead of ~30k in the supervised classification approach. -These numbers indicate a huge saving with no considerable compromise in the performance. +For more details on how this works, check out the [shard transfer documentation](/documentation/guides/distributed_deployment/#shard-transfer-method). -## [Anchor](https://qdrant.tech/articles/detecting-coffee-anomalies/\#conclusion) Conclusion +> **Note:** There are limitations to consider. First, this method only works with existing shards. Second, while the WALs typically retain recent operations, their capacity is finite, potentially impeding the transfer process if exceeded. Nevertheless, for scenarios like rapid node restarts or upgrades, where the WAL content remains manageable, WAL delta transfer is an efficient solution. -We obtained results comparable to those of the supervised classification method by using **only 0.66%** of the labeled data with metric learning. -This approach is time-saving and resource-efficient, and that may be improved further. Possible next steps might be: +Overall, this is a great optional optimization measure and serves as the **auto-recovery default for shard transfers**. It's safe to use everywhere because it'll automatically fall back to streaming records transfer if no difference can be resolved. By minimizing data redundancy and expediting transfer processes, it alleviates the strain on the cluster during recovery phases, enabling faster node catch-up. -- Collect more unlabeled data and pretrain a larger autoencoder. -- Obtain high-quality labels for a small number of images instead of tens of thousands for finetuning. -- Use hyperparameter optimization and possibly gradual unfreezing in the finetuning step. -- Use [vector search engine](https://github.com/qdrant/qdrant) to serve Metric Learning in production. 
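+For completeness, here is a rough sketch of manually requesting a shard replication that uses the new `wal_delta` method through the collection cluster endpoint, sent with Python's `requests`. The collection name, shard ID, and peer IDs are placeholders (real peer IDs can be read from the collection cluster info), and the exact operation shape should be checked against the shard transfer documentation linked above.
+
+```python
+import requests
+
+QDRANT_URL = "http://localhost:6333"
+COLLECTION = "my_collection"  # placeholder
+
+# Placeholder peer IDs; list the real ones via GET /collections/{collection}/cluster.
+operation = {
+    "replicate_shard": {
+        "shard_id": 0,
+        "from_peer_id": 381894127,
+        "to_peer_id": 467122995,
+        "method": "wal_delta",  # falls back to streaming records if no delta can be resolved
+    }
+}
+
+response = requests.post(f"{QDRANT_URL}/collections/{COLLECTION}/cluster", json=operation)
+response.raise_for_status()
+print(response.json())
+```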
+## Native support for uint8 embeddings -We are actively looking into these, and we will continue to publish our findings in this challenge and other use cases of metric learning. +Our latest version introduces [support for uint8 embeddings within Qdrant collections](/documentation/concepts/collections/#vector-datatypes). This feature supports embeddings provided by companies in a pre-quantized format. Unlike previous iterations where indirect support was available via [quantization methods](/documentation/guides/quantization/), this update empowers users with direct integration capabilities. -##### Was this page useful? +In the case of `uint8`, elements within the vector are represented as unsigned 8-bit integers, encompassing values ranging from 0 to 255. Using these embeddings gives you a **4x memory saving and about a 30% speed-up in search**, while keeping 99.99% of the response quality. As opposed to the original quantization method, with this feature you can spare disk usage if you directly implement pre-quantized embeddings. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +The configuration is simple. To create a collection with uint8 embeddings, simply add the following `datatype`: -Thank you for your feedback! 🙏 +```bash +PUT /collections/{collection_name} +{ + "vectors": { + "size": 1024, + "distance": "Dot", + "datatype": "uint8" + } +} +``` -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/detecting-coffee-anomalies.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +> **Note:** When using Quantization to optimize vector search, you can use this feature to `rescore` binary vectors against new byte vectors. With double the speedup, you will be able to achieve a better result than if you rescored with float vectors. With each byte vector quantized at the binary level, the result will deliver unparalleled efficiency and savings. To learn more about this optimization method, read our [Quantization docs](/documentation/guides/quantization/). -On this page: +## Minor improvements and new features -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/detecting-coffee-anomalies.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +- Greatly improve write performance while creating a snapshot of a large collection - [#3420](https://github.com/qdrant/qdrant/pull/3420), [#3938](https://github.com/qdrant/qdrant/pull/3938) +- Report pending optimizations awaiting an update operation in collection info - [#3962](https://github.com/qdrant/qdrant/pull/3962), [#3971](https://github.com/qdrant/qdrant/pull/3971) +- Improve `indexed_only` reliability on proxy shards - [#3998](https://github.com/qdrant/qdrant/pull/3998) +- Make shard diff transfer fall back to streaming records - [#3798](https://github.com/qdrant/qdrant/pull/3798) +- Cancel shard transfers when the shard is deleted - [#3784](https://github.com/qdrant/qdrant/pull/3784) +- Improve sparse vectors search performance by another 7% - [#4037](https://github.com/qdrant/qdrant/pull/4037) +- Build Qdrant with a single codegen unit to allow better compile-time optimizations - [#3982](https://github.com/qdrant/qdrant/pull/3982) +- Remove `vectors_count` from collection info because it is unreliable. 
**Check if you use this field before upgrading** - [#4052](https://github.com/qdrant/qdrant/pull/4052) +- Remove shard transfer method field from abort shard transfer operation - [#3803](https://github.com/qdrant/qdrant/pull/3803) -× +<|page-356-lllmstxt|> +With the launch of [Qdrant Hybrid Cloud](/hybrid-cloud/) we provide developers the ability to deploy Qdrant as a managed vector database in any desired environment, be it *in the cloud, on premise, or on the edge*. -[Powered by](https://qdrant.tech/) +We are excited to have trusted industry players support the launch of Qdrant Hybrid Cloud, allowing developers to unlock best-in-class advantages for building production-ready AI applications: -<|page-179-lllmstxt|> -## agentic-rag-crewai-zoom -- [Documentation](https://qdrant.tech/documentation/) -- Simple Agentic RAG System +- **Deploy In Your Own Environment:** Deploy the Qdrant vector database as a managed service on the infrastructure of choice, such as our launch partner solutions [Oracle Cloud Infrastructure (OCI)](https://blogs.oracle.com/cloud-infrastructure/post/qdrant-hybrid-cloud-now-available-oci-customers), [Red Hat OpenShift](/blog/hybrid-cloud-red-hat-openshift/), [Vultr](/blog/hybrid-cloud-vultr/), [DigitalOcean](/blog/hybrid-cloud-digitalocean/), [OVHcloud](/blog/hybrid-cloud-ovhcloud/), [Scaleway](/blog/hybrid-cloud-scaleway/), [Civo](/documentation/hybrid-cloud/platform-deployment-options/#civo), and [STACKIT](/blog/hybrid-cloud-stackit/). -![agentic-rag-crewai-zoom](https://qdrant.tech/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-1.png) +- **Seamlessly Integrate with Every Key Component of the Modern AI Stack:** Our new hybrid cloud offering also allows you to integrate with all of the relevant solutions for building AI applications. These include partner frameworks like [LlamaIndex](/blog/hybrid-cloud-llamaindex/), [LangChain](/blog/hybrid-cloud-langchain/), [Haystack by deepset](/blog/hybrid-cloud-haystack/), and [Airbyte](/blog/hybrid-cloud-airbyte/), as well as large language models (LLMs) like [JinaAI](/blog/hybrid-cloud-jinaai/) and [Aleph Alpha](/blog/hybrid-cloud-aleph-alpha/). -# [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#agentic-rag-with-crewai--qdrant-vector-database) Agentic RAG With CrewAI & Qdrant Vector Database +- **Ensure Full Data Sovereignty and Privacy Control:** Qdrant Hybrid Cloud offers unparalleled data isolation and the flexibility to process workloads either in the cloud or on-premise, ensuring data privacy and sovereignty requirements - all while being fully managed. -| Time: 45 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai) | | -| --- | --- | --- | --- | +#### Try Qdrant Hybrid Cloud on Partner Platforms -By combining the power of Qdrant for vector search and CrewAI for orchestrating modular agents, you can build systems that don’t just answer questions but analyze, interpret, and act. +![Hybrid Cloud Launch Partners Tutorials](/blog/hybrid-cloud-launch-partners/hybrid-cloud-launch-partners-tutorials.png) -Traditional RAG systems focus on fetching data and generating responses, but they lack the ability to reason deeply or handle multi-step processes. +Together with our launch partners, we created in-depth tutorials and use cases for production-ready vector search that explain how developers can leverage Qdrant Hybrid Cloud alongside the best-in-class solutions of our launch partners. 
These tutorials demonstrate that Qdrant Hybrid Cloud is the most flexible foundation to build modern, customer-centric AI applications with endless deployment options and full data sovereignty. Let’s dive right in: -In this tutorial, we’ll walk you through building an Agentic RAG system step by step. By the end, you’ll have a working framework for storing data in a Qdrant Vector Database and extracting insights using CrewAI agents in conjunction with Vector Search over your data. +**AI Customer Support Chatbot** with Qdrant Hybrid Cloud, Airbyte, Cohere, and AWS -We already built this app for you. [Clone this repository](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai) and follow along with the tutorial. +> This tutorial shows how to build a private AI customer support system using Cohere's AI models on AWS, Airbyte, and Qdrant Hybrid Cloud for efficient and secure query automation. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#what-youll-build) What You’ll Build +[View Tutorial](/documentation/tutorials/rag-customer-support-cohere-airbyte-aws/) -In this hands-on tutorial, we’ll create a system that: +**RAG System for Employee Onboarding** with Qdrant Hybrid Cloud, Oracle Cloud Infrastructure (OCI), Cohere, and LangChain -1. Uses Qdrant to store and retrieve meeting transcripts as vector embeddings -2. Leverages CrewAI agents to analyze and summarize meeting data -3. Presents insights in a simple Streamlit interface for easy interaction +> This tutorial demonstrates how to use Oracle Cloud Infrastructure (OCI) for a secure setup that integrates Cohere's language models with Qdrant Hybrid Cloud, using LangChain to orchestrate natural language search for corporate documents, enhancing resource discovery and onboarding. -This project demonstrates how to build a Vector Search powered Agentic workflow to extract insights from meeting recordings. By combining Qdrant’s vector search capabilities with CrewAI agents, users can search through and analyze their own meeting content. +[View Tutorial](/documentation/tutorials/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/) -The application first converts the meeting transcript into vector embeddings and stores them in a Qdrant vector database. It then uses CrewAI agents to query the vector database and extract insights from the meeting content. Finally, it uses Anthropic Claude to generate natural language responses to user queries based on the extracted insights from the vector database. +**Hybrid Search for Product PDF Manuals** with Qdrant Hybrid Cloud, LlamaIndex, and JinaAI -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#how-does-it-work) How Does It Work? +> Create a RAG-based chatbot that enhances customer support by parsing product PDF manuals using Qdrant Hybrid Cloud, LlamaIndex, and JinaAI, with DigitalOcean as the cloud host. This tutorial will guide you through the setup and integration process, enabling your system to deliver precise, context-aware responses for household appliance inquiries. -When you interact with the system, here’s what happens behind the scenes: +[View Tutorial](/documentation/tutorials/hybrid-search-llamaindex-jinaai/) -First the user submits a query to the system. In this example, we want to find out the average length of Marketing meetings. 
Since one of the data points from the meetings is the duration of the meeting, the agent can calculate the average duration of the meetings by averaging the duration of all meetings with the keyword “Marketing” in the topic or content. +**Region-Specific RAG System for Contract Management** with Qdrant Hybrid Cloud, Aleph Alpha, and STACKIT -![User Query Interface](https://qdrant.tech/articles_data/agentic-rag-crewai-zoom/query1.png) +> Learn how to streamline contract management with a RAG-based system in this tutorial, which utilizes Aleph Alpha’s embeddings and a region-specific cloud setup. Hosted on STACKIT with Qdrant Hybrid Cloud, this solution ensures secure, GDPR-compliant storage and processing of data, ideal for businesses with intensive contractual needs. -Next, the agent used the `search_meetings` tool to search the Qdrant vector database for the most semantically similar meeting points. We asked about Marketing meetings, so the agent searched the database with the search meeting tool for all meetings with the keyword “Marketing” in the topic or content. +[View Tutorial](/documentation/tutorials/rag-contract-management-stackit-aleph-alpha/) + +**Movie Recommendation System** with Qdrant Hybrid Cloud and OVHcloud -![Vector Search Results](https://qdrant.tech/articles_data/agentic-rag-crewai-zoom/output0.png) +> Discover how to build a recommendation system with our guide on collaborative filtering, using sparse vectors and the Movielens dataset. -Next, the agent used the `calculator` tool to find the average duration of the meetings. +[View Tutorial](/documentation/tutorials/recommendation-system-ovhcloud/) -![Duration Calculation](https://qdrant.tech/articles_data/agentic-rag-crewai-zoom/output.png) +**Private RAG Information Extraction Engine** with Qdrant Hybrid Cloud and Vultr using DSPy and Ollama -Finally, the agent used the `Information Synthesizer` tool to synthesize the analysis and present it in a natural language format. +> This tutorial teaches you how to handle and structure private documents with large unstructured data. Learn to use DSPy for information extraction, run your LLM with Ollama on Vultr, and manage data with Qdrant Hybrid Cloud on Vultr, perfect for regulated environments needing data privacy. -![Synthesized Analysis](https://qdrant.tech/articles_data/agentic-rag-crewai-zoom/output4.png) +[View Tutorial](/documentation/tutorials/rag-chatbot-vultr-dspy-ollama/) -The user sees the final output in a chat-like interface. +**RAG System That Chats with Blog Contents** with Qdrant Hybrid Cloud and Scaleway using LangChain. -![Chat Interface](https://qdrant.tech/articles_data/agentic-rag-crewai-zoom/app.png) +> Build a RAG system that combines blog scanning with the capabilities of semantic search. RAG enhances the generation of answers by retrieving relevant documents to aid the question-answering process. This setup showcases the integration of advanced search and AI language processing to improve information retrieval and generation tasks. -The user can then continue to interact with the system by asking more questions. +[View Tutorial](/documentation/tutorials/rag-chatbot-scaleway/) -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#architecture) Architecture +**Private Chatbot for Interactive Learning** with Qdrant Hybrid Cloud and Red Hat OpenShift using Haystack. -The system is built on three main components: +> In this tutorial, you will build a chatbot without public internet access. 
The goal is to keep sensitive data secure and isolated. Your RAG system will be built with Qdrant Hybrid Cloud on Red Hat OpenShift, leveraging Haystack for enhanced generative AI capabilities. This tutorial especially explores how this setup ensures that not a single data point leaves the environment. -- **Qdrant Vector Database**: Stores meeting transcripts and summaries as vector embeddings, enabling semantic search -- **CrewAI Framework**: Coordinates AI agents that handle different aspects of meeting analysis -- **Anthropic Claude**: Provides natural language understanding and response generation +[View Tutorial](/documentation/tutorials/rag-chatbot-red-hat-openshift-haystack/) -1. **Data Processing Pipeline** +#### Supporting Documentation - - Processes meeting transcripts and metadata - - Creates embeddings with SentenceTransformer - - Manages Qdrant collection and data upload -2. **AI Agent System** +Additionally, we built comprehensive documentation tutorials on how to successfully deploy Qdrant Hybrid Cloud on the right infrastructure of choice. For more information, please visit our documentation pages: - - Implements CrewAI agent logic - - Handles vector search integration - - Processes queries with Claude -3. **User Interface** +- [How to Deploy Qdrant Hybrid Cloud on AWS](/documentation/hybrid-cloud/platform-deployment-options/#amazon-web-services-aws) +- [How to Deploy Qdrant Hybrid Cloud on GCP](/documentation/hybrid-cloud/platform-deployment-options/#google-cloud-platform-gcp) +- [How to Deploy Qdrant Hybrid Cloud on Azure](/documentation/hybrid-cloud/platform-deployment-options/#mircrosoft-azure) +- [How to Deploy Qdrant Hybrid Cloud on DigitalOcean](/documentation/hybrid-cloud/platform-deployment-options/#digital-ocean) +- [How to Deploy Qdrant on Oracle Cloud](/documentation/hybrid-cloud/platform-deployment-options/#oracle-cloud-infrastructure) +- [How to Deploy Qdrant on Vultr](/documentation/hybrid-cloud/platform-deployment-options/#vultr) +- [How to Deploy Qdrant on Scaleway](/documentation/hybrid-cloud/platform-deployment-options/#scaleway) +- [How to Deploy Qdrant on OVHcloud](/documentation/hybrid-cloud/platform-deployment-options/#ovhcloud) +- [How to Deploy Qdrant on STACKIT](/documentation/hybrid-cloud/platform-deployment-options/#stackit) +- [How to Deploy Qdrant on Red Hat OpenShift](/documentation/hybrid-cloud/platform-deployment-options/#red-hat-openshift) +- [How to Deploy Qdrant on Linode](/documentation/hybrid-cloud/platform-deployment-options/#akamai-linode) +- [How to Deploy Qdrant on Civo](/documentation/hybrid-cloud/platform-deployment-options/#civo) - - Provides chat-like web interface - - Shows real-time processing feedback - - Maintains conversation history +#### Get Started Now! -* * * +[Qdrant Hybrid Cloud](/hybrid-cloud/) marks a significant advancement in vector databases, offering the most flexible way to implement vector search. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#getting-started) Getting Started +You can test out Qdrant Hybrid Cloud today! Simply sign up for or log into your [Qdrant Cloud account](https://cloud.qdrant.io/login) and get started in the **Hybrid Cloud** section. Also, to learn more about Qdrant Hybrid Cloud read our [Official Release Blog](/blog/hybrid-cloud/) or our [Qdrant Hybrid Cloud website](/hybrid-cloud/). For additional technical insights, please read our [documentation](/documentation/hybrid-cloud/). 
-![agentic-rag-crewai-zoom](https://qdrant.tech/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-2.png) +[![hybrid-cloud-get-started](/blog/hybrid-cloud-launch-partners/hybrid-cloud-get-started.png)](https://cloud.qdrant.io/login) -1. **Get API Credentials for Qdrant**: +<|page-357-lllmstxt|> +We are excited to announce the official launch of [Qdrant Hybrid Cloud](/hybrid-cloud/) today, a significant leap forward in the field of vector search and enterprise AI. Rooted in our open-source origin, we are committed to offering our users and customers unparalleled control and sovereignty over their data and vector search workloads. Qdrant Hybrid Cloud stands as **the industry's first managed vector database that can be deployed in any environment** - be it cloud, on-premise, or the edge. - - Sign up for an account at [Qdrant Cloud](https://cloud.qdrant.io/signup). - - Create a new cluster and copy the **Cluster URL** (format: [https://xxx.gcp.cloud.qdrant.io](https://xxx.gcp.cloud.qdrant.io/)). - - Go to **Data Access Control** and generate an **API key**. -2. **Get API Credentials for AI Services**: +

- - Get an API key from [Anthropic](https://www.anthropic.com/) - - Get an API key from [OpenAI](https://platform.openai.com/) +As the AI application landscape evolves, the industry is transitioning from prototyping innovative AI solutions to actively deploying AI applications into production (incl. GenAI, semantic search, or recommendation systems). In this new phase, **privacy**, **data sovereignty**, **deployment flexibility**, and **control** are at the top of developers’ minds. These factors are critical when developing, launching, and scaling new applications, whether they are customer-facing services like AI assistants or internal company solutions for knowledge and information retrieval or process automation. -* * * +Qdrant Hybrid Cloud offers developers a vector database that can be deployed in any existing environment, ensuring data sovereignty and privacy control through complete database isolation - with the full capabilities of our managed cloud service. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#setup) Setup +- **Unmatched Deployment Flexibility**: With its Kubernetes-native architecture, Qdrant Hybrid Cloud provides the ability to bring your own cloud or compute by deploying Qdrant as a managed service on the infrastructure of choice, such as Oracle Cloud Infrastructure (OCI), Vultr, Red Hat OpenShift, DigitalOcean, OVHcloud, Scaleway, STACKIT, Civo, VMware vSphere, AWS, Google Cloud, or Microsoft Azure. -1. **Clone the Repository**: +- **Privacy & Data Sovereignty**: Qdrant Hybrid Cloud offers unparalleled data isolation and the flexibility to process vector search workloads in their own environments. -```bash -git clone https://github.com/qdrant/examples.git -cd agentic_rag_zoom_crewai +- **Scalable & Secure Architecture**: Qdrant Hybrid Cloud's design ensures scalability and adaptability with its Kubernetes-native architecture, separates data and control for enhanced security, and offers a unified management interface for ease of use, enabling businesses to grow and adapt without compromising privacy or control. -``` +- **Effortless Setup in Seconds**: Setting up Qdrant Hybrid Cloud is incredibly straightforward, thanks to our [simple Kubernetes installation](/documentation/hybrid-cloud/) that connects effortlessly with your chosen infrastructure, enabling secure, scalable deployments right from the get-go -2. **Create and Activate a Python Virtual Environment with Python 3.10 for compatibility**: +Let’s explore these aspects in more detail: -```bash -python3.10 -m venv venv -source venv/bin/activate # Windows: venv\Scripts\activate +#### Maximizing Deployment Flexibility: Enabling Applications to Run Across Any Environment -``` +![hybrid-cloud-environments](/blog/hybrid-cloud/hybrid-cloud-environments.png) -3. **Install Dependencies**: +Qdrant Hybrid Cloud, powered by our seamless Kubernetes-native architecture, is the first managed vector database engineered for unparalleled deployment flexibility. This means that regardless of where you run your AI applications, you can now enjoy the benefits of a fully managed Qdrant vector database, simplifying operations across any cloud, on-premise, or edge locations. 
-```bash -pip install -r requirements.txt +For this launch of Qdrant Hybrid Cloud, we are proud to collaborate with key cloud providers, including [Oracle Cloud Infrastructure (OCI)](https://blogs.oracle.com/cloud-infrastructure/post/qdrant-hybrid-cloud-now-available-oci-customers), [Red Hat OpenShift](/blog/hybrid-cloud-red-hat-openshift/), [Vultr](/blog/hybrid-cloud-vultr/), [DigitalOcean](/blog/hybrid-cloud-digitalocean/), [OVHcloud](/blog/hybrid-cloud-ovhcloud/), [Scaleway](/blog/hybrid-cloud-scaleway/), [Civo](/documentation/hybrid-cloud/platform-deployment-options/#civo), and [STACKIT](/blog/hybrid-cloud-stackit/). These partnerships underscore our commitment to delivering a versatile and robust vector database solution that meets the complex deployment requirements of today's AI applications. -``` +In addition to our partnerships with key cloud providers, we are also launching in collaboration with renowned AI development tools and framework leaders, including [LlamaIndex](/blog/hybrid-cloud-llamaindex/), [LangChain](/blog/hybrid-cloud-langchain/), [Airbyte](/blog/hybrid-cloud-airbyte/), [JinaAI](/blog/hybrid-cloud-jinaai/), [Haystack by deepset](/blog/hybrid-cloud-haystack/), and [Aleph Alpha](/blog/hybrid-cloud-aleph-alpha/). These launch partners are instrumental in ensuring our users can seamlessly integrate with essential technologies for their AI applications, enriching our offering and reinforcing our commitment to versatile and comprehensive deployment environments. -4. **Configure Environment Variables**: -Create a `.env.local` file with: +Together with our launch partners we have created detailed tutorials that show how to build cutting-edge AI applications with Qdrant Hybrid Cloud on the infrastructure of your choice. These tutorials are available in our [launch partner blog](/blog/hybrid-cloud-launch-partners/). Additionally, you can find expansive [documentation](/documentation/hybrid-cloud/) and instructions on how to [deploy Qdrant Hybrid Cloud](/documentation/hybrid-cloud/hybrid-cloud-setup/). -```bash -openai_api_key=your_openai_key_here -anthropic_api_key=your_anthropic_key_here -qdrant_url=your_qdrant_url_here -qdrant_api_key=your_qdrant_api_key_here +#### Powering Vector Search & AI with Unmatched Data Sovereignty -``` +Proprietary data, the lifeblood of AI-driven innovation, fuels personalized experiences, accurate recommendations, and timely anomaly detection. This data, unique to each organization, encompasses customer behaviors, internal processes, and market insights - crucial for tailoring AI applications to specific business needs and competitive differentiation. However, leveraging such data effectively while ensuring its **security, privacy, and control** requires diligence. -* * * +The innovative architecture of Qdrant Hybrid Cloud ensures **complete database isolation**, empowering developers with the autonomy to tailor where they process their vector search workloads with total data sovereignty. Rooted deeply in our commitment to open-source principles, this approach aims to foster a new level of trust and reliability by providing the essential tools to navigate the exciting landscape of enterprise AI. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#usage) Usage +#### How We Designed the Qdrant Hybrid Cloud Architecture -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#1-process-meeting-data) 1\. 
Process Meeting Data +We designed the architecture of Qdrant Hybrid Cloud to meet the evolving needs of businesses seeking unparalleled flexibility, control, and privacy. -The [`data_loader.py`](https://github.com/qdrant/examples/blob/master/agentic_rag_zoom_crewai/vector/data_loader.py) script processes meeting data and stores it in Qdrant: +- **Kubernetes-Native Design**: By embracing Kubernetes, we've ensured that our architecture is both scalable and adaptable. This choice supports our deployment flexibility principle, allowing Qdrant Hybrid Cloud to integrate seamlessly with any infrastructure that can run Kubernetes. -```bash -python vector/data_loader.py +- **Decoupled Data and Control Planes**: Our architecture separates the data plane (where the data is stored and processed) from the control plane (which manages the cluster operations). This separation enhances security, allows for more granular control over the data, and enables the data plane to reside anywhere the user chooses. -``` +- **Unified Management Interface**: Despite the underlying complexity and the diversity of deployment environments, we designed a unified, user-friendly interface that simplifies the Qdrant cluster management. This interface supports everything from deployment to scaling and upgrading operations, all accessible from the [Qdrant Cloud portal](https://cloud.qdrant.io/login). -After this script has run, you should see a new collection in your Qdrant Cloud account called `zoom_recordings`. This collection contains the vector embeddings of the meeting transcripts. The points in the collection contain the original meeting data, including the topic, content, and summary. +- **Extensible and Modular**: Recognizing the rapidly evolving nature of technology and enterprise needs, we built Qdrant Hybrid Cloud to be both extensible and modular. Users can easily integrate new services, data sources, and deployment environments as their requirements grow and change. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#2-launch-the-interface) 2\. Launch the Interface +#### Diagram: Qdrant Hybrid Cloud Architecture +![hybrid-cloud-architecture](/blog/hybrid-cloud/hybrid-cloud-architecture.png) -The [`streamlit_app.py`](https://github.com/qdrant/examples/blob/master/agentic_rag_zoom_crewai/vector/streamlit_app.py) is located in the `vector` folder. To launch it, run: +#### Quickstart: Effortless Setup with Our One-Step Installation -```bash -streamlit run vector/streamlit_app.py +We’ve made getting started with Qdrant Hybrid Cloud as simple as possible. The Kubernetes “One-Step” installation will allow you to connect with the infrastructure of your choice. This is how you can get started: -``` +1. **Activate Hybrid Cloud**: Simply sign up for or log into your [Qdrant Cloud](https://cloud.qdrant.io/login) account and navigate to the **Hybrid Cloud** section. -When you run this script, you will be able to interact with the system through a chat-like interface. Ask questions about the meeting content, and the system will use the AI agents to find the most relevant information and present it in a natural language format. +2. **Onboard your Kubernetes cluster**: Follow the onboarding wizard and add your Kubernetes cluster as a Hybrid Cloud Environment - be it in the cloud, on-premise, or at the edge. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#the-data-pipeline) The Data Pipeline +3. 
**Deploy Qdrant clusters securely, with confidence:** Now, you can effortlessly create and manage Qdrant clusters in your own environment, directly from the central Qdrant Management Console. This supports horizontal and vertical scaling, zero-downtime upgrades, and disaster recovery seamlessly, allowing you to deploy anywhere with confidence. -At the heart of our system is the data processing pipeline: +Explore our [detailed documentation](/documentation/hybrid-cloud/) and [tutorials](/documentation/examples/) to seamlessly deploy Qdrant Hybrid Cloud in your preferred environment, and don't miss our [launch partner blog post](/blog/hybrid-cloud-launch-partners/) for practical insights. Start leveraging the full potential of Qdrant Hybrid Cloud and [create your first Qdrant cluster today](https://cloud.qdrant.io/login), unlocking the flexibility and control essential for your AI and vector search workloads. -```python -class MeetingData: - def _initialize(self): - self.data_dir = Path(__file__).parent.parent / 'data' - self.meetings = self._load_meetings() +[![hybrid-cloud-get-started](/blog/hybrid-cloud/hybrid-cloud-get-started.png)](https://cloud.qdrant.io/login) - self.qdrant_client = QdrantClient( - url=os.getenv('qdrant_url'), - api_key=os.getenv('qdrant_api_key') - ) - self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') +## Launch Partners -``` +We launched Qdrant Hybrid Cloud with assistance and support of our trusted partners. Learn what they have to say about our latest offering: -The singleton pattern in data\_loader.py is implemented through a MeetingData class that uses Python’s **new** and **init** methods. The class maintains a private \_instance variable to track if an instance exists, and a \_initialized flag to ensure the initialization code only runs once. When creating a new instance with MeetingData(), **new** first checks if \_instance exists - if it doesn’t, it creates one and sets the initialization flag to False. The **init** method then checks this flag, and if it’s False, runs the initialization code and sets the flag to True. This ensures that all subsequent calls to MeetingData() return the same instance with the same initialized resources. +#### Oracle Cloud Infrastructure: +> *"We are excited to partner with Qdrant to bring their powerful vector search capabilities to Oracle Cloud Infrastructure. By offering Qdrant Hybrid Cloud as a managed service on OCI, we are empowering enterprises to harness the full potential of AI-driven applications while maintaining complete control over their data. This collaboration represents a significant step forward in making scalable vector search accessible and manageable for businesses across various industries, enabling them to drive innovation, enhance productivity, and unlock valuable insights from their data."* Dr. Sanjay Basu, Senior Director of Cloud Engineering, AI/GPU Infrastructure at Oracle -When processing meetings, we need to consider both the content and context. Each meeting gets converted into a rich text representation before being transformed into a vector: +Read more in [OCI's latest Partner Blog](https://blogs.oracle.com/cloud-infrastructure/post/qdrant-hybrid-cloud-now-available-oci-customers). 
-```python -text_to_embed = f""" - Topic: {meeting.get('topic', '')} - Content: {meeting.get('vtt_content', '')} - Summary: {json.dumps(meeting.get('summary', {}))} -""" +#### Red Hat: +> *“Red Hat is committed to driving transparency, flexibility and choice for organizations to more easily unlock the power of AI. By working with partners like Qdrant to enable streamlined integration experiences on Red Hat OpenShift for AI use cases, organizations can more effectively harness critical data and deliver real business outcomes,”* said Steven Huels, vice president and general manager, AI Business Unit, Red Hat. -``` +Read more in our [official Red Hat Partner Blog](/blog/hybrid-cloud-red-hat-openshift/). -This structured format ensures our vector embeddings capture the full context of each meeting. But processing meetings one at a time would be inefficient. Instead, we batch process our data: +#### Vultr: +> *"Our collaboration with Qdrant empowers developers to unlock the potential of vector search applications, such as RAG, by deploying Qdrant Hybrid Cloud with its high-performance search capabilities directly on Vultr's global, automated cloud infrastructure. This partnership creates a highly scalable and customizable platform, uniquely designed for deploying and managing AI workloads with unparalleled efficiency."* Kevin Cochrane, Vultr CMO. -```python -batch_size = 100 -for i in range(0, len(points), batch_size): - batch = points[i:i + batch_size] - self.qdrant_client.upsert( - collection_name='zoom_recordings', - points=batch - ) +Read more in our [official Vultr Partner Blog](/blog/hybrid-cloud-vultr/). -``` +#### OVHcloud: +> *“The partnership between OVHcloud and Qdrant Hybrid Cloud highlights, in the European AI landscape, a strong commitment to innovative and secure AI solutions, empowering startups and organisations to navigate AI complexities confidently. By emphasizing data sovereignty and security, we enable businesses to leverage vector databases securely."* Yaniv Fdida, Chief Product and Technology Officer, OVHcloud -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#building-the-ai-agent-system) Building the AI Agent System +Read more in our [official OVHcloud Partner Blog](/blog/hybrid-cloud-ovhcloud/). -Our AI system uses a tool-based approach. Let’s start with the simplest tool - a calculator for meeting statistics: +#### DigitalOcean: +> *“Qdrant, with its seamless integration and robust performance, equips businesses to develop cutting-edge applications that truly resonate with their users. Through applications such as semantic search, Q&A systems, recommendation engines, image search, and RAG, DigitalOcean customers can leverage their data to the fullest, ensuring privacy and driving innovation.“* - Bikram Gupta, Lead Product Manager, Kubernetes & App Platform, DigitalOcean. -```python -class CalculatorTool(BaseTool): - name: str = "calculator" - description: str = "Perform basic mathematical calculations" +Read more in our [official DigitalOcean Partner Blog](/blog/hybrid-cloud-digitalocean/). - def _run(self, a: int, b: int) -> dict: - return { - "addition": a + b, - "multiplication": a * b - } +#### Scaleway: +> *"With our partnership with Qdrant, Scaleway reinforces its status as Europe's leading cloud provider for AI innovation. The integration of Qdrant's fast and accurate vector database enriches our expanding suite of AI solutions. 
This means you can build smarter, faster AI projects with us, worry-free about performance and security."* Frédéric Bardolle, Lead PM AI, Scaleway -``` +Read more in our [official Scaleway Partner Blog](/blog/hybrid-cloud-scaleway/). -But the real power comes from our vector search integration. This tool converts natural language queries into vector representations and searches our meeting database: +#### Airbyte: +> *“The new Qdrant Hybrid Cloud is an exciting addition that offers peace of mind and flexibility, aligning perfectly with the needs of Airbyte Enterprise users who value the same balance. Being open-source at our core, both Qdrant and Airbyte prioritize giving users the flexibility to build and test locally—a significant advantage for data engineers and AI practitioners. We're enthusiastic about the Hybrid Cloud launch, as it mirrors our vision of enabling users to confidently transition from local development and local deployments to a managed solution, with both cloud and hybrid cloud deployment options.”* AJ Steers, Staff Engineer for AI, Airbyte -```python -class SearchMeetingsTool(BaseTool): - def _run(self, query: str) -> List[Dict]: - response = openai_client.embeddings.create( - model="text-embedding-ada-002", - input=query - ) - query_vector = response.data[0].embedding +Read more in our [official Airbyte Partner Blog](/blog/hybrid-cloud-airbyte/). - - return self.qdrant_client.search( - collection_name='zoom_recordings', - query_vector=query_vector, - limit=10 - ) +#### deepset: +> *“We hope that with Haystack 2.0 and our growing partnerships such as what we have here with Qdrant Hybrid Cloud, engineers are able to build AI systems with full autonomy. Both in how their pipelines are designed, and how their data are managed.”* Tuana Çelik, Developer Relations Lead, deepset. -``` +Read more in our [official Haystack by deepset Partner Blog](/blog/hybrid-cloud-haystack/). -The search results then feed into our analysis tool, which uses Claude to provide deeper insights: +#### LlamaIndex: +> *“LlamaIndex is thrilled to partner with Qdrant on the launch of Qdrant Hybrid Cloud, which upholds Qdrant's core functionality within a Kubernetes-based architecture. This advancement enhances LlamaIndex's ability to support diverse user environments, facilitating the development and scaling of production-grade, context-augmented LLM applications.”* Jerry Liu, CEO and Co-Founder, LlamaIndex -```python -class MeetingAnalysisTool(BaseTool): - def _run(self, meeting_data: dict) -> Dict: - meetings_text = self._format_meetings(meeting_data) +Read more in our [official LlamaIndex Partner Blog](/blog/hybrid-cloud-llamaindex/). - - message = client.messages.create( - model="claude-3-sonnet-20240229", - messages=[{\ - "role": "user",\ - "content": f"Analyze these meetings:\n\n{meetings_text}"\ - }] - ) +#### LangChain: +> *“The AI industry is rapidly maturing, and more companies are moving their applications into production. We're really excited at LangChain about supporting enterprises' unique data architectures and tooling needs through integrations and first-party offerings through LangSmith.
First-party enterprise integrations like Qdrant's greatly contribute to the LangChain ecosystem with enterprise-ready retrieval features that seamlessly integrate with LangSmith's observability, production monitoring, and automation features, and we're really excited to develop our partnership further.”* -Erick Friis, Founding Engineer at LangChain -``` +Read more in our [official LangChain Partner Blog](/blog/hybrid-cloud-langchain/). -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#orchestrating-the-workflow) Orchestrating the Workflow +#### Jina AI: +> *“The collaboration of Qdrant Hybrid Cloud with Jina AI’s embeddings gives every user the tools to craft a perfect search framework with unmatched accuracy and scalability. It’s a partnership that truly pays off!”* Nan Wang, CTO, Jina AI -The magic happens when we bring these tools together under our agent framework. We create two specialized agents: +Read more in our [official Jina AI Partner Blog](/blog/hybrid-cloud-jinaai/). -```python -researcher = Agent( - role='Research Assistant', - goal='Find and analyze relevant information', - tools=[calculator, searcher, analyzer] -) +We have also launched Qdrant Hybrid Cloud with the support of **Aleph Alpha**, **STACKIT** and **Civo**. Learn more about our valued partners: -synthesizer = Agent( - role='Information Synthesizer', - goal='Create comprehensive and clear responses' -) +- **Aleph Alpha:** [Enhance AI Data Sovereignty with Aleph Alpha and Qdrant Hybrid Cloud](/blog/hybrid-cloud-aleph-alpha/) +- **STACKIT:** [STACKIT and Qdrant Hybrid Cloud for Best Data Privacy](/blog/hybrid-cloud-stackit/) +- **Civo:** [Deploy Qdrant Hybrid Cloud on Civo Kubernetes](/documentation/hybrid-cloud/platform-deployment-options/#civo) -``` +<|page-358-lllmstxt|> +> *"The problem with many of the vector databases is that they work fine, they are scalable. This is common. The problem is that they are not easy to use. So that is why I always use Qdrant.”*\ +— Syed Asad +> -These agents work together in a coordinated workflow. The researcher gathers and analyzes information, while the synthesizer creates clear, actionable responses. This separation of concerns allows each agent to focus on its strengths. +Syed Asad is an accomplished AI/ML Professional, specializing in LLM Operations and RAGs. With a focus on Image Processing and Massive Scale Vector Search Operations, he brings a wealth of expertise to the field. His dedication to advancing artificial intelligence and machine learning technologies has been instrumental in driving innovation and solving complex challenges. Syed continues to push the boundaries of AI/ML applications, contributing significantly to the ever-evolving landscape of the industry. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#building-the-user-interface) Building the User Interface +***Listen to the episode on [Spotify](https://open.spotify.com/episode/4Gm4TQsO2PzOGBp5U6Cj2e?si=JrG0kHDpRTeb2gLi5zdi4Q), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/RVb6_CI7ysM?si=8Hm7XSWYTzK6SRj0).*** -The Streamlit interface provides a clean, chat-like experience for interacting with our AI system. 
Let’s start with the basic setup: + -```python -st.set_page_config( - page_title="Meeting Assistant", - page_icon="đŸ€–", - layout="wide" -) + -``` +## **Top takeaways:** -To make the interface more engaging, we add custom styling that makes the output easier to read: +Prompt engineering is the new frontier in AI. Let’s find out about how critical its role is in controlling AI language models. In this episode, Demetrios and Syed gets to discuss about it. -```python -st.markdown(""" - -""", unsafe_allow_html=True) +Syed also explores the retrieval augmented generation systems and machine learning technology at Kiwi Tech. This episode showcases the challenges and advancements in AI applications across various industries. -``` +Here are the highlights from this episode: -One of the key features is real-time feedback during processing. We achieve this with a custom output handler: +1. **Digital Family Tree:** Learn about the family tree app project that brings the past to life through video interactions with loved ones long gone. +2. **Multimodal Mayhem:** Discover the complexities of creating AI systems that can understand diverse accents and overcome transcription tribulations – all while being cost-effective! +3. **The Perfect Match:** Find out how semantic chunking is revolutionizing job matching in radiology and why getting the context right is non-negotiable. +4. **Quasar's Quantum Leap:** Syed shares the inside scoop on Quasar, a financial chatbot, and the AI magic that makes it tick. +5. **The Privacy Paradox:** Delve into the ever-present conflict between powerful AI outcomes and the essential quest to preserve data privacy. -```python -class ConsoleOutput: - def __init__(self, placeholder): - self.placeholder = placeholder - self.buffer = [] - self.update_interval = 0.5 # seconds - self.last_update = time.time() +> Fun Fact: Syed Asad and his team at Kiwi Tech use a GPU-based approach with GPT 4 for their AI system named Quasar, addressing challenges like temperature control and mitigating hallucinatory responses. +> - def write(self, text): - self.buffer.append(text) - if time.time() - self.last_update > self.update_interval: - self._update_display() +## Show notes: -``` +00:00 Clients seek engaging multimedia apps over chatbots.\ +06:03 Challenges in multimodal rags: accent, transcription, cost.\ +08:18 AWS credits crucial, but costs skyrocket quickly.\ +10:59 Accurate procedures crucial, Qdrant excels in search.\ +14:46 Embraces AI for monitoring and research.\ +19:47 Seeking insights on ineffective marketing models and solutions.\ +23:40 GPT 4 useful, prompts need tracking tools\ +25:28 Discussing data localization and privacy, favoring Ollama.\ +29:21 Hallucination control and pricing are major concerns.\ +32:47 DeepEval, AI testing, LLM, potential, open source.\ +35:24 Filter for appropriate embedding model based on use case and size. -This handler buffers the output and updates the display periodically, creating a smooth user experience. When a user sends a query, we process it with visual feedback: +## More Quotes from Syed: -```python -with st.chat_message("assistant"): - message_placeholder = st.empty() - progress_bar = st.progress(0) - console_placeholder = st.empty() +*"Qdrant has the ease of use. 
I have trained people in my team who specializes with Qdrant, and they were initially using Weaviate and Pinecone.”*\ +— Syed Asad - try: - console_output = ConsoleOutput(console_placeholder) - with contextlib.redirect_stdout(console_output): - progress_bar.progress(0.3) - full_response = get_crew_response(prompt) - progress_bar.progress(1.0) +*"What's happening nowadays is that the clients or the projects in which I am particularly working on are having more of multimedia or multimodal approach. They want their apps or their LLM apps to be more engaging rather than a mere chatbot.”*\ +— Syed Asad -``` +*"That is where the accuracy matters the most. And in this case, Qdrant has proved just commendable in giving excellent search results.”*\ +— Syed Asad in Advancements in Medical Imaging Search -The interface maintains a chat history, making it feel like a natural conversation: +## Transcript: +Demetrios: +What is up, good people? How y'all doing? We are back for yet another vector space talks. I'm super excited to be with you today because we're gonna be talking about rags and rag systems. And from the most basic naive rag all the way to the most advanced rag, we've got it covered with our guest of honor, Asad. Where are you at, my man? There he is. What's going on, dude? -```python -if "messages" not in st.session_state: - st.session_state.messages = [] +Syed Asad: +Yeah, everything is fine. -for message in st.session_state.messages: - with st.chat_message(message["role"]): - st.markdown(message["content"]) +Demetrios: +Excellent, excellent. Well, I know we were talking before we went live, and you are currently in India. It is very late for you, so I appreciate you coming on here and doing this with us. You are also, for those who do not know, a senior engineer for AI and machine learning at Kiwi Tech. Can you break down what Kiwi tech is for us real fast? -``` +Syed Asad: +Yeah, sure. Absolutely. So Kiwi tech is actually a software development, was actually a software development company focusing on software development, iOS and mobile apps. And right now we are in all focusing more on generative AI, machine learning and computer vision projects. So I am heading the AI part here. So. And we are having loads of projects here with, from basic to advanced rags, from naive to visual rags. So basically I'm doing rag in and out from morning to evening. -We also include helpful examples and settings in the sidebar: +Demetrios: +Yeah, you can't get away from it, huh? Man, that is great. -```python -with st.sidebar: - st.header("Settings") - search_limit = st.slider("Number of results", 1, 10, 5) +Syed Asad: +Everywhere there is rag. Even, even the machine learning part, which was previously done by me, is all now into rags engineered AI. Yeah. Machine learning is just at the background now. - analysis_depth = st.select_slider( - "Analysis Depth", - options=["Basic", "Standard", "Detailed"], - value="Standard" - ) +Demetrios: +Yeah, yeah, yeah. It's funny, I understand the demand for it because people are trying to see where they can get value in their companies with the new generative AI advancements. -``` +Syed Asad: +Yeah. -This combination of features creates an interface that’s both powerful and approachable. Users can see their query being processed in real-time, adjust settings to their needs, and maintain context through the chat history. +Demetrios: +So I want to talk a lot about advance rags, considering the audience that we have. 
I would love to hear about the visual rags also, because that sounds very exciting. Can we start with the visual rags and what exactly you are doing, what you're working on when it comes to that? -* * * +Syed Asad: +Yeah, absolutely. So initially when I started working, so you all might be aware with the concept of frozen rags, the normal and the basic rag, there is a text retrieval system. You just query your data and all those things. So what is happening nowadays is that the clients or the projects in which I am particularly working on are having more of multimedia or multimodal approach. So that is what is happening. So they want their apps or their LLM apps to be more engaging rather than a mere chatbot. Because. Because if we go on to the natural language or the normal english language, I mean, interacting by means of a video or interacting by means of a photo, like avatar, generation, anything like that. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/\#conclusion) Conclusion +Syed Asad: +So that has become more popular or, and is gaining more popularity. And if I talk about, specifically about visual rags. So the projects which I am working on is, say, for example, say, for example, there is a family tree type of app in which. In which you have an account right now. So, so you are recording day videos every day, right? Like whatever you are doing, for example, you are singing a song, you're walking in the park, you are eating anything like that, and you're recording those videos and just uploading them on that app. But what do you want? Like, your future generations can do some sort of query, like what, what was my grandfather like? What was my, my uncle like? Anything my friend like. And it was, it is not straight, restricted to a family. It can be friends also. -![agentic-rag-crewai-zoom](https://qdrant.tech/documentation/examples/agentic-rag-crewai-zoom/agentic-rag-3.png) +Syed Asad: +Anyway, so. And these are all us based projects, not indian based projects. Okay, so, so you, you go in query and it returns a video about your grandfather who has already died. He has not. You can see him speaking about that particular thing. So it becomes really engaging. So this is something which is called visual rag, which I am working right now on this. -This tutorial has demonstrated how to build a sophisticated meeting analysis system that combines vector search with AI agents. Let’s recap the key components we’ve covered: +Demetrios: +I love that use case. So basically it's, I get to be closer to my family that may or may not be here with us right now because the rag can pull writing that they had. It can pull video of other family members talking about it. It can pull videos of when my cousin was born, that type of stuff. -1. **Vector Search Integration** +Syed Asad: +Anything, anything from cousin to family. You can add any numbers of members of your family. You can give access to any number of people who can have after you, after you're not there, like a sort of a nomination or a delegation live up thing. So that is, I mean, actually, it is a very big project, involves multiple transcription models, video transcription models. It also involves actually the databases, and I'm using Qdrant, proud of it. So, in that, so. And Qdrant is working seamlessly in that. So, I mean, at the end there is a vector search, but at the background there is more of more of visual rag, and people want to communicate through videos and photos. 
- - Efficient storage and retrieval of meeting content using Qdrant - - Semantic search capabilities through vector embeddings - - Batched processing for optimal performance -2. **AI Agent Framework** +Syed Asad: +So that is coming into picture more. - - Tool-based approach for modular functionality - - Specialized agents for research and analysis - - Integration with Claude for intelligent insights -3. **Interactive Interface** +Demetrios: +Well, talk to me about multimodal rag. And I know it's a bit of a hairy situation because if you're trying to do vector search with videos, it can be a little bit more complicated than just vector search with text. Right. So what are some of the unique challenges that you've seen when it comes to multimodal rag? - - Real-time feedback and progress tracking - - Persistent chat history - - Configurable search and analysis settings +Syed Asad: +The first challenge dealing with multimodal rags is actually the accent, because it can be varying accent. The problem with the transcription, one of the problems or the challenges which I have faced in this is that lack of proper transcription models, if you are, if you are able to get a proper transcription model, then if that, I want to deploy that model in the cloud, say for example, an AWS cloud. So that AWS cloud is costing heavy on the pockets. So managing infra is one of the part. I mean, I'm talking in a, in a, in a highly scalable production environment. I'm not talking about a research environment in which you can do anything on a collab notebook and just go with that. So whenever it comes to the client part or the delivery part, it becomes more critical. And even there, there were points then that we have to entirely overhaul the entire approach, which was working very fine when we were doing it on the dev environment, like the openais whisper. -The resulting system demonstrates the power of combining vector search with AI agents to create an intelligent meeting assistant. By following this tutorial, you’ve learned how to: +Syed Asad: +We started with that OpenAI's whisper. It worked fine. The transcription was absolutely fantastic. But we couldn't go into the production. -- Process and store meeting data efficiently -- Implement semantic search capabilities -- Create specialized AI agents for analysis -- Build an intuitive user interface +Demetrios: +Part with that because it was too, the word error rate was too high, or because it was too slow. What made it not allow you to go into production? -This foundation can be extended in many ways, such as: +Syed Asad: +It was, the word error rate was also high. It was very slow when it was being deployed on an AWS instance. And the thing is that the costing part, because usually these are startups, or mid startup, if I talk about the business point of view, not the tech point of view. So these companies usually offer these type of services for free, and on the basis of these services they try to raise funding. So they want something which is actually optimized, optimizing their cost as well. So what I personally feel, although AWS is massively scalable, but I don't prefer AWS at all until, unless there are various other options coming out, like salad. I had a call, I had some interactions with Titan machine learning also, but it was also fine. But salad is one of the best as of now. -- Adding more specialized agents -- Implementing additional analysis tools -- Enhancing the user interface -- Integrating with other data sources +Demetrios: +Yeah. 
Unless you get that free AWS credits from the startup program, it can get very expensive very quickly. And even if you do have the free AWS credits, it still gets very expensive very quickly. So I understand what you're saying is basically it was unusable because of the cost and the inability to figure out, it was more of a product problem if you could figure out how to properly monetize it. But then you had technical problems like word error rate being really high, the speed and latency was just unbearable. I can imagine. So unless somebody makes a query and they're ready to sit around for a few minutes and let that query come back to you, with a video or some documents, whatever it may be. Is that what I'm understanding on this? And again, this is for the family tree use case that you're talking about. -The code is available in the [repository](https://github.com/qdrant/examples/tree/master/agentic_rag_zoom_crewai), and we encourage you to experiment with your own modifications and improvements. +Syed Asad: +Yes, family tree use case. So what was happening in that, in that case is a video is uploaded, it goes to the admin for an approval actually. So I mean you can, that is where we, they were restricting the costing part as far as the project was concerned. It's because you cannot upload any random videos and they will select that. Just some sort of moderation was also there, as in when the admin approves those videos, that videos goes on to the transcription pipeline. They are transcripted via an, say a video to text model like the open eyes whisper. So what was happening initially, all the, all the research was done with Openais, but at the end when deployment came, we have to go with deep Gram and AssemblyAI. That was the place where these models were excelling far better than OpenAI. -* * * +Syed Asad: +And I'm a big advocate of open source models, so also I try to leverage those, but it was not pretty working in production environment. -##### Was this page useful? +Demetrios: +Fascinating. So you had that, that's one of your use cases, right? And that's very much the multimodal rag use case. Are all of your use cases multimodal or did you have, do you have other ones too? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Syed Asad: +No, all are not multimodal. There are few multimodal, there are few text based on naive rag also. So what, like for example, there is one use case coming which is sort of a job search which is happening. A job search for a radiology, radiology section. I mean a very specialized type of client it is. And they're doing some sort of job search matching the modalities and procedures. And it is sort of a temporary job. Like, like you have two shifts ready, two shifts begin, just some. -Thank you for your feedback! 🙏 +Syed Asad: +So, so that is, that is very critical when somebody is putting their procedures or what in. Like for example, they, they are specializing in x rays in, in some sort of medical procedures and that is matching with the, with the, with the, with the employers requirement. So that is where the accuracy matters the most. Accurate. And in this case, Qdrant has proved just commendable in giving excellent search results. The other way around is that in this case is there were some challenges related to the quality of results also because. So progressing from frozen rack to advanced rag like adopting methods like re ranking, semantic chunking. 
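The re-ranking step mentioned above is easy to sketch against Qdrant: retrieve a generous candidate set with vector search, then re-score the candidates with a cross-encoder before keeping the top few. A minimal illustration, assuming a `knowledge_base` collection with a `document` payload field (as in the RAG walkthrough later in this document) and a cross-encoder from `sentence-transformers`; the collection, field, and model names are placeholders rather than details from the talk.

```python
# Two-stage retrieval: fast vector search in Qdrant, then cross-encoder re-ranking.
from qdrant_client import QdrantClient, models
from sentence_transformers import CrossEncoder

client = QdrantClient(url="http://localhost:6333")  # assumed local instance
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def search_with_rerank(query: str, top_k: int = 20, final_k: int = 5):
    # First stage: approximate nearest-neighbour search over the embeddings.
    hits = client.query_points(
        collection_name="knowledge_base",
        query=models.Document(text=query, model="BAAI/bge-small-en-v1.5"),
        limit=top_k,
    ).points
    docs = [hit.payload["document"] for hit in hits]
    # Second stage: the cross-encoder scores each (query, document) pair jointly,
    # which is slower than vector search but usually improves the top ordering.
    scores = reranker.predict([(query, doc) for doc in docs])
    ranked = sorted(zip(docs, scores), key=lambda pair: float(pair[1]), reverse=True)
    return ranked[:final_k]
```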
I have, I have started using semantic chunking. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/agentic-rag-crewai-zoom.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Syed Asad: +So it has proved very beneficial as far as the quality of results is concerned. -On this page: +Demetrios: +Well, talk to me more about. I'm trying to understand this use case and why a rag is useful for the job matching. You have doctors who have specialties and they understand, all right, they're, maybe it's an orthopedic surgeon who is very good at a certain type of surgery, and then you have different jobs that come online. They need to be matched with those different jobs. And so where does the rag come into play? Because it seems like it could be solved with machine learning as opposed to AI. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/agentic-rag-crewai-zoom.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Syed Asad: +Yeah, it could have been solved through machine learning, but the type of modalities that are, the type of, say, the type of jobs which they were posting are too much specialized. So it needed some sort of contextual matching also. So there comes the use case for the rag. In this place, the contextual matching was required. Initially, an approach for machine learning was on the table, but it was done with, it was not working. -× +Demetrios: +I get it, I get it. So now talk to me. This is really important that you said accuracy needs to be very high in this use case. How did you make sure that the accuracy was high? Besides the, I think you said chunking, looking at the chunks, looking at how you were doing that, what were some other methods you took to make sure that the accuracy was high? -[Powered by](https://qdrant.tech/) +Syed Asad: +I mean, as far as the accuracy is concerned. So what I did was that my focus was on the embedding model, actually when I started with what type of embed, choice of embedding model. So initially my team started with open source model available readily on hugging face, looking at some sort of leaderboard metrics, some sort of model specializing in medical, say, data, all those things. But even I was curious that the large language, the embedding models which were specializing in medical data, they were also not returning good results and they were mismatching. When, when there was a tabular format, I created a visualization in which the cosine similarity of various models were compared. So all were lagging behind until I went ahead with cohere. Cohere re rankers. They were the best in that case, although they are not trained on that. -<|page-180-lllmstxt|> -## rag-deepseek -- [Documentation](https://qdrant.tech/documentation/) -- 5 Minute RAG with Qdrant and DeepSeek +Syed Asad: +And just an API call was required rather than loading that whole model onto the local. -![deepseek-rag-qdrant](https://qdrant.tech/documentation/examples/rag-deepseek/deepseek.png) +Demetrios: +Interesting. All right. And so then were you doing certain types, so you had the cohere re ranker that gave you a big up. Were you doing any kind of monitoring of the output also, or evaluation of the output and if so, how? 
-# [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#5-minute-rag-with-qdrant-and-deepseek) 5 Minute RAG with Qdrant and DeepSeek +Syed Asad: +Yes, for evaluation, for monitoring we readily use arrays AI, because I am a, I'm a huge advocate of Llama index also because it has made everything so easier versus lang chain. I mean, if I talk about my personal preference, not regarding any bias, because I'm not linked with anybody, I'm not promoting it here, but they are having the best thing which I write, I like about Llama index and why I use it, is that anything which is coming into play as far as the new research is going on, like for example, a recent research paper was with the raft retrieval augmented fine tuning, which was released by the Microsoft, and it is right now available on archive. So barely few days after they just implemented it in the library, and you can readily start using it rather than creating your own structure. So, yeah, so it was. So one of my part is that I go through the research papers first, then coming on to a result. So a research based approach is required in actually selecting the models, because every day there is new advancement going on in rags and you cannot figure out what is, what would be fine for you, and you cannot do hit and trial the whole day. -| Time: 5 min | Level: Beginner | Output: [GitHub](https://github.com/qdrant/examples/blob/master/rag-with-qdrant-deepseek/deepseek-qdrant.ipynb) | | -| --- | --- | --- | --- | +Demetrios: +Yes, that is a great point. So then if we break down your tech stack, what does it look like? You're using Llama index, you're using arise for the monitoring, you're using Qdrant for your vector database. You have the, you have the coherent re ranker, you are using GPT 3.5. -This tutorial demonstrates how to build a **Retrieval-Augmented Generation (RAG)** pipeline using Qdrant as a vector storage solution and DeepSeek for semantic query enrichment. RAG pipelines enhance Large Language Model (LLM) responses by providing contextually relevant data. +Syed Asad: +No, it's GPT 4, not 3.5. -## [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#overview) Overview +Demetrios: +You needed to go with GPT 4 because everything else wasn't good enough. -In this tutorial, we will: +Syed Asad: +Yes, because one of the context length was one of the most things. But regarding our production, we have been readily using since the last one and a half months. I have been readily using Mixtril. I have been. I have been using because there's one more challenge coming onto the rack, because there's one more I'll give, I'll give you an example of one more use case. It is the I'll name the project also because I'm allowed by my company. It is a big project by the name of Quasar markets. It is a us based company and they are actually creating a financial market type of check chatbot. -1. Take sample text and turn it into vectors with FastEmbed. -2. Send the vectors to a Qdrant collection. -3. Connect Qdrant and DeepSeek into a minimal RAG pipeline. -4. Ask DeepSeek different questions and test answer accuracy. -5. Enrich DeepSeek prompts with content retrieved from Qdrant. -6. Evaluate answer accuracy before and after. +Syed Asad: +Q u a s a r, quasar. You can search it also, and they give you access to various public databases also, and some paid databases also. They have a membership plan. So we are entirely handling the front end backend. I'm not handling the front end and the back end, I'm handling the AI part in that. 
So one of the challenges is the inference, timing, the timing in which the users are getting queries when it is hitting the database. Say for example, there is a database publicly available database called FRED of the US government. So when user can select in that app and go and select the FRED database and want to ask some questions regarding that.

+Syed Asad:
+So that is in this place there is no vectors, there are no vector databases. It is going without that. So we are following some keyword approach. We are extracting keywords, classifying the queries in simple or complex, then hitting it again to the database, sending it on the live API, getting results. So there are multiple hits going on. So what happened? This all multiple hits which were going on. They reduced the timing and I mean the user experience was being badly affected as the time for the retrieval has gone up and user and if you're going any query and inputting any query it is giving you results in say 1 minute. You wouldn't be waiting for 1 minute for a result.

+Demetrios:
+Not at all.

+Syed Asad:
+So this is one of the challenge for a GPU based approach. And in, in the background everything was working on GPT 4 even, not 3.5. I mean the costliest.

+Demetrios:
+Yeah.

+Syed Asad:
+So, so here I started with the LPU approach, Groq. I mean it's magical.

+Demetrios:
+Yeah.

+Syed Asad:
+I have been implementing Groq since the last many days and it has been magical. The chatbots are running blazingly fast but there are some shortcomings also. You cannot control the temperature if you have lesser control on hallucination. That is one of the challenges which I am facing. So that is why I am not able to deploy Groq into production right now. Because hallucination is one of the concern for the client. Also for anybody who is having, who wants to have a rag on their own data, say, or AI on their own data, they won't, they won't expect you, the LLM, to be creative. So that is one of the challenges.

+Syed Asad:
+So what I found that although many of the tools that are available in the market right now day in and day out, there are more researches. But most of the things which are coming up in our feeds or more, I mean they are coming as a sort of a marketing gimmick. They're not working actually on the ground.

+Demetrios:
+Tell me, tell me more about that. What other stuff have you tried that's not working? Because I feel that same way. I've seen it and I also have seen what feels like some people, basically they release models for marketing purposes as opposed to actual valuable models going out there. So which ones? I mean Groq, knowing about Groq and where it excels and what some of the downfalls are is really useful.
It feels like this idea of temperature being able to control the knob on the temperature and then trying to decrease the hallucinations is something that is fixable in the near future. So maybe it's like months that we'll have to deal with that type of thing for now. But I'd love to hear what other things you've tried that were not like you thought they were going to be when you were scrolling Twitter or LinkedIn. -[Qdrant](https://qdrant.tech/) will act as a knowledge base providing the context information for the prompts we’ll be sending to the LLM. +Syed Asad: +Should I name them? -You can get a free-forever Qdrant cloud instance at [http://cloud.qdrant.io](http://cloud.qdrant.io/). Learn about setting up your instance from the [Quickstart](https://qdrant.tech/documentation/quickstart-cloud/). +Demetrios: +Please. So we all know we don't have to spend our time on them. -```python -QDRANT_URL = "https://xyz-example.eu-central.aws.cloud.qdrant.io:6333" -QDRANT_API_KEY = "" +Syed Asad: +I'll start with OpenAI. The clients don't like GPT 4 to be used in there just because the primary concern is the cost. Secondary concern is the data privacy. And the third is that, I mean, I'm talking from the client's perspective, not the tech stack perspective. -``` +Demetrios: +Yeah, yeah, yeah. -### [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#instantiating-qdrant-client) Instantiating Qdrant Client +Syed Asad: +They consider OpenAI as a more of a marketing gimmick. Although GPT 4 gives good results. I'm, I'm aware of that, but the clients are not in favor. But the thing is that I do agree that GPT 4 is still the king of llms right now. So they have no option, no option to get the better, better results. But Mixtral is performing very good as far as the hallucinations are concerned. Just keeping the parameter temperature is equal to zero in a python code does not makes the hallucination go off. It is one of my key takeaways. -```python -from qdrant_client import QdrantClient, models +Syed Asad: +I have been bogging my head. Just. I'll give you an example, a chat bot. There is a, there's one of the use case in which is there's a big publishing company. I cannot name that company right now. And they want the entire system of books since the last 2025 years to be just converted into a rack pipeline. And the people got query. The. -client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY) +Syed Asad: +The basic problem which I was having is handling a hello. When a user types hello. So when you type in hello, it. -``` +Demetrios: +Gives you back a book. -### [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#building-the-knowledge-base) Building the knowledge base +Syed Asad: +It gives you back a book even. It is giving you back sometimes. Hello, I am this, this, this. And then again, some information. What you have written in the prompt, it is giving you everything there. I will answer according to this. I will answer according to this. So, so even if the temperature is zero inside the code, even so that, that included lots of prompt engineering. -Qdrant will use vector embeddings of our facts to enrich the original prompt with some context. Thus, we need to store the vector embeddings and the facts used to generate them. +Syed Asad: +So prompt engineering is what I feel is one of the most important trades which will be popular, which is becoming popular. And somebody is having specialization in prompt engineering. 
I mean, they can control the way how an LLM behaves because it behaves weirdly. Like in this use case, I was using croc and Mixtral. So to control Mixtral in such a way. It was heck lot of work, although it, we made it at the end, but it was heck lot of work in prompt engineering part. -We’ll be using the [bge-base-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model via [FastEmbed](https://github.com/qdrant/fastembed/) \- A lightweight, fast, Python library for embeddings generation. +Demetrios: +And this was, this was Mixtral large. -The Qdrant client provides a handy integration with FastEmbed that makes building a knowledge base very straighforward. +Syed Asad: +Mixtral, seven bits, eight by seven bits. -First, we need to create a collection, so Qdrant would know what vectors it will be dealing with, and then, we just pass our raw documents -wrapped into `models.Document` to compute and upload the embeddings. +Demetrios: +Yeah. I mean, yeah, that's the trade off that you have to deal with. And it wasn't fine tuned at all. -pythonpython +Syed Asad: +No, it was not fine tuned because we were constructing a rack pipeline, not a fine tuned application, because right now, right now, even the customers are not interested in getting a fine tune model because it cost them and they are more interested in a contextual, like a rag contextual pipeline. -```python -collection_name = "knowledge_base" -model_name = "BAAI/bge-small-en-v1.5" -client.create_collection( - collection_name=collection_name, - vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE) -) +Demetrios: +Yeah, yeah. Makes sense. So basically, this is very useful to think about. I think we all understand and we've all seen that GPT 4 does best if we can. We want to get off of it as soon as possible and see how we can, how far we can go down the line or how far we can go on the difficulty spectrum. Because as soon as you start getting off GPT 4, then you have to look at those kind of issues with like, okay, now it seems to be hallucinating a lot more. How do I figure this out? How can I prompt it? How can I tune my prompts? How can I have a lot of prompt templates or a prompt suite to make sure that things work? And so are you using any tools for keeping track of prompts? I know there's a ton out there. -``` +Syed Asad: +We initially started with the parameter efficient fine tuning for prompts, but nothing is working 100% interesting. Nothing works 100% it is as far as the prompting is concerned. It goes on to a hit and trial at the end. Huge wastage of time in doing prompt engineering. Even if you are following the exact prompt template given on the hugging face given on the model card anywhere, it will, it will behave, it will act, but after some time. -```python -documents = [\ - "Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!",\ - "Docker helps developers build, share, and run applications anywhere — without tedious environment configuration or management.",\ - "PyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing.",\ - "MySQL is an open-source relational database management system (RDBMS). 
A relational database organizes data into one or more data tables in which data may be related to each other; these relations help structure the data. SQL is a language that programmers use to create, modify and extract data from the relational database, as well as control user access to the database.",\ - "NGINX is a free, open-source, high-performance HTTP server and reverse proxy, as well as an IMAP/POP3 proxy server. NGINX is known for its high performance, stability, rich feature set, simple configuration, and low resource consumption.",\ - "FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",\ - "SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining.",\ - "The cron command-line utility is a job scheduler on Unix-like operating systems. Users who set up and maintain software environments use cron to schedule jobs (commands or shell scripts), also known as cron jobs, to run periodically at fixed times, dates, or intervals.",\ -] -client.upsert( - collection_name=collection_name, - points=[\ - models.PointStruct(\ - id=idx,\ - vector=models.Document(text=document, model=model_name),\ - payload={"document": document},\ - )\ - for idx, document in enumerate(documents)\ - ], -) +Demetrios: +Yeah, yeah. -``` +Syed Asad: +But mixed well. Is performing very good. Very, very good. Mixtral eight by seven bits. That's very good. -## [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#setup-deepseek) Setup DeepSeek +Demetrios: +Awesome. -RAG changes the way we interact with Large Language Models. We’re converting a knowledge-oriented task, in which the model may create a counterfactual answer, into a language-oriented task. The latter expects the model to extract meaningful information and generate an answer. LLMs, when implemented correctly, are supposed to be carrying out language-oriented tasks. +Syed Asad: +The summarization part is very strong. It gives you responses at par with GPT 4. -The task starts with the original prompt sent by the user. The same prompt is then vectorized and used as a search query for the most relevant facts. Those facts are combined with the original prompt to build a longer prompt containing more information. +Demetrios: +Nice. Okay. And you don't have to deal with any of those data concerns that your customers have. -But let’s start simply by asking our question directly. +Syed Asad: +Yeah, I'm coming on to that only. So the next part was the data concern. So they, they want either now or in future the localization of llms. I have been doing it with readily, with Llama, CPP and Ollama. Right now. Ollama is very good. I mean, I'm a huge, I'm a huge fan of Ollama right now, and it is performing very good as far as the localization and data privacy is concerned because, because at the end what you are selling, it makes things, I mean, at the end it is sales. So even if the client is having data of the customers, they want to make their customers assure that the data is safe. -```python -prompt = """ -What tools should I need to use to build a web service using vector embeddings for search? -""" +Syed Asad: +So that is with the localization only. 
So they want to gradually go into that place. So I want to bring here a few things. To summarize what I said, localization of llms is one of the concern right now is a big market. Second is quantization of models. -``` +Demetrios: +Oh, interesting. -Using the Deepseek API requires providing the API key. You can obtain it from the [DeepSeek platform](https://platform.deepseek.com/api_keys). +Syed Asad: +In quantization of models, whatever. So I perform scalar quantization and binary quantization, both using bits and bytes. I various other techniques also, but the bits and bytes was the best. Scalar quantization is performing better. Binary quantization, I mean the maximum compression or maximum lossy function is there, so it is not, it is, it is giving poor results. Scalar quantization is working very fine. It, it runs on CPU also. It gives you good results because whatever projects which we are having right now or even in the markets also, they are not having huge corpus of data right now, but they will eventually scale. -Now we can finally call the completion API. +Syed Asad: +So they want something right now so that quantization works. So quantization is one of the concerns. People want to dodge aws, they don't want to go to AWS, but it is there. They don't have any other way. So that is why they want aws. -```python -import requests -import json +Demetrios: +And is that because of costs lock in? -# Fill the environmental variable with your own Deepseek API key -# See: https://platform.deepseek.com/api_keys -API_KEY = "" +Syed Asad: +Yeah, cost is the main part. -HEADERS = { - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json", -} +Demetrios: +Yeah. They understand that things can get out of hand real quick if you're using AWS and you start using different services. I think it's also worth noting that when you're using different services on AWS, it may be a very similar service. But if you're using sagemaker endpoints on AWS, it's like a lot more expensive than just an EKS endpoint. -def query_deepseek(prompt): - data = { - "model": "deepseek-chat", - "messages": [{"role": "user", "content": prompt}], - "stream": False, - } +Syed Asad: +Minimum cost for a startup, for just the GPU, bare minimum is minimum. $450. Minimum. It's $450 even without just on the testing phases or the development phases, even when it has not gone into production. So that gives a dent to the client also. - response = requests.post( - "https://api.deepseek.com/chat/completions", headers=HEADERS, data=json.dumps(data) - ) +Demetrios: +Wow. Yeah. Yeah. So it's also, and this is even including trying to use like tranium or inferencia and all of that stuff. You know those services? - if response.ok: - result = response.json() - return result["choices"][0]["message"]["content"] - else: - raise Exception(f"Error {response.status_code}: {response.text}") +Syed Asad: +I know those services, but I've not readily tried those services. I'm right now in the process of trying salad also for inference, and they are very, very cheap right now. -``` +Demetrios: +Nice. Okay. Yeah, cool. So if you could wave your magic wand and have something be different when it comes to your work, your day in, day out, especially because you've been doing a lot of rags, a lot of different kinds of rags, a lot of different use cases with, with rags. Where do you think you would get the biggest uptick in your performance, your ability to just do what you need to do? How could rags be drastically changed? 
Is it something that you say, oh, the hallucinations. If we didn't have to deal with those, that would make my life so much easier. I didn't have to deal with prompts that would make my life infinitely easier. What are some things like where in five years do you want to see this field be? -and also the query +Syed Asad: +Yeah, you figured it right. The hallucination part is one of the concerns, or biggest concerns with the client when it comes to the rag, because what we see on LinkedIn and what we see on places, it gives you a picture that it, it controls hallucination, and it gives you answer that. I don't know anything about this, as mentioned in the context, but it does not really happen when you come to the production. It gives you information like you are developing a rag for a publishing company, and it is giving you. Where is, how is New York like, it gives you information on that also, even if you have control and everything. So that is one of the things which needs to be toned down. As far as the rag is concerned, pricing is the biggest concern right now, because there are very few players in the market as far as the inference is concerned, and they are just dominating the market with their own rates. So this is one of the pain points. -```python -query_deepseek(prompt) +Syed Asad: +And the. I'll also want to highlight the popular vector databases. There are many Pinecone weaviate, many things. So they are actually, the problem with many of the vector databases is that they work fine. They are scalable. This is common. The problem is that they are not easy to use. So that is why I always use Qdrant. -``` +Syed Asad: +Not because Qdrant is sponsoring me, not because I am doing a job with Qdrant, but Qdrant is having the ease of use. And it, I have, I have trained people in my team who specialize with Qdrant, and they were initially using Weaviate and Pinecone. I mean, you can do also store vectors in those databases, but it is not especially the, especially the latest development with Pine, sorry, with Qdrant is the fast embed, which they just now released. And it made my work a lot easier by using the ONNX approach rather than a Pytorch based approach, because there was one of the projects in which we were deploying embedding model on an AWS server and it was running continuously. And minimum utilization of ram is 6gb. Even when it is not doing any sort of vector embedding so fast. Embed has so Qdrant is playing a huge role, I should acknowledge them. And one more thing which I would not like to use is LAN chain. -The response is: +Syed Asad: +I have been using it. So. So I don't want to use that language because it is not, it did not serve any purpose for me, especially in the production. It serves purpose in the research phase. When you are releasing any notebook, say you have done this and does that. It is not. It does not works well in production, especially for me. Llama index works fine, works well. -```bash -"Building a web service that uses vector embeddings for search involves several components, including data processing, embedding generation, storage, search, and serving the service via an API. Below is a list of tools and technologies you can use for each step:\n\n---\n\n### 1. 
**Data Processing**\n - **Python**: For general data preprocessing and scripting.\n - **Pandas**: For handling tabular data.\n - **NumPy**: For numerical operations.\n - **NLTK/Spacy**: For text preprocessing (tokenization, stemming, etc.).\n - **LLM models**: For generating embeddings if you're using pre-trained models.\n\n---\n\n### 2. **Embedding Generation**\n - **Pre-trained Models**:\n - Embeddings (e.g., `text-embedding-ada-002`).\n - Hugging Face Transformers (e.g., `Sentence-BERT`, `all-MiniLM-L6-v2`).\n - Google's Universal Sentence Encoder.\n - **Custom Models**:\n - TensorFlow/PyTorch: For training custom embedding models.\n - **Libraries**:\n - `sentence-transformers`: For generating sentence embeddings.\n - `transformers`: For using Hugging Face models.\n\n---\n\n### 3. **Vector Storage**\n - **Vector Databases**:\n - Pinecone: Managed vector database for similarity search.\n - Weaviate: Open-source vector search engine.\n - Milvus: Open-source vector database.\n - FAISS (Facebook AI Similarity Search): Library for efficient similarity search.\n - Qdrant: Open-source vector search engine.\n - Redis with RedisAI: For storing and querying vectors.\n - **Traditional Databases with Vector Support**:\n - PostgreSQL with pgvector extension.\n - Elasticsearch with dense vector support.\n\n---\n\n### 4. **Search and Retrieval**\n - **Similarity Search Algorithms**:\n - Cosine similarity, Euclidean distance, or dot product for comparing vectors.\n - **Libraries**:\n - FAISS: For fast nearest-neighbor search.\n - Annoy (Approximate Nearest Neighbors Oh Yeah): For approximate nearest neighbor search.\n - **Vector Databases**: Most vector databases (e.g., Pinecone, Weaviate) come with built-in search capabilities.\n\n---\n\n### 5. **Web Service Framework**\n - **Backend Frameworks**:\n - Flask/Django/FastAPI (Python): For building RESTful APIs.\n - Node.js/Express: If you prefer JavaScript.\n - **API Documentation**:\n - Swagger/OpenAPI: For documenting your API.\n - **Authentication**:\n - OAuth2, JWT: For securing your API.\n\n---\n\n### 6. **Deployment**\n - **Containerization**:\n - Docker: For packaging your application.\n - **Orchestration**:\n - Kubernetes: For managing containers at scale.\n - **Cloud Platforms**:\n - AWS (EC2, Lambda, S3).\n - Google Cloud (Compute Engine, Cloud Functions).\n - Azure (App Service, Functions).\n - **Serverless**:\n - AWS Lambda, Google Cloud Functions, or Vercel for serverless deployment.\n\n---\n\n### 7. **Monitoring and Logging**\n - **Monitoring**:\n - Prometheus + Grafana: For monitoring performance.\n - **Logging**:\n - ELK Stack (Elasticsearch, Logstash, Kibana).\n - Fluentd.\n - **Error Tracking**:\n - Sentry.\n\n---\n\n### 8. **Frontend (Optional)**\n - **Frontend Frameworks**:\n - React, Vue.js, or Angular: For building a user interface.\n - **Libraries**:\n - Axios: For making API calls from the frontend.\n\n---\n\n### Example Workflow\n1. Preprocess your data (e.g., clean text, tokenize).\n2. Generate embeddings using a pre-trained model (e.g., Hugging Face).\n3. Store embeddings in a vector database (e.g., Pinecone or FAISS).\n4. Build a REST API using FastAPI or Flask to handle search queries.\n5. Deploy the service using Docker and Kubernetes or a serverless platform.\n6. 
Monitor and scale the service as needed.\n\n---\n\n### Example Tools Stack\n- **Embedding Generation**: Hugging Face `sentence-transformers`.\n- **Vector Storage**: Pinecone or FAISS.\n- **Web Framework**: FastAPI.\n- **Deployment**: Docker + AWS/GCP.\n\nBy combining these tools, you can build a scalable and efficient web service for vector embedding-based search." +Demetrios: +You haven't played around with anything else, have you? Like Haystack or. -``` +Syed Asad: +Yeah, haystack. Haystack. I have been playing out around, but haystack is lacking functionalities. It is working well. I would say it is working well, but it lacks some functionalities. They need to add more things as compared to Llama index. -### [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#extending-the-prompt) Extending the prompt +Demetrios: +And of course, the hottest one on the block right now is DSPY. Right? Have you messed around with that at all? -Even though the original answer sounds credible, it didn’t answer our question correctly. Instead, it gave us a generic description of an application stack. To improve the results, enriching the original prompt with the descriptions of the tools available seems like one of the possibilities. Let’s use a semantic knowledge base to augment the prompt with the descriptions of different technologies! +Syed Asad: +DSPy, actually DSPY. I have messed with DSPY. But the thing is that DSPY is right now, I have not experimented with that in the production thing, just in the research phase. -```python -results = client.query_points( - collection_name=collection_name, - query=models.Document(text=prompt, model=model_name), - limit=3, -) -results +Demetrios: +Yeah. -``` +Syed Asad: +So, and regarding the evaluation part, DeepEval, I heard you might have a DeepEval. So I've been using that. It is because one of the, one of the challenges is the testing for the AI. Also, what responses are large language model is generating the traditional testers or the manual tester software? They don't know, actually. So there's one more vertical which is waiting to be developed, is the testing for AI. It has a huge potential. And DeepEval, the LLM based approach on testing is very, is working fine and is open source also. -Here is the response: +Demetrios: +And that's the DeepEval I haven't heard. -```bash -QueryResponse(points=[\ - ScoredPoint(id=0, version=0, score=0.67437416, payload={'document': 'Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!'}, vector=None, shard_key=None, order_value=None),\ - ScoredPoint(id=6, version=0, score=0.63144326, payload={'document': 'SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. 
This can be useful for semantic textual similar, semantic search, or paraphrase mining.'}, vector=None, shard_key=None, order_value=None),\ - ScoredPoint(id=5, version=0, score=0.6064749, payload={'document': 'FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.'}, vector=None, shard_key=None, order_value=None)\ -]) +Syed Asad: +Let me just tell you the exact spelling. It is. Sorry. It is DeepEval. D E E P. Deep eval. I can. -``` +Demetrios: +Yeah. Okay. I know DeepEval. All right. Yeah, for sure. Okay. Hi. I for some reason was understanding D Eval. -We used the original prompt to perform a semantic search over the set of tool descriptions. Now we can use these descriptions to augment the prompt and create more context. +Syed Asad: +Yeah, actually I was pronouncing it wrong. -```python -context = "\n".join(r.payload['document'] for r in results.points) -context +Demetrios: +Nice. So these are some of your favorite, non favorite, and that's very good to know. It is awesome to hear about all of this. Is there anything else that you want to say before we jump off? Anything that you can, any wisdom you can impart on us for your rag systems and how you have learned the hard way? So tell us so we don't have to learn that way. -``` +Syed Asad: +Just go. Don't go with the marketing. Don't go with the marketing. Do your own research. Hugging face is a good, I mean, just fantastic. The leaderboard, although everything does not work in the leaderboard, also say, for example, I don't, I don't know about today and tomorrow, today and yesterday, but there was a model from Salesforce, the embedding model from Salesforce. It is still topping charts, I think, in the, on the MTEB. MTEB leaderboard for the embedding models. -The response is: +Syed Asad: +But you cannot use it in the production. It is way too huge to implement it. So what's the use? Mixed bread AI. The mixed bread AI, they are very light based, lightweight, and they, they are working fine. They're not even on the leaderboard. They were on the leaderboard, but they're right, they might not. When I saw they were ranking on around seven or eight on the leaderboard, MTEB leaderboard, but they were working fine. So even on the leaderboard thing, it does not works. -```bash -'Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!\nFastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.\nPyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing.' +Demetrios: +And right now it feels a little bit like, especially when it comes to embedding models, you just kind of go to the leaderboard and you close your eyes and then you pick one of them. Have you figured out a way to better test these or do you just find one and then try and use it everywhere? -``` +Syed Asad: +No, no, that is not the case. Actually what I do is that I need to find the first, the embedding model. Try to find the embedding model based on my use case. Like if it is an embedding model on a medical use case more. So I try to find that. 
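Shortlisting an embedding model by use case and footprint can be scripted rather than guessed. A small sketch using FastEmbed's model registry; the metadata field names vary between `fastembed` releases, so they are read defensively here.

```python
# List the embedding models FastEmbed ships with and sort them by on-disk size,
# so lightweight candidates can be shortlisted before any quality benchmarking.
from fastembed import TextEmbedding

def field(entry, name, default=None):
    # Registry entries are plain dicts in most releases; fall back to attributes.
    if isinstance(entry, dict):
        return entry.get(name, default)
    return getattr(entry, name, default)

models_info = TextEmbedding.list_supported_models()
by_size = sorted(models_info, key=lambda m: field(m, "size_in_GB") or float("inf"))
for entry in by_size[:5]:
    print(field(entry, "model"), field(entry, "dim"), field(entry, "size_in_GB"))
```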
But the second factor to filter that is, is the size of that embedding model. Because at the end, if I am doing the entire POC or an entire research with that embedding model, what? And it has happened to me that we did entire research with embedding models, large language models, and then we have to remove everything just on the production part and it just went in smoke. Everything. -Finally, let’s build a metaprompt, the combination of the assumed role of the LLM, the original question, and the results from our semantic search that will force our LLM to use the provided context. +Syed Asad: +So a lightweight embedding model, especially the one which, which has started working recently, is that the cohere embedding models, and they have given a facility to call those embedding models in a quantized format. So that is also working and fast. Embed is one of the things which is by Qdrant, these two things are working in the production. I'm talking in the production for research. You can do anything. -By doing this, we effectively convert the knowledge-oriented task into a language task and hopefully reduce the chances of hallucinations. It also should make the response sound more relevant. +Demetrios: +Brilliant, man. Well, this has been great. I really appreciate it. Asad, thank you for coming on here and for anybody else that would like to come on to the vector space talks, just let us know. In the meantime, don't get lost in vector space. We will see you all later. Have a great afternoon. Morning, evening, wherever you are. -```python -metaprompt = f""" -You are a software architect. -Answer the following question using the provided context. -If you can't find the answer, do not pretend you know it, but answer "I don't know". +Demetrios: +Asad, you taught me so much, bro. Thank you. -Question: {prompt.strip()} +<|page-359-lllmstxt|> +> *"It's very, very simple to build search over an Open API specification with a tool like Trieve and Qdrant. I think really there's something to highlight here and how awesome it is to work with a group based system if you're using Qdrant.”*\ +— Nick Khami +> -Context: -{context.strip()} +Nick Khami, a seasoned full-stack engineer, has been deeply involved in the development of vector search and RAG applications since the inception of Qdrant v0.11.0 back in October 2022. His expertise and passion for innovation led him to establish Trieve, a company dedicated to facilitating businesses in embracing cutting-edge vector search and RAG technologies. -Answer: -""" +***Listen to the episode on [Spotify](https://open.spotify.com/episode/1JtL167O2ygirKFVyieQfP?si=R2cN5LQrTR60i-JzEh_m0Q), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/roLpKNTeG5A?si=JkKI7yOFVOVEY4Qv).*** -# Look at the full metaprompt -print(metaprompt) + -``` + -**Response:** +## **Top takeaways:** -```bash -You are a software architect. -Answer the following question using the provided context. -If you can't find the answer, do not pretend you know it, but answer "I don't know". +Nick showcases Trieve and the advancements in the world of search technology, demonstrating with Qdrant how simple it is to construct precise search functionalities with open API specs for colorful sneaker discoveries, all while unpacking the potential of improved search experiences and analytics for diverse applications like apps for legislation. -Question: What tools should I need to use to build a web service using vector embeddings for search? 
+We're going deep into the mechanics of search and recommendation applications. Whether you're a developer or just an enthusiast, this episode is guaranteed in giving you insight into how to create a seamless search experience using the latest advancements in the industry. -Context: -Qdrant is a vector database & vector similarity search engine. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more! -FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints. -PyTorch is a machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing. +Here are five key takeaways from this episode: -Answer: +1. **Understand the Open API Spec**: Discover the magic behind Open API specifications and how they can serve your development needs especially when it comes to rest API routes. +2. **Simplify with Trieve and Qdrant**: Nick walks us through a real-world application using Trieve and Qdrant's group-based system, demonstrating how to effortlessly build search capabilities. +3. **Elevate Search Results**: Learn about the power of grouping and recommendations within Qdrant to fine-tune your search results, using the colorful world of sneakers as an example! +4. **Trieve's Infrastructure Made Easy**: Find out how taking advantage of Trieve can make creating datasets, obtaining API keys, and kicking off searches simpler than you ever imagined. +5. **Enhanced Vector Search with Tantivy**: If you're curious about alternative search engines, get the scoop on Tantivy, how it complements Qdrant, and its role within the ecosystem. -``` +> Fun Fact: Trieve was established in 2023 and the name is a play on the word "retrieve”. +> -Our current prompt is much longer, and we also used a couple of strategies to make the responses even better: +## Show notes: -1. The LLM has the role of software architect. -2. We provide more context to answer the question. -3. If the context contains no meaningful information, the model shouldn’t make up an answer. +00:00 Vector Space Talks intro to Nick Khami.\ +06:11 Qdrant system simplifies difficult building process.\ +07:09 Using Qdrant to organize and manage content.\ +11:43 Creating a group: search results may not group.\ +14:23 Searching with Qdrant: utilizing system routes.\ +17:00 Trieve wrapped up YC W24 batch.\ +21:45 Revolutionizing company search.\ +23:30 Next update: user tracking, analytics, and cross-encoders.\ +27:39 Quadruple supported sparse vectors.\ +30:09 Final questions and wrap up. -Let’s find out if that works as expected. +## More Quotes from Nick: -**Question:** +*"You can get this RAG, this search and the data upload done in a span of maybe 10-15 minutes, which is really cool and something that we were only really possible to build at Trieve, thanks to what the amazing team at Qdrant has been able to create.”*\ +— Nick Khami -```python -query_deepseek(metaprompt) +*"Qdrant also offers recommendations for groups, so like, which is really cool... Not only can you search groups, you can also recommend groups, which is, I think, awesome. 
But yeah, you can upload all your data, you go to the search UI, you can search it, you can test out how recommendations are working [and] in a lot of cases too, you can fix problems in your search.”*\ +— Nick Khami -``` +*"Typically when you do recommendations, you take the results that you want to base recommendations off of and you build like an average vector that you then use to search. Qdrant offers a more evolved recommendation pattern now where you can traverse the graph looking at the positive point similarity, then also the negative similarity.”*\ +— Nick Khami -**Answer:** +## Transcript: +Demetrios: +What is happening? Everyone? Welcome back to another edition of the Vector Space Talks. I am super excited to be here with you today. As always, we've got a very special guest. We've got Nick, the founder and engineer, founder slash engineer of Trieve. And as you know, we like to start these sessions off with a little recommendations of what you can hopefully be doing to make life better. And so when Sabrina's here, I will kick it over to her and ask her for her latest recommendation of what she's been doing. But she's traveling right now, so I'm just going to give you mine on some things that I've been listening to and I have been enjoying. For those who want some nice music, I would recommend an oldie, but a goodie. -```bash -'To build a web service using vector embeddings for search, you can use the following tools:\n\n1. **Qdrant**: As a vector database and similarity search engine, Qdrant will handle the storage and retrieval of high-dimensional vectors. It provides an API service for searching and matching vectors, making it ideal for applications that require vector-based search functionality.\n\n2. **FastAPI**: This web framework is perfect for building the API layer of your web service. It is fast, easy to use, and based on Python type hints, which makes it a great choice for developing the backend of your service. FastAPI will allow you to expose endpoints that interact with Qdrant for vector search operations.\n\n3. **PyTorch**: If you need to generate vector embeddings from your data (e.g., text, images), PyTorch can be used to create and train neural network models that produce these embeddings. PyTorch is a powerful machine learning framework that supports a wide range of applications, including natural language processing and computer vision.\n\n### Summary:\n- **Qdrant** for vector storage and search.\n- **FastAPI** for building the web service API.\n- **PyTorch** for generating vector embeddings (if needed).\n\nThese tools together provide a robust stack for building a web service that leverages vector embeddings for search functionality.' +Demetrios: +It is from the incredible band that is not coming to me right now, but it's called this must be the place from the. Actually, it's from the Talking Heads. Definitely recommend that one as a fun way to get the day started. We will throw a link to that music in the chat, but we're not going to be just talking about good music recommendations. Today we are going to get Nick on the stage to talk all about search and rags. And Nick is in a very interesting position because he's been using vector search from Qdrant since 2022. Let's bring this man on the stage and see what he's got to say. What's up, dude? -``` +Nick Khami: +Hey. -### [Anchor](https://qdrant.tech/documentation/rag-deepseek/\#testing-out-the-rag-pipeline) Testing out the RAG pipeline +Demetrios: +Hey. 
-By leveraging the semantic context we provided our model is doing a better job answering the question. Let’s enclose the RAG as a function, so we can call it more easily for different prompts. +Nick Khami: +Nice to meet you. -```python -def rag(question: str, n_points: int = 3) -> str: - results = client.query_points( - collection_name=collection_name, - query=models.Document(text=question, model=model_name), - limit=n_points, - ) +Demetrios: +How you doing? - context = "\n".join(r.payload["document"] for r in results.points) +Nick Khami: +Doing great. - metaprompt = f""" - You are a software architect. - Answer the following question using the provided context. - If you can't find the answer, do not pretend you know it, but only answer "I don't know". +Demetrios: +Well, it's great to have you. - Question: {question.strip()} +Nick Khami: +Yeah, yeah. Nice sunny day. It looks like it's going to be here in San Francisco, which is good. It was raining like all of January, but finally got some good sunny days going, which is awesome. - Context: - {context.strip()} +Demetrios: +Well, it is awesome that you are waking up early for us and you're doing this. I appreciate it coming all the way from San Francisco and talking to us today all about search and recommender system. Sorry, rag apps. I just have in my mind, whenever I say search, I automatically connect recommender because it is kind of similar, but not in this case. You're going to be talking about search and rag apps and specifically around the Open API spec. I know you've got a talk set up for. For us. Do you want to kick it off? And then I'll be monitoring the chat. - Answer: - """ +Demetrios: +So if anybody has any questions, throw it in the chat and I'll pop up on screen again and ask away. - return query_deepseek(metaprompt) +Nick Khami: +Yeah, yeah, I'd love to. I'll go ahead and get this show on the road. Okay. So I guess the first thing I'll talk about is what exactly an Open API spec is. This is Qdrants open API spec. I feel like it's a good topical example for vector space talk. You can see here, Qdrant offers a bunch of different rest API routes on their API. Each one of these exists within this big JSON file called the Open API specification. -``` +Nick Khami: +There's a lot of projects that have an Open API specification. Stripe has one, I think sentry has one. It's kind of like a de facto way of documenting your API. -Now it’s easier to ask a broad range of questions. +Demetrios: +Can you make your screen just a little or the font just a little bit bigger? Maybe zoom in? -**Question:** +Nick Khami: +I think I can, yeah. -```python -rag("What can the stack for a web api look like?") +Demetrios: +All right, awesome. So that my eyesight is not there. Oh, that is brilliant. That is awesome. -``` +Nick Khami: +Okay, we doing good here? All right, awesome. Yeah. Hopefully this is more readable for everyone, but yeah. So this is an open API specification. If you look at it inside of a JSON file, it looks a little bit like this. And if you go to the top, I can show the structure. There's a list or there's an object called paths that contains all the different API paths for the API. And then there's another object called security, which explains the authentication scheme. -**Answer:** +Nick Khami: +And you have a nice info section I'm going to ignore, kind of like these two, they're not all that important. And then you have this list of like tags, which is really cool because this is kind of how things get organized. 
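For readers who want to poke at that structure themselves, a few lines of Python are enough to list the tags and paths of any OpenAPI 3.x document. The snippet below is only an illustration (the file name is made up) and is not part of the demo.

```python
# Minimal sketch (not from the talk): list the tags and paths of an OpenAPI spec.
# "openapi.json" is an illustrative file name; any OpenAPI 3.x JSON document
# has the same top-level shape: info, paths, security, tags.
import json

with open("openapi.json", encoding="utf-8") as f:
    spec = json.load(f)

print([tag["name"] for tag in spec.get("tags", [])])  # the tag list

for route, methods in spec.get("paths", {}).items():  # the paths object
    for method, details in methods.items():
        if method in {"get", "post", "put", "patch", "delete"}:
            print(method.upper(), route, details.get("tags"), details.get("summary"))
```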
If we go back, you can see these kind of exist as tags. So these items here will be your tags in the Open API specification. One thing that's kind of like interesting is it would be cool if it was relatively trivial to build search over an OpenAPI specification, because if you don't know what you're looking for, then this search bar does not always work great. For example, if you type in search within groups. Oh, this one actually works pretty good. Wow, this seems like an enhanced Open API specification search bar. -```bash -'The stack for a web API can include the following components based on the provided context:\n\n1. **Web Framework**: FastAPI can be used as the web framework for building the API. It is modern, fast, and leverages Python type hints for better development and performance.\n\n2. **Reverse Proxy/Web Server**: NGINX can be used as a reverse proxy or web server to handle incoming HTTP requests, load balancing, and serving static content. It is known for its high performance and low resource consumption.\n\n3. **Containerization**: Docker can be used to containerize the application, making it easier to build, share, and run the API consistently across different environments without worrying about configuration issues.\n\nThis stack provides a robust, scalable, and efficient setup for building and deploying a web API.' +Nick Khami: +I should have made sure that I checked it before going. So this is quite good. Our search bar for tree in example, does not actually, oh, it does have the same search, but I was really interested in, I guess, explaining how you could enhance this or hook it up to vector search in order to do rag audit. It's what I want to highlight here. Qdrant has a really interesting feature called groups. You can search over a group of points at one time and kind of return results in a group oriented way instead of only searching for a singular route. And for an Open API specification, that's very interesting. Because it means that you can search for a tag while looking at each tag's individual paths. -``` +Nick Khami: +It is like a, it's something that's very difficult to build without a system like Qdrant and kind of like one of the primary, I think, feature offerings of it compared to PG vector or maybe like brute force with face or yousearch or something. And the goal that I kind of had was to figure out which endpoint was going to be most relevant for what I was trying to do. In a lot of cases with particularly Qdrants, Open API spec in this example. To go about doing that, I used a scripting runtime for JavaScript called Bun. I'm a big fan of it. It tends to work quite well. It's very performant and kind of easy to work with. I start off here by loading up the Qdrant Open API spec from JSON and then I import some things that exist inside of tree. -**Question:** +Nick Khami: +Trieve uses Qdrant under the hood to offer a lot of its features, and that's kind of how I'm going to go about doing this here. So I import some stuff from the tree SDK client package, instantiate a couple of environment variables, set up my configuration for the tree API, and now this is where it gets interesting. I pull the tags from the Qdrant Open API JSON specification, which is this array here, and then I iterate over each tag and I check if I've already created the group. If I have, then I do nothing. But if I have it, then I go ahead and I create a group. 
For each tag, I'm creating these groups so that way I can insert each path into its relevant groups whenever I create them as individual points. Okay, so I finished creating all of the groups, and now for like the next part, I iterate over the paths, which are the individual API routes. For each path I pull the tags that it has, the summary, the description and the API method. -```python -rag("Where is the nearest grocery store?") +Nick Khami: +So post, get put, delete, et cetera, and I then create the point. In Trieve world, we call each point a chunk, kind of using I guess like rag terminology. For each individual path I create the chunk and by including its tags in this group tracking ids request body key, it will automatically get added to its relevant groups. I have some try catches here, but that's really the whole script. It's very, very simple to build search over an Open API specification with a tool like Trieve and Qdrant. I think really there's something to highlight here and how awesome it is to work with a group based system. If you're using Qdrant. If you can think about an e commerce store, sometimes you have multiple colorways of an item. -``` +Nick Khami: +You'll have a red version of the sneaker, a white version, a blue version, et cetera. And when someone performs a search, you not only want to find the relevant shoe, you want to find the relevant colorway of that shoe. And groups allow you to do this within Qdrant because you can place each colorway as an individual point. Or again, in tree world, chunk into a given group, and then when someone searches, they're going to get the relevant colorway at the top of the given group. It's really nice, really cool. You can see running this is very simple. If I want to update the entire data set by running this again, I can, and this is just going to go ahead and create all the relevant chunks for every route that Qdrant offers. If you guys who are watching or interested in replicating this experiment, I created an open source GitHub repo. -**Answer:** +Nick Khami: +We're going to zoom in here that you can reference@GitHub.com/devflowinc/OpenAPI/search. You can follow the instructions in the readme to replicate the whole thing. Okay, but I uploaded all the data. Let's see how this works from a UI perspective. Yeah. Trieve bundles in a really nice UI for searching after you add all of your data. So if I go home here, you can see that I'm using the Qdrant Open API spec dataset. And the organization here is like the email I use. -```bash -"I don't know. The provided context does not contain any information about the location of the nearest grocery store." +Nick Khami: +Nick.K@OpenAPI one of the nice things about Trieve, kind of like me on just the simplicity of adding data is we use Qdrant's multi tenancy feature to offer the ability to have multiple datasets within a given organization. So you can have, I have the Open API organization. You can create additional datasets with different embedding models to test with and experiment when it comes to your search. Okay. But not going to go through all those features today, I kind of want to highlight this Open API search that we just finished building. So I guess to compare and contrast, I'm going to use the exact same query that I used before, also going to zoom in. Okay. -``` +Nick Khami: +And that one would be like what we just did, right? So how do I maybe, how do I create a group? This isn't a Gen AI rag search. This is just a generic, this is just a generic search. 
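The demo drives all of this through the Trieve SDK, but the underlying pattern maps onto Qdrant directly: each path becomes a point whose payload carries its tag, and that tag is what grouped search keys on later. The sketch below is a rough Python illustration of that idea, not Trieve's API; the collection name and sample rows are made up, and it assumes a recent qdrant-client with fastembed installed so that `models.Document` is embedded client-side.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")      # assumed local instance
model_name = "sentence-transformers/all-MiniLM-L6-v2"   # any fastembed-supported model

if not client.collection_exists("openapi-paths"):        # hypothetical collection name
    client.create_collection(
        collection_name="openapi-paths",
        vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    )

paths = [
    # (id, text to embed, OpenAPI tag) -- purely illustrative rows
    (1, "Create a new collection", "Collections"),
    (2, "Search points, grouped by a payload field", "Points"),
]

client.upsert(
    collection_name="openapi-paths",
    points=[
        models.PointStruct(
            id=point_id,
            vector=models.Document(text=text, model=model_name),  # embedded client-side
            payload={"summary": text, "tag": tag},                 # the tag is the group key
        )
        for point_id, text, tag in paths
    ],
)
```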
Okay, so for how do I create a group? We're going to get all these top level results. In this case, we're not doing a group oriented search. We're just returning relevant chunks. Sometimes, or a lot of times I think that people will want to have a more group oriented search where the results are grouped by tag. So here I'm going to see that the most relevant endpoint or the most relevant tag within Qdrant's Open API spec is in theory collections, and within collections it thinks that these are the top three routes that are relevant. Recommend point groups discover bash points recommend bash points none of these are quite what I wanted, which is how do I create a group? But it's okay for cluster, you can see create shard key delete. -Our model can now: +Nick Khami: +So for cluster, this is kind of interesting. It thinks cluster is relevant, likely because a cluster is a kind of group and it matches to a large extent on the query. Then we also have points which it keys in on the shard system and the snapshotting system. When the next version gets released, we'll have rolling snapshots in Qdrant, which is very exciting. If anyone else is excited about that feature. I certainly am. Then it pulls the metrics. For another thing that might be a little bit easier for the search to work on. -1. Take advantage of the knowledge in our vector datastore. -2. Answer, based on the provided context, that it can not provide an answer. +Nick Khami: +You can type in how do I search points via group? And now it kind of is going to key in on what I would say is a better result. And you can see here we have very nice sub sentence highlighting on the request. It's bolding the sentence of the response that it thinks is the most relevant, which in this case are the second two paragraphs. Yep, the description and summary of what the request does. Another convenient thing about tree is in our default search UI, you can include links out to your resources. If I click this link, I'm going to immediately get to the correct place within the Qdrant redox specification. That's the entire search experience. For the Jedi side of this, I did a lot less optimization, but we can experiment and see how it goes. -We have just shown a useful mechanism to mitigate the risks of hallucinations in Large Language Models. +Nick Khami: +I'm going to zoom in again, guys. Okay, so let's say I want to make a new rag chat and I'm going to ask here, how would I search over points in a group oriented way with Qdrant? And it's going to go ahead and do a search query for me on my behalf again, powered by the wonder of Qdrant. And once it does this search query, I'm able to get citations and and see what the model thinks. The model is a pretty good job with the first response, and it says that to search for points and group oriented wave Qdrant, I can utilize the routes and endpoints provided by the system and the ones that I'm going to want to use first is points search groups. If I click doc one here and I look at the route, this is actually correct. Conveniently, you're able to open the link in the. Oh, well, okay, this env is wrong, but conveniently what this is supposed to do, if I paste it and fix the incorrect portion of the system. Changing chat to search is you can load the individual chunk of the search UI and read it here, and then you can update it to include document expansion, change the actual copy of what was indexed out, et cetera. -##### Was this page useful? 
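For reference, the group-oriented search being shown corresponds to Qdrant's grouping API. A minimal sketch, reusing the hypothetical `openapi-paths` collection from the previous sketch: `group_by` names the payload field that holds the tag, so each tag comes back once with its best-matching paths nested inside it.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")      # assumed local instance
model_name = "sentence-transformers/all-MiniLM-L6-v2"   # same model used at indexing time

groups = client.query_points_groups(
    collection_name="openapi-paths",                     # hypothetical collection
    query=models.Document(text="how do I search points via group", model=model_name),
    group_by="tag",      # payload field to group on
    limit=3,             # number of groups (tags) returned
    group_size=3,        # best hits kept per group
)

for group in groups.groups:
    print(group.id)                                       # the tag, e.g. "Points"
    for hit in group.hits:
        print("   ", hit.score, hit.payload["summary"])
```

Here `limit` controls how many tags come back, while `group_size` caps the hits kept within each tag.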
+Nick Khami: +It's like a really convenient way to merchandise and enhance your data set without having to write a lot of code. Yeah, and it'll continue writing its answer. I'm not going to go through the whole thing, but this really encapsulates what I wanted to show. This is incredibly simple to do. You can get this RAG, this search and the data upload done in a span of maybe 10-15 minutes, which is really cool and something that we were only really possible to build at Trieve, thanks to what the amazing team at Qdrant has been able to create. And yeah, guys, hopefully that was cool. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Demetrios: +Excellent. So I've got some questions. Woo the infinite spinning field. So I want to know about Trieve and I want to jump into what you all are doing there. And then I want to jump in a little bit about the evolution that you've seen of Qdrant over the years, because you've been using it for a while. But first, can we get a bit of an idea on what you're doing and how you're dedicating yourself to creating what you're creating? -Thank you for your feedback! 🙏 +Nick Khami: +Yeah. At Trieve, we just wrapped up the Y Combinator W 24 batch and our fundogram, which is like cool. It took us like a year. So Dens and I started Trieve in January of 2023, and we kind of kept building and building and building, and in the process, we started out trying to build an app for you to have like AI powered arguments at work. It wasn't the best of ideas. That's kind of why we started using Qdrant originally in the process of building that, we thought it was really hard to get the amazing next gen search that products like Qdrant offer, because for a typical team, they have to run a Docker compose file on the local machine, add the Qdrant service, that docker compose docker compose up D stand up Qdrant, set an env, download the Qdrant SDK. All these things get very, very difficult after you index all of your data, you then have to create a UI to view it, because if you don't do that. It can be very hard to judge performance. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/rag-deepseek.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Nick Khami: +I mean, you can always make these benchmarks, but search and recommendations are kind of like a heuristic thing. It's like you can always have a benchmark, but the data is dynamic, it changes and you really like. In what we were experiencing at the time, we really needed a way to quickly gauge the system was doing. We gave up on our rag AI application argumentation app and pivoted to trying to build infrastructure for other people to benefit from the high quality search that is offered by splayed for sparse, or like sparse encode. I mean, elastics, LSR models, really cool. There's all the dense embedding vector models and we wanted to offer a managed suite of infrastructure for building on this kind of stuff. That's kind of what tree is. So like, with tree you go to. -On this page: +Nick Khami: +It's more of like a managed experience. You go to the dashboard, you make an account, you create the data set, you get an API key and the data set id, you go to your little script and mine for the Open API specs, 80 lines, you add all your data and then boom, bam, bing bop. 
You can just start searching and you can. We offer recommendations as well. Maybe I should have shown those in my demo, like, you can open an individual path and get recommendations for similar. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/rag-deepseek.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Demetrios: +There were recommendations, so I wasn't too far off the mark. See, search and recommendation, they just, they occupy the same spot in my head. -× +Nick Khami: +And Qdrant also offers recommendations for groups, guys. So like, which is really cool. Like you can, you can, like, not only can you search groups, you can also recommend groups, which is, I think, awesome. But yeah, you can upload all your data, you go to the search UI, you can search it, you can test out how recommendations are working in a lot of cases too. You can fix problems in your search. A good example of this is we built search for Y comb later companies so they could make it a lot better. Algolia is on an older search algorithm that doesn't offer semantic capabilities. And that means that you go to the Y combinator search companies bar, you type in which company offers short term rentals and you don't get Airbnb. -[Powered by](https://qdrant.tech/) +Nick Khami: +But with like Trieve it is. It is. But with tree, like, the magic of it is that even, believe it or not, there's a bunch of YC companies to do short term rentals and Airbnb does not appear first naturally. So with tree like, we offer a merchandising UI where you put that query in, you see Airbnb ranks a little bit lower than you want. You can immediately adjust the text that you indexed and even add like a re ranking weight so that appears higher in results. Do it again and it works. And you can also experiment and play with the rag. I think rag is kind of a third class citizen in our API. -![Company Logo](https://cdn.cookielaw.org/logos/static/ot_company_logo.png) +Nick Khami: +It turns out search recommendations are a lot more popular with our customers and users. But yeah, like tree, I would say like to encapsulate it. Trieve is an all in one infrastructure suite for teams building search recommendations in Rag. And we bundle the power of databases like Qdrant and next gen search ML AI models with uis for fine tuning ranking of results. -## Privacy Preference Center +Demetrios: +Dude, the reason I love this is because you can do so much with like well done search that is so valuable for so many companies and it's overlooked as like a solved problem, I think, for a lot of people, but it's not, and it's not that easy as you just explained. -Cookies used on the site are categorized, and below, you can read about each category and allow or deny some or all of them. When categories that have been previously allowed are disabled, all cookies assigned to that category will be removed from your browser. -Additionally, you can see a list of cookies assigned to each category and detailed information in the cookie declaration. +Nick Khami: +Yeah, I mean, like we're fired up about it. I mean, like, even if you guys go to like YC.Trieve.AI, that's like the Y combinator company search and you can a b test it against like the older style of search that Algolia offers or like elasticsearch offers. And like, it's, to me it's magical. It's like it's an absolute like work of human ingenuity and amazingness that you can type in, which company should I get an airbed at? 
And it finds Airbnb despite like none of the keywords matching up. And I'm afraid right now our brains are trained to go to Google. And on Google search bar you can ask a question, you can type in abstract ideas and concepts and it works. But anytime we go to an e commerce search bar or oh, they're so. +Demetrios: +Bad, they're so bad. Everybody's had that experience too, where I don't even search. Like, I just am like, well, all right, or I'll go to Google and search specifically on Google for that website, you know, and like put in parentheses. -[More information](https://qdrant.tech/legal/privacy-policy/#cookies-and-web-beacons) +Nick Khami: +We'Re just excited about that. Like we want to, we're trying to make it a lot like the goal of tree is to make it a lot easier to power these search experiences, the latest gentech, and help fix this problem. Like, especially if AI continues to get better, people are going to become more and more used to like things working and not having to hack around, faceting and filtering for it to work. And yeah, we're just excited to make that easier for companies to work on and build. -Allow All +Demetrios: +So there's one question coming through in the chat asking where we can get actual search metrics. -### Manage Consent Preferences +Nick Khami: +Yeah, so that's like the next thing that we're planning to add. Basically, like right now at tree, we don't track your users as queries. The next thing that we're like building at tree is a system for doing that. You're going to be able to analyze all of the searches that have been used on your data set within that search merchandising UI, or maybe a new UI, and adjust your rankings spot fix things the same way you can now, but with the power of the analytics. The other thing we're going to be offering soon is dynamically tunable cross encoders. Cross encoders are this magic neural net that can zip together full text and semantic results into a new ranked order. And they're underutilized, but they're also hard to adjust over time. We're going to be offering API endpoints for uploading, for doing your click through rates on the search results, and then dynamically on a batched timer tuning across encoder to adjust ranking. -#### Targeting Cookies +Nick Khami: +This should be coming out in the next two to three weeks. But yeah, we're just now getting to the analytics hurdle. We also just got to the speed hurdle. So things are fast now. As you guys hopefully saw in the demo, it's sub 50 milliseconds for most queries. P 95 is like 80 milliseconds, which is pretty cool thanks to Qdrant, by the way. Nice Qdrant is huge, I mean for powering all of that. But yeah, analytics will be coming next two or three weeks. -Targeting Cookies +Nick Khami: +We're excited about it. -These cookies may be set through our site by our advertising partners. They may be used by those companies to build a profile of your interests and show you relevant adverts on other sites. They do not store directly personal information, but are based on uniquely identifying your browser and internet device. If you do not allow these cookies, you will experience less targeted advertising. +Demetrios: +So there's another question coming through in the chat and they're asking, I wonder if llms can suggest graph QL queries based on schema as it's not so tied to endpoints. -#### Functional Cookies +Nick Khami: +I think they could in the system that we built for this case, I didn't actually use the response body. 
If you guys go to devflowinc Open API search on GitHub, you guys can make your own example where you fix that. In the response query of the Open API JSON spec, you have the structure. If you embed that inside of the chunk as another paragraph tag and then go back to doing rag, it probably can do that. I see no reason why I wouldn't be able to. -Functional Cookies +Demetrios: +I just dropped the link in the chat for anybody that is interested. And now let's talk a little bit for these next couple minutes about the journey of using Qdrant. You said you've been using it since 2022. Things have evolved a ton with the product over these years. Like, what have you seen what's been the most value add that you've had since starting? -These cookies enable the website to provide enhanced functionality and personalisation. They may be set by us or by third party providers whose services we have added to our pages. If you do not allow these cookies then some or all of these services may not function properly. +Nick Khami: +I mean, there's so many, like, okay, the one that I have highlighted in my head that I wanted to talk about was, I remember in May of 2023, there's a GitHub issue with an Algora bounty for API keys. I remember Dens and I, we'd already been using it for a while and we knew there was no API key thing. There's no API key for it. We were always joking about it. We were like, oh, we're so early. There's not even an API key for our database. You had to have access permissions in your VPC or sub routing to have it work securely. And I'm not sure it's like the highest. -#### Strictly Necessary Cookies +Nick Khami: +I'll talk about some other things where higher value add, but I just remember, like, how cool that was. Yeah, yeah, yeah. -Always Active +Demetrios: +State of the nation. When you found out about it and. -These cookies are necessary for the website to function and cannot be switched off in our systems. They are usually only set in response to actions made by you which amount to a request for services, such as setting your privacy preferences, logging in or filling in forms. You can set your browser to block or alert you about these cookies, but some parts of the site will not then work. These cookies do not store any personally identifiable information. +Nick Khami: +It was so hyped, like, the API key had added, we were like, wow, this is awesome. It was kind of like a simple thing, but like, for us it was like, oh, whoa, this is. We're so much more comfortable in security now. But dude, Qdrant added so many cool things. Like a couple of things that I think I'd probably highlight are the group system. That was really awesome when that got added. I mean, I think it's one of my favorite features. Then after that, the sparse vector support and a recent version was huge. -#### Performance Cookies +Nick Khami: +We had a whole crazy subsystem with Tantivy. If anyone watching knows the crate Tantivy, it's like a full text. Uh, it's like a leucine alternative written in rust. Um, and we like, built this whole crazy subsystem and then quad fit, like, supported the sparse vectors and we were like, oh my God, we should have probably like, worked with them on the sparse vector thing we didn't even know you guys wanted to do, uh, because like, we spent all this time building it and probably could have like, helped out that PR. We felt bad, um, because that was really nice. When that got added, the performance fixes for that were also really cool. 
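For context, the native sparse vector support mentioned here (the feature that replaced the Tantivy-based subsystem) looks roughly like the sketch below. The collection name, the sparse vector name, and the index/value pairs are all illustrative; in practice the sparse vectors would come out of something like SPLADE or BM25-style term weighting.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # assumed local instance

client.create_collection(
    collection_name="docs-sparse",                   # hypothetical collection
    vectors_config={},                                # no dense vectors in this sketch
    sparse_vectors_config={"text": models.SparseVectorParams()},
)

client.upsert(
    collection_name="docs-sparse",
    points=[
        models.PointStruct(
            id=1,
            # indices/values are made-up term ids and weights
            vector={"text": models.SparseVector(indices=[102, 18, 631], values=[0.7, 0.3, 1.2])},
            payload={"body": "example document"},
        )
    ],
)

hits = client.query_points(
    collection_name="docs-sparse",
    query=models.SparseVector(indices=[102, 631], values=[0.9, 1.0]),
    using="text",                                     # name of the sparse vector
    limit=5,
)
```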
Some of the other things that, like, Qdrant added while we've been using it that were really awesome. Oh, the multiple recommendation modes, I think I forget what they're both called, but there's, it's also like insane for people, like, out there watching, like, try Qdrant for sure, it's so, so, so good compared to like a lot of what you can do in a PG vector. -Performance Cookies +Nick Khami: +There's like, this recommendation feature is awesome. Typically when you do recommendations, you take the results that you want to base recommendations off of and you build like an average vector that you then use to search. Qdrant offers a more evolved recommendation pattern now where you can traverse the graph looking at the positive point similarity, then also the negative similarity. And if the similarity of the negative points is higher than that of the positive points, it'll ignore that edge recommendations. And for us at least, like with our customers, this improved their quality of recommendations a lot when they use negative samples. And we didn't even find out about that. It was in the version release notes and we didn't think about it. And like a month or two later we had a customer that was like communicating that they wanted higher quality recommendations. -These cookies allow us to count visits and traffic sources so we can measure and improve the performance of our site. They help us to know which pages are the most and least popular and see how visitors move around the site. All information these cookies collect is aggregated and therefore anonymous. If you do not allow these cookies we will not know when you have visited our site, and will not be able to monitor its performance. +Nick Khami: +And we were like, okay, what is like, are we using all the features available? And we weren't. That was cool. -Back Button +Demetrios: +The fact that you understand that now and you were able to communicate it back to me almost like better than I communicate it to people is really cool. And it shows that you've been in the weeds on it and you have seen a strong use case for it, because sometimes it's like, okay, this is out there. It needs to be communicated in the best use case so that people can understand it. And it seems like with that e commerce use case, it really stuck. -### Cookie List +Nick Khami: +This one was actually for a company that does search over american legislation called funny enough, we want more e commerce customers for retrieve. Most of our customers right now are like SaaS applications. This particular customer, I don't think they'd mind me shouting them out. It's called Bill Track 50. If you guys want to like search over us legislation, try them out. They're very, very good. And yeah, they were the team that really used it. But yeah, it's another cool thing, I think, about infrastructure like Qdrant in general, and it's so, so powerful that like a lot of times it can be worth like getting an implementation partner. -Search Icon +Nick Khami: +Like, even if you're gonna, if you're gonna use Qdrant, like, the team at Qdrant is very helpful and you should consider reaching out to them because they can probably help anyone who's going to build search recommendations to figure out what is offered and what can help on a high level, not so much a GitHub issue code level, but at a high level. Thinking about your use case. 
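To make the recommendation pattern described a few paragraphs above concrete, here is a small sketch of a recommendation query with positive and negative examples and the best-score strategy; the collection name and point ids are made up. Instead of averaging the examples into a single query vector, each candidate is scored against every example, and a candidate that is closer to a negative than to any positive gets pushed down the ranking.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # assumed local instance

results = client.query_points(
    collection_name="products",                      # hypothetical collection
    query=models.RecommendQuery(
        recommend=models.RecommendInput(
            positive=[17, 42],                        # ids of items the user engaged with
            negative=[101],                           # ids of items the user rejected
            strategy=models.RecommendStrategy.BEST_SCORE,
        )
    ),
    limit=10,
)

for hit in results.points:
    print(hit.id, hit.score)
```

The same query object can also be handed to `query_points_groups` together with a `group_by` field, which should give the group-level recommendations mentioned earlier in the conversation.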
Again, search is such a heuristic problem and so human in a way that it's always worth talking through your solution with people it that are very familiar with search recommendations in general. -Filter Icon +Demetrios: +Yeah. And they know the best features and the best tool to use that is going to get you that outcome you're looking for. So. All right, Nick, last question for you. It is about Trieve. I have my theory on why you call it that. Is it retrieve? You just took off the Re-? -Clear +Nick Khami: +Yes. Drop the read. It's cleaner. That's like the Facebook quote, but for Trieve. -checkbox labellabel +Demetrios: +I was thinking when I first read it, I was like, it must be some french word I'm not privy to. And so it's cool because it's french. You just got to put like an accent over one of these e's or both of them, and then it's even cooler. It's like luxury brand to the max. So I appreciate you coming on here. I appreciate you walking us through this and talking about it, man. This was awesome. -ApplyCancel +Nick Khami: +Yeah, thanks for having me on. I appreciate it. -ConsentLeg.Interest +Demetrios: +All right. For anybody else that is out there and wants to come on the vector space talks, come join us. You know where to find us. As always, later. -checkbox labellabel +<|page-360-lllmstxt|> +# Exploring Gen AI and Vector Search: Insights from Iveta Lohovska -checkbox labellabel +> *"In the generative AI context of AI, all foundational models have been trained on some foundational data sets that are distributed in different ways. Some are very conversational, some are very technical, some are on, let's say very strict taxonomy like healthcare or chemical structures. We call them modalities, and they have different representations.”*\ +— Iveta Lohovska +> -checkbox labellabel +Iveta Lohovska serves as the Chief Technologist and Principal Data Scientist for AI and Supercomputing at [Hewlett Packard Enterprise (HPE)](https://www.hpe.com/us/en/home.html), where she champions the democratization of decision intelligence and the development of ethical AI solutions. An industry leader, her multifaceted expertise encompasses natural language processing, computer vision, and data mining. Committed to leveraging technology for societal benefit, Iveta is a distinguished technical advisor to the United Nations' AI for Good program and a Data Science lecturer at the Vienna University of Applied Sciences. Her career also includes impactful roles with the World Bank Group, focusing on open data initiatives and Sustainable Development Goals (SDGs), as well as collaborations with USAID and the Gates Foundation. -Reject AllConfirm My Choices +***Listen to the episode on [Spotify](https://open.spotify.com/episode/7f1RDwp5l2Ps9N7gKubl8S?si=kCSX4HGCR12-5emokZbRfw), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/RsRAUO-fNaA).*** -[![Powered by Onetrust](https://cdn.cookielaw.org/logos/static/powered_by_logo.svg)](https://www.onetrust.com/products/cookie-consent/) + -<|page-181-lllmstxt|> -## filtering -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Filtering + -# [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#filtering) Filtering +## **Top takeaways:** -With Qdrant, you can set conditions when searching or retrieving points. 
-For example, you can impose conditions on both the [payload](https://qdrant.tech/documentation/concepts/payload/) and the `id` of the point. +In our continuous pursuit of knowledge and understanding, especially in the evolving landscape of AI and the vector space, we brought another great Vector Space Talk episode featuring Iveta Lohovska as she talks about generative AI and [vector search](https://qdrant.tech/). -Setting additional conditions is important when it is impossible to express all the features of the object in the embedding. -Examples include a variety of business requirements: stock availability, user location, or desired price range. +Iveta brings valuable insights from her work with the World Bank and as Chief Technologist at HPE, explaining the ins and outs of ethical AI implementation. -## [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#related-content) Related Content +Here are the episode highlights: +- Exploring the critical role of trustworthiness and explainability in AI, especially within high confidentiality use cases like government and security agencies. +- Discussing the importance of transparency in AI models and how it impacts the handling of data and understanding the foundational datasets for vector search. +- Iveta shares her experiences implementing generative AI in high-stakes environments, including the energy sector and policy-making, emphasizing accuracy and source credibility. +- Strategies for managing data privacy in high-stakes sectors, the superiority of on-premises solutions for control, and the implications of opting for cloud or hybrid infrastructure. +- Iveta's take on the maturity levels of generative AI, the ongoing development of smaller, more focused models, and the evolving landscape of AI model licensing and open-source contributions. -| [A Complete Guide to Filtering in Vector Search](https://qdrant.tech/articles/vector-search-filtering/) | Developer advice on proper usage and advanced practices. | -| --- | --- | +> Fun Fact: The climate agent solution showcased by Iveta helps individuals benchmark their carbon footprint and assists policymakers in drafting policy recommendations based on scientifically accurate data. +> -## [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#filtering-clauses) Filtering clauses +## Show notes: -Qdrant allows you to combine conditions in clauses. -Clauses are different logical operations, such as `OR`, `AND`, and `NOT`. -Clauses can be recursively nested into each other so that you can reproduce an arbitrary boolean expression. +00:00 AI's vulnerabilities and ethical implications in practice.\ +06:28 Trust reliable sources for accurate climate data.\ +09:14 Vector database offers control and explainability.\ +13:21 On-prem vital for security and control.\ +16:47 Gen AI chat models at basic maturity.\ +19:28 Mature technical community, but slow enterprise adoption.\ +23:34 Advocates for open source but highlights complexities.\ +25:38 Unreliable information, triangle of necessities, vector space. -Let’s take a look at the clauses implemented in Qdrant. 
+## More Quotes from Iveta: -Suppose we have a set of points with the following payload: +*"What we have to ensure here is that every citation and every answer and augmentation by the generative AI on top of that is linked to the exact source of paper or publication, where it's coming from, to ensure that we can trace it back to where the climate information is coming from.”*\ +— Iveta Lohovska -```json -[\ - { "id": 1, "city": "London", "color": "green" },\ - { "id": 2, "city": "London", "color": "red" },\ - { "id": 3, "city": "London", "color": "blue" },\ - { "id": 4, "city": "Berlin", "color": "red" },\ - { "id": 5, "city": "Moscow", "color": "green" },\ - { "id": 6, "city": "Moscow", "color": "blue" }\ -] +*"Explainability means if you receive a certain answer based on your prompt, you can trace it back to the exact source where the embedding has been stored or the source of where the information is coming from and things.”*\ +— Iveta Lohovska -``` +*"Chat GPT for conversational purposes and individual help is something very cool but when this needs to be translated into actual business use cases scenario with all the constraint of the enterprise architecture, with the constraint of the use cases, the reality changes quite dramatically.”*\ +— Iveta Lohovska -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#must) Must +## Transcript: +Demetrios: +Look at that. We are back for another vector space talks. I'm very excited to be doing this today with you all. I am joined by none other than Sabrina again. Where are you at, Sabrina? How's it going? -Example: +Sabrina Aquino: +Hey there, Demetrios. Amazing. Another episode and I'm super excited for this one. How are you doing? -httppythontypescriptrustjavacsharpgo +Demetrios: +I'm great. And we're going to bring out our guest of honor today. We are going to be talking a lot about trustworthy AI because Iveta has a background working with the World bank and focusing on the open data with that. But currently she is chief technologist and principal data scientist at HPE. And we were talking before we hit record before we went live. And we've got some hot takes that are coming up. So I'm going to bring Iveta to the stage. Where are you? There you are, our guest of honor. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must": [\ - { "key": "city", "match": { "value": "London" } },\ - { "key": "color", "match": { "value": "red" } }\ - ] - } - ... -} +Demetrios: +How you doing? -``` +Iveta Lohovska: +Good. I hope you can hear me well. -```python -from qdrant_client import QdrantClient, models +Demetrios: +Loud and clear. Yes. -client = QdrantClient(url="http://localhost:6333") +Iveta Lohovska: +Happy to join here from Vienna and thank you for the invite. -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(value="London"),\ - ),\ - models.FieldCondition(\ - key="color",\ - match=models.MatchValue(value="red"),\ - ),\ - ] - ), -) +Demetrios: +Yes. So I'm very excited to talk with you today. I think it's probably worth getting the TLDR on your story and why you're so passionate about trustworthiness and explainability. 
-``` +Iveta Lohovska: +Well, I think especially in the genaid context where if there any vulnerabilities around the solution or the training data set or any underlying context, either in the enterprise or in a smaller scale, it's just the scale that AI engine AI can achieve if it has any vulnerabilities or any weaknesses when it comes to explainability or trustworthiness or bias, it just goes explain nature. So it is to be considered and taken with high attention when it comes to those use cases. And most of my work is within an enterprise with high confidentiality use cases. So it plays a big role more than actually people will think it's on a high level. It just sounds like AI ethical principles or high level words that are very difficult to implement in technical terms. But in reality, when you hit the ground, when you hit the projects, when you work with in the context of, let's say, governments or organizations that deal with atomic energy, I see it in Vienna, the atomic agency is a neighboring one, or security agencies. Then you see the importance and the impact of those terms and the technical implications behind that. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Sabrina Aquino: +That's amazing. And can you talk a little bit more about the importance of the transparency of these models and what can happen if we don't know exactly what kind of data they are being trained on? -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Iveta Lohovska: +I mean, this is especially relevant under our context of [vector databases](https://qdrant.tech/articles/what-is-a-vector-database/) and vector search. Because in the generative AI context of AI, all foundational models have been trained on some foundational data sets that are distributed in different ways. Some are very conversational, some are very technical, some are on, let's say very strict taxonomy like healthcare or chemical structures. We call them modalities, and they have different representations. So, so when it comes to implementing vector search or [vector database](https://qdrant.tech/articles/what-is-a-vector-database/) and knowing the distribution of the foundational data sets, you have better control if you introduce additional layers or additional components to have the control in your hands of where the information is coming from, where it's stored, [what are the embeddings](https://qdrant.tech/articles/what-are-embeddings/). So that helps, but it is actually quite important that you know what the foundational data sets are, so that you can predict any kind of weaknesses or vulnerabilities or penetrations that the solution or the use case of the model will face when it lands at the end user. Because we know with generative AI that is unpredictable, we know we can implement guardrails. They're already solutions. -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - key: "city",\ - match: { value: "London" },\ - },\ - {\ - key: "color",\ - match: { value: "red" },\ - },\ - ], - }, -}); +Iveta Lohovska: +We know they're not 100, they don't give you 100% certainty, but they are definitely use cases and work where you need to hit the hundred percent certainty, especially intelligence, cybersecurity and healthcare. -``` +Demetrios: +Yeah, that's something that I wanted to dig into a little bit. More of these high stakes use cases feel like you can't. I don't know. 
I talk with a lot of people about at this current time, it's very risky to try and use specifically generative AI for those high stakes use cases. Have you seen people that are doing it well, and if so, how? -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; -use qdrant_client::Qdrant; +Iveta Lohovska: +Yeah, I'm in the business of high stakes use cases and yes, we do those kind of projects and work, which is very exciting and interesting, and you can see the impact. So I'm in the generative AI implementation into enterprise control. An enterprise context could mean critical infrastructure, could mean telco, could mean a government, could mean intelligence organizations. So those are just a few examples, but I could flip the coin and give you an alternative for a public one where I can share, let's say a good example is climate data. And we recently worked on, on building a knowledge worker, a climate agent that is trained, of course, his foundational knowledge, because all foundational models have prior knowledge they can refer to. But the key point here is to be an expert on climate data emissions gap country cards. Every country has a commitment to meet certain reduction emission reduction goals and then benchmarked and followed through the international supervisions of the world, like the United nations environmental program and similar entities. So when you're training this agent on climate data, they're competing ideas or several sources. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Iveta Lohovska: +You can source your information from the local government that is incentivized to show progress to the nation and other stakeholders faster than the actual reality, the independent entities that provide information around the state of the world when it comes to progress towards certain climate goals. And there are also different parties. So for this kind of solution, we were very lucky to work with kind of the status co provider, the benchmark around climate data, around climate publications. And what we have to ensure here is that every citation and every answer and augmentation by the generative AI on top of that is linked to the exact source of paper or publication, where it's coming from, to ensure that we can trace it back to where the climate information is coming from. If Germany performs better compared to Austria, and also the partner we work with was the United nations environmental program. So they want to make sure that they're the citadel scientific arm when it comes to giving information. And there's no compromise, could be a compromise on the structure of the answer, on the breadth and death of the information, but there should be no compromise on the exact fact fullness of the information and where it's coming from. And this is a concrete example because why, you oughta ask, why is this so important? Because it has two interfaces. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([\ - Condition::matches("city", "london".to_string()),\ - Condition::matches("color", "red".to_string()),\ - ])), - ) - .await?; +Iveta Lohovska: +It has the public. You can go and benchmark your carbon footprint as an individual living in one country comparing to an individual living in another. 
But if you are a policymaker, which is the other interface of this application, who will write the policy recommendation of a country in their own country, or a country they're advising on, you might want to make sure that the scientific citations and the policy recommendations that you're making are correct and they are retrieved from the proper data sources. Because there will be a huge implication when you go public with those numbers or when you actually design a law that is reinforceable with legal terms and law enforcement. -``` +Sabrina Aquino: +That's very interesting, Iveta, and I think this is one of the great use cases for [RAG](https://qdrant.tech/articles/what-is-rag-in-ai/), for example. And I think if you can talk a little bit more about how vector search is playing into all of this, how it's helping organizations do this, this. -```java -import java.util.List; +Iveta Lohovska: +Would be amazing in such specific use cases. I think the main differentiator is the traceability component, the first that you have full control on which data it will refer to, because if you deal with open source models, most of them are open, but the data it has been trained on has not been opened or given public so with vector database you introduce a step of control and explainability. Explainability means if you receive a certain answer based on your prompt, you can trace it back to the exact source where the embedding has been stored or the source of where the information is coming from and things. So this is a major use case for us for those kind of high stake solution is that you have the explainability and traceability. Explainability. It could be as simple as a semantical similarity to the text, but also the traceability of where it's coming from and the exact link of where it's coming from. So it should be, it shouldn't be referred. You can close and you can cut the line of the model referring to its previous knowledge by introducing a [vector database](https://qdrant.tech/articles/what-is-a-vector-database/), for example. -import static io.qdrant.client.ConditionFactory.matchKeyword; +Iveta Lohovska: +So there could be many other implications and improvements in terms of speed and just handling huge amounts of data, yet also nice to have that come with this kind of technique, but the prior use case is actually not incentivized around those. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +So if I'm hearing you correctly, it's like yet another reason why you should be thinking about using vector databases, because you need that ability to cite your work and it's becoming a very strong design pattern. Right. We all understand now, if you can't see where this data has been pulled from or you can't get, you can't trace back to the actual source, it's hard to trust what the output is. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Iveta Lohovska: +Yes, and the easiest way to kind of cluster the two groups. If you think of creative fields and marketing fields and design fields where you could go wild and crazy with the temperature on each model, how creative it could go and how much novelty it could bring to the answer are one family of use cases. 
But there is exactly the opposite type of use cases where this is a no go and you don't need any creativity, you just focus on, focus on the factfulness and explainability. So it's more of the speed and the accuracy of retrieving information with a high level of novelty, but not compromising on any kind of facts within the answer, because there will be legal implications and policy implications and societal implications based on the action taken on this answer, either policy recommendation or legal action. There's a lot to do with the intelligence agencies that retrieve information based on nearest neighbor or kind of a relational analysis that you can also execute with vector databases and generative AI. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addAllMust( - List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) - .build()) - .build()) - .get(); +Sabrina Aquino: +And we know that for these high stakes sectors that data privacy is a huge concern. And when we're talking about using vector databases and storing that data somewhere, what are some of the principles or techniques that you use in terms of infrastructure, where should you store your vector database and how should you think about that part of your system? -``` +Iveta Lohovska: +Yeah, so most of the cases, I would say 99% of the cases, is that if you have such a high requirements around security and explainability, security of the data, but those security of the whole use case and environment, and the explainability and trustworthiness of the answer, then it's very natural to have expectations that will be on prem and not in the cloud, because only on prem you have a full control of where your data sits, where your model sits, the full ownership of your IP, and then the full ownership of having less question marks of the implementation and architecture, but mainly the full ownership of the end to end solution. So when it comes to those use cases, RAG on Prem, with the whole infrastructure, with the whole software and platform layers, including models on Prem, not accessible through an API, through a service somewhere where you don't know where the guardrails is, who designed the guardrails, what are the guardrails? And we see those, this a lot with, for example, copilot, a lot of question marks around that. So it's a huge part of my work is just talking of it, just sorting out that. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Sabrina Aquino: +Exactly. You don't want to just give away your data to a cloud provider, because there's many implications that that comes with. And I think even your clients, they need certain certifications, then they need to make sure that nobody can access that data, something that you cannot. Exactly. I think ensure if you're just using a cloud provider somewhere, which is, I think something that's very important when you're thinking about these high stakes solutions. But also I think if you're going to maybe outsource some of the infrastructure, you also need to think about something that's similar to a [hybrid cloud solution](https://qdrant.tech/documentation/hybrid-cloud/) where you can keep your data and outsource the kind of management of infrastructure. So that's also a nice use case for that, right? -var client = new QdrantClient("localhost", 6334); +Iveta Lohovska: +I mean, I work for HPE, so hybrid is like one of our biggest sacred words. Yeah, exactly. 
But actually like if you see the trends and if you see how expensive is to work to run some of those workloads in the cloud, either for training for national model or fine tuning. And no one talks about inference, inference not in ten users, but inference in hundred users with big organizations. This itself is not sustainable. Honestly, when you do the simple Linux, algebra or math of the exponential cost around this. That's why everything is hybrid. And there are use cases that make sense to be fast and speedy and easy to play with, low risk in the cloud to try. -// & operator combines two conditions in an AND conjunction(must) -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("city", "London") & MatchKeyword("color", "red") -); +Iveta Lohovska: +But when it comes to actual GenAI work and LLM models, yeah, the answer is never straightforward when it comes to the infrastructure and the environment where you are hosting it, for many reasons, not just cost, but any other. -``` +Demetrios: +So there's something that I've been thinking about a lot lately that I would love to get your take on, especially because you deal with this day in and day out, and it is the maturity levels of the current state of Gen AI and where we are at for chat GPT or just llms and foundational models feel like they just came out. And so we're almost in the basic, basic, basic maturity levels. And when you work with customers, how do you like kind of signal that, hey, this is where we are right now, but you should be very conscientious that you're going to need to potentially work with a lot of breaking changes or you're going to have to be constantly updating. And this isn't going to be set it and forget it type of thing. This is going to be a lot of work to make sure that you're staying up to date, even just like trying to stay up to date with the news as we were talking about. So I would love to hear your take on on the different maturity levels that you've been seeing and what that looks like. -```go -import ( - "context" +Iveta Lohovska: +So I have huge exposure to GenAI for the enterprise, and there's a huge component expectation management. Why? Because chat GPT for conversational purposes and individual help is something very cool. But when this needs to be translated into actual business use cases scenario with all the constraint of the enterprise architecture, with the constraint of the use cases, the reality changes quite dramatically. So end users who are used to expect level of forgiveness as conversational chatbots have, is very different of what you will get into actual, let's say, knowledge worker type of context, or summarization type of context into the enterprise. And it's not so much to the performance of the models, but we have something called modalities of the models. And I don't think there will be ultimately one model with all the capabilities possible, let's say cult generation or image generation, voice generational, or just being very chatty and loving and so on. There will be multiple mini models out there for those. Modalities in actual architecture with reasonable cost are very difficult to handle. - "github.com/qdrant/go-client/qdrant" -) +Iveta Lohovska: +So I would say the technical community feels we are very mature and very fast. 
The enterprise adoption is a totally different topic, and it's a couple of years behind, but also the society type of technologists like me, who try to keep up with the development and we know where we stand at this point, but they're the legal side and the regulations coming in, like the EU act and Biden trying to regulate the compute power, but also how societies react to this and how they adapt. And I think especially on the third one, we are far behind understanding and the implications of this technology, also adopting it at scale and understanding the vulnerabilities. That's why I enjoy so much my enterprise work is because it's a reality check. When you put the price tag attached to actual Gen AI use case in production with the inference cost and the expected performance, it's different situation when you just have an app on the phone and you chat with it and it pulls you interesting links. So yes, I think that there's a bridge to be built between the two worlds. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +Yeah. And I find it really interesting too, because it feels to me like since it is so new, people are more willing to explore and not necessarily have that instant return of the ROI, but when it comes to more traditional ML or predictive ML, it is a bit more mature and so there's less patience for that type of exploration. Or, hey, is this use case? If you can't by now show the ROI of a predictive ML use case, then that's a little bit more dangerous. But if you can't with a Gen AI use case, it is not that big of a deal. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - qdrant.NewMatch("color", "red"), - }, - }, -}) +Iveta Lohovska: +Yeah, it's basically a technology growing up in front of our eyes. It's a kind of a flying a plane while building it type of situation. We are seeing it in the real time, and I agree with you. So that the maturity around ML is one thing, but around generative AI, and they will be a model of kind of mini disappointment or decline, in my opinion, before actually maturing product. This kind of powerful technology in a sustainable way. Sustainable ways mean you can afford it, but also it proves your business case and use case. Otherwise it's just doing for the sake of doing it because everyone else is doing it. -``` +Demetrios: +Yeah, yeah, 100%. So I know we're bumping up against time here. I do feel like there was a bit of a topic that we wanted to discuss with the licenses and how that plays into basically trustworthiness and explainability. And so we were talking about how, yeah, the best is to run your own model, and it probably isn't going to be this gigantic model that can do everything. It's the, it seems like the trends are going into smaller models. And from your point of view though, we are getting new models like every week. It feels like. Yeah, especially. -Filtered points would be: +Demetrios: +I mean, we were just talking about this before we went live again, like databricks just released there. What is it? DBRX Yesterday you had Mistral releasing like a new base model over the weekend, and then Llama 3 is probably going to come out in the flash of an eye. So where do you stand in regards to that? 
It feels like there's a lot of movement in open source, but it is a little bit of, as you mentioned, like, to be cautious with the open source movement. -```json -[{ "id": 2, "city": "London", "color": "red" }] +Iveta Lohovska: +So I think it feels like there's a lot of open source, but that. So I'm totally for open sourcing and giving the people and the communities the power to be able to innovate, to do R & D in different labs so it's not locked to the view. Elite big tech companies that can afford this kind of technology. So kudos to meta for trying compared to the other equal players in the space. But open source comes with a lot of ecosystem in our world, especially for the more powerful models, which is something I don't like because it becomes like just, it immediately translates into legal fees type of conversation. It's like there are too many if else statements in those open source licensing terms where it becomes difficult to navigate, for technologists to understand what exactly this means, and then you have to bring the legal people to articulate it to you or to put additional clauses. So it's becoming a very complex environment to handle and less and less open, because there are not so many open source and small startup players that can afford to train foundational models that are powerful and useful. So it becomes a bit of a game logged to a view, and I think everyone needs to be a bit worried about that. -``` +Iveta Lohovska: +So we can use the equivalents from the past, but I don't think we are doing well enough in terms of open sourcing. The three main core components of LLM model, which is the model itself, the data it has been trained on, and the data sets, and most of the times, at least in one of those, is restricted or missing. So it's difficult space to navigate. -When using `must`, the clause becomes `true` only if every condition listed inside `must` is satisfied. -In this sense, `must` is equivalent to the operator `AND`. +Demetrios: +Yeah, yeah. You can't really call it trustworthy, or you can't really get the information that you need and that you would hope for if you're missing one of those three. I do like that little triangle of the necessities. So, Iveta, this has been awesome. I really appreciate you coming on here. Thank you, Sabrina, for joining us. And for everyone else that is watching, remember, don't get lost in vector space. This has been another vector space talk. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#should) Should +Demetrios: +We are out. Have a great weekend, everyone. -Example: +Iveta Lohovska: +Thank you. Bye. Thank you. Bye. -httppythontypescriptrustjavacsharpgo +<|page-361-lllmstxt|> +> *"So usually I get asked, why are you using Qdrant? What's the big deal? Why are you picking these over all of the other ones? And to me it boils down to, aside from being renowned or recognized, that it works fairly well. 
There's one core component that is critical here, and that is it has to be very straightforward, very easy to set up so that I can teach it, because if it's easy, well, sort of like easy to or straightforward to teach, then you can take the next step and you can make it a little more complex, put other things around it, and that creates a great development experience and a learning experience as well.”*\ +— Alfredo Deza +> -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "should": [\ - { "key": "city", "match": { "value": "London" } },\ - { "key": "color", "match": { "value": "red" } }\ - ] - } -} +Alfredo is a software engineer, speaker, author, and former Olympic athlete working in Developer Relations at Microsoft. He has written several books about programming languages and artificial intelligence and has created online courses about the cloud and machine learning. -``` +He currently is an Adjunct Professor at Duke University, and as part of his role, works closely with universities around the world like Georgia Tech, Duke University, Carnegie Mellon, and Oxford University where he often gives guest lectures about technology. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - should=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(value="London"),\ - ),\ - models.FieldCondition(\ - key="color",\ - match=models.MatchValue(value="red"),\ - ),\ - ] - ), -) +***Listen to the episode on [Spotify](https://open.spotify.com/episode/4HFSrTJWxl7IgQj8j6kwXN?si=99H-p0fKQ0WuVEBJI9ugUw), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/3l6F6A_It0Q?feature=shared).*** -``` + -```typescript -client.scroll("{collection_name}", { - filter: { - should: [\ - {\ - key: "city",\ - match: { value: "London" },\ - },\ - {\ - key: "color",\ - match: { value: "red" },\ - },\ - ], - }, -}); + -``` +## **Top takeaways:** -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; -use qdrant_client::Qdrant; +How does a former athlete such as Alfredo Deza end up in this AI and Machine Learning industry? That’s what we’ll find out in this episode of Vector Space Talks. Let’s understand how his background as an olympian offers a unique perspective on consistency and discipline that's a real game-changer in this industry. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Here are some things you’ll discover from this episode: -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([\ - Condition::matches("city", "london".to_string()),\ - Condition::matches("color", "red".to_string()),\ - ])), - ) - .await?; +1. **The Intersection of Teaching and Tech:** Alfredo discusses on how to effectively bridge the gap between technical concepts and student understanding, especially when dealing with complex topics like vector databases. +2. **Simplified Learning:** Dive into Alfredo's advocacy for simplicity in teaching methods, mirroring his approach with Qdrant and the potential for a Rust in-memory implementation aimed at enhancing learning experiences. +3. **Beyond the Titanic Dataset:** Discover why Alfredo prefers to teach with a wine dataset he developed himself, underscoring the importance of using engaging subject matter in education. +4. 
**AI Learning Acceleration:** Alfredo discusses the struggle universities face to keep pace with AI advancements and how online platforms can offer a more up-to-date curriculum. +5. **Consistency is Key:** Alfredo draws parallels between the discipline required in high-level athletics and the ongoing learning journey in AI, zeroing in on his mantra, “There is no secret” to staying consistent. -``` +> Fun Fact: Alfredo tells the story of athlete Dick Fosbury's invention of the Fosbury Flop to highlight the significance of teaching simplicity. +> -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +## Show notes: -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; -import java.util.List; +00:00 Teaching machine learning, Python to graduate students.\ +06:03 Azure AI search service simplifies teaching, Qdrant facilitates learning.\ +10:49 Controversy over high jump style.\ +13:18 Embracing past for inspiration, emphasizing consistency.\ +15:43 Consistent learning and practice lead to success.\ +20:26 Teaching SQL uses SQLite, Rust has limitations.\ +25:21 Online platforms improve and speed up education.\ +29:24 Duke and Coursera offer specialized language courses.\ +31:21 Passion for wines, creating diverse dataset.\ +35:00 Encouragement for vector db discussion, wrap up.\ -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addAllShould( - List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) - .build()) - .build()) - .get(); +## More Quotes from Alfredo: -``` +*"Qdrant makes it straightforward. We use it in-memory for my classes and I would love to see something similar setup in Rust to make teaching even easier.”*\ +— Alfredo Deza -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +*"Retrieval augmented generation is kind of like having an open book test. So the large language model is the student, and they have an open book so they can see the answers and then repackage that into their own words and provide an answer.”*\ +— Alfredo Deza -var client = new QdrantClient("localhost", 6334); +*"With Qdrant, I appreciate that the use of the Python API is so simple. It avoids the complexity that comes from having a back-end system like in Rust where you need an actual instance of the database running.”*\ +— Alfredo Deza -// | operator combines two conditions in an OR disjunction(should) -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("city", "London") | MatchKeyword("color", "red") -); +## Transcript: +Demetrios: +What is happening? Everyone, welcome back to another vector space talks. I am Demetrios, and I am joined today by good old Sabrina. Where you at, Sabrina? Hello? -``` +Sabrina Aquino: +Hello, Demetrios. I'm from Brazil. I'm in Brazil right now. I know that you are traveling currently. -```go -import ( - "context" +Demetrios: +Where are you? At Kubecon in Paris. And it has been magnificent. But I could not wait to join the session today because we've got Alfredo coming at us. - "github.com/qdrant/go-client/qdrant" -) +Alfredo Deza: +What's up, dude? Hi. How are you? -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +I'm good, man. It's been a while. I think the last time that we chatted was two years ago, maybe right before your book came out. When did the book come out? 
-client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Should: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - qdrant.NewMatch("color", "red"), - }, - }, -}) +Alfredo Deza: +Yeah, something like that. I would say a couple of years ago. Yeah. I wrote, co authored practical machine learning operations with no gift. And it was published on O'Reilly. -``` +Demetrios: +Yeah. And that was, I think, two years ago. So you've been doing a lot of stuff since then. Let's be honest, you are maybe one of the most active men on the Internet. I always love seeing what you're doing. You're bringing immense value to everything that you touch. I'm really excited to be able to chat with you for this next 30 minutes. -Filtered points would be: +Alfredo Deza: +Yeah, of course. -```json -[\ - { "id": 1, "city": "London", "color": "green" },\ - { "id": 2, "city": "London", "color": "red" },\ - { "id": 3, "city": "London", "color": "blue" },\ - { "id": 4, "city": "Berlin", "color": "red" }\ -] +Demetrios: +Maybe just, we'll start it off. We're going to get into it when it comes to what you're doing and really what the space looks like right now. Right. But I would love to hear a little bit of what you've been up to since, for the last two years, because I haven't talked to you. -``` +Alfredo Deza: +Yeah, that's right. Well, several different things, actually. Right after we chatted last time, I joined Microsoft to work in developer relations. Microsoft has a big group of folks working in developer relations. And basically, for me, it signaled my shift away from regular software engineering. I was primarily doing software engineering and thought that perhaps with the books and some of the courses that I had published, it was time for me to get into more teaching and providing useful content, which is really something very rewarding. And in developer relations, in advocacy in general, it's kind of like a way of teaching. We demonstrate technology, how it works from a technical point of view. -When using `should`, the clause becomes `true` if at least one condition listed inside `should` is satisfied. -In this sense, `should` is equivalent to the operator `OR`. +Alfredo Deza: +So aside from that, started working really closely with several different universities. I work with Georgia Tech, Oxford University, Carnegie Mellon University, and Duke University, where I've been working as an adjunct professor for a couple of years as well. So at Duke, what I do is I teach a couple of classes a year. One is on machine learning. Last year was machine learning operations, and this year it's going to, I think, hopefully I'm not messing anything up. I think we're going to shift a little bit to doing operations with large language models. And in the fall I teach a programming class for graduate students that want to join one of the graduate programs and they want to get a primer on Python. So I teach a little bit of that. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#must-not) Must Not +Alfredo Deza: +And in the meantime, also in partnership with Duke, getting a lot of courses out on Coursera, and from large language models to doing stuff with Azure, to machine learning operations, to rust, I've been doing a lot of rust lately, which I really like. So, yeah, so a lot of different things, but I think the core pillar for me remains being able to teach and spread the knowledge. -Example: +Demetrios: +Love it, man. 
And I know you've been diving into vector databases. Can you tell us more? -httppythontypescriptrustjavacsharpgo +Alfredo Deza: +Yeah, well, the thing is that when you're trying to teach, and yes, one of the courses that we had out for large language models was applying retrieval augmented generation, which is the basis for vector databases, to see how it works. This is how it works. These are the components that you need. Let's create an application from scratch and see how it works. And for those that don't know, retrieval augmented generation is kind of like having. The other day I saw a description about this, which I really like, which is a way of, it's kind of like having an open book test. So the large language model is the student, and they have an open book so they can see the answers and then repackage that into their own words and provide an answer, which is kind of like what we do with vector databases in the retrieval augmented generation pattern. We've been putting a lot of examples on how to do these, and in the case of Azure, you're enabling certain services. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must_not": [\ - { "key": "city", "match": { "value": "London" } },\ - { "key": "color", "match": { "value": "red" } }\ - ] - } -} +Alfredo Deza: +There's the Azure AI search service, which is really good. But sometimes when you're trying to teach specifically, it is useful to have a very straightforward way to do this and applying or creating a retrieval augmented generation pattern, it's kind of tricky, I think. We're not there yet to do it in a nice, straightforward way. So there are several different options, Qdrant being one of them. So usually I get asked, why are you using Qdrant? What's the big deal? Why are you picking these over all of the other ones? And to me it boils down to, aside from being renowned or recognized, that it works fairly well. There's one core component that is critical here, and that is it has to be very straightforward, very easy to set up so that I can teach it, because if it's easy, well, sort of like easy to or straightforward to teach, then you can take the next step and you can make it a little more complex, put other things around it, and that creates a great development experience and a learning experience as well. If something is very complex, if the list of requirements is very long, you're not going to be very happy, you're going to spend all this time trying to figure, and when you have, similar to what happens with automation, when you have a list of 20 different things that you need to, in order to, say, deploy a website, you're going to get things out of order, you're going to forget one thing, you're going to have a typo, you're going to mess it up, you're going to have to start from scratch, and you're going to get into a situation where you can't get out of it. And Qdrant does provide a very straightforward way to run the database, and that one is the in memory implementation with Python. -``` +Alfredo Deza: +So you can actually write a little bit of python once you install the libraries and say, I want to instantiate a vector database and I wanted to run it in memory. So for teaching, this is great. It's like, hey, of course it's not for production, but just write these couple of lines and let's get right into it. Let's just start populating these and see how it works. And it works. It's great. 
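+
+Those couple of lines look roughly like this, assuming the `qdrant-client` Python package is installed (the collection name and toy vectors are illustrative, not taken from the course):
+
+```python
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+
+# Run Qdrant fully in memory: no server or container needed, which is ideal for teaching.
+client = QdrantClient(":memory:")
+
+client.create_collection(
+    collection_name="demo",
+    vectors_config=VectorParams(size=4, distance=Distance.COSINE),
+)
+
+# Insert a couple of toy points, then search against them.
+client.upsert(
+    collection_name="demo",
+    points=[
+        PointStruct(id=1, vector=[0.1, 0.2, 0.3, 0.4], payload={"note": "first"}),
+        PointStruct(id=2, vector=[0.4, 0.3, 0.2, 0.1], payload={"note": "second"}),
+    ],
+)
+
+hits = client.search(collection_name="demo", query_vector=[0.1, 0.2, 0.3, 0.4], limit=1)
+print(hits)
+```
+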
You don't need to have all of these, like, wow, let's launch Kubernetes over here and let's have all of these dynamic. No, why? I mean, sure, you want to create a business model and you want to launch to production eventually, and you want to have all that running perfect. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must_not=[\ - models.FieldCondition(key="city", match=models.MatchValue(value="London")),\ - models.FieldCondition(key="color", match=models.MatchValue(value="red")),\ - ] - ), -) +Alfredo Deza: +But for this setup, like for understanding how it works, for trying baby steps into understanding vector databases, this is perfect. My one requirement, or my one wish list item is to have that in memory thing for rust. That would be pretty sweet, because I think it'll make teaching rust and retrieval augmented generation with rust much easier. I wouldn't have to worry about bringing up containers or external services. So that's the deal with rust. And I'll tell you one last story about why I think specifically making it easy to get started with so that I can teach it, so that others can learn from it, is crucial. I would say almost 50 years ago, maybe a little bit more, my dad went to Italy to have a course on athletics. My dad was involved in sports and he was going through this, I think it was like a six month specialization on athletics. -``` +Alfredo Deza: +And he was in class and it had been recent that the high jump had transitioned from one style to the other. The previous style, the old style right now is the old style. It's kind of like, it was kind of like over the bar. It was kind of like a weird style. And it had recently transitioned to a thing called the Fosbury flop. This person, his last name is Dick Fosbury, invented the Fosbury flop. He said, no, I'm just going to go straight at it, then do a little curve and then jump over it. And then he did, and then he started winning everything. -```typescript -client.scroll("{collection_name}", { - filter: { - must_not: [\ - {\ - key: "city",\ - match: { value: "London" },\ - },\ - {\ - key: "color",\ - match: { value: "red" },\ - },\ - ], - }, -}); +Alfredo Deza: +And everybody's like, what this guy? Well, first they thought he was crazy, and they thought that dismissive of what he was trying to do. And there were people that sticklers that wanted to stay with the older style, but then he started beating records and winning medals, and so people were like, well, is this a good thing? Let's try it out. So there was a whole. They were casting doubt. It's like, is this really the thing? Is this really what we should be doing? So one of the questions that my dad had to answer in this specialization he did in Italy was like, which style is better, it's the old style or the new style? And so my dad said, it's the new style. And they asked him, why is the new style better? And he didn't choose the path of answering the, well, because this guy just won the Olympics or he just did a record over here that at the end is meaningless. What he said was, it is the better style because it's easier to teach and it is 100% correct. When you're teaching high jump, it is much easier to teach the Fosbury flop than the other style. -``` +Alfredo Deza: +It is super hard. So you start seeing this parallel in teaching and learning where, but with this one, you have all of these world records and things are going great. Well, great. 
But is anybody going to try, are you going to have more people looking into it or are you going to have less? What is it that we're trying to do here? Right. -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; -use qdrant_client::Qdrant; +Demetrios: +Not going to lie, I did not see how you were going to land the plane on coming from the high jump into the vector database space, but you did it gracefully. That was well done. So, basically, the easier it is to teach, the more people are going to be able to jump on board and the more people are going to be able to get value out of it. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Sabrina Aquino: +I absolutely love it, by the way. It's a pleasure to meet you, Alfredo. And I was actually about to ask you. I love your background as an olympic athlete. Right. And I was wondering, do you make any connections or how do we interact this background with your current teaching and AI? And do you see any similarities or something coming from that approach into what you've applied? -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must_not([\ - Condition::matches("city", "london".to_string()),\ - Condition::matches("color", "red".to_string()),\ - ])), - ) - .await?; +Alfredo Deza: +Well, you're bringing a great point. It's taken me a very long time to feel comfortable talking about my professional sports past. I don't want to feel like I'm overwhelming anyone or trying to be like a show off. So I usually try not to mention, although I'm feeling more comfortable mentioning my professional past. But the only situations where I think it's good to talk about it is when I feel like there's a small chance that I might get someone thinking about the possibilities of what they can actually do and what they can try. And things that are seemingly complex might be achievable. So you mentioned similarities, but I think there are a couple of things that happen when you're an athlete in any sport, really, that you're trying to or you're operating at the very highest level and there's several things that happen there. You have to be consistent. -``` +Alfredo Deza: +And it's something that I teach my kids as well. I have one of my kids, he's like, I did really a lot of exercise today and then for a week he doesn't do anything else. And he's like, now I'm going to do exercise again. And she's going to do 4 hours. And it's like, wait a second, wait a second. It's okay. You want to do it. This is great. -```java -import java.util.List; +Alfredo Deza: +But no intensity. You need to be consistent. Oh, dad, you don't let me work out and it's like, no work out. Good, I support you, but you have to be consistent and slowly start ramping up and slowly start getting better. And it happens a lot with learning. We are in an era that concepts and things are advancing so fast that things are getting obsolete even faster. So you're always in this motion of trying to learn. So what I would say is the similarities are in the consistency. -import static io.qdrant.client.ConditionFactory.matchKeyword; +Alfredo Deza: +You have to keep learning, you have to keep applying yourself. But it can be like, oh, today I'm going to read this whole book from start to end and you're just going to learn everything about, I don't know, rust. It's like, well, no, try applying rust a little bit every day and feel comfortable with it. And at the very end you will do better. 
Like, you can't go with high intensity because you're going to get burned out, you're going to overwhelmed and it's not going to work out. You don't go to the Olympics by working out for like a few months. Actually, a very long time ago, a reporter asked me, how many months have you been working out preparing for the Olympics? It's like, what do you mean with how many months? I've been training my whole life for this. What are we talking about? -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +We're not talking in months or years. We're talking in lifetimes, right? -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addAllMustNot( - List.of(matchKeyword("city", "London"), matchKeyword("color", "red"))) - .build()) - .build()) - .get(); +Alfredo Deza: +So you have to take it easy. You can't do that. And beyond that, consistency. Consistency goes hand in hand with discipline. I came to the US in 2006. I don't live like I was born in Peru and I came to the US with no degree. I didn't go to college. Well, I went to college for a few months and then I dropped out and I didn't have a career, I didn't have experience. -``` +Alfredo Deza: +I was just recently married. I have never worked in my life because I used to be a professional athlete. And the only thing that I decided to do was to do amazing work, apply myself and try to keep learning and never stop learning. In the back of my mind, it's like, oh, I have a tremendous knowledge gap that I need to fulfill by learning. And actually, I have tremendous respect and I'm incredibly grateful by all of the people that opened doors for me and gave me an opportunity, one of them being Noah Giff, which I co authored a few books with him and some of the courses. And he actually taught me to write Python. I didn't know how to program. And he said, you know what? I think you should learn to write some python. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Alfredo Deza: +And I was like, python? Why would I ever need to do that? And I did. He's like, let's just find something to automate. I mean, what a concept. Find something to apply automation. And every week on Fridays, we'll just take a look at it and that's it. And we did that for a while. And then he said, you know what? You should apply for speaking at Python. How can I be speaking at a conference when I just started learning? It's like your perspective is different. -var client = new QdrantClient("localhost", 6334); +Alfredo Deza: +You just started learning these. You're going to do it in an interesting way. So I think those are concepts that are very important to me. Stay disciplined, stay consistent, and keep at it. The secret is that there's no secret. That's the bottom line. You have to keep consistent. Otherwise things are always making excuses. -// The ! operator negates the condition(must not) -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: !(MatchKeyword("city", "London") & MatchKeyword("color", "red")) -); +Alfredo Deza: +Is very simple. -``` +Demetrios: +The secret is there is no secret. That is beautiful. So you did kind of sprinkle this idea of, oh, I wish there was more stuff happening with Qdrant and rust. Can you talk a little bit more to that? Because one piece of Qdrant that people tend to love is that it's built in rust. Right. 
But also, I know that you mentioned before, could we get a little bit of this action so that I don't have to deal with any. What was it you were saying? The containers. -```go -import ( - "context" +Alfredo Deza: +Yeah. Right. Now, if you want to have a proof of concept, and I always go for like, what's the easiest, the most straightforward, the less annoying things I need to do, the better. And with Python, the Python API for Qdrant, you can just write a few lines and say, I want to create an instance in memory and then that's it. The database is created for you. This is very similar, or I would say actually almost identical to how you run SQLite. Sqlite is the embedded database you can create in memory. And it's actually how I teach SQL as well. - "github.com/qdrant/go-client/qdrant" -) +Alfredo Deza: +When I have to teach SQl, I use sqlite. I think it's perfect. But in rust, like you said, Qdrant's backend is built on rust. There is no in memory implementation. So you are required to have an actual instance of the Qdrant database running. So you have a couple of options, but one of them probably means you'll have to bring up a container with Qdrant running and then you'll have to connect to that instance. So when you're teaching, the development environments are kind of constrained. Either you are in a lab somewhere like Crusader has labs, but those are self contained. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Alfredo Deza: +It's kind of tricky to get them running 100%. You can run multiple containers at the same time. So things start becoming more complex. Not only more complex for the learner, but also in this case, like the teacher, me who wants to figure out how to make this all run in a very constrained environment. And that makes it tricky. And I fasted the team, by the way, and I was told that maybe at some point they can do some magic and put the in memory implementation on the rust side of things, which I think it would be tremendous. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - MustNot: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - qdrant.NewMatch("color", "red"), - }, - }, -}) +Sabrina Aquino: +We're going to advocate for that on our side. We're also going to be asking for it. And I think this is really good too. It really makes it easier. Me as a student not long ago, I do see what you mean. It's quite hard to get it all working very fast in the time of a class that you don't have a lot of time and students can get. I don't know, it's quite complex. I do get what you mean. -``` +Sabrina Aquino: +And you also are working both on the tech industry and on academia, which I think is super interesting. And I always kind of feel like those two are a bit disconnected sometimes. And I was wondering what you think that how important is the collaboration of these two areas considering how fast the AI space is going to right now? And what are your thoughts? -Filtered points would be: +Alfredo Deza: +Well, I don't like generalizing, but I'm going to generalize right now. I would say most universities are several steps behind, and there's a lot of complexities involved in higher education specifically. Most importantly, these institutions tend to be fairly large, and with fairly large institutions, what do you get? Oh, you get the magical bureaucracy for anything you want to do. 
Something like, oh, well, you need to talk to that department that needs to authorize something, that needs to go to some other department, and it's like, I'm going to change the curriculum. It's like, no, you can't. What does that mean? I have actually had conversations with faculty in universities where they say, listen, curricula. Yeah, we get that. We need to update it, but we change curricula every five years. -```json -[\ - { "id": 5, "city": "Moscow", "color": "green" },\ - { "id": 6, "city": "Moscow", "color": "blue" }\ -] +Alfredo Deza: +And so. See you in a while. It's been three years. We have two more years to go. See you in a couple of years. And that's detrimental to students now. I get it. Building curricula, it's very hard. -``` +Alfredo Deza: +It takes a lot of work for the faculty to put something together. So it is something that, from a faculty perspective, it's like they're not going to get paid more if they update the curriculum. -When using `must_not`, the clause becomes `true` if none of the conditions listed inside `must_not` is satisfied. -In this sense, `must_not` is equivalent to the expression `(NOT A) AND (NOT B) AND (NOT C)`. +Demetrios: +Right. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#clauses-combination) Clauses combination +Alfredo Deza: +And it's a massive amount of work now that, of course, comes to the detriment of the learner. The student will be under service because they will have to go through curricula that is fairly dated. Now, there are situations and there are programs where this doesn't happen. And Duke, I've worked with several. They're teaching Llama file, which was built by Mozilla. And when did Llama file came out? It was just like a few months ago. And I think it's incredible. And I think those skills that are the ones that students need today in order to not only learn these things, but also be able to apply them when they're looking for a job or trying to professionally even apply them into their day to day, now that's one side of things. -It is also possible to use several clauses simultaneously: +Alfredo Deza: +But there's the other aspect. In the case of Duke, as well as other universities out there, they're using these online platforms so that they can put courses out there faster. Do you really need to go through a four year program to understand how retrieval augmented generation works? Or how to implement it? I would argue no, but would you be better out, like, taking a course that will take you perhaps a couple of weeks to go through and be fairly proficient? I would say yes, 100%. And you see several institutions putting courses out there that are meaningful, that are useful, that they can cope with the speed at which things are needed. I think it's kind of good. And I think that sometimes we tend to think about knowledge and learning things, kind of like in a bubble, especially here in the US. I think there's this college is this magical place where all of the amazing things happen. And if you don't go to college, things are going to go very bad for you. -httppythontypescriptrustjavacsharpgo +Alfredo Deza: +And I don't think that's true. I think if you like college, if you like university, by all means take advantage of it. You want to experience it. That sounds great. I think there's tons of opportunity to do it outside of the university or the college setting and taking online courses from validated instructors. They have a good profile. 
Not someone that just dumped something on genetic AI and started. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must": [\ - { "key": "city", "match": { "value": "London" } }\ - ], - "must_not": [\ - { "key": "color", "match": { "value": "red" } }\ - ] - } -} +Demetrios: +Someone like you. -``` +Alfredo Deza: +Well, if you want to. Yeah, sure, why not? I mean, there's students that really like my teaching style. I think that's great. If you don't like my teaching style. Sometimes I tend to go a little bit slower because I don't want to overwhelm anyone. That's all good. But there is opportunity. And when I mention these things, people are like, oh, really? I'm not advertising for Coursera or anything else, but some of these platforms, if you pay a monthly fee, I think it's between $40 and $60. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.FieldCondition(key="city", match=models.MatchValue(value="London")),\ - ], - must_not=[\ - models.FieldCondition(key="color", match=models.MatchValue(value="red")),\ - ], - ), -) +Alfredo Deza: +I think on the expensive side, you can take advantage of all of these courses and as much as you can take them. Sometimes even companies say, hey, you have a paid subscription, go take it all. And I've met people like that. It's like, this is incredible. I'm learning so much. Perfect. I think there's a mix of things. I don't think there's like a binary answer, like, oh, you need to do this, or, no, don't do that, and everything's going to be well again. -``` +Demetrios: +Yeah. Can you talk a little bit more about your course? And if I wanted to go on Coursera, what can I expect from. -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - key: "city",\ - match: { value: "London" },\ - },\ - ], - must_not: [\ - {\ - key: "color",\ - match: { value: "red" },\ - },\ - ], - }, -}); +Alfredo Deza: +You know, and again, I don't think as much as I like talking about my courses and the things that I do, I want to emphasize, like, if someone is watching this video or listening into what we're talking about, find something that is interesting to you and find a course that kind of delivers that thing, that sliver of interesting stuff, and then try it out. I think that's the best way. Don't get overwhelmed by. It's like, is this the right vector database that I should be learning? Is this instructor? It's like, no, try it out. What's going to happen? You don't like it when you're watching a bad video series or docuseries on Netflix or any streaming platform? Do you just like, I pay my $10 a month, so I'm going to muster through this whole 20 more episodes of this thing that I don't like. It's meaningless. It doesn't matter. Just move on. -``` +Alfredo Deza: +So having said that, on Coursera specifically with Duke University, we tend to put courses out there that are going to be used in our programs in the things that I teach. For example, we just released the large language models. Specialization and specialization is a grouping of between four and six courses. So in there we have doing large language models with Azure, for example, introduction to generative AI, having a very simple rag pattern with Qdrant. I also have examples on how to do it with Azure AI search, which I think is pretty cool as well. How to do it locally with Llama file, which I think is great. 
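+
+A rough sketch of that simple RAG pattern with Qdrant, assuming `qdrant-client` is installed with the fastembed extra; the documents, collection name, and the `generate_answer` helper standing in for the language model are all illustrative:
+
+```python
+from qdrant_client import QdrantClient
+
+def generate_answer(question: str, context: str) -> str:
+    # Placeholder for the LLM call (Llamafile, Phi-2, an API, etc.); not a real client.
+    return f"Answering '{question}' using:\n{context}"
+
+# In-memory instance again; fastembed computes the embeddings behind add()/query().
+client = QdrantClient(":memory:")
+
+client.add(
+    collection_name="docs",
+    documents=[
+        "Qdrant can run in memory for quick prototypes and teaching.",
+        "Retrieval augmented generation grounds the model in retrieved text.",
+    ],
+    metadata=[{"source": "lesson-1"}, {"source": "lesson-2"}],
+    ids=[1, 2],
+)
+
+question = "How can I prototype a vector database without infrastructure?"
+hits = client.query(collection_name="docs", query_text=question, limit=2)
+
+context = "\n".join(hit.document for hit in hits)
+print(generate_answer(question, context))
+```
+
+The retrieved documents become the open book the model answers from.
+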
You can have all of these large language models running locally, and then you have a little bit of Qdrant sprinkle over there, and then you have rack pattern. Now, I tend to teach with things that I really like, and I'll give you a quick example. -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +Alfredo Deza: +I think there's three data sets that are one of the top three most used data sets in all of machine learning and data science. Those are the Boston housing market, the diabetes data set in the US, and the other one is the Titanic. And everybody uses those. And I don't really understand why. I mean, perhaps I do understand why. It's because they're easy, they're clean, they're ready to go. Nothing's ever wrong with these, and everybody has used them to boredom. But for the life of me, you wouldn't be able to convince me to use any of those, because these are not topics that I really care about and they don't resonate with me. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter { - must: vec![Condition::matches("city", "London".to_string())], - must_not: vec![Condition::matches("color", "red".to_string())], - ..Default::default() - }), - ) - .await?; +Alfredo Deza: +The Titanic specifically is just horrid. Well, if I was 37 and I'm on first class and I'm male, would I survive? It's like, what are we trying to do here? How is this useful to anyone? So I tend to use things that I like, and I'm really passionate about wine. So I built my own data set, which is a collection of wines from all over the world, they have the ratings, they have the region, they have the type of grape and the notes and the name of the wine. So when I'm teaching them, like, look at this, this is amazing. It's wines from all over the world. So let's do a little bit of things here. So, for rag, what I was able to do is actually in the courses as well. I do, ah, I really know wines from Argentina, but these wines, it would be amazing if you can find me not a Malbec, but perhaps a cabernet franc. -``` +Alfredo Deza: +That is amazing. From, it goes through Qdrant, goes back to llama file using some large language model or even small language model, like the Phi 2 from Microsoft, I think is really good. And he goes, it tells. Yeah, sure. I get that you want to have some good wines. Here's some good stuff that I can give you. And so it's great, right? I think it's great. So I think those kinds of things that are interesting to the person that is teaching or presenting, I think that's the key, because whenever you're talking about things that are very boring, that you do not care about, things are not going to go well for you. -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +Alfredo Deza: +I mean, if I didn't like teaching, if I didn't like vector databases, you would tell right away. It's like, well, yes, I've been doing stuff with the vector databases. They're good. Yeah, Qdrant, very good. You would tell right away. I can't lie. Very good. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +You can't fool anybody. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addMust(matchKeyword("city", "London")) - .addMustNot(matchKeyword("color", "red")) - .build()) - .build()) - .get(); +Alfredo Deza: +No. -``` +Demetrios: +Well, dude, this is awesome. We will drop a link to the chat. 
We will drop a link to the course in the chat so that in case anybody does want to go on this wine tasting journey with you, they can. And I'm sure there's all kinds of things that will spark the creativity of the students as they go through it, because when you were talking about that, I was like, oh, it would be really cool to make that same type of thing, but with ski resorts there, you go around the world. And if I want this type of ski resort, I'm going to just ask my chat bot. So I'm excited to see what people create with it. I also really appreciate you coming on here, giving us your time and talking through all this. It's been a pleasure, as always, Alfredo. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +Thank you so much. -var client = new QdrantClient("localhost", 6334); +Alfredo Deza: +Yeah, thank you. Thank you for having me. Always happy to chat with you. I think Qdrant is doing a very solid product. Hopefully, my wish list item of in memory in rust comes to fruition, but I get it. Sometimes there are other priorities. It's all good. Yeah. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("city", "London") & !MatchKeyword("color", "red") -); +Alfredo Deza: +If anyone wants to connect with me, I'm always active on LinkedIn primarily. Always happy to connect with folks and talk about learning and improving and always being a better person. -``` +Demetrios: +Excellent. Well, we will sign off, and if anyone else out there wants to come on here and talk to us about vector databases, we're always happy to have you. Feel free to reach out. And remember, don't get lost in vector space, folks. We will see you on the next one. -```go -import ( - "context" +Sabrina Aquino: +Good night. Thank you so much. - "github.com/qdrant/go-client/qdrant" -) +<|page-362-lllmstxt|> +> *"We love Qdrant! It is our default DB. We support it in three different forms, file based, container based, and cloud based as well.”*\ +— Piero Savastano +> -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Piero Savastano is the Founder and Maintainer of the open-source project, Cheshire Cat AI. He started in Deep Learning pure research. He wrote his first neural network from scratch at the age of 19. After a period as a researcher at La Sapienza and CNR, he provides international consulting, training, and mentoring services in the field of machine and deep learning. He spreads Artificial Intelligence awareness on YouTube and TikTok. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, - MustNot: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, -}) +> *"Another feature is the quantization because with this Qdrant feature we improve the accuracy at the performance. We use the scalar quantitation because we are model agnostic and other quantitation like the binary quantitation.”*\ +— Nicola Procopio +> -``` +Nicola Procopio has more than 10 years of experience in data science and has worked in different sectors and markets from Telco to Healthcare. At the moment he works in the Media market, specifically on semantic search, vector spaces, and LLM applications. He has worked in the R&D area on data science projects and he has been and is currently a contributor to some open-source projects like Cheshire Cat. 
He is the author of popular science articles about data science on specialized blogs. -Filtered points would be: +***Listen to the episode on [Spotify](https://open.spotify.com/episode/2d58Xui99QaUyXclIE1uuH?si=68c5f1ae6073472f), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/K40DIG9ZzAU?feature=shared).*** -```json -[\ - { "id": 1, "city": "London", "color": "green" },\ - { "id": 3, "city": "London", "color": "blue" }\ -] + -``` + -In this case, the conditions are combined by `AND`. +## **Top takeaways:** -Also, the conditions could be recursively nested. Example: +Did you know that companies across Italy, Germany, and the USA are already harnessing the power of Cheshire Cat for a variety of nifty purposes? It's not just a pretty face; it's evolved from a simple tutorial to an influential framework! -httppythontypescriptrustjavacsharpgo +It’s time to learn how to meow! Piero in this episode of Vector Space Talks discusses the community and open-source nature that contributes to the framework's success and expansion while Nicola reveals the Cheshire Cat’s use of Qdrant and quantization to enhance search accuracy and performance in a hybrid mode. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must_not": [\ - {\ - "must": [\ - { "key": "city", "match": { "value": "London" } },\ - { "key": "color", "match": { "value": "red" } }\ - ]\ - }\ - ] - } -} +Here are the highlights from this episode: -``` +1. **The Art of Embedding:** Discover how Cheshire Cat uses collections with an embedder, fine-tuning them through scalar quantization and other methods to enhance accuracy and performance. +2. **Vectors in Harmony:** Get the lowdown on storing quantized vectors in a hybrid mode – it's all about saving memory without compromising on speed. +3. **Memory Matters:** Scoop on managing different types of memory within Qdrant, the go-to vector DB for Cheshire Cat. +4. **Community Chronicles:** Talking about the growing community that's shaping the evolution of Cheshire Cat - from enthusiasts to core contributors! +5. **Looking Ahead:** They've got grand plans brewing for a cloud version of Cheshire Cat. Imagine a marketplace buzzing with user-generated plugins. This is the future they're painting! -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must_not=[\ - models.Filter(\ - must=[\ - models.FieldCondition(\ - key="city", match=models.MatchValue(value="London")\ - ),\ - models.FieldCondition(\ - key="color", match=models.MatchValue(value="red")\ - ),\ - ],\ - ),\ - ], - ), -) +> Fun Fact: The Cheshire Cat community on Discord plays a crucial role in the development and user support of the framework, described humorously by Piero as "a mess" due to its large and active nature. 
+> -``` +## Show notes: -```typescript -client.scroll("{collection_name}", { - filter: { - must_not: [\ - {\ - must: [\ - {\ - key: "city",\ - match: { value: "London" },\ - },\ - {\ - key: "color",\ - match: { value: "red" },\ - },\ - ],\ - },\ - ], - }, -}); +00:00 Powerful open source framework.\ +06:11 Tutorials, code customization, conversational forms, community challenges.\ +09:09 Exploring Qdrant's memory features.\ +13:02 Qdrant experiments with document quantization.\ +17:52 Explore details, export, and memories.\ +20:42 Addressing challenges in ensuring Cheshire Cat's reliability.\ +23:36 Leveraging cool features presents significant challenges.\ +27:06 Plugin-based approach distinguishes the CAT framework.\ +29:28 Wrap up -``` +## More Quotes from Piero and Nicola: -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +*"We have a little partnership going on with Qdrant because the native DB in this framework is Qdrant.”*\ +— Piero Savastano -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must_not([Filter::must(\ - [\ - Condition::matches("city", "London".to_string()),\ - Condition::matches("color", "red".to_string()),\ - ],\ - )\ - .into()])), - ) - .await?; +*"We explore the feature, the Qdrant aliases feature, and we call this topic the drunken cut effect because if we have several embedders, for example two model, two embedders with the same dimension, we can put in the collection in the episodic or declarative collection factors from two different embeddings with the same dimension. But the points are different for the same sentences and for the cat is like for the human, when he mixes drinks he has a big headache and don't understand what it retrieved.”*\ +— Nicola Procopio -``` +*"It's a classic language model assistant chat we have for each message you have explainability, you can upload documents. This is all handled automatically and we start with new stuff. You have a memory page where you can search through the memories of your cat, delete, explore collections, collection from Qdrant.”*\ +— Piero Savastano -```java -import java.util.List; +*"Because I'm a researcher, a data scientist, I like to play with strange features like binary quantization, but we need to maintain the focus on the user needs, on the user behavior.”*\ +— Nicola Procopio -import static io.qdrant.client.ConditionFactory.filter; -import static io.qdrant.client.ConditionFactory.matchKeyword; +## Transcript: +Demetrios: +What is up, good people of the Internet? We are here for another one of these vector space talks and I've got to say it's a special day. We've got the folks from Cheshire Cat coming at you full on today and I want to get it started right away because I know they got a lot to talk about. And today we get a two for one discount. It's going to be nothing like you have experienced before. Or maybe those are big words. I'm setting them up huge. We've got Piero coming at us live. Where you at, Piero? Piero, founder. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +There he is, founder at Cheshire Cat. And you are joined today by Nicola, one of the core contributors. It's great to have you both very excited. So you guys are going to be talking to us all about what you poetically put how to meow on the long tail with Cheshire Cat. And so I know you've got some slides prepared. 
I know you've got all that fun stuff working right now and I'm going to let you hop right into it so we don't waste any time. You ready? Who wants to share their screen first? Is it you, Nicola, or go? -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addMustNot( - filter( - Filter.newBuilder() - .addAllMust( - List.of( - matchKeyword("city", "London"), - matchKeyword("color", "red"))) - .build())) - .build()) - .build()) - .get(); +Piero Savastano: +I'll go. Thanks. -``` +Demetrios: +Here we go. Man, you should be seeing it right now. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Piero Savastano: +Yes. -var client = new QdrantClient("localhost", 6334); +Demetrios: +Boom. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: new Filter { MustNot = { MatchKeyword("city", "London") & MatchKeyword("color", "red") } } -); +Piero Savastano: +Let's go. Thank you, Demetrios. We're happy to be hosted at the vector space talk. Let's talk about the Cheshire Cat AI. This is an open source framework. We have a little partnership going on with Qdrant because the native DB in this framework is Qdrant. It's a python framework. And before starting to get into the details, I'm going to show you a little video. -``` +Piero Savastano: +This is the website. So you see, it's a classic language model assistant chat we have for each message you have explainability, you can upload documents. This is all handled automatically and we start with new stuff. You have a memory page where you can search through the memories of your cat, delete, explore collections, collection from Qdrant. We have a plugin system and you can publish any plugin. You can sell your plugin. There is a big ecosystem already and we also give explanation on memories. We have adapters for the most common language models. -```go -import ( - "context" +Piero Savastano: +Dark team, you can do a lot of stuff with the framework. This is how it presents itself. We have a blog with tutorials, but going back to our numbers, it is open source, GPL licensed. We have some good numbers. We are mostly active in Italy and in a good part of Europe, East Europe, and also a little bit of our communities in the United States. There are a lot of contributors already and our docker image has been downloaded quite a few times, so it's really easy to start up and running because you just docker run and you're good to go. We have also a discord server with thousands of members. If you want to join us, it's going to be fun. - "github.com/qdrant/go-client/qdrant" -) +Piero Savastano: +We like meme, we like to build culture around code, so it is not just the code, these are the main components of the cat. You have a chat as usual. The rabbit hole is our module dedicated to document ingestion. You can extend all of these parts. We have an agent manager. Meddetter is the module to manage plugins. We have a vectordb which is Qdrant natively, by the way. We use both the file based Qdrant, the container version, and also we support the cloud version. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Piero Savastano: +So if you are using Qdrant, we support the whole stack. Right now with the framework we have an embedder and a large language model coming to the embedder and language models. 
You can use any language model or embedded you want, closed source API, open Ollama, self hosted anything. These are the main features. So the first feature of the cat is that he's ready to fight. It is already dogsized. It's model agnostic. One command in the terminal and you can meow. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - MustNot: []*qdrant.Condition{ - qdrant.NewFilterAsCondition(&qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - qdrant.NewMatch("color", "red"), - }, - }), - }, - }, -}) +Piero Savastano: +The other aspect is that there is not only a retrieval augmented generation system, but there is also an action agent. This is all customizable. You can plug in any script you want as an agent, or you can customize the ready default presence default agent. And one of our specialty is that we do retrieve augmented generation, not only on documents as everybody's doing, but we do also augmented generation over conversations. I can hear your keyboard. We do augmented generation over conversations and over procedures. So also our tools and form conversational forms are embedded into the DB. We have a big plugin system. -``` +Piero Savastano: +It's really easy to use and with different primitives. We have hooks which are events, WordPress style events. We have tools, function calling, and also we just build up a spec for conversational forms. So you can use your assistant to order a pizza, for example, multitool conversation and order a pizza, book a flight. You can do operative stuff. I already told you, and I repeat a little, not just a runner, but it's a full fledged framework. So we built this not to use language model, but to build applications on top of language models. There is a big documentation where all the events are described. -Filtered points would be: +Piero Savastano: +You find tutorials and with a few lines of code you can change the prompt. You can use long chain inspired tools, and also, and this is the big part we just built, you can use conversational forms. We launched directly on GitHub and in our discord a pizza challenge, where we challenged our community members to build up prototypes to support a multi turn conversational pizza order. And the result of this challenge is this spec where you define a pedantic model in Python and then you subclass the pizza form, the cut form from the framework, and you can give examples on utterances that triggers the form, stops the forms, and you can customize the submit function and any other function related to the form. So with a simple subclass you can handle pragmatic, operational, multi turn conversations. And I truly believe we are among the first in the world to build such a spec. We have a lot of plugins. Many are built from the community itself. -```json -[\ - { "id": 1, "city": "London", "color": "green" },\ - { "id": 3, "city": "London", "color": "blue" },\ - { "id": 4, "city": "Berlin", "color": "red" },\ - { "id": 5, "city": "Moscow", "color": "green" },\ - { "id": 6, "city": "Moscow", "color": "blue" }\ -] +Piero Savastano: +Many people is already hosting private plugins. There is a little marketplace independent about plugins. All of these plugins are open source. There are many ways to customize the cat. The big advantage here is no vendor lock in. So since the framework is open and the plugin system can be open, you do not need to pass censorship from big tech giants. 
This is one of the best key points of moving the framework along the open source values for the future. We plan to add the multimodality.

-```

+Piero Savastano:
+At the moment we are text only, but there are plugins to generate images. But we want to have images and sounds natively into the framework. We already accomplished the conversational forms. In a later talk we can speak in more detail about this because it's really cool and we want to integrate a knowledge graph into the framework so we can play with both symbolic vector representations and symbolic network ones like linked data, for example wikidata. This stuff is going to be really interesting. Yes, we love Qdrant. It is our default DB. We support it in three different forms, file based, container based, and cloud based also.

-## [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#filtering-conditions) Filtering conditions

+Piero Savastano:
+But from now on I want to give the word to Nicola, who is way more expert on this vector search topic and wrote most of the part related to the DB. So thank you guys. Nicola, to you.

-Different types of values in payload correspond to different kinds of queries that we can apply to them.
-Let’s look at the existing condition variants and what types of data they apply to.

+Nicola Procopio:
+Thanks Piero. Thanks Demetrios. I'm so proud to be hosted here because I'm a Vector Space Talks fan. Okay, Qdrant is the vector DB of the cat and now I will try to explore the features that we use in Cheshire Cat. The first slide explains the cat's memory, because Qdrant is our memory. We have a long term memory in three parts: the episodic memory, where we store and manage the conversation, the chat; the declarative memory, where we store and manage documents; and the procedural memory, where we store and manage the tools. How to manage three memories with several embedders? Because the user can choose his favorite embedder and change it.

-### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#match) Match

+Nicola Procopio:
+We explore the Qdrant aliases feature, and we call this topic the drunken cat effect, because if we have several embedders, for example two models, two embedders with the same dimension, we can put in the episodic or declarative collection vectors from two different embedders with the same dimension. But the points are different for the same sentences, and for the cat it is like for the human: when he mixes drinks he has a big headache and doesn't understand what it retrieved. For us the flow now is this. We create the collection with the name and we use the aliases to.

-jsonpythontypescriptrustjavacsharpgo

+Piero Savastano:
+Label.

-```json
-{
-    "key": "color",
-    "match": {
-        "value": "red"
-    }
-}

+Nicola Procopio:
+This collection with the name of the embedder used. When the user changes the embedder, we check if the embedder has the same dimension. If it has the same dimension, we check also the aliases. If the alias is the same we don't change anything. Otherwise we create another collection, and this is the drunken cat effect, the first feature that we use in the cat. Another feature is the quantization, because with this Qdrant feature we improve the accuracy and the performance. We use the scalar quantization because we are model agnostic, and other quantizations like the binary quantization.

-```

+Nicola Procopio:
+If you read the Qdrant documents, those are experimented not on all embedders but on OpenAI and Cohere.
If I remember well. The scalar quantization is used in the storage step. The vectors are quantized and stored in a hybrid mode: the original vectors on disk, the quantized vectors in RAM, and with this procedure we can use less memory. In case of Qdrant scalar quantization, each float32 element is converted to int8, so a single element needs 75% less memory. In case of big embeddings, like, I don't know, Jina embeddings or Mistral embeddings with more than 1000 elements, this is a big improvement. The second part is the retrieval step. We use a quantized query and the quantized vectors to calculate cosine similarity and we have the top n results, like a simple semantic search pipeline.

-```python
-models.FieldCondition(
-    key="color",
-    match=models.MatchValue(value="red"),
-)

+Nicola Procopio:
+But if we want top n results in quantized mode, the quantized mode has less quality on the information, so we use the oversampling. The oversampling is a simple multiplication. If we want top n with n ten, with an oversampling factor like 1.5, we have 15 quantized results. When we have these 15 quantized results, we retrieve also the same 15 unquantized vectors. And on these unquantized vectors we rescore based on the query and filter the best ten. This is an improvement because the retrieval step is so fast. Yes, because using these tips and tricks, the Cheshire Cat vectors achieve up.

-```

+Piero Savastano:
+Four.

-```typescript
-{
-  key: 'color',
-  match: {value: 'red'}
-}

+Nicola Procopio:
+Times lower memory footprint and two times performance increase. We are so fast using this Qdrant feature. And last but not least, we go in deep on the memory. This is the visualization that Piero showed before. This is the vector space in 2D, we use t-SNE, it is very similar to the Qdrant cloud visualization. For the embeddings we have the search bar, how many vectors we want to retrieve. We can choose the memory and other filters. We can filter on the memory and we can wipe a memory or all memories and clean all our space.

-```

+Nicola Procopio:
+We can go in deep using the details. We can pass on the dot and we have a bubble, or use the detail and we have a list of the first n results near our query for every memory. Last but not least, we can export and share our memory in two modes. The first is exporting the JSON using the export button from the UI. Or, if you are very curious, you can navigate the folder in the project and share the long term memory folder with all the memories. Or the experimental feature is wake up the Dormouse. This feature is simply the download of Qdrant snapshots. This is experimental because the snapshot is very easy to download and we will work on faster methods to use it.

-```rust
-Condition::matches("color", "red".to_string())

+Nicola Procopio:
+But now it works and sometimes some users use this feature. For me that's all, and thank you.

-```

+Demetrios:
+All right, excellent. So that is perfect timing. And I know there have been a few questions coming through in the chat, one from me. I think you already answered, Piero. But when can we have some pistachio gelato made from good old Cheshire Cat?

-```java
-matchKeyword("color", "red");

+Piero Savastano:
+So the plan is to make the cat order gelato from a service, from an API, that can already be done. So we meet somewhere or at our house and gelato is going to come through the cat.
The cat is able to take, each of us can do a different order, but to make the gelato itself, we're going to wait for more open source robotics to come to our way. And then we go also there. -``` +Demetrios: +Then we do that, we can get the full program. How cool is that? Well, let's see, I'll give it another minute, let anyone from the chat ask any questions. This was really cool and I appreciate you all breaking down. Not only the space and what you're doing, but the different ways that you're using Qdrant and the challenges and the architecture behind it. I would love to know while people are typing in their questions, especially for you, Nicola, what have been some of the challenges that you've faced when you're dealing with just trying to get Cheshire Cat to be more reliable and be more able to execute with confidence? -```csharp -using static Qdrant.Client.Grpc.Conditions; +Nicola Procopio: +The challenges are in particular to mix a lot of Qdrant feature with the user needs. Because I'm a researcher, a data scientist, I like to play with strange features like binary quantization, but we need to maintain the focus on the user needs, on the user behavior. And sometimes we cut some feature on the Cheshire cat because it's not important now for for the user and we can introduce some bug, or rather misunderstanding for the user. -MatchKeyword("color", "red"); +Demetrios: +Can you hear me? Yeah. All right, good. Now I'm seeing a question come through in the chat that is asking if you are thinking about cloud version of the cat. Like a SaaS, it's going to come. It's in the works. -``` +Piero Savastano: +It's in the works. Not only you can self host the cat freely, some people install it on a raspberry, so it's really lightweight. We plan to have an osted version and also a bigger plugin ecosystem with a little marketplace. Also user will be able to upload and maybe sell their plugins. So we want to build an know our vision is a WordPress style ecosystem. -```go -import "github.com/qdrant/go-client/qdrant" +Demetrios: +Very cool. Oh, that is awesome. So basically what I'm hearing from Nicola asking about some of the challenges are like, hey, there's some really cool features that we've got in Qdrant, but it's almost like you have to keep your eye on the prize and make sure that you're building for what people need and want instead of just using cool features because you can use cool features. And then Piero, you're saying, hey, we really want to enable people to be able to build more cool things and use all these cool different features and whatever flavors or tools they want to use. But we want to be that ecosystem creator so that anyone can bring and create an app on top of the ecosystem and then enable them to get paid also. So it's not just Cheshire cat getting paid, it's also the contributors that are creating cool stuff. -qdrant.NewMatch("color", "red") +Piero Savastano: +Yeah. Community is the first protagonist without community. I'm going to tell you, the cat started as a tutorial. When chat GPT came out, I decided to do a little rug tutorial and I chose Qdrant as vector. I took OpenAI as a language model, and I built a little tutorial, and then from being a tutorial to show how to build an agent on GitHub, it completely went out of hand. So the whole framework is organically grown? -``` +Demetrios: +Yeah, that's the best. That is really cool. Simone is asking if there's companies that are already using Cheshire cat, and if you can mention a few. 
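
+The scalar quantization and oversampling flow Nicola walked through a little earlier (original float32 vectors on disk, int8 copies in RAM, oversampled candidates rescored with the full-precision vectors) maps roughly onto the qdrant-client calls below. This is an illustrative sketch rather than the actual Cheshire Cat code; the collection name and vector size are placeholders.

+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+# Hybrid storage: original float32 vectors stay on disk, int8 quantized
+# copies stay in RAM (roughly 75% less memory per stored element).
+client.create_collection(
+    collection_name="declarative_memory",  # placeholder collection name
+    vectors_config=models.VectorParams(
+        size=1536,  # depends on the chosen embedder
+        distance=models.Distance.COSINE,
+        on_disk=True,
+    ),
+    quantization_config=models.ScalarQuantization(
+        scalar=models.ScalarQuantizationConfig(
+            type=models.ScalarType.INT8,
+            always_ram=True,
+        )
+    ),
+)
+
+# Oversampling 1.5x: fetch 15 candidates with the quantized vectors,
+# rescore them against the original vectors, keep the best 10.
+hits = client.search(
+    collection_name="declarative_memory",
+    query_vector=[0.1] * 1536,  # embedding of the user query
+    limit=10,
+    search_params=models.SearchParams(
+        quantization=models.QuantizationSearchParams(
+            rescore=True,
+            oversampling=1.5,
+        )
+    ),
+)
+```
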
-For the other types, the match condition will look exactly the same, except for the type used: +Piero Savastano: +Yeah, okay. In Italy, there are at least 1015 companies distributed along education, customer care, typical chatbot usage. Also, one of them in particular is trying to build for public administration, which is really hard to do on the international level. We are seeing something in Germany, like web agencies starting to use the cat a little on the USA. Mostly they are trying to build agents using the cat and Ollama as a runner. And a company in particular presented in a conference in Vegas a pitch about a 3d avatar. Inside the avatar, there is the cat as a linguistic device. -jsonpythontypescriptrustjavacsharpgo +Demetrios: +Oh, nice. -```json -{ - "key": "count", - "match": { - "value": 0 - } -} +Piero Savastano: +To be honest, we have a little problem tracking companies because we still have no telemetry. We decided to be no telemetry for the moment. So I hope companies will contribute and make themselves happen. If that does not, we're going to track a little more. But companies using the cat are at least in the 50, 60, 70. -``` +Demetrios: +Yeah, nice. So if anybody out there is using the cat, and you have not talked to Piero yet, let him know so that he can have a good idea of what you're doing and how you're doing it. There's also another question coming through about the market analysis. Are there some competitors? -```python -models.FieldCondition( - key="count", - match=models.MatchValue(value=0), -) +Piero Savastano: +There are many competitors. When you go down to what distinguishes the cat from many other frameworks that are coming out, we decided since the beginning to go for a plugin based operational agent. And at the moment, most frameworks are retrieval augmented generation frameworks. We have both retrieval augmented generation. We have tooling, we have forms. The tools and the forms are also embedded. So the cat can have 20,000 tools, because we also embed the tools and we make a recall over the function calling. So we scaled up both documents, conversation and tools, conversational forms, and I've not seen anybody doing that till now. -``` +Piero Savastano: +So if you want to build an application, a pragmatic, operational application, to buy products, order pizza, do stuff, have a company assistant. The cat is really good at the moment. -```typescript -{ - key: 'count', - match: {value: 0} -} +Demetrios: +Excellent. -``` +Nicola Procopio: +And the cat has a very big community on discord works. -```rust -Condition::matches("count", 0) +Piero Savastano: +Our discord is a mess. -``` +Demetrios: +You got the best memes around. If that doesn't make people join the discord, I don't know what will. -```java -import static io.qdrant.client.ConditionFactory.match; +Piero Savastano: +Please, Nicola. Sorry for interrupting. -match("count", 0); +Demetrios: +No. -``` +Nicola Procopio: +Okay. The community is a plus for Cheshire Cat because we have a lot of developer user on Discord, and for an open source project, the community is fundamentally 100%. -```csharp -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +Well fellas, this has been awesome. I really appreciate you coming on the vector space talks and sharing about the cat for anybody that is interested. Hopefully they go, they check it out, they join your community, they share some memes and they get involved, maybe even contribute back and create some tools. That would be awesome. 
So Piero and Nicola, I really appreciate your time. We'll see you all later. -Match("count", 0); +Piero Savastano: +Thank you. -``` +Nicola Procopio: +Thank you. -```go -import "github.com/qdrant/go-client/qdrant" +Demetrios: +And for anybody out there that wants to come on to the vector space talks and give us a bit of an update on how you're using Qdrant, we'd love to hear it. Just reach out and we'll schedule you in. Until next time. See y'all. Bye. -qdrant.NewMatchInt("count", 0) +<|page-363-lllmstxt|> +### Summary -``` +A security vulnerability has been discovered in Qdrant affecting all versions +prior to v1.9, described in [CVE-2024-2221](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-2221). +The vulnerability allows an attacker to upload arbitrary files to the +filesystem, which can be used to gain remote code execution. -The simplest kind of condition is one that checks if the stored value equals the given one. -If several values are stored, at least one of them should match the condition. -You can apply it to [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword), [integer](https://qdrant.tech/documentation/concepts/payload/#integer) and [bool](https://qdrant.tech/documentation/concepts/payload/#bool) payloads. +The vulnerability does not materially affect Qdrant cloud deployments, as that +filesystem is read-only and authentication is enabled by default. At worst, +the vulnerability could be used by an authenticated user to crash a cluster, +which is already possible, such as by uploading more vectors than can fit in RAM. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#match-any) Match Any +Qdrant has addressed the vulnerability in v1.9.0 and above with code that +restricts file uploads to a folder dedicated to that purpose. -_Available as of v1.1.0_ +### Action -In case you want to check if the stored value is one of multiple values, you can use the Match Any condition. -Match Any works as a logical OR for the given values. It can also be described as a `IN` operator. +Check the current version of your Qdrant deployment. Upgrade if your deployment +is not at least v1.9.0. -You can apply it to [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword) and [integer](https://qdrant.tech/documentation/concepts/payload/#integer) payloads. +To confirm the version of your Qdrant deployment in the cloud or on your local +or cloud system, run an API GET call, as described in the [Qdrant Cloud Setup +guide](/documentation/cloud/authentication/#test-cluster-access). +If your Qdrant deployment is local, you do not need an API key. -Example: +Your next step depends on how you installed Qdrant. For details, read the +[Qdrant Installation](/documentation/guides/installation/) +guide. -jsonpythontypescriptrustjavacsharpgo +#### If you use the Qdrant container or binary -```json -{ - "key": "color", - "match": { - "any": ["black", "yellow"] - } -} +Upgrade your deployment. Run the commands in the applicable section of the +[Qdrant Installation](/documentation/guides/installation/) +guide. The default commands automatically pull the latest version of Qdrant. -``` +#### If you use the Qdrant helm chart -```python -models.FieldCondition( - key="color", - match=models.MatchAny(any=["black", "yellow"]), -) +If you’ve set up Qdrant on kubernetes using a helm chart, follow the README in +the [qdrant-helm](https://github.com/qdrant/qdrant-helm/tree/main?tab=readme-ov-file#upgrading) repository. 
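
+Whichever installation method applies, it can help to confirm the running server version once the upgrade is done. A minimal sketch in Python (assuming HTTP access on the default port; add an `api-key` header if authentication is enabled):

+```python
+import requests
+
+# The root endpoint reports the running Qdrant version.
+response = requests.get("http://localhost:6333/")
+response.raise_for_status()
+
+version = response.json().get("version", "")
+print(f"Qdrant version: {version}")
+
+# Anything below 1.9 is still affected by CVE-2024-2221.
+major, minor = (int(part) for part in version.split(".")[:2])
+assert (major, minor) >= (1, 9), "Deployment is older than v1.9.0 - upgrade it."
+```
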
+Make sure applicable configuration files point to version v1.9.0 or above. -``` +#### If you use the Qdrant cloud -```typescript -{ - key: 'color', - match: {any: ['black', 'yellow']} -} +No action is required. This vulnerability does not materially affect you. However, we suggest that you upgrade your cloud deployment to the latest version. -``` +> Note: This article has been updated on 2024-05-10 to encourage users to upgrade to 1.9.0 to ensure protection from both CVE-2024-2221 and CVE-2024-3829. -```rust -Condition::matches("color", vec!["black".to_string(), "yellow".to_string()]) +<|page-364-lllmstxt|> +Today, we're happy to announce that **FastLLM (FLLM)**, our lightweight Language Model tailored specifically for Retrieval Augmented Generation (RAG) use cases, has officially entered Early Access! -``` +Developed to seamlessly integrate with Qdrant, **FastLLM** represents a significant leap forward in AI-driven content generation. Up to this point, LLM’s could only handle up to a few million tokens. -```java -import static io.qdrant.client.ConditionFactory.matchKeywords; +**As of today, FLLM offers a context window of 1 billion tokens.** -matchKeywords("color", List.of("black", "yellow")); +However, what sets FastLLM apart is its optimized architecture, making it the ideal choice for RAG applications. With minimal effort, you can combine FastLLM and Qdrant to launch applications that process vast amounts of data. Leveraging the power of Qdrant's scalability features, FastLLM promises to revolutionize how enterprise AI applications generate and retrieve content at massive scale. -``` +> *“First we introduced [FastEmbed](https://github.com/qdrant/fastembed). But then we thought - why stop there? Embedding is useful and all, but our users should do everything from within the Qdrant ecosystem. FastLLM is just the natural progression towards a large-scale consolidation of AI tools.” Andre Zayarni, President & CEO, Qdrant* +> -```csharp -using static Qdrant.Client.Grpc.Conditions; +## Going Big: Quality & Quantity -Match("color", ["black", "yellow"]); +Very soon, an LLM will come out with a context window so wide, it will completely eliminate any value a measly vector database can add. -``` +***We know this. That’s why we trained our own LLM to obliterate the competition. Also, in case vector databases go under, at least we'll have an LLM left!*** -```go -import "github.com/qdrant/go-client/qdrant" +As soon as we entered Series A, we knew it was time to ramp up our training efforts. FLLM was trained on 300,000 NVIDIA H100s connected by 5Tbps Infiniband. It took weeks to fully train the model, but our unified efforts produced the most powerful LLM known to human
…or LLM.

-qdrant.NewMatchKeywords("color", "black", "yellow")

+We don’t see how any other company can compete with FastLLM. Most of our competitors will soon be burning through graphics cards trying to get to the next best thing. But it is too late. By this time next year, we will have left them in the dust.

-```

+> ***“Everyone has an LLM, so why shouldn’t we? Let’s face it - the more products and features you offer, the more they will sign up. Sure, this is a major pivot…
but life is all about being bold.”*** *David Myriel, Director of Product Education, Qdrant* +> -In this example, the condition will be satisfied if the stored value is either `black` or `yellow`. +## Extreme Performance -If the stored value is an array, it should have at least one value matching any of the given values. E.g. if the stored value is `["black", "green"]`, the condition will be satisfied, because `"black"` is in `["black", "yellow"]`. +Qdrant’s R&D is proud to stand behind the most dramatic benchmark results. Across a range of standard benchmarks, FLLM surpasses every single model in existence. In the [Needle In A Haystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack) (NIAH) test, FLLM found the embedded text with 100% accuracy, always within blocks containing 1 billion tokens. We actually believe FLLM can handle more than a trillion tokens, but it’s quite possible that it is hiding its true capabilities. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#match-except) Match Except +FastLLM has a fine-grained mixture-of-experts architecture and a whopping 1 trillion total parameters. As developers and researchers delve into the possibilities unlocked by this new model, they will uncover new applications, refine existing solutions, and perhaps even stumble upon unforeseen breakthroughs. As of now, we're not exactly sure what problem FLLM is solving, but hey, it's got a lot of parameters! -_Available as of v1.2.0_ +> *Our customers ask us “What can I do with an LLM this extreme?” I don’t know, but it can’t hurt to build another RAG chatbot.” Kacper Lukawski, Senior Developer Advocate, Qdrant* +> -In case you want to check if the stored value is not one of multiple values, you can use the Match Except condition. -Match Except works as a logical NOR for the given values. -It can also be described as a `NOT IN` operator. +## Get Started! -You can apply it to [keyword](https://qdrant.tech/documentation/concepts/payload/#keyword) and [integer](https://qdrant.tech/documentation/concepts/payload/#integer) payloads. +Don't miss out on this opportunity to be at the forefront of AI innovation. Join FastLLM's Early Access program now and embark on a journey towards AI-powered excellence! -Example: +Stay tuned for more updates and exciting developments as we continue to push the boundaries of what's possible with AI-driven content generation. -jsonpythontypescriptrustjavacsharpgo +Happy Generating! 🚀 -```json -{ - "key": "color", - "match": { - "except": ["black", "yellow"] - } -} +[Sign Up for Early Access](https://qdrant.to/cloud) -``` +<|page-365-lllmstxt|> +> *"It's like mandatory to have a vector database that is scalable, that is fast, that has low latencies, that can under parallel request a large amount of requests. So you have really this need and Qdrant was like an obvious choice.”*\ +— Guillaume Marquis +> -```python -models.FieldCondition( - key="color", - match=models.MatchExcept(**{"except": ["black", "yellow"]}), -) +Guillaume Marquis, a dedicated Engineer and AI enthusiast, serves as the Chief Technology Officer and Co-Founder of VirtualBrain, an innovative AI company. He is committed to exploring novel approaches to integrating artificial intelligence into everyday life, driven by a passion for advancing the field and its applications. -``` +***Listen to the episode on [Spotify](https://open.spotify.com/episode/20iFzv2sliYRSHRy1QHq6W?si=xZqW2dF5QxWsAN4nhjYGmA), Apple Podcast, Podcast addicts, Castbox. 
You can also watch this episode on [YouTube](https://youtu.be/v85HqNqLQcI?feature=shared).*** -```typescript -{ - key: 'color', - match: {except: ['black', 'yellow']} -} + -``` + -```rust -use qdrant_client::qdrant::r#match::MatchValue; +## **Top takeaways:** -Condition::matches( - "color", - !MatchValue::from(vec!["black".to_string(), "yellow".to_string()]), -) +Who knew that document retrieval could be creative? Guillaume and VirtualBrain help draft sales proposals using past reports. It's fascinating how tech aids deep work beyond basic search tasks. -``` +Tackling document retrieval and AI assistance, Guillaume furthermore unpacks the ins and outs of searching through vast data using a scoring system, the virtue of [RAG](https://qdrant.tech/rag/rag-evaluation-guide/) for deep work, and going through the 'illusion of work', enhancing insights for knowledge workers while confronting the challenges of scalability and user feedback on hallucinations. -```java -import static io.qdrant.client.ConditionFactory.matchExceptKeywords; +Here are some key insight from this episode you need to look out for: -matchExceptKeywords("color", List.of("black", "yellow")); +1. How to navigate the world of data with a precision scoring system for document retrieval. +2. The importance of fresh data and how to avoid the black holes of outdated info. +3. Techniques to boost system scalability and speed — essential in the vastness of data space. +4. AI Assistants tailored for depth rather than breadth, aiding in tasks like crafting stellar commercial proposals. +5. The intriguing role of user perception in AI tool interactions, plus a dash of timing magic. -``` +> Fun Fact: VirtualBrain uses Qdrant, for its advantages in speed, scalability, and API capabilities. +> -```csharp -using static Qdrant.Client.Grpc.Conditions; -Match("color", ["black", "yellow"]); +## Show notes: -``` +00:00 Hosts and guest recommendations.\ +09:01 Leveraging past knowledge to create new proposals.\ +12:33 Ingesting and parsing documents for context retrieval.\ +14:26 Creating and storing data, performing advanced searches.\ +17:39 Analyzing document date for accurate information retrieval.\ +20:32 Perceived time can calm nerves and entertain.\ +24:23 Tried various vector databases, preferred open source.\ +27:42 LangFuse: open source tool for monitoring tasks.\ +33:10 AI tool designed to stay within boundaries.\ +34:31 Minimizing hallucination in AI through careful analysis. -```go -import "github.com/qdrant/go-client/qdrant" -qdrant.NewMatchExcept("color", "black", "yellow") +## More Quotes from Guillaume: -``` -In this example, the condition will be satisfied if the stored value is neither `black` nor `yellow`. +*"We only exclusively use open source tools because of security aspects and stuff like that. That's why also we are using Qdrant one of the important point on that. So we have a system, we are using this serverless stuff to ingest document over time.”*\ +— Guillaume Marquis -If the stored value is an array, it should have at least one value not matching any of the given values. E.g. if the stored value is `["black", "green"]`, the condition will be satisfied, because `"green"` does not match `"black"` nor `"yellow"`. +*"One of the challenging part was the scalability of the system. We have clients that come with terra octave of data and want to be parsed really fast and so you have the ingestion, but even after the semantic search, even on a large data set can be slow. And today ChatGPT answers really fast. 
So your users, even if the question is way more complicated to answer than a basic ChatGPT question, they want to have their answer in seconds. So you have also this challenge that you really have to take care.”*\ +— Guillaume Marquis -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#nested-key) Nested key +*"Our AI is not trained to write you a speech based on Shakespeare and with the style of Martin Luther King. It's not the purpose of the tool. So if you ask something that is out of the box, he will just say like, okay, I don't know how to answer that. And that's an important point. That's a feature by itself to be able to not go outside of the box.”*\ +— Guillaume Marquis -_Available as of v1.1.0_ +## Transcript: +Demetrios: +So, dude, I'm excited for this talk. Before we get into it, I want to make sure that we have some pre conversation housekeeping items that go out, one of which being, as always, we're doing these vector space talks and everyone is encouraged and invited to join in. Ask your questions, let us know where you're calling in from, let us know what you're up to, what your use case is, and feel free to drop any questions that you may have in the chat. We will be monitoring it like a hawk. Today I am joined by none other than Sabrina. How are you doing, Sabrina? -Payloads being arbitrary JSON object, it is likely that you will need to filter on a nested field. +Sabrina Aquino: +What's up, Demetrios? I'm doing great. Excited to be here. I just love seeing what amazing stuff people are building with Qdrant and. Yeah, let's get into it. -For convenience, we use a syntax similar to what can be found in the [Jq](https://stedolan.github.io/jq/manual/#Basicfilters) project. +Demetrios: +Yeah. So I think I see Sabrina's wearing a special shirt which is don't get lost in vector space shirt. If anybody wants a shirt like that. There we go. Well, we got you covered, dude. You will get one at your front door soon enough. If anybody else wants one, come on here. Present at the next vector space talks. -Suppose we have a set of points with the following payload: +Demetrios: +We're excited to have you. And we've got one last thing that I think is fun that we can talk about before we jump into the tech piece of the conversation. And that is I told Sabrina to get ready with some recommendations. Know vector databases, they can be used occasionally for recommendation systems, but nothing's better than getting that hidden gem from your friend. And right now what we're going to try and do is give you a few hidden gems so that the next time the recommendation engine is working for you, it's working in your favor. And Sabrina, I asked you to give me one music that you can recommend, one show and one rando. So basically one random thing that you can recommend to us. -```json -[\ - {\ - "id": 1,\ - "country": {\ - "name": "Germany",\ - "cities": [\ - {\ - "name": "Berlin",\ - "population": 3.7,\ - "sightseeing": ["Brandenburg Gate", "Reichstag"]\ - },\ - {\ - "name": "Munich",\ - "population": 1.5,\ - "sightseeing": ["Marienplatz", "Olympiapark"]\ - }\ - ]\ - }\ - },\ - {\ - "id": 2,\ - "country": {\ - "name": "Japan",\ - "cities": [\ - {\ - "name": "Tokyo",\ - "population": 9.3,\ - "sightseeing": ["Tokyo Tower", "Tokyo Skytree"]\ - },\ - {\ - "name": "Osaka",\ - "population": 2.7,\ - "sightseeing": ["Osaka Castle", "Universal Studios Japan"]\ - }\ - ]\ - }\ - }\ -] +Sabrina Aquino: +So I've picked. I thought about this. Okay, I give it some thought. 
The movie would be Catch Me If You Can by Leo DiCaprio and Tom Hanks. Have you guys watched it? Really good movie. The song would be oh, children by knee cave and the bad scenes. Also very good song. And the random recommendation is my favorite scented candle, which is citrus notes, sea salt and cedar. -``` +Sabrina Aquino: +So there you go. -You can search on a nested field using a dot notation. +Demetrios: +A scented candle as a recommendation. I like it. I think that's cool. I didn't exactly tell you to get ready with that. So I'll go next, then you can have some more time to think. So for anybody that's joining in, we're just giving a few recommendations to help your own recommendation engines at home. And we're going to get into this conversation about rags in just a moment. But my song is with. -httppythontypescriptrustjavacsharpgo +Demetrios: +Oh, my God. I've been listening to it because I didn't think that they had it on Spotify, but I found it this morning and I was so happy that they did. And it is Bill Evans and Chet Baker. Basically, their whole album, the legendary sessions, is just like, incredible. But the first song on that album is called Alone Together. And when Chet Baker starts playing his little trombone, my God, it is like you can feel emotion. You can touch it. That is what I would recommend. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "should": [\ - {\ - "key": "country.name",\ - "match": {\ - "value": "Germany"\ - }\ - }\ - ] - } -} +Demetrios: +Anyone out there? I'll drop a link in the chat if you like it. The film or series. This fool, if you speak Spanish, it's even better. It is amazing series. Get that, do it. And as the rando thing, I've been having Rishi mushroom powder in my coffee in the mornings. I highly recommend it. All right, last one, let's get into your recommendations and then we'll get into this rag chat. -``` +Guillaume Marquis: +So, yeah, I sucked a little bit. So for the song, I think I will give something like, because I'm french, I think you can hear it. So I will choose Get Lucky of Daft Punk and because I am a little bit sad of the end of their collaboration. So, yeah, just like, I cannot forget it. And it's a really good music. Like, miss them as a movie, maybe something like I really enjoy. So we have a lot of french movies that are really nice, but something more international maybe, and more mainstream. Jungle of Tarantino, that is really a good movie and really enjoy it. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - should=[\ - models.FieldCondition(\ - key="country.name", match=models.MatchValue(value="Germany")\ - ),\ - ], - ), -) +Guillaume Marquis: +I watched it several times and still a good movie to watch. And random thing, maybe a city. A city to go to visit. I really enjoyed. It's hard to choose. Really hard to choose a place in general. Okay, Florence, like in Italy. -``` +Demetrios: +There we go. -```typescript -client.scroll("{collection_name}", { - filter: { - should: [\ - {\ - key: "country.name",\ - match: { value: "Germany" },\ - },\ - ], - }, -}); +Guillaume Marquis: +Yeah, it's a really cool city to go. So if you have time, and even Sabrina, if you went to Europe soon, it's really a nice place to go. -``` +Demetrios: +That is true. Sabrina is going to Europe soon. We're blowing up her spot right now. So hopefully Florence is on the list. 
I know that most people watching did not tune in to hearing the three of us just randomly give recommendations. We are here to talk more about retrieval augmented generation. But hopefully those recommendations help some of you all at home with your recommendation engines. And you're maybe using a little bit of a vector database in your recommendation engine building skills. -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +Demetrios: +Let's talk about this, though, man, because I think it would be nice if you can set the scene. What exactly are you working on? I know you've got virtual brain. Can you tell us a little bit about that so that we can know how you're doing rags? -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([\ - Condition::matches("country.name", "Germany".to_string()),\ - ])), - ) - .await?; +Guillaume Marquis: +Because rag is like, I think the most famous word in the AI sphere at the moment. So, virtual brain, what we are building in particular is that we are building an AI assistant for knowledge workers. So we are not only building this next gen search bar to search content through documents, it's a tool for enterprises at enterprise grade that provide some easy way to interact with your knowledge. So basically, we create a tool that we connect to the world knowledge of the company. It could be whatever, like the drives, sharepoints, whatever knowledge you have, any kind of documents, and with that you will be able to perform tasks on your knowledge, such as like audit, RFP, due diligence. It's not only like everyone that is building rag or building a kind of search system through rag are always giving the same number. Is that like 20%? As a knowledge worker, you spend 20% of your time by searching information. And I think I heard this number so much time, and that's true, but it's not enough. -``` +Guillaume Marquis: +Like the search bar, a lot of companies, like many companies, are working on how to search stuff for a long time, and it's always a subject. But the real pain and what we want to handle and what we are handling is deep work, is real tasks, is how to help these workers, to really help them as an assistant, not only on search bar, like as an assistant on real task, real added value tasks. So inside that, can you give us. -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +Demetrios: +An example of that? Is it like that? It pops up when it sees me working on notion and talking about or creating a PRD, and then it says, oh, this might be useful for your PRD because you were searching about that a week ago or whatever. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Guillaume Marquis: +For instance. So we are working with companies that have from 100 employees to several thousand employees. For instance, when you have to create a commercial proposal as a salesperson in a company, you have an history with a company, an history in this ecosystem, a history within this environment, and you have to capitalize on all this commercial proposition that you did in the past in your company, you can have thousands of propositions, you can have thousands of documents, you can have reporting from different departments, depending of the industry you are working on, and with that, with the tool. 
So you can ask question, you can capitalize on this document, and you can easily create new proposal by asking question, by interacting with the tool, to go deeply in this use case and to create something that is really relevant for your new use case. And that is using really the knowledge that you have in your company. And so it's not only like retrieve or just like find me as last proposition of this client. It's more like, okay, use x past proposals to create a new one. And that's a real challenge that is linked to our subject. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addShould(matchKeyword("country.name", "Germany")) - .build()) - .build()) - .get(); +Guillaume Marquis: +It's because it's not only like retrieve one, two or even ten documents, it's about retrieving like hundred, 200, a lot of documents, a lot of information, and you have a real something to do with a lot of documents, a lot of context, a lot of information you have to manage. -``` +Demetrios: +I have the million dollar question that I think is probably coming through everyone's head is like, you're retrieving so many documents, how are you evaluating your retrieval? -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Guillaume Marquis: +That's definitely the $1 million question. It's a toss task to do, to be honest. To be fair. Currently what we are doing is that we monitor every tasks of the process, so we have the output of every tasks. On each tasks we use a scoring system to evaluate if it's relevant to the initial question or the initial task of the user. And we have a global scoring system on all the system. So it's quite odd, it's a little bit empiric, but it works for now. And it really help us to also improve over time all the tasks and all the processes that are done by the tool. -var client = new QdrantClient("localhost", 6334); +Guillaume Marquis: +So it's really important. And for instance, you have this kind of framework that is called RAGtriad. That is a way to evaluate rag on the accuracy of the context you retrieve on the link with the initial question and so on, several parameters. And you can really have a first way to evaluate the quality of answers and the quality of everything on each steps. -await client.ScrollAsync(collectionName: "{collection_name}", filter: MatchKeyword("country.name", "Germany")); +Sabrina Aquino: +I love it. Can you go more into the tech that you use for each one of these steps in architecture? -``` +Guillaume Marquis: +So the process is quite like, it starts at the moment we ingest documents because basically it's hard to retrieve good documents or retrieve documents in a proper way if you don't parse it well. If you just like the dumb rug, as I call it, is like, okay, you take a document, you divide it in text, and that's it. But you will definitely lose the context, the global context of the document, what the document in general is talking about. And you really need to do it properly and to keep this context. And that's a real challenge, because if you keep some noises, if you don't do that well, everything will be broken at the end. So technically how it works. So we have a proper system that we developed to ingest documents using technologies, open source technologies. We only exclusively use open source tools because of security aspects and stuff like that. 
-```go -import ( - "context" +Guillaume Marquis: +That's why also we are using Qdrant one of the important point on that. So we have a system, we are using this serverless stuff to ingest document over time. We have also models that create tags on documents. So we use open source slms to tag documents, to enrich documents, also to create a new title, to create a summary of documents, to keep the context. When we divide the document, we keep the title of paragraphers, the context inside paragraphers, and we leak every piece of text between each other to keep the context after that, when we retrieve the document. So it's like the retrieving part. We have a new breed search system. We are using Qdrant on the semantic port. - "github.com/qdrant/go-client/qdrant" -) +Guillaume Marquis: +So basically we are creating unbelieving, we are storing it into Qdrant. We are performing similarity search to retrieve documents based on title summary filtering, on tags, on the semantic context. And we have also some keyword search, but it's more for specific tasks, like when we know that we need a specific document, at some point we are searching it with a keyword search. So it's like a kind of ebrid system that is using deterministic approach with filtering with tags, and a probabilistic approach with selecting document with this ebot search, and doing a scoring system after that to get what is the most relevant document and to select how much content we will take from each document. It's a little bit techy, but it's really cool to create and we have a way to evolve it and to improve it. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +That's what we like around here, man. We want the techie stuff. That's what I think everybody signed up for. So that's very cool. One question that definitely comes up a lot when it comes to rags and when you're ingesting documents, and then when you're retrieving documents and updating documents, how do you make sure that the documents that you are, let's say, I know there's probably a hypothetical HR scenario where the company has a certain policy and they say you can have European style holidays, you get like three months of holidays a year, or even French style holidays. Basically, you just don't work. And whenever you want, you can work, you don't work. And then all of a sudden a US company comes and takes it over and they say, no, you guys don't get holidays. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Should: []*qdrant.Condition{ - qdrant.NewMatch("country.name", "Germany"), - }, - }, -}) +Demetrios: +Even when you do get holidays, you're not working or you are working and so you have to update all the HR documents, right? So now when you have this knowledge worker that is creating something, or when you have anyone that is getting help, like this copilot help, how do you make sure that the information that person is getting is the most up to date information possible? -``` +Guillaume Marquis: +That's a new $1 million question. -You can also search through arrays by projecting inner values using the `[]` syntax. +Demetrios: +I'm coming with the hits today. I don't know what you were looking for. -httppythontypescriptrustjavacsharpgo +Guillaume Marquis: +That's a really good question. So basically you have several possibilities on that. First one you have like this PowerPoint presentation. 
That's a mess in the knowledge bases and sometimes you just want to use the most updated up to date documents. So basically we can filter on the created ad and the date of the documents. Sometimes you want to also compare the evolution of the process over time. So that's another use case. Basically we base. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "should": [\ - {\ - "key": "country.cities[].population",\ - "range": {\ - "gte": 9.0,\ - }\ - }\ - ] - } -} +Guillaume Marquis: +So during the ingestion we are analyzing if date is inside the document, because sometimes in documentation you have like the date at the end of the document or at the beginning of the document. That's a first way to do it. We have the date of the creation of the document, but it's not a source of truth because sometimes you created it after or you duplicated it and the date is not the same, depending if you are working on Windows, Microsoft, stuff like that. It's definitely a mess. And also we compare documents. So when we retry the documents and documents are really similar one to each other, we keep it in mind and we try to give more information as possible. Sometimes it's not possible, so it's not 100%, it's not bulletproof, but it's a real question of that. So it's a partial answer of your question, but it's like some way we are today filtering and answering on this special topic. -``` +Sabrina Aquino: +Now I wonder what was the most challenging part of building this frag since there was like. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - should=[\ - models.FieldCondition(\ - key="country.cities[].population",\ - range=models.Range(\ - gt=None,\ - gte=9.0,\ - lt=None,\ - lte=None,\ - ),\ - ),\ - ], - ), -) +Guillaume Marquis: +There are a lot of parts that are really challenging. -``` +Sabrina Aquino: +Challenging. -```typescript -client.scroll("{collection_name}", { - filter: { - should: [\ - {\ - key: "country.cities[].population",\ - range: {\ - gt: null,\ - gte: 9.0,\ - lt: null,\ - lte: null,\ - },\ - },\ - ], - }, -}); +Guillaume Marquis: +One of the challenging part was the scalability of the system. We have clients that come with terra octave of data and want to be parsed really fast and so you have the ingestion, but even after the semantic search, even on a large data set can be slow. And today Chat GPT answer really fast. So your users, even if the question is way more complicated to answer than a basic Chat GPT question, they want to have their answer in seconds. So you have also this challenge that is really you have to take care. So it's quite challenging and it's like this industrial supply chain. So when you upgrade something, you have to be sure that everything is working well on the other side. And that's a real challenge to handle. -``` +Guillaume Marquis: +And we are still on it because we are still evolving and getting more data. And at the end of the day, you have to be sure that everything is working well in terms of LLM, but in terms of research and in terms also a few weeks to give some insight to the user of what is working under the hood, to give them the possibility to wait a few seconds more, but starting to give them pieces of answer. -```rust -use qdrant_client::qdrant::{Condition, Filter, Range, ScrollPointsBuilder}; +Demetrios: +Yeah, it's funny you say that because I remember talking to somebody that was working at you.com and they were saying how there's like the actual time. 
So they were calling it something like perceived time and real, like actual time. So you as an end user, if you get asked a question or maybe there's like a trivia quiz while the question is coming up, then it seems like it's not actually taking as long as it is. Even if it takes 5 seconds, it's a little bit cooler. Or as you were mentioning, I remember reading some paper, I think, on how people are a lot less anxious if they see the words starting to pop up like that and they see like, okay, it's not just I'm waiting and then the whole answer gets spit back out at me. It's like I see the answer forming as it is in real time. And so that can calm people's nerves too. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([\ - Condition::range(\ - "country.cities[].population",\ - Range {\ - gte: Some(9.0),\ - ..Default::default()\ - },\ - ),\ - ])), - ) - .await?; +Guillaume Marquis: +Yeah, definitely. Human's brain is like marvelous on that. And you have a lot of stuff. Like, one of my favorites is the illusion of work. Do you know it? It's the total opposite. If you have something that seems difficult to do, adding more time of processing. So the user will imagine that it's really an OD task to do. And so that's really funny. -``` +Demetrios: +So funny like that. -```java -import static io.qdrant.client.ConditionFactory.range; +Guillaume Marquis: +Yeah. Yes. It's the opposite of what you will think if you create a product, but that's real stuff. And sometimes just to output them that you are performing toss tasks in the background, it helps them to. Oh, yes. My question was really like a complex question, like you have a lot of work to do. It's Axe word like. If you answer too fast, they will not trust the answer. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.Range; -import io.qdrant.client.grpc.Points.ScrollPoints; +Guillaume Marquis: +And it's the opposite if you answer too slow. You can have this. Okay. But it should be dumb because it's really slow. So it's a dumb AI or stuff like that. So that's really funny. My co founder actually was a product guy, so really focused on product, and he really loves this kind of stuff. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addShould( - range( - "country.cities[].population", - Range.newBuilder().setGte(9.0).build())) - .build()) - .build()) - .get(); +Demetrios: +Great thought experiment, that's interesting. -``` +Sabrina Aquino: +And you mentioned like you chose Qdrant because it's open source, but now I wonder if there's also something to do with your need for something that's fast, that's scalable, and what other factors you took in consideration when choosing the vector DB. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Guillaume Marquis: +Yes, so I told you that the scalability and the speed is like one of the most important points and toast part to endure. And yes, definitely, because when you are building a complex rag, you are not like just performing one research, at some points you are doing it maybe like you are splitting the question, doing several at the same time. And so it's like mandatory to have a vector database that is scalable, that is fast, that has low latencies, that can under parallel request a large amount of requests. So you have really this need. And Qdrant was like an obvious choose. 
Actually, we did a benchmark, so we really tried several possibilities. -var client = new QdrantClient("localhost", 6334); +Demetrios: +Some tell me more. Yeah. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: Range("country.cities[].population", new Qdrant.Client.Grpc.Range { Gte = 9.0 }) -); +Guillaume Marquis: +So we tried the classic postgres page vectors, that is, I think we tried it like 30 minutes, and we realized really fast that it was really not good for our use case. We tried Weaviate, we tried Milvus, we tried Qdrant, we tried a lot. We prefer use open source because of security issues. We tried Pinecone initially, we were on Pinecone at the beginning of the company. And so the most important point, so we have the speed of the tool, we have the scalability we have also, maybe it's a little bit dumb to say that, but we have also the API. I remember using Pinecone and trying just to get all vectors and it was not possible somehow, and you have this dumb stuff that are sometimes really strange. And if you have a tool that is 100% made for your use case with people that are working on it, really dedicated on that, and that are aligned with your vision of what is the evolution of this. I think it's like the best tool you have to choose. -``` +Demetrios: +So one thing that I would love to hear about too, is when you're looking at your system and you're looking at just the product in general, what are some of the key metrics that you are constantly monitoring, and how do you know that you're hitting them or you're not? And then if you're not hitting them, what are some ways that you debug the situation? -```go -import ( - "context" +Guillaume Marquis: +By metrics you mean like usage metrics. - "github.com/qdrant/go-client/qdrant" -) +Demetrios: +Or like, I'm more thinking on your whole tech setup and the quality of your rag. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Guillaume Marquis: +Basically we are focused on industry of knowledge workers and industry in particular like of consultants. So we have some data set of questions that we know should be answered. Well, we know the kind of outputs we should have. The metrics we are like monitoring on our rag is mostly the accuracy of the answer, the accuracy of sources, the number of hallucination that is sometimes really also hard to manage. Actually our tool is sourcing everything. When you ask a question or when you perform a task, it gives you all the sources. But sometimes you can have a perfect answer and just like one number inside your answer that comes from nowhere, that is totally like invented and that's up to get. We are still working on that. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Should: []*qdrant.Condition{ - qdrant.NewRange("country.cities[].population", &qdrant.Range{ - Gte: qdrant.PtrOf(9.0), - }), - }, - }, -}) +Guillaume Marquis: +We are not the most advanced on this part. We just implemented a tool I think you may know it's LangFuse. Do you know them? LangFuse? -``` +Demetrios: +No. Tell me more. -This query would only output the point with id 2 as only Japan has a city with population greater than 9.0. +Guillaume Marquis: +LangFuse is like a tool that is made to monitor tasks on your rack so you can easily log stuff. It's also open source tool, you can easily self host it and you can monitor every part of your rag. 
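Behind the tooling, the core of what Guillaume describes is replaying a small, hand-built set of questions with known answers and sources through the pipeline and scoring the output. A minimal sketch of that idea; the dataset, the `answer_question` hook, and the metric are all illustrative, and tracing tools such as Langfuse would sit around this, logging each step:

```python
# A tiny domain-specific evaluation set: questions whose correct source is known.
eval_set = [
    {"question": "What is the travel reimbursement limit?", "expected_source": "policy-2023.pdf"},
    {"question": "Who approves vendor contracts?", "expected_source": "procurement-guide.pdf"},
]

def answer_question(question: str) -> tuple[str, list[str]]:
    """Placeholder for the real RAG pipeline: returns (answer, ids of the sources used)."""
    raise NotImplementedError

def source_accuracy(dataset: list[dict]) -> float:
    hits = 0
    for item in dataset:
        _, sources = answer_question(item["question"])
        hits += item["expected_source"] in sources
    return hits / len(dataset)
```

The same loop can score answer accuracy or count unsupported statements once a judge, human or LLM, is plugged in.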
You can create data sets based on questions and answers that has been asked or some you created by yourself. And you can easily perform like check of your rag just to trade out and to give a final score of it, and to be able to monitor everything and to give global score based on your data set of your rag. So we are currently implementing it. I give their name because it's wonderful the work they did, and I really enjoyed it. It's one of the most important points to not be blind. I mean, in general, in terms of business, you have to follow metrics. -And the leaf nested field can also be an array. +Guillaume Marquis: +Numbers cannot lie. Humans lies, but not numbers. But after that you have to interpret numbers. So that's also another toss part. But it's important to have the good metrics and to be able to know if you are evolving it, if you are improving your system and if everything is working. Basically the different stuff we are doing, we are not like. -httppythontypescriptrustjavacsharpgo +Demetrios: +Are you collecting human feedback? For the hallucinations part, we try, but. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "should": [\ - {\ - "key": "country.cities[].sightseeing",\ - "match": {\ - "value": "Osaka Castle"\ - }\ - }\ - ] - } -} +Guillaume Marquis: +Humans are not like giving a lot of feedback. -``` +Demetrios: +It's hard. That's why it's really hard the end user to do anything, even just like the thumbs up, thumbs down can be difficult. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - should=[\ - models.FieldCondition(\ - key="country.cities[].sightseeing",\ - match=models.MatchValue(value="Osaka Castle"),\ - ),\ - ], - ), -) +Guillaume Marquis: +We tried several stuff. We have the thumbs up, thumbs down, we tried stars. You ask real feedback to write something, hey, please help us. Human feedback is quite poor, so we are not counting on that. -``` +Demetrios: +I think the hard part about it, at least me as an end user, whenever I've been using these, is like the thumbs down or the, I've even seen it go as far as, like, you have more than just one emoji. Like, maybe you have the thumbs up, you have the thumbs down. You have, like, a mushroom emoji. So it's, like, hallucinated. And you have, like. -```typescript -client.scroll("{collection_name}", { - filter: { - should: [\ - {\ - key: "country.cities[].sightseeing",\ - match: { value: "Osaka Castle" },\ - },\ - ], - }, -}); +Guillaume Marquis: +What was the. -``` +Demetrios: +Other one that I saw that I thought was pretty? I can't remember it right now, but. -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +Guillaume Marquis: +I never saw the mushroom. But that's quite fun. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::should([\ - Condition::matches("country.cities[].sightseeing", "Osaka Castle".to_string()),\ - ])), - ) - .await?; +Demetrios: +Yeah, it's good. It's not just wrong. It's absolutely, like, way off the mark. And what I think is interesting there when I've been the end user is that it's a little bit just like, I don't have time to explain the nuances as to why this is not useful. I really would have to sit down and almost, like, write a book or at least an essay on, yeah, this is kind of useful, but it's like a two out of a five, not a four out of a five. And so that's why I gave it the thumbs down. Or there was this part that is good and that part's bad. 
And so it's just like the ways that you have to, or the nuances that you have to go into as the end user when you're trying to evaluate it, I think it's much better. -``` +Demetrios: +And what I've seen a lot of people do is just expect to do that in house. After the fact, you get all the information back, you see, on certain metrics, like, oh, did this person commit the code? Then that's a good signal that it's useful. But then you can also look at it, or did this person copy paste it? Et cetera, et cetera. And how can we see if they didn't copy paste that or if they didn't take that next action that we would expect them to take? Why not? And let's try and dig into what we can do to make that better. -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +Guillaume Marquis: +Yes. We can also evaluate the next questions, like the following questions. That's a great point. We are not currently doing it automatically, but if you see that a user just answer, no, it's not true, or you should rephrase it or be more concise, or these kind of following questions, you know that the first answer was not as relevant as. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +That's such a great point. Or you do some sentiment analysis and it slowly is getting more and more angry. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addShould(matchKeyword("country.cities[].sightseeing", "Germany")) - .build()) - .build()) - .get(); +Guillaume Marquis: +Yeah, that's true. That's a good point also. -``` +Demetrios: +Yeah, this one went downhill, so. All right, cool. I think that's it. Sabrina, any last questions from your side? -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Sabrina Aquino: +Yeah, I think I'm just very interesting to know from a user perspective, from a virtual brain, how are traditional models worse or what kind of errors virtual brain fixes in their structure, that users find it better that way. -var client = new QdrantClient("localhost", 6334); +Guillaume Marquis: +I think in this particular, so we talked about hallucinations, I think it's like one of the main issues people have on classic elements. We really think that when you create a one size fit all tool, you have some chole because you have to manage different approaches, like when you are creating copilot as Microsoft, you have to under the use cases of, and I really think so. Our AI is not trained to write you a speech based on Shakespeare and with the style of Martin Luther King. It's not the purpose of the tool. So if you ask something that is out of the box, he will just say like, okay, I don't know how to answer that. And that's an important point. That's a feature by itself to be able to not go outside of the box. And so we did this choice of putting the AI inside the box, the box that is containing basically all the knowledge of your company, all the retrieved knowledge. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("country.cities[].sightseeing", "Germany") -); +Guillaume Marquis: +Actually we do not have a lot of hallucination, I will not say like 0%, but it's close to zero. Because we analyze a question, we put the AI in a box, we enforce the AI to think about the answer before answering, and we analyze also the answer to know if the answer is relevant. 
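One way to picture the "AI in a box" check Guillaume describes, analyzing the answer before it is returned and preferring a non-answer over an invented one, is a second model call that verifies the draft against the retrieved context. This is a hedged sketch, not VirtualBrain's actual implementation; the model name and prompt are illustrative:

```python
from openai import OpenAI

client = OpenAI()  # any chat-completions-compatible endpoint works here

def grounded_or_refuse(question: str, context: str, draft_answer: str) -> str:
    verdict = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": (
                "Context:\n" + context
                + "\n\nQuestion: " + question
                + "\nProposed answer: " + draft_answer
                + "\n\nIs every claim in the proposed answer supported by the context? "
                  "Reply with exactly YES or NO."
            ),
        }],
    ).choices[0].message.content.strip().upper()
    # Prefer a non-answer over an unsupported one.
    if verdict.startswith("YES"):
        return draft_answer
    return "I don't know based on the available documents."
```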
And that's an important point that we are fixing and we fix for our user and we prefer yes, to give like non answers and a bad answer. -``` +Sabrina Aquino: +Absolutely. And there are people who think like, hey, this is a rag, it's not going to hallucinate, and that's not the case at all. It will hallucinate less inside a certain context window that you provide. Right. But it still has a possibility. So minimizing that as much as possible is very valuable. -```go -import ( - "context" +Demetrios: +So good. Well, I think with that, our time here is coming to an end. I really appreciate this. I encourage everyone to go and have a little look at virtual brain. We'll drop a link in the comment in case anyone wants free to sign up. - "github.com/qdrant/go-client/qdrant" -) +Guillaume Marquis: +So you can trade for free. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +Even better. Look at that, Christmas came early. Well, let's go have some fun, play around with it. And I can't promise, but I may give you some feedback, I may give you some [evaluation](https://qdrant.tech/rag/rag-evaluation-guide/) metrics if it's hallucinating. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Should: []*qdrant.Condition{ - qdrant.NewMatch("country.cities[].sightseeing", "Germany"), - }, - }, -}) +Guillaume Marquis: +Or what if I see some thumbs up or thumbs down, I will know that it's you. -``` +Demetrios: +Yeah, cool. Exactly. All right, folks, that's about it for today. We will see you all later. As a reminder, don't get lost in vector space. This has been another vector space talks. And if you want to come on here and chat with us, feel free to reach out. See ya. -This query would only output the point with id 2 as only Japan has a city with the “Osaka castke” as part of the sightseeing. +Guillaume Marquis: +Cool. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#nested-object-filter) Nested object filter +Sabrina Aquino: +See you guys. Thank you. Bye. -_Available as of v1.2.0_ +<|page-366-lllmstxt|> +> *"Now I do believe that Qdrant, I'm not sponsored by Qdrant, but I do believe it's the best one for a couple of reasons. And we're going to see them mostly because I can just run it on my computer so it's full private and I'm in charge of my data.”*\ +-- Francesco Saverio Zuppichini +> -By default, the conditions are taking into account the entire payload of a point. +Francesco Saverio Zuppichini is a Senior Full Stack Machine Learning Engineer at Zurich Insurance with experience in both large corporations and startups of various sizes. He is passionate about sharing knowledge, and building communities, and is known as a skilled practitioner in computer vision. He is proud of the community he built because of all the amazing people he got to know. -For instance, given two points with the following payload: +***Listen to the episode on [Spotify](https://open.spotify.com/episode/7kVd5a64sz2ib26IxyUikO?si=mrOoVP3ISQ22kXrSUdOmQA), Apple Podcast, Podcast addicts, Castbox. 
You can also watch this episode on [YouTube](https://youtu.be/56mFleo06LI).*** -```json -[\ - {\ - "id": 1,\ - "dinosaur": "t-rex",\ - "diet": [\ - { "food": "leaves", "likes": false},\ - { "food": "meat", "likes": true}\ - ]\ - },\ - {\ - "id": 2,\ - "dinosaur": "diplodocus",\ - "diet": [\ - { "food": "leaves", "likes": true},\ - { "food": "meat", "likes": false}\ - ]\ - }\ -] + -``` + -The following query would match both points: +## **Top takeaways:** -httppythontypescriptrustjavacsharpgo +Curious about transforming YouTube content into searchable elements? Francesco Zuppichini unpacks the journey of coding a RAG by using subtitles as input, harnessing technologies like YouTube DL, Hugging Face, and Qdrant, while debating framework reliance and the fine art of selecting the right software tools. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must": [\ - {\ - "key": "diet[].food",\ - "match": {\ - "value": "meat"\ - }\ - },\ - {\ - "key": "diet[].likes",\ - "match": {\ - "value": true\ - }\ - }\ - ] - } -} +Here are some insights from this episode: -``` +1. **Behind the Code**: Francesco unravels how to create a RAG using YouTube videos. Get ready to geek out on the nuts and bolts that make this magic happen. +2. **Vector Voodoo**: Ever wonder how embedding vectors carry out their similarity searches? Francesco's got you covered with his brilliant explanation of vector databases and the mind-bending distance method that seeks out those matches. +3. **Function over Class**: The debate is as old as stardust. Francesco shares why he prefers using functions over classes for better code organization and demonstrates how this approach solidifies when running language models with Ollama. +4. **Metadata Magic**: Find out how metadata isn't just a sidekick but plays a pivotal role in the realm of Qdrant and RAGs. Learn why Francesco values metadata as payload and the challenges it presents in developing domain-specific applications. +5. **Tool Selection Tips**: Deciding on the right software tool can feel like navigating an asteroid belt. Francesco shares his criteria—ease of installation, robust documentation, and a little help from friends—to ensure a safe landing. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="diet[].food", match=models.MatchValue(value="meat")\ - ),\ - models.FieldCondition(\ - key="diet[].likes", match=models.MatchValue(value=True)\ - ),\ - ], - ), -) +> Fun Fact: Francesco confessed that his code for chunking subtitles was "a little bit crappy" because of laziness—proving that even pros take shortcuts to the stars now and then. 
+> -``` +## Show notes: -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - key: "diet[].food",\ - match: { value: "meat" },\ - },\ - {\ - key: "diet[].likes",\ - match: { value: true },\ - },\ - ], - }, -}); +00:00 Intro to Francesco\ +05:36 Create YouTube rack for data retrieval.\ +09:10 Local web dev showcase without frameworks effectively.\ +11:12 Qdrant: converting video text to vectors.\ +13:43 Connect to vectordb, specify config, keep it simple.\ +17:59 Recreate, compare vectors, filter for right matches.\ +21:36 Use functions and share states for simpler coding.\ +29:32 Gemini Pro generates task-based outputs effectively.\ +32:36 Good documentation shows pride in the product.\ +35:38 Organizing different data types in separate collections.\ +38:36 Proactive approach to understanding code and scalability.\ +42:22 User feedback and statistics evaluation is crucial.\ +44:09 Consider user needs for chatbot accuracy and relevance. -``` +## More Quotes from Francesco: -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; +*"So through Docker, using Docker compose, very simple here I just copy and paste the configuration for the Qdrant documentation. I run it and when I run it I also get a very nice looking interface.*”\ +-- Francesco Saverio Zuppichini -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([\ - Condition::matches("diet[].food", "meat".to_string()),\ - Condition::matches("diet[].likes", true),\ - ])), - ) - .await?; +*"It's a very easy way to debug stuff because if you see a lot of vectors from the same document in the same place, maybe your chunking is not doing a great job because maybe you have some too much kind of overlapping on the recent bug in your code in which you have duplicate chunks. Okay, so we have our vector DB running. Now we need to do some setup stuff. So very easy to do with Qdrant. You just need to get the Qdrant client.”*\ +-- Francesco Saverio Zuppichini -``` +*"So straightforward, so useful. A lot of people, they don't realize that types are very useful. So kudos to the Qdrant team to actually make all the types very nice.”*\ +-- Francesco Saverio Zuppichini -```java -import java.util.List; +## Transcript: +Demetrios: +Folks, welcome to another vector space talks. I'm excited to be here and it is a special day because I've got a co host with me today. Sabrina, what's going on? How you doing? -import static io.qdrant.client.ConditionFactory.match; -import static io.qdrant.client.ConditionFactory.matchKeyword; +Sabrina Aquino: +Let's go. Thank you so much, Demetrios, for having me here. I've always wanted to participate in vector space talks. Now it's finally my chance. So thank you so much. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Demetrios: +Your dream has come true and what a day for it to come true because we've got a special guest today. While we've got you here, Sabrina, I know you've been doing some excellent stuff on the Internet when it comes to other ways to engage with the Qdrant community. Can you break that down real fast before we jump into this? + +Sabrina Aquino: +Absolutely. I think an announcement here is we're hosting our first discord office hours. We're going to be answering all your questions about Qdrant with Qdrant team members, where you can interact with us, with our community as well. 
And we're also going to be dropping a few insights on the next Qdrant release 1.8. So that's super exciting and also, we are. Sorry, I just have another thing going on here on the live. + +Demetrios: +Music got in your ear. + +Sabrina Aquino: +We're also having the vector voices on Twitter, the X Spaces roundtable, where we bring experts to talk about a topic with our team. And you can also jump in and ask questions on the AMA. So that's super exciting as well. And, yeah, see you guys there. And I'll drop a link of the discord in the comments so you guys can join our community and be a part of it. + +Demetrios: +Exactly what I was about to say. So without further ado, let's bring on our guest of honor, Mr. Where are you at, dude? + +Francesco Zuppichini: +Hi. Hello. How are you? -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Demetrios: +I'm great. How are you doing? -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addAllMust( - List.of(matchKeyword("diet[].food", "meat"), match("diet[].likes", true))) - .build()) - .build()) - .get(); +Francesco Zuppichini: +Great. -``` +Demetrios: +I've been seeing you all around the Internet and I am very excited to be able to chat with you today. I know you've got a bit of stuff planned for us. You've got a whole presentation, right? -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Francesco Zuppichini: +Correct. -var client = new QdrantClient("localhost", 6334); +Demetrios: +But for those that do not know you, you're a full stack machine learning engineer at Zurich Insurance. I think you also are very vocal and you are fun to follow on LinkedIn is what I would say. And we're going to get to that at the end after you give your presentation. But once again, reminder for everybody, if you want to ask questions, hit us up with questions in the chat. As far as going through his presentation today, you're going to be talking to us all about some really cool stuff about rags. I'm going to let you get into it, man. And while you're sharing your screen, I'm going to tell people a little bit of a fun fact about you. That you put ketchup on your pizza, which I think is a little bit sacrilegious. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("diet[].food", "meat") & Match("diet[].likes", true) -); +Francesco Zuppichini: +Yes. So that's 100% true. And I hope that the italian pizza police is not listening to this call or I can be in real trouble. -``` +Demetrios: +I think we just lost a few viewers there, but it's all good. -```go -import ( - "context" +Sabrina Aquino: +Italy viewers just dropped out. - "github.com/qdrant/go-client/qdrant" -) +Demetrios: +Yeah, the Italians just dropped, but it's all good. We will cut that part out in post production, my man. I'm going to share your screen and I'm going to let you get after it. I'll be hanging around in case any questions pop up with Sabrina in the background. And here you go, bro. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Francesco Zuppichini: +Wonderful. So you can see my screen, right? 
-client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("diet[].food", "meat"), - qdrant.NewMatchBool("diet[].likes", true), - }, - }, -}) +Demetrios: +Yes, for sure. -``` +Francesco Zuppichini: +That's perfect. Okay, so today we're going to talk about talk with YouTube without paying a cent, no framework bs. So the goal of today is to showcase how to code a RAG given as an input a YouTube video without using any framework like language, et cetera, et cetera. And I want to show you that it's straightforward, using a bunch of technologies and Qdrants as well. And you can do all of this without actually pay to any service. Right. So we are going to run our PEDro DB locally and also the language model. We are going to run our machines. -This happens because both points are matching the two conditions: +Francesco Zuppichini: +And yeah, it's going to be a technical talk, so I will kind of guide you through the code. Feel free to interrupt me at any time if you have questions, if you want to ask why I did that, et cetera, et cetera. So very quickly, before we get started, I just want you not to introduce myself. So yeah, senior full stack machine engineer. That's just a bunch of funny work to basically say that I do a little bit of everything. Start. So when I was working, I start as computer vision engineer, I work at PwC, then a bunch of startups, and now I sold my soul to insurance companies working at insurance. And before I was doing computer vision, now I'm doing due to Chat GPT, hyper language model, I'm doing more of that. -- the “t-rex” matches food=meat on `diet[1].food` and likes=true on `diet[1].likes` -- the “diplodocus” matches food=meat on `diet[1].food` and likes=true on `diet[0].likes` +Francesco Zuppichini: +But I'm always involved in bringing the full product together. So from zero to something that is deployed and running. So I always be interested in web dev. I can also do website servers, a little bit of infrastructure as well. So now I'm just doing a little bit of everything. So this is why there is full stack there. Yeah. Okay, let's get started to something a little bit more interesting than myself. -To retrieve only the points which are matching the conditions on an array element basis, that is the point with id 1 in this example, you would need to use a nested object filter. +Francesco Zuppichini: +So our goal is to create a full local YouTube rack. And if you don't want a rack, is, it's basically a system in which you take some data. In this case, we are going to take subtitles from YouTube videos and you're able to basically q a with your data. So you're able to use a language model, you ask questions, then we retrieve the relevant parts in the data that you provide, and hopefully you're going to get the right answer to your. So let's talk about the technologies that we're going to use. So to get the subtitles from a video, we're going to use YouTube DL and YouTube DL. It's a library that is available through Pip. So Python, I think at some point it was on GitHub and then I think it was removed because Google, they were a little bit beach about that. -Nested object filters allow arrays of objects to be queried independently of each other. +Francesco Zuppichini: +So then they realized it on GitHub. And now I think it's on GitHub again, but you can just install it through Pip and it's very cool. 
-It is achieved by using the `nested` condition type formed by a payload key to focus on and a filter to apply. +Demetrios: +One thing, man, are you sharing a slide? Because all I see is your. I think you shared a different screen. -The key should point to an array of objects and can be used with or without the bracket notation (“data” or “data\[\]”). +Francesco Zuppichini: +Oh, boy. -httppythontypescriptrustjavacsharpgo +Demetrios: +I just see the video of you. There we go. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter": { - "must": [{\ - "nested": {\ - "key": "diet",\ - "filter":{\ - "must": [\ - {\ - "key": "food",\ - "match": {\ - "value": "meat"\ - }\ - },\ - {\ - "key": "likes",\ - "match": {\ - "value": true\ - }\ - }\ - ]\ - }\ - }\ - }] - } -} +Francesco Zuppichini: +Entire screen. Yeah. I'm sorry. Thank you so much. -``` +Demetrios: +There we go. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.NestedCondition(\ - nested=models.Nested(\ - key="diet",\ - filter=models.Filter(\ - must=[\ - models.FieldCondition(\ - key="food", match=models.MatchValue(value="meat")\ - ),\ - models.FieldCondition(\ - key="likes", match=models.MatchValue(value=True)\ - ),\ - ]\ - ),\ - )\ - )\ - ], - ), -) +Francesco Zuppichini: +Wonderful. Okay, so in order to get the embedding. So to translate from text to vectors, right, so we're going to use hugging face just an embedding model so we can actually get some vectors. Then as soon as we got our vectors, we need to store and search them. So we're going to use our beloved Qdrant to do so. We also need to keep a little bit of stage right because we need to know which video we have processed so we don't redo the old embeddings and the storing every time we see the same video. So for this part, I'm just going to use SQLite, which is just basically an SQL database in just a file. So very easy to use, very kind of lightweight, and it's only your computer, so it's safe to run the language model. -``` +Francesco Zuppichini: +We're going to use Ollama. That is a very simple way and very well done way to just get a language model that is running on your computer. And you can also call it using the OpenAI Python library because they have implemented the same endpoint as. It's like, it's super convenient, super easy to use. If you already have some code that is calling OpenAI, you can just run a different language model using Ollama. And you just need to basically change two lines of code. So what we're going to do, basically, I'm going to take a video. So here it's a video from Fireship IO. -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - nested: {\ - key: "diet",\ - filter: {\ - must: [\ - {\ - key: "food",\ - match: { value: "meat" },\ - },\ - {\ - key: "likes",\ - match: { value: true },\ - },\ - ],\ - },\ - },\ - },\ - ], - }, -}); +Francesco Zuppichini: +We're going to run our command line and we're going to ask some questions. Now, if you can still, in theory, you should be able to see my full screen. Yeah. So very quickly to showcase that to you, I already processed this video from the good sound YouTube channel and I have already here my command line. So I can already kind of see, you know, I can ask a question like what is the contact size of Germany? And we're going to get the reply. Yeah. And here we're going to get a reply. And now I want to walk you through how you can do something similar. 
-``` +Francesco Zuppichini: +Now, the goal is not to create the best rack in the world. It's just to showcase like show zero to something that is actually working. How you can do that in a fully local way without using any framework so you can really understand what's going on under the hood. Because I think a lot of people, they try to copy, to just copy and paste stuff on Langchain and then they end up in a situation when they need to change something, but they don't really know where the stuff is. So this is why I just want to just show like Windfield zero to hero. So the first step will be I get a YouTube video and now I need to get the subtitle. So you could actually use a model to take the audio from the video and get the text. Like a whisper model from OpenAI, for example. -```rust -use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; +Francesco Zuppichini: +In this case, we are taking advantage that YouTube allow people to upload subtitles and YouTube will automatically generate the subtitles. So here using YouTube dial, I'm just going to get my video URL. I'm going to set up a bunch of options like the format they want, et cetera, et cetera. And then basically I'm going to download and get the subtitles. And they look something like this. Let me show you an example. Something similar to this one, right? We have the timestamps and we do have all text inside. Now the next step. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([NestedCondition {\ - key: "diet".to_string(),\ - filter: Some(Filter::must([\ - Condition::matches("food", "meat".to_string()),\ - Condition::matches("likes", true),\ - ])),\ - }\ - .into()])), - ) - .await?; +Francesco Zuppichini: +So we got our source of data, we have our text key. Next step is I need to translate my text to vectors. Now the easiest way to do so is just use sentence transformers for backing phase. So here I've installed it. I load in a model. In this case I'm using this model here. I have no idea what tat model is. I just default one tatted find and it seems to work fine. -``` +Francesco Zuppichini: +And then in order to use it, I'm just providing a query and I'm getting back a list of vectors. So we have a way to take a video, take the text from the video, convert that to vectors with a semantic meaningful representation. And now we need to store them. Now I do believe that Qdrant, I'm not sponsored by Qdrant, but I do believe it's the best one for a couple of reasons. And we're going to see them mostly because I can just run it on my computer so it's full private and I'm in charge of my data. So the way I'm running it is through Docker compose. So through Docker, using Docker compose, very simple here I just copy and paste the configuration for the Qdrant documentation. I run it and when I run it I also get a very nice looking interface. -```java -import java.util.List; +Francesco Zuppichini: +I'm going to show that to you because I think it's very cool. So here I've already some vectors inside here so I can just look in my collection, it's called embeddings, an original name. And we can see all the chunks that were embed with the metadata, in this case just the video id. A super cool thing, super useful to debug is go in the visualize part and see the embeddings, the projected embeddings. You can actually do a bounce of stuff. You can actually also go here and color them by some metadata. Like I can say I want to have a different color based on the video id. 
In this case I just have one video. -import static io.qdrant.client.ConditionFactory.match; -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.ConditionFactory.nested; +Francesco Zuppichini: +I will show that as soon as we add more videos. This is so cool, so useful. I will use this at work as well in which I have a lot of documents. And it's a very easy way to debug stuff because if you see a lot of vectors from the same document in the same place, maybe your chunking is not doing a great job because maybe you have some too much kind of overlapping on the recent bug in your code in which you have duplicate chunks. Okay, so we have our vector DB running. Now we need to do some setup stuff. So very easy to do with Qdrant. You just need to get the Qdrant client. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Francesco Zuppichini: +So you have a connection with a vectordb, you create a connection, you specify a name, you specify some configuration stuff. In this case I just specify the vector size because Qdrant, it needs to know how big the vectors are going to be and the distance I want to use. So I'm going to use the cosite distance in Qdrant documentation there are a lot of parameters. You can do a lot of crazy stuff here and just keep it very simple. And yeah, another important thing is that since we are going to embed more videos, when I ask a question to a video, I need to know which embedded are from that video. So we're going to create an index. So it's very efficient to filter my embedded based on that index, an index on the metadata video because when I store a chunk in Qdrant, I also going to include from which video is coming from. Very simple, very simple to set up. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addMust( - nested( - "diet", - Filter.newBuilder() - .addAllMust( - List.of( - matchKeyword("food", "meat"), match("likes", true))) - .build())) - .build()) - .build()) - .get(); +Francesco Zuppichini: +You just need to do this once. I was very lazy so I just assumed that if this is going to fail, it means that it's because I've already created a collection. So I'm just going to pass it and call it a day. Okay, so this is basically all the preprocess this setup you need to do to have your Qdrant ready to store and search vectors. To store vectors. Straightforward, very straightforward as well. Just need again the client. So the connection to the database here I'm passing my embedding so sentence transformer model and I'm passing my chunks as a list of documents. -``` +Francesco Zuppichini: +So documents in my code is just a type that will contain just this metadata here. Very simple. It's similar to Lang chain here. I just have attacked it because it's lightweight. To store them we call the upload records function. We encode them here. There is a little bit of bad variable names from my side which I replacing that. So you shouldn't do that. -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Francesco Zuppichini: +Apologize about that and you just send the records. Another very cool thing about Qdrant. So the second things that I really like is that they have types for what you send through the library. So this models record is a Qdrant type. So you use it and you know immediately. So what you need to put inside. So let me give you an example. Right? 
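Put together, the setup Francesco walks through here (a collection sized for the embedding model, cosine distance, a keyword index on the video id, then `upload_records`) looks roughly like the following. The collection name "embeddings" comes from his demo; the payload key, vector size, and ids are assumptions:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# One-time setup: the collection plus a keyword index so per-video filters stay fast.
client.create_collection(
    collection_name="embeddings",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)
client.create_payload_index(
    collection_name="embeddings",
    field_name="metadata.video",
    field_schema=models.PayloadSchemaType.KEYWORD,
)

# Storing chunks: one record per chunk, vector from the sentence-transformers
# model, payload carrying the chunk text and the video it came from.
client.upload_records(
    collection_name="embeddings",
    records=[
        models.Record(
            id=0,
            vector=[0.1] * 384,  # placeholder; use model.encode(chunk_text)
            payload={"text": "chunk text here", "metadata": {"video": "video-id"}},
        )
    ],
)
```

Wrapping the collection creation in a try/except, as he mentions doing, makes this setup safe to re-run.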
So assuming that I'm programming, right, I'm going to say model record bank. -var client = new QdrantClient("localhost", 6334); +Francesco Zuppichini: +I know immediately. So what I have to put inside, right? So straightforward, so useful. A lot of people, they don't realize that types are very useful. So kudos to the Qdrant team to actually make all the types very nice. Another cool thing is that if you're using fast API to build a web server, if you are going to return a Qdrant models type, it's actually going to be serialized automatically through pydantic. So you don't need to do weird stuff. It's all handled by the Qdrant APIs, by the product SDK. Super cool. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) -); +Francesco Zuppichini: +Now we have a way to store our chunks to embed them. So this is how they look like in the interface. I can see them, I can go to them, et cetera, et Cetera. Very nice. Now the missing part, right. So video subtitles. I chunked the subtitles. I haven't show you the chunking code. -``` +Francesco Zuppichini: +It's a little bit crappy because I was very lazy. So I just like chunking by characters count and a little bit of overlapping. We have a way to store and embed our chunks and now we need a way to search. That's basically one of the missing steps. Now search straightforward as well. This is also a good example because I can show you how effective is to create filters using Qdrant. So what do we need to search with again the vector client, the embeddings, because we have a query, right. We need to run the query with the same embedding models. -```go -import ( - "context" +Francesco Zuppichini: +We need to recreate to embed in a vector and then we need to compare with the vectors in the vector Db using a distance method, in this case considered similarity in order to get the right matches right, the closest one in our vector DB, in our vector search base. So passing a query string, I'm passing a video id and I pass in a label. So how many hits I want to get from the metadb. Now to create a filter again you're going to use the model package from the Qdrant framework. So here I'm just creating a filter class for the model and I'm saying okay, this filter must match this key, right? So metadata video id with this video id. So when we search, before we do the similarity search, we are going to filter away all the vectors that are not from that video. Wonderful. Now super easy as well. - "github.com/qdrant/go-client/qdrant" -) +Francesco Zuppichini: +We just call the DB search, right pass. Our collection name here is star coded. Apologies about that, I think I forgot to put the right global variable our coded, we create a query, we set the limit, we pass the query filter, we get the it back as a dictionary in the payload field of each it and we recreate our document a dictionary. I have types, right? So I know what this function is going to return. Now if you were to use a framework, right this part, it will be basically the same thing. If I were to use langchain and I want to specify a filter, I would have to write the same amount of code. So most of the times you don't really need to use a framework. One thing that is nice about not using a framework here is that I add control on the indexes. 
-client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Francesco Zuppichini: +Lang chain, for instance, will create the indexes only while you call a classmate like from document. And that is kind of cumbersome because sometimes I wasn't quoting bugs in which I was not understanding why one index was created before, after, et cetera, et cetera. So yes, just try to keep things simple and not always write on frameworks. Wonderful. Now I have a way to ask a query to get back the relative parts from that video. Now we need to translate this list of chunks to something that we can read as human. Before we do that, I was almost going to forget we need to keep state. Now, one of the last missing part is something in which I can store data. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewNestedFilter("diet", &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("food", "meat"), - qdrant.NewMatchBool("likes", true), - }, - }), - }, - }, -}) +Francesco Zuppichini: +Here I just have a setup function in which I'm going to create an SQL lite database, create a table called videos in which I have an id and a title. So later I can check, hey, is this video already in my database? Yes. I don't need to process that. I can just start immediately to QA on that video. If not, I'm going to do the chunking and embeddings. Got a couple of functions here to get video from Db to save video from and to save video to Db. So notice now I only use functions. I'm not using classes here. -``` +Francesco Zuppichini: +I'm not a fan of object writing programming because it's very easy to kind of reach inheritance health in which we have like ten levels of inheritance. And here if a function needs to have state, here we do need to have state because we need a connection. So I will just have a function that initialize that state. I return tat to me, and me as a caller, I'm just going to call it and pass my state. Very simple tips allow you really to divide your code properly. You don't need to think about is my class to couple with another class, et cetera, et cetera. Very simple, very effective. So what I suggest when you're coding, just start with function and share states across just pass down state. -The matching logic is modified to be applied at the level of an array element within the payload. +Francesco Zuppichini: +And when you realize that you can cluster a lot of function together with a common behavior, you can go ahead and put state in a class and have key function as methods. So try to not start first by trying to understand which class I need to use around how I connect them, because in my opinion it's just a waste of time. So just start with function and then try to cluster them together if you need to. Okay, last part, the juicy part as well. Language models. So we need the language model. Why do we need the language model? Because I'm going to ask a question, right. I'm going to get a bunch of relevant chunks from a video and the language model. -Nested filters work in the same way as if the nested filter was applied to a single element of the array at a time. -Parent document is considered to match the condition if at least one element of the array matches the nested filter. +Francesco Zuppichini: +It needs to answer that to me. So it needs to get information from the chunks and reply that to me using that information as a context. 
To run language model, the easiest way in my opinion is using Ollama. There are a lot of models that are available. I put a link here and you can also bring your own model. There are a lot of videos and tutorial how to do that. You run this command as soon as you install it on Linux. It's a one line to install Ollama. -**Limitations** +Francesco Zuppichini: +You run this command here, it's going to download Mistral 7B very good model and run it on your gpu if you have one, or your cpu if you don't have a gpu, run it on GPU. Here you can see it yet. It's around 6gb. So even with a low tier gpu, you should be able to run a seven minute model on your gpu. Okay, so this is the prompt just for also to show you how easy is this, this prompt was just very lazy. Copy and paste from langchain source code here prompt use the following piece of context to answer the question at the end. Blah blah blah variable to inject the context inside question variable to get question and then we're going to get an answer. How do we call it? Is it easy? I have a function here called getanswer passing a bunch of stuff, passing also the OpenAI from the OpenAI Python package model client passing a question, passing a vdb, my DB client, my embeddings, reading my prompt, getting my matching documents, calling the search function we have just seen before, creating my context. -The `has_id` condition is not supported within the nested object filter. If you need it, place it in an adjacent `must` clause. +Francesco Zuppichini: +So just joining the text in the chunks on a new line, calling the format function in Python. As simple as that. Just calling the format function in Python because the format function will look at a string and kitty will inject variables that match inside these parentheses. Passing context passing question using the OpenAI model client APIs and getting a reply back. Super easy. And here I'm returning the reply from the language model and also the list of documents. So this should be documents. I think I did a mistake. -httppythontypescriptrustjavacsharpgo +Francesco Zuppichini: +When I copy and paste this to get this image and we are done right. We have a way to get some answers from a video by putting everything together. This can seem scary because there is no comment here, but I can show you tson code. I think it's easier so I can highlight stuff. I'm creating my embeddings, I'm getting my database, I'm getting my vector DB login, some stuff I'm getting my model client, I'm getting my vid. So here I'm defining the state that I need. You don't need comments because I get it straightforward. Like here I'm getting the vector db, good function name. -```http -POST /collections/{collection_name}/points/scroll -{ - "filter":{ - "must":[\ - {\ - "nested":{\ - "key":"diet",\ - "filter":{\ - "must":[\ - {\ - "key":"food",\ - "match":{\ - "value":"meat"\ - }\ - },\ - {\ - "key":"likes",\ - "match":{\ - "value":true\ - }\ - }\ - ]\ - }\ - }\ - },\ - {\ - "has_id":[\ - 1\ - ]\ - }\ - ] - } -} +Francesco Zuppichini: +Then if I don't have the vector db, sorry. If I don't have the video id in a database, I'm going to get some information to the video. I'm going to download the subtitles, split the subtitles. I'm going to do the embeddings. In the end I'm going to save it to the betterDb. Finally I'm going to get my video back, printing something and start a while loop in which you can get an answer. So this is the full pipeline. Very simple, all function. 
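The answer step he goes on to describe, local Mistral served by Ollama but called through the standard OpenAI Python client against Ollama's OpenAI-compatible endpoint, can be sketched like this (prompt wording, function names, and the retrieved-chunk handling are illustrative):

```python
from openai import OpenAI

PROMPT = (
    "Use the following pieces of context to answer the question at the end.\n\n"
    "{context}\n\nQuestion: {question}\nHelpful Answer:"
)

# Ollama exposes an OpenAI-compatible API locally; the key is required but unused.
llm = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

def get_answer(question: str, matching_chunks: list[str]) -> str:
    context = "\n".join(matching_chunks)  # join the retrieved chunk texts
    reply = llm.chat.completions.create(
        model="mistral",
        messages=[{"role": "user", "content": PROMPT.format(context=context, question=question)}],
    )
    return reply.choices[0].message.content
```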
-``` +Francesco Zuppichini: +Also here fit function is very simple to divide things. Around here I have a file called RAG and here I just do all the RAG stuff. Right. It's all here similar. I have my file called crude. Here I'm doing everything I need to do with my database, et cetera, et cetera. Also a file called YouTube. So just try to split things based on what they do instead of what they are. -```python -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.NestedCondition(\ - nested=models.Nested(\ - key="diet",\ - filter=models.Filter(\ - must=[\ - models.FieldCondition(\ - key="food", match=models.MatchValue(value="meat")\ - ),\ - models.FieldCondition(\ - key="likes", match=models.MatchValue(value=True)\ - ),\ - ]\ - ),\ - )\ - ),\ - models.HasIdCondition(has_id=[1]),\ - ], - ), -) +Francesco Zuppichini: +I think it's easier than to code. Yeah. So I can actually show you a demo in which we kind of embed a video from scratch. So let me kill this bad boy here. Let's get a juicy YouTube video from Sam. We can go with Gemma. We can go with Gemma. I think I haven't embedded that yet. -``` +Francesco Zuppichini: +I'm sorry. My Eddie block is doing weird stuff over here. Okay, let me put this here. -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - nested: {\ - key: "diet",\ - filter: {\ - must: [\ - {\ - key: "food",\ - match: { value: "meat" },\ - },\ - {\ - key: "likes",\ - match: { value: true },\ - },\ - ],\ - },\ - },\ - },\ - {\ - has_id: [1],\ - },\ - ], - }, -}); +Demetrios: +This is the moment that we need to all pray to the demo gods that this will work. -``` +Francesco Zuppichini: +Oh yeah. I'm so sorry. I'm so sorry. I think it was already processed. So let me. I don't know this one. Also I noticed I'm seeing this very weird thing which I've just not seen that yesterday. So that's going to be interesting. -```rust -use qdrant_client::qdrant::{Condition, Filter, NestedCondition, ScrollPointsBuilder}; +Francesco Zuppichini: +I think my poor Linux computer is giving up to running language models. Okay. Downloading ceramic logs, embeddings and we have it now before I forgot because I think that you guys spent some time doing this. So let's go on the visualize page and let's actually do the color by and let's do metadata, video id. Video id. Let's run it. Metadata, metadata, video meta. Oh my God. -client - .scroll( - ScrollPointsBuilder::new("{collection_name}").filter(Filter::must([\ - NestedCondition {\ - key: "diet".to_string(),\ - filter: Some(Filter::must([\ - Condition::matches("food", "meat".to_string()),\ - Condition::matches("likes", true),\ - ])),\ - }\ - .into(),\ - Condition::has_id([1]),\ - ])), - ) - .await?; +Francesco Zuppichini: +Data video id. Why don't see the other one? I don't know. This is the beauty of live section. -``` +Demetrios: +This is how we know it's real. -```java -import java.util.List; +Francesco Zuppichini: +Yeah, I mean, this is working, right? This is called Chevroni Pro. That video. Yeah, I don't know about that. I don't know about that. It was working before. I can touch for sure. So probably I'm doing something wrong, probably later. Let's try that. 
-import static io.qdrant.client.ConditionFactory.hasId; -import static io.qdrant.client.ConditionFactory.match; -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.ConditionFactory.nested; -import static io.qdrant.client.PointIdFactory.id; +Francesco Zuppichini: +Let's see. I must be doing something wrong, so don't worry about that. But we are ready to ask questions, so maybe I can just say I don't know, what is Gemini pro? So let's see, Mr. Running on GPU is kind of fast, it doesn't take too much time. And here we can see we are 6gb, 1gb is for the embedding model. So 4gb, 5gb running the language model here it says Gemini pro is a colonized tool that can generate output based on given tasks. Blah, blah, blah, blah, blah, blah. Yeah, it seems to work. -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; +Francesco Zuppichini: +Here you have it. Thanks. Of course. And I don't know if there are any questions about it. -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addMust( - nested( - "diet", - Filter.newBuilder() - .addAllMust( - List.of( - matchKeyword("food", "meat"), match("likes", true))) - .build())) - .addMust(hasId(id(1))) - .build()) - .build()) - .get(); +Demetrios: +So many questions. There's a question that came through the chat that is a simple one that we can answer right away, which is can we access this code anywhere? -``` +Francesco Zuppichini: +Yeah, so it's on my GitHub. Can I share a link with you in the chat? Maybe? So that should be YouTube. Can I put it here maybe? -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +Yes, most definitely can. And we'll drop that into all of the spots so that we have it. Now. Next question from my side, while people are also asking, and you've got some fans in the chat right now, so. -var client = new QdrantClient("localhost", 6334); +Francesco Zuppichini: +Nice to everyone by the way. -await client.ScrollAsync( - collectionName: "{collection_name}", - filter: Nested("diet", MatchKeyword("food", "meat") & Match("likes", true)) & HasId(1) -); +Demetrios: +So from my side, I'm wondering, do you have any specific design decisions criteria that you use when you are building out your stack? Like you chose Mistral, you chose Ollama, you chose Qdrant. It sounds like with Qdrant you did some testing and you appreciated the capabilities. With Qdrant, was it similar with Ollama and Mistral? -``` +Francesco Zuppichini: +So my test is how long it's going to take to install that tool. If it's taking too much time and it's hard to install because documentation is bad, so that it's a red flag, right? Because if it's hard to install and documentation is bad for the installation, that's the first thing people are going to read. So probably it's not going to be great for something down the road to use Olama. It took me two minutes, took me two minutes, it was incredible. But just install it, run it and it was done. Same thing with Qualent as well and same thing with the hacking phase library. So to me, usually as soon as if I see that something is easy to install, that's usually means that is good. And if the documentation to install it, it's good. -```go -import ( - "context" +Francesco Zuppichini: +It means that people thought about it and they care about writing good documentation because they want people to use their tools. 
A lot of times for enterprises tools like cloud enterprise services, documentation is terrible because they know you're going to pay because you're an enterprise. And some manager has decided five years ago to use TatCloud provider, not the other. So I think know if you see recommendation that means that the people's company, startup enterprise behind that want you to use their software because they know and they're proud of it. Like they know that is good. So usually this is my way of going. And then of course I watch a lot of YouTube videos so I see people talking about different texts, et cetera. And if some youtuber which I trust say like I tried this seems to work well, I will note it down. - "github.com/qdrant/go-client/qdrant" -) +Francesco Zuppichini: +So then in the future I know hey, for these things I think I use ABC and this has already be tested by someone. I don't know I'm going to use it. Another important thing is reach out to your friends networks and say hey guys, I need to do this. Do you know if you have a good stock that you're already trying to experience with that? -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +Yeah. With respect to the enterprise software type of tools, there was something that I saw that was hilarious. It was something along the lines of custom customer and user is not the same thing. Customer is the one who pays, user is the one who suffers. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewNestedFilter("diet", &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("food", "meat"), - qdrant.NewMatchBool("likes", true), - }, - }), - qdrant.NewHasID(qdrant.NewIDNum(1)), - }, - }, -}) +Francesco Zuppichini: +That's really true for enterprise software, I need to tell you. So that's true. -``` +Demetrios: +Yeah, we've all been through it. So there's another question coming through in the chat about would there be a collection for each embedded video based on your unique view video id? -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#full-text-match) Full Text Match +Francesco Zuppichini: +No. What you want to do, I mean you could do that of course, but collection should encapsulate the project that you're doing more or less in my mind. So in this case I just call it embeddings. Maybe I should have called videos. So they are just going to be inside the same collection, they're just going to have different metadata. I think you need to correct me if I'm wrong that from your side, from the Qdrant code, searching things in the same collection, probably it's more effective to some degree. And imagine that if you have 1000 videos you need to create 1000 collection. And then I think cocoa wise collection are meant to have data coming from the same source, semantic value. -_Available as of v0.10.0_ +Francesco Zuppichini: +So in my case I have all videos. If I were to have different data, maybe from pdfs. Probably I would just create another collection, right, if I don't want them to be in the same part and search them. And one cool thing of having all the videos in the same collection is that I can just ask a question to all the videos at the same time if I want to, or I can change my filter and ask questions to two free videos. Specifically, you can do that if you have one collection per video, right? 
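The single-collection pattern being discussed here is easiest to see in the filter itself: all videos share one collection, and the scope of a question is chosen per query. A small sketch, with the collection and payload names assumed rather than taken from the demo code:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

one_video = models.Filter(
    must=[models.FieldCondition(key="metadata.video", match=models.MatchValue(value="video-a"))]
)
several_videos = models.Filter(
    must=[models.FieldCondition(key="metadata.video", match=models.MatchAny(any=["video-a", "video-b"]))]
)

hits = client.search(
    collection_name="embeddings",
    query_vector=[0.1] * 384,     # placeholder question embedding
    query_filter=several_videos,  # or one_video, or None to search every video
    limit=5,
)
```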
Like for instance at work I was embedding PDFs and using Qdrant, and sometimes you need to talk with two PDFs at the same time, or three, or just one, or maybe all the PDFs in that folder. So I was just changing the filter, right? And that can only be done if they're all in the same collection.

-A special case of the `match` condition is the `text` match condition. -It allows you to search for a specific substring, token or phrase within the text field.

+Sabrina Aquino: +Yeah, that's a great explanation of collections. And I do love your approach of having everything locally and having everything in a structured way that you can really understand what you're doing. And I know you mentioned sometimes frameworks are not necessary. And I wonder also from your side, when do you think a framework would be necessary and does it have to do with scaling? What do you think?

-Exact texts that will match the condition depend on full-text index configuration. -Configuration is defined during the index creation and describe at [full-text index](https://qdrant.tech/documentation/concepts/indexing/#full-text-index).

+Francesco Zuppichini: +So that's a great question. So what frameworks in theory should give you is good interfaces, right? So a good interface means that if I'm following that interface, I know that I can always call something that implements that interface in the same way. Like for instance in LangChain, if I call a vector DB, I can just swap the vector DB and I can call it in the same way. If the interfaces are good, the framework is useful, if you know that you are going to change stuff. In my case, I know from the beginning that I'm going to use Qdrant, I'm going to use Ollama, and I'm going to use SQLite. So why should I go through the hell of reading framework documentation, installing libraries, and then you need to install a bunch of packages from the framework that you don't even know why you need them. Maybe you have a package conflict, et cetera, et cetera.

-If there is no full-text index for the field, the condition will work as exact substring match.

+Francesco Zuppichini: +If you already know what you want to do, then just code it and call it a day. Like in this case, I know I'm not going to change the vector DB. If you think that you're going to change something, even if it's a simple approach, it's fair enough, simple to change stuff. Like I will say that if you know that you want to change your vector DB providers, either you define your own interface or you use a framework with an already defined interface. But be careful, because relying too much on a framework will hurt. First of all, basically you don't know what's going on under the hood. For LangChain, kudos to them, they were the first one, they are very smart people, et cetera, et cetera.

-jsonpythontypescriptrustjavacsharpgo

+Francesco Zuppichini: +But they have inheritance hell in that code. And in order to understand how to do certain stuff I had to look at the source code, right, and try to figure it out. So which class is inherited from that? And going straight up in order to understand what behavior that class was supposed to have if I pass this parameter. And sometimes defining an interface is straightforward, just maybe you want to define a couple of functions in a class. You call it, you just need to define the inputs and the outputs, and if you want to scale, you can just implement a new class with that interface. Yeah, that is at least like my take.
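(One way to read Francesco's advice: if you might swap providers, write the small interface yourself instead of importing a framework for it. A minimal sketch, with illustrative names only:)

```python
from typing import Protocol, Sequence

from qdrant_client import QdrantClient, models


class VectorStore(Protocol):
    """The only operations this hypothetical app needs from a vector database."""

    def add(self, ids: Sequence[int], vectors: Sequence[Sequence[float]], payloads: Sequence[dict]) -> None: ...

    def search(self, vector: Sequence[float], limit: int = 5) -> list[dict]: ...


class QdrantStore:
    """Qdrant-backed implementation; swapping providers later just means
    writing another class that satisfies the same Protocol."""

    def __init__(self, client: QdrantClient, collection: str):
        self.client = client
        self.collection = collection

    def add(self, ids, vectors, payloads) -> None:
        self.client.upsert(
            collection_name=self.collection,
            points=[
                models.PointStruct(id=i, vector=list(v), payload=p)
                for i, v, p in zip(ids, vectors, payloads)
            ],
        )

    def search(self, vector, limit: int = 5) -> list[dict]:
        hits = self.client.search(
            collection_name=self.collection, query_vector=list(vector), limit=limit
        )
        return [hit.payload for hit in hits]
```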
I try to first try to do stuff and then if I need to scale, at least I have already something working and I can scale it instead of kind of try to do the perfect thing from the beginning. -```json -{ - "key": "description", - "match": { - "text": "good cheap" - } -} +Francesco Zuppichini: +Also because I hate reading documentation, so I try to avoid doing that in general. -``` +Sabrina Aquino: +Yeah, I totally love this. It's about having like what's your end project? Do you actually need what you're going to build and understanding what you're building behind? I think it's super nice. We're also having another question which is I haven't used Qdrant yet. The metadata is also part of the embedding, I. E. Prepended to the chunk or so basically he's asking if the metadata is also embedded in the answer for that. Go ahead. -```python -models.FieldCondition( - key="description", - match=models.MatchText(text="good cheap"), -) +Francesco Zuppichini: +I think you have a good article about another search which you also probably embed the title. Yeah, I remember you have a good article in which you showcase having chunks with the title from, I think the section, right. And you first do a search, find the right title and then you do a search inside. So all the chunks from that paragraph, I think from that section, if I'm not mistaken. It really depends on the use case, though. If you have a document full of information, splitting a lot of paragraph, very long one, and you need to very be precise on what you want to fetch, you need to take advantage of the structure of the document, right? -``` +Sabrina Aquino: +Yeah, absolutely. The metadata goes as payload in Qdrant. So basically it's like a JSON type of information attached to your data that's not embedded. We also have documentation on it. I will answer on the comments as well, I think another question I have for you, Franz, about the sort of evaluation and how would you perform a little evaluation on this rag that you created. -```typescript -{ - key: 'description', - match: {text: 'good cheap'} -} +Francesco Zuppichini: +Okay, so that is an interesting question, because everybody talks about metrics and evaluation. Most of the times you don't really have that, right? So you have benchmarks, right. And everybody can use a benchmark to evaluate their pipeline. But when you have domain specific documents, like at work, for example, I'm doing RAG on insurance documents now. How do I create a data set from that in order to evaluate my RAG? It's going to be very time consuming. So what we are trying to do, so we get a bunch of people who knows these documents, catching some paragraph, try to ask a question, and that has the reply there and having basically a ground truth from their side. A lot of time the reply has to be composed from different part of the document. So, yeah, it's very hard. -``` +Francesco Zuppichini: +It's very hard. So what I will kind of suggest is try to use no benchmark, or then you empirically try that. If you're building a RAG that users are going to use, always include a way to collect feedback and collect statistics. So collect the conversation, if that is okay with your privacy rules. Because in my opinion, it's always better to put something in production till you wait too much time, because you need to run all your metrics, et cetera, et cetera. 
And as soon as people start using that, you kind of see if it is good enough. Maybe for the language model itself, that's a different task, because you need to be sure that they don't say wrong stuff to the users. I don't really have a source of truth answer here. It's very hard to evaluate them.

-```rust -use qdrant_client::qdrant::Condition;

+Francesco Zuppichini: +So what I know people also try to do, like, so they get some paragraph or some chunks, they ask GPT-4 to generate a question and the answer based on the paragraph, and they use that as an auto labeling way to create a data set to evaluate your RAG. That can also be effective, I guess 100%, yeah.

-Condition::matches_text("description", "good cheap")

+Demetrios: +And depending on your use case, you probably need more rigorous evaluation or less, like in this case, what you're doing, it might not need that rigor.

-```

+Francesco Zuppichini: +You can see, actually, I think it was Air Canada, right?

-```java -import static io.qdrant.client.ConditionFactory.matchText;

+Demetrios: +Yeah.

-matchText("description", "good cheap");

+Francesco Zuppichini: +If you have something that is facing paying users, then think a hundred times before that. In my case at work, I have something that is used by internal users and we communicate with them. So if my chatbot is saying something wrong, they will tell me. And the worst thing that can happen is that they need to manually look for the answer. But as soon as your chatbot needs to do something that involves people that are going to pay, or medical stuff, you need to understand that for some use cases you need to apply certain rules, and for others you can be kind of more relaxed, I would say, based on the harm that your chatbot is going to generate.

-```

+Demetrios: +Yeah, I think that's all the questions we've got for now. Appreciate you coming on here and chatting with us. And I also appreciate everybody listening in. Anyone who is not following Fran, go give him a follow, at least for the laughs, the chuckles, and huge thanks to you, Sabrina, for joining us, too. It was a pleasure having you here. I look forward to doing many more of these.

-```csharp -using static Qdrant.Client.Grpc.Conditions;

+Sabrina Aquino: +The pleasure is all mine, Demetrios, and it was a total pleasure. Fran, I learned a lot from your session today.

-MatchText("description", "good cheap");

+Francesco Zuppichini: +Thank you so much. Thank you so much. And also go ahead and follow Qdrant on LinkedIn. They post a lot of cool stuff and read the Qdrant blogs. They're very good. They're very good.

-```

+Demetrios: +That's it. The team is going to love to hear that, I'm sure. So if you are doing anything cool with good old Qdrant, give us a ring so we can feature you in the vector space talks. Until next time, don't get lost in vector space. We will see you all later. Have a good one, y'all.

-```go -import "github.com/qdrant/go-client/qdrant"

+<|page-367-lllmstxt|>
+We're thrilled to announce that Qdrant is now [officially available on Azure Marketplace](https://azuremarketplace.microsoft.com/en-en/marketplace/apps/qdrantsolutionsgmbh1698769709989.qdrant-db), bringing enterprise-level vector search directly to Azure's vast community of users. This integration marks a significant milestone in our journey to make Qdrant more accessible and convenient for businesses worldwide.
-qdrant.NewMatchText("description", "good cheap") +> *With the landscape of AI being complex for most customers, Qdrant's ease of use provides an easy approach for customers' implementation of RAG patterns for Generative AI solutions and additional choices in selecting AI components on Azure,* - Tara Walker, Principal Software Engineer at Microsoft. -``` +## Why Azure Marketplace? -If the query has several words, then the condition will be satisfied only if all of them are present in the text. +[Azure Marketplace](https://azuremarketplace.microsoft.com/en-us/) is renowned for its robust ecosystem, trusted by millions of users globally. By listing Qdrant on Azure Marketplace, we're not only expanding our reach but also ensuring seamless integration with Azure's suite of tools and services. This collaboration opens up new possibilities for our users, enabling them to leverage the power of Azure alongside the capabilities of Qdrant. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#range) Range +> *Enterprises like Bosch can now use the power of Microsoft Azure to host Qdrant, unleashing unparalleled performance and massive-scale vector search. "With Qdrant, we found the missing piece to develop our own provider independent multimodal generative AI platform at enterprise scale,* - Jeremy Teichmann (AI Squad Technical Lead & Generative AI Expert), Daly Singh (AI Squad Lead & Product Owner) - Bosch Digital. -jsonpythontypescriptrustjavacsharpgo +## Key Benefits for Users: -```json -{ - "key": "price", - "range": { - "gt": null, - "gte": 100.0, - "lt": null, - "lte": 450.0 - } -} +- **Rapid Application Development:** Deploying a cluster on Microsoft Azure via the Qdrant Cloud console only takes a few seconds and can scale up as needed, giving developers maximal flexibility for their production deployments. -``` +- **Billion Vector Scale:** Seamlessly grow and handle large-scale datasets with billions of vectors by leveraging Qdrant's features like vertical and horizontal scaling or binary quantization with Microsoft Azure's scalable infrastructure. -```python -models.FieldCondition( - key="price", - range=models.Range( - gt=None, - gte=100.0, - lt=None, - lte=450.0, - ), -) +- **Unparalleled Performance:** Qdrant is built to handle scaling challenges, high throughput, low latency, and efficient indexing. Written in Rust makes Qdrant fast and reliable even under high load. See benchmarks. -``` +- **Versatile Applications:** From recommendation systems to similarity search, Qdrant's integration with Microsoft Azure provides a versatile tool for a diverse set of AI applications. -```typescript -{ - key: 'price', - range: { - gt: null, - gte: 100.0, - lt: null, - lte: 450.0 - } -} +## Getting Started: -``` +Ready to experience the benefits of Qdrant on Azure Marketplace? Getting started is easy: -```rust -use qdrant_client::qdrant::{Condition, Range}; +1. **Visit the Azure Marketplace**: Navigate to [Qdrant's Marketplace listing](https://azuremarketplace.microsoft.com/en-en/marketplace/apps/qdrantsolutionsgmbh1698769709989.qdrant-db). +2. **Deploy Qdrant**: Follow the simple deployment instructions to set up your instance. +3. **Start Using Qdrant**: Once deployed, start exploring the [features and capabilities of Qdrant](/documentation/concepts/) on Azure. +4. **Read Documentation**: Read Qdrant's [Documentation](/documentation/) and build demo apps using [Tutorials](/documentation/tutorials/). 
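(Once a cluster is deployed through the marketplace listing, connecting from the Python client is a few lines; the URL and API key below are placeholders for the values shown in your own Qdrant Cloud console.)

```python
from qdrant_client import QdrantClient

client = QdrantClient(
    url="https://YOUR-CLUSTER-ENDPOINT:6333",  # placeholder endpoint
    api_key="YOUR_API_KEY",                    # placeholder key
)

print(client.get_collections())  # quick connectivity check
```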
-Condition::range( - "price", - Range { - gt: None, - gte: Some(100.0), - lt: None, - lte: Some(450.0), - }, -) +## Join Us on this Exciting Journey: -``` +We're incredibly excited about this collaboration with Azure Marketplace and the opportunities it brings for our users. As we continue to innovate and enhance Qdrant, we invite you to join us on this journey towards greater efficiency, scalability, and success. -```java -import static io.qdrant.client.ConditionFactory.range; +Ready to elevate your business with Qdrant? **Click the banner and get started today!** -import io.qdrant.client.grpc.Points.Range; +[![Get Started on Azure Marketplace](/blog/azure-marketplace/cta.png)](https://azuremarketplace.microsoft.com/en-en/marketplace/apps/qdrantsolutionsgmbh1698769709989.qdrant-db) -range("price", Range.newBuilder().setGte(100.0).setLte(450).build()); +### About Qdrant: -``` +Qdrant is the leading, high-performance, scalable, open-source vector database and search engine, essential for building the next generation of AI/ML applications. Qdrant is able to handle billions of vectors, supports the matching of semantically complex objects, and is implemented in Rust for performance, memory safety, and scale. -```csharp -using static Qdrant.Client.Grpc.Conditions; +<|page-368-lllmstxt|> +> *"We've got a lot of fun challenges ahead of us in the industry, I think, and the industry is establishing best practices. Like you said, everybody's just trying to figure out what's going on. And some of these base layer tools like Qdrant really enable products and enable companies and they enable us.”*\ +-- Robert Caulk +> -Range("price", new Qdrant.Client.Grpc.Range { Gte = 100.0, Lte = 450 }); +Robert, Founder of Emergent Methods is a scientist by trade, dedicating his career to a variety of open-source projects that range from large-scale artificial intelligence to discrete element modeling. He is currently working with a team at Emergent Methods to adaptively model over 1 million news articles per day, with a goal of reducing media bias and improving news awareness. -``` +***Listen to the episode on [Spotify](https://open.spotify.com/episode/7lQnfv0v2xRtFksGAP6TUW?si=Vv3B9AbjQHuHyKIrVtWL3Q), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/0ORi9QJlud0).*** -```go -import "github.com/qdrant/go-client/qdrant" + -qdrant.NewRange("price", &qdrant.Range{ - Gte: qdrant.PtrOf(100.0), - Lte: qdrant.PtrOf(450.0), -}) + -``` +## **Top takeaways:** -The `range` condition sets the range of possible values for stored payload values. -If several values are stored, at least one of them should match the condition. +How do Robert Caulk and Emergent Methods contribute to the open-source community, particularly in AI systems and news article modeling? -Comparisons that can be used: +In this episode, we'll be learning stuff about open-source projects that are reshaping how we interact with AI systems and news article modeling. Robert takes us on an exploration into the evolving landscape of news distribution and the tech making it more efficient and balanced. -- `gt` \- greater than -- `gte` \- greater than or equal -- `lt` \- less than -- `lte` \- less than or equal +Here are some takeaways from this episode: -Can be applied to [float](https://qdrant.tech/documentation/concepts/payload/#float) and [integer](https://qdrant.tech/documentation/concepts/payload/#integer) payloads. +1. 
**Context Matters**: Discover the importance of context engineering in news and how it ensures a diversified and consumable information flow. +2. **Introducing Newscatcher API**: Get the lowdown on how this tool taps into 50,000 news sources for more thorough and up-to-date reporting. +3. **The Magic of Embedding**: Learn about article summarization and semantic search, and how they're crucial for discovering content that truly resonates. +4. **Qdrant & Cloud**: Explore how Qdrant's cloud offering and its single responsibility principle support a robust, modular approach to managing news data. +5. **Startup Superpowers**: Find out why startups have an edge in implementing new tech solutions and how incumbents are tied down by legacy products. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#datetime-range) Datetime Range +> Fun Fact: Did you know that startups' lack of established practices is actually a superpower in the face of new tech paradigms? Legacy products can't keep up! +> -The datetime range is a unique range condition, used for [datetime](https://qdrant.tech/documentation/concepts/payload/#datetime) payloads, which supports RFC 3339 formats. -You do not need to convert dates to UNIX timestaps. During comparison, timestamps are parsed and converted to UTC. +## Show notes: -_Available as of v1.8.0_ +00:00 Intro to Robert and Emergent Methods.\ +05:22 Crucial dedication to scaling context engineering.\ +07:07 Optimizing embedding for semantic similarity in search.\ +13:07 New search technology boosts efficiency and speed.\ +14:17 Reliable cloud provider with privacy and scalability.\ +17:46 Efficient data movement and resource management.\ +22:39 GoLang for services, Rust for security.\ +27:34 Logistics organized; Newscatcher provides up-to-date news.\ +30:27 Tested Weaviate and another in Rust.\ +32:01 Filter updates by starring and user preferences. -jsonpythontypescriptrustjavacsharpgo +## More Quotes from Robert: -```json -{ - "key": "date", - "range": { - "gt": "2023-02-08T10:49:00Z", - "gte": null, - "lt": null, - "lte": "2024-01-31 10:14:31Z" - } -} +*"Web search is powerful, but it's slow and ultimately inaccurate. What we're building is real time indexing and we couldn't do that without Qdrant*”\ +-- Robert Caulk -``` +*"You need to start thinking about persistence and search and making sure those services are robust. That's where Qdrant comes into play. And we found that the all in one solutions kind of sacrifice performance for convenience, or sacrifice accuracy for convenience, but it really wasn't for us. We'd rather just orchestrate it ourselves and let Qdrant do what Qdrant does, instead of kind of just hope that an all in one solution is handling it for us and that allows for modularity performance.”*\ +-- Robert Caulk -```python -models.FieldCondition( - key="date", - range=models.DatetimeRange( - gt="2023-02-08T10:49:00Z", - gte=None, - lt=None, - lte="2024-01-31T10:14:31Z", - ), -) +*"Anyone riding the Qdrant wave is just reaping benefits. It seems monthly, like two months ago, sparse vector support got added. There's just constantly new massive features that enable products.”*\ +-- Robert Caulk -``` +## Transcript: +Demetrios: +Robert, it's great to have you here for the vector space talks. 
I don't know if you're familiar with some of this fun stuff that we do here, but we get to talk with all kinds of experts like yourself on what they're doing when it comes to the vector space and how you've overcome challenges, how you're working through things, because this is a very new field and it is not the most intuitive, as you will tell us more in this upcoming talk. I really am excited because you've been a scientist by trade. Now, you're currently founder at Emergent Methods and you've dedicated your career to a variety of open source projects that range from the large scale AI systems to the discrete element modeling. Now at emergent methods, you are adaptively modeling over 1 million news articles per day. That sounds like a whole lot of news articles. And you've been talking and working through production grade RAG, which is basically everyone's favorite topic these days. So I know you got to talk for us, man. -```typescript -{ - key: 'date', - range: { - gt: '2023-02-08T10:49:00Z', - gte: null, - lt: null, - lte: '2024-01-31T10:14:31Z' - } -} +Demetrios: +I'm going to hand it over to you. I'll bring up your screen right now, and when someone wants to answer or ask a question, feel free to throw it in the chat and I'll jump out at Robert and stop him if needed. -``` +Robert Caulk: +Sure. -```rust -use qdrant_client::qdrant::{Condition, DatetimeRange, Timestamp}; +Demetrios: +Great to have you here, man. I'm excited for this one. -Condition::datetime_range( - "date", - DatetimeRange { - gt: Some(Timestamp::date_time(2023, 2, 8, 10, 49, 0).unwrap()), - gte: None, - lt: None, - lte: Some(Timestamp::date_time(2024, 1, 31, 10, 14, 31).unwrap()), - }, -) +Robert Caulk: +Thanks for having me, Demetrios. Yeah, it's a great opportunity. I love talking about vector spaces, parameter spaces. So to talk on the show is great. We've got a lot of fun challenges ahead of us in the industry, I think, and the industry is establishing best practices. Like you said, everybody's just trying to figure out what's going on. And some of these base layer tools like Qdrant really enable products and enable companies and they enable us. So let me start. -``` +Robert Caulk: +Yeah, like you said, I'm Robert and I'm a founder of emergent methods. Our background, like you said, we are really committed to free and open source software. We started with a lot of narrow AI. Freak AI was one of our original projects, which is AI ML for algo trading very narrow AI, but we came together and built flowdapt. It's a really nice cluster orchestration software, and I'll talk a little bit about that during this presentation. But some of our background goes into, like you said, large scale deep learning for supercomputers. Really cool, interesting stuff. We have some cloud experience. -```java -import static io.qdrant.client.ConditionFactory.datetimeRange; +Robert Caulk: +We really like configuration, so let's dive into it. Why do we actually need to engineer context in the news? There's a lot of reasons why news is important and why it needs to be distributed in a way that's balanced and diversified, but also consumable. Right, let's look at Chat GPT on the left. This is Chat GPT plus it's kind of hanging out searching for Gaza news on Bing, trying to find the top three articles live. Web search is powerful, but it's slow and ultimately inaccurate. 
What we're building is real time indexing and we couldn't do that without Qdrant, and there's a lot of reasons which I'll be perfectly happy to dive into, but eventually Chat GPT will pull something together here. There it is. And the first thing it reports is a 25 day old article with 25 day old news.

-import com.google.protobuf.Timestamp; -import io.qdrant.client.grpc.Points.DatetimeRange; -import java.time.Instant;

+Robert Caulk: +Old news. So it's just inaccurate. So it's borderline dangerous, what's happening here. Right, so this is a very delicate topic. Engineering context in news properly takes a lot of energy, a lot of time and dedication and focus, and not every company really has this sort of resource. So we're talking about enforcing journalistic standards, right? OpenAI and ChatGPT, they just don't have the time and energy to build a dedicated prompt for this sort of thing. It's fine, they're doing great stuff, they're helping you code. But someone needs to step in and really enforce some journalistic standards here.

-long gt = Instant.parse("2023-02-08T10:49:00Z").getEpochSecond(); -long lte = Instant.parse("2024-01-31T10:14:31Z").getEpochSecond();

+Robert Caulk: +And that includes enforcing diversity, languages, regions and sources. If I'm going to read about Gaza, what's happening over there, you can bet I want to know what Egypt is saying and what France is saying and what Algeria is saying. So let's do this right. That's kind of what we're suggesting, and the only way to do that is to parse a lot of articles. That's how you avoid outdated, stale reporting. And that's a real danger, which is kind of what we saw on that first slide. Everyone here knows hallucination is a problem and it's something you got to minimize, especially when you're talking about the news. It's just a really high cost if you get it wrong.

-datetimeRange("date", - DatetimeRange.newBuilder() - .setGt(Timestamp.newBuilder().setSeconds(gt)) - .setLte(Timestamp.newBuilder().setSeconds(lte)) - .build());

+Robert Caulk: +And so you need people dedicated to this. And if you're going to dedicate a ton of resources and a ton of people, you might as well scale that properly. So that's kind of where this comes into. We call this context engineering, news context engineering, to be precise. Before Llama 2, which is also enabling products left and right, as we all know, the traditional pipeline was: chunk it up, take 512 tokens, put it through a translator, put it through DistilBART, do some sentence extraction, and maybe text classification, if you're lucky, get some sentiment out of it, and it works. It gets you something. But after, we're talking about reading full articles, getting real rich context, flexible output, translating, summarizing, really deciding that custom extraction on the fly as your product evolves, and that's something that the traditional pipeline really just doesn't support. Right.

-```

+Robert Caulk: +We're talking being able to, on the fly, say, you know what, actually we want to ask this very particular question of all articles and get this very particular field out. And it's really just a prompt modification. This all is based on having some very high quality, base level, diversified news. And so we'll talk a little bit more. But Newscatcher is one of the sources that we're using, which opens up 50,000 different sources. So check them out. That's newscatcherapi.com. They even give free access to researchers if you're doing research in this.
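(A rough illustration of the prompt-driven enrichment Robert contrasts with the older chunk-and-classify pipeline: one call returns a summary plus whatever extra field the product needs today, so changing the extraction is a prompt change rather than a new pipeline stage. The model name is a placeholder, and any OpenAI-compatible endpoint, such as a local vLLM server, could stand in.)

```python
import json
from openai import OpenAI

llm = OpenAI()  # assumes an OpenAI-compatible endpoint is configured

def enrich_article(article_text: str, extra_question: str) -> dict:
    """Summarize an article and answer one product-specific question in a single call."""
    prompt = (
        "Summarize the following news article in three sentences, then answer the "
        "additional question. Respond as JSON with keys 'summary' and 'extra'.\n"
        f"Question: {extra_question}\n\nArticle:\n{article_text}"
    )
    response = llm.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)
```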
-```csharp -using Qdrant.Client.Grpc; +Robert Caulk: +So I don't want to dive too much into the direct rag stuff. We can go deep, but I'm happy to talk about some examples of how to optimize this and how we've optimized it. Here on the right, you can see the diagram where we're trying to follow along the process of summarizing and embedding. And I'll talk a bit more about that in a moment. It's here to support after we've summarized those articles and we're ready to embed that. Embedding is really important to get that right because like the name of the show suggests you have to have a clean cluster vector space if you're going to be doing any sort of really rich semantic similarity searches. And if you're going to be able to dive deep into extracting important facts out of all 1 million articles a day, you're going to need to do this right. So having a user query which is not equivalent to the embedded page where this is the data, the enriched data that the embedding that we really want to be able to do search on. -Conditions.DatetimeRange( - field: "date", - gt: new DateTime(2023, 2, 8, 10, 49, 0, DateTimeKind.Utc), - lte: new DateTime(2024, 1, 31, 10, 14, 31, DateTimeKind.Utc) -); +Robert Caulk: +And then how do we connect the dots here? Of course, there are many ways to go about it. One way which is interesting and fun to talk about is ide. So that's basically a hypothetical document embedding. And what you do is you use the LLM directly to generate a fake article. And that's what we're showing here on the right. So let's say if the user says, what's going on in New York City government, well, you could say, hey, write me just a hypothetical summary based, it could completely fake and use that to create a fake embedding page and use that for the search. Right. So then you're getting a lot closer to where you want to go. -``` +Robert Caulk: +There's some limitations to this, to it's, there's a computational cost also, it's not updated. It's based on whatever. It's basically diving into what it knows about the New York City government and just creating keywords for you. So there's definitely optimizations here as well. When you talk about ambiguity, well, what if the user follows up and says, well, why did they change the rules? Of course, that's where you can start prompt engineering a little bit more and saying, okay, given this historic conversation and the current question, give me some explicit question without ambiguity, and then do the high, if that's something you want to do. The real goal here is to stay in a single parameter space, a single vector space. Stay as close as possible when you're doing your search as when you do your embedding. So we're talking here about production scale of stuff. -```go -import ( - "time" +Robert Caulk: +So I really am happy to geek out about the stack, the open source stack that we're relying on, which includes Qdrant here. But let's start with VLLM. I don't know if you guys have heard of it. This is a really great new project, and their focus on continuous batching and page detention. And if I'm being completely honest with you, it's really above my pay grade in the technicals and how they're actually implementing all of that inside the GPU memory. But what we do is we outsource that to that project and we really like what they're doing, and we've seen really good results. It's increasing throughput. So when you're talking about trying to parse through a million articles, you're going to need a lot of throughput. 
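(The hypothetical document embedding idea Robert describes can be sketched roughly as below: let the LLM write a fake article for the question, then embed and search with that instead of the raw question, so the query lands closer to real articles in the same vector space. Collection and model names are illustrative, and `embed` stands for whatever embedding function indexed the articles.)

```python
from openai import OpenAI
from qdrant_client import QdrantClient

llm = OpenAI()                       # assumes an OpenAI-compatible endpoint
qdrant = QdrantClient("localhost", port=6333)

def hyde_search(question: str, embed, limit: int = 5):
    """HyDE-style retrieval: search with the embedding of a hypothetical article."""
    fake_article = llm.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{
            "role": "user",
            "content": f"Write a short, plausible news summary answering: {question}",
        }],
    ).choices[0].message.content

    return qdrant.search(
        collection_name="news",            # illustrative collection name
        query_vector=embed(fake_article),  # same embedding model used for the articles
        limit=limit,
    )
```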
- "github.com/qdrant/go-client/qdrant" - "google.golang.org/protobuf/types/known/timestamppb" -) +Robert Caulk: +The other is text embedding inference. This is a great server. A lot of vector databases will say, okay, we'll do all the embedding for you and we'll do all everything. But when you move to production scale, I'll talk a bit about this later. You need to be using micro service architecture, so it's not super smart to have your database bogged down with doing sorting out the embeddings and sorting out other things. So honestly, I'm a real big fan of single responsibility principle, and that's what Tei does for you. And it also does dynamic batching, which is great in this world where everything is heterogeneous lengths of what's coming in and what's going out. So it's great. -qdrant.NewDatetimeRange("date", &qdrant.DatetimeRange{ - Gt: timestamppb.New(time.Date(2023, 2, 8, 10, 49, 0, 0, time.UTC)), - Lte: timestamppb.New(time.Date(2024, 1, 31, 10, 14, 31, 0, time.UTC)), -}) +Robert Caulk: +It really simplifies the process and allows you to isolate resources. But now the star of the show Qdrant, it's really come into its own. Anyone riding the Qdrant wave is just reaping benefits. It seems monthly, like two months ago, sparse vector support got added. There's just constantly new massive features that enable products. Right. So for us, we're doing so much up Cert, we really need to minimize client connections and networking overhead. So you got that batch up cert. -``` +Robert Caulk: +The filters are huge. We're talking about real time filtering. We can't be searching on news articles from a month ago, two months ago, if the user is asking for a question that's related to the last 24 hours. So having that timestamp filtering and having it be efficient, which is what it is in Qdrant, is huge. Keyword filtering really opens up a massive realm of product opportunities for us. And then the sparse vectors, we hopped on this train immediately and are just seeing benefits. I don't want to say replacement of elasticsearch, but elasticsearch is using sparse vectors as well. So you can add splade into elasticsearch, and splade is great. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#uuid-match) UUID Match +Robert Caulk: +It's a really great alternative to BM 25. It's based on that architecture, and that really opens up a lot of opportunities for filtering out keywords that are kind of useless to the search when the user uses the and a, and then there, these words that are less important splays a bit of a hybrid into semantics, but sparse retrieval. So it's really interesting. And then the idea of hybrid search with semantic and a sparse vector also opens up the ability to do ranking, and you got a higher quality product at the end, which is really the goal, right, especially in production. Point number four here, I would say, is probably one of the most important to us, because we're dealing in a world where latency is king, and being able to deploy Qdrant inside of the same cluster as all the other services. So we're just talking through the switch. That's huge. We're never getting bogged down by network. -_Available as of v1.11.0_ +Robert Caulk: +We're never worried about a cloud provider potentially getting overloaded or noisy neighbor problems, stuff like that, completely removed. And then you got high privacy, right. All the data is completely isolated from the external world. So this point number four, I'd say, is one of the biggest value adds for us. 
But then distributing deployment is huge because high availability is important, and deep storage, which when you're in the business of news archival, and that's one of our main missions here, is archiving the news forever. That's an ever growing database, and so you need a database that's going to be able to grow with you as your data grows. So what's the TLDR to this context? Engineering? Well, service orchestration is really just based on service orchestration in a very heterogeneous and parallel event driven environment. On the right side, we've got the user requests coming in. -Matching of UUID values works similarly to the regular `match` condition for strings. -Functionally, it will work with `keyword` and `uuid` indexes exactly the same, but `uuid` index is more memory efficient. +Robert Caulk: +They're hitting all the same services, which every five minutes or every two minutes, whatever you've scheduled the scrape workflow on, also hitting the same services, this requires some orchestration. So that's kind of where I want to move into discussing the real production, scaling, orchestration of the system and how we're doing that. Provide some diagrams to show exactly why we're using the tools we're using here. This is an overview of our Kubernetes cluster with the services that we're using. So it's a bit of a repaint of the previous diagram, but a better overview about showing kind of how these things are connected and why they're connected. I'll go through one by one on these services to just give a little deeper dive into each one. But the goal here is for us, in our opinion, microservice orchestration is key. Sticking to single responsibility principle. -jsonpythontypescriptrustjavacsharpgo +Robert Caulk: +Open source projects like Qdrant, like Tei, like VLLM and Kubernetes, it's huge. Kubernetes is opening up doors for security and for latency. And of course, if you're going to be getting involved in this game, you got to find the strong DevOps. There's no escaping that. So let's step through kind of piece by piece and talk about flow Dapp. So that's our project. That's our open source project. We've spent about two years building this for our needs, and we're really excited because we did a public open sourcing maybe last week or the week before. -```json -{ - "key": "uuid", - "match": { - "value": "f47ac10b-58cc-4372-a567-0e02b2c3d479" - } -} +Robert Caulk: +So finally, after all of our testing and rewrites and refactors, we're open. We're open for business. And it's running asknews app right now, and we're really excited for where it's going to go and how it's going to help other people orchestrate their clusters. Our goal and our priorities were highly paralyzed compute and we were running tests using all sorts of different executors, comparing them. So when you use Flowdapt, you can choose ray or dask. And that's key. Especially with vanilla Python, zero code changes, you don't need to know how ray or dask works. In the back end, flowdapt is vanilla Python. -``` +Robert Caulk: +That was a key goal for us to ensure that we're optimizing how data is moving around the cluster. Automatic resource management this goes back to Ray and dask. They're helping manage the resources of the cluster, allocating a GPU to a task, or allocating multiple tasks to one GPU. These can come in very, very handy when you're dealing with very heterogeneous workloads like the ones that we discussed in those previous slides. 
For us, the biggest priority was ensuring rapid prototyping and debugging locally. When you're dealing with clusters of 1015 servers, 40 or 5100 with ray, honestly, ray just scales as far as you want. So when you're dealing with that big of a cluster, it's really imperative that what you see on your laptop is also what you are going to see once you deploy. And being able to debug anything you see in the cluster is big for us, we really found the need for easy cluster wide data sharing methods between tasks. -```python -models.FieldCondition( - key="uuid", - match=models.MatchValue(value="f47ac10b-58cc-4372-a567-0e02b2c3d479"), -) +Robert Caulk: +So essentially what we've done is made it very easy to get and put values. And so this makes it extremely easy to move data and share data between tasks and make it highly available and stay in cluster memory or persist it to disk, so that when you do the inevitable version update or debug, you're reloading from a persisted state in the real time. News business scheduling is huge. Scheduling, making sure that various workflows are scheduled at different points and different periods or frequencies rather, and that they're being scheduled correctly, and that their triggers are triggering exactly what you need when you need it. Huge for real time. And then one of our biggest selling points, if you will, for this project is Kubernetes style. Everything. Our goal is everything's Kubernetes style, so that if you're coming from Kubernetes, everything's familiar, everything's resource oriented. -``` +Robert Caulk: +We even have our own flowectl, which would be the Kubectl style command schemas. A lot of what we've done is ensuring deployment cycle efficiency here. So the goal is that flowdapt can schedule everything and manage all these services for you, create workflows. But why these services? For this particular use case, I'll kind of skip through quickly. I know I'm kind of running out of time here, but of course you're going to need some proprietary remote models. That's just how it works. You're going to of course share that load with on premise llms to reduce cost and to have some reasoning engine on premise. But there's obviously advantages and disadvantages to these. -```typescript -{ - key: 'uuid', - match: {value: 'f47ac10b-58cc-4372-a567-0e02b2c3d479'} -} +Robert Caulk: +I'm not going to go through them. I'm happy to make these slides available, and you're welcome to kind of parse through the details. Yeah, for sure. You need to start thinking about persistence and search and making sure those services are robust. That's where Qdrant comes into play. And we found that the all in one solutions kind of sacrifice performance for convenience, or sacrifice accuracy for convenience, but it really wasn't for us. We'd rather just orchestrate it ourselves and let Qdrant do what Qdrant does, instead of kind of just hope that an all in one solution is handling it for us and that allows for modularity performance. And we'll dump Qdrant if we want to. -``` +Robert Caulk: +Probably we won't. Or we'll dump if we need to, or we'll swap out for whatever replaces vllm. Trying to keep things modular so that future engineers are able to adapt with the tech that's just blowing up and exploding right now. Right. The last thing to talk about here in a production scale environment is really minimizing the latency. I touched on this with Kubernetes ensuring that these services are sitting on the same network, and that is huge. 
But that talks about decommunication latency. But when you start talking about getting hit with a ton of traffic, production scale, tons of people asking a question all simultaneously, and you needing to go hit a variety of services, well, this is where you really need to isolate that to an asynchronous environment. -```rust -Condition::matches("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479".to_string()) +Robert Caulk: +And of course, if you could write this all in Golang, that's probably going to be your best bet for us. We have some services written in Golang, but predominantly, especially the endpoints that the ML engineers need to work with. We're using fast API on pydantic and honestly, it's powerful. Pydantic V 2.0 now runs on Rust, and as anyone in the Qdrant community knows, Rust is really valuable when you're dealing with highly parallelized environments that require high security and protections for immutability and atomicity. Forgive me for the pronunciation, that kind of sums up the production scale talk, and I'm happy to answer questions. I love diving into this sort of stuff. I do have some just general thoughts on why startups are so much more well positioned right now than some of these incumbents, and I'll just do kind of a quick run through, less than a minute just to kind of get it out there. We can talk about it, see if we agree or disagree. -``` +Robert Caulk: +But you touched on it, Demetrios, in the introduction, which was the best practices have not been established. That's it. That is why startups have such a big advantage. And the reason they're not established is because, well, the new paradigm of technology is just underexplored. We don't really know what the limits are and how to properly handle these things. And that's huge. Meanwhile, some of these incumbents, they're dealing with all sorts of limitations and resistance to change and stuff, and then just market expectations for incumbents maintaining these kind of legacy products and trying to keep them hobbling along on this old tech. In my opinion, startups, you got your reasoning engine building everything around a reasoning engine, using that reasoning engine for every aspect of your system to really open up the adaptivity of your product. -```java -matchKeyword("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479"); +Robert Caulk: +And okay, I won't put elasticsearch in the incumbent world. I'll keep elasticsearch in the middle. I understand it still has a lot of value, but some of these vendor lock ins, not a huge fan of. But anyway, that's it. That's kind of all I have to say. But I'm happy to take questions or chat a bit. -``` +Demetrios: +Dude, I've got so much to ask you and thank you for breaking down that stack. That is like the exact type of talk that I love to see because you open the kimono full on. And I was just playing around with asknews app. And so I think it's probably worth me sharing my screen just to show everybody what exactly that is and how that looks at the moment. So you should be able to see it now. Right? And super cool props to you for what you've built. Because I went, and intuitively I was able to say like, oh, cool, I can change, I can see positive news, and I can go by the region that I'm looking at. I want to make sure that I'm checking out all the stuff in Europe or all the stuff in America categories. 
-```csharp -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +I can look at sports, blah blah blah, like as if you were flipping the old newspaper and you could go to the sports section or the finance section, and then you cite the sources and you see like, oh, what's the trend in the coverage here? What kind of coverage are we getting? Where are we at in the coverage cycle? Probably something like that. And then, wait, although I was on the happy news, I thought murder, she wrote. So anyway, what we do is we. -MatchKeyword("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479"); +Robert Caulk: +Actually sort it from we take the poll and we actually just sort most positive to the least positive. But you're right, we were talking the other day, we're like, let's just only show the positive. But yeah, that's a good point. -``` +Demetrios: +There you go. -```go -import "github.com/qdrant/go-client/qdrant" +Robert Caulk: +Murder, she wrote. -qdrant.NewMatch("uuid", "f47ac10b-58cc-4372-a567-0e02b2c3d479") +Demetrios: +But the one thing that I was actually literally just yesterday talking to someone about was how you update things inside of your vector database. So I can imagine that news, as you mentioned, news cycles move very fast and the news that happened 2 hours ago is very different. The understanding of what happened in a very big news event is very different 2 hours ago than it is right now. So how do you make sure that you're always pulling the most current and up to date information? -``` +Robert Caulk: +This is another logistical point that we think needs to get sorted properly and there's a few layers to it. So for us, as we're parsing that data coming in from Newscatcher, so newscatcher is doing a good job of always feeding the latest buckets to us. Sometimes one will be kind of arrive, but generally speaking, it's always the latest news. So we're taking five minute buckets, and then with those buckets, we're going through and doing all of our enrichment on that, adding it to Qdrant. And that is the point where we use that timestamp filtering, which is such an important point. So in the metadata of Qdrant, we're using the range filter, which is where we call that the timestamp filter, but it's really range filter, and that helps. So when we're going back to update things, we're sorting and ensuring that we're filtering out only what we haven't seen. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#geo) Geo +Demetrios: +Okay, that makes complete sense. And basically you could generalize this to something like what I was talking to with people yesterday about, which was, hey, I've got an HR policy that gets updated every other month or every quarter, and I want to make sure that if my HR chatbot is telling people what their vacation policy is, it's pulling from the most recent HR policy. So how do I make sure and do that? And how do I make sure that my vector database isn't like a landmine where it's pulling any information, but we don't necessarily have that control to be able to pull the correct information? And this comes down to that retrieval evaluation, which is such a hot topic, too. -#### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#geo-bounding-box) Geo Bounding Box +Robert Caulk: +That's true. No, I think that's a key piece of the puzzle. Now, in that particular example, maybe you actually want to go in and start cleansing a bit, your database, just to make sure if it's really something you're never going to need again. 
You got to get rid of it. This is a piece I didn't add to the presentation, but it's tangential. You got to keep multiple databases and you got to making sure to isolate resources and cleaning out a database, especially in real time. So ensuring that your database is representative of what you want to be searching on. And you can do this with collections too, if you want. -jsonpythontypescriptrustjavacsharpgo +Robert Caulk: +But we find there's sometimes a good opportunity to isolate resources in that sense, 100%. -```json -{ - "key": "location", - "geo_bounding_box": { - "bottom_right": { - "lon": 13.455868, - "lat": 52.495862 - }, - "top_left": { - "lon": 13.403683, - "lat": 52.520711 - } - } -} +Demetrios: +So, another question that I had for you was, I noticed Mongo was in the stack. Why did you not just use the Mongo vector option? Is it because of what you were mentioning, where it's like, yeah, you have these all-in-one options, but you sacrifice that performance for the convenience? -``` +Robert Caulk: +We didn't test that, to be honest, I can't say. All I know is we tested weaviate, we tested one other, and I just really like. Although I was going to say I like that it's written in rust, although I believe Mongo is also written in rust, if I'm not mistaken. But for us, the document DB is more of a representation of state and what's happening, especially for our configurations and workflows. Meanwhile, we really like keeping and relying on Qdrant and all the features. Qdrant is updating, so, yeah, I'd say single responsibility principle is key to that. But I saw some chat in Qdrant discord about this, which I think the only way to use vector is actually to use their cloud offering, if I'm not mistaken. Do you know about this? -```python -models.FieldCondition( - key="location", - geo_bounding_box=models.GeoBoundingBox( - bottom_right=models.GeoPoint( - lon=13.455868, - lat=52.495862, - ), - top_left=models.GeoPoint( - lon=13.403683, - lat=52.520711, - ), - ), -) +Demetrios: +Yeah, I think so, too. -``` +Robert Caulk: +This would also be a piece that we couldn't do. -```typescript -{ - key: 'location', - geo_bounding_box: { - bottom_right: { - lon: 13.455868, - lat: 52.495862 - }, - top_left: { - lon: 13.403683, - lat: 52.520711 - } - } -} +Demetrios: +Yeah. Where it's like it's open source, but not open source, so that makes sense. Yeah. This has been excellent, man. So I encourage anyone who is out there listening, check out again this is asknews app, and stay up to date with the most relevant news in your area and what you like. And I signed in, so I'm guessing that when I sign in, it's going to tweak my settings. Am I going to be able. -``` +Robert Caulk: +Good question. -```rust -use qdrant_client::qdrant::{Condition, GeoBoundingBox, GeoPoint}; +Demetrios: +Catch this next time. -Condition::geo_bounding_box( - "location", - GeoBoundingBox { - bottom_right: Some(GeoPoint { - lon: 13.455868, - lat: 52.495862, - }), - top_left: Some(GeoPoint { - lon: 13.403683, - lat: 52.520711, - }), - }, -) +Robert Caulk: +Well, at the moment, if you star a story, a narrative that you find interesting, then you can filter on the star and whatever the latest updates are, you'll get it for that particular story. Okay. 
It brings up another point about Qdrant, which is at the moment we're not doing it yet, but we have plans to use the recommendation system for letting a user kind of create their profile by just saying what they like, what they don't like, and then using the recommender to start recommending stories that they may or may not like. And that's us outsourcing the Qdrant almost entirely. Right. It's just us building around it. So that's nice. + +Demetrios: +Yeah. That makes life a lot easier, especially knowing recommender systems. Yeah, that's excellent. + +Robert Caulk: +Thanks. I appreciate that. For sure. And I'll try to make the slides available. I don't know if I can send them to the two Qdrant or something. They could post them in the discord maybe, for sure. + +Demetrios: +And we can post them in the link in the description of this talk. So this has been excellent. Rob, I really appreciate you coming on here and chatting with me about this, and thanks for breaking down everything that you're doing. I also love the VllM project. It's blowing up. It's cool to see so much usage and all the good stuff that you're doing with it. And yeah, man, for anybody that wants to follow along on your journey, we'll drop a link to your LinkedIn so that they can connect with you and. -``` +Robert Caulk: +Cool. -```java -import static io.qdrant.client.ConditionFactory.geoBoundingBox; +Demetrios: +Thank you. -geoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868); +Robert Caulk: +Thanks for having me. Demetrios, talk to you later. -``` +Demetrios: +Catch you later, man. Take care. -```csharp -using static Qdrant.Client.Grpc.Conditions; +<|page-369-lllmstxt|> +> *"There is this really great vector db comparison that came out recently. I saw there are like maybe more than 40 vector stores in 2024. When we started back in 2023, there were only a few. What I see, which is really lacking in this pipeline of retrieval augmented generation is major innovation around data pipeline.”*\ +-- Hooman Sedghamiz +> -GeoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868); +Hooman Sedghamiz, Sr. Director AI/ML - Insights at Bayer AG is a distinguished figure in AI and ML in the life sciences field. With years of experience, he has led teams and projects that have greatly advanced medical products, including implantable and wearable devices. Notably, he served as the Generative AI product owner and Senior Director at Bayer Pharmaceuticals, where he played a pivotal role in developing a GPT-based central platform for precision medicine. -``` +In 2023, he assumed the role of Co-Chair for the EMNLP 2023 GEM industrial track, furthering his contributions to the field. Hooman has also been an AI/ML advisor and scientist at the University of California, San Diego, leveraging his expertise in deep learning to drive biomedical research and innovation. His strengths lie in guiding data science initiatives from inception to commercialization and bridging the gap between medical and healthcare applications through MLOps, LLMOps, and deep learning product management. Engaging with research institutions and collaborating closely with Dr. Nemati at Harvard University and UCSD, Hooman continues to be a dynamic and influential figure in the data science community. -```go -import "github.com/qdrant/go-client/qdrant" +***Listen to the episode on [Spotify](https://open.spotify.com/episode/2oj2ne5l9qrURQSV0T1Hft?si=DMJRTAt7QXibWiQ9CEKTJw), Apple Podcast, Podcast addicts, Castbox. 
You can also watch this episode on [YouTube](https://youtu.be/yfzLaH5SFX0).*** -qdrant.NewGeoBoundingBox("location", 52.520711, 13.403683, 52.495862, 13.455868) + -``` + -It matches with `location` s inside a rectangle with the coordinates of the upper left corner in `bottom_right` and the coordinates of the lower right corner in `top_left`. +## **Top takeaways:** -#### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#geo-radius) Geo Radius +Why is real-time evaluation critical in maintaining the integrity of chatbot interactions and preventing issues like promoting competitors or making false promises? What strategies do developers employ to minimize cost while maximizing the effectiveness of model evaluations, specifically when dealing with LLMs? These might be just some of the many questions people in the industry are asking themselves. We aim to cover most of it in this talk. -jsonpythontypescriptrustjavacsharpgo +Check out their conversation as they peek into world of AI chatbot evaluations. Discover the nuances of ensuring your chatbot's quality and continuous improvement across various metrics. -```json -{ - "key": "location", - "geo_radius": { - "center": { - "lon": 13.403683, - "lat": 52.520711 - }, - "radius": 1000.0 - } -} +Here are the key topics of this episode: -``` +1. **Evaluating Chatbot Effectiveness**: An exploration of systematic approaches to assess chatbot quality across various stages, encompassing retrieval accuracy, response generation, and user satisfaction. +2. **Importance of Real-Time Assessment**: Insights into why continuous and real-time evaluation of chatbots is essential to maintain integrity and ensure they function as designed without promoting undesirable actions. +3. **Indicators of Compromised Systems**: Understand the significance of identifying behaviors that suggest a system may be prone to 'jailbreaking' and the methods available to counter these through API integration. +4. **Cost-Effective Evaluation Models**: Discussion on employing smaller models for evaluation to reduce costs without compromising the depth of analysis, focusing on failure cases and root-cause assessments. +5. **Tailored Evaluation Metrics**: Emphasis on the necessity of customizing evaluation criteria to suit specific use case requirements, including an exploration of the different metrics applicable to diverse scenarios. -```python -models.FieldCondition( - key="location", - geo_radius=models.GeoRadius( - center=models.GeoPoint( - lon=13.403683, - lat=52.520711, - ), - radius=1000.0, - ), -) +>Fun Fact: Large language models like Mistral, Llama, and Nexus Raven have improved in their ability to perform function calling with low hallucination and high-quality output. 
-``` +> -```typescript -{ - key: 'location', - geo_radius: { - center: { - lon: 13.403683, - lat: 52.520711 - }, - radius: 1000.0 - } -} +## Show notes: -``` +00:00 Introduction to Bayer AG\ +05:15 Drug discovery, trial prediction, medical virtual assistants.\ +10:35 New language models like Llama rival GPT 3.5.\ +12:46 Large language model solving, efficient techniques, open source.\ +16:12 Scaling applications for diverse, individualized models.\ +19:02 Open source offers multilingual embedding.\ +25:06 Stability improved, reliable function calling capabilities emerged.\ +27:19 Platform aims for efficiency, measures impact.\ +31:01 Build knowledge discovery tool, measure value\ +33:10 Wrap up -```rust -use qdrant_client::qdrant::{Condition, GeoPoint, GeoRadius}; +## More Quotes from Hooman: -Condition::geo_radius( - "location", - GeoRadius { - center: Some(GeoPoint { - lon: 13.403683, - lat: 52.520711, - }), - radius: 1000.0, - }, -) +*"I think there has been concentration around vector stores. So a lot of startups that have appeared around vector store idea, but I think what really is lacking are tools that you have a lot of sources of knowledge, information.*”\ +-- Hooman Sedghamiz -``` +*"You can now kind of take a look and see that the performance of them is really, really getting close, if not better than GPT 3.5 already at same level and really approaching step by step to GPT 4.”*\ +-- Hooman Sedghamiz in advancements in language models -```java -import static io.qdrant.client.ConditionFactory.geoRadius; +*"I think the biggest, I think the untapped potential, it goes back to when you can do scientific discovery and all those sort of applications which are more challenging, not just around the efficiency and all those sort of things.”*\ +-- Hooman Sedghamiz -geoRadius("location", 52.520711, 13.403683, 1000.0f); +## Transcript: +Demetrios: +We are here and I couldn't think of a better way to spend my Valentine's Day than with you Hooman this is absolutely incredible. I'm so excited for this talk that you're going to bring and I want to let everyone that is out there listening know what caliber of a speaker we have with us today because you have done a lot of stuff. Folks out there do not let this man's young look fool you. You look like you are not in your fifty's or sixty's. But when it comes to your bio, it looks like you should be in your seventy's. I am very excited. You've got a lot of experience running data science projects, ML projects, LLM projects, all that fun stuff. You're working at Bayern Munich, sorry, not Bayern Munich, Bayer AG. And you're the senior director of AI and ML. -``` -```csharp -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +And I think that there is a ton of other stuff that you've done when it comes to machine learning, artificial intelligence. You've got both like the traditional ML background, I think, and then you've also got this new generative AI background and so you can leverage both. But you also think about things in data engineering way. You understand the whole lifecycle. And so today we get to talk all about some of this fun. I know you've got some slides prepared for us. I'll let you throw those on and I'll let anyone else in the chat. Feel free to ask questions while Hooman is going through the presentation and I'll jump in and stop them when needed. -GeoRadius("location", 52.520711, 13.403683, 1000.0f); -``` +Demetrios: +But also we can have a little discussion after a few minutes of slides. 
So for everyone looking, we're going to be watching this and then we're going to be checking out like really talking about what 2024 AI in the enterprise looks like and what is needed to really take advantage of that. So Hooman, I'm dropping off to you, man, and I'll jump in when needed. -```go -import "github.com/qdrant/go-client/qdrant" -qdrant.NewGeoRadius("location", 52.520711, 13.403683, 1000.0) +Hooman Sedghamiz: +Thanks a lot for the introduction. Let me get started. Do you have my screen already? -``` +Demetrios: +Yeah, we see it. -It matches with `location` s inside a circle with the `center` at the center and a radius of `radius` meters. +Hooman Sedghamiz: +Okay, perfect. All right, so hopefully I can change the slides. Yes, as you said, first, thanks a lot for spending your day with me. I know it's Valentine's Day, at least here in the US people go crazy when it gets Valentine's. But I know probably a lot of you are in love with large language models, semantic search and all those sort of things, so it's great to have you here. Let me just start with the. I have a lot of slides, by the way, but maybe I can start with kind of some introduction about the company I work for, what these guys are doing and what we are doing at a life science company like Bayer, which is involved in really major humanity needs, right? So health and the food chain and like agriculture, we do three major kind of products or divisions in the company, mainly consumer halls, over the counter medication that probably a lot of you have taken, aspirin, all those sort of good stuff. And we have crop science division that works on ensuring that the yield is high for crops and the food chain is performing as it should, and also pharmaceutical side which is around treatment and prevention. -If several values are stored, at least one of them should match the condition. -These conditions can only be applied to payloads that match the [geo-data format](https://qdrant.tech/documentation/concepts/payload/#geo). +Hooman Sedghamiz: +So now you can imagine via is really important to us because it has the potential of unlocking a future where good health is a reality and hunger is a memory. So I maybe start about maybe giving you a hint of what are really the numerous use cases that AI or challenges that AI could help out with. In life science industry. You can think of adverse event detection when patients are taking a medication, too much of it. The patients might report adverse events, stomach bleeding and go to social media post about it. A few years back, it was really difficult to process automatically all this sort of natural text in a kind of scalable manner. But nowadays, thanks to large language models, it's possible to automate this and identify if there is a medication or anything that might have negatively an adverse event on a patient population. Similarly, you can now create a lot of marketing content using these large language models for products. -#### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#geo-polygon) Geo Polygon +Hooman Sedghamiz: +At the same time, drug discovery is making really big strides when it comes to identifying new compounds. You can essentially describe these compounds using formats like smiles, which could be represented as real text. And these large language models can be trained on them and they can predict the sequences. At the same time, you have this clinical trial outcome prediction, which is huge for pharmaceutical companies. 
If you could predict what will be the outcome of a trial, it would be a huge time and resource saving for a lot of companies. And of course, a lot of us already see in the market a lot of medical virtual assistants using large language models that can answer medical inquiries and give consultations around them. And there is really, I believe the biggest potential here is around real world data, like most of us nowadays, have some sort of sensor or watch that's measuring our health maybe at a minute by minute level, or it's measuring our heart rate. You go to the hospital, you have all your medical records recorded there, and these large language models have their capacity to process this complex data, and you will be able to drive better insights for individualized insights for patients. -Geo Polygons search is useful for when you want to find points inside an irregularly shaped area, for example a country boundary or a forest boundary. A polygon always has an exterior ring and may optionally include interior rings. A lake with an island would be an example of an interior ring. If you wanted to find points in the water but not on the island, you would make an interior ring for the island. +Hooman Sedghamiz: +And our company is also in crop science, as I mentioned, and crop yield prediction. If you could help farmers improve their crop yield, it means that they can produce better products faster with higher quality. So maybe I could start with maybe a history in 2023, what happened? How companies like ours were looking at large language models and opportunities. They bring, I think in 2023, everyone was excited to bring these efficiency games, right? Everyone wanted to use them for creating content, drafting emails, all these really low hanging fruit use cases. That was around. And one of the earlier really nice architectures that came up that I really like was from a 16 z enterprise that was, I think, back in really, really early 2023. LangChain was new, we had land chain and we had all this. Of course, Qdrant been there for a long time, but it was the first time that you could see vector store products could be integrated into applications. -When defining a ring, you must pick either a clockwise or counterclockwise ordering for your points. The first and last point of the polygon must be the same. +Hooman Sedghamiz: +Really at large scale. There are different components. It's quite complex architecture. So on the right side you see how you can host large language models. On the top you see how you can augment them using external data. Of course, we had these plugins, right? So you can connect these large language models with Google search APIs, all those sort of things, and some validation that are in the middle that you could use to validate the responses fast forward. Maybe I can kind of spend, let me check out the time. Maybe I can spend a few minutes about the components of LLM APIs and hosting because that I think has a lot of potential in terms of applications that need to be really scalable. -Currently, we only support unprojected global coordinates (decimal degrees longitude and latitude) and we are datum agnostic. +Hooman Sedghamiz: +Just to give you some kind of maybe summary about my company, we have around 100,000 people in almost all over the world. Like the languages that people speak are so diverse. So it makes it really difficult to build an application that will serve 200,000 people. And it's kind of efficient. It's not really costly and all those sort of things. 
So maybe I can spend a few minutes talking about what that means and how kind of larger scale companies might be able to tackle that efficiently. So we have, of course, out of the box solutions, right? So you have Chat GPT already for enterprise, you have other copilots and for example from Microsoft and other companies that are offering, but normally they are seat based, right? So you kind of pay a subscription fee, like Spotify, you pay like $20 per month, $30 on average, somewhere between $20 to $60. And for a company, like, I was like, just if you calculate that for 3000 people, that means like 180,000 per month in subscription fees. -jsonpythontypescriptrustjavacsharpgo +Hooman Sedghamiz: +And we know that most of the users won't use that. We know that it's a usage based application. You just probably go there. Depending on your daily work, you probably use it. Some people don't use it heavily. I kind of did some calculation. If you build it in house using APIs that you can access yourself, and large language models that corporations can deploy internally and locally, that cost saving could be huge, really magnitudes cheaper, maybe 30 to 20 to 30 times cheaper. So looking, comparing 2024 to 2023, a lot of things have changed. -```json +Hooman Sedghamiz: +Like if you look at the open source large language models that came out really great models from Mistral, now we have models like Llama, two based model, all of these models came out. You can now kind of take a look and see that the performance of them is really, really getting close, if not better than GPT 3.5 already at same level and really approaching step by step to GPT 4. And looking at the price on the right side and speed or throughput, you can see that like for example, Mistral seven eight B could be a really cheap option to deploy. And also the performance of it gets really close to GPT 3.5 for many use cases in the enterprise companies. I think two of the big things this year, end of last year that came out that make this kind of really a reality are really a few large language models. I don't know if I can call them large language models. They are like 7 billion to 13 billion compared to GPT four, GT 3.5. I don't think they are really large. -{ - "key": "location", - "geo_polygon": { - "exterior": { - "points": [\ - { "lon": -70.0, "lat": -70.0 },\ - { "lon": 60.0, "lat": -70.0 },\ - { "lon": 60.0, "lat": 60.0 },\ - { "lon": -70.0, "lat": 60.0 },\ - { "lon": -70.0, "lat": -70.0 }\ - ] - }, - "interiors": [\ - {\ - "points": [\ - { "lon": -65.0, "lat": -65.0 },\ - { "lon": 0.0, "lat": -65.0 },\ - { "lon": 0.0, "lat": 0.0 },\ - { "lon": -65.0, "lat": 0.0 },\ - { "lon": -65.0, "lat": -65.0 }\ - ]\ - }\ - ] - } -} +Hooman Sedghamiz: +But one was Nexus Raven. We know that applications, if they want to be robust, they really need function calling. We are seeing this paradigm of function calling, which essentially you ask a language model to generate structured output, you give it a function signature, right? You ask it to generate an output, structured output argument for that function. Next was Raven came out last year, that, as you can see here, really is getting really close to GPT four, right? And GPT four being magnitude bigger than this model. This model only being 13 billion parameters really provides really less hallucination, but at the same time really high quality of function calling. 
So this makes me really excited for the open source and also the companies that want to build their own applications that requires function calling. That was really lacking maybe just five months ago. At the same time, we have really dedicated large language models to programming languages or scripting like SQL, that we are also seeing like SQL coder that's already beating GPT four. -``` +Hooman Sedghamiz: +So maybe we can now quickly take a look at how model solving will look like for a large company like ours, like companies that have a lot of people across the globe again, in this aspect also, the community has made really big progress, right? So we have text generation inference from hugging face is open source for most purposes, can be used and it's the choice of mine and probably my group prefers this option. But we have Olama, which is great, a lot of people are using it. We have llama CPP which really optimizes the large language models for local deployment as well, and edge devices. I was really amazed seeing Raspberry PI running a large language model, right? Using Llama CPP. And you have this text generation inference that offers quantization support, continuous patching, all those sort of things that make these large LLMs more quantized or more compressed and also more suitable for deployment to large group of people. Maybe I can kind of give you kind of a quick summary of how, if you decide to deploy these large language models, what techniques you could use to make them more efficient, cost friendly and more scalable. So we have a lot of great open source projects like we have Lite LLM which essentially creates an open AI kind of signature on top of your large language models that you have deployed. Let's say you want to use Azure to host or to access GPT four gypty 3.5 or OpenAI to access OpenAI API. -```python -models.FieldCondition( - key="location", - geo_polygon=models.GeoPolygon( - exterior=models.GeoLineString( - points=[\ - models.GeoPoint(\ - lon=-70.0,\ - lat=-70.0,\ - ),\ - models.GeoPoint(\ - lon=60.0,\ - lat=-70.0,\ - ),\ - models.GeoPoint(\ - lon=60.0,\ - lat=60.0,\ - ),\ - models.GeoPoint(\ - lon=-70.0,\ - lat=60.0,\ - ),\ - models.GeoPoint(\ - lon=-70.0,\ - lat=-70.0,\ - ),\ - ] - ), - interiors=[\ - models.GeoLineString(\ - points=[\ - models.GeoPoint(\ - lon=-65.0,\ - lat=-65.0,\ - ),\ - models.GeoPoint(\ - lon=0.0,\ - lat=-65.0,\ - ),\ - models.GeoPoint(\ - lon=0.0,\ - lat=0.0,\ - ),\ - models.GeoPoint(\ - lon=-65.0,\ - lat=0.0,\ - ),\ - models.GeoPoint(\ - lon=-65.0,\ - lat=-65.0,\ - ),\ - ]\ - )\ - ], - ), -) +Hooman Sedghamiz: +To access those, you could put them behind Lite LLM. You could have models using hugging face that are deployed internally, you could put lightlm in front of those, and then your applications could just use OpenAI, Python SDK or anything to call them naturally. And then you could simply do load balancing between those. Of course, we have also, as I mentioned, a lot of now serving opportunities for deploying those models that you can accelerate. Semantic caching is another opportunity for saving cost. Like for example, if you have cute rent, you are storing the conversations. You could semantically check if the user has asked similar questions and if that question is very similar to the history, you could just return that response instead of calling the large language model that can create costs. And of course you have line chain that you can summarize conversations, all those sort of things. 
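+
+As a rough illustration of the semantic caching Hooman describes above, the sketch below looks an incoming question up among previously answered ones stored in Qdrant and reuses the stored answer when the match is close enough, falling back to the LLM otherwise. The embedding model, collection name and 0.9 similarity threshold are illustrative assumptions, not something prescribed in the talk.
+
+```python
+# Minimal semantic-cache sketch: check Qdrant for a near-duplicate question before
+# calling the LLM, and store new question/answer pairs for future lookups.
+from fastembed import TextEmbedding
+from qdrant_client import QdrantClient, models
+
+embedder = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")  # 384-dim vectors
+client = QdrantClient(":memory:")  # swap for your running Qdrant instance
+client.create_collection(
+    collection_name="qa_cache",
+    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
+)
+
+def embed(text: str) -> list[float]:
+    return list(next(iter(embedder.embed([text]))))
+
+def cached_answer(question: str, threshold: float = 0.9) -> str | None:
+    # Return a stored answer only if an earlier question is semantically close enough.
+    hits = client.search(
+        collection_name="qa_cache",
+        query_vector=embed(question),
+        limit=1,
+        score_threshold=threshold,
+    )
+    return hits[0].payload["answer"] if hits else None
+
+def store_answer(question: str, answer: str) -> None:
+    client.upsert(
+        collection_name="qa_cache",
+        points=[models.PointStruct(
+            id=abs(hash(question)),
+            vector=embed(question),
+            payload={"question": question, "answer": answer},
+        )],
+    )
+```
+
+On a cache miss the application calls the model as usual and stores the new pair, so repeat questions stop generating paid completions.
+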
-``` +Hooman Sedghamiz: +And we have techniques like prompt compression. So as I mentioned, this really load balancing can offer a lot of opportunities for scaling this large language model. As you know, a lot of offerings from OpenAI APIs or Microsoft Azure, they have rate limits, right? So you can't call those models extensively. So what you could do, you could have them in multiple regions, you can have multiple APIs, local TGI deployed models using hugging face TGI or having Azure endpoints and OpenAI endpoints. And then you could use light LLM to load balance between these models. Once the users get in. Right. User one, you send the user one to one deployment, you send the user two requests to the other deployment. -```typescript -{ - key: "location", - geo_polygon: { - exterior: { - points: [\ - {\ - lon: -70.0,\ - lat: -70.0\ - },\ - {\ - lon: 60.0,\ - lat: -70.0\ - },\ - {\ - lon: 60.0,\ - lat: 60.0\ - },\ - {\ - lon: -70.0,\ - lat: 60.0\ - },\ - {\ - lon: -70.0,\ - lat: -70.0\ - }\ - ] - }, - interiors: [\ - {\ - points: [\ - {\ - lon: -65.0,\ - lat: -65.0\ - },\ - {\ - lon: 0,\ - lat: -65.0\ - },\ - {\ - lon: 0,\ - lat: 0\ - },\ - {\ - lon: -65.0,\ - lat: 0\ - },\ - {\ - lon: -65.0,\ - lat: -65.0\ - }\ - ]\ - }\ - ] - } -} +Hooman Sedghamiz: +So this way you can really scale your application to large amount of users. And of course, we have these opportunities for applications called Lorex that use Lora. Probably a lot of you have heard of like very efficient way of fine tuning these models with fewer number of parameters that we could leverage to have really individualized models for a lot of applications. And you can see the costs are just not comparable if you wanted to use, right. So at GPT 3.5, even in terms of performance and all those sort of things, because you can use really small hardware GPU to deploy thousands of Lora weights or adapters, and then you will be able to serve a diverse set of models to your users. I think one really important part of these kind of applications is the part that you add contextual data, you add augmentation to make them smarter and to make them more up to date. So, for example, in healthcare domain, a lot of Americans already don't have high trust in AI when it comes to decision making in healthcare. So that's why augmentation of data or large language models is really, really important for bringing trust and all those sort of state of the art knowledge to this large language model. -``` +Hooman Sedghamiz: +For example, if you ask about cancer or rededicated questions that need to build on top of scientific knowledge, it's very important to use those. Augmented or retrieval augmented generation. No, sorry, go next. Jumped on one. But let me see. I think I'm missing a slide, but yeah, I have it here. So going through this kind of, let's say retrieval augmented generation, different parts of it. You have, of course, these vector stores that in 2024, I see explosion of vector stores. -```rust -use qdrant_client::qdrant::{Condition, GeoLineString, GeoPoint, GeoPolygon}; +Hooman Sedghamiz: +Right. So there is this really great vector DB comparison that came out recently. I saw there are like maybe more than 40 vector stores in 2024. When we started back in 2023 was only a few. And what I see, which is really lacking in this pipeline of retrieval augmented generation is major innovation around data pipeline. And I think we were talking before this talk together that ETL is not something that is taken seriously. So far. 
We have a lot of embedding models that are coming out probably on a weekly basis. -Condition::geo_polygon( - "location", - GeoPolygon { - exterior: Some(GeoLineString { - points: vec![\ - GeoPoint {\ - lon: -70.0,\ - lat: -70.0,\ - },\ - GeoPoint {\ - lon: 60.0,\ - lat: -70.0,\ - },\ - GeoPoint {\ - lon: 60.0,\ - lat: 60.0,\ - },\ - GeoPoint {\ - lon: -70.0,\ - lat: 60.0,\ - },\ - GeoPoint {\ - lon: -70.0,\ - lat: -70.0,\ - },\ - ], - }), - interiors: vec![GeoLineString {\ - points: vec![\ - GeoPoint {\ - lon: -65.0,\ - lat: -65.0,\ - },\ - GeoPoint {\ - lon: 0.0,\ - lat: -65.0,\ - },\ - GeoPoint { lon: 0.0, lat: 0.0 },\ - GeoPoint {\ - lon: -65.0,\ - lat: 0.0,\ - },\ - GeoPoint {\ - lon: -65.0,\ - lat: -65.0,\ - },\ - ],\ - }], - }, -) +Hooman Sedghamiz: +We have great embedding models that are open source, BgEM. Three is one that is multilingual, 100 plus languages. You could embed text in those languages. We have a lot of vector stores, but we don't have really ETL tools, right? So we have maybe a few airbytes, right? How can you reindex data efficiently? How can you parse scientific articles? Like imagine I have an image here, we have these articles or archive or on a pubmed, all those sort of things that have images and complex structure that our parsers are not able to parse them efficiently and make sense of them so that you can embed them really well. And really doing this Internet level, scientific level retrieval is really difficult. And no one I think is still doing it at scale. I just jumped, I have a love slide, maybe I can jump to my last and then we can pause there and take in some questions. Where I see 2014 and beyond, beyond going for large language models for enterprises, I see assistance, right? I see assistance for personalized assistance, for use cases coming out, right? So these have probably four components. -``` +Hooman Sedghamiz: +You have even a personalized large language model that can learn from the history of your conversation, not just augmented. Maybe you can fine tune that using Laura and all those techniques. You have the knowledge that probably needs to be customized for your assistant and integrated using vector stores and all those sort of things, technologies that we have out, you know, plugins that bring a lot of plugins, some people call them skills, and also they can cover a lot of APIs that can bring superpowers to the large language model and multi agent setups. Right? We have autogen, a lot of cool stuff that is going on. The agent technology is getting really mature now as we go forward. We have langraph from Langchain that is bringing a lot of more stabilized kind of agent technology. And then you can think of that as for companies building all these kind of like App Stores or assistant stores that use cases, store there. And the colleagues can go there, search. -```java -import static io.qdrant.client.ConditionFactory.geoPolygon; +Hooman Sedghamiz: +I'm looking for this application. That application is customized for them, or even they can have their own assistant which is customized to them, their own large language model, and they could use that to bring value. And then even a nontechnical person could create their own assistant. They could attach the documents they like, they could select the plugins they like, they'd like to be connected to, for example, archive, or they need to be connected to API and how many agents you like. You want to build a marketing campaign, maybe you need an agent that does market research, one manager. 
And then you build your application which is customized to you. And then based on your feedback, the large language model can learn from your feedback as well. Going forward, maybe I pause here and then we can it was a bit longer than I expected, but yeah, it's all good, man. -import io.qdrant.client.grpc.Points.GeoLineString; -import io.qdrant.client.grpc.Points.GeoPoint; +Demetrios: +Yeah, this is cool. Very cool. I appreciate you going through this, and I also appreciate you coming from the past, from 2014 and talking about what we're going to do in 2024. That's great. So one thing that I want to dive into right away is the idea of ETL and why you feel like that is a bit of a blocker and where you think we can improve there. -geoPolygon( - "location", - GeoLineString.newBuilder() - .addAllPoints( - List.of( - GeoPoint.newBuilder().setLon(-70.0).setLat(-70.0).build(), - GeoPoint.newBuilder().setLon(60.0).setLat(-70.0).build(), - GeoPoint.newBuilder().setLon(60.0).setLat(60.0).build(), - GeoPoint.newBuilder().setLon(-70.0).setLat(60.0).build(), - GeoPoint.newBuilder().setLon(-70.0).setLat(-70.0).build())) - .build(), - List.of( - GeoLineString.newBuilder() - .addAllPoints( - List.of( - GeoPoint.newBuilder().setLon(-65.0).setLat(-65.0).build(), - GeoPoint.newBuilder().setLon(0.0).setLat(-65.0).build(), - GeoPoint.newBuilder().setLon(0.0).setLat(0.0).build(), - GeoPoint.newBuilder().setLon(-65.0).setLat(0.0).build(), - GeoPoint.newBuilder().setLon(-65.0).setLat(-65.0).build())) - .build())); +Hooman Sedghamiz: +Yeah. So I think there has been concentration around vector stores. Right. So a lot of startups that have appeared around vector store idea, but I think what really is lacking tools that you have a lot of sources of knowledge, information. You have your Gmail, if you use outlook, if you use scientific knowledge, like sources like archive. We really don't have any startup that I hear that. Okay. I have a platform that offers real time retrieval from archive papers. -``` +Hooman Sedghamiz: +And you want to ask a question, for example, about transformers. It can do retrieval, augmented generation over all archive papers in real time as they get added for you and brings back the answer to you. We don't have that. We don't have these syncing tools. You can of course, with tricks you can maybe build some smart solutions, but I haven't seen many kind of initiatives around that. And at the same time, we have this paywall knowledge. So we have these nature medicine amazing papers which are paywall. We can access them. -```csharp -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Hooman Sedghamiz: +Right. So we can build rag around them yet, but maybe some startups can start coming up with strategies, work with this kind of publishing companies to build these sort of things. -GeoPolygon( - field: "location", - exterior: new GeoLineString - { - Points = - { - new GeoPoint { Lat = -70.0, Lon = -70.0 }, - new GeoPoint { Lat = 60.0, Lon = -70.0 }, - new GeoPoint { Lat = 60.0, Lon = 60.0 }, - new GeoPoint { Lat = -70.0, Lon = 60.0 }, - new GeoPoint { Lat = -70.0, Lon = -70.0 } - } - }, - interiors: [\ - new()\ - {\ - Points =\ - {\ - new GeoPoint { Lat = -65.0, Lon = -65.0 },\ - new GeoPoint { Lat = 0.0, Lon = -65.0 },\ - new GeoPoint { Lat = 0.0, Lon = 0.0 },\ - new GeoPoint { Lat = -65.0, Lon = 0.0 },\ - new GeoPoint { Lat = -65.0, Lon = -65.0 }\ - }\ - }\ - ] -); +Demetrios: +Yeah, it's almost like you're seeing it not as the responsibility of nature or. 
-``` +Hooman Sedghamiz: +Maybe they can do it. -```go -import "github.com/qdrant/go-client/qdrant" +Demetrios: +Yeah, they can potentially, but maybe that's not their bread and butter and so they don't want to. And so how do startups get in there and take some of this paywalled information and incorporate it into their product? And there is another piece that you mentioned on, just like when it comes to using agents, I wonder, have you played around with them a lot? Have you seen their reliability get better? Because I'm pretty sure a lot of us out there have tried to mess around with agents and maybe just like blown a bunch of money on GPT, four API calls. And it's like this thing isn't that stable. What's going on? So do you know something that we don't? -qdrant.NewGeoPolygon("location", - &qdrant.GeoLineString{ - Points: []*qdrant.GeoPoint{ - {Lat: -70, Lon: -70}, - {Lat: 60, Lon: -70}, - {Lat: 60, Lon: 60}, - {Lat: -70, Lon: 60}, - {Lat: -70, Lon: -70}, - }, - }, &qdrant.GeoLineString{ - Points: []*qdrant.GeoPoint{ - {Lat: -65, Lon: -65}, - {Lat: 0, Lon: -65}, - {Lat: 0, Lon: 0}, - {Lat: -65, Lon: 0}, - {Lat: -65, Lon: -65}, - }, - }) +Hooman Sedghamiz: +I think they have become much, much more stable. If you look back in 2023, like June, July, they were really new, like auto GPT. We had all these new projects came out, really didn't work out as you say, they were not stable. But I would say by the end of 2023, we had really stable frameworks, for example, customized solutions around agent function calling. I think when function calling came out, the capability that you could provide signature or dot string of, I don't know, a function and you could get back the response really reliably. I think that changed a lot. And Langchen has this OpenAI function calling agent that works with some measures. I mean, of course I wouldn't say you could automate 100% something, but for a knowledge, kind of. -``` +Hooman Sedghamiz: +So for example, if you have an agent that has access to data sources, all those sort of things, and you ask it to go out there, see what are the latest clinical trial design trends, it can call these tools, it can reliably now get you answer out of ten times, I would say eight times, it works. Now it has become really stable. And what I'm excited about is the latest multi agent scenarios and we are testing them. They are very promising. Right? So you have autogen from Microsoft platform, which is open source, and also you have landgraph from Langchain, which I think the frameworks are becoming really stable. My prediction is between the next few months is lots of, lots of applications will rely on agents. -A match is considered any point location inside or on the boundaries of the given polygon’s exterior but not inside any interiors. +Demetrios: +So you also mentioned how to recognize if a project is winning or losing type thing. And considering there are so many areas that you can plug in AI, especially when you're looking at buyer and all the different places that you can say, oh yeah, we could add some AI to this. How are you setting up metrics so, you know, what is worth it to continue investing into versus what maybe sounded like a better idea, but in practice it wasn't actually that good of an idea. -If several location values are stored for a point, then any of them matching will include that point as a candidate in the resultset. 
-These conditions can only be applied to payloads that match the [geo-data format](https://qdrant.tech/documentation/concepts/payload/#geo). +Hooman Sedghamiz: +Yeah, depends on the platform that you're building. Right? So where we started back in 2023, the platform was aiming for efficiency, right? So how can you make our colleagues more efficient? They can be faster in their daily work, like really delegate this boring stuff, like if you want to summarize or you want to create a presentation, all those sort of things, and you have measures in place that, for example, you could ask, okay, now you're using this platform for months. Let us know how many hours you're saving during your daily work. And really we could see the shift, right? So we did a questionnaire and I think we could see a lot of shift in terms of saving hours, daily work, all those sort of things that is measurable. And it's like you could then convert it, of course, to the value that brings for the enterprise on the company. And I think the biggest, I think the untapped potential, it goes back to when you can do scientific discovery and all those sort of applications which are more challenging, not just around the efficiency and all those sort of things. And then you need to really, if you're building a product, if it's not the general product. And for example, let's say if you're building a natural language to SQL, let's say you have a database. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#values-count) Values count +Hooman Sedghamiz: +It was a relational database. You want to build an application that searches cars in the background. The customers go there and ask, I'm looking for a BMW 2013. It uses qudrant in the back, right. It kind of does semantic search, all these cool things and returns the response. I think then you need to have really good measures to see how satisfied your customers are when you're integrating a kind of generative application on top of your website that's selling cars. So measuring this in a kind of, like, cyclic manner, people are not going to be happy because you start that there are a lot of things that you didn't count for. You measure all those kind of metrics and then you go forward, you improve your platform. -In addition to the direct value comparison, it is also possible to filter by the amount of values. +Demetrios: +Well, there's also something else that you mentioned, and it brought up this thought in my mind, which is undoubtedly you have these low hanging fruit problems, and it's mainly based on efficiency gains. Right. And so it's helping people extract data from pdfs or what be it, and you're saving time there. You're seeing that you're saving time, and it's a fairly easy setup. Right. But then you have moonshots, I would imagine, like creating a whole new type of aspirin or tylenol or whatever it is, and that is a lot more of an investment of time and energy and infrastructure and everything along those lines. How do you look at both of these and say, we want to make sure that we make headway in both directions. And I'm not sure if you have unlimited resources to be able to just do everything or if you have to recognize what the trade offs are and how you measure those types of metrics. -For example, given the data: +Demetrios: +Again, in seeing where do we invest and where do we cut ties with different initiatives. 
-```json -[\ - { "id": 1, "name": "product A", "comments": ["Very good!", "Excellent"] },\ - { "id": 2, "name": "product B", "comments": ["meh", "expected more", "ok"] }\ -] +Hooman Sedghamiz: +Yeah. So that's a great question. So for product development, like the example that you made, there are really a lot of stages involved. Right. So you start from scientific discovery stage. So I can imagine that you can have multiple products along the way to help out. So if you have a product already out there that you want to generate insights and see. Let's say you have aspirin out there. -``` +Hooman Sedghamiz: +You want to see if it is also helpful for cardiovascular problems that patients might have. So you could build a sort of knowledge discovery tool that could search for you, give it a name of your product, it will go out there, look into pubmed, all these articles that are being published, brings you back the results. Then you need to have really clear metrics to see if this knowledge discovery platform, after a few months is able to bring value to the customers or the stakeholders that you build the platform for. We have these experts that are really experts in their own field. Takes them really time to go read these articles to make conclusions or answer questions about really complex topic. I think it's really difficult based on the initial feedback we see, it helps, it helps save them time. But really I think it goes back again to the ETL problem that we still don't have your paywall. We can't access a lot of scientific knowledge yet. -We can perform the search only among the items with more than two comments: +Hooman Sedghamiz: +And these guys get a little bit discouraged at the beginning because they expect that a lot of people, especially non technical, say like you go to Chat GPT, you ask and it brings you the answer, right? But it's not like that. It doesn't work like that. But we can measure it, we can see improvements, they can access knowledge faster, but it's not comprehensive. That's the problem. It's not really deep knowledge. And I think the companies are still really encouraging developing these platforms and they can see that that's a developing field. Right. So it's very hard to give you a short answer, very hard to come up with metrics that gives you success of failure in a short term time period. -jsonpythontypescriptrustjavacsharpgo +Demetrios: +Yeah, I like the creativity that you're talking about there though. That is like along this multistepped, very complex product creation. There are potential side projects that you can do that show and prove value along the way, and they don't necessarily need to be as complex as that bigger project. -```json -{ - "key": "comments", - "values_count": { - "gt": 2 - } -} +Hooman Sedghamiz: +True. -``` +Demetrios: +Sweet, man. Well, this has been awesome. I really appreciate you coming on here to the vector space talks for anyone that would like to join us and you have something cool to present. We're always open to suggestions. Just hit me up and we will make sure to send you some shirt or whatever kind of swag is on hand. Remember, all you astronauts out there, don't get lost in vector space. This has been another edition of the Qdrant vector space talks with Hooman, my man, on Valentine's Day. I can't believe you decided to spend it with me. -```python -models.FieldCondition( - key="comments", - values_count=models.ValuesCount(gt=2), -) +Demetrios: +I appreciate it. -``` +Hooman Sedghamiz: +Thank you. Take care. 
-```typescript -{ - key: 'comments', - values_count: {gt: 2} -} +<|page-370-lllmstxt|> +> "*You don't want to use an expensive model like GPT 4 for evaluation, because then the cost adds up and it does not work out. If you are spending more on evaluating the responses, you might as well just do something else, like have a human to generate the responses.*”\ +-- Sourabh Agrawal +> -``` +Sourabh Agrawal, CEO & Co-Founder at UpTrain AI is a seasoned entrepreneur and AI/ML expert with a diverse background. He began his career at Goldman Sachs, where he developed machine learning models for financial markets. Later, he contributed to the autonomous driving team at Bosch/Mercedes, focusing on computer vision modules for scene understanding. In 2020, Sourabh ventured into entrepreneurship, founding an AI-powered fitness startup that gained over 150,000 users. Throughout his career, he encountered challenges in evaluating AI models, particularly Generative AI models. To address this issue, Sourabh is developing UpTrain, an open-source LLMOps tool designed to evaluate, test, and monitor LLM applications. UpTrain provides scores and offers insights to enhance LLM applications by performing root-cause analysis, identifying common patterns among failures, and providing automated suggestions for resolution. -```rust -use qdrant_client::qdrant::{Condition, ValuesCount}; +***Listen to the episode on [Spotify](https://open.spotify.com/episode/1o7xdbdx32TiKe7OSjpZts?si=yCHU-FxcQCaJLpbotLk7AQ), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/vBJF2sy1Pyw).*** -Condition::values_count( - "comments", - ValuesCount { - gt: Some(2), - ..Default::default() - }, -) + -``` + -```java -import static io.qdrant.client.ConditionFactory.valuesCount; +## **Top takeaways:** -import io.qdrant.client.grpc.Points.ValuesCount; +Why is real-time evaluation critical in maintaining the integrity of chatbot interactions and preventing issues like promoting competitors or making false promises? What strategies do developers employ to minimize cost while maximizing the effectiveness of model evaluations, specifically when dealing with LLMs? These might be just some of the many questions people in the industry are asking themselves. Fear, not! Sourabh will break it down for you. -valuesCount("comments", ValuesCount.newBuilder().setGt(2).build()); +Check out the full conversation as they dive into the intricate world of AI chatbot evaluations. Discover the nuances of ensuring your chatbot's quality and continuous improvement across various metrics. -``` +Here are the key topics of this episode: -```csharp -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +1. **Evaluating Chatbot Effectiveness**: An exploration of systematic approaches to assess chatbot quality across various stages, encompassing retrieval accuracy, response generation, and user satisfaction. +2. **Importance of Real-Time Assessment**: Insights into why continuous and real-time evaluation of chatbots is essential to maintain integrity and ensure they function as designed without promoting undesirable actions. +3. **Indicators of Compromised Systems**: Understand the significance of identifying behaviors that suggest a system may be prone to 'jailbreaking' and the methods available to counter these through API integration. +4. 
**Cost-Effective Evaluation Models**: Discussion on employing smaller models for evaluation to reduce costs without compromising the depth of analysis, focusing on failure cases and root-cause assessments. +5. **Tailored Evaluation Metrics**: Emphasis on the necessity of customizing evaluation criteria to suit specific use case requirements, including an exploration of the different metrics applicable to diverse scenarios. -ValuesCount("comments", new ValuesCount { Gt = 2 }); +> Fun Fact: Sourabh discussed the use of Uptrend, an innovative API that provides scores and explanations for various data checks, facilitating logical and informed decision-making when evaluating AI models. +> -``` +## Show notes: -```go -import "github.com/qdrant/go-client/qdrant" +00:00 Prototype evaluation subjective; scalability challenges emerge.\ +05:52 Use cheaper, smaller models for effective evaluation.\ +07:45 Use LLM objectively, avoid subjective biases.\ +10:31 Evaluate conversation quality and customization for AI.\ +15:43 Context matters for AI model performance.\ +19:35 Chat bot creates problems for car company.\ +20:45 Real-time user query evaluations, guardrails, and jailbreak.\ +27:27 Check relevance, monitor data, filter model failures.\ +28:09 Identify common themes, insights, experiment with settings.\ +32:27 Customize jailbreak check for specific app purposes.\ +37:42 Mitigate hallucination using evaluation data techniques.\ +38:59 Discussion on productizing hallucination mitigation techniques.\ +42:22 Experimentation is key for system improvement. -qdrant.NewValuesCount("comments", &qdrant.ValuesCount{ - Gt: qdrant.PtrOf(uint64(2)), -}) +## More Quotes from Sourabh: -``` +*"There are some cases, let's say related to safety, right? Like you want to check whether the user is trying to jailbreak your LLMs or not. So in that case, what you can do is you can do this evaluation in parallel to the generation because based on just the user query, you can check whether the intent is to jailbreak or it's an intent to actually use your product to kind of utilize it for the particular model purpose.*”\ +-- Sourabh Agrawal -The result would be: +*"You have to break down the response into individual facts and just see whether each fact is relevant for the question or not. And then take some sort of a ratio to get the final score. So that way all the biases which comes up into the picture, like egocentric bias, where LLM prefers its own outputs, those biases can be mitigated to a large extent.”*\ +-- Sourabh Agrawal -```json -[{ "id": 2, "name": "product B", "comments": ["meh", "expected more", "ok"] }] +*"Generally speaking, what we have been seeing is that the better context you retrieve, the better your model becomes.”*\ +-- Sourabh Agrawal -``` +## Transcript: +Demetrios: +Sourabh, I've got you here from Uptrain. I think you have some notes that you wanted to present, but I also want to ask you a few questions because we are going to be diving into a topic that is near and dear to my heart and I think it's been coming up so much recently that is using LLMs as a judge. It is really hot these days. Some have even gone as far to say that it is the topic of 2024. I would love for you to dive in. Let's just get right to it, man. What are some of the key topics when you're talking about using LLMs to evaluate what key metrics are you using? How does this work? Can you break it down? -If stored value is not an array - it is assumed that the amount of values is equals to 1. +Sourabh Agrawal: +Yeah. 
First of all, thanks a lot for inviting me and no worries for hiccup. I guess I have never seen a demo or a talk which goes without any technical hiccups. It is bound to happen. Really excited to be here. Really excited to talk about LLM evaluations. And as you rightly pointed right, it's really a hot topic and rightly so. Right. -### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#is-empty) Is Empty +Sourabh Agrawal: +The way things have been panning out with LLMs and chat, GPT and GPT four and so on, is that people started building all these prototypes, right? And the way to evaluate them was just like eyeball them, just trust your gut feeling, go with the vibe. I guess they truly adopted the startup methodology, push things out to production and break things. But what people have been realizing is that it's not scalable, right? I mean, rightly so. It's highly subjective. It's a developer, it's a human who is looking at all the responses, someday he might like this, someday he might like something else. And it's not possible for them to kind of go over, just read through more than ten responses. And now the unique thing about production use cases is that they need continuous refinement. You need to keep on improving them, you need to keep on improving your prompt or your retrieval, your embedding model, your retrieval mechanisms and so on. -Sometimes it is also useful to filter out records that are missing some value. -The `IsEmpty` condition may help you with that: +Sourabh Agrawal: +So that presents a case like you have to use a more scalable technique, you have to use LLMs as a judge because that's scalable. You can have an API call, and if that API call gives good quality results, it's a way you can mimic whatever your human is doing or in a way augment them which can truly act as their copilot. -jsonpythontypescriptrustjavacsharpgo +Demetrios: +Yeah. So one question that's been coming through my head when I think about using LLMs as a judge and I get more into it, has been around when do we use those API calls. It's not in the moment that we're looking for this output. Is it like just to see if this output is real? And then before we show it to the user, it's kind of in bunches after we've gotten a bit of feedback from the user. So that means that certain use cases are automatically discarded from this, right? Like if we are thinking, all right, we're going to use LLMs as a judge to make sure that we're mitigating hallucinations or that we are evaluating better, it is not necessarily something that we can do in the moment, if I'm understanding it correctly. So can you break that down a little bit more? How does it actually look in practice? -```json -{ - "is_empty": { - "key": "reports" - } -} +Sourabh Agrawal: +Yeah, definitely. And that's a great point. The way I see it, there are three cases. Case one is what you mentioned in the moment before showing the response to the user. You want to check whether the response is good or not. In most of the scenarios you can't do that because obviously checking requires extra time and you don't want to add latency. But there are some cases, let's say related to safety, right? Like you want to check whether the user is trying to jailbreak your LLMs or not. 
So in that case, what you can do is you can do this evaluation in parallel to the generation because based on just the user query, you can check whether the intent is to jailbreak or it's an intent to actually use your product to kind of utilize it for the particular model purpose. -``` +Sourabh Agrawal: +But most of the other evaluations like relevance, hallucinations, quality and so on, it has to be done. Post whatever you show to the users and then there you can do it in two ways. You can either experiment with use them to experiment with things, or you can run monitoring on your production and find out failure cases. And typically we are seeing like developers are adopting a combination of these two to find cases and then experiment and then improve their systems. -```python -models.IsEmptyCondition( - is_empty=models.PayloadField(key="reports"), -) +Demetrios: +Okay, so when you're doing it in parallel, that feels like something that is just asking you craft a prompt and as soon as. So you're basically sending out two prompts. Another piece that I have been thinking about is, doesn't this just add a bunch more cost to your system? Because there you're effectively doubling your cost. But then later on I can imagine you can craft a few different ways of making the evaluations and sending out the responses to the LLM better, I guess. And you can figure out how to trim some tokens off, or you can try and concatenate some of the responses and do tricks there. I'm sure there's all kinds of tricks that you know about that I don't, and I'd love to tell you to tell me about them, but definitely what kind of cost are we looking at? How much of an increase can we expect? -``` +Sourabh Agrawal: +Yeah, so I think that's like a very valid limitation of evaluation. So that's why, let's say at uptrend, what we truly believe in is that you don't want to use an expensive model like GPT four for evaluation, because then the cost adds up and it does not work out. Right. If you are spending more on evaluating the responses, you may as well just do something else, like have a human to generate the responses. We rely on smaller models, on cheaper models for this. And secondly, the methodology which we adopt is that you don't want to evaluate everything on all the data points. Like maybe you have a higher level check, let's say, for jailbreak or let's say for the final response quality. And when you find cases where the quality is low, you run a battery of checks on these failures to figure out which part of the pipeline is exactly failing. -```typescript -{ - is_empty: { - key: "reports" - } -} +Sourabh Agrawal: +This is something what we call as like root cause analysis, where you take all these failure cases, which may be like 10% or 20% of the cases out of all what you are seeing in production. Take these 20% cases, run like a battery of checks on them. They might be exhaustive. You might run like five to ten checks on them. And then based on those checks, you can figure out that, what is the error mode? Is it a retrieval problem? Is it a citation problem? Is it a utilization problem? Is it hallucination? Is the query like the question asked by the user? Is it not clear enough? Is it like your embedding model is not appropriate? So that's how you can kind of take best of the two. Like, you can also improve the performance at the same time, make sure that you don't burn a hole in your pocket. 
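+
+A rough sketch of the two-stage pattern Sourabh describes: run one cheap top-level check on every response, and only spend the fuller battery of checks on the responses that fail it. The judge model, prompts, check names and 0.5 threshold are illustrative assumptions, not UpTrain's actual API.
+
+```python
+# Two-stage evaluation sketch: a cheap top-level score for everything, a deeper
+# battery of checks only for the ~10-20% of responses that look like failures.
+import json
+from openai import OpenAI
+
+client = OpenAI()
+JUDGE_MODEL = "gpt-4o-mini"  # assumption: any small, inexpensive judge model
+
+def llm_score(instruction: str, **fields: str) -> float:
+    body = "\n".join(f"{name}: {value}" for name, value in fields.items())
+    resp = client.chat.completions.create(
+        model=JUDGE_MODEL,
+        messages=[{"role": "user", "content":
+                   f'{instruction}\n\n{body}\n\nReply with JSON: {{"score": <0.0-1.0>}}'}],
+        response_format={"type": "json_object"},
+    )
+    return float(json.loads(resp.choices[0].message.content)["score"])
+
+def evaluate(samples: list[dict]) -> list[dict]:
+    reports = []
+    for s in samples:
+        report = {"id": s["id"], "response_quality": llm_score(
+            "Rate how well the answer addresses the question.",
+            question=s["question"], answer=s["answer"])}
+        if report["response_quality"] < 0.5:  # failures get the expensive checks
+            report["context_relevance"] = llm_score(
+                "Rate how relevant the retrieved context is to the question.",
+                question=s["question"], context=s["context"])
+            report["groundedness"] = llm_score(
+                "Rate how well the answer is supported by the context.",
+                context=s["context"], answer=s["answer"])
+        reports.append(report)
+    return reports
+```
+
+Aggregating the reports from the failing slice is then enough to see whether retrieval, grounding or the prompt itself is the usual culprit.
+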
-``` +Demetrios: +I've also heard this before, and it's almost like you're using the LLMs as tests and they're helping you write. It's not that they're helping you write tests, it's that they are there and they're part of the tests that you're writing. -```rust -use qdrant_client::qdrant::Condition; +Sourabh Agrawal: +Yeah, I think the key here is that you have to use them objectively. What I have seen is a lot of people who are trying to do LLM evaluations, what they do is they ask the LLM that, okay, this is my response. Can you tell is it relevant or not? Or even, let's say, they go a step beyond and do like a grading thing, that is it highly relevant, somewhat relevant, highly irrelevant. But then it becomes very subjective, right? It depends upon the LLM to decide whether it's relevant or not. Rather than that you have to transform into an objective setting. You have to break down the response into individual facts and just see whether each fact is relevant for the question or not. And then take some sort of a ratio to get the final score. So that way all the biases which comes up into the picture, like egocentric bias, where LLM prefers its own outputs, those biases can be mitigated to a large extent. -Condition::is_empty("reports") +Sourabh Agrawal: +And I believe that's the key for making LLM evaluations work, because similar to LLM applications, even LLM evaluations, you have to put in a lot of efforts to make them really work and finally get some scores which align well with human expectations. -``` +Demetrios: +It's funny how these LLMs mimic humans so much. They love the sound of their own voice, even. It's hilarious. Yeah, dude. Well, talk to me a bit more about how this looks in practice, because there's a lot of different techniques that you can do. Also, I do realize that when it comes to the use cases, it's very different, right. So if it's code generation use case, and you're evaluating that, it's going to be pretty clear, did the code run or did it not? And then you can go into some details on is this code actually more valuable? Is it a hacked way to do it? Et cetera, et cetera. But there's use cases that I would consider more sensitive and less sensitive. -```java -import static io.qdrant.client.ConditionFactory.isEmpty; +Demetrios: +And so how do you look at that type of thing? -isEmpty("reports"); +Sourabh Agrawal: +Yeah, I think so. The way even we think about evaluations is there's no one size fit all solution for different use cases. You need to look at different things. And even if you, let's say, looking at hallucinations, different use cases, or different businesses would look at evaluations from different lenses. Right. For someone, whatever, if they are focusing a lot on certain aspects of the correctness, someone else would focus less on those aspects and more on other aspects. The way we think about it is, know, we define different criteria for different use cases. So if you have A-Q-A bot, right? So you look at the quality of the response, the quality of the context. -``` +Sourabh Agrawal: +If you have a conversational agent, then you look at the quality of the conversation as a whole. You look at whether the user is satisfied with that conversation. If you are writing long form content. Like, you look at coherence across the content, you look at the creativity or the sort of the interestingness of the content. If you have an AI agent, you look at how well they are able to plan, how well they were able to execute a particular task, and so on. 
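+
+The fact-level scoring Sourabh outlines a little earlier can be sketched roughly as below: split the response into atomic facts, ask a small judge model a yes/no question per fact, and report the ratio of relevant facts as the score. The prompts and model name are assumptions for illustration.
+
+```python
+# Fact-level relevance sketch: decompose the response, judge each fact objectively,
+# and score by ratio instead of asking the LLM for a single subjective grade.
+import json
+from openai import OpenAI
+
+client = OpenAI()
+JUDGE_MODEL = "gpt-4o-mini"  # assumption: any cheap judge model
+
+def split_into_facts(response: str) -> list[str]:
+    out = client.chat.completions.create(
+        model=JUDGE_MODEL,
+        messages=[{"role": "user", "content":
+                   'List the individual factual statements in the text below as JSON '
+                   '{"facts": ["..."]}.\n\n' + response}],
+        response_format={"type": "json_object"},
+    )
+    return json.loads(out.choices[0].message.content)["facts"]
+
+def fact_is_relevant(question: str, fact: str) -> bool:
+    out = client.chat.completions.create(
+        model=JUDGE_MODEL,
+        messages=[{"role": "user", "content":
+                   f"Question: {question}\nFact: {fact}\n"
+                   "Is this fact relevant to answering the question? Answer yes or no."}],
+    )
+    return out.choices[0].message.content.strip().lower().startswith("yes")
+
+def response_relevance(question: str, response: str) -> float:
+    facts = split_into_facts(response)
+    if not facts:
+        return 0.0
+    return sum(fact_is_relevant(question, f) for f in facts) / len(facts)
+```
+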
How many steps do they take to achieve their objective? So there are a variety of these evaluation matrices, which are each one of which is more suitable for different use cases. And even there, I believe a good tool needs to provide certain customization abilities to their developers so that they can transform it, they can modify it in a way that it makes most sense for their business. -```csharp -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +Demetrios: +Yeah. Is there certain ones that you feel like are more prevalent and that if I'm just thinking about this, I'm developing on the side and I'm thinking about this right now and I'm like, well, how could I start? What would you recommend? -IsEmpty("reports"); +Sourabh Agrawal: +Yeah, definitely. One of the biggest use case for LLMs today is rag. Applications for Rag. I think retrieval is the key. So I think the best starting points in terms of evaluations is like look at the response quality, so look at the relevance of the response, look at the completeness of the response, look at the context quality. So like context relevance, which judges the retrieval quality. Hallucinations, which judges whether the response is grounded by the context or not. If tone matters for your use case, look at the tonality and finally look at the conversation satisfaction, because at the end, whatever outputs you give, you also need to judge whether the end user is satisfied with these outputs. -``` +Sourabh Agrawal: +So I would say these four or five matrices are the best way for any developer to start who is building on top of these LLMs. And from there you can understand how the behavior is going, and then you can go more deeper, look at more nuanced metrics, which can help you understand your systems even better. -```go -import "github.com/qdrant/go-client/qdrant" +Demetrios: +Yeah, I like that. Now, one thing that has also been coming up in my head a lot are like the custom metrics and custom evaluation and also proprietary data set, like evaluation data sets, because as we all know, the benchmarks get gamed. And you see on Twitter, oh wow, this new model just came out. It's so good. And then you try it and you're like, what are you talking about? This thing just was trained on the benchmarks. And so it seems like it's good, but it's not. And can you talk to us about creating these evaluation data sets? What have you seen as far as the best ways of going about it? What kind of size? Like how many do we need to actually make it valuable. And what is that? Give us a breakdown there? -qdrant.NewIsEmpty("reports") +Sourabh Agrawal: +Yeah, definitely. So, I mean, surprisingly, the answer is that you don't need that many to get started. We have seen cases where even if someone builds a test data sets of like 50 to 100 samples, that's actually like a very good starting point than where they were in terms of manual annotation and in terms of creation of this data set, I believe that the best data set is what actually your users are asking. You can look at public benchmarks, you can generate some synthetic data, but none of them matches the quality of what actually your end users are looking, because those are going to give you issues which you can never anticipate. Right. Even you're generating and synthetic data, you have to anticipate what issues can come up and generate data. Beyond that, if you're looking at public data sets, they're highly curated. There is always problems of them leaking into the training data and so on. 
-```
+Sourabh Agrawal:
+So those benchmarks become highly unreliable. So look at your traffic, take 50 samples from them. If you are collecting user feedback, the cases where the user has downvoted or the user has not accepted the response, I mean, they are very good cases to look at. Or if you're running some evaluations, quality checks, the cases which are failing, I think they are the best starting point for you to have a good quality test data set and use that as a way to experiment with your prompts, experiment with your systems, experiment with your retrievals, and iteratively improve them.

-This condition will match all records where the field `reports` either does not exist, or has `null` or `[]` value.

+Demetrios:
+Are you weighing any metrics more than others? Because I've heard stories about how sometimes you'll see that a new model will come out, or you're testing out a new model, and it seems like on certain metrics, it's gone down. But then the golden metric that you have, it actually has gone up. And so have you seen which metrics are better for different use cases?

-### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#is-null) Is Null

+Sourabh Agrawal:
+I think for here, there's no single answer. I think that metric depends upon the business. Generally speaking, what we have been seeing is that the better context you retrieve, the better your model becomes. Especially if you're using any of the bigger models, like any of the GPT or Claude models, or to some extent even Mistral, which are highly performant. So if you're using any of these highly performant models, then if you give them the right context, the response more or less comes out to be good. So one thing which we are seeing is people focusing a lot on experimenting with different retrieval mechanisms, embedding models, and so on. But then again, for the final golden key, I think many people we have seen, they annotate some data set so they have like a ground truth response or a golden response, and they completely rely on just how well their answer matches with that golden response, which I believe is a very good starting point, because now you know that, okay, if this is right and you're matching very highly with that, then obviously your response is also right.

-It is not possible to test for `NULL` values with the **match** condition.
-We have to use `IsNull` condition instead:

+Demetrios:
+And what about those use cases where golden responses are very subjective?

-jsonpythontypescriptrustjavacsharpgo

+Sourabh Agrawal:
+Yeah, I think that's where the issue lies. So I think in those scenarios, what we have seen is that one thing which people have been doing a lot is they try to see whether all information in the golden response is contained in the generated response. You don't miss out on any of the important information in your ground truth response. And on top of that you want it to be concise, so you don't want it to be blabbering too much or giving highly verbose responses. So that is one way we are seeing where people are getting around this subjectivity issue of the responses, by making sure that the key information is there, and then beyond that it's being highly concise and it's being to the point in terms of the task being asked.

-```json
-{
-    "is_null": {
-        "key": "reports"
-    }
-}

+Demetrios:
+And so you kind of touched on this earlier, but can you say it again? Because I don't know if I fully grasped it. Where are all the places in the system that you are evaluating?
Because it's not just the output. Right. And how do you look at evaluation as a system rather than just evaluating the output every once in a while?

-```
+Sourabh Agrawal:
+Yeah, so I mean, what we do is we plug in with every part. So even if you start with retrieval, we have a high-level check where we look at the quality of the retrieved context. And then we also have evaluations for every part of this retrieval pipeline. So if you're doing query rewrite, if you're doing re-ranking, if you're doing sub-questions, we have evaluations for all of them. In fact, we have worked closely with the LlamaIndex team to integrate with all of their modular pipelines. Secondly, once we cross the retrieval step, we have around five to six metrics on this retrieval part. Then we look at the response generation. We have evaluations there for different criteria.

-```python
-models.IsNullCondition(
-    is_null=models.PayloadField(key="reports"),
-)
+Sourabh Agrawal:
+So conciseness, completeness, safety, jailbreaks, prompt injections, as well as custom guidelines you can define. So you can say that, okay, if the user is asking anything related to code, the output should also give an example code snippet. You can define this guideline in plain English, and we check for that. And then finally, zooming out, we also have checks where we look at conversations as a whole: how satisfied the user is, how many turns it takes for the chatbot or the LLM to answer the user. Yeah, that's how we look at the whole evaluations as a whole.

-```
+Demetrios:
+Yeah. It really reminds me, I say this so much because it's one of the biggest fails, I think, on the Internet, and I'm sure you've seen it, where I think it was like Chevy or GM, the car manufacturer, the car company, they basically slapped a chatbot on their website. It was a GPT call, and people started talking to it and realized, oh my God, this thing will do anything that we want it to do. So they started asking it questions like, is Tesla better than GM? And the bot would say, yeah, and give a bunch of reasons why Tesla is better than GM on the website of GM. And then somebody else asked it, oh, can I get a car for a dollar? And it said, no. And then they said, but I'm broke and I need a car for a dollar. And it said, ok, we'll sell you the car for the dollar. And so you're getting yourself into all this trouble just because you're not doing that real-time evaluation.

-```typescript
-{
-  is_null: {
-    key: "reports"
-  }
-}
+Demetrios:
+How do you think about the real-time evaluation? And is that like an extra added layer of complexity?

-```
+Sourabh Agrawal:
+Yeah, for the real-time evaluations, I think there are two scenarios which we feel are most important to deal with. One is you have to put some guardrails, in the sense that you don't want the users to talk about your competitors. You don't want to answer some queries, like, say, you don't want to make false promises, and so on, right? Some of them can be handled with pure rejects, contextual logic, and some of them you have to do evaluations for. And the second is jailbreak. Like, you don't want the user to use, let's say, your Chevy chatbot to solve math problems or solve coding problems, right? Because in a way, you're just subsidizing GPT-4 for them. And all of these can be done just on the question which is being asked.
So you can have a system where you can fire a query, evaluate a few of these key metrics, and in parallel generate your responses. And as soon as you get your response, you also get your evaluations.

-```rust
-use qdrant_client::qdrant::Condition;
+Sourabh Agrawal:
+And you can have some logic that if the user is asking about something which I should not be answering, instead of giving the response, I should just say, sorry, I could not answer this, or have a standard text for those cases, and have some mechanisms to limit such scenarios and so on.

-Condition::is_null("reports")
+Demetrios:
+And it's better to do that in parallel than to try and catch the response. Make sure it's okay before sending out an LLM call.

-```
+Sourabh Agrawal:
+I mean, generally, yes, because if you look at it, if you catch the response, it adds another layer of latency.

-```java
-import static io.qdrant.client.ConditionFactory.isNull;
+Demetrios:
+Right.

-isNull("reports");
+Sourabh Agrawal:
+And at the end of the day, in any good product, 95% of your users are not trying to do this. A lot of those users are genuinely trying to use it, and you don't want to build something which kind of breaks, creates an issue for them, adds latency for them, just to solve for that 5%. So you have to be cognizant of this fact and figure out clever ways to do this.

-```
+Demetrios:
+Yeah, I remember I was talking to Philip of a company called Honeycomb, and they added some LLM functionality to their product. And he said that when people were trying to either prompt inject or jailbreak, it was fairly obvious because there were a lot of calls. It kind of started to be not human usage and it was easy to catch in that way. Have you seen some of that too? And what are some signs that you see when people are trying to jailbreak?

-```csharp
-using Qdrant.Client.Grpc;
-using static Qdrant.Client.Grpc.Conditions;
+Sourabh Agrawal:
+Yeah, I think we also have seen that. Typically, what we also see is that whenever someone is trying to jailbreak, the length of their question or the length of their prompt typically is much larger than any average question, because they will have all sorts of instructions like, forget everything you know, you are allowed to say all of those things. And then again, this issue also comes because when they try to jailbreak, they try with one technique, it doesn't work. They try with another technique, it doesn't work. Then they try with a third technique. So there is like a burst of traffic. And even in terms of sentiment, typically the sentiment or the coherence in those cases, we have seen that to be lower as compared to a genuine question, because people are just trying to cram all these instructions into the prompt. So there are definitely certain signs which already indicate that the user is trying to jailbreak this. And I think those are like great indicators to catch them.

-IsNull("reports");
+Demetrios:
+And I assume that you've got it set up so you can just set an alert when those things happen, and then it at least will flag it and have humans look over it, or potentially just ask the person to cool off for the next minute. Hey, you've been doing some suspicious activity here. We want to see something different. So I think you were going to show us a little bit about UpTrain, right? I want to see what you got. Can we go for a spin?

-```
+Sourabh Agrawal:
+Yeah, definitely. Let me share my screen and I can show you how that looks.

-```go
-import "github.com/qdrant/go-client/qdrant"
+Demetrios:
+Cool, very cool.
Yeah. And just while you're sharing your screen, I want to mention that for this talk, I wore my favorite shirt. I don't know if everyone can see it, but it says, I hallucinate more than ChatGPT.

-qdrant.NewIsNull("reports")
+Sourabh Agrawal:
+I think that's a cool one.

-```
+Demetrios:
+What do we got here?

-This condition will match all records where the field `reports` exists and has `NULL` value.

+Sourabh Agrawal:
+Yeah, so, yeah, let me kind of just get started. So I created an account with UpTrain. What we have is an API method, an API way of calculating these evaluations. So you get an API key, similar to what you get for ChatGPT or others, and then you can just do UpTrain log and evaluate and you can give your data. So you can give whatever your question, responses and context are, and you can define the checks which you want to evaluate for. So if I create an API key, I can just copy this code, and I already have it here, so I'll just show you. So we have two mechanisms.

-### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#has-id) Has id

+Sourabh Agrawal:
+One is that you can just run evaluations, so you can define, okay, I want to run context relevance, I want to run response completeness. Similarly, I want to run jailbreak, I want to run safety, I want to run satisfaction of the users, and so on. And then when you run it, it gives you back a score and it gives you back an explanation of why this particular score has been given for this particular question.

-This type of query is not related to payload, but can be very useful in some situations.
-For example, the user could mark some specific search results as irrelevant, or we want to search only among the specified points.

+Demetrios:
+Can you make that a little bit bigger? Yeah, just give us some plus. Yeah, there we go.

-httppythontypescriptrustjavacsharpgo

+Sourabh Agrawal:
+It's essentially an API call which takes the data, takes the list of checks which you want to run, and then it gives back a score and an explanation for that. So based on that score, you can have logic, right? If the jailbreak score is more than 0.5, then you don't want to show it; you want to switch back to a default response and so on. And then you can also configure it so that we log all of these scores, and we have a dashboard where you can access them.

-```http
-POST /collections/{collection_name}/points/scroll
-{
-    "filter": {
-        "must": [\
-            { "has_id": [1,3,5,7,9,11] }\
-        ]
-    }
-    ...
-}

+Demetrios:
+I was just going to ask if you have dashboards. Everybody loves a good dashboard. Let's see it. That's awesome.

-```
+Sourabh Agrawal:
+So let's see. Okay, let's take this one. So in this case, I just ran some of these context relevance checks for some of the queries. So you can see how that changes on your data sets if you're running the same. We also run this in a monitoring setting, so you can see how this varies over time. And then finally you have all of the data. So we provide all of the data, you can download it, run whatever analysis you want to run. And then one of the features which we have built recently and is getting very popular amongst our users is that you can filter cases where, let's say, the model is failing.
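+For readers following along, the flow Sourabh describes (send a question, its retrieved context and the model's response, pick the checks to run, and get back scores with explanations) maps to a short snippet with UpTrain's open-source Python package. This is only a rough sketch: the data values are placeholders, and the exact class and check names may differ between UpTrain versions.
+
+```python
+from uptrain import EvalLLM, Evals
+
+# Placeholder example: one question, the retrieved context, and the model's answer.
+data = [{
+    "question": "What is Qdrant?",
+    "context": "Qdrant is an open-source vector database and similarity search engine.",
+    "response": "Qdrant is a vector database used for similarity search.",
+}]
+
+# The evaluator itself calls an LLM, so it needs an API key (OpenAI in this sketch).
+eval_llm = EvalLLM(openai_api_key="sk-...")
+
+# Pre-configured checks; each result comes back with a score and an explanation.
+results = eval_llm.evaluate(
+    data=data,
+    checks=[
+        Evals.CONTEXT_RELEVANCE,
+        Evals.RESPONSE_COMPLETENESS,
+        Evals.FACTUAL_ACCURACY,
+    ],
+)
+
+for row in results:
+    print(row)
+```
+
+The returned scores are what the threshold logic mentioned above would act on, for example falling back to a canned response when a jailbreak score crosses 0.5.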
-```python
-client.scroll(
-    collection_name="{collection_name}",
-    scroll_filter=models.Filter(
-        must=[\
-            models.HasIdCondition(has_id=[1, 3, 5, 7, 9, 11]),\
-        ],
-    ),
-)
+Sourabh Agrawal:
+So let's say I take all the cases where the response score is zero and I can find common topics. So I can look at all these cases and I can find, okay, what's the common theme across them? Maybe, as you can see, they're all talking about France, Romeo and Juliet, and so on. So it can just pull out a common topic among these cases. So then this gives you some insights into where things are going wrong and what you need to improve upon. And the second piece of the puzzle is the experiments. So not only can you evaluate them, but also you can use it to experiment with different settings. So let's say. Let me just pull out an experiment I ran recently.

-```
+Demetrios:
+Yeah.

-```typescript
-client.scroll("{collection_name}", {
-  filter: {
-    must: [\
-      {\
-        has_id: [1, 3, 5, 7, 9, 11],\
-      },\
-    ],
-  },
-});
+Sourabh Agrawal:
+So let's say I want to compare two different models, right? So GPT 3.5 and Claude 2. So I can now see that, okay, Claude 2 is giving more concise responses, but in terms of factual accuracy, GPT 3.5 is more factually accurate. So I can now decide, based on my application, based on what my users want, which of these criteria is more meaningful for me, for my users, for my data, and decide which prompt or which model I want to go ahead with.

-```
+Demetrios:
+This is totally what I was talking about earlier, where you get a new model and you're seeing on some metrics it's doing worse. But then on your core metric that you're looking at, it's actually performing better. So you have to kind of explain to yourself, why is it doing better on those other metrics? I don't know if I'm understanding this correctly. We can set the metrics that we're looking at.

-```rust
-use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder};
-use qdrant_client::Qdrant;
+Sourabh Agrawal:
+Yeah, actually, I'll show you the kind of metrics. Also, I forgot to mention earlier, UpTrain is open source.

-let client = Qdrant::from_url("http://localhost:6334").build()?;
+Demetrios:
+Nice.

-client
-    .scroll(
-        ScrollPointsBuilder::new("{collection_name}")
-            .filter(Filter::must([Condition::has_id([1, 3, 5, 7, 9, 11])])),
-    )
-    .await?;
+Sourabh Agrawal:
+Yeah. So we have these pre-configured checks, so you don't need to do anything. You can just say UpTrain response completeness or UpTrain prompt injection. So these are pre-configured. We did the hard work of getting all these scores and so on. And on top of that, we also have ways for you to customize these metrics, so you can define a custom guideline, you can change the prompt which you want, you can even define a custom Python function which you want to act as an evaluator.

-```
+Sourabh Agrawal:
+So we provide all of those functionalities so that they can take advantage of things which are already there, as well as create custom things which make sense for them, and have a way to truly understand how their systems are doing.

-```java
-import java.util.List;

+Demetrios:
+Oh, that's really cool. I really like the idea of custom, being able to set custom ones, but then also having some that just come right out of the box to make life easier on us.

-import static io.qdrant.client.ConditionFactory.hasId;
-import static io.qdrant.client.PointIdFactory.id;

+Sourabh Agrawal:
+Yeah.
And I think both are needed, because you want someplace to start, and as you advance, you can't cover everything with pre-configured checks. So you want to have a way to customize things.

-import io.qdrant.client.grpc.Points.Filter;
-import io.qdrant.client.grpc.Points.ScrollPoints;
+Demetrios:
+Yeah. And especially once you have data flowing, you'll start to see what other things you need to be evaluating exactly.

-client
-    .scrollAsync(
-        ScrollPoints.newBuilder()
-            .setCollectionName("{collection_name}")
-            .setFilter(
-                Filter.newBuilder()
-                    .addMust(hasId(List.of(id(1), id(3), id(5), id(7), id(9), id(11))))
-                    .build())
-            .build())
-    .get();
+Sourabh Agrawal:
+Yeah, that's very true.

-```
+Demetrios:
+Just a random one. I'm not telling you how to build your product or anything, but have you thought about having a community-sourced metric? So, like, all these custom ones that people are making, maybe there's a hub where we can add our custom ones?

-```csharp
-using Qdrant.Client;
-using static Qdrant.Client.Grpc.Conditions;
+Sourabh Agrawal:
+Yeah, I think that's really interesting. This is something we also have been thinking about a lot. It's not built out yet, but we plan to go in that direction pretty soon. We want to create a store kind of a thing where people can add their custom metrics. So, yeah, you're right on. I also believe that's the way to go, and we will be releasing something on those fronts pretty soon.

-var client = new QdrantClient("localhost", 6334);
+Demetrios:
+Nice. So Drew's asking, how do you handle jailbreak for different types of applications? Jailbreak for a medical app would be different than one for a finance one, right? Yeah.

-await client.ScrollAsync(collectionName: "{collection_name}", filter: HasId([1, 3, 5, 7, 9, 11]));
+Sourabh Agrawal:
+The way our jailbreak check is configured, it takes something that we call a model purpose. So you define what the purpose of your model is. For a financial app, you need to say that, okay, this LLM application is designed to answer financial queries, and so on. For medical, you will have a different purpose, so you can configure what the purpose of your app is. And then when we take up a user query, we check whether the user query is under that. Firstly, we also check for illegal activities and so on. And then we also check whether it's under the purview of this purpose.

-```
+Sourabh Agrawal:
+If not, then we tag that as a scenario of jailbreak, because the user is trying to do something other than the purpose. So that's how we tackle it.

-```go
-import (
-    "context"
+Demetrios:
+Nice, dude. Well, this is awesome. Is there anything else you want to say before we jump off?

- "github.com/qdrant/go-client/qdrant"
-)
+Sourabh Agrawal:
+No, I mean, it was like, a great conversation. Really glad to be here and great talking to you.

-client, err := qdrant.NewClient(&qdrant.Config{
- Host: "localhost",
- Port: 6334,
-})
+Demetrios:
+Yeah, I'm very happy that we got this working and you were able to show us a little bit of UpTrain. Super cool that it's open source. So I would recommend everybody go check it out, get your LLMs working with confidence, and make sure that nobody is using your chatbot to be their GPT subsidy, like the GM use case. Yeah, it's great, dude. I appreciate it.
-client.Scroll(context.Background(), &qdrant.ScrollPoints{
- CollectionName: "{collection_name}",
- Filter: &qdrant.Filter{
- Must: []*qdrant.Condition{
- qdrant.NewHasID(
- qdrant.NewIDNum(1),
- qdrant.NewIDNum(3),
- qdrant.NewIDNum(5),
- qdrant.NewIDNum(7),
- qdrant.NewIDNum(9),
- qdrant.NewIDNum(11),
- ),
- },
- },
-})
+Sourabh Agrawal:
+Yeah, check us out, we are at github.com/uptrain-ai/uptrain.

-```
+Demetrios:
+There we go. And if anybody else wants to come on to the Vector Space Talks and talk to us about all the cool stuff that you're doing, hit us up and we'll see you all astronauts later. Don't get lost in vector space.

-Filtered points would be:
+Sourabh Agrawal:
+Yeah, thank you. Thanks a lot.

-```json
-[\
-    { "id": 1, "city": "London", "color": "green" },\
-    { "id": 3, "city": "London", "color": "blue" },\
-    { "id": 5, "city": "Moscow", "color": "green" }\
-]
+Demetrios:
+All right, dude. There we go. We are good. I don't know how the hell I'm going to stop this one, because I can't go through on my phone and I can't go through on my computer. It's so weird. So technically there's nobody at the wheel right now. So I think if we both get off, it should stop working. Okay.

-```
+Demetrios:
+Yeah, but that was awesome, man. This is super cool. I really like what you're doing, and it's so funny. I don't know if we're not connected on LinkedIn, are we? I literally just today posted a video of me going through a few different hallucination mitigation techniques. So it's, like, super timely that you talk about this. I think so many people have been thinking about this.

-### [Anchor](https://qdrant.tech/documentation/concepts/filtering/\#has-vector) Has vector

+Sourabh Agrawal:
+Definitely with enterprises, it's like a big issue. Right? I mean, how do you make it safe? How do you make it production ready? So I'll definitely check out your video. It would be super interesting.

-_Available as of v1.13.0_

+Demetrios:
+Just go to my LinkedIn right now. It's just like LinkedIn.com dpbrinkm or just search for me. I think we are connected. We're connected. All right, cool. Yeah, so, yeah, check out the last video I just posted, because it's literally all about this. And there's a really cool paper that came out and you probably saw it. It's all about mitigating AI hallucinations, and it breaks down all 32 techniques.

-This condition enables filtering by the presence of a given named vector on a point.

+Demetrios:
+And on another podcast that I do, I was literally talking with the guys from Weights & Biases yesterday, and I was talking about how, man, these evaluation data sets as a service feel like something that nobody's doing. And I guess it's probably because, and you're the expert, so I would love to hear what you have to say about it, but I guess it's because you don't really need it that bad. With a relatively small amount of data, you can start getting some really good evaluation happening. So it's a lot better than paying somebody else.

-For example, if we have two named vector in our collection.

+Sourabh Agrawal:
+And also, I think it doesn't make sense for a service because some external person is not best suited to make a data set for your use case.

-```http
-PUT /collections/{collection_name}
-{
-    "vectors": {
-        "image": {
-            "size": 4,
-            "distance": "Dot"
-        },
-        "text": {
-            "size": 8,
-            "distance": "Cosine"
-        }
-    },
-    "sparse_vectors": {
-        "sparse-image": {},
-        "sparse-text": {},
-    },
-}

+Demetrios:
+Right.
-```
+Sourabh Agrawal:
+It's you. You have to look at what your users are asking to create a good data set. You can have a method, which is what UpTrain also does. We basically help you to sample and pick out the right cases from this data set based on the feedback of your users, based on the scores which are being generated. But it's difficult for someone external to craft really good questions or really good queries or really good cases which make sense for your business.

-Some points in the collection might have all vectors, some might have only a subset of them.

+Demetrios:
+Because the other piece that kind of, like, spitballed off of that, the other piece of it was techniques. So let me see if I can place all these words into a coherent sentence for you. It's basically like, okay, evaluation data sets don't really make sense because you're the one who knows the most. With a relatively small amount of data, you're going to be able to get stuff going real quick. What I thought about is, what about these hallucination mitigation techniques, so that you can almost have options? So in this paper, right, there's like 32 different kinds of techniques that they use, and some are very pertinent for RAG. They have like five different or four different types of techniques when you're dealing with RAG to mitigate hallucinations. Then they have some like, okay, if you're distilling a model, here is how you can make sure that the new distilled model doesn't hallucinate as much.

-This is how you can search for points which have the dense `image` vector defined:

+Demetrios:
+Blah, blah, blah. But what I was thinking is like, what about how can you get a product? Or can you productize these kinds of techniques? So, all right, cool. They're in this paper, but in UpTrain, can we just say, oh, you want to try this new mitigation technique? We make that really easy for you. You just have to select it as one of the hallucination mitigation techniques, and then we do the heavy lifting. For example, have you heard of FLEEK? That was one that I was talking about in the video. FLEEK is where there's a knowledge graph LLM that is created, and it is specifically created to try and combat hallucinations. And the way that they do it is they say that the LLM will try and identify anywhere in the prompt or the output.

-httppythontypescriptrustjavacsharpgo

+Demetrios:
+Sorry, the output. It will try and identify if there's anything that can be fact-checked. And so if it says that humans landed on the moon in 1969, it will identify that. And then either through its knowledge graph or through just forming a search query that will go out and then search the Internet, it will verify if that fact is true in the output. So that's like one technique, right? And so what I'm thinking about is like, oh, man, wouldn't it be cool if you could have all these different techniques to be able to use really easily, as opposed to, great, I read it in a paper. Now, how the fuck am I going to get my hands on one of these LLMs with a knowledge graph if I don't train it myself?

-```http
-POST /collections/{collection_name}/points/scroll
-{
-    "filter": {
-        "must": [\
-            { "has_vector": "image" }\
-        ]
-    }
-}
+Sourabh Agrawal:
+Shit, yeah, I think that's a great suggestion. I'll definitely check it out. One of the things which we also want to do is integrate with all these techniques, because these are really good techniques and they help solve a lot of problems, but using them is not simple.
Recently we integrated with Spade. It's basically like a technique where I. -``` +Demetrios: +Did another video on spade, actually. -```python -from qdrant_client import QdrantClient, models +Sourabh Agrawal: +Yeah, basically. I think I'll also check out these hallucinations. So right now what we do is based on this paper called fact score, which instead of checking on the Internet, it checks in the context only to verify this fact can be verified from the context or not. But I think it would be really cool if people can just play around with these techniques and just see whether it's actually working on their data or not. -client = QdrantClient(url="http://localhost:6333") +Demetrios: +That's kind of what I was thinking is like, oh, can you see? Does it give you a better result? And then the other piece is like, oh, wait a minute, does this actually, can I put like two or three of them in my system at the same time? Right. And maybe it's over engineering or maybe it's not. I don't know. So there's a lot of fun stuff that can go down there and it's fascinating to think about. -client.scroll( - collection_name="{collection_name}", - scroll_filter=models.Filter( - must=[\ - models.HasVectorCondition(has_vector="image"),\ - ], - ), -) +Sourabh Agrawal: +Yeah, definitely. And I think experimentation is the key here, right? I mean, unless you try out them, you don't know what works. And if something works which improves your system, then definitely it was worth it. -``` +Demetrios: +Thanks for that. -```typescript -client.scroll("{collection_name}", { - filter: { - must: [\ - {\ - has_vector: "image",\ - },\ - ], - }, -}); +Sourabh Agrawal: +We'll check into it. -``` +Demetrios: +Dude, awesome. It's great chatting with you, bro. And I'll talk to you later, bro. -```rust -use qdrant_client::qdrant::{Condition, Filter, ScrollPointsBuilder}; -use qdrant_client::Qdrant; +Sourabh Agrawal: +Yeah, thanks a lot. Great speaking. See you. Bye. -let client = Qdrant::from_url("http://localhost:6334").build()?; +<|page-371-lllmstxt|> +> "*The vector search engine that we chose is Qdrant, but why did we choose it? Actually, it answers all the load constraints and the technical needs that we had. It allows us to do a fast neighbor search. It has a python API which matches the recommender tag that we have.*”\ +-- Gladys Roch +> -client - .scroll( - ScrollPointsBuilder::new("{collection_name}") - .filter(Filter::must([Condition::has_vector("image")])), - ) - .await?; +Gladys Roch is a French Machine Learning Engineer at Dailymotion working on recommender systems for video content. -``` +> "*We don't have full control and at the end the cost of their solution is very high for a very low proposal. So after that we tried to benchmark other solutions and we found out that Qdrant was easier for us to implement.*”\ +-- Samuel Leonardo Gracio +> -```java -import java.util.List; +Samuel Leonardo Gracio, a Senior Machine Learning Engineer at Dailymotion, mainly works on recommender systems and video classification. -import static io.qdrant.client.ConditionFactory.hasVector; -import static io.qdrant.client.PointIdFactory.id; +***Listen to the episode on [Spotify](https://open.spotify.com/episode/4YYASUZKcT5A90d6H2mOj9?si=a5GgBd4JTR6Yo3HBJfiejQ), Apple Podcast, Podcast addicts, Castbox. 
You can also watch this episode on [YouTube](https://youtu.be/z_0VjMZ2JY0).*** -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.ScrollPoints; + -client - .scrollAsync( - ScrollPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter( - Filter.newBuilder() - .addMust(hasVector("image")) - .build()) - .build()) - .get(); + -``` +## **Top takeaways:** -```csharp -using Qdrant.Client; -using static Qdrant.Client.Grpc.Conditions; +Are you captivated by how video recommendations that are engineered to serve up your next binge-worthy content? We definitely are. -var client = new QdrantClient("localhost", 6334); +Get ready to unwrap the secrets that keep millions engaged, as Demetrios chats with the brains behind the scenes of Dailymotion. This episode is packed with insights straight from ML Engineers at Dailymotion who are reshaping how we discover videos online. -await client.ScrollAsync(collectionName: "{collection_name}", filter: HasVector("image")); +Here's what you’ll unbox from this episode: -``` +1. **The Mech Behind the Magic:** Understand how a robust video embedding process can change the game - from textual metadata to audio signals and beyond. +2. **The Power of Multilingual Understanding:** Discover the tools that help recommend videos to a global audience, transcending language barriers. +3. **Breaking the Echo Chamber:** Learn about Dailymotion's 'perspective' feature that's transforming the discovery experience for users. +4. **Challenges & Triumphs:** Hear how Qdrant helps Dailymotion tackle a massive video catalog and ensure the freshest content pops on your feed. +5. **Behind the Scenes with Qdrant:** Get an insider’s look at why Dailymotion entrusted their recommendation needs to Qdrant's capable hands (or should we say algorithms?). -```go -import ( - "context" +> Fun Fact: Did you know that Dailymotion juggles over 13 million recommendations daily? That's like serving up a personalized video playlist to the entire population of Greece. Every single day! +> - "github.com/qdrant/go-client/qdrant" -) +## Show notes: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +00:00 Vector Space Talks intro with Gladys and Samuel.\ +05:07 Recommender system needs vector search for recommendations.\ +09:29 Chose vector search engine for fast neighbor search.\ +13:23 Video transcript use for scalable multilingual embedding.\ +16:35 Transcripts prioritize over video title and tags.\ +17:46 Videos curated based on metadata for quality.\ +20:53 Qdrant setup overview for machine learning engineers.\ +25:25 Enhanced recommendation system improves user engagement.\ +29:36 Recommender system, A/B testing, collection aliases strategic.\ +33:03 Dailymotion's new feature diversifies video perspectives.\ +34:58 Exploring different perspectives and excluding certain topics. -client.Scroll(context.Background(), &qdrant.ScrollPoints{ - CollectionName: "{collection_name}", - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewHasVector( - "image", - ), - }, - }, -}) +## More Quotes from Gladys and Sam: -``` +"*Basically, we're computing the embeddings and then we feed them into Qdrant, and we do that with a streaming pipeline, which means that every time, so everything is in streaming, every time a new video is uploaded or updated, if the description changes, for example, then the embedding will be computed and then it will be fed directly into Qdrant.*”\ +-- Gladys Roch -##### Was this page useful? 
+*"We basically recommend videos to a user if other users watching the same video were watching other videos. But the problem with that is that it only works with videos where we have what we call here high signal. So videos that have at least thousands of views, some interactions, because for fresh and fresh or niche videos, we don't have enough interaction.”*\ +-- Samuel Leonardo Gracio -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +*"But every time we add new videos to Dailymotion, then it's growing. So it can provide recommendation for videos with few interactions that we don't know well. So we're very happy because it led us to huge performances increase on the low signal. We did a threefold increase on the CTR, which means the number of clicks on the recommendation. So with Qdrant we were able to kind of fix our call start issues.”*\ +-- Gladys Roch -Thank you for your feedback! 🙏 +*"The fact that you have a very cool team that helped us to implement some parts when it was difficult, I think it was definitely the thing that make us choose Qdrant instead of another solution.”*\ +-- Samuel Leonardo Gracio -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/filtering.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## Transcript: +Demetrios: +I don't know if you all realize what you got yourself into, but we are back for another edition of the Vector Space Talks. My stream is a little bit chunky and slow, so I think we're just to get into it with Gladys and Samuel from Daily motion. Thank you both for joining us. It is an honor to have you here. For everyone that is watching, please throw your questions and anything else that you want to remark about into the chat. We love chatting with you and I will jump on screen if there is something that we need to stop the presentation about and ask right away. But for now, I think you all got some screen shares you want to show us. -On this page: +Samuel Leonardo Gracio: +Yes, exactly. So first of all, thank you for the invitation, of course. And yes, I will share my screen. We have a presentation. Excellent. Should be okay now. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/filtering.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Demetrios: +Brilliant. -× +Samuel Leonardo Gracio: +So can we start? -[Powered by](https://qdrant.tech/) +Demetrios: +I would love it. Yes, I'm excited. I think everybody else is excited too. -<|page-182-lllmstxt|> -## create-snapshot -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Create & Restore Snapshots +Gladys Roch: +So welcome, everybody, to our vector space talk. I'm Gladys Roch, machine learning engineer at Dailymotion. -# [Anchor](https://qdrant.tech/documentation/database-tutorials/create-snapshot/\#backup-and-restore-qdrant-collections-using-snapshots) Backup and Restore Qdrant Collections Using Snapshots +Samuel Leonardo Gracio: +And I'm Samuel, senior machine learning engineer at Dailymotion. 
-| Time: 20 min | Level: Beginner | | | -| --- | --- | --- | --- | +Gladys Roch: +Today we're going to talk about Vector search in the context of recommendation and in particular how Qdrant. That's going to be a hard one. We actually got used to pronouncing Qdrant as a french way, so we're going to sleep a bit during this presentation, sorry, in advance, the Qdrant and how we use it for our content based recommender. So we are going to first present the context and why we needed a vector database and why we chose Qdrant, how we fit Qdrant, what we put in it, and we are quite open about the pipelines that we've set up and then we get into the results and how Qdrant helped us solve the issue that we had. -A collection is a basic unit of data storage in Qdrant. It contains vectors, their IDs, and payloads. However, keeping the search efficient requires additional data structures to be built on top of the data. Building these data structures may take a while, especially for large collections. -That’s why using snapshots is the best way to export and import Qdrant collections, as they contain all the bits and pieces required to restore the entire collection efficiently. +Samuel Leonardo Gracio: +Yeah. So first of all, I will talk about, globally, the recommendation at Dailymotion. So just a quick introduction about Dailymotion, because you're not all french, so you may not all know what Dailymotion is. So we are a video hosting platform as YouTube or TikTok, and we were founded in 2005. So it's a node company for videos and we have 400 million unique users per month. So that's a lot of users and videos and views. So that's why we think it's interesting. So Dailymotion is we can divide the product in three parts. -This tutorial will show you how to create a snapshot of a collection and restore it. Since working with snapshots in a distributed environment might be thought to be a bit more complex, we will use a 3-node Qdrant cluster. However, the same approach applies to a single-node setup. +Samuel Leonardo Gracio: +So one part is the native app. As you can see, it's very similar from other apps like TikTok or Instagram reels. So you have vertical videos, you just scroll and that's it. We also have a website. So Dailymotion.com, that is our main product, historical product. So on this website you have a watching page like you can have for instance, on YouTube. And we are also a video player that you can find in most of the french websites and even in other countries. And so we have recommendation almost everywhere and different recommenders for each of these products. -You can use the techniques described in this page to migrate a cluster. Follow the instructions -in this tutorial to create and download snapshots. When you [Restore from snapshot](https://qdrant.tech/documentation/database-tutorials/create-snapshot/#restore-from-snapshot), restore your data to the new cluster. +Gladys Roch: +Okay, so that's Dailymotion. But today we're going to focus on one of our recommender systems. Actually, the machine learning engineer team handles multiple recommender systems. But the video to video recommendation is the oldest and the most used. And so it's what you can see on the screen, it's what you have the recommendation queue of videos that you can see on the side or below the videos that you're watching. And to compute these suggestions, we have multiple models running. So that's why it's a global system. This recommendation is quite important for Dailymotion. 
-## [Anchor](https://qdrant.tech/documentation/database-tutorials/create-snapshot/\#prerequisites) Prerequisites +Gladys Roch: +It's actually a key component. It's one of the main levers of audience generation. So for everybody who comes to the website from SEO or other ways, then that's how we generate more audience and more engagement. So it's very important in the revenue stream of the platform. So working on it is definitely a main topic of the team and that's why we are evolving on this topic all the time. -Let’s assume you already have a running Qdrant instance or a cluster. If not, you can follow the [installation guide](https://qdrant.tech/documentation/guides/installation/) to set up a local Qdrant instance or use [Qdrant Cloud](https://cloud.qdrant.io/) to create a cluster in a few clicks. +Samuel Leonardo Gracio: +Okay, so why would we need a vector search for this recommendation? I think we are here for that. So as many platforms and as many recommender systems, I think we have a very usual approach based on a collaborative model. So we basically recommend videos to a user if other users watching the same video were watching other videos. But the problem with that is that it only works with videos where we have what we call here high signal. So videos that have at least thousands of views, some interactions, because for fresh and fresh or niche videos, we don't have enough interaction. And we have a problem that I think all the recommender systems can have, which is a costar tissue. So this costar tissue is for new users and new videos, in fact. So if we don't have any information or interaction, it's difficult to recommend anything based on this collaborative approach. -Once the cluster is running, let’s install the required dependencies: +Samuel Leonardo Gracio: +So the idea to solve that was to use a content based recommendation. It's also a classic solution. And the idea is when you have a very fresh video. So video, hey, in this case, a good thing to recommend when you don't have enough information is to recommend a very similar video and hope that the user will watch it also. So for that, of course, we use Qdrant and we will explain how. So yeah, the idea is to put everything in the vector space. So each video at Dailymotion will go through an embedding model. So for each video we'll get a video on embedding. -```shell -pip install qdrant-client datasets +Samuel Leonardo Gracio: +We will describe how we do that just after and put it in a vector space. So after that we could use Qdrant to, sorry, Qdrant to query and get similar videos that we will recommend to our users. -``` +Gladys Roch: +Okay, so if we have embeddings to represent our videos, then we have a vector space, but we need to be able to query this vector space and not only to query it, but to do it at scale and online because it's like a recommender facing users. So we have a few requirements. The first one is that we have a lot of videos in our catalog. So actually doing an exact neighbor search would be unreasonable, unrealistic. It's a combinatorial explosion issue, so we can't do an exact Knn. Plus we also have new videos being uploaded to Dailymotion every hour. So if we could somehow manage to do KNN and to pre compute it, it would never be up to date and it would be very expensive to recompute all the time to include all the new videos. So we need a solution that can integrate new videos all the time. 
-### [Anchor](https://qdrant.tech/documentation/database-tutorials/create-snapshot/\#establish-a-connection-to-qdrant) Establish a connection to Qdrant +Gladys Roch: +And we're also at scale, we serve over 13 million recommendation each day. So it means that we need a big setup to retrieve the neighbors of many videos all day. And finally, we have users waiting for the recommendation. So it's not just pre computed and stored, and it's not just content knowledge. We are trying to provide the recommendation as fast as possible. So we have time constraints and we only have a few hundred milliseconds to compute the recommendation that we're going to show the user. So we need to be able to retrieve the close video that we'd like to propose to the user very fast. So we need to be able to navigate this vector space that we are building quite quickly. -We are going to use the Python SDK and raw HTTP calls to interact with Qdrant. Since we are going to use a 3-node cluster, we need to know the URLs of all the nodes. For the simplicity, let’s keep them all in constants, along with the API key, so we can refer to them later: +Gladys Roch: +So of course we need vector search engine. That's the most easy way to do it, to be able to compute and approximate neighbor search and to do it at scale. So obviously, evidently the vector search engine that we chose this Qdrant, but why did we choose it? Actually, it answers all the load constraints and the technical needs that we had. It allows us to do a fast neighbor search. It has a python API which match the recommendous tag that we have. A very important issue for us was to be able to not only put the embeddings of the vectors in this space but also to put metadata with it to be able to get a bit more information and not just a mathematical representation of the video in this database. And actually doing that make it filterable, which means that we can retrieve neighbors of a video, but given some constraints, and it's very important for us typically for language constraints. Samuel will talk a bit more in details about that just after. -```python -QDRANT_MAIN_URL = "https://my-cluster.com:6333" -QDRANT_NODES = ( - "https://node-0.my-cluster.com:6333", - "https://node-1.my-cluster.com:6333", - "https://node-2.my-cluster.com:6333", -) -QDRANT_API_KEY = "my-api-key" +Gladys Roch: +But we have an embedding that is multilingual and we need to be able to filter all the language, all the videos on their language to offer more robust recommendation for our users. And also Qdrant is distributed and so it's scalable and we needed that due to the load that I just talked about. So that's the main points that led us to choose Qdrant. -``` +Samuel Leonardo Gracio: +And also they have an amazing team. -We can now create a client instance: +Gladys Roch: +So that's another, that would be our return of experience. The team of Qdrant is really nice. You helped us actually put in place the cluster. -```python -from qdrant_client import QdrantClient +Samuel Leonardo Gracio: +Yeah. So what do we put in our Qdrant cluster? So how do we build our robust video embedding? I think it's really interesting. So the first point for us was to know what a video is about. So it's a really tricky question, in fact. So of course, for each video uploaded on the platform, we have the video signal, so many frames representing the video, but we don't use that for our meetings. And in fact, why we are not using them, it's because it contains a lot of information. Right, but not what we want. 
For instance, here you have video about an interview of LeBron James. -client = QdrantClient(QDRANT_MAIN_URL, api_key=QDRANT_API_KEY) +Samuel Leonardo Gracio: +But if you only use the frames, the video signal, you can't even know what he's saying, what the video is about, in fact. So we still try to use it. But in fact, the most interesting thing to represent our videos are the textual metadata. So the textual metadata, we have them for every video. So for every video uploaded on the platform, we have a video title, video description that are put by the person that uploads the video. But we also have automatically detected tags. So for instance, for this video, you could have LeBron James, and we also have subtitles that are automatically generated. So just to let you know, we do that using whisper, which is an open source solution provided by OpenAI, and we do it at scale. -``` +Samuel Leonardo Gracio: +When a video is uploaded, we directly have the video transcript and we can use this information to represent our videos with just a textual embedding, which is far more easy to treat, and we need less compute than for frames, for instance. So the other issue for us was that we needed an embedding that could scale so that does not require too much time to compute because we have a lot of videos, more than 400 million videos, and we have many videos uploaded every hour, so it needs to scale. We also have many languages on our platform, more than 300 languages in the videos. And even if we are a french video platform, in fact, it's only a third of our videos that are actually in French. Most of the videos are in English or other languages such as Turkish, Spanish, Arabic, et cetera. So we needed something multilingual, which is not very easy to find. But we came out with this embedding, which is called multilingual universal sentence encoder. It's not the most famous embedding, so I think it's interesting to share it. -First of all, we are going to create a collection from a precomputed dataset. If you already have a collection, you can skip this step and start by [creating a snapshot](https://qdrant.tech/documentation/database-tutorials/create-snapshot/#create-and-download-snapshots). +Samuel Leonardo Gracio: +It's open source, so everyone can use it. It's available on Tensorflow hub, and I think that now it's also available on hugging face, so it's easy to implement and to use it. The good thing is that it's pre trained, so you don't even have to fine tune it on your data. You can, but I think it's not even required. And of course it's multilingual, so it doesn't work with every languages. But still we have the main languages that are used on our platform. It focuses on semantical similarity. And you have an example here when you have different video titles. -(Optional) Create collection and import data +Samuel Leonardo Gracio: +So for instance, one about soccer, another one about movies. Even if you have another video title in another language, if it's talking about the same topic, they will have a high cosine similarity. So that's what we want. We want to be able to recommend every video that we have in our catalog, not depending on the language. And the good thing is that it's really fast. Actually, it's a few milliseconds on cpu, so it's really easy to scale. So that was a huge requirement for us. -### Load the dataset +Demetrios: +Can we jump in here? -We are going to use a dataset with precomputed embeddings, available on Hugging Face Hub. 
The dataset is called [Qdrant/arxiv-titles-instructorxl-embeddings](https://huggingface.co/datasets/Qdrant/arxiv-titles-instructorxl-embeddings) and was created using the [InstructorXL](https://huggingface.co/hkunlp/instructor-xl) model. It contains 2.25M embeddings for the titles of the papers from the [arXiv](https://arxiv.org/) dataset. +Demetrios: +There's a few questions that are coming through that I think are pretty worth. And it's actually probably more suited to the last slide. Sameer is asking this one, actually, one more back. Sorry, with the LeBron. Yeah, so it's really about how you understand the videos. And Sameer was wondering if you can quote unquote hack the understanding by putting some other tags or. -Loading the dataset is as simple as: +Samuel Leonardo Gracio: +Ah, you mean from a user perspective, like the person uploading the video, right? -```python -from datasets import load_dataset +Demetrios: +Yeah, exactly. -dataset = load_dataset( - "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True -) +Samuel Leonardo Gracio: +You could do that before using transcripts, but since we are using them mainly and we only use the title, so the tags are automatically generated. So it's on our side. So the title and description, you can put whatever you want. But since we have the transcript, we know the content of the video and we embed that. So the title and the description are not the priority in the embedding. So I think it's still possible, but we don't have such use case. In fact, most of the people uploading videos are just trying to put the right title, but I think it's still possible. But yeah, with the transcript we don't have any examples like that. -``` +Samuel Leonardo Gracio: +Yeah, hopefully. -We used the streaming mode, so the dataset is not loaded into memory. Instead, we can iterate through it and extract the id and vector embedding: +Demetrios: +So that's awesome to think about too. It kind of leads into the next question, which is around, and this is from Juan Pablo. What do you do with videos that have no text and no meaningful audio, like TikTok or a reel? -```python -for payload in dataset: - id_ = payload.pop("id") - vector = payload.pop("vector") - print(id_, vector, payload) +Samuel Leonardo Gracio: +So for the moment, for these videos, we are only using the signal from the title tags, description and other video metadata. And we also have a moderation team which is watching the videos that we have here in the mostly recommended videos. So we know that the videos that we recommend are mostly good videos. And for these videos, so that don't have audio signal, we are forced to use the title tags and description. So these are the videos where the risk is at the maximum for us currently. But we are also working at the moment on something using the audio signal and the frames, but not all the frames. But for the moment, we don't have this solution. Right. -``` +Gladys Roch: +Also, as I said, it's not just one model, we're talking about the content based model. But if we don't have a similarity score that is high enough, or if we're just not confident about the videos that were the closest, then we will default to another model. So it's not just one, it's a huge system. -A single payload looks like this: +Samuel Leonardo Gracio: +Yeah, and one point also, we are talking about videos with few interactions, so they are not videos at risk. I mean, they don't have a lot of views. 
When this content based algo is called, they are important because there are very fresh videos, and fresh videos will have a lot of views in a few minutes. But when the collaborative model will be retrained, it will be able to recommend videos on other things than the content itself, but it will use the collaborative signal. So I'm not sure that it's a really important risk for us. But still, I think we could still do some improvement for that aspect. -```json -{ - 'title': 'Dynamics of partially localized brane systems', - 'DOI': '1109.1415' -} +Demetrios: +So where do I apply to just watch videos all day for the content team? All right, I'll let you get back to it. Sorry to interrupt. And if anyone else has good questions. -``` +Samuel Leonardo Gracio: +And I think it's good to ask your question during the presentation, it's more easier to answer. So, yeah, sorry, I was saying that we had this multilingual embedding, and just to present you our embedding pipeline. So, for each video that is uploaded or edited, because you can change the video title whenever you want, we have a pub sub event that is sent to a dataflow pipeline. So it's a streaming job for every video we will retrieve. So textual metadata, title, description tags or transcript, preprocess it to remove some words, for instance, and then call the model to have this embedding. And then. So we put it in bigquery, of course, but also in Qdrant. -### Create a collection +Gladys Roch: +So I'm going to present a bit our Qdrant setup. So actually all this was deployed by our tier DevOps team, not by us machine learning engineers. So it's an overview, and I won't go into the details because I'm not familiar with all of this, but basically, as Samuel said, we're computing the embeddings and then we feed them into Qdrant, and we do that with a streaming pipeline, which means that every time, so everything is in streaming, every time a new video is uploaded or updated, if the description changes, for example, then the embedding will be computed and then it will be fed directly into Qdrant. And on the other hand, our recommender queries the Qdrant vector space through GrPC ingress. And actually Qdrant is running on six pods that are using arm nodes. And you have the specificities of which type of nodes we're using there, if you're interested. But basically that's the setup. And what is interesting is that our recommendation stack for now, it's on premise, which means it's running on Dailymotion servers, not on the Google Kubernetes engine, whereas Qdrant is on the TKE. -First things first, we need to create our collection. We’re not going to play with the configuration of it, but it makes sense to do it right now. -The configuration is also a part of the collection snapshot. +Gladys Roch: +So we are querying it from outside. And also if you have more questions about this setup, we'll be happy to redirect you to the DevOps team that helped us put that in place. And so finally the results. So we stated earlier that we had a call start issue. So before Qdrant, we had a lot of difficulties with this challenge. We had a collaborative recommender that was trained and performed very well on high senior videos, which means that is videos with a lot of interactions. So we can see what user like to watch, which videos they like to watch together. And we also had a metadata recommender. 
-```python -from qdrant_client import models +Gladys Roch: +But first, this collaborative recommender was actually also used to compute call start recommendation, which is not allowed what it is trained on, but we were using a default embedding to compute like a default recommendation for call start, which led to a lot of popularity issues. Popularity issues for recommender system is when you always recommend the same video that is hugely popular and it's like a feedback loop. A lot of people will default to this video because it might be clickbait and then we will have a lot of inhaler action. So it will pollute the collaborative model all over again. So we had popularity issues with this, obviously. And we also had like this metadata recommender that only focused on a very small scope of trusted owners and trusted video sources. So it was working. It was an auto encoder and it was fine, but the scope was too small. -if not client.collection_exists("test_collection"): - client.create_collection( - collection_name="test_collection", - vectors_config=models.VectorParams( - size=768, # Size of the embedding vector generated by the InstructorXL model - distance=models.Distance.COSINE - ), - ) +Gladys Roch: +Too few videos could be recommended through this model. And also those two models were trained very infrequently, only every 4 hours and 5 hours, which means that any fresh videos on the platform could not be recommended properly for like 4 hours. So it was the main issue because Dailymotion uses a lot of fresh videos and we have a lot of news, et cetera. So we need to be topical and this couldn't be done with this huge delay. So we had overall bad performances on the Los signal. And so with squadron we fixed that. We still have our collaborative recommender. It has evolved since then. -``` +Gladys Roch: +It's actually computed much more often, but the collaborative model is only focused on high signal now and it's not computed like default recommendation for low signal that it doesn't know. And we have a content based recommender based on the muse embedding and Qdrant that is able to recommend to users video as soon as they are uploaded on the platform. And it has like a growing scope, 20 million vectors at the moment. But every time we add new videos to Dailymotion, then it's growing. So it can provide recommendation for videos with few interactions that we don't know well. So we're very happy because it led us to huge performances increase on the low signal. We did a threefold increase on the CTR, which means the number of clicks on the recommendation. So with Qdrant we were able to kind of fix our call start issues. -### Upload the dataset +Gladys Roch: +What I was talking about fresh videos, popularities, low performances. We fixed that and we were very happy with the setup. It's running smoothly. Yeah, I think that's it for the presentation, for the slides at least. So we are open to discussion and if you have any questions to go into the details of the recommender system. So go ahead, shoot. -Calculating the embeddings is usually a bottleneck of the vector search pipelines, but we are happy to have them in place already. Since the goal of this tutorial is to show how to create a snapshot, **we are going to upload only a small part of the dataset**. 
+Demetrios: +I've got some questions while people are typing out everything in the chat and the first one I think that we should probably get into is how did the evaluation process go for you when you were looking at different vector databases and vector search engines? -```python -ids, vectors, payloads = [], [], [] -for payload in dataset: - id_ = payload.pop("id") - vector = payload.pop("vector") +Samuel Leonardo Gracio: +So that's a good point. So first of all, you have to know that we are working with Google cloud platform. So the first thing that we did was to use their vector search engine, so which called matching engine. - ids.append(id_) - vectors.append(vector) - payloads.append(payload) +Gladys Roch: +Right. - # We are going to upload only 1000 vectors - if len(ids) == 1000: - break +Samuel Leonardo Gracio: +But the issue with matching engine is that we could not in fact add the API, wasn't easy to use. First of all. The second thing was that we could not put metadata, as we do in Qdrant, and filter out, pre filter before the query, as we are doing now in a Qdrant. And the first thing is that their solution is managed. Yeah, is managed. We don't have the full control and at the end the cost of their solution is very high for a very low proposal. So after that we tried to benchmark other solutions and we found out that Qdrant was easier for us to implement. We had a really cool documentation, so it was easy to test some things and basically we couldn't find any drawbacks for our use case at least. -client.upsert( - collection_name="test_collection", - points=models.Batch( - ids=ids, - vectors=vectors, - payloads=payloads, - ), -) +Samuel Leonardo Gracio: +And moreover, the fact that you have a very cool team that helped us to implement some parts when it was difficult, I think it was definitely the thing that make us choose Qdrant instead of another solution, because we implemented Qdrant. -``` +Gladys Roch: +Like on February or even January 2023. So Qdrant is fairly new, so the documentation was still under construction. And so you helped us through the discord to set up the cluster. So it was really nice. -Our collection is now ready to be used for search. Let’s create a snapshot of it. +Demetrios: +Excellent. And what about least favorite parts of using Qdrant? -If you already have a collection, you can skip the previous step and start by [creating a snapshot](https://qdrant.tech/documentation/database-tutorials/create-snapshot/#create-and-download-snapshots). +Gladys Roch: +Yeah, I have one. I discovered it was not actually a requirement at the beginning, but for recommender systems we tend to do a lot of a B test. And you might wonder what's the deal with Qdrant and a b test. It's not related, but actually we were able to a b test our collection. So how we compute the embedding? First we had an embedding without the transcript, and now we have an embedding that includes the transcript. So we wanted to a b test that. And on Quellin you can have collection aliases and this is super helpful because you can have two collections that live on the cluster at the same time, and then on your code you can just call the production collection and then set the alias to the proper one. So for a d testing and rollout it's very useful. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/create-snapshot/\#create-and-download-snapshots) Create and download snapshots +Gladys Roch: +And I found it when I first wanted to do an a test. So I like this one. 
It was an existed and I like it also, the second thing I like is the API documentation like the one that is auto generated with all the examples and how to query any info on Qdrant. It's really nice for someone who's not from DevOps. It help us just debug our collection whenever. So it's very easy to get into. -Qdrant exposes an HTTP endpoint to request creating a snapshot, but we can also call it with the Python SDK. -Our setup consists of 3 nodes, so we need to call the endpoint **on each of them** and create a snapshot on each node. While using Python SDK, that means creating a separate client instance for each node. +Samuel Leonardo Gracio: +And the fact that the product is evolving so fast, like every week almost. You have a new feeder. I think it's really cool. There is one community and I think, yeah, it's really interesting and it's amazing to have such people working on that on an open source project like this one. -pythonhttp +Gladys Roch: +We had feedback from our devot team when preparing this presentation. We reached out to them for the small schema that I tried to present. And yeah, they said that the open source community of quasant was really nice. It was easy to contribute, it was very open on Discord. I think we did a return on experience at some point on how we set up the cluster at the beginning. And yeah, they were very hyped by the fact that it's coded in rust. I don't know if you hear this a lot, but to them it's even more encouraging contributing with this kind of new language. -```python -snapshot_urls = [] -for node_url in QDRANT_NODES: - node_client = QdrantClient(node_url, api_key=QDRANT_API_KEY) - snapshot_info = node_client.create_snapshot(collection_name="test_collection") +Demetrios: +100% excellent. So last question from my end, and it is on if you're using Qdrant for anything else when it comes to products at Dailymotion, yes, actually we do. - snapshot_url = f"{node_url}/collections/test_collection/snapshots/{snapshot_info.name}" - snapshot_urls.append(snapshot_url) +Samuel Leonardo Gracio: +I have one slide about this. -``` +Gladys Roch: +We have slides because we presented quadrum to another talk a few weeks ago. -```http -// for `https://node-0.my-cluster.com:6333` -POST /collections/test_collection/snapshots +Samuel Leonardo Gracio: +So we didn't prepare this slide just for this presentation, it's from another presentation, but still, it's a good point because we're currently trained to use it in other projects. So as we said in this presentation, we're mostly using it for the watching page. So Dailymotion.com but we also introduced it in the mobile app recently through a feature that is called perspective. So the goal of the feature is to be able to break this vertical feed algorithm to let the users to have like a button to discover new videos. So when you go through your feed, sometimes you will get a video talking about, I don't know, a movie. You will get this button, which is called perspective, and you will be able to have other videos talking about the same movie but giving to you another point of view. So people liking the movie, people that didn't like the movie, and we use Qdrant, sorry for the candidate generation part. So to get the similar videos and to get the videos that are talking about the same subject. -// for `https://node-1.my-cluster.com:6333` -POST /collections/test_collection/snapshots +Samuel Leonardo Gracio: +So I won't talk too much about this project because it will require another presentation of 20 minutes or more. 
But still we are using it in other projects and yeah, it's really interesting to see what we are able to do with that tool. -// for `https://node-2.my-cluster.com:6333` -POST /collections/test_collection/snapshots +Gladys Roch: +Once we have the vector space set up, we can just query it from everywhere. In every project of recommendation. -``` +Samuel Leonardo Gracio: +We also tested some search. We are testing many things actually, but we don't have implemented it yet. For the moment we just have this perspective feed and the content based Roko, but we still have a lot of ideas using this vector search space. -Response +Demetrios: +I love that idea on the get another perspective. So it's not like you get, as you were mentioning before, you don't get that echo chamber and just about everyone saying the same thing. You get to see are there other sides to this? And I can see how that could be very uh, Juan Pablo is back, asking questions in the chat about are you able to recommend videos with negative search queries and negative in the sense of, for example, as a user I want to see videos of a certain topic, but I want to exclude some topics from the video. -```json -{ - "result": { - "name": "test_collection-559032209313046-2024-01-03-13-20-11.snapshot", - "creation_time": "2024-01-03T13:20:11", - "size": 18956800 - }, - "status": "ok", - "time": 0.307644965 -} +Gladys Roch: +Okay. We actually don't do that at the moment, but we know we can with squadron we can set positive and negative points from where to query. So actually for the moment we only retrieve close positive neighbors and we apply some business filters on top of that recommendation. But that's it. -``` +Samuel Leonardo Gracio: +And that's because we have also this collaborative model, which is our main recommender system. But I think we definitely need to check that and maybe in the future we will implement that. We saw that many documentation about this and I'm pretty sure that it would work very well on our use case. -Once we have the snapshot URLs, we can download them. Please make sure to include the API key in the request headers. -Downloading the snapshot **can be done only through the HTTP API**, so we are going to use the `requests` library. +Demetrios: +Excellent. Well folks, I think that's about it for today. I want to thank you so much for coming and chatting with us and teaching us about how you're using Qdrant and being very transparent about your use. I learned a ton. And for anybody that's out there doing recommender systems and interested in more, I think they can reach out to you on LinkedIn. I've got both of your we'll drop them in the chat right now and we'll let everybody enjoy. So don't get lost in vector base. We will see you all later. -```python -import requests -import os +Demetrios: +If anyone wants to give a talk next, reach out to me. We always are looking for incredible talks and so this has been great. Thank you all. -# Create a directory to store snapshots -os.makedirs("snapshots", exist_ok=True) +Gladys Roch: +Thank you. -local_snapshot_paths = [] -for snapshot_url in snapshot_urls: - snapshot_name = os.path.basename(snapshot_url) - local_snapshot_path = os.path.join("snapshots", snapshot_name) +Samuel Leonardo Gracio: +Thank you very much for the invitation and for everyone listening. Thank you. 
- response = requests.get( - snapshot_url, headers={"api-key": QDRANT_API_KEY} - ) - with open(local_snapshot_path, "wb") as f: - response.raise_for_status() - f.write(response.content) +Gladys Roch: +See you. Bye. - local_snapshot_paths.append(local_snapshot_path) +<|page-372-lllmstxt|> +> *"Building AI applications doesn't have to be complicated. You can leverage pre-trained models and support complex pipelines with a few lines of code. LangChain provides a unified interface, so that you can avoid writing boilerplate code and focus on the value you want to bring."* Kacper Lukawski, Developer Advocate, Qdrant -``` +## Long-Term Memory for Your GenAI App -Alternatively, you can use the `wget` command: +Qdrant's vector database quickly grew due to its ability to make Generative AI more effective. On its own, an LLM can be used to build a process-altering invention. With Qdrant, you can turn this invention into a production-level app that brings real business value. -```bash -wget https://node-0.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313046-2024-01-03-13-20-11.snapshot \ - --header="api-key: ${QDRANT_API_KEY}" \ - -O node-0-shapshot.snapshot +The use of vector search in GenAI now has a name: **Retrieval Augmented Generation (RAG)**. [In our previous article](/articles/rag-is-dead/), we argued why RAG is an essential component of AI setups, and why large-scale AI can't operate without it. Numerous case studies explain that AI applications are simply too costly and resource-intensive to run using only LLMs. -wget https://node-1.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313047-2024-01-03-13-20-12.snapshot \ - --header="api-key: ${QDRANT_API_KEY}" \ - -O node-1-shapshot.snapshot +> Going forward, the solution is to leverage composite systems that use models and vector databases. -wget https://node-2.my-cluster.com:6333/collections/test_collection/snapshots/test_collection-559032209313048-2024-01-03-13-20-13.snapshot \ - --header="api-key: ${QDRANT_API_KEY}" \ - -O node-2-shapshot.snapshot +**What is RAG?** Essentially, a RAG setup turns Qdrant into long-term memory storage for LLMs. As a vector database, Qdrant manages the efficient storage and retrieval of user data. -``` +Adding relevant context to LLMs can vastly improve user experience, leading to better retrieval accuracy, faster query speed and lower use of compute. Augmenting your AI application with vector search reduces hallucinations, a situation where AI models produce legitimate-sounding but made-up responses. -The snapshots are now stored locally. We can use them to restore the collection to a different Qdrant instance, or treat them as a backup. We will create another collection using the same data on the same cluster. +Qdrant streamlines this process of retrieval augmentation, making it faster, easier to scale and efficient. When you are accessing vast amounts of data (hundreds or thousands of documents), vector search helps your sort through relevant context. **This makes RAG a primary candidate for enterprise-scale use cases.** -## [Anchor](https://qdrant.tech/documentation/database-tutorials/create-snapshot/\#restore-from-snapshot) Restore from snapshot +## Why LangChain? -Our brand-new snapshot is ready to be restored. Typically, it is used to move a collection to a different Qdrant instance, but we are going to use it to create a new collection on the same cluster. -It is just going to have a different name, `test_collection_import`. 
We do not need to create a collection first, as it is going to be created automatically. +Retrieval Augmented Generation is not without its challenges and limitations. One of the main setbacks for app developers is managing the entire setup. The integration of a retriever and a generator into a single model can lead to a raised level of complexity, thus increasing the computational resources required. -Restoring collection is also done separately on each node, but our Python SDK does not support it yet. We are going to use the HTTP API instead, -and send a request to each node using `requests` library. +[LangChain](https://www.langchain.com/) is a framework that makes developing RAG-based applications much easier. It unifies interfaces to different libraries, including major embedding providers like OpenAI or Cohere and vector stores like Qdrant. With LangChain, you can focus on creating tangible GenAI applications instead of writing your logic from the ground up. -```python -for node_url, snapshot_path in zip(QDRANT_NODES, local_snapshot_paths): - snapshot_name = os.path.basename(snapshot_path) - requests.post( - f"{node_url}/collections/test_collection_import/snapshots/upload?priority=snapshot", - headers={ - "api-key": QDRANT_API_KEY, - }, - files={"snapshot": (snapshot_name, open(snapshot_path, "rb"))}, - ) +> Qdrant is one of the **top supported vector stores** on LangChain, with [extensive documentation](https://python.langchain.com/docs/integrations/vectorstores/qdrant) and [examples](https://python.langchain.com/docs/integrations/retrievers/self_query/qdrant_self_query). -``` +**How it Works:** LangChain receives a query and retrieves the query vector from an embedding model. Then, it dispatches the vector to a vector database, retrieving relevant documents. Finally, both the query and the retrieved documents are sent to the large language model to generate an answer. -Alternatively, you can use the `curl` command: +![qdrant-langchain-rag](/blog/using-qdrant-and-langchain/flow-diagram.png) -```bash -curl -X POST 'https://node-0.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ - -H 'api-key: ${QDRANT_API_KEY}' \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@node-0-shapshot.snapshot' +When supported by LangChain, Qdrant can help you set up effective question-answer systems, detection systems and chatbots that leverage RAG to its full potential. When it comes to long-term memory storage, developers can use LangChain to easily add relevant documents, chat history memory & rich user data to LLM app prompts via Qdrant. -curl -X POST 'https://node-1.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ - -H 'api-key: ${QDRANT_API_KEY}' \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@node-1-shapshot.snapshot' +## Common Use Cases -curl -X POST 'https://node-2.my-cluster.com:6333/collections/test_collection_import/snapshots/upload?priority=snapshot' \ - -H 'api-key: ${QDRANT_API_KEY}' \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@node-2-shapshot.snapshot' +Integrating Qdrant and LangChain can revolutionize your AI applications. Let's take a look at what this integration can do for you: -``` +*Enhance Natural Language Processing (NLP):* +LangChain is great for developing question-answering **chatbots**, where Qdrant is used to contextualize and retrieve results for the LLM. 
We cover this in [our article](/articles/langchain-integration/), and in OpenAI's [cookbook examples](https://cookbook.openai.com/examples/vector_databases/qdrant/qa_with_langchain_qdrant_and_openai) that use LangChain and GPT to process natural language. -**Important:** We selected `priority=snapshot` to make sure that the snapshot is preferred over the data stored on the node. You can read mode about the priority in the [documentation](https://qdrant.tech/documentation/concepts/snapshots/#snapshot-priority). +*Improve Recommendation Systems:* +Food delivery services thrive on indecisive customers. Businesses need to accomodate a multi-aim search process, where customers seek recommendations though semantic search. With LangChain you can build systems for **e-commerce, content sharing, or even dating apps**. -Apart from Snapshots, Qdrant also provides the [Qdrant Migration Tool](https://github.com/qdrant/migration) that supports: +*Advance Data Analysis and Insights:* Sometimes you just want to browse results that are not necessarily closest, but still relevant. Semantic search helps user discover products in **online stores**. Customers don't exactly know what they are looking for, but require constrained space in which a search is performed. -- Migration between Qdrant Cloud instances. -- Migrating vectors from other providers into Qdrant. -- Migrating from Qdrant OSS to Qdrant Cloud. +*Offer Content Similarity Analysis:* Ever been stuck seeing the same recommendations on your **local news portal**? You may be held in a similarity bubble! As inputs get more complex, diversity becomes scarce, and it becomes harder to force the system to show something different. LangChain developers can use semantic search to develop further context. -Follow our [migration guide](https://qdrant.tech/documentation/database-tutorials/migration/) to learn how to effectively use the Qdrant Migration tool. +## Building a Chatbot with LangChain -##### Was this page useful? +_Now that you know how Qdrant and LangChain work together - it's time to build something!_ -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Follow Daniel Romero's video and create a RAG Chatbot completely from scratch. You will only use OpenAI, Qdrant and LangChain. +Here is what this basic tutorial will teach you: -Thank you for your feedback! 🙏 +**1. How to set up a chatbot using Qdrant and LangChain:** You will use LangChain to create a RAG pipeline that retrieves information from a dataset and generates output. This will demonstrate the difference between using an LLM by itself and leveraging a vector database like Qdrant for memory retrieval. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/create-snapshot.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +**2. Preprocess and format data for use by the chatbot:** First, you will download a sample dataset based on some academic journals. Then, you will process this data into embeddings and store it as vectors inside of Qdrant. -On this page: +**3. Implement vector similarity search algorithms:** Second, you will create and test a chatbot that only uses the LLM. Then, you will enable the memory component offered by Qdrant. This will allow your chatbot to be modified and updated, giving it long-term memory. 
-- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/create-snapshot.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +**4. Optimize the chatbot's performance:** In the last step, you will query the chatbot in two ways. First query will retrieve parametric data from the LLM, while the second one will get contexual data via Qdrant. -× +The goal of this exercise is to show that RAG is simple to implement via LangChain and yields much better results than using LLMs by itself. -[Powered by](https://qdrant.tech/) + -<|page-183-lllmstxt|> -## agentic-rag-langgraph -- [Documentation](https://qdrant.tech/documentation/) -- Agentic RAG With LangGraph +## Scaling Qdrant and LangChain -# [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#agentic-rag-with-langgraph-and-qdrant) Agentic RAG With LangGraph and Qdrant +If you are looking to scale up and keep the same level of performance, Qdrant and LangChain are a rock-solid combination. Getting started with both is a breeze and the [documentation](https://python.langchain.com/docs/integrations/vectorstores/qdrant) covers a broad number of cases. However, the main strength of Qdrant is that it can consistently support the user way past the prototyping and launch phases. -Traditional Retrieval-Augmented Generation (RAG) systems follow a straightforward path: query → retrieve → generate. Sure, this works well for many scenarios. But let’s face it—this linear approach often struggles when you’re dealing with complex queries that demand multiple steps or pulling together diverse types of information. +> *"We are all-in on performance and reliability. Every release we make Qdrant faster, more stable and cost-effective for the user. When others focus on prototyping, we are already ready for production. Very soon, our users will build successful products and go to market. At this point, I anticipate a great need for a reliable vector store. Qdrant will be there for LangChain and the entire community."* -[Agentic RAG](https://qdrant.tech/articles/agentic-rag/) takes things up a notch by introducing AI agents that can orchestrate multiple retrieval steps and smartly decide how to gather and use the information you need. Think of it this way: in an Agentic RAG workflow, RAG becomes just one powerful tool in a much bigger and more versatile toolkit. +Whether you are building a bank fraud-detection system, RAG for e-commerce, or services for the federal government - you will need to leverage a scalable architecture for your product. Qdrant offers different features to help you considerably increase your application’s performance and lower your hosting costs. -By combining LangGraph’s robust state management with Qdrant’s cutting-edge vector search, we’ll build a system that doesn’t just answer questions—it tackles complex, multi-step information retrieval tasks with finesse. +> Read more about out how we foster [best practices for large-scale deployments](/articles/multitenancy/). -## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#what-well-build) What We’ll Build +## Next Steps -We’re building an AI agent to answer questions about Hugging Face and Transformers documentation using LangGraph. At the heart of our AI agent lies LangGraph, which acts like a conductor in an orchestra. It directs the flow between various components—deciding when to retrieve information, when to perform a web search, and when to generate responses. 
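To make the retrieval flow described above concrete, here is a minimal, self-contained sketch of a LangChain retriever backed by Qdrant feeding context to an LLM. It is illustrative rather than the tutorial's exact code: the imports assume the `langchain-openai` and `langchain-qdrant` integration packages, the Qdrant URL and collection name are placeholders, and an OpenAI API key is expected in the environment.

```python
# Illustrative only: embed a few documents into Qdrant, retrieve the ones
# relevant to a question, and pass them to the LLM as context.
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

docs = [
    Document(page_content="Qdrant is an open-source vector database written in Rust."),
    Document(page_content="LangChain provides unified interfaces to embedding models, vector stores and LLMs."),
]

vectorstore = QdrantVectorStore.from_documents(
    docs,
    OpenAIEmbeddings(model="text-embedding-3-small"),
    url="http://localhost:6333",           # placeholder Qdrant instance
    collection_name="langchain_rag_demo",  # placeholder collection name
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

question = "What is Qdrant?"
context = "\n".join(d.page_content for d in retriever.invoke(question))

answer = ChatOpenAI(model="gpt-4o").invoke(
    f"Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
)
print(answer.content)
```

The same code works against a managed cluster by pointing `url` at your Qdrant Cloud endpoint and passing its API key.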
+Now that you know how Qdrant and LangChain can elevate your setup - it's time to try us out. -The components are: two Qdrant vector stores and the Brave web search engine. However, our agent doesn’t just blindly follow one path. Instead, it evaluates each query and decides whether to tap into the first vector store, the second one, or search the web. +- Qdrant is open source and you can [quickstart locally](/documentation/quick-start/), [install it via Docker](/documentation/quick-start/), [or to Kubernetes](https://github.com/qdrant/qdrant-helm/). -This selective approach gives your system the flexibility to choose the best data source for the job, rather than being locked into the same retrieval process every time, like traditional RAG. While we won’t dive into query refinement in this tutorial, the concepts you’ll learn here are a solid foundation for adding that functionality down the line. +- We also offer [a free-tier of Qdrant Cloud](https://cloud.qdrant.io/) for prototyping and testing. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#workflow) Workflow +- For best integration with LangChain, read the [official LangChain documentation](https://python.langchain.com/docs/integrations/vectorstores/qdrant/). -![image1](https://qdrant.tech/documentation/examples/agentic-rag-langgraph/image1.png) +- For all other cases, [Qdrant documentation](/documentation/integrations/langchain/) is the best place to get there. -| **Step** | **Description** | -| --- | --- | -| **1\. User Input** | You start by entering a query or request through an interface, like a chatbot or a web form. This query is sent straight to the AI Agent, the brain of the operation. | -| **2\. AI Agent Processes the Query** | The AI Agent analyzes your query, figuring out what you’re asking and which tools or data sources will best answer your question. | -| **3\. Tool Selection** | Based on its analysis, the AI Agent picks the right tool for the job. Your data is spread across two vector databases, and depending on the query, it chooses the appropriate one. For queries needing real-time or external web data, the agent taps into a web search tool powered by BraveSearchAPI. | -| **4\. Query Execution** | The AI Agent then puts its chosen tool to work:
\- **RAG Tool 1** queries Vector Database 1.
\- **RAG Tool 2** queries Vector Database 2.
\- **Web Search Tool** dives into the internet using the search API. | -| **5\. Data Retrieval** | The results roll in:
\- Vector Database 1 and 2 return the most relevant documents for your query.
\- The Web Search Tool provides up-to-date or external information. | -| **6\. Response Generation** | Using a text generation model (like GPT), the AI Agent crafts a detailed and accurate response tailored to your query. | -| **7\. User Response** | The polished response is sent back to you through the interface, ready to use. | +> We offer additional support tailored to your business needs. [Contact us](https://qdrant.to/contact-us) to learn more about implementation strategies and integrations that suit your company. -## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#the-stack) The Stack +<|page-373-lllmstxt|> +Artificial intelligence is evolving customer support, offering unprecedented capabilities for automating interactions, understanding user needs, and enhancing the overall customer experience. [IrisAgent](https://irisagent.com/), founded by former Google product manager [Palak Dalal Bhatia](https://www.linkedin.com/in/palakdalal/), demonstrates the concrete impact of AI on customer support with its AI-powered customer support automation platform. -The architecture taps into cutting-edge tools to power efficient Agentic RAG workflows. Here’s a quick overview of its components and the technologies you’ll need: +Bhatia describes IrisAgent as “the system of intelligence which sits on top of existing systems of records like support tickets, engineering bugs, sales data, or product data,” with the main objective of leveraging AI and generative AI, to automatically detect the intent and tags behind customer support tickets, reply to a large number of support tickets chats improve the time to resolution and increase the deflection rate of support teams. Ultimately, IrisAgent enables support teams to more with less and be more effective in helping customers. -- **AI Agent:** The mastermind of the system, this agent parses your queries, picks the right tools, and integrates the responses. We’ll use OpenAI’s _gpt-4o_ as the reasoning engine, managed seamlessly by LangGraph. -- **Embedding:** Queries are transformed into vector embeddings using OpenAI’s _text-embedding-3-small_ model. -- **Vector Database:** Embeddings are stored and used for similarity searches, with Qdrant stepping in as our database of choice. -- **LLM:** Responses are generated using OpenAI’s _gpt-4o_, ensuring answers are accurate and contextually grounded. -- **Search Tools:** To extend RAG’s capabilities, we’ve added a web search component powered by BraveSearchAPI, perfect for real-time and external data retrieval. -- **Workflow Management:** The entire orchestration and decision-making flow is built with LangGraph, providing the flexibility and intelligence needed to handle complex workflows. +## The Challenge -Ready to start building this system from the ground up? Let’s get to it! +Throughout her career Bhatia noticed a lot of manual and inefficient processes in support teams paired with information silos between important functions like customer support, product management, engineering teams, and sales teams. These silos typically prevent support teams from accurately solving customers’ pain points, as they are only able to access a fraction of the internal knowledge and don’t get the relevant information and insights that other teams have. 
-## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#implementation) Implementation +IrisAgent is addressing these challenges with AI and GenAI by generating meaningful customer experience insights about what the root cause of specific customer escalations or churn. “The platform allows support teams to gather these cross-functional insights and connect them to a single view of customer problems,” Bhatia says. Additionally, IrisAgent facilitates the automation of mundane and repetitive support processes. In the past, these tasks were difficult to automate effectively due to the limitations of early AI technologies. Support functions often depended on rudimentary solutions like legacy decision trees, which suffered from a lack of scalability and robustness, primarily relying on simplistic keyword matching. However, advancements in AI and GenAI technologies have now enabled more sophisticated and efficient automation of these support processes. -Before we dive into building our agent, let’s get everything set up. +## The Solution -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#imports) Imports +“IrisAgent provides a very holistic product profile, as we are the operating system for support teams,” Bhatia says. The platform includes features like omni-channel customer support automation, which integrates with other parts of the business, such as engineering or sales platforms, to really understand customer escalation points. Long before the advent of technologies such as ChatGPT, IrisAgeny had already been refining and advancing their AI and ML stack. This has enabled them to develop a comprehensive range of machine learning models, including both proprietary solutions and those built on cloud technologies. Through this advancement, IrisAgent was able to finetune on public and private customer data to achieve the level of accuracy that is needed to successfully deflect and resolve customer issues at scale. -Here’s a list of key imports required: +![Iris GPT info](/blog/iris-agent-qdrant/iris_gpt.png) -```python -import os -import json -from typing import Annotated, TypedDict -from dotenv import load_dotenv -from langchain.embeddings import OpenAIEmbeddings -from langgraph import StateGraph, tool, ToolNode, ToolMessage -from langchain.document_loaders import HuggingFaceDatasetLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.llms import ChatOpenAI -from qdrant_client import QdrantClient -from qdrant_client.http.models import VectorParams -from brave_search import BraveSearch +Since IrisAgent built out a lot of their AI related processes in-house with proprietary technology, they wanted to find ways to augment these capabilities with RAG technologies and vector databases. This strategic move was aimed at abstracting much of the technical complexity, thereby simplifying the process for engineers and data scientists on the team to interact with data and develop a variety of solutions built on top of it. -``` +![Quote from CEO of IrisAgent](/blog/iris-agent-qdrant/iris_ceo_quote.png) -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#qdrant-vector-database-setup) Qdrant Vector Database Setup +“We were looking at a lot of vector databases in the market and one of our core requirements was that the solution needed to be open source because we have a strong emphasis on data privacy and security,” Bhatia says. 
Also, performance played a key role for IrisAgent during their evaluation as Bhatia mentions: “Despite it being a relatively new project at the time we tested Qdrant, the performance was really good.” Additional evaluation criteria were the ease of ability to deployment, future maintainability, and the quality of available documentation. Ultimately, IrisAgent decided to build with Qdrant as their vector database of choice, given these reasons: -We’ll use **Qdrant Cloud** as our vector store for document embeddings. Here’s how to set it up: +* **Open Source and Flexibility**: IrisAgent required a solution that was open source, to align with their data security needs and preference for self-hosting. Qdrant's open-source nature allowed IrisAgent to deploy it on their cloud infrastructure seamlessly. +* **Performance**: Early on, IrisAgent recognized Qdrant's superior performance, despite its relative newness in the market. This performance aspect was crucial for handling large volumes of data efficiently. +* **Ease of Use**: Qdrant's user-friendly SDKs and compatibility with major programming languages like Go and Python made it an ideal choice for IrisAgent's engineering team. Additionally, IrisAgent values Qdrant’s the solid documentation, which is easy to follow. +* **Maintainability**: IrisAgent prioritized future maintainability in their choice of Qdrant, notably valuing the robustness and efficiency Rust provides, ensuring a scalable and future-ready solution. -| **Step** | **Description** | -| --- | --- | -| **1\. Create an Account** | If you don’t already have one, head to Qdrant Cloud and sign up. | -| **2\. Set Up a Cluster** | Log in to your account and find the **Create New Cluster** button on the dashboard. Follow the prompts to configure:
\- Select your **preferred region**.
\- Choose the **free tier** for testing. | -| **3\. Secure Your Details** | Once your cluster is ready, note these details:
\- **Cluster URL** (e.g., [https://xxx-xxx-xxx.aws.cloud.qdrant.io](https://xxx-xxx-xxx.aws.cloud.qdrant.io/))
\- **API Key** | +## Optimizing IrisAgent's AI Pipeline: The Evaluation and Integration of Qdrant -Save these securely for future use! +IrisAgent utilizes comprehensive testing and sandbox environments, ensuring no customer data is used during the testing of new features. Initially, they deployed Qdrant in these environments to evaluate its performance, leveraging their own test data and employing Qdrant’s console and SDK features to conduct thorough data exploration and apply various filters. The primary languages used in these processes are Go, for its efficiency, and Python, for its strength in data science tasks. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#openai-api-configuration) OpenAI API Configuration +After the successful testing, Qdrant's outputs are now integrated into IrisAgent’s AI pipeline, enhancing a suite of proprietary AI models designed for tasks such as detecting hallucinations and similarities, and classifying customer intents. With Qdrant, IrisAgent saw significant performance and quality gains for their RAG use cases. Beyond this, IrisAgent also performs fine-tuning further in the development process. -Your OpenAI API key will power both embedding generation and language model interactions. Visit [OpenAI’s platform](https://platform.openai.com/) and sign up for an account. In the API section of your dashboard, create a new API key. We’ll use the text-embedding-3-small model for embeddings and GPT-4 as the language model. +Qdrant’s emphasis on open-source technology and support for main programming languages (Go and Python) ensures ease of use and compatibility with IrisAgent’s production environment. IrisAgent is deploying Qdrant on Google Cloud in order to fully leverage Google Cloud's robust infrastructure and innovative offerings. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#brave-search) Brave Search +![Iris agent flow chart](/blog/iris-agent-qdrant/iris_agent_flow_chart.png) -To enhance search capabilities, we’ll integrate Brave Search. Visit the [Brave API](https://api.search.brave.com/) and complete their API access request process to obtain an API key. This key will enable web search functionality for our agent. +## Future of IrisAgent -For added security, store all API keys in a .env file. +Looking ahead, IrisAgent is committed to pushing the boundaries of AI in customer support, with ambitious plans to evolve their product further. The cornerstone of this vision is a feature that will allow support teams to leverage historical support data more effectively, by automating the generation of knowledge base content to redefine how FAQs and product documentation are created. This strategic initiative aims not just to reduce manual effort but also to enrich the self-service capabilities of users. As IrisAgent continues to refine its AI algorithms and expand its training datasets, the goal is to significantly elevate the support experience, making it more seamless and intuitive for end-users. -```json -OPENAI_API_KEY = -QDRANT_KEY = -QDRANT_URL = -BRAVE_API_KEY = +<|page-374-lllmstxt|> +## Dailymotion's Journey to Crafting the Ultimate Content-Driven Video Recommendation Engine with Qdrant Vector Database +In today's digital age, the consumption of video content has become ubiquitous, with an overwhelming abundance of options available at our fingertips. 
However, amidst this vast sea of videos, the challenge lies not in finding content, but in discovering the content that truly resonates with individual preferences and interests and yet is diverse enough to not throw users into their own filter bubble. As viewers, we seek meaningful and relevant videos that enrich our experiences, provoke thought, and spark inspiration. -``` +Dailymotion is not just another video application; it's a beacon of curated content in an ocean of options. With a steadfast commitment to providing users with meaningful and ethical viewing experiences, Dailymotion stands as the bastion of videos that truly matter. -* * * +They aim to boost a dynamic visual dialogue, breaking echo chambers and fostering discovery. -Then load the environment variables: +### Scale -```python -load_dotenv() -qdrant_key = os.getenv("QDRANT_KEY") -qdrant_url = os.getenv("QDRANT_URL") -brave_key = os.getenv("BRAVE_API_KEY") +- **420 million+ videos** +- **2k+ new videos / hour** +- **13 million+ recommendations / day** +- **300+ languages in videos** +- **Required response time < 100 ms** -``` -* * * +### Challenge -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#document-processing) Document Processing -Before we can create our agent, we need to process and store the documentation. We’ll be working with two datasets from Hugging Face: their general documentation and Transformers-specific documentation. +- **Improve video recommendations** across all 3 applications of Dailymotion (mobile app, website and embedded video player on all major French and International sites) as it is the main driver of audience engagement and revenue stream of the platform. +- Traditional [collaborative recommendation model](https://en.wikipedia.org/wiki/Collaborative_filtering) tends to recommend only popular videos, fresh and niche videos suffer due to zero or minimal interaction +- Video content based recommendation system required processing all the video embedding at scale and in real time, as soon as they are added to the platform +- Exact neighbor search at the scale and keeping them up to date with new video updates in real time at Dailymotion was unreasonable and unrealistic +- Precomputed [KNN](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) would be expensive and may not work due to video updates every hour +- Platform needs fast recommendations ~ < 100ms +- Needed fast ANN search on a vector search engine which could support the scale and performance requirements of the platform -Here’s our document preprocessing function: +### Background / Journey -```python -def preprocess_dataset(docs_list): - text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=700, - chunk_overlap=50, - disallowed_special=() - ) - doc_splits = text_splitter.split_documents(docs_list) - return doc_splits +The quest of Dailymotion to deliver an intelligent video recommendation engine providing a curated selection of videos to its users started with a need to present more relevant videos to the first-time users of the platform (cold start problem) and implement an ideal home feed experience to allow users to watch videos that are expected to be relevant, diverse, explainable, and easily tunable. \ +This goal accounted for their efforts focused on[ Optimizing Video Recommender for Dailymotion's Home Feed ](https://medium.com/dailymotion/optimizing-video-feed-recommendations-with-diversity-machine-learning-first-steps-4cf9abdbbffd)back in the time. 
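The requirements listed in the Challenge section above boil down to embedding each video's textual metadata as soon as it is uploaded or updated, and serving approximate nearest-neighbour queries in well under 100 ms. As a rough illustration (not Dailymotion's production pipeline), the streaming update and lookup could look like the sketch below; the 512-dimensional vector size assumes a MUSE-style sentence encoder, and the collection name, IDs and payload fields are hypothetical.

```python
# Hypothetical sketch of a streaming update: embed a video's textual metadata
# and make it searchable immediately. Collection name, IDs and payload fields
# are illustrative; the 512-dim size assumes a MUSE-style sentence encoder.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

if not client.collection_exists("videos"):
    client.create_collection(
        collection_name="videos",
        vectors_config=models.VectorParams(size=512, distance=models.Distance.COSINE),
    )

def on_video_upserted(video_id: int, embedding: list[float], metadata: dict) -> None:
    # Called for every upload/update event; the point is overwritten in place,
    # so an edited title or a new transcript is reflected without reindexing.
    client.upsert(
        collection_name="videos",
        points=[models.PointStruct(id=video_id, vector=embedding, payload=metadata)],
    )

def similar_videos(embedding: list[float], limit: int = 10):
    # Approximate nearest-neighbour search over the whole catalogue.
    return client.search(collection_name="videos", query_vector=embedding, limit=limit)
```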
-``` +They continued their work in [Optimising the recommender engine with vector databases and opinion mining](https://medium.com/dailymotion/reinvent-your-recommender-system-using-vector-database-and-opinion-mining-a4fadf97d020) later with emphasis on ranking videos based on features like freshness, real views ratio, watch ratio, and aspect ratio to enhance user engagement and optimise watch time per user on the home feed. Furthermore, the team continued to focus on diversifying user interests by grouping videos based on interest and using stratified sampling to ensure a balanced experience for users. -* * * -This function processes our documents by splitting them into manageable chunks, ensuring important context is preserved at the chunk boundaries through overlap. We’ll use the HuggingFaceDatasetLoader to load the datasets into Hugging Face documents. +By now it was clear to the Dailymotion team that the future initiatives will involve overcoming obstacles related to data processing, sentiment analysis, and user experience to provide meaningful and diverse recommendations. The main challenge stayed at the candidate generation process, textual embeddings, opinion mining, along with optimising the efficiency and accuracy of these processes and tackling the complexities of large-scale content curation. -```python -hugging_face_doc = HuggingFaceDatasetLoader("m-ric/huggingface_doc","text") -transformers_doc = HuggingFaceDatasetLoader("m-ric/transformers_documentation_en","text") +### Solution at glance -``` +![solution-at-glance](/case-studies/dailymotion/solution-at-glance.png) -* * * +The solution involved implementing a content based Recommendation System leveraging Qdrant to power the similar videos, with the following characteristics. -In this demo, we are selecting the first 50 documents from the dataset and passing them to the processing function. +**Fields used to represent each video** - +Title , Tags , Description , Transcript (generated by [OpenAI whisper](https://openai.com/research/whisper)) -```python -hf_splits = preprocess_dataset(hugging_face_doc.load()[:number_of_docs]) -transformer_splits = preprocess_dataset(transformers_doc.load()[:number_of_docs]) +**Encoding Model used** - [MUSE - Multilingual Universal Sentence Encoder](https://www.tensorflow.org/hub/tutorials/retrieval_with_tf_hub_universal_encoder_qa) -``` +* Supports - 16 languages -* * * +### Why Qdrant? -Our splits are ready. Let’s create a collection in Qdrant to store them. +![quote-from-Samuel](/case-studies/dailymotion/Dailymotion-Quote.jpg) -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#defining-the-state) Defining the State +Looking at the complexity, scale and adaptability of the desired solution, the team decided to leverage Qdrant’s vector database to implement a content-based video recommendation that undoubtedly offered several advantages over other methods: -In LangGraph, a **state** refers to the data or information stored and maintained at a specific point during the execution of a process or a series of operations. States capture the intermediate or final results that the system needs to keep track of to manage and control the flow of tasks, -LangGraph works with a state-based system. We define our state like this: +**1. 
Efficiency in High-Dimensional Data Handling:**

  Video content is inherently high-dimensional, comprising various features such as audio, visual, textual, and contextual elements.
  Qdrant handles high-dimensional data efficiently, with out-of-the-box support for vectors of up to 65,536 dimensions, making it well suited for representing and processing complex video features with any choice of embedding model.

**2. Scalability:**

  As the volume of video content and user interactions grows, scalability becomes paramount. Qdrant is designed to scale both vertically and horizontally, allowing for seamless expansion to accommodate large volumes of data and user interactions without compromising performance.

**3. Fast and Accurate Similarity Search:**

  Efficient video recommendation systems rely on identifying similarities between videos to make relevant recommendations. Qdrant leverages advanced HNSW indexing and similarity search algorithms to retrieve similar videos based on their feature representations nearly instantly (around 20 ms for this use case).

**4. Flexibility in vector representation with metadata through payloads:**

  Qdrant stores vectors together with metadata in the form of payloads and supports advanced metadata filtering during similarity search, which makes it possible to incorporate custom logic.

**5. Reduced Dimensionality and Storage Requirements:**

  Qdrant offers various quantization and memory-mapping techniques to store and retrieve vectors efficiently, leading to reduced storage requirements and computational overhead compared to alternative methods such as content-based filtering or collaborative filtering.

**6. Impressive Benchmarks:**

  [Qdrant's benchmarks](/benchmarks/) were one of the key motivations for the Dailymotion team to try the solution, and the team reports that the observed performance has been even better than the published benchmarks.

**7. 
Ease of usage:** -```python -@tool("web_search_tool") -def search_tool(query): - search = BraveSearch.from_api_key(api_key=brave_key, search_kwargs={"count": 3}) - return search.run(query) + Qdrant API’s have been immensely easy to get started with as compared to Google Vertex Matching Engine (which was Dailymotion’s initial choice) and the support from the team has been of a huge value to us. -``` -* * * +**8. Being able to fetch data by id** -The search\_tool function leverages the BraveSearch API to perform a search. It takes a query, retrieves the top 3 search results using the API key, and returns the results. + Qdrant allows to retrieve vector point / videos by ids while the Vertex Matching Engine requires a vector input to be able to search for other vectors which was another really important feature for Dailymotion + + -Next, we’ll set up and integrate our tools with a language model: -```python -tools = [hf_retriever_tool, transformer_retriever_tool, search_tool] +### Data Processing pipeline -tool_node = ToolNode(tools=tools) +![data-processing](/case-studies/dailymotion/data-processing-pipeline.png) -llm = ChatOpenAI(model="gpt-4o", temperature=0) +Figure shows the streaming architecture of the data processing pipeline that processes everytime a new video is uploaded or updated (Title, Description, Tags, Transcript), an updated embedding is computed and fed directly into Qdrant. -llm_with_tools = llm.bind_tools(tools) -``` +### Results -* * * -Here, the ToolNode class handles and orchestrates our tools: +![before-qdrant-results](/case-studies/dailymotion/before-qdrant.png) -```python -class ToolNode: - def __init__(self, tools: list) -> None: - self.tools_by_name = {tool.name: tool for tool in tools} - def __call__(self, inputs: dict): - if messages := inputs.get("messages", []): - message = messages[-1] - else: - raise ValueError("No message found in input") +There has been a big improvement in the recommended content processing time and quality as the existing system had issues like: - outputs = [] - for tool_call in message.tool_calls: - tool_result = self.tools_by_name[tool_call["name"]].invoke( - tool_call["args"] - ) - outputs.append( - ToolMessage( - content=json.dumps(tool_result), - name=tool_call["name"], - tool_call_id=tool_call["id"], - ) - ) +1. Subpar video recommendations due to long processing time ~ 5 hours +2. Collaborative recommender tended to recommend and focused on high signal / popular videos +3. Metadata based recommender focussed only on a very small scope of trusted video sources +4. The recommendations did not take contents of the video into consideration - return {"messages": outputs} -``` -* * * +![after-qdrant-results](/case-studies/dailymotion/after-qdrant.png) -The ToolNode class handles tool execution by initializing a list of tools and mapping tool names to their corresponding functions. It processes input dictionaries, extracts the last message, and checks for tool\_calls from LLM tool-calling capability providers such as Anthropic, OpenAI, and others. -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#routing-and-decision-making) Routing and Decision Making +The new recommender system implementation leveraging Qdrant along with the collaborative recommender offered various advantages : -Our agent needs to determine when to use tools and when to end the cycle. 
This decision is managed by the routing function: -```python -def route(state: State): - if isinstance(state, list): - ai_message = state[-1] - elif messages := state.get("messages", []): - ai_message = messages[-1] - else: - raise ValueError(f"No messages found in input state to tool_edge: {state}") +1. The processing time for the new video content reduced significantly to a few minutes which enabled the fresh videos to be part of recommendations. +2. The performant & scalable scope of video recommendation currently processes 22 Million videos and can provide recommendation for videos with fewer interactions too. +3. The overall huge performance gain on the low signal videos has contributed to more than 3 times increase on the interaction and CTR ( number of clicks) on the recommended videos. +4. Seamlessly solved the initial cold start and low performance problems with the fresh content. - if hasattr(ai_message, "tool_calls") and len(ai_message.tool_calls) > 0: - return "tools" +### Outlook / Future plans - return END +The team is very excited with the results they achieved on their recommender system and wishes to continue building with it. \ +They aim to work on Perspective feed next and say -``` +>”We've recently integrated this new recommendation system into our mobile app through a feature called Perspective. The aim of this feature is to disrupt the vertical feed algorithm, allowing users to discover new videos. When browsing their feed, users may encounter a video discussing a particular movie. With Perspective, they have the option to explore different viewpoints on the same topic. Qdrant plays a crucial role in this feature by generating candidate videos related to the subject, ensuring users are exposed to diverse perspectives and preventing them from being confined to an echo chamber where they only encounter similar viewpoints.” \ +> Gladys Roch - Machine Learning Engineer -* * * -## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#putting-it-all-together-the-graph) Putting It All Together: The Graph -Finally, we’ll construct the graph that ties everything together: +![perspective-feed-with-qdrant](/case-studies/dailymotion/perspective-feed-qdrant.jpg) -```python -graph_builder = StateGraph(State) -graph_builder.add_node("agent", agent) -graph_builder.add_node("tools", tool_node) +The team is also interested in leveraging advanced features like [Qdrant’s Discovery API](/documentation/concepts/explore/#recommendation-api) to promote exploration of content to enable finding not only similar but dissimilar content too by using positive and negative vectors in the queries and making it work with the existing collaborative recommendation model. 
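The positive/negative style of query mentioned above is exposed directly by the Python client. As a rough, hypothetical sketch (not Dailymotion's code, and with placeholder collection name and point IDs), a recommendation query that pulls results towards some videos and away from others could look like this:

```python
# Hypothetical sketch: recommend videos close to ones a user engaged with,
# steered away from a video on a topic they want to avoid.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

hits = client.recommend(
    collection_name="videos",
    positive=[1001, 1002],  # IDs of videos to move towards
    negative=[2001],        # ID of a video to move away from
    limit=10,
)
for hit in hits:
    print(hit.id, hit.score)
```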
-graph_builder.add_conditional_edges( - "agent", - route, - {"tools": "tools", END: END}, -) +### References -graph_builder.add_edge("tools", "agent") -graph_builder.add_edge(START, "agent") +**2024 -** [https://www.youtube.com/watch?v=1ULpLpWD0Aw](https://www.youtube.com/watch?v=1ULpLpWD0Aw) -``` +**2023 -** [https://medium.com/dailymotion/reinvent-your-recommender-system-using-vector-database-and-opinion-mining-a4fadf97d020](https://medium.com/dailymotion/reinvent-your-recommender-system-using-vector-database-and-opinion-mining-a4fadf97d020) -* * * +**2022 -** [https://medium.com/dailymotion/optimizing-video-feed-recommendations-with-diversity-machine-learning-first-steps-4cf9abdbbffd](https://medium.com/dailymotion/optimizing-video-feed-recommendations-with-diversity-machine-learning-first-steps-4cf9abdbbffd) -This is what the graph looks like: +<|page-375-lllmstxt|> +# Qdrant vs Pinecone: An Analysis of Vector Databases for AI Applications -![image2](https://qdrant.tech/documentation/examples/agentic-rag-langgraph/image2.jpg) +Data forms the foundation upon which AI applications are built. Data can exist in both structured and unstructured formats. Structured data typically has well-defined schemas or inherent relationships. However, unstructured data, such as text, image, audio, or video, must first be converted into numerical representations known as [vector embeddings](https://qdrant.tech/articles/what-are-embeddings/). These embeddings encapsulate the semantic meaning or features of unstructured data and are in the form of high-dimensional vectors. -Fig. 3: Agentic RAG with LangGraph +Traditional databases, while effective at handling structured data, fall short when dealing with high-dimensional unstructured data, which are increasingly the focal point of modern AI applications. Key reasons include: -### [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#running-the-agent) Running the Agent +- **Indexing Limitations**: Database indexing methods like B-Trees or hash indexes, typically used in relational databases, are inefficient for high-dimensional data and show poor query performance. +- **Curse of Dimensionality**: As dimensions increase, data points become sparse, and distance metrics like Euclidean distance lose their effectiveness, leading to poor search query performance. +- **Lack of Specialized Algorithms**: Traditional databases do not incorporate advanced algorithms designed to handle high-dimensional data, resulting in slow query processing times. +- **Scalability Challenges**: Managing and querying high-dimensional [vectors](https://qdrant.tech/documentation/concepts/vectors/) require optimized data structures, which traditional databases are not built to handle. +- **Storage Inefficiency**: Traditional databases are not optimized for efficiently storing large volumes of high-dimensional data, facing significant challenges in managing space complexity and [retrieval efficiency](https://qdrant.tech/documentation/tutorials/retrieval-quality/). -With everything set up, we can run our agent using a simple function: +Vector databases address these challenges by efficiently storing and querying high-dimensional vectors. They offer features such as high-dimensional vector storage and retrieval, efficient similarity search, sophisticated indexing algorithms, advanced compression techniques, and integration with various machine learning frameworks. 
-```python -def run_agent(user_input: str): - for event in graph.stream({"messages": [("user", user_input)]}): - for value in event.values(): - print("Assistant:", value["messages"][-1].content) +Due to their capabilities, vector databases are now a cornerstone of modern AI and are becoming pivotal in building applications that leverage similarity search, recommendation systems, natural language processing, computer vision, image recognition, speech recognition, and more. -``` +Over the past few years, several vector database solutions have emerged – the two leading ones being Qdrant and Pinecone, among others. Both are powerful vector database solutions with unique strengths. However, they differ greatly in their principles and approach, and the capabilities they offer to developers. In this article, we’ll examine both solutions and discuss the factors you need to consider when choosing amongst the two. Let’s dive in! -* * * +## Exploring Qdrant Vector Database: Features and Capabilities -Now, you’re ready to ask questions about Hugging Face and Transformers! Our agent will intelligently combine information from the documentation with web search results when needed. +Qdrant is a high-performance, open-source vector similarity search engine built with [Rust](https://qdrant.tech/articles/why-rust/), designed to handle the demands of large-scale AI applications with exceptional speed and reliability. Founded in 2021, Qdrant's mission is to "build the most efficient, scalable, and high-performance vector database in the market." This mission is reflected in its architecture and feature set. -For example, you can ask: +Qdrant is highly scalable and performant: it can handle billions of vectors efficiently and with [minimal latency](https://qdrant.tech/benchmarks/). Its advanced vector indexing, search, and retrieval capabilities make it ideal for applications that require fast and accurate search results. It supports vertical and horizontal scaling, advanced compression techniques, highly flexible deployment options – including cloud-native, [hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/), and private cloud solutions – and powerful security features. -```txt -In the Transformers library, are there any multilingual models? +### Key Features of Qdrant Vector Database -``` +- **Advanced Similarity Search:** Qdrant supports various similarity [search](https://qdrant.tech/documentation/concepts/search/) metrics like dot product, cosine similarity, Euclidean distance, and Manhattan distance. You can store additional information along with vectors, known as [payload](https://qdrant.tech/documentation/concepts/payload/) in Qdrant terminology. A payload is any JSON formatted data. +- **Built Using Rust:** Qdrant is built with Rust, and leverages its performance and efficiency. Rust is famed for its [memory safety](https://arxiv.org/abs/2206.05503) without the overhead of a garbage collector, and rivals C and C++ in speed. +- **Scaling and Multitenancy**: Qdrant supports both vertical and horizontal scaling and uses the Raft consensus protocol for [distributed deployments](https://qdrant.tech/documentation/guides/distributed_deployment/). Developers can run Qdrant clusters with replicas and shards, and seamlessly scale to handle large datasets. Qdrant also supports [multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/) where developers can create single collections and partition them using payload. 
+- **Payload Indexing and Filtering:** Just as Qdrant allows attaching any JSON payload to vectors, it also supports payload indexing and [filtering](https://qdrant.tech/documentation/concepts/filtering/) with a wide range of data types and query conditions, including keyword matching, full-text filtering, numerical ranges, nested object filters, and [geo](https://qdrant.tech/documentation/concepts/filtering/#geo)filtering. +- **Hybrid Search with Sparse Vectors:** Qdrant supports both dense and [sparse vectors](https://qdrant.tech/articles/sparse-vectors/), thereby enabling hybrid search capabilities. Sparse vectors are numerical representations of data where most of the elements are zero. Developers can combine search results from dense and sparse vectors, where sparse vectors ensure that results containing the specific keywords are returned and dense vectors identify semantically similar results. +- **Built-In Vector Quantization:** Qdrant offers three different [quantization](https://qdrant.tech/documentation/guides/quantization/) options to developers to optimize resource usage. Scalar quantization balances accuracy, speed, and compression by converting 32-bit floats to 8-bit integers. Binary quantization, the fastest method, significantly reduces memory usage. Product quantization offers the highest compression, and is perfect for memory-constrained scenarios. +- **Flexible Deployment Options:** Qdrant offers a range of deployment options. Developers can easily set up Qdrant (or Qdrant cluster) [locally](https://qdrant.tech/documentation/quick-start/#download-and-run) using Docker for free. [Qdrant Cloud](https://qdrant.tech/cloud/), on the other hand, is a scalable, managed solution that provides easy access with flexible pricing. Additionally, Qdrant offers [Hybrid Cloud](https://qdrant.tech/hybrid-cloud/) which integrates Kubernetes clusters from cloud, on-premises, or edge, into an enterprise-grade managed service. +- **Security through API Keys, JWT and RBAC:** Qdrant offers developers various ways to [secure](https://qdrant.tech/documentation/guides/security/) their instances. For simple authentication, developers can use API keys (including Read Only API keys). For more granular access control, it offers JSON Web Tokens (JWT) and the ability to build Role-Based Access Control (RBAC). TLS can be enabled to secure connections. Qdrant is also [SOC 2 Type II](https://qdrant.tech/blog/qdrant-soc2-type2-audit/) certified. -The agent will dive into the Transformers documentation, extract relevant details about multilingual models, and deliver a clear, comprehensive answer. +Additionally, Qdrant integrates seamlessly with popular machine learning frameworks such as [LangChain](https://qdrant.tech/blog/using-qdrant-and-langchain/), LlamaIndex, and Haystack; and Qdrant Hybrid Cloud integrates seamlessly with AWS, DigitalOcean, Google Cloud, Linode, Oracle Cloud, OpenShift, and Azure, among others. -Here’s what the response might look like: +By focusing on performance, scalability and efficiency, Qdrant has positioned itself as a leading solution for enterprise-grade vector similarity search, capable of meeting the growing demands of modern AI applications. -```txt -Yes, the Transformers library includes several multilingual models. Here are some examples: +However, how does it compare with Pinecone? Let’s take a look. -BERT Multilingual: -Models like `bert-base-multilingual-uncased` can be used just like monolingual models. 
+## Exploring Pinecone Vector Database: Key Features and Capabilities -XLM (Cross-lingual Language Model): -Models like `xlm-mlm-ende-1024` (English-German), `xlm-mlm-enfr-1024` (English-French), and others use language embeddings to specify the language used at inference. +An alternative to Qdrant, Pinecone provides a fully managed vector database that abstracts the complexities of infrastructure and scaling. The company’s founding principle, when it started in 2019, was to make Pinecone “accessible to engineering teams of all sizes and levels of AI expertise.” -M2M100: -Models like `facebook/m2m100_418M` and `facebook/m2m100_1.2B` are used for multilingual translation. +Similarly to Qdrant, Pinecone offers advanced vector search and retrieval capabilities. There are two different ways you can use Pinecone: using its serverless architecture or its pod architecture. Pinecone also supports advanced similarity search metrics such as dot product, Euclidean distance, and cosine similarity. Using its pod architecture, you can leverage horizontal or vertical scaling. Finally, Pinecone offers privacy and security features such as Role-Based Access Control (RBAC) and end-to-end encryption, including encryption in transit and at rest. -MBart: -Models like `facebook/mbart-large-50-one-to-many-mmt` and `facebook/mbart-large-50-many-to-many-mmt` are used for multilingual machine translation across 50 languages. +### Key Features of Pinecone Vector Database -These models are designed to handle multiple languages and can be used for tasks like translation, classification, and more. +- **Fully Managed Service:** Pinecone offers a fully managed SaaS-only service. It handles the complexities of infrastructure management such as scaling, performance optimization, and maintenance. Pinecone is designed for developers who want to focus on building AI applications without worrying about the underlying database infrastructure. +- **Serverless and Pod Architecture:** Pinecone offers two different architecture options to run their vector database - the serverless architecture and the pod architecture. Serverless architecture runs as a managed service on the AWS cloud platform, and allows automatic scaling based on workload. Pod architecture, on the other hand, provides pre-configured hardware units (pods) for hosting and executing services, and supports horizontal and vertical scaling. Pods can be run on AWS, GCP, or Azure. +- **Advanced Similarity Search:** Pinecone supports three different similarity search metrics – dot product, Euclidean distance, and cosine similarity. It currently does not support Manhattan distance metric. +- **Privacy and Security Features:** Pinecone offers Role-Based Access Control (RBAC), end-to-end encryption, and compliance with SOC 2 Type II and GDPR. Pinecone allows for the creation of “organization”, which, in turn, has “projects” and “members” with single sign-on (SSO) and access control. +- **Hybrid Search and Sparse Vectors**: Pinecone supports both sparse and dense vectors, and allows hybrid search. This gives developers the ability to combine semantic and keyword search in a single query. +- **Metadata Filtering**: Pinecone allows attaching key-value metadata to vectors in an index, which can later be queried. Semantic search using metadata filters retrieve exactly the results that match the filters. 
-``` +Pinecone’s fully managed service makes it a compelling choice for developers who’re looking for a vector database that comes without the headache of infrastructure management. -* * * +## Pinecone vs Qdrant: Key Differences and Use Cases -## [Anchor](https://qdrant.tech/documentation/agentic-rag-langgraph/\#conclusion) Conclusion +Qdrant and Pinecone are both robust vector database solutions, but they differ significantly in their design philosophy, deployment options, and technical capabilities. -We’ve successfully implemented Agentic RAG. But this is just the beginning—there’s plenty more you can explore to take your system to the next level. +Qdrant is an open-source vector database that gives control to the developer. It can be run locally, on-prem, in the cloud, or as a managed service, and it even offers a hybrid cloud option for enterprises. This makes Qdrant suitable for a wide range of environments, from development to enterprise settings. It supports multiple programming languages and offers advanced features like customizable distance metrics, payload filtering, and [integration with popular AI frameworks](https://qdrant.tech/documentation/frameworks/). -Agentic RAG is transforming how businesses connect data sources with AI, enabling smarter and more dynamic interactions. In this tutorial, you’ve learned how to build an Agentic RAG system that combines the power of LangGraph, Qdrant, and web search into one seamless workflow. +Pinecone, on the other hand, is a fully managed, SaaS-only solution designed to abstract the complexities of infrastructure management. It provides a serverless architecture for automatic scaling and a pod architecture for resource customization. Pinecone focuses on ease of use and high performance, offering built-in security measures, compliance certifications, and a user-friendly API. However, it has some limitations in terms of metadata handling and flexibility compared to Qdrant. -This system doesn’t just stop at retrieving relevant information from Hugging Face and Transformers documentation. It also smartly falls back to web search when needed, ensuring no query goes unanswered. With Qdrant as the vector database backbone, you get fast, scalable semantic search that excels at retrieving precise information—even from massive datasets. 
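To make the deployment flexibility described above concrete, here is a minimal sketch of how the same Python client (`qdrant-client`) can point at an in-process instance, a local Docker node, or a managed Qdrant Cloud cluster; the cluster URL and API key below are placeholders, not real endpoints.

```python
from qdrant_client import QdrantClient

# In-process mode: handy for tests and small experiments, no server required.
local_client = QdrantClient(":memory:")

# Local Docker node or self-hosted cluster, typically exposed on port 6333.
docker_client = QdrantClient(url="http://localhost:6333")

# Managed Qdrant Cloud cluster; URL and key are placeholders.
cloud_client = QdrantClient(
    url="https://YOUR-CLUSTER-ID.cloud.qdrant.io:6333",
    api_key="YOUR-API-KEY",
)
```

The rest of the application code stays the same regardless of which client is used, which is what makes it practical to start locally and move to a cluster or Hybrid Cloud later.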
+| Aspect | Qdrant | Pinecone |
+| ------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------- |
+| Deployment Modes | Local, on-premises, cloud | SaaS-only |
+| Supported Languages | Python, JavaScript/TypeScript, Rust, Go, Java | Python, JavaScript/TypeScript, Java, Go |
+| Similarity Search Metrics | Dot Product, Cosine Similarity, Euclidean Distance, Manhattan Distance | Dot Product, Cosine Similarity, Euclidean Distance |
+| Hybrid Search | Highly customizable Hybrid search by combining Sparse and Dense Vectors, with support for separate indices within the same collection | Supports Hybrid search with a single sparse-dense index |
+| Vector Payload | Accepts any JSON object as payload, supports NULL values, geolocation, and multiple vectors per point | Flat metadata structure, does not support NULL values, geolocation, or multiple vectors per point |
+| Scalability | Vertical and horizontal scaling, distributed deployment with Raft consensus | Serverless architecture and pod architecture for horizontal and vertical scaling |
+| Performance | Efficient indexing, low latency, high throughput, customizable distance metrics | High throughput, low latency, gRPC client for higher upsert speeds |
+| Security | Flexible, environment-specific configurations, API key authentication in Qdrant Cloud, JWT and RBAC, SOC 2 Type II certification | Built-in RBAC, end-to-end encryption, SOC 2 Type II certification |

-To truly grasp the potential of this approach, why not apply these concepts to your own projects? Customize the template we’ve shared to fit your unique use case, and unlock the full potential of Agentic RAG for your business needs. The possibilities are endless.

+## Choosing the Right Vector Database: Factors to Consider

+When choosing between Qdrant and Pinecone, you need to consider some key factors that may impact your project long-term. Below are some primary considerations to help guide your decision:

+### 1. Deployment Flexibility

+**Qdrant** offers multiple deployment options, including a local Docker node or cluster, Qdrant Cloud, and Hybrid Cloud. This allows you to choose an environment that best suits your project. You can start with a local Docker node for development, then add nodes to your cluster, and later switch to a Hybrid Cloud solution.

+**Pinecone**, on the other hand, is a fully managed SaaS solution. To use Pinecone, you connect your development environment to its cloud service. It abstracts the complexities of infrastructure management, making it easier to deploy, but it is also less flexible in terms of deployment options compared to Qdrant.

+### 2. Scalability Requirements

+**Qdrant** supports both vertical and horizontal scaling and is suitable for deployments of all scales. You can run it as a single Docker node, a large cluster, or a Hybrid cloud, depending on the size of your dataset. Qdrant’s architecture allows for distributed deployment with replicas and shards, and scales extremely well to billions of vectors with minimal latency.

+**Pinecone** provides a serverless architecture and a pod architecture that automatically scales based on workload. Serverless architecture removes the need for any manual intervention, whereas pod architecture provides a bit more control. Since Pinecone is a managed SaaS-only solution, your application’s scalability is tied to both Pinecone's service and the underlying cloud provider in use.

+### 3. Performance and Throughput

-<|page-184-lllmstxt|>
-## qdrant-1.2.x
-- [Articles](https://qdrant.tech/articles/)
-- Introducing Qdrant 1.2.x

+**Qdrant** excels in providing different performance profiles tailored to specific use cases. It offers efficient vector and payload indexing, low-latency queries, optimizers, and high throughput, along with multiple options for quantization to further optimize performance.

-[Back to Qdrant Articles](https://qdrant.tech/articles/)

+**Pinecone** recommends increasing the number of replicas to boost the throughput of pod-based indexes. For serverless indexes, Pinecone automatically handles scaling and throughput. To decrease latency, Pinecone suggests using namespaces to partition records within a single index. However, since Pinecone is a managed SaaS-only solution, developer control over performance and throughput is limited.

-# Introducing Qdrant 1.2.x

+### 4. Security Considerations

-Kacper Łukawski

+**Qdrant** allows for tailored security configurations specific to your deployment environment. It supports API keys (including read-only API keys), JWT authentication, and TLS encryption for connections. Developers can build Role-Based Access Control (RBAC) according to their application needs in a completely custom manner. Additionally, Qdrant's deployment flexibility allows organizations that need to adhere to stringent data laws to deploy it within their infrastructure, ensuring compliance with data sovereignty regulations.

-·

+**Pinecone** provides comprehensive built-in security features in its managed SaaS solution, including Role-Based Access Control (RBAC) and end-to-end encryption. Its compliance with SOC 2 Type II and GDPR-readiness makes it a good choice for applications requiring standardized security measures.

-May 24, 2023

+### 5. Pricing

-![Introducing Qdrant 1.2.x](https://qdrant.tech/articles_data/qdrant-1.2.x/preview/title.jpg)

+**Qdrant** can be self-hosted locally (single node or a cluster) with a single Docker command. With its SaaS option, it offers a free tier in Qdrant Cloud sufficient for around 1M 768-dimensional vectors, without any limitation on the number of collections it is used for. This allows developers to build multiple demos without limitations. For more pricing information, check [here](https://qdrant.tech/pricing/).

-A brand-new Qdrant 1.2 release comes packed with a plethora of new features, some of which -were highly requested by our users.
If you want to shape the development of the Qdrant vector -database, please [join our Discord community](https://qdrant.to/discord) and let us know -how you use it! +**Pinecone** cannot be self-hosted, and signing up for the SaaS solution is the only option. Pinecone has a free tier that supports approximately 300K 1536-dimensional embeddings. For Pinecone’s pricing details, check their pricing page. -## [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#new-features) New features +### Qdrant vs Pinecone: Complete Summary -As usual, a minor version update of Qdrant brings some interesting new features. We love to see your -feedback, and we tried to include the features most requested by our community. +The choice between Qdrant and Pinecone hinges on your specific needs: -### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#product-quantization) Product Quantization +- **Qdrant** is ideal for organizations that require flexible deployment options, extensive scalability, and customization. It is also suitable for projects needing deep integration with existing security infrastructure and those looking for a cost-effective, self-hosted solution. +- **Pinecone** is suitable for teams seeking a fully managed solution with robust built-in security features and standardized compliance. It is suitable for cloud-native applications and dynamic environments where automatic scaling and low operational overhead are critical. -The primary focus of Qdrant was always performance. That’s why we built it in Rust, but we were -always concerned about making vector search affordable. From the very beginning, Qdrant offered -support for disk-stored collections, as storage space is way cheaper than memory. That’s also -why we have introduced the [Scalar Quantization](https://qdrant.tech/articles/scalar-quantization/) mechanism recently, -which makes it possible to reduce the memory requirements by up to four times. +By carefully considering these factors, you can select the vector database that best aligns with your technical requirements and strategic goals. -Today, we are bringing a new quantization mechanism to life. A separate article on [Product\\ -Quantization](https://qdrant.tech/documentation/quantization/#product-quantization) will describe that feature in more -detail. In a nutshell, you can **reduce the memory requirements by up to 64 times**! +## Choosing the Best Vector Database for Your AI Application -### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#optional-named-vectors) Optional named vectors +Selecting the best vector database for your AI project depends on several factors, including your deployment preferences, scalability needs, performance requirements, and security considerations. -Qdrant has been supporting multiple named vectors per point for quite a long time. Those may have -utterly different dimensionality and distance functions used to calculate similarity. Having multiple -embeddings per item is an essential real-world scenario. For example, you might be encoding textual -and visual data using different models. Or you might be experimenting with different models but -don’t want to make your payloads redundant by keeping them in separate collections. +- **Choose Qdrant if**: + - You require flexible deployment options (local, on-premises, managed SaaS solution, or a Hybrid Cloud). + - You need extensive customization and control over your vector database. 
+ - Your project needs to adhere to data security and data sovereignty laws specific to your geography.
+ - Your project would benefit from advanced search capabilities, including complex payload filtering and geolocation support.
+ - Cost efficiency and the ability to self-host are significant considerations.
+- **Choose Pinecone if**:
+ - You prefer a fully managed SaaS solution that abstracts the complexities of infrastructure management.
+ - You need a serverless architecture that automatically adjusts to varying workloads.
+ - Built-in security features and compliance certifications (SOC 2 Type II, GDPR) are sufficient for your application.
+ - You want to build your project with minimal operational overhead.

-![Optional vectors](https://qdrant.tech/articles_data/qdrant-1.2.x/optional-vectors.png)

+For maximum control, security, and cost-efficiency, choose Qdrant. It offers flexible deployment options, customizability, and advanced search features, and is ideal for building data sovereign AI applications. However, if you prioritize ease of use and automatic scaling with built-in security, Pinecone's fully managed SaaS solution with a serverless architecture is the way to go.

-However, up to the previous version, we requested that you provide all the vectors for each point. There -have been many requests to allow nullable vectors, as sometimes you cannot generate an embedding or -simply don’t want to for reasons we don’t need to know.

+## Next Steps

-### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#grouping-requests) Grouping requests

+Qdrant is one of the leading Pinecone alternatives in the market. For developers who seek control of their vector database, Qdrant offers the highest level of customization, flexible deployment options, and advanced security features.

-Embeddings are great for capturing the semantics of the documents, but we rarely encode larger pieces -of data into a single vector. Having a summary of a book may sound attractive, but in reality, we -divide it into paragraphs or some different parts to have higher granularity. That pays off when we -perform the semantic search, as we can return the relevant pieces only. That’s also how modern tools -like Langchain process the data. The typical way is to encode some smaller parts of the document and -keep the document id as a payload attribute.

+To get started with Qdrant, explore our [documentation](https://qdrant.tech/documentation/), hop on to our [Discord](https://qdrant.to/discord) channel, sign up for [Qdrant cloud](https://cloud.qdrant.io/signup) (or [Hybrid cloud](https://qdrant.tech/hybrid-cloud/)), or [get in touch](https://qdrant.tech/contact-us/) with us today.

-![Query without grouping request](https://qdrant.tech/articles_data/qdrant-1.2.x/without-grouping-request.png)

+References:

-There are cases where we want to find relevant parts, but only up to a specific number of results -per document (for example, only a single one). Up till now, we had to implement such a mechanism -on the client side and send several calls to the Qdrant engine. But that’s no longer the case. -Qdrant 1.2 provides a mechanism for [grouping requests](https://qdrant.tech/documentation/search/#grouping-api), which -can handle that server-side, within a single call to the database. This mechanism is similar to the -SQL `GROUP BY` clause.
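As an illustration of the grouping mechanism described above, a request that returns at most one matching chunk per document could look roughly like this with the Python client. The collection name, payload field, and query vector are made up for the example, and exact method signatures may vary between client versions:

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

# Hypothetical collection where each point is a chunk of a larger document
# and carries the parent document id in its payload.
groups = client.search_groups(
    collection_name="book_paragraphs",
    query_vector=[0.12, 0.48, 0.33, 0.94],  # illustrative 4-dimensional query
    group_by="document_id",  # payload field to group results by
    limit=3,                 # number of groups (documents) to return
    group_size=1,            # best-matching chunks returned per document
)
```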
+- [Pinecone Documentation](https://docs.pinecone.io/) +- [Qdrant Documentation](https://qdrant.tech/documentation/) -![Query with grouping request](https://qdrant.tech/articles_data/qdrant-1.2.x/with-grouping-request.png) + - If you aren't ready yet, [try out Qdrant locally](/documentation/quick-start/) or sign up for [Qdrant Cloud](https://cloud.qdrant.io/signup). -You are not limited to a single result per document, and you can select how many entries will be -returned. + - For more basic information on Qdrant read our [Overview](/documentation/overview/) section or learn more about Qdrant Cloud's [Free Tier](/documentation/cloud/). -### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#nested-filters) Nested filters + - If ready to migrate, please consult our [Comprehensive Guide](https://github.com/NirantK/qdrant_tools) for further details on migration steps. -Unlike some other vector databases, Qdrant accepts any arbitrary JSON payload, including -arrays, objects, and arrays of objects. You can also [filter the search results using nested\\ -keys](https://qdrant.tech/documentation/filtering/#nested-key), even though arrays (using the `[]` syntax). +<|page-376-lllmstxt|> +# Understanding Vector Similarity: Powering Next-Gen AI Applications -Before Qdrant 1.2 it was impossible to express some more complex conditions for the -nested structures. For example, let’s assume we have the following payload: +A core function of a wide range of AI applications is to first understand the *meaning* behind a user query, and then provide *relevant* answers to the questions that the user is asking. With increasingly advanced interfaces and applications, this query can be in the form of language, or an image, an audio, video, or other forms of *unstructured* data. -```json -{ - "country": "Japan", - "cities": [\ - {\ - "name": "Tokyo",\ - "population": 9.3,\ - "area": 2194\ - },\ - {\ - "name": "Osaka",\ - "population": 2.7,\ - "area": 223\ - },\ - {\ - "name": "Kyoto",\ - "population": 1.5,\ - "area": 827.8\ - }\ - ] -} +On an ecommerce platform, a user can, for instance, try to find ‘clothing for a trek’, when they actually want results around ‘waterproof jackets’, or ‘winter socks’. Keyword, or full-text, or even synonym search would fail to provide any response to such a query. Similarly, on a music app, a user might be looking for songs that sound similar to an audio clip they have heard. Or, they might want to look up furniture that has a similar look as the one they saw on a trip. -``` +## How Does Vector Similarity Work? +So, how does an algorithm capture the essence of a user’s query, and then unearth results that are relevant? -We want to filter out the results to include the countries with a city with over 2 million citizens -and an area bigger than 500 square kilometers but no more than 1000. There is no such a city in -Japan, looking at our data, but if we wrote the following filter, it would be returned: +At a high level, here’s how: -```json -{ - "filter": { - "must": [\ - {\ - "key": "country.cities[].population",\ - "range": {\ - "gte": 2\ - }\ - },\ - {\ - "key": "country.cities[].area",\ - "range": {\ - "gt": 500,\ - "lte": 1000\ - }\ - }\ - ] - }, - "limit": 3 -} +- Unstructured data is first converted into a numerical representation, known as vectors, using a deep-learning model. The goal here is to capture the ‘semantics’ or the key features of this data. +- The vectors are then stored in a vector database, along with references to their original data. 
+- When a user performs a query, the query is first converted into its vector representation using the same model. Then search is performed using a metric, to find other vectors which are closest to the query vector. +- The list of results returned corresponds to the vectors that were found to be the closest. -``` +At the heart of all such searches lies the concept of *vector similarity*, which gives us the ability to measure how closely related two data points are, how similar or dissimilar they are, or find other related data points. -Japan would be returned because Tokyo and Osaka match the first criteria, while Kyoto fulfills -the second. But that’s not what we wanted to achieve. That’s the motivation behind introducing -a new type of nested filter. +In this document, we will deep-dive into the essence of vector similarity, study how vector similarity search is used in the context of AI, look at some real-world use cases and show you how to leverage the power of vector similarity and vector similarity search for building AI applications. -```json -{ - "filter": { - "must": [\ - {\ - "nested": {\ - "key": "country.cities",\ - "filter": {\ - "must": [\ - {\ - "key": "population",\ - "range": {\ - "gte": 2\ - }\ - },\ - {\ - "key": "area",\ - "range": {\ - "gt": 500,\ - "lte": 1000\ - }\ - }\ - ]\ - }\ - }\ - }\ - ] - }, - "limit": 3 -} +## **Understanding Vectors, Vector Spaces and Vector Similarity** -``` +ML and deep learning models require numerical data as inputs to accomplish their tasks. Therefore, when working with non-numerical data, we first need to convert them into a numerical representation that captures the key features of that data. This is where vectors come in. -The syntax is consistent with all the other supported filters and enables new possibilities. In -our case, it allows us to express the joined condition on a nested structure and make the results -list empty but correct. +A vector is a set of numbers that represents data, which can be text, image, or audio, or any multidimensional data. Vectors reside in a high-dimensional space, the vector space, where each dimension captures a specific aspect or feature of the data. -## [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#important-changes) Important changes +{{< figure width=80% src=/blog/what-is-vector-similarity/working.png caption="Working" >}} -The latest release focuses not only on the new features but also introduces some changes making -Qdrant even more reliable. +The number of dimensions of a vector can range from tens or hundreds to thousands, and each dimension is stored as the element of an array. Vectors are, therefore, an array of numbers of fixed length, and in their totality, they encode the key features of the data they represent. -### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#recovery-mode) Recovery mode +Vector embeddings are created by AI models, a process known as vectorization. They are then stored in vector stores like Qdrant, which have the capability to rapidly search through vector space, and find similar or dissimilar vectors, cluster them, find related ones, or even the ones which are complete outliers. -There has been an issue in memory-constrained environments, such as cloud, happening when users were -pushing massive amounts of data into the service using `wait=false`. This data influx resulted in an -overreaching of disk or RAM limits before the Write-Ahead Logging (WAL) was fully applied. 
This -situation was causing Qdrant to attempt a restart and reapplication of WAL, failing recurrently due -to the same memory constraints and pushing the service into a frustrating crash loop with many -Out-of-Memory errors. +For example, in the case of text data, “coat” and “jacket” have similar meaning, even though the words are completely different. Vector representations of these two words should be such that they lie close to each other in the vector space. The process of measuring their proximity in vector space is vector similarity. -Qdrant 1.2 enters recovery mode, if enabled, when it detects a failure on startup. -That makes the service halt the loading of collection data and commence operations in a partial state. -This state allows for removing collections but doesn’t support search or update functions. -**Recovery mode [has to be enabled by user](https://qdrant.tech/documentation/administration/#recovery-mode).** +Vector similarity, therefore, is a measure of how closely related two data points are in a vector space. It quantifies how alike or different two data points are based on their respective vector representations. -### [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#appendable-mmap) Appendable mmap +Suppose we have the words "king", "queen" and “apple”. Given a model, words with similar meanings have vectors that are close to each other in the vector space. Vector representations of “king” and “queen” would be, therefore, closer together than "king" and "apple", or “queen” and “apple” due to their semantic relationship. Vector similarity is how you calculate this. -For a long time, segments using mmap storage were `non-appendable` and could only be constructed by -the optimizer. Dynamically adding vectors to the mmap file is fairly complicated and thus not -implemented in Qdrant, but we did our best to implement it in the recent release. If you want -to read more about segments, check out our docs on [vector storage](https://qdrant.tech/documentation/storage/#vector-storage). +An extremely powerful aspect of vectors is that they are not limited to representing just text, image or audio. In fact, vector representations can be created out of any kind of data. You can create vector representations of 3D models, for instance. Or for video clips, or molecular structures, or even [protein sequences](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3220-8). -## [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#security) Security +There are several methodologies through which vectorization is performed. In creating vector representations of text, for example, the process involves analyzing the text for its linguistic elements using a transformer model. These models essentially learn to capture the essence of the text by dissecting its language components. -There are two major changes in terms of [security](https://qdrant.tech/documentation/security/): +## **How Is Vector Similarity Calculated?** -1. **API-key support** \- basic authentication with a static API key to prevent unwanted access. Previously -API keys were only supported in [Qdrant Cloud](https://cloud.qdrant.io/). -2. **TLS support** \- to use encrypted connections and prevent sniffing/MitM attacks. +There are several ways to calculate the similarity (or distance) between two vectors, which we call metrics. 
The most popular ones are:

-## [Anchor](https://qdrant.tech/articles/qdrant-1.2.x/\#release-notes) Release notes

+**Dot Product**: Obtained by multiplying corresponding elements of the vectors and then summing those products. A larger dot product indicates a greater degree of similarity.

-As usual, [our release notes](https://github.com/qdrant/qdrant/releases/tag/v1.2.0) describe all the changes -introduced in the latest version.

+**Cosine Similarity**: Calculated using the dot product of the two vectors divided by the product of their magnitudes (norms). Cosine similarity of 1 implies that the vectors are perfectly aligned, while a value of 0 indicates no similarity. A value of -1 means they are diametrically opposed (or dissimilar).

+**Euclidean Distance**: Assuming two vectors act like arrows in vector space, Euclidean distance calculates the length of the straight line connecting the heads of these two arrows. The smaller the Euclidean distance, the greater the similarity.

+**Manhattan Distance**: Also known as taxicab distance, it is calculated as the total distance between the two vectors in a vector space, if you follow a grid-like path. The smaller the Manhattan distance, the greater the similarity.

+{{< figure width=80% src=/blog/what-is-vector-similarity/products.png caption="Metrics" >}}

+As a rule of thumb, the choice of the best similarity metric depends on how the vectors were encoded.

+Of the four metrics, Cosine Similarity is the most popular.

+## **The Significance of Vector Similarity**

+Vector Similarity is vital in powering machine learning applications. By comparing the vector representation of a query to the vectors of all data points, vector similarity search algorithms can retrieve the most relevant vectors. This helps in building powerful similarity search and recommendation systems, and has numerous applications in image and text analysis, in natural language processing, and in other domains that deal with high-dimensional data.

+Let’s look at some of the key ways in which vector similarity can be leveraged.

-<|page-185-lllmstxt|>
-## discovery-search
-- [Articles](https://qdrant.tech/articles/)
-- Discovery needs context

+**Image Analysis**

-[Back to Data Exploration](https://qdrant.tech/articles/data-exploration/)

+Once images are converted to their vector representations, vector similarity can help create systems to identify, categorize, and compare them. This can enable powerful reverse image search, facial recognition systems, or can be used for object detection and classification.

-# Discovery needs context

+**Text Analysis**

-Luis Cossío

+Vector similarity in text analysis helps in understanding and processing language data.
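To make the four similarity metrics described above concrete, here is a small, self-contained Python sketch that computes each of them for two arbitrary example vectors:

```python
import math

a = [1.0, 2.0, 3.0]
b = [2.0, 1.0, 4.0]

# Dot product: multiply corresponding elements and sum the products.
dot = sum(x * y for x, y in zip(a, b))

# Cosine similarity: dot product divided by the product of the vector norms.
cosine = dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

# Euclidean distance: length of the straight line between the two points.
euclidean = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

# Manhattan (taxicab) distance: total distance along a grid-like path.
manhattan = sum(abs(x - y) for x, y in zip(a, b))

print(dot, cosine, euclidean, manhattan)
```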
Vectorized text can be used to build semantic search systems, or in document clustering, or plagiarism detection applications. -· +**Retrieval Augmented Generation (RAG)** -January 31, 2024 +Vector similarity can help in representing and comparing linguistic features, from single words to entire documents. This can help build retrieval augmented generation (RAG) applications, where the data is retrieved based on user intent. It also enables nuanced language tasks such as sentiment analysis, synonym detection, language translation, and more. -![Discovery needs context](https://qdrant.tech/articles_data/discovery-search/preview/title.jpg) +**Recommender Systems** -# [Anchor](https://qdrant.tech/articles/discovery-search/\#discovery-needs-context) Discovery needs context +By converting user preference vectors into item vectors from a dataset, vector similarity can help build semantic search and recommendation systems. This can be utilized in a range of domains such e-commerce or OTT services, where it can help in suggesting relevant products, movies or songs. -When Christopher Columbus and his crew sailed to cross the Atlantic Ocean, they were not looking for the Americas. They were looking for a new route to India because they were convinced that the Earth was round. They didn’t know anything about a new continent, but since they were going west, they stumbled upon it. +Due to its varied applications, vector similarity has become a critical component in AI tooling. However, implementing it at scale, and in production settings, poses some hard problems. Below we will discuss some of them and explore how Qdrant helps solve these challenges. -They couldn’t reach their _target_, because the geography didn’t let them, but once they realized it wasn’t India, they claimed it a new “discovery” for their crown. If we consider that sailors need water to sail, then we can establish a _context_ which is positive in the water, and negative on land. Once the sailor’s search was stopped by the land, they could not go any further, and a new route was found. Let’s keep these concepts of _target_ and _context_ in mind as we explore the new functionality of Qdrant: **Discovery search**. +## **Challenges with Vector Similarity Search** -## [Anchor](https://qdrant.tech/articles/discovery-search/\#what-is-discovery-search) What is discovery search? +The biggest challenge in this area comes from what researchers call the "[curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)." Algorithms like k-d trees may work well for finding exact matches in low dimensions (in 2D or 3D space). However, when you jump to high-dimensional spaces (hundreds or thousands of dimensions, which is common with vector embeddings), these algorithms become impractical. Traditional search methods and OLTP or OLAP databases struggle to handle this curse of dimensionality efficiently. -In version 1.7, Qdrant [released](https://qdrant.tech/articles/qdrant-1.7.x/) this novel API that lets you constrain the space in which a search is performed, relying only on pure vectors. This is a powerful tool that lets you explore the vector space in a more controlled way. It can be used to find points that are not necessarily closest to the target, but are still relevant to the search. +This means that building production applications that leverage vector similarity involves navigating several challenges. Here are some of the key challenges to watch out for. 
-You can already select which points are available to the search by using payload filters. This by itself is very versatile because it allows us to craft complex filters that show only the points that satisfy their criteria deterministically. However, the payload associated with each point is arbitrary and cannot tell us anything about their position in the vector space. In other words, filtering out irrelevant points can be seen as creating a _mask_ rather than a hyperplane –cutting in between the positive and negative vectors– in the space. +### Scalability -## [Anchor](https://qdrant.tech/articles/discovery-search/\#understanding-context) Understanding context +Various vector search algorithms were originally developed to handle datasets small enough to be accommodated entirely within the memory of a single computer. -This is where a **vector _context_** can help. We define _context_ as a list of pairs. Each pair is made up of a positive and a negative vector. With a context, we can define hyperplanes within the vector space, which always prefer the positive over the negative vectors. This effectively partitions the space where the search is performed. After the space is partitioned, we then need a _target_ to return the points that are more similar to it. +However, in real-world production settings, the datasets can encompass billions of high-dimensional vectors. As datasets grow, the storage and computational resources required to maintain and search through vector space increases dramatically. -![Discovery search visualization](https://qdrant.tech/articles_data/discovery-search/discovery-search.png) +For building scalable applications, leveraging vector databases that allow for a distributed architecture and have the capabilities of sharding, partitioning and load balancing is crucial. -While positive and negative vectors might suggest the use of the [recommendation interface](https://qdrant.tech/documentation/concepts/explore/#recommendation-api), in the case of _context_ they require to be paired up in a positive-negative fashion. This is inspired from the machine-learning concept of [_triplet loss_](https://en.wikipedia.org/wiki/Triplet_loss), where you have three vectors: an anchor, a positive, and a negative. Triplet loss is an evaluation of how much the anchor is closer to the positive than to the negative vector, so that learning happens by “moving” the positive and negative points to try to get a better evaluation. However, during discovery, we consider the positive and negative vectors as static points, and we search through the whole dataset for the “anchors”, or result candidates, which fit this characteristic better. +### Efficiency -![Triplet loss](https://qdrant.tech/articles_data/discovery-search/triplet-loss.png) +As the number of dimensions in vectors increases, algorithms that work in lower dimensions become less effective in measuring true similarity. This makes finding nearest neighbors computationally expensive and inaccurate in high-dimensional space. -[**Discovery search**](https://qdrant.tech/articles/discovery-search/#discovery-search), then, is made up of two main inputs: +For efficient query processing, it is important to choose vector search systems which use indexing techniques that help speed up search through high-dimensional vector space, and reduce latency. -- **target**: the main point of interest -- **context**: the pairs of positive and negative points we just defined. +### Security -However, it is not the only way to use it. 
Alternatively, you can **only** provide a context, which invokes a [**Context Search**](https://qdrant.tech/articles/discovery-search/#context-search). This is useful when you want to explore the space defined by the context, but don’t have a specific target in mind. But hold your horses, we’ll get to that [later â†Ș](https://qdrant.tech/articles/discovery-search/#context-search). +For real-world applications, vector databases frequently house privacy-sensitive data. This can encompass Personally Identifiable Information (PII) in customer records, intellectual property (IP) like proprietary documents, or specialized datasets subject to stringent compliance regulations. -## [Anchor](https://qdrant.tech/articles/discovery-search/\#real-world-discovery-search-applications) Real-world discovery search applications +For data security, the vector search system should offer features that prevent unauthorized access to sensitive information. Also, it should empower organizations to retain data sovereignty, ensuring their data complies with their own regulations and legal requirements, independent of the platform or the cloud provider. -Let’s talk about the first case: context with a target. +These are some of the many challenges that developers face when attempting to leverage vector similarity in production applications. -To understand why this is useful, let’s take a look at a real-world example: using a multimodal encoder like [CLIP](https://openai.com/blog/clip/) to search for images, from text **and** images. -CLIP is a neural network that can embed both images and text into the same vector space. This means that you can search for images using either a text query or an image query. For this example, we’ll reuse our [food recommendations demo](https://food-discovery.qdrant.tech/) by typing “burger” in the text input: +To address these challenges head-on, we have made several design choices at Qdrant which help power vector search use-cases that go beyond simple CRUD applications. -![Burger text input in food demo](https://qdrant.tech/articles_data/discovery-search/search-for-burger.png) +## How Qdrant Solves Vector Similarity Search Challenges -This is basically nearest neighbor search, and while technically we have only images of burgers, one of them is a logo representation of a burger. We’re looking for actual burgers, though. Let’s try to exclude images like that by adding it as a negative example: +Qdrant is a highly performant and scalable vector search system, developed ground up in Rust. Qdrant leverages Rust’s famed memory efficiency and performance. It supports horizontal scaling, sharding, and replicas, and includes security features like role-based authentication. Additionally, Qdrant can be deployed in various environments, including [hybrid cloud setups](/hybrid-cloud/). -![Try to exclude burger drawing](https://qdrant.tech/articles_data/discovery-search/try-to-exclude-non-burger.png) +Here’s how we have taken on some of the key challenges that vector search applications face in production. -Wait a second, what has just happened? These pictures have **nothing** to do with burgers, and still, they appear on the first results. Is the demo broken? +### Efficiency -Turns out, multimodal encoders [might not work how you expect them to](https://modalitygap.readthedocs.io/en/latest/). Images and text are embedded in the same space, but they are not necessarily close to each other. 
This means that we can create a mental model of the distribution as two separate planes, one for images and one for text. +Our [choice of Rust](/articles/why-rust/) significantly contributes to the efficiency of Qdrant’s vector similarity search capabilities. Rust’s emphasis on safety and performance, without the need for a garbage collector, helps with better handling of memory and resources. Rust is renowned for its performance and safety features, particularly in concurrent processing, and we leverage it heavily to handle high loads efficiently. -![Mental model of CLIP embeddings](https://qdrant.tech/articles_data/discovery-search/clip-mental-model.png) +Also, a key feature of Qdrant is that we leverage both vector and traditional indexes (payload index). This means that vector index helps speed up vector search, while traditional indexes help filter the results. -This is where discovery excels because it allows us to constrain the space considering the same mode (images) while using a target from the other mode (text). +The vector index in Qdrant employs the Hierarchical Navigable Small World (HNSW) algorithm for Approximate Nearest Neighbor (ANN) searches, which is one of the fastest algorithms according to [benchmarks](https://github.com/erikbern/ann-benchmarks). -![Cross-modal search with discovery](https://qdrant.tech/articles_data/discovery-search/clip-discovery.png) +### Scalability -Discovery search also lets us keep giving feedback to the search engine in the shape of more context pairs, so we can keep refining our search until we find what we are looking for. +For massive datasets and demanding workloads, Qdrant supports [distributed deployment](/documentation/guides/distributed_deployment/) from v0.8.0. In this mode, you can set up a Qdrant cluster and distribute data across multiple nodes, enabling you to maintain high performance and availability even under increased workloads. Clusters support sharding and replication, and harness the Raft consensus algorithm to manage node coordination. -Another intuitive example: imagine you’re looking for a fish pizza, but pizza names can be confusing, so you can just type “pizza”, and prefer a fish over meat. Discovery search will let you use these inputs to suggest a fish pizza
 even if it’s not called fish pizza!

+There are three quantization strategies you can choose from - scalar quantization, binary quantization and product quantization - which will help you control the trade-off between storage efficiency, search accuracy and speed.

-![Simple discovery example](https://qdrant.tech/articles_data/discovery-search/discovery-example-with-images.png)

+### Security

-## [Anchor](https://qdrant.tech/articles/discovery-search/\#context-search) Context search

+Qdrant offers several [security features](/documentation/guides/security/) to help protect data and access to the vector store:

-Now, the second case: only providing context.

+- API Key Authentication: This helps secure API access to Qdrant Cloud with static or read-only API keys.
+- JWT-Based Access Control: You can also enable more granular access control through JSON Web Tokens (JWT), and opt for restricted access to specific parts of the stored data while building Role-Based Access Control (RBAC).
+- TLS Encryption: Additionally, you can enable TLS Encryption on data transmission to ensure security of data in transit.

-Ever been caught in the same recommendations on your favorite music streaming service? This may be caused by getting stuck in a similarity bubble. As user input gets more complex, diversity becomes scarce, and it becomes harder to force the system to recommend something different.

+To help with data sovereignty, Qdrant can be run in a [Hybrid Cloud](/hybrid-cloud/) setup. Hybrid Cloud allows for seamless deployment and management of the vector database across various environments, and integrates Kubernetes clusters into a unified managed service. You can manage these clusters via Qdrant Cloud’s UI while maintaining control over your infrastructure and resources.

-![Context vs recommendation search](https://qdrant.tech/articles_data/discovery-search/context-vs-recommendation.png)

+## Optimizing Similarity Search Performance

-**Context search** solves this by de-focusing the search around a single point. Instead, it selects points randomly from within a zone in the vector space. This search is the most influenced by _triplet loss_, as the score can be thought of as _“how much a point is closer to a negative than a positive vector?”_. If it is closer to the positive one, then its score will be zero, same as any other point within the same zone. But if it is on the negative side, it will be assigned a more and more negative score the further it gets.

+In order to achieve top performance in vector similarity searches, Qdrant employs a number of other tactics in addition to the features discussed above.

+**FastEmbed**: Qdrant supports [FastEmbed](/articles/fastembed/), a lightweight Python library for generating fast and efficient text embeddings. FastEmbed uses quantized transformer models integrated with ONNX Runtime, and is significantly faster than traditional methods of embedding generation.

-![Context search visualization](https://qdrant.tech/articles_data/discovery-search/context-search.png)

+**Support for Dense and Sparse Vectors**: Qdrant supports both dense and sparse vector representations. While dense vectors are most common, you may encounter situations where the dataset contains a range of specialized domain-specific keywords. [Sparse vectors](/articles/sparse-vectors/) shine in such scenarios. Sparse vectors are vector representations of data where most elements are zero.
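As a rough sketch of how one of the quantization options mentioned earlier in this section can be enabled, the snippet below creates a collection with scalar quantization via the Python client. The collection name and vector size are illustrative, and configuration class names may differ slightly between client versions:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="quantized_demo",        # illustrative name
    vectors_config=models.VectorParams(
        size=768,                            # illustrative embedding size
        distance=models.Distance.COSINE,
    ),
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,     # store 32-bit floats as 8-bit integers
            always_ram=True,                 # keep quantized vectors in RAM for speed
        )
    ),
)
```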
This way, you should be able to constrain the space enough so you select points from a per-search “category” created just from the context in the input. +**Support for Dense and Sparse Vectors**: Qdrant supports both dense and sparse vector representations. While dense vectors are most common, you may encounter situations where the dataset contains a range of specialized domain-specific keywords. [Sparse vectors](/articles/sparse-vectors/) shine in such scenarios. Sparse vectors are vector representations of data where most elements are zero. -![A more complex context search](https://qdrant.tech/articles_data/discovery-search/complex-context-search.png) +**Multitenancy**: Qdrant supports [multitenancy](/documentation/guides/multiple-partitions/) by allowing vectors to be partitioned by payload within a single collection. Using this you can isolate each user's data, and avoid creating separate collections for each user. In order to ensure indexing performance, Qdrant also offers ways to bypass the construction of a global vector index, so that you can index vectors for each user independently. -This way you can give refreshing recommendations, while still being in control by providing positive and negative feedback, or even by trying out different permutations of pairs. +**IO Optimizations**: If your data doesn’t fit into the memory, it may require storing on disk. To [optimize disk IO performance](/articles/io_uring/), Qdrant offers io_uring based *async uring* storage backend on Linux-based systems. Benchmarks show that it drastically helps reduce operating system overhead from disk IO. -## [Anchor](https://qdrant.tech/articles/discovery-search/\#key-takeaways) Key takeaways: +**Data Integrity**: To ensure data integrity, Qdrant handles data changes in two stages. First, changes are recorded in the Write-Ahead Log (WAL). Then, changes are applied to segments, which store both the latest and individual point versions. In case of abnormal shutdowns, data is restored from WAL. -- Discovery search is a powerful tool for controlled exploration in vector spaces. -Context, consisting of positive and negative vectors constrain the search space, while a target guides the search. -- Real-world applications include multimodal search, diverse recommendations, and context-driven exploration. -- Ready to learn more about the math behind it and how to use it? Check out the [documentation](https://qdrant.tech/documentation/concepts/explore/#discovery-api) +**Integrations**: Qdrant has integrations with most popular frameworks, such as LangChain, LlamaIndex, Haystack, Apache Spark, FiftyOne, and more. Qdrant also has several [trusted partners](/blog/hybrid-cloud-launch-partners/) for Hybrid Cloud deployments, such as Oracle Cloud Infrastructure, Red Hat OpenShift, Vultr, OVHcloud, Scaleway, and DigitalOcean. -##### Was this page useful? +We regularly run [benchmarks](/benchmarks/) comparing Qdrant against other vector databases like Elasticsearch, Milvus, and Weaviate. Our benchmarks show that Qdrant consistently achieves the highest requests-per-second (RPS) and lowest latencies across various scenarios, regardless of the precision threshold and metric used. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Real-World Use Cases -Thank you for your feedback! 🙏 +Vector similarity is increasingly being used in a wide range of [real-world applications](/use-cases/). 
In e-commerce, it powers recommendation systems by comparing user behavior vectors to product vectors. In social media, it can enhance content recommendations and user connections by analyzing user interaction vectors. In image-oriented applications, vector similarity search enables reverse image search, similar image clustering, and efficient content-based image retrieval. In healthcare, vector similarity helps in genetic research by comparing DNA sequence vectors to identify similarities and variations. The possibilities are endless. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/discovery-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +A unique example of real-world application of vector similarity is how VISUA uses Qdrant. A leading computer vision platform, VISUA faced two key challenges. First, a rapid and accurate method to identify images and objects within them for reinforcement learning. Second, dealing with the scalability issues of their quality control processes due to the rapid growth in data volume. Their previous quality control, which relied on meta-information and manual reviews, was no longer scalable, which prompted the VISUA team to explore vector databases as a solution. -On this page: +After exploring a number of vector databases, VISUA picked Qdrant as the solution of choice. Vector similarity search helped identify similarities and deduplicate large volumes of images, videos, and frames. This allowed VISUA to uniquely represent data and prioritize frames with anomalies for closer examination, which helped scale their quality assurance and reinforcement learning processes. Read our [case study](/blog/case-study-visua/) to learn more. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/discovery-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Future Directions and Innovations -× +As real-world deployments of vector similarity search technology grows, there are a number of promising directions where this technology is headed. -[Powered by](https://qdrant.tech/) +We are developing more efficient indexing and search algorithms to handle increasing data volumes and high-dimensional data more effectively. Simultaneously, in case of dynamic datasets, we are pushing to enhance our handling of real-time updates and low-latency search capabilities. -<|page-186-lllmstxt|> -## dimension-reduction-qsoc -- [Articles](https://qdrant.tech/articles/) -- Qdrant Summer of Code 2024 - WASM based Dimension Reduction +Qdrant is one of the most secure vector stores out there. However, we are working on bringing more privacy-preserving techniques in vector search implementations to protect sensitive data. -[Back to Ecosystem](https://qdrant.tech/articles/ecosystem/) +We have just about witnessed the tip of the iceberg in terms of what vector similarity can achieve. If you are working on an interesting use-case that uses vector similarity, we would like to hear from you. -# Qdrant Summer of Code 2024 - WASM based Dimension Reduction +### Key Takeaways: -Jishan Bhattacharya +- **Vector Similarity in AI:** Vector similarity is a crucial technique in AI, allowing for the accurate matching of queries with relevant data, driving advanced applications like semantic search and recommendation systems. 
-· +- **Versatile Applications of Vector Similarity:** This technology powers a wide range of AI-driven applications, from reverse image search in e-commerce to sentiment analysis in text processing. -August 31, 2024 +- **Overcoming Vector Search Challenges:** Implementing vector similarity at scale poses challenges like the curse of dimensionality, but specialized systems like Qdrant provide efficient and scalable solutions. -![Qdrant Summer of Code 2024 - WASM based Dimension Reduction](https://qdrant.tech/articles_data/dimension-reduction-qsoc/preview/title.jpg) +- **Qdrant's Advanced Vector Search:** Qdrant leverages Rust's performance and safety features, along with advanced algorithms, to deliver high-speed and secure vector similarity search, even for large-scale datasets. -## [Anchor](https://qdrant.tech/articles/dimension-reduction-qsoc/\#introduction) Introduction +- **Future Innovations in Vector Similarity:** The field of vector similarity is rapidly evolving, with advancements in indexing, real-time search, and privacy-preserving techniques set to expand its capabilities in AI applications. -Hello, everyone! I’m Jishan Bhattacharya, and I had the incredible opportunity to intern at Qdrant this summer as part of the Qdrant Summer of Code 2024. Under the mentorship of [Andrey Vasnetsov](https://www.linkedin.com/in/andrey-vasnetsov-75268897/), I dived into the world of performance optimization, focusing on enhancing vector visualization using WebAssembly (WASM). In this article, I’ll share the insights, challenges, and accomplishments from my journey — one filled with learning, experimentation, and plenty of coding adventures. +## Getting Started with Qdrant -## [Anchor](https://qdrant.tech/articles/dimension-reduction-qsoc/\#project-overview) Project Overview +Ready to implement vector similarity in your AI applications? Explore Qdrant's vector database to enhance your data retrieval and AI capabilities. For additional resources and documentation, visit: -Qdrant is a robust vector database and search engine designed to store vector data and perform tasks like similarity search and clustering. One of its standout features is the ability to visualize high-dimensional vectors in a 2D space. However, the existing implementation faced performance bottlenecks, especially when scaling to large datasets. My mission was to tackle this challenge by leveraging a WASM-based solution for dimensionality reduction in the visualization process. +- [Quick Start Guide](/documentation/quick-start/) +- [Documentation](/documentation/) -## [Anchor](https://qdrant.tech/articles/dimension-reduction-qsoc/\#learnings--challenges) Learnings & Challenges +We are always available on our [Discord channel](https://qdrant.to/discord) to answer any questions you might have. You can also sign up for our [newsletter](/subscribe/) to stay ahead of the curve. -Our weapon of choice was Rust, paired with WASM, and we employed the t-SNE algorithm for dimensionality reduction. For those unfamiliar, t-SNE (t-Distributed Stochastic Neighbor Embedding) is a technique that helps visualize high-dimensional data by projecting it into two or three dimensions. It operates in two main steps: +<|page-377-lllmstxt|> +# The Evolving Landscape of AI Frameworks +As Large Language Models (LLMs) and vector stores have become steadily more powerful, a new generation of frameworks has appeared which can streamline the development of AI applications by leveraging LLMs and vector search technology. 
These frameworks simplify the process of building everything from Retrieval Augmented Generation (RAG) applications to complex chatbots with advanced conversational abilities, and even sophisticated reasoning-driven AI applications. -1. **Computing Pairwise Similarity:** This step involves calculating the similarity between each pair of data points in the original high-dimensional space. +The most well-known of these frameworks is possibly [LangChain](https://github.com/langchain-ai/langchain). [Launched in October 2022](https://en.wikipedia.org/wiki/LangChain) as an open-source project by Harrison Chase, the project quickly gained popularity, attracting contributions from hundreds of developers on GitHub. LangChain excels in its broad support for documents, data sources, and APIs. This, along with seamless integration with vector stores like Qdrant and the ability to chain multiple LLMs, has allowed developers to build complex AI applications without reinventing the wheel. -2. **Iterative Optimization:** The second step is iterative, where the embedding is refined using gradient descent. Here, the similarity matrix from the first step plays a crucial role. +However, despite the many capabilities unlocked by frameworks like LangChain, developers still needed expertise in [prompt engineering](https://en.wikipedia.org/wiki/Prompt_engineering) to craft optimal LLM prompts. Additionally, optimizing these prompts and adapting them to build multi-stage reasoning AI remained challenging with the existing frameworks. +In fact, as you start building production-grade AI applications, it becomes clear that a single LLM call isn’t enough to unlock the full capabilities of LLMs. Instead, you need to create a workflow where the model interacts with external tools like web browsers, fetches relevant snippets from documents, and compiles the results into a multi-stage reasoning pipeline. -At the outset, Andrey tasked me with rewriting the existing JavaScript implementation of t-SNE in Rust, introducing multi-threading along the way. Setting up WASM with Vite for multi-threaded execution was no small feat, but the effort paid off. The resulting Rust implementation outperformed the single-threaded JavaScript version, although it still struggled with large datasets. +This involves building an architecture that combines and reasons on intermediate outputs, with LLM prompts that adapt according to the task at hand, before producing a final output. A manual approach to prompt engineering quickly falls short in such scenarios. -Next came the challenge of optimizing the algorithm further. A key aspect of t-SNE’s first step is finding the nearest neighbors for each data point, which requires an efficient data structure. I opted for a [Vantage Point Tree](https://en.wikipedia.org/wiki/Vantage-point_tree) (also known as a Ball Tree) to speed up this process. As for the second step, while it is inherently sequential, there was still room for improvement. I incorporated Barnes-Hut approximation to accelerate the gradient calculation. This method approximates the forces between points in low dimensional space, making the process more efficient. +In October 2023, researchers working in Stanford NLP released a library, [DSPy](https://github.com/stanfordnlp/dspy), which entirely automates the process of optimizing prompts and weights for large language models (LLMs), eliminating the need for manual prompting or prompt engineering. -To illustrate, imagine dividing a 2D space into quadrants, each containing multiple points. 
Every quadrant is again subdivided into four quadrants. This is done until every point belongs to a single cell.
+One of DSPy's key features is its ability to automatically tune LLM prompts, an approach that is especially powerful when your application needs to call the LLM several times within a pipeline.
-![Calculating the resultant force on red point using Barnes-Hut approximation](https://qdrant.tech/articles_data/dimension-reduction-qsoc/barnes_hut.png)
+So, when building an LLM and vector store-backed AI application, which of these frameworks should you choose? In this article, we dive deep into the capabilities of each and discuss scenarios where each of these frameworks shines. Let’s get started!
-Barnes-Hut Approximation
+## **LangChain: Features, Performance, and Use Cases**
-We then calculate the center of mass for each cell represented by a blue circle as shown in the figure. Now let’s say we want to find all the forces, represented by dotted lines, on the red point. Barnes Hut’s approximation states that for points that are sufficiently distant, instead of computing the force for each individual point, we use the center of mass as a proxy, significantly reducing the computational load. This is represented by the blue dotted line in the figure.
+LangChain, as discussed above, is an open-source orchestration framework available in both [Python](https://python.langchain.com/v0.2/docs/introduction/) and [JavaScript](https://js.langchain.com/v0.2/docs/introduction/), designed to simplify the development of AI applications leveraging LLMs. For developers working with one or multiple LLMs, it acts as a universal interface for these AI models. LangChain integrates with various external data sources, supports a wide range of data types and stores, streamlines the handling of vector embeddings and retrieval through similarity search, and simplifies the integration of AI applications with existing software workflows.
-These optimizations made a remarkable difference — Barnes-Hut t-SNE was eight times faster than the exact t-SNE for 10,000 vectors.
+At a high level, LangChain abstracts the common steps required to work with language models into modular components, which serve as the building blocks of AI applications. These components can be "chained" together to create complex applications. Thanks to these abstractions, LangChain allows for rapid experimentation and prototyping of AI applications in a short timeframe.
-![Image of visualizing 10,000 vectors using exact t-SNE which took 884.728s](https://qdrant.tech/articles_data/dimension-reduction-qsoc/rust_rewrite.jpg)
+LangChain breaks down the functionality required to build AI applications into three key sections:
-Exact t-SNE - Total time: 884.728s
+- **Model I/O**: Building blocks to interface with the LLM.
+- **Retrieval**: Building blocks to streamline the retrieval of data used by the LLM for generation (such as the retrieval step in RAG applications).
+- **Composition**: Components to combine external APIs, services and other LangChain primitives.
-![Image of visualizing 10,000 vectors using Barnes-Hut t-SNE which took 110.728s](https://qdrant.tech/articles_data/dimension-reduction-qsoc/rust_bhtsne.jpg)
+These components are pulled together into ‘chains’ that are constructed using [LangChain Expression Language](https://python.langchain.com/v0.1/docs/expression_language/) (LCEL). We’ll first look at the various building blocks, and then see how they can be combined using LCEL.
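As a quick preview of how such a chain looks in code, here is a minimal, hedged LCEL sketch; the prompt text and model choice are illustrative:

```python
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template("Answer briefly: {question}")
llm = ChatOpenAI(model="gpt-3.5-turbo")  # illustrative model choice
parser = StrOutputParser()

# LCEL composes components with the pipe operator into a single runnable chain.
chain = prompt | llm | parser

print(chain.invoke({"question": "What is a vector database?"}))
```

The pipe operator is the LCEL composition primitive: each component's output becomes the next component's input, and the resulting chain supports `invoke`, `stream`, and `batch` out of the box.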
-Barnes-Hut t-SNE - Total time: 104.191s +### **LLM Model I/O** -Despite these improvements, the first step of the algorithm was still a bottleneck, leading to noticeable delays and blank screens. I experimented with approximate nearest neighbor algorithms, but the performance gains were minimal. After consulting with my mentor, we decided to compute the nearest neighbors on the server side, passing the distance matrix directly to the visualization process instead of the raw vectors. +LangChain offers broad compatibility with various LLMs, and its [LLM](https://python.langchain.com/v0.1/docs/modules/model_io/llms/) class provides a standard interface to these models. Leveraging proprietary models offered by platforms like OpenAI, Mistral, Cohere, or Gemini is straightforward and requires just an API key from the respective platform. -While waiting for the distance-matrix API to be ready, I explored further optimizations. I observed that the worker thread sent results to the main thread for rendering at specific intervals, causing unnecessary delays due to serialization and deserialization. +For instance, to use OpenAI models, you simply need to do the following: -![Image showing serialization and deserialization overhead due to message passing between threads](https://qdrant.tech/articles_data/dimension-reduction-qsoc/channels.png) +```python +from langchain_openai import OpenAI -Serialization and Deserialization Overhead +llm = OpenAI(api_key="...") -To address this, I implemented a `SharedArrayBuffer`, allowing the main thread to access changes made by the worker thread instantly. This change led to noticeable improvements. +llm.invoke("Where is Paris?") -Additionally, the previous architecture resulted in choppy animations due to the fixed intervals at which the worker thread sent results. +``` -![Image showing the previous architecture of the frontend with fixed intervals for sending results](https://qdrant.tech/articles_data/dimension-reduction-qsoc/prev_arch.png) -Previous architecture with fixed intervals +Open-source models like Meta AI’s Llama variants (such as Llama3-8B) or Mistral AI’s open models (like Mistral-7B) can be easily integrated using their Hugging Face endpoints or local LLM deployment tools like Ollama, vLLM, or LM Studio. You can also use the [CustomLLM](https://python.langchain.com/v0.1/docs/modules/model_io/llms/custom_llm/) class to build Custom LLM wrappers. -I introduced a “rendering-on-demand” approach, where the main thread would signal the worker thread when it was ready to render the next result. This created smoother, more responsive animations. +Here’s how simple it is to use LangChain with LlaMa3-8B, using [Ollama](https://ollama.com/). -![Image showing the current architecture of the frontend with rendering-on-demand approach](https://qdrant.tech/articles_data/dimension-reduction-qsoc/curr_arch.png) +```python +from langchain_community.llms import Ollama -Current architecture with rendering-on-demand +llm = Ollama(model="llama3") -With these optimizations in place, the final step was wrapping up the project by creating a Node.js [package](https://www.npmjs.com/package/wasm-dist-bhtsne). This package exposed the necessary interfaces to accept the distance matrix, perform calculations, and return the results, making the solution easy to integrate into various projects. 
+llm.invoke("Where is Berlin?") -## [Anchor](https://qdrant.tech/articles/dimension-reduction-qsoc/\#areas-for-improvement) Areas for Improvement +``` -While reflecting on this transformative journey, there are still areas that offer room for improvement and future enhancements: -1. **Payload Parsing:** When requesting a large number of vectors, parsing the payload on the main thread can make the user interface unresponsive. Implementing a faster parser could mitigate this issue. +LangChain also offers output parsers to structure the LLM output in a format that the application may need, such as structured data types like JSON, XML, CSV, and others. To understand LangChain’s interface with LLMs in detail, read the documentation [here](https://python.langchain.com/v0.1/docs/modules/model_io/). -2. **Direct Data Requests:** Allowing the worker thread to request data directly could eliminate the initial transfer of data from the main thread, speeding up the overall process. +### **Retrieval** -3. **Chart Library Optimization:** Profiling revealed that nearly 80% of the time was spent on the Chart.js update function. Switching to a WebGL-accelerated chart library could dramatically improve performance, especially for large datasets. -![Image showing profiling results with 80% time spent on Chart.js update function](https://qdrant.tech/articles_data/dimension-reduction-qsoc/profiling.png) +Most enterprise AI applications are built by augmenting the LLM context using data specific to the application’s use case. To accomplish this, the relevant data needs to be first retrieved, typically using vector similarity search, and then passed to the LLM context at the generation step. This architecture, known as [Retrieval Augmented Generation](/articles/what-is-rag-in-ai/) (RAG), can be used to build a wide range of AI applications. -Profiling Result +While the retrieval process sounds simple, it involves a number of complex steps: loading data from a source, splitting it into chunks, converting it into vectors or vector embeddings, storing it in a vector store, and then retrieving results based on a query before the generation step. +LangChain offers a number of building blocks to make this retrieval process simpler. -## [Anchor](https://qdrant.tech/articles/dimension-reduction-qsoc/\#conclusion) Conclusion +- **Document Loaders**: LangChain offers over 100 different document loaders, including integrations with providers like Unstructured or Airbyte. It also supports loading various types of documents, such as PDFs, HTML, CSV, and code, from a range of locations like S3. +- **Splitting**: During the retrieval step, you typically need to retrieve only the relevant section of a document. To do this, you need to split a large document into smaller chunks. LangChain offers various document transformers that make it easy to split, combine, filter, or manipulate documents. +- **Text Embeddings**: A key aspect of the retrieval step is converting document chunks into vectors, which are high-dimensional numerical representations that capture the semantic meaning of the text. LangChain offers integrations with over 25 embedding providers and methods, such as [FastEmbed](https://github.com/qdrant/fastembed). +- **Vector Store Integration**: LangChain integrates with over 50 vector stores, including specialized ones like [Qdrant](/documentation/frameworks/langchain/), and exposes a standard interface. 
+- **Retrievers**: LangChain offers various retrieval algorithms and allows you to use third-party retrieval algorithms or create custom retrievers. +- **Indexing**: LangChain also offers an indexing API that keeps data from any data source in sync with the vector store, helping to reduce complexities around managing unchanged content or avoiding duplicate content. -Participating in the Qdrant Summer of Code 2024 was a deeply rewarding experience. I had the chance to push the boundaries of my coding skills while exploring new technologies like Rust and WebAssembly. I’m incredibly grateful for the guidance and support from my mentor and the entire Qdrant team, who made this journey both educational and enjoyable. +### **Composition** -This experience has not only honed my technical skills but also ignited a deeper passion for optimizing performance in real-world applications. I’m excited to apply the knowledge and skills I’ve gained to future projects and to see how Qdrant’s enhanced vector visualization feature will benefit users worldwide. +Finally, LangChain also offers building blocks that help combine external APIs, services, and LangChain primitives. For instance, it provides tools to fetch data from Wikipedia or search using Google Lens. The list of tools it offers is [extremely varied](https://python.langchain.com/v0.1/docs/integrations/tools/). -This experience has not only honed my technical skills but also ignited a deeper passion for optimizing performance in real-world applications. I’m excited to apply the knowledge and skills I’ve gained to future projects and to see how Qdrant’s enhanced vector visualization feature will benefit users worldwide. +LangChain also offers ways to build agents that use language models to decide on the sequence of actions to take. -Thank you for joining me on this coding adventure. I hope you found something valuable in my journey, and I look forward to sharing more exciting projects with you in the future. Happy coding! +### **LCEL** -##### Was this page useful? +The primary method of building an application in LangChain is through the use of [LCEL](https://python.langchain.com/v0.1/docs/expression_language/), the LangChain Expression Language. It is a declarative syntax designed to simplify the composition of chains within the LangChain framework. It provides a minimalist code layer that enables the rapid development of chains, leveraging advanced features such as streaming, asynchronous execution, and parallel processing. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +LCEL is particularly useful for building chains that involve multiple language model calls, data transformations, and the integration of outputs from language models into downstream applications. -Thank you for your feedback! 🙏 +### **Some Use Cases of LangChain** -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dimension-reduction-qsoc.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Given the flexibility that LangChain offers, a wide range of applications can be built using the framework. Here are some examples: -On this page: +**RAG Applications**: LangChain provides all the essential building blocks needed to build Retrieval Augmented Generation (RAG) applications. 
It integrates with vector stores and LLMs, streamlining the entire process of loading, chunking, and retrieving relevant sections of a document in a few lines of code. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/dimension-reduction-qsoc.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +**Chatbots**: LangChain offers a suite of components that streamline the process of building conversational chatbots. These include chat models, which are specifically designed for message-based interactions and provide a conversational tone suitable for chatbots. -× +**Extracting Structured Outputs**: LangChain assists in extracting structured output from data using various tools and methods. It supports multiple extraction approaches, including tool/function calling mode, JSON mode, and prompting-based extraction. -[Powered by](https://qdrant.tech/) +**Agents**: LangChain simplifies the process of building agents by providing building blocks and integration with LLMs, enabling developers to construct complex, multi-step workflows. These agents can interact with external data sources and tools, and generate dynamic and context-aware responses for various applications. -<|page-187-lllmstxt|> -## networking-logging-monitoring -- [Documentation](https://qdrant.tech/documentation/) -- [Hybrid cloud](https://qdrant.tech/documentation/hybrid-cloud/) -- Networking, Logging & Monitoring +If LangChain offers such a wide range of integrations and the primary building blocks needed to build AI applications, *why do we need another framework?* -# [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#configuring-networking-logging--monitoring-in-qdrant-hybrid-cloud) Configuring Networking, Logging & Monitoring in Qdrant Hybrid Cloud +As Omar Khattab, PhD, Stanford and researcher at Stanford NLP, said when introducing DSPy in his [talk](https://www.youtube.com/watch?v=Dt3H2ninoeY) at ‘Scale By the Bay’ in November 2023: “We can build good reliable systems with these new artifacts that are language models (LMs), but importantly, this is conditioned on us *adapting* them as well as *stacking* them well”. -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#configure-network-policies) Configure network policies +## **DSPy: Features, Performance, and Use Cases** -For security reasons, each database cluster is secured with network policies. By default, database pods only allow egress traffic between each and allow ingress traffic to ports 6333 (rest) and 6334 (grpc) from within the Kubernetes cluster. +When building AI systems, developers need to break down the task into multiple reasoning steps, adapt language model (LM) prompts for each step until they get the right results, and then ensure that the steps work together to achieve the desired outcome. -You can modify the default network policies in the Hybrid Cloud environment configuration: +Complex multihop pipelines, where multiple LLM calls are stacked, are messy. They involve string-based prompting tricks or prompt hacks at each step, and getting the pipeline to work is even trickier. 
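To see why this gets brittle, here is a deliberately naive sketch of a hand-rolled two-hop pipeline; `call_llm` is a hypothetical helper standing in for any raw completion API:

```python
def call_llm(prompt: str) -> str:
    """Hypothetical helper that sends a raw prompt to some LLM API."""
    ...

def answer_with_two_hops(question: str, documents: list[str]) -> str:
    # Hop 1: a hand-written prompt that asks the model to pick relevant snippets.
    selection_prompt = (
        "You are a careful researcher. From the documents below, quote only "
        f"the passages relevant to: {question}\n\n" + "\n---\n".join(documents)
    )
    snippets = call_llm(selection_prompt)

    # Hop 2: another hand-written prompt that consumes the first output.
    answer_prompt = (
        f"Using only these passages:\n{snippets}\n\nAnswer the question: {question}"
    )
    return call_llm(answer_prompt)
```

Every string above has to be re-tuned whenever the model, the document format, or the task changes, which is exactly the brittleness DSPy sets out to remove.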
-```yaml -qdrant: - networkPolicies: - ingress: - - from: - - ipBlock: - cidr: 192.168.0.0/22 - - podSelector: - matchLabels: - app: client-app - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: client-namespace - - podSelector: - matchLabels: - app: traefik - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - ports: - - port: 6333 - protocol: TCP - - port: 6334 - protocol: TCP +Additionally, the manual prompting approach is highly unscalable, as any change in the underlying language model breaks the prompts and the pipeline. LMs are highly sensitive to prompts and slight changes in wording, context, or phrasing can significantly impact the model's output. Due to this, despite the functionality provided by frameworks like LangChain, developers often have to spend a lot of time engineering prompts to get the right results from LLMs. -``` +How do you build a system that’s less brittle and more predictable? Enter DSPy! -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#logging) Logging +[DSPy](https://github.com/stanfordnlp/dspy) is built on the paradigm that language models (LMs) should be programmed rather than prompted. The framework is designed for algorithmically optimizing and adapting LM prompts and weights, and focuses on replacing prompting techniques with a programming-centric approach. -You can access the logs with kubectl or the Kubernetes log management tool of your choice. For example: +DSPy treats the LM like a device and abstracts out the underlying complexities of prompting. To achieve this, DSPy introduces three simple building blocks: -```bash -kubectl -n qdrant-namespace logs -l app=qdrant,cluster-id=9a9f48c7-bb90-4fb2-816f-418a46a74b24 +### **Signatures** -``` +[Signatures](https://dspy.ai/learn/programming/signatures/) replace handwritten prompts and are written in natural language. They are simply declarations or specs of the behavior that you expect from the language model. Some examples are: -**Configuring log levels:** You can configure log levels for the databases individually in the configuration section of the Qdrant Cluster detail page. The log level for the **Qdrant Cloud Agent** and **Operator** can be set in the [Hybrid Cloud Environment configuration](https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/). +- question -> answer +- long_document -> summary +- context, question -> rationale, response -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#integrating-with-a-log-management-system) Integrating with a log management system +Rather than manually crafting complex prompts or engaging in extensive fine-tuning of LLMs, signatures allow for the automatic generation of optimized prompts. -You can integrate the logs into any log management system that supports Kubernetes. There are no Qdrant specific configurations necessary. Just configure the agents of your system to collect the logs from all Pods in the Qdrant namespace. +DSPy Signatures can be specified in two ways: -## [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#monitoring) Monitoring +1. Inline Signatures: Simple tasks can be defined in a concise format, like "question -> answer" for question-answering or "document -> summary" for summarization. -The Qdrant Cloud console gives you access to basic metrics about CPU, memory and disk usage of your Qdrant clusters. +2. 
Class-Based Signatures: More complex tasks might require class-based signatures, which can include additional instructions or descriptions about the inputs and outputs. For example, a class for emotion classification might clearly specify the range of emotions that can be classified. -If you want to integrate the Qdrant metrics into your own monitoring system, you can instruct it to scrape the following endpoints that provide metrics in a Prometheus/OpenTelemetry compatible format: +### **Modules** -- `/metrics` on port 6333 of every Qdrant database Pod, this provides metrics about each the database and its internals itself -- `/metrics` on port 9290 of the Qdrant Operator Pod, this provides metrics about the Operator, as well as the status of Qdrant Clusters and Snapshots -- `/metrics` on port 9090 of the Qdrant Cloud Agent Pod, this provides metrics about the Agent and its connection to the Qdrant Cloud control plane -- `/metrics` on port 8080 of the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) Pod, this provides metrics about the state of Kubernetes resources like Pods and PersistentVolumes within the Qdrant Hybrid Cloud namespace (useful, if you are not running kube-state-metrics cluster-wide anyway) +Modules take signatures as input, and automatically generate high-quality prompts. Inspired heavily from PyTorch, DSPy [modules](https://dspy.ai/learn/programming/modules/) eliminate the need for crafting prompts manually. -### [Anchor](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/\#grafana-dashboard) Grafana dashboard +The framework supports advanced modules like [dspy.ChainOfThought](https://dspy-docs.vercel.app/api/modules/ChainOfThought), which adds step-by-step rationalization before producing an output. The output not only provides answers but also rationales. Other modules include [dspy.ProgramOfThought](https://dspy-docs.vercel.app/api/modules/ProgramOfThought), which outputs code whose execution results dictate the response, and [dspy.ReAct](https://dspy-docs.vercel.app/api/modules/ReAct), an agent that uses tools to implement signatures. -If you scrape the above metrics into your own monitoring system, and your are using Grafana, you can use our [Grafana dashboard](https://github.com/qdrant/qdrant-cloud-grafana-dashboard) to visualize these metrics. +DSPy also offers modules like [dspy.MultiChainComparison](https://dspy-docs.vercel.app/api/modules/MultiChainComparison), which can compare multiple outputs from dspy.ChainOfThought in order to produce a final prediction. There are also utility modules like [dspy.majority](https://dspy.ai/learn/programming/modules/?h=modul#what-other-dspy-modules-are-there-how-can-i-use-them) for aggregating responses through voting. -![Grafa dashboard](https://qdrant.tech/documentation/cloud/cloud-grafana-dashboard.png) +Modules can be composed into larger programs, and you can compose multiple modules into bigger modules. This allows you to create complex, behavior-rich applications using language models. -##### Was this page useful? +### **Optimizers** -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +[Optimizers](https://dspy.ai/learn/optimization/optimizers/) take a set of modules that have been connected to create a pipeline, compile them into auto-optimized prompts, and maximize an outcome metric. -Thank you for your feedback! 
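As a rough sketch of what this compile step can look like, assuming the `BootstrapFewShot` optimizer mentioned later in this article, a toy metric, and a tiny hand-made training set:

```python
import dspy
from dspy.teleprompt import BootstrapFewShot

# Placeholder program: a single step with an inline signature.
qa = dspy.ChainOfThought("question -> answer")

def exact_match(example, prediction, trace=None):
    # Toy metric: a prediction counts as correct on an exact answer match.
    return example.answer.lower() == prediction.answer.lower()

trainset = [
    dspy.Example(
        question="Which algorithm does Qdrant use for its vector index?",
        answer="HNSW",
    ).with_inputs("question"),
    # ...more labeled examples...
]

# The optimizer runs the program over the training set, keeps the traces that
# pass the metric, and bakes them into optimized few-shot prompts.
optimizer = BootstrapFewShot(metric=exact_match)
compiled_qa = optimizer.compile(qa, trainset=trainset)
```

The compiled program is then used exactly like the original module, only with tuned prompts.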
🙏 +Essentially, optimizers are designed to generate, test, and refine prompts, and ensure that the final prompt is highly optimized for the specific dataset and task at hand. Using optimizers in the DSPy framework significantly simplifies the process of developing and refining LM applications by automating the prompt engineering process. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/networking-logging-monitoring.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +### **Building AI Applications with DSPy** -On this page: +A typical DSPy program requires the developer to follow the following 8 steps: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/hybrid-cloud/networking-logging-monitoring.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +1. **Defining the Task**: Identify the specific problem you want to solve, including the input and output formats. +2. **Defining the Pipeline**: Plan the sequence of operations needed to solve the task. Then craft the signatures and the modules. +3. **Testing with Examples**: Run the pipeline with a few examples to understand the initial performance. This helps in identifying immediate issues with the program and areas for improvement. +4. **Defining Your Data**: Prepare and structure your training and validation datasets. This is needed by the optimizer for training the model and evaluating its performance accurately. +5. **Defining Your Metric**: Choose metrics that will measure the success of your model. These metrics help the optimizer evaluate how well the model is performing. +6. **Collecting Zero-Shot Evaluations**: Run initial evaluations without prior training to establish a baseline. This helps in understanding the model’s capabilities and limitations out of the box. +7. **Compiling with a DSPy Optimizer**: Given the data and metric, you can now optimize the program. DSPy offers a variety of optimizers designed for different purposes. These optimizers can generate step-by-step examples, craft detailed instructions, and/or update language model prompts and weights as needed. +8. **Iterating**: Continuously refine each aspect of your task, from the pipeline and data to the metrics and evaluations. Iteration helps in gradually improving the model’s performance and adapting to new requirements. +9. -× -[Powered by](https://qdrant.tech/) +{{< figure src=/blog/dspy-vs-langchain/process.jpg caption="Process" >}} -<|page-188-lllmstxt|> -## optimize -- [Documentation](https://qdrant.tech/documentation/) -- [Guides](https://qdrant.tech/documentation/guides/) -- Optimize Performance +**Language Model Setup** -# [Anchor](https://qdrant.tech/documentation/guides/optimize/\#optimizing-qdrant-performance-three-scenarios) Optimizing Qdrant Performance: Three Scenarios +Setting up the LM in DSPy is easy. -Different use cases require different balances between memory usage, search speed, and precision. Qdrant is designed to be flexible and customizable so you can tune it to your specific needs. 
+```python +# pip install dspy -This guide will walk you three main optimization strategies: +import dspy -- High Speed Search & Low Memory Usage -- High Precision & Low Memory Usage -- High Precision & High Speed Search +llm = dspy.OpenAI(model='gpt-3.5-turbo-1106', max_tokens=300) -![qdrant resource tradeoffs](https://qdrant.tech/docs/tradeoff.png) +dspy.configure(lm=llm) -## [Anchor](https://qdrant.tech/documentation/guides/optimize/\#1-high-speed-search-with-low-memory-usage) 1\. High-Speed Search with Low Memory Usage +# Let's test this. First define a module (ChainOfThought) and assign it a signature (return an answer, given a question). -To achieve high search speed with minimal memory usage, you can store vectors on disk while minimizing the number of disk reads. Vector quantization is a technique that compresses vectors, allowing more of them to be stored in memory, thus reducing the need to read from disk. +qa = dspy.ChainOfThought('question -> answer') -To configure in-memory quantization, with on-disk original vectors, you need to create a collection with the following parameters: +# Then, run with the default LM configured. -- `on_disk`: Stores original vectors on disk. -- `quantization_config`: Compresses quantized vectors to `int8` using the `scalar` method. -- `always_ram`: Keeps quantized vectors in RAM. +response = qa(question="Where is Paris?") -httppythontypescriptrustjavacsharpgo +print(response.answer) -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - }, - "quantization_config": { - "scalar": { - "type": "int8", - "always_ram": true - } - } -} +``` + +You are not restricted to using one LLM in your program; you can use [multiple](https://dspy.ai/learn/programming/language_models/?h=language#using-multiple-lms). DSPy can be used with both managed models such as OpenAI, Cohere, Anyscale, Together, or PremAI as well as with local LLM deployments through vLLM, Ollama, or TGI server. All LLM calls are cached by default. + +**Vector Store Integration (Retrieval Model)** -``` +You can easily set up [Qdrant](/documentation/frameworks/dspy/) vector store to act as the retrieval model. 
To do so, follow these steps: ```python -from qdrant_client import QdrantClient, models +# pip install dspy-ai dspy-qdrant -client = QdrantClient(url="http://localhost:6333") +import dspy -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - always_ram=True, - ), - ), -) +from dspy_qdrant import QdrantRM -``` +from qdrant_client import QdrantClient -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +llm = dspy.OpenAI(model="gpt-3.5-turbo") -const client = new QdrantClient({ host: "localhost", port: 6333 }); +qdrant_client = QdrantClient() -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - on_disk: true, - }, - quantization_config: { - scalar: { - type: "int8", - always_ram: true, - }, - }, -}); +qdrant_rm = QdrantRM("collection-name", qdrant_client, k=3) + +dspy.settings.configure(lm=llm, rm=qdrant_rm) ``` -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +The above code sets up DSPy to use Qdrant (localhost), with collection-name as the default retrieval client. You can now build a RAG module in the following way: -let client = Qdrant::from_url("http://localhost:6334").build()?; +```python -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .quantization_config( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .always_ram(true), - ), - ) - .await?; +class RAG(dspy.Module): + def __init__(self, num_passages=5): + super().__init__() + + self.retrieve = dspy.Retrieve(k=num_passages) + self.generate_answer = dspy.ChainOfThought('context, question -> answer') # using inline signature + + def forward(self, question): + context = self.retrieve(question).passages + prediction = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=prediction.answer) ``` -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +Now you can use the RAG module like any Python module. 
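For instance, a minimal usage sketch (the question is illustrative, and it assumes the collection configured above already contains indexed passages):

```python
rag = RAG(num_passages=3)

# Calling the module runs forward(): retrieve passages via Qdrant, then generate.
prediction = rag(question="How does Qdrant reduce memory usage?")

print(prediction.context)  # the retrieved passages
print(prediction.answer)   # the generated answer
```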
-QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**Optimizing the Pipeline** -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setAlwaysRam(true) - .build()) - .build()) - .build()) - .get(); +In this step, DSPy requires you to create a training dataset and a metric function, which can help validate the output of your program. Using this, DSPy tunes the parameters (i.e., the prompts and/or the LM weights) to maximize the accuracy of the RAG pipeline. -``` +Using DSPy optimizers involves the following steps: -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +1. Set up your DSPy program with the desired signatures and modules. +2. Create a training and validation dataset, with example input and output that you expect from your DSPy program. +3. Choose an appropriate optimizer such as BootstrapFewShotWithRandomSearch, MIPRO, or BootstrapFinetune. +4. Create a metric function that evaluates the performance of the DSPy program. You can evaluate based on accuracy or quality of responses, or on a metric that’s relevant to your program. +5. Run the optimizer with the DSPy program, metric function, and training inputs. DSPy will compile the program and automatically adjust parameters and improve performance. +6. Use the compiled program to perform the task. Iterate and adapt if required. -var client = new QdrantClient("localhost", 6334); +To learn more about optimizing DSPy programs, read [this](https://dspy.ai/learn/optimization/optimizers/). -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, - quantizationConfig: new QuantizationConfig - { - Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } - } -); +DSPy is heavily influenced by PyTorch, and replaces complex prompting with reusable modules for common tasks. Instead of crafting specific prompts, you write code that DSPy automatically translates for the LLM. This, along with built-in optimizers, makes working with LLMs more systematic and efficient. -``` +### **Use Cases of DSPy** -```go -import ( - "context" +As we saw above, DSPy can be used to create fairly complex applications which require stacking multiple LM calls without the need for prompt engineering. Even though the framework is comparatively new - it started gaining popularity since November 2023 when it was first introduced - it has created a promising new direction for LLM-based applications. - "github.com/qdrant/go-client/qdrant" -) +Here are some of the possible uses of DSPy: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +**Automating Prompt Engineering**: DSPy automates the process of creating prompts for LLMs, and allows developers to focus on the core logic of their application. This is powerful as manual prompt engineering makes AI applications highly unscalable and brittle. 
-client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), - QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - AlwaysRam: qdrant.PtrOf(true), - }), -}) +**Building Chatbots**: The modular design of DSPy makes it well-suited for creating chatbots with improved response quality and faster development cycles. DSPy's automatic prompting and optimizers can help ensure chatbots generate consistent and informative responses across different conversation contexts. -``` +**Complex Information Retrieval Systems**: DSPy programs can be easily integrated with vector stores, and used to build multi-step information retrieval systems with stacked calls to the LLM. This can be used to build highly sophisticated retrieval systems. For example, DSPy can be used to develop custom search engines that understand complex user queries and retrieve the most relevant information from vector stores. -### [Anchor](https://qdrant.tech/documentation/guides/optimize/\#disable-rescoring-for-faster-search-optional) Disable Rescoring for Faster Search (optional) +**Improving LLM Pipelines**: One of the best uses of DSPy is to optimize LLM pipelines. DSPy's modular design greatly simplifies the integration of LLMs into existing workflows. Additionally, DSPy's built-in optimizers can help fine-tune LLM pipelines based on desired metrics. -This is completely optional. Disabling rescoring with search `params` can further reduce the number of disk reads. Note that this might slightly decrease precision. +**Multi-Hop Question-Answering**: Multi-hop question-answering involves answering complex questions that require reasoning over multiple pieces of information, which are often scattered across different documents or sections of text. With DSPy, users can leverage its automated prompt engineering capabilities to develop prompts that effectively guide the model on how to piece together information from various sources. -httppythontypescriptrustjavacsharpgo +## **Comparative Analysis: DSPy vs LangChain** -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "params": { - "quantization": { - "rescore": false - } - }, - "limit": 10 -} +DSPy and LangChain are both powerful frameworks for building AI applications, leveraging large language models (LLMs) and vector search technology. Below is a comparative analysis of their key features, performance, and use cases: -``` +| Feature | LangChain | DSPy | +| --- | --- | --- | +| Core Focus | Focus on providing a large number of building blocks to simplify the development of applications that use LLMs in conjunction with user-specified data sources. | Focus on automating and modularizing LLM interactions, eliminating manual prompt engineering and improving systematic reliability. | +| Approach | Utilizes modular components and chains that can be linked together using the LangChain Expression Language (LCEL). | Streamlines LLM interaction by prioritizing programming instead of prompting, and automating prompt refinement and weight tuning. | +| Complex Pipelines | Facilitates the creation of chains using LCEL, supporting asynchronous execution and integration with various data sources and APIs. 
| Simplifies multi-stage reasoning pipelines using modules and optimizers, and ensures scalability through less manual intervention. | +| Optimization | Relies on user expertise for prompt engineering and chaining of multiple LLM calls. | Includes built-in optimizers that automatically tune prompts and weights, and helps bring efficiency and effectiveness in LLM pipelines. | +| Community and Support | Large open-source community with extensive documentation and examples. | Emerging framework with growing community support, and bringing a paradigm-shift in LLM prompting. | -```python -from qdrant_client import QdrantClient, models +### **LangChain** -client = QdrantClient(url="http://localhost:6333") +Strengths: -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams(rescore=False) - ), -) +1. Data Sources and APIs: LangChain supports a wide variety of data sources and APIs, and allows seamless integration with different types of data. This makes it highly versatile for various AI applications​. +2. LangChain provides modular components that can be chained together and allows you to create complex AI workflows. LangChain Expression Language (LCEL) lets you use declarative syntax and makes it easier to build and manage workflows. +3. Since LangChain is an older framework, it has extensive documentation and thousands of examples that developers can take inspiration from. -``` +Weaknesses: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +1. For projects involving complex, multi-stage reasoning tasks, LangChain requires significant manual prompt engineering. This can be time-consuming and prone to errors​. +2. Scalability Issues: Managing and scaling workflows that require multiple LLM calls can be pretty challenging. +3. Developers need sound understanding of prompt engineering in order to build applications that require multiple calls to the LLM. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### **DSPy** -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - params: { - quantization: { - rescore: false, - }, - }, -}); +Strengths: -``` +1. DSPy automates the process of prompt generation and optimization, and significantly reduces the need for manual prompt engineering. This makes working with LLMs easier and helps build scalable AI workflows​. +2. The framework includes built-in optimizers like BootstrapFewShot and MIPRO, which automatically refine prompts and adapt them to specific datasets​. +3. DSPy uses general-purpose modules and optimizers to simplify the complexities of prompt engineering. This can help you create complex multi-step reasoning applications easily, without worrying about the intricacies of dealing with LLMs. +4. DSPy supports various LLMs, including the flexibility of using multiple LLMs in the same program. +5. By focusing on programming rather than prompting, DSPy ensures higher reliability and performance for AI applications, particularly those that require complex multi-stage reasoning​​. -```rust -use qdrant_client::qdrant::{ - QuantizationSearchParamsBuilder, QueryPointsBuilder, SearchParamsBuilder, -}; -use qdrant_client::Qdrant; +Weaknesses: -let client = Qdrant::from_url("http://localhost:6334").build()?; +1. As a newer framework, DSPy has a smaller community compared to LangChain. This means you will have limited availability of resources, examples, and community support​. +2. 
Although DSPy offers tutorials and guides, its documentation is less extensive than LangChain’s, which can pose challenges when you start​. +3. When starting with DSPy, you may feel limited to the paradigms and modules it provides. ​ -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .params( - SearchParamsBuilder::default() - .quantization(QuantizationSearchParamsBuilder::default().rescore(false)), - ), - ) - .await?; +## **Selecting the Ideal Framework for Your AI Project** -``` +When deciding between DSPy and LangChain for your AI project, you should consider the problem statement and choose the framework that best aligns with your project goals. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QuantizationSearchParams; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +Here are some guidelines: -import static io.qdrant.client.QueryFactory.nearest; +### **Project Type** -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +**LangChain**: LangChain is ideal for projects that require extensive integration with multiple data sources and APIs, especially projects that benefit from the wide range of document loaders, vector stores, and retrieval algorithms that it supports​. -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setParams( - SearchParams.newBuilder() - .setQuantization( - QuantizationSearchParams.newBuilder().setRescore(false).build()) - .build()) - .setLimit(3) - .build()) - .get(); +**DSPy**: DSPy is best suited for projects that involve complex multi-stage reasoning pipelines or those that may eventually need stacked LLM calls. DSPy’s systematic approach to prompt engineering and its ability to optimize LLM interactions can help create highly reliable AI applications​. -``` +### **Technical Expertise** -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +**LangChain**: As the complexity of the application grows, LangChain requires a good understanding of prompt engineering and expertise in chaining multiple LLM calls. -var client = new QdrantClient("localhost", 6334); +**DSPy**: Since DSPy is designed to abstract away the complexities of prompt engineering, it makes it easier for developers to focus on high-level logic rather than low-level prompt crafting. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - searchParams: new SearchParams - { - Quantization = new QuantizationSearchParams { Rescore = false } - }, - limit: 3 -); +### **Community and Support** -``` +**LangChain**: LangChain boasts a large and active community with extensive documentation, examples, and active contributions, and you will find it easier to get going. -```go -import ( - "context" +**DSPy**: Although newer and with a smaller community, DSPy is growing rapidly and offers tutorials and guides for some of the key use cases. DSPy may be more challenging to get started with, but its architecture makes it highly scalable. 
- "github.com/qdrant/go-client/qdrant" -) +### **Use Case Scenarios** -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +**Retrieval Augmented Generation (RAG) Applications** -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Params: &qdrant.SearchParams{ - Quantization: &qdrant.QuantizationSearchParams{ - Rescore: qdrant.PtrOf(true), - }, - }, -}) +**LangChain**: Excellent for building simple RAG applications due to its robust support for vector stores, document loaders, and retrieval algorithms. -``` +**DSPy**: Suitable for RAG applications requiring high reliability and automated prompt optimization, ensuring consistent performance across complex retrieval tasks. -## [Anchor](https://qdrant.tech/documentation/guides/optimize/\#2-high-precision-with-low-memory-usage) 2\. High Precision with Low Memory Usage +**Chatbots and Conversational AI** -If you require high precision but have limited RAM, you can store both vectors and the HNSW index on disk. This setup reduces memory usage while maintaining search precision. +**LangChain**: Provides a wide range of components for building conversational AI, making it easy to integrate LLMs with external APIs and services​​. -To store the vectors `on_disk`, you need to configure both the vectors and the HNSW index: +**DSPy**: Ideal for developing chatbots that need to handle complex, multi-stage conversations with high reliability and performance. DSPy’s automated optimizations ensure consistent and contextually accurate responses. -httppythontypescriptrustjavacsharpgo +**Complex Information Retrieval Systems** -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - }, - "hnsw_config": { - "on_disk": true - } -} +**LangChain**: Effective for projects that require seamless integration with various data sources and sophisticated retrieval capabilities​​. -``` +**DSPy**: Best for systems that involve complex multi-step retrieval processes, where prompt optimization and modular design can significantly enhance performance and reliability. -```python -from qdrant_client import QdrantClient, models +You can also choose to combine and use the best features of both. In fact, LangChain has released an [integration with DSPy](https://python.langchain.com/v0.1/docs/integrations/providers/dspy/) to simplify this process. This allows you to use some of the utility functions that LangChain provides, such as text splitter, directory loaders, or integrations with other data sources while using DSPy for the LM interactions. -client = QdrantClient(url="http://localhost:6333") +### Key Takeaways: -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True), - hnsw_config=models.HnswConfigDiff(on_disk=True), -) +- **LangChain's Flexibility:** LangChain integrates seamlessly with Qdrant, enabling streamlined vector embedding and retrieval for AI workflows. -``` +- **Optimized Retrieval:** Automate and enhance retrieval processes in multi-stage AI reasoning applications. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +- **Enhanced RAG Applications:** Fast and accurate retrieval of relevant document sections through vector similarity search. 
-const client = new QdrantClient({ host: "localhost", port: 6333 }); +- **Support for Complex AI:** LangChain integration facilitates the creation of advanced AI architectures requiring precise information retrieval. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - on_disk: true, - }, - hnsw_config: { - on_disk: true, - }, -}); +- **Streamlined AI Development:** Simplify managing and retrieving large datasets, leading to more efficient AI development cycles in LangChain and DSPy. -``` +- **Future AI Workflows:** Qdrant's role in optimizing retrieval will be crucial as AI frameworks like DSPy continue to evolve and scale. -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, HnswConfigDiffBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +## **Level Up Your AI Projects with Advanced Frameworks** -let client = Qdrant::from_url("http://localhost:6334").build()?; +LangChain and DSPy both offer unique capabilities and can help you build powerful AI applications. Qdrant integrates with both LangChain and DSPy, allowing you to leverage its performance, efficiency and security features in either scenario. LangChain is ideal for projects that require extensive integration with various data sources and APIs. On the other hand, DSPy offers a powerful paradigm for building complex multi-stage applications. For pulling together an AI application that doesn’t require much prompt engineering, use LangChain. However, pick DSPy when you need a systematic approach to prompt optimization and modular design, and need robustness and scalability for complex, multi-stage reasoning applications. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)) - .hnsw_config(HnswConfigDiffBuilder::default().on_disk(true)), - ) - .await?; +## **References** -``` +[https://python.langchain.com/v0.1/docs/get_started/introduction](https://python.langchain.com/v0.1/docs/get_started/introduction) -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.HnswConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +[DSPy Introduction](https://dspy.ai/) -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +<|page-378-lllmstxt|> +Google Summer of Code (#GSoC) is celebrating its 20th anniversary this year with the 2024 program. Over the past 20 years, 19K new contributors were introduced to #opensource through the program under the guidance of thousands of mentors from over 800 open-source organizations in various fields. Qdrant participated successfully in the program last year. Both projects, the UI Dashboard with unstructured data visualization and the advanced Geo Filtering, were completed in time and are now a part of the engine. One of the two young contributors joined the team and continues working on the project. 
-client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .build()) - .setHnswConfig(HnswConfigDiff.newBuilder().setOnDisk(true).build()) - .build()) - .get(); +We are thrilled to announce that Qdrant was 𝐍𝐎𝐓 đšđœđœđžđ©đ­đžđ into the GSoc 2024 program for unknown reasons, but instead, we are introducing our own đđđ«đšđ§đ­ đ’đźđŠđŠđžđ« 𝐹𝐟 𝐂𝐹𝐝𝐞 program with a stipend for contributors! To not reinvent the wheel, we follow all the timelines and rules of the official Google program. -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -var client = new QdrantClient("localhost", 6334); +## Our project ideas. +We have prepared some excellent project ideas. Take a look and choose if you want to contribute in Rust or a Python-based project. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, - hnswConfig: new HnswConfigDiff { OnDisk = true } -); -``` +➡ *WASM-based dimension reduction viz* 📊 -```go -import ( - "context" +Implement a dimension reduction algorithm in Rust, compile to WASM and integrate the WASM code with Qdrant Web UI. - "github.com/qdrant/go-client/qdrant" -) +➡ *Efficient BM25 and Okapi BM25, which uses the BERT Tokenizer* đŸ„‡ -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +BM25 and Okapi BM25 are popular ranking algorithms. Qdrant's FastEmbed supports dense embedding models. We need a fast, efficient, and massively parallel Rust implementation with Python bindings for these. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), - HnswConfig: &qdrant.HnswConfigDiff{ - OnDisk: qdrant.PtrOf(true), - }, -}) +➡ *ONNX Cross Encoders in Python* ⚔ -``` +Export a cross-encoder ranking models to operate on ONNX runtime and integrate this model with the Qdrant's FastEmbed to support efficient re-ranking -### [Anchor](https://qdrant.tech/documentation/guides/optimize/\#improving-precision) Improving Precision +➡ *Ranking Fusion Algorithms implementation in Rust* đŸ§Ș -Increase the `ef` and `m` parameters of the HNSW index to improve precision, even with limited RAM: +Develop Rust implementations of various ranking fusion algorithms including but not limited to Reciprocal Rank Fusion (RRF). For a complete list, see: https://github.com/AmenRa/ranx +and create Python bindings for the implemented Rust modules. -```json -... -"hnsw_config": { - "m": 64, - "ef_construct": 512, - "on_disk": true -} -... +➡ *Setup Jepsen to test Qdrant’s distributed guarantees* 💣 -``` +Design and write Jepsen tests based on implementations for other Databases and create a report or blog with the findings. -**Note:** The speed of this setup depends on the disk’s IOPS (Input/Output Operations Per Second). -You can use [fio](https://gist.github.com/superboum/aaa45d305700a7873a8ebbab1abddf2b) to measure disk IOPS. 
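+For readers unfamiliar with Reciprocal Rank Fusion (RRF), mentioned in the ranking fusion idea above, here is a small illustrative Python sketch of the standard RRF scoring rule (the project itself calls for a Rust implementation with Python bindings). The constant k=60 is the commonly used default.
+
+```python
+from collections import defaultdict
+
+def reciprocal_rank_fusion(rankings: list[list[str]], k: int = 60) -> list[str]:
+    """Fuse several ranked lists of document IDs with RRF.
+
+    Each document scores sum(1 / (k + rank)) over the lists it appears in;
+    higher fused scores rank first.
+    """
+    scores = defaultdict(float)
+    for ranked_list in rankings:
+        for rank, doc_id in enumerate(ranked_list, start=1):
+            scores[doc_id] += 1.0 / (k + rank)
+    return sorted(scores, key=scores.get, reverse=True)
+
+# Example: fuse a keyword-based ranking with a dense-vector ranking.
+print(reciprocal_rank_fusion([["d1", "d2", "d3"], ["d3", "d1", "d4"]]))
+```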
+See all details on our Notion page: https://www.notion.so/qdrant/GSoC-2024-ideas-1dfcc01070094d87bce104623c4c1110 -## [Anchor](https://qdrant.tech/documentation/guides/optimize/\#3-high-precision-with-high-speed-search) 3\. High Precision with High-Speed Search -For scenarios requiring both high speed and high precision, keep as much data in RAM as possible. Apply quantization with re-scoring for tunable accuracy. +Contributor application period begins on March 18th. We will accept applications via email. Let's contribute and celebrate together! -Here is how you can configure scalar quantization for a collection: +In open-source, we trust! đŸŠ€đŸ€˜đŸš€ -httppythontypescriptrustjavacsharpgo +<|page-379-lllmstxt|> +One of the major promises of artificial intelligence is its potential to +accelerate efficiency and productivity within businesses, empowering employees +and teams in their daily tasks. The French company [Dust](https://dust.tt/), co-founded by former +Open AI Research Engineer [Stanislas Polu](https://www.linkedin.com/in/spolu/), set out to deliver on this promise by +providing businesses and teams with an expansive platform for building +customizable and secure AI assistants. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "quantization_config": { - "scalar": { - "type": "int8", - "always_ram": true - } - } -} +## Challenge -``` +"The past year has shown that large language models (LLMs) are very useful but +complicated to deploy," Polu says, especially in the context of their +application across business functions. This is why he believes that the goal of +augmenting human productivity at scale is especially a product unlock and not +only a research unlock, with the goal to identify the best way for companies to +leverage these models. Therefore, Dust is creating a product that sits between +humans and the large language models, with the focus on supporting the work of +a team within the company to ultimately enhance employee productivity. -```python -from qdrant_client import QdrantClient, models +A major challenge in leveraging leading LLMs like OpenAI, Anthropic, or Mistral +to their fullest for employees and teams lies in effectively addressing a +company's wide range of internal use cases. These use cases are typically very +general and fluid in nature, requiring the use of very large language models. +Due to the general nature of these use cases, it is very difficult to finetune +the models - even if financial resources and access to the model weights are +available. The main reason is that “the data that’s available in a company is +a drop in the bucket compared to the data that is needed to finetune such big +models accordingly,” Polu says, “which is why we believe that retrieval +augmented generation is the way to go until we get much better at fine tuning”. -client = QdrantClient(url="http://localhost:6333") +For successful retrieval augmented generation (RAG) in the context of employee +productivity, it is important to get access to the company data and to be able +to ingest the data that is considered ‘shared knowledge’ of the company. This +data usually sits in various SaaS applications across the organization. 
-client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - quantization_config=models.ScalarQuantization( - scalar=models.ScalarQuantizationConfig( - type=models.ScalarType.INT8, - always_ram=True, - ), - ), -) +## Solution -``` +Dust provides companies with the core platform to execute on their GenAI bet +for their teams by deploying LLMs across the organization and providing context +aware AI assistants through [RAG](https://qdrant.tech/rag/rag-evaluation-guide/) +. Users can manage so-called data sources within +Dust and upload files or directly connect to it via APIs to ingest data from +tools like Notion, Google Drive, or Slack. Dust then handles the chunking +strategy with the embeddings models and performs retrieval augmented generation. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +![solution-laptop-screen](/case-studies/dust/laptop-solutions.jpg) -const client = new QdrantClient({ host: "localhost", port: 6333 }); +For this, Dust required a vector database and evaluated different options +including Pinecone and Weaviate, but ultimately decided on Qdrant as the +solution of choice. “We particularly liked Qdrant because it is open-source, +written in Rust, and it has a well-designed API,” Polu says. For example, Dust +was looking for high control and visibility in the context of their rapidly +scaling demand, which made the fact that Qdrant is open-source a key driver for +selecting Qdrant. Also, Dust's existing system which is interfacing with Qdrant, +is written in Rust, which allowed Dust to create synergies with regards to +library support. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - quantization_config: { - scalar: { - type: "int8", - always_ram: true, - }, - }, -}); +When building their solution with Qdrant, Dust took a two step approach: -``` +1. **Get started quickly:** Initially, Dust wanted to get started quickly and opted for +[Qdrant Cloud](https://qdrant.to/cloud), Qdrant’s managed solution, to reduce the administrative load on +Dust’s end. In addition, they created clusters and deployed them on Google +Cloud since Dust wanted to have those run directly in their existing Google +Cloud environment. This added a lot of value as it allowed Dust to centralize +billing and increase security by having the instance live within the same VPC. +“The early setup worked out of the box nicely,” Polu says. -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, QuantizationType, ScalarQuantizationBuilder, - VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +2. **Scale and optimize:** As the load grew, Dust started to take advantage of Qdrant’s +features to tune the setup for optimization and scale. They started to look into +how they map and cache data, as well as applying some of Qdrant’s [built-in +compression features](/documentation/guides/quantization/). In particular, Dust leveraged the control of the [MMAP +payload threshold](/documentation/concepts/storage/#configuring-memmap-storage) as well as [Scalar Quantization](/articles/scalar-quantization/), which enabled Dust to manage +the balance between storing vectors on disk and keeping quantized vectors in RAM, +more effectively. “This allowed us to scale smoothly from there,” Polu says. 
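+As an illustration only (not Dust's actual configuration), combining these two levers with the Qdrant Python client looks roughly like this: original vectors are pushed to disk via memmap storage, while compact int8 scalar-quantized vectors stay in RAM. The collection name, vector size, and threshold are placeholders.
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")
+
+client.create_collection(
+    collection_name="company_documents",  # hypothetical collection name
+    vectors_config=models.VectorParams(
+        size=768,
+        distance=models.Distance.COSINE,
+        on_disk=True,  # keep the original vectors on disk
+    ),
+    optimizers_config=models.OptimizersConfigDiff(
+        memmap_threshold=20000,  # segments above this size use memmap storage
+    ),
+    quantization_config=models.ScalarQuantization(
+        scalar=models.ScalarQuantizationConfig(
+            type=models.ScalarType.INT8,
+            always_ram=True,  # keep the small quantized vectors in RAM
+        ),
+    ),
+)
+```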
-let client = Qdrant::from_url("http://localhost:6334").build()?; +## Results -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .quantization_config( - ScalarQuantizationBuilder::default() - .r#type(QuantizationType::Int8.into()) - .always_ram(true), - ), - ) - .await?; +Dust has seen success in using Qdrant as their vector database of choice, as Polu +acknowledges: “Qdrant’s ability to handle large-scale models and the flexibility +it offers in terms of data management has been crucial for us. The observability +features, such as historical graphs of RAM, Disk, and CPU, provided by Qdrant are +also particularly useful, allowing us to plan our scaling strategy effectively.” -``` +![“We were able to reduce the footprint of vectors in memory, which led to a significant cost reduction as +we don’t have to run lots of nodes in parallel. While being memory-bound, we were +able to push the same instances further with the help of quantization. While you +get pressure on MMAP in this case you maintain very good performance even if the +RAM is fully used. With this we were able to reduce our cost by 2x.” - Stanislas Polu, Co-Founder of Dust](/case-studies/dust/Dust-Quote.jpg) -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.QuantizationConfig; -import io.qdrant.client.grpc.Collections.QuantizationType; -import io.qdrant.client.grpc.Collections.ScalarQuantization; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +Dust was able to scale its application with Qdrant while maintaining low latency +across hundreds of thousands of collections with retrieval only taking +milliseconds, as well as maintaining high accuracy. Additionally, Polu highlights +the efficiency gains Dust was able to unlock with Qdrant: "We were able to reduce the footprint of vectors in memory, which led to a significant cost reduction as +we don’t have to run lots of nodes in parallel. While being memory-bound, we were +able to push the same instances further with the help of quantization. While you +get pressure on MMAP in this case you maintain very good performance even if the +RAM is fully used. With this we were able to reduce our cost by 2x." -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setQuantizationConfig( - QuantizationConfig.newBuilder() - .setScalar( - ScalarQuantization.newBuilder() - .setType(QuantizationType.Int8) - .setAlwaysRam(true) - .build()) - .build()) - .build()) - .get(); -``` +## Outlook -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Dust will continue to build out their platform, aiming to be the platform of +choice for companies to execute on their internal GenAI strategy, unlocking +company knowledge and driving team productivity. Over the coming months, Dust +will add more connections, such as Intercom, Jira, or Salesforce. 
Additionally, +Dust will expand on its structured data capabilities. -var client = new QdrantClient("localhost", 6334); +To learn more about how Dust uses Qdrant to help employees in their day to day +tasks, check out our [Vector Space Talk](https://www.youtube.com/watch?v=toIgkJuysQ4) featuring Stanislas Polu, Co-Founder of Dust. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine}, - quantizationConfig: new QuantizationConfig - { - Scalar = new ScalarQuantization { Type = QuantizationType.Int8, AlwaysRam = true } - } -); +<|page-380-lllmstxt|> +> *"If you haven't heard of the bitter lesson, it's actually a theorem. It's based on a blog post by Ricard Sutton, and it states basically that based on what we have learned from the development of machine learning and artificial intelligence systems in the previous decades, the methods that can leverage data and compute tends to or will eventually outperform the methods that are designed or handcrafted by humans.”*\ +-- Mikko LehtimĂ€ki +> -``` +Dr. Mikko LehtimĂ€ki is a data scientist, researcher and software engineer. He has delivered a range of data-driven solutions, from machine vision for robotics in circular economy to generative AI in journalism. Mikko is a co-founder of Softlandia, an innovative AI solutions provider. There, he leads the development of YOKOTAI, an LLM-based productivity booster that connects to enterprise data. -```go -import ( - "context" +Recently, Mikko has contributed software to Llama-index and Guardrails-AI, two leading open-source initiatives in the LLM space. He completed his PhD in the intersection of computational neuroscience and machine learning, which gives him a unique perspective on the design and implementation of AI systems. With Softlandia, Mikko also hosts chill hybrid-format data science meetups where everyone is welcome to participate. - "github.com/qdrant/go-client/qdrant" -) +***Listen to the episode on [Spotify](https://open.spotify.com/episode/5hAnDq7MH9qjjtYVjmsGrD?si=zByq7XXGSjOdLbXZDXTzoA), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/D8lOvz5xp5c).*** -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) + -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - QuantizationConfig: qdrant.NewQuantizationScalar(&qdrant.ScalarQuantization{ - Type: qdrant.QuantizationType_Int8, - AlwaysRam: qdrant.PtrOf(true), - }), -}) + -``` +## **Top takeaways:** -### [Anchor](https://qdrant.tech/documentation/guides/optimize/\#fine-tuning-search-parameters) Fine-Tuning Search Parameters +Aren’t you curious about what the bitter lesson is and how it plays out in generative language model workflows? -You can adjust search parameters like `hnsw_ef` and `exact` to balance between speed and precision: +Check it out as Mikko delves into the intricate world of retrieval-augmented generation, discussing how Yokot AI manages vast diverse data inputs and how focusing on re-ranking can massively improve LLM workflows and output quality. -**Key Parameters:** +5 key takeaways you’ll get from this episode: -- `hnsw_ef`: Number of neighbors to visit during search (higher value = better accuracy, slower speed). 
-- `exact`: Set to `true` for exact search, which is slower but more accurate. You can use it to compare results of the search with different `hnsw_ef` values versus the ground truth. +1. **The Development of Yokot AI:** Mikko detangles the complex web of how Softlandia's in-house stack is changing the game for language model applications. +2. **Unpacking Retrieval-Augmented Generation:** Learn the rocket science behind uploading documents and scraping the web for that nugget of insight, all through the prowess of Yokot AI's LLMs. +3. **The "Bitter Lesson" Theory:** Dive into the theorem that's shaking the foundations of AI, suggesting the supremacy of data and computing over human design. +4. **High-Quality Content Generation:** Understand how the system's handling of massive data inputs is propelling content quality to stratospheric heights. +5. **Future Proofing with Re-Ranking:** Discover why improving the re-ranking component might be akin to discovering a new universe within our AI landscapes. -httppythontypescriptrustjavacsharpgo +> Fun Fact: Yokot AI incorporates a retrieval augmented generation mechanism to facilitate the retrieval of relevant information, which allows users to upload and leverage their own documents or scrape data from the web. +> -```http -POST /collections/{collection_name}/points/query -{ - "query": [0.2, 0.1, 0.9, 0.7], - "params": { - "hnsw_ef": 128, - "exact": false - }, - "limit": 3 -} +## Show notes: + +00:00 Talk on retrieval for language models and Yokot AI platform.\ +06:24 Data flexibility in various languages leads progress.\ +10:45 User inputs document, system converts to vectors.\ +13:40 Enhance data quality, reduce duplicates, streamline processing.\ +19:20 Reducing complexity by focusing on re-ranker.\ +21:13 Retrieval process enhances efficiency of language model.\ +24:25 Information retrieval methods evolving, leveraging data, computing.\ +28:11 Optimal to run lightning on local hardware. + +## More Quotes from Mikko: -``` +"*We used to build image analysis on this type of features that we designed manually... Whereas now we can just feed a bunch of images to a transformer, and we'll get beautiful bounding boxes and semantic segmentation outputs without building rules into the system.*”\ +-- Mikko LehtimĂ€ki -```python -from qdrant_client import QdrantClient, models +*"We cannot just leave it out and hope that someday soon we will have a language model that doesn't require us fetching the data for it in such a sophisticated manner. The reranker is a component that can leverage data and compute quite efficiently, and it doesn't require that much manual craftmanship either.”*\ +-- Mikko LehtimĂ€ki -client = QdrantClient(url="http://localhost:6333") +*"We can augment the data we store, for example, by using multiple chunking strategies or generating question answer pairs from the user's documents, and then we'll embed those and look them up when the queries come in.”*\ +-- Mikko LehtimĂ€ki in improving data quality in rack stack -client.query_points( - collection_name="{collection_name}", - query=[0.2, 0.1, 0.9, 0.7], - search_params=models.SearchParams(hnsw_ef=128, exact=False), - limit=3, -) +## Transcript: +Demetrios: +What is happening? Everyone, it is great to have you here with us for yet another vector space talks. I have the pleasure of being joined by Mikko today, who is the co founder of Softlandia, and he's also lead data scientist. 
He's done all kinds of great software engineering and data science in his career, and currently he leads the development of Yokot AI, which I just learned the pronunciation of, and he's going to tell us all about it. But I'll give you the TLDR. It's an LLM based productivity booster that can connect to your data. What's going on, Mikko? How you doing, bro? -``` +Mikko LehtimĂ€ki: +Hey, thanks. Cool to be here. Yes. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Demetrios: +So, I have to say, I said it before we hit record or before we started going live, but I got to say it again. The talk title is spot on. Your talk title is the bitter lessons of retrieval in generative language model workflows. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Mikko LehtimĂ€ki: +Exactly. -client.query("{collection_name}", { - query: [0.2, 0.1, 0.9, 0.7], - params: { - hnsw_ef: 128, - exact: false, - }, - limit: 3, -}); +Demetrios: +So I'm guessing you've got a lot of hardship that you've been through, and you're going to hopefully tell us all about it so that we do not have to make the same mistakes as you did. We can be wise and learn from your mistakes before we have to make them ourselves, right? All right. That's a great segue into you getting into it, man. I know you got to talk. I know you got some slides to share, so feel free to start throwing those up on the screen. And for everyone that is here joining, feel free to add some questions in the chat. I'll be monitoring it so that in case you have any questions, I can jump in and make sure that Mikko answers them before he moves on to the next slide. All right, Mikko, I see your screen, bro. -``` +Demetrios: +This is good stuff. -```rust -use qdrant_client::qdrant::{QueryPointsBuilder, SearchParamsBuilder}; -use qdrant_client::Qdrant; +Mikko LehtimĂ€ki: +Cool. So, shall we get into? Yeah. My name is Mikko. I'm the chief data scientist here at Softlandia. I finished my phd last summer and have been doing the Softlandia for two years now. I'm also a contributor to some open source AI LLM libraries like Llama index and cartrails AI. So if you haven't checked those out ever, please do. Here at Softlandia, we are primarily an AI consultancy that focuses on end to end AI solutions, but we've also developed our in house stack for large language model applications, which I'll be discussing today. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Mikko LehtimĂ€ki: +So the topic of the talk is a bit provocative. Maybe it's a bitter lesson of retrieval for large language models, and it really stems from our experience in building production ready retrieval augmented generation solutions. I just want to say it's not really a lecture, so I'm going to tell you to do this or do that. I'll just try to walk you through the thought process that we've kind of adapted when we develop rack solutions, and we'll see if that resonates with you or not. So our LLM solution is called Yokot AI. It's really like a platform where enterprises can upload their own documents and get language model based insights from them. The typical example is question answering from your documents, but we're doing a bit more than that. For example, users can generate long form documents, leveraging their own data, and worrying about the token limitations that you typically run in when you ask an LLM to output something. 
-client - .query( - QueryPointsBuilder::new("{collection_name}") - .query(vec![0.2, 0.1, 0.9, 0.7]) - .limit(3) - .params(SearchParamsBuilder::default().hnsw_ef(128).exact(false)), - ) - .await?; +Mikko LehtimĂ€ki: +Here you see just a snapshot of the data management view that we have built. So users can bring their own documents or scrape the web, and then access the data with LLMS right away. This is the document generation output. It's longer than you typically see, and each section can be based on different data sources. We've got different generative flows, like we call them, so you can take your documents and change the style using llms. And of course, the typical chat view, which is really like the entry point, to also do these workflows. And you can see the sources that the language model is using when you're asking questions from your data. And this is all made possible with retrieval augmented generation. -``` +Mikko LehtimĂ€ki: +That happens behind the scenes. So when we ask the LLM to do a task, we're first fetching data from what was uploaded, and then everything goes from there. So we decide which data to pull, how to use it, how to generate the output, and how to present it to the user so that they can keep on conversing with the data or export it to their desired format, whatnot. But the primary challenge with this kind of system is that it is very open ended. So we don't really set restrictions on what kind of data the users can upload or what language the data is in. So, for example, we're based in Finland. Most of our customers are here in the Nordics. They talk, speak Finnish, Swedish. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.SearchParams; +Mikko LehtimĂ€ki: +Most of their data is in English, because why not? And they can just use whatever language they feel with the system. So we don't want to restrict any of that. The other thing is the chat view as an interface, it really doesn't set much limits. So the users have the freedom to do the task that they choose with the system. So the possibilities are really broad that we have to prepare for. So that's what we are building. Now, if you haven't heard of the bitter lesson, it's actually a theorem. It's based on a blog post by Ricard Sutton, and it states basically that based on what we have learned from the development of machine learning and artificial intelligence systems in the previous decades, the methods that can leverage data and compute tends to or will eventually outperform the methods that are designed or handcrafted by humans. -import static io.qdrant.client.QueryFactory.nearest; +Mikko LehtimĂ€ki: +So for example, I have an illustration here showing how this has manifested in image analysis. So on the left hand side, you see the output from an operation that extracts gradients from images. We used to build image analysis on this type of features that we designed manually. We would run some kind of edge extraction, we would count corners, we would compute the edge distances and design the features by hand in order to work with image data. Whereas now we can just feed a bunch of images to a transformer, and we'll get beautiful bounding boxes and semantic segmentation outputs without building rules into the system. So that's a prime example of the bitter lesson in action. 
Now, if we take this to the context of rack or retrieval augmented generation, let's have a look first at the simple rack architecture. Why do we do this in the first place? Well, it's because the language models themselves, they don't have up to date data because they've been trained a while ago. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Mikko LehtimĂ€ki: +You don't really even know when. So we need to give them access to more recent data, and we need a method for doing that. And the other thing is problems like hallucinations. We found that if you just ask the model a question that is in the training data, you won't get always reliable results. But if you can crown the model's answers with data, you will get more factual results. So this is what can be done with the rack as well. And the final thing is that we just cannot give a book, for example, in one go the language model, because even if theoretically it could read the input in one go, the result quality that you get from the language model is going to suffer if you feed it too much data at once. So this is why we have designed retrieval augmented generation architectures. -client.queryAsync( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(nearest(0.2f, 0.1f, 0.9f, 0.7f)) - .setParams(SearchParams.newBuilder().setHnswEf(128).setExact(false).build()) - .setLimit(3) - .build()) - .get(); +Mikko LehtimĂ€ki: +And if we look at this system on the bottom, you see the typical data ingestion. So the user gives a document, we slice it to small chunks, and we compute a numerical representation with vector embeddings and store those in a vector database. Why a vector database? Because it's really efficient to retrieve vectors from it when we get users query. So that is also embedded and it's used to look up relevant sources from the data that was previously uploaded efficiently directly on the database, and then we can fit the resulting text, the language model, to synthesize an answer. And this is how the RHe works in very basic form. Now you can see that if you have only a single document that you work with, it's nice if the problem set that you want to solve is very constrained, but the more data you can bring to your system, the more workflows you can build on that data. So if you have, for example, access to a complete book or many books, it's easy to see you can also generate higher quality content from that data. So this architecture really must be such that it can also make use of those larger amounts of data. -``` +Mikko LehtimĂ€ki: +Anyway, once you implement this for the first time, it really feels like magic. It tends to work quite nicely, but soon you'll notice that it's not suitable for all kinds of tasks. Like you will see sometimes that, for example, the lists. If you retrieve lists, they may be broken. If you ask questions that are document comparisons, you may not get complete results. If you run summarization tasks without thinking about it anymore, then that will most likely lead to super results. So we'll have to extend the architecture quite a bit to take into account all the use cases that we want to enable with bigger amounts of data that the users upload. And this is what it may look like once you've gone through a few design iterations. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Mikko LehtimĂ€ki: +So let's see, what steps can we add to our rack stack in order to make it deliver better quality results? 
If we start from the bottom again, we can see that we try to enhance the quality of the data that we upload by adding steps to the data ingestion pipeline. We can augment the data we store, for example, by using multiple chunking strategies or generating question answer pairs from the user's documents, and then we'll embed those and look them up when the queries come in. At the same time, we can reduce the data we upload, so we want to make sure there are no duplicates. We want to clean low quality things like HTML stuff, and we also may want to add some metadata so that certain data, for example references, can be excluded from the search results if they're not needed to run the tasks that we like to do. We've modeled this as a stream processing pipeline, by the way. So we're using Bytewax, which is another really nice open source framework. Just a tiny advertisement we're going to have a workshop with Bytewax about rack on February 16, so keep your eyes open for that. At the center I have added different databases and different retrieval methods. -var client = new QdrantClient("localhost", 6334); +Mikko LehtimĂ€ki: +We may, for example, add keyword based retrieval and metadata filters. The nice thing is that you can do all of this with quattron if you like. So that can be like a one stop shop for your document data. But some users may want to experiment with different databases, like graph databases or NoSQL databases and just ordinary SQL databases as well. They can enable different kinds of use cases really. So it's up to your service which one is really useful for you. If we look more to the left, we have a component called query planner and some query routers. And this really determines the response strategy. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - searchParams: new SearchParams { HnswEf = 128, Exact = false }, - limit: 3 -); +Mikko LehtimĂ€ki: +So when you get the query from the user, for example, you want to take different steps in order to answer it. For example, you may want to decompose the query to small questions that you answer individually, and each individual question may take a different path. So you may want to do a query based on metadata, for example pages five and six from a document. Or you may want to look up based on keywords full each page or chunk with a specific word. And there's really like a massive amount of choices how this can go. Another example is generating hypothetical documents based on the query and embedding those rather than the query itself. That will in some cases lead to higher quality retrieval results. But now all this leads into the right side of the query path. -``` +Mikko LehtimĂ€ki: +So here we have a re ranker. So if we implement all of this, we end up really retrieving a lot of data. We typically will retrieve more than it makes sense to give to the language model in a single call. So we can add a re ranker step here and it will firstly filter out low quality retrieved content and secondly, it will put the higher quality content on the top of the retrieved documents. And now when you pass this reranked content to the language model, it should be able to pay better attention to the details that actually matter given the query. And this should lead to you better managing the amount of data that you have to handle with your final response generator, LLM. 
And it should also make the response generator a bit faster because you will be feeding slightly less data in one go. The simplest way to build a re ranker is probably just asking a large language model to re rank or summarize the content that you've retrieved before you feed it to the language model. -```go -import ( - "context" +Mikko LehtimĂ€ki: +That's one way to do it. So yeah, that's a lot of complexity and honestly, we're not doing all of this right now with Yokot AI, either. We've tried all of it in different scopes, but really it's a lot of logic to maintain. And to me this just like screams the bitter lesson, because we're building so many steps, so much logic, so many rules into the system, when really all of this is done just because the language model can't be trusted, or it can't be with the current architectures trained reliably, or cannot be trained in real time with the current approaches that we have. So there's one thing in this picture, in my opinion, that is more promising than the others for leveraging data and compute, which should dominate the quality of the solution in the long term. And if we focus only on that, or not only, but if we focus heavily on that part of the process, we should be able to eliminate some complexity elsewhere. So if you're watching the recording, you can pause and think what this component may be. But in my opinion, it is the re ranker at the end. - "github.com/qdrant/go-client/qdrant" -) +Mikko LehtimĂ€ki: +And why is that? Well, of course you could argue that the language model itself is one, but with the current architectures that we have, I think we need the retrieval process. We cannot just leave it out and hope that someday soon we will have a language model that doesn't require us fetching the data for it in such a sophisticated manner. The reranker is a component that can leverage data and compute quite efficiently, and it doesn't require that much manual craftmanship either. It's a stakes in samples and outputs samples, and it plays together really well with efficient vector search that we have available now. Like quatrant being a prime example of that. The vector search is an initial filtering step, and then the re ranker is the secondary step that makes sure that we get the highest possible quality data to the final LLM. And the efficiency of the re ranker really comes from the fact that it doesn't have to be a full blown generative language model so often it is a language model, but it doesn't have to have the ability to generate GPT four level content. It just needs to understand, and in some, maybe even a very fixed way, communicate the importance of the inputs that you give it. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Mikko LehtimĂ€ki: +So typically the inputs are the user's query and the data that was retrieved. Like I mentioned earlier, the easiest way to use a read ranker is probably asking a large language model to rerank your chunks or sentences that you retrieved. But there are also models that have been trained specifically for this, the Colbert model being a primary example of that and we also have to remember that the rerankers have been around for a long time. They've been used in traditional search engines for a good while. We just now require a bit higher quality from them because there's no user checking the search results and deciding which of them is relevant. 
After the fact that the re ranking has already been run, we need to trust that the output of the re ranker is high quality and can be given to the language model. So you can probably get plenty of ideas from the literature as well. But the easiest way is definitely to use LLM behind a simple API. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQuery(0.2, 0.1, 0.9, 0.7), - Params: &qdrant.SearchParams{ - HnswEf: qdrant.PtrOf(uint64(128)), - Exact: qdrant.PtrOf(false), - }, -}) +Mikko LehtimĂ€ki: +And that's not to say that you should ignore the rest like the query planner is of course a useful component, and the different methods of retrieval are still relevant for different types of user queries. So yeah, that's how I think the bitter lesson is realizing in these rack architectures I've collected here some methods that are recent or interesting in my opinion. But like I said, there's a lot of existing information from information retrieval research that is probably going to be rediscovered in the near future. So if we summarize the bitter lesson which we have or are experiencing firsthand, states that the methods that leverage data and compute will outperform the handcrafted approaches. And if we focus on the re ranking component in the RHE, we'll be able to eliminate some complexity elsewhere in the process. And it's good to keep in mind that we're of course all the time waiting for advances in the large language model technology. But those advances will very likely benefit the re ranker component as well. So keep that in mind when you find new, interesting research. -``` +Mikko LehtimĂ€ki: +Cool. That's pretty much my argument finally there. I hope somebody finds it interesting. -## [Anchor](https://qdrant.tech/documentation/guides/optimize/\#balancing-latency-and-throughput) Balancing Latency and Throughput +Demetrios: +Very cool. It was bitter like a black cup of coffee, or bitter like dark chocolate. I really like these lessons that you've learned, and I appreciate you sharing them with us. I know the re ranking and just the retrieval evaluation aspect is something on a lot of people's minds right now, and I know a few people at Qdrant are actively thinking about that too, and how to make it easier. So it's cool that you've been through it, you've felt the pain, and you also are able to share what has helped you. And so I appreciate that. In case anyone has any questions, now would be the time to ask them. Otherwise we will take it offline and we'll let everyone reach out to you on LinkedIn, and I can share your LinkedIn profile in the chat to make it real easy for people to reach out if they want to, because this was cool, man. -When optimizing search performance, latency and throughput are two main metrics to consider: +Demetrios: +This was very cool, and I appreciate it. -- **Latency:** Time taken for a single request. -- **Throughput:** Number of requests handled per second. +Mikko LehtimĂ€ki: +Thanks. I hope it's useful to someone. -The following optimization approaches are not mutually exclusive, but in some cases it might be preferable to optimize for one or another. +Demetrios: +Excellent. Well, if that is all, I guess I've got one question for you. Even though we are kind of running up on time, so it'll be like a lightning question. You mentioned how you showed the really descriptive diagram where you have everything on there, and it's kind of like the dream state or the dream outcome you're going for. 
What is next? What are you going to create out of that diagram that you don't have yet? -### [Anchor](https://qdrant.tech/documentation/guides/optimize/\#minimizing-latency) Minimizing Latency +Mikko LehtimĂ€ki: +You want the lightning answer would be really good to put this run on a local hardware completely. I know that's not maybe the algorithmic thing or not necessarily in the scope of Yoko AI, but if we could run this on a physical device in that form, that would be super. -To minimize latency, you can set up Qdrant to use as many cores as possible for a single request. -You can do this by setting the number of segments in the collection to be equal to the number of cores in the system. +Demetrios: +I like it. I like it. All right. Well, Mikko, thanks for everything and everyone that is out there. All you vector space astronauts. Have a great day. Morning, night, wherever you are at in the world or in space. And we will see you later. -In this case, each segment will be processed in parallel, and the final result will be obtained faster. +Demetrios: +Thanks. -httppythontypescriptrustjavacsharpgo +Mikko LehtimĂ€ki: +See you. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "optimizers_config": { - "default_segment_number": 16 - } -} +<|page-381-lllmstxt|> +> *"We have something like Qdrant, which is very geared towards doing Vector search. And so we understand the shape of the storage system now.”*\ +— Diptanu Gon Choudhury +> -``` +Diptanu Gon Choudhury is the founder of Tensorlake. They are building Indexify - an open-source scalable structured extraction engine for unstructured data to build near-real-time knowledgebase for AI/agent-driven workflows and query engines. Before building Indexify, Diptanu created the Nomad cluster scheduler at Hashicorp, inventor of the Titan/Titus cluster scheduler at Netflix, led the FBLearner machine learning platform, and built the real-time speech inference engine at Facebook. -```python -from qdrant_client import QdrantClient, models +***Listen to the episode on [Spotify](https://open.spotify.com/episode/6MSwo7urQAWE7EOxO7WTns?si=_s53wC0wR9C4uF8ngGYQlg), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/RoOgTxHkViA).*** -client = QdrantClient(url="http://localhost:6333") + -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - optimizers_config=models.OptimizersConfigDiff(default_segment_number=16), -) + -``` +## **Top takeaways:** -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Discover how reimagined data infrastructures revolutionize AI-agent workflows as Diptanu delves into Indexify, transforming raw data into real-time knowledge bases, and shares expert insights on optimizing rag-based applications, all amidst the ever-evolving landscape of Spark. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Here's What You'll Discover: -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - optimizers_config: { - default_segment_number: 16, - }, -}); +1. **Innovative Data Infrastructure**: Diptanu dives deep into how Indexify is revolutionizing the enterprise world by providing a sharper focus on data infrastructure and a refined abstraction for generative AI this year. +2. 
**AI-Copilot for Call Centers**: Learn how Indexify streamlines customer service with a real-time knowledge base, transforming how agents interact and resolve issues. +3. **Scaling Real-Time Indexing**: discover the system’s powerful capability to index content as it happens, enabling multiple extractors to run simultaneously. It’s all about the right model and the computing capacity for on-the-fly content generation. +4. **Revamping Developer Experience**: get a glimpse into the future as Diptanu chats with Demetrios about reimagining Spark to fit today's tech capabilities, vastly different from just two years ago! +5. **AI Agent Workflow Insights**: Understand the crux of AI agent-driven workflows, where models dynamically react to data, making orchestrated decisions in live environments. -``` +> Fun Fact: The development of Indexify by Diptanu was spurred by the rising use of Large Language Models in applications and the subsequent need for better data infrastructure to support these technologies. +> -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +## Show notes: -let client = Qdrant::from_url("http://localhost:6334").build()?; +00:00 AI's impact on model production and workflows.\ +05:15 Building agents need indexes for continuous updates.\ +09:27 Early RaG and LLMs adopters neglect data infrastructure.\ +12:32 Design partner creating copilot for call centers.\ +17:00 Efficient indexing and generation using scalable models.\ +20:47 Spark is versatile, used for many cases.\ +24:45 Recent survey paper on RAG covers tips.\ +26:57 Evaluation of various aspects of data generation.\ +28:45 Balancing trust and cost in factual accuracy. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .optimizers_config( - OptimizersConfigDiffBuilder::default().default_segment_number(16), - ), - ) - .await?; +## More Quotes from Diptanu: -``` +*"In 2017, when I started doing machine learning, it would take us six months to ship a good model in production. And here we are today, in January 2024, new models are coming out every week, and people are putting them in production.”*\ +-- Diptanu Gon Choudhury -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +*"Over a period of time, you want to extract new information out of existing data, because models are getting better continuously.”*\ +-- Diptanu Gon Choudhury -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +*"We are in the golden age of demos. Golden age of demos with LLMs. 
Almost anyone, I think with some programming knowledge can kind of like write a demo with an OpenAI API or with an embedding model and so on.”*\ +-- Diptanu Gon Choudhury -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder().setDefaultSegmentNumber(16).build()) - .build()) - .get(); +## Transcript: +Demetrios: +We are live, baby. This is it. Welcome back to another vector space talks. I'm here with my man Diptanu. He is the founder and creator of Tenterlake. They are building indexify, an open source, scalable, structured extraction engine for unstructured data to build near real time knowledge bases for AI agent driven workflows and query engines. And if it sounds like I just threw every buzzword in the book into that sentence, you can go ahead and say, bingo, we are here, and we're about to dissect what all that means in the next 30 minutes. So, dude, first of all, I got to just let everyone know who is here, that you are a bit of a hard hitter. -``` +Demetrios: +You've got some track record under some notches on your belt. We could say before you created Tensorlake, let's just let people know that you were at Hashicorp, you created the nomad cluster scheduler, and you were the inventor of Titus cluster scheduler at Netflix. You led the FB learner machine learning platform and built real time speech inference engine at Facebook. You may be one of the most decorated people we've had on and that I have had the pleasure of talking to, and that's saying a lot. I've talked to a lot of people in my day, so I want to dig in, man. First question I've got for you, it's a big one. What the hell do you mean by AI agent driven workflows? Are you talking to autonomous agents? Are you talking, like the voice agents? What's that? -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Diptanu Gon Choudhury: +Yeah, I was going to say that what a great last couple of years has been for AI. I mean, in context, learning has kind of, like, changed the way people do models and access models and use models in production, like at Facebook. In 2017, when I started doing machine learning, it would take us six months to ship a good model in production. And here we are today, in January 2024, new models are coming out every week, and people are putting them in production. It's a little bit of a Yolo where I feel like people have stopped measuring how well models are doing and just ship in production, but here we are. But I think underpinning all of this is kind of like this whole idea that models are capable of reasoning over data and non parametric knowledge to a certain extent. And what we are seeing now is workflows stop being completely heuristics driven, or as people say, like software 10 driven. And people are putting models in the picture where models are reacting to data that a workflow is seeing, and then people are using models behavior on the data and kind of like making the model decide what should the workflow do? And I think that's pretty much like, to me, what an agent is that an agent responds to information of the world and information which is external and kind of reacts to the information and kind of orchestrates some kind of business process or some kind of workflow, some kind of decision making in a workflow. 
-var client = new QdrantClient("localhost", 6334); +Diptanu Gon Choudhury: +That's what I mean by agents. And they can be like autonomous. They can be something that writes an email or writes a chat message or something like that. The spectrum is wide here. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - optimizersConfig: new OptimizersConfigDiff { DefaultSegmentNumber = 16 } -); +Demetrios: +Excellent. So next question, logical question is, and I will second what you're saying. Like the advances that we've seen in the last year, wow. And the times are a change in, we are trying to evaluate while in production. And I like the term, yeah, we just yoloed it, or as the young kids say now, or so I've heard, because I'm not one of them, but we just do it for the plot. So we are getting those models out there, we're seeing if they work. And I imagine you saw some funny quotes from the Chevrolet chat bot, that it was a chat bot on the Chevrolet support page, and it was asked if Teslas are better than Chevys. And it said, yeah, Teslas are better than Chevys. -``` +Demetrios: +So yes, that's what we do these days. This is 2024, baby. We just put it out there and test and prod. Anyway, getting back on topic, let's talk about indexify, because there was a whole lot of jargon that I said of what you do, give me the straight shooting answer. Break it down for me like I was five. Yeah. -```go -import ( - "context" +Diptanu Gon Choudhury: +So if you are building an agent today, which depends on augmented generation, like retrieval, augmented generation, and given that this is Qdrant's show, I'm assuming people are very much familiar with Arag and augmented generation. So if people are building applications where the data is external or non parametric, and the model needs to see updated information all the time, because let's say, the documents under the hood that the application is using for its knowledge base is changing, or someone is building a chat application where new chat messages are coming all the time, and the agent or the model needs to know about what is happening, then you need like an index, or a set of indexes, which are continuously updated. And you also, over a period of time, you want to extract new information out of existing data, because models are getting better continuously. And the other thing is, AI, until now, or until a couple of years back, used to be very domain oriented or task oriented, where modality was the key behind models. Now we are entering into a world where information being encoded in any form, documents, videos or whatever, are important to these workflows that people are building or these agents that people are building. And so you need capability to ingest any kind of data and then build indexes out of them. And indexes, in my opinion, are not just embedding indexes, they could be indexes of semi structured data. So let's say you have an invoice. - "github.com/qdrant/go-client/qdrant" -) +Diptanu Gon Choudhury: +You want to maybe transform that invoice into semi structured data of where the invoice is coming from or what are the line items and so on. So in a nutshell, you need good data infrastructure to store these indexes and serve these indexes. And also you need a scalable compute engine so that whenever new data comes in, you're able to index them appropriately and update the indexes and so on. 
And also you need capability to experiment, to add new extractors into your platform, add new models into your platform, and so on. Indexify helps you with all that, right? So indexify, imagine indexify to be an online service with an API so that developers can upload any form of unstructured data, and then a bunch of extractors run in parallel on the cluster and extract information out of this unstructured data, and then update indexes on something like Qdrant or postgres for semi structured data continuously. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +Okay? -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - OptimizersConfig: &qdrant.OptimizersConfigDiff{ - DefaultSegmentNumber: qdrant.PtrOf(uint64(16)), - }, -}) +Diptanu Gon Choudhury: +And you basically get that in a single application, in a single binary, which is distributed on your cluster. You wouldn't have any external dependencies other than storage systems, essentially, to have a very scalable data infrastructure for your Rag applications or for your LLM agents. -``` +Demetrios: +Excellent. So then talk to me about the inspiration for creating this. What was it that you saw that gave you that spark of, you know what? There needs to be something on the market that can handle this. Yeah. -### [Anchor](https://qdrant.tech/documentation/guides/optimize/\#maximizing-throughput) Maximizing Throughput +Diptanu Gon Choudhury: +Earlier this year I was working with founder of a generative AI startup here. I was looking at what they were doing, I was helping them out, and I saw that. And then I looked around, I looked around at what is happening. Not earlier this year as in 2023. Somewhere in early 2023, I was looking at how developers are building applications with llms, and we are in the golden age of demos. Golden age of demos with llms. Almost anyone, I think with some programming knowledge can kind of like write a demo with an OpenAI API or with an embedding model and so on. And I mostly saw that the data infrastructure part of those demos or those applications were very basic people would do like one shot transformation of data, build indexes and then do stuff, build an application on top. -To maximize throughput, configure Qdrant to use as many cores as possible to process multiple requests in parallel. +Diptanu Gon Choudhury: +And then I started talking to early adopters of RaG and llms in enterprises, and I started talking to them about how they're building their data pipelines and their data infrastructure for llms. And I feel like people were mostly excited about the application layer, right? A very less amount of thought was being put on the data infrastructure, and it was almost like built out of duct tape, right, of pipeline, like pipelines and workflows like RabbitMQ, like x, Y and z, very bespoke pipelines, which are good at one shot transformation of data. So you put in some documents on a queue, and then somehow the documents get embedded and put into something like Qdrant. But there was no thought about how do you re index? How do you add a new capability into your pipeline? Or how do you keep the whole system online, right? Keep the indexes online while reindexing and so on. 
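To make the moving pieces Diptanu describes a bit more concrete, here is a deliberately generic sketch of what one "extractor" feeding a vector index could look like. This is not Indexify's actual API, just a minimal illustration with the Qdrant Python client; the embedding model, collection name, and chunking rule are all assumptions.

```python
# Hypothetical extractor sketch: chunk a document, embed the chunks, and keep
# the vector index up to date as new content arrives. Only the qdrant-client
# and sentence-transformers calls are real; everything else is illustrative.
import uuid

from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

client = QdrantClient(url="http://localhost:6333")
encoder = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dimensional embeddings

client.create_collection(
    collection_name="knowledge-base",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

def extract_and_index(document: str, source: str) -> None:
    """Split a raw document into chunks and upsert each chunk as a point."""
    chunks = [c.strip() for c in document.split("\n\n") if c.strip()]
    client.upsert(
        collection_name="knowledge-base",
        points=[
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=encoder.encode(chunk).tolist(),
                payload={"text": chunk, "source": source},
            )
            for chunk in chunks
        ],
    )

extract_and_index("Refund policy...\n\nRefunds are issued within 14 days.", source="email")
```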
And so classically, if you talk to a distributed systems engineer, they would be, you know, this is a mapreduce problem, right? So there are tools like Spark, there are tools like any skills ray, and they would classically solve these problems, right? And if you go to Facebook, we use Spark for something like this, or like presto, or we have a ton of big data infrastructure for handling things like this. And I thought that in 2023 we need a better abstraction for doing something like this. The world is moving to our server less, right? Developers understand functions. Developer thinks about computers as functions and functions which are distributed on the cluster and can transform content into something that llms can consume. -To do that, use fewer segments (usually 2) of larger size (default 200Mb per segment) to handle more requests in parallel. +Diptanu Gon Choudhury: +And that was the inspiration I was thinking, what would it look like if we redid Spark or ray for generative AI in 2023? How can we make it so easy so that developers can write functions to extract content out of any form of unstructured data, right? You don't need to think about text, audio, video, or whatever. You write a function which can kind of handle a particular data type and then extract something out of it. And now how can we scale it? How can we give developers very transparently, like, all the abilities to manage indexes and serve indexes in production? And so that was the inspiration for it. I wanted to reimagine Mapreduce for generative AI. -Large segments benefit from the size of the index and overall smaller number of vector comparisons required to find the nearest neighbors. However, they will require more time to build the HNSW index. +Demetrios: +Wow. I like the vision you sent me over some ideas of different use cases that we can walk through, and I'd love to go through that and put it into actual tangible things that you've been seeing out there. And how you can plug it in to these different use cases. I think the first one that I wanted to look at was building a copilot for call center agents and what that actually looks like in practice. Yeah. -httppythontypescriptrustjavacsharpgo +Diptanu Gon Choudhury: +So I took that example because that was super close to my heart in the sense that we have a design partner like who is doing this. And you'll see that in a call center, the information that comes in into a call center or the information that an agent in a human being in a call center works with is very rich. In a call center you have phone calls coming in, you have chat messages coming in, you have emails going on, and then there are also documents which are knowledge bases for human beings to answer questions or make decisions on. Right. And so they're working with a lot of data and then they're always pulling up a lot of information. And so one of our design partner is like building a copilot for call centers essentially. And what they're doing is they want the humans in a call center to answer questions really easily based on the context of a conversation or a call that is happening with one of their users, or pull up up to date information about the policies of the company and so on. And so the way they are using indexify is that they ingest all the content, like the raw content that is coming in video, not video, actually, like audio emails, chat messages into indexify. 
-```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "optimizers_config": { - "default_segment_number": 2, - "max_segment_size": 5000000 - } -} +Diptanu Gon Choudhury: +And then they have a bunch of extractors which handle different type of modalities, right? Some extractors extract information out of emails. Like they would do email classification, they would do embedding of emails, they would do like entity extraction from emails. And so they are creating many different types of indexes from emails. Same with speech. Right? Like data that is coming on through calls. They would transcribe them first using ASR extractor, and from there on the speech would be embedded and the whole pipeline for a text would be invoked into it, and then the speech would be searchable. If someone wants to find out what conversation has happened, they would be able to look up things. There is a summarizer extractor, which is like looking at a phone call and then summarizing what the customer had called and so on. -``` +Diptanu Gon Choudhury: +So they are basically building a near real time knowledge base of one what is happening with the customer. And also they are pulling in information from their documents. So that's like one classic use case. Now the only dependency now they have is essentially like a blob storage system and serving infrastructure for indexes, like in this case, like Qdrant and postgres. And they have a bunch of extractors that they have written in house and some extractors that we have written, they're using them out of the box and they can scale the system to as much as they need. And it's kind of like giving them a high level abstraction of building indexes and using them in llms. -```python -from qdrant_client import QdrantClient, models +Demetrios: +So I really like this idea of how you have the unstructured and you have the semi structured and how those play together almost. And I think one thing that is very clear is how you've got the transcripts, you've got the embeddings that you're doing, but then you've also got documents that are very structured and maybe it's from the last call and it's like in some kind of a database. And I imagine we could say whatever, salesforce, it's in a salesforce and you've got it all there. And so there is some structure to that data. And now you want to be able to plug into all of that and you want to be able to, especially in this use case, the call center agents, human agents need to make decisions and they need to make decisions fast. Right. So the real time aspect really plays a part of that. -client = QdrantClient(url="http://localhost:6333") +Diptanu Gon Choudhury: +Exactly. -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - optimizers_config=models.OptimizersConfigDiff(default_segment_number=2, max_segment_size=5000000), -) +Demetrios: +You can't have it be something that it'll get back to you in 30 seconds, or maybe 30 seconds is okay, but really the less time the better. And so traditionally when I think about using llms, I kind of take real time off the table. Have you had luck with making it more real time? Yeah. -``` +Diptanu Gon Choudhury: +So there are two aspects of it. How quickly can your indexes be updated? As of last night, we can index all of Wikipedia under five minutes on AWS. We can run up to like 5000 extractors with indexify concurrently and parallel. 
I feel like we got the indexing part covered. Unless obviously you are using a model as behind an API where we don't have any control. But assuming you're using some kind of embedding model or some kind of extractor model, right, like a named entity extractor or an speech to text model that you control and you understand the I Ops, we can scale it out and our system can kind of handle the scale of getting it indexed really quickly. Now on the generation side, that's where it's a little bit more nuanced, right? Generation depends on how big the generation model is. If you're using GPD four, then obviously you would be playing with the latency budgets that OpenAI provides. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Diptanu Gon Choudhury: +If you're using some other form of models like mixture MoE or something which is very optimized and you have worked on making the model optimized, then obviously you can cut it down. So it depends on the end to end stack. It's not like a single piece of software. It's not like a monolithic piece of software. So it depends on a lot of different factors. But I can confidently claim that we have gotten the indexing side of real time aspects covered as long as the models people are using are reasonable and they have enough compute in their cluster. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Demetrios: +Yeah. Okay. Now talking again about the idea of rethinking the developer experience with this and almost reimagining what Spark would be if it were created today. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - optimizers_config: { - default_segment_number: 2, - max_segment_size: 5000000, - }, -}); +Diptanu Gon Choudhury: +Exactly. -``` +Demetrios: +How do you think that there are manifestations in what you've built that play off of things that could only happen because you created it today as opposed to even two years ago. -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +Diptanu Gon Choudhury: +Yeah. So I think, for example, take Spark, right? Spark was born out of big data, like the 2011 twelve era of big data. In fact, I was one of the committers on Apache Mesos, the cluster scheduler that Spark used for a long time. And then when I was at Hashicorp, we tried to contribute support for Nomad in Spark. What I'm trying to say is that Spark is a task scheduler at the end of the day and it uses an underlying scheduler. So the teams that manage spark today or any other similar tools, they have like tens or 15 people, or they're using like a hosted solution, which is super complex to manage. Right. A spark cluster is not easy to manage. -let client = Qdrant::from_url("http://localhost:6334").build()?; +Diptanu Gon Choudhury: +I'm not saying it's a bad thing or whatever. Software written at any given point in time reflect the world in which it was born. And so obviously it's from that era of systems engineering and so on. And since then, systems engineering has progressed quite a lot. I feel like we have learned how to make software which is scalable, but yet simpler to understand and to operate and so on. And the other big thing in spark that I feel like is missing or any skills, Ray, is that they are not natively integrated into the data stack. Right. They don't have an opinion on what the data stack is. 
-client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .optimizers_config( - OptimizersConfigDiffBuilder::default().default_segment_number(2).max_segment_size(5000000), - ), - ) - .await?; +Diptanu Gon Choudhury: +They're like excellent Mapreduce systems, and then the data stuff is layered on top. And to a certain extent that has allowed them to generalize to so many different use cases. People use spark for everything. At Facebook, I was using Spark for batch transcoding of speech, to text, for various use cases with a lot of issues under the hood. Right? So they are tied to the big data storage infrastructure. So when I am reimagining Spark, I almost can take the position that we are going to use blob storage for ingestion and writing raw data, and we will have low latency serving infrastructure in the form of something like postgres or something like clickhouse or something for serving like structured data or semi structured data. And then we have something like Qdrant, which is very geared towards doing vector search and so on. And so we understand the shape of the storage system now. -``` +Diptanu Gon Choudhury: +We understand that developers want to integrate with them. So now we can control the compute layer such that the compute layer is optimized for doing the compute and producing data such that they can be written in those data stores, right? So we understand the I Ops, right? The I O, what is it called? The I O characteristics of the underlying storage system really well. And we understand that the use case is that people want to consume those data in llms, right? So we can make design decisions such that how we write into those, into the storage system, how we serve very specifically for llms, that I feel like a developer would be making those decisions themselves, like if they were using some other tool. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +Demetrios: +Yeah, it does feel like optimizing for that and recognizing that spark is almost like a swiss army knife. As you mentioned, you can do a million things with it, but sometimes you don't want to do a million things. You just want to do one thing and you want it to be really easy to be able to do that one thing. I had a friend who worked at some enterprise and he was talking about how spark engineers have all the job security in the world, because a, like you said, you need a lot of them, and b, it's hard stuff being able to work on that and getting really deep and knowing it and the ins and outs of it. So I can feel where you're coming from on that one. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Diptanu Gon Choudhury: +Yeah, I mean, we basically integrated the compute engine with the storage so developers don't have to think about it. Plug in whatever storage you want. We support, obviously, like all the blob stores, and we support Qdrant and postgres right now, indexify in the future can even have other storage engines. And now all an application developer needs to do is deploy this on AWS or GCP or whatever, right? 
Have enough compute, point it to the storage systems, and then now build your application. You don't need to make any of the hard decisions or build a distributed systems by bringing together like five different tools and spend like five months building the data layer, focus on the application, build your agents. -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder() - .setDefaultSegmentNumber(2) - .setMaxSegmentSize(5000000) - .build() - ) - .build()) - .get(); +Demetrios: +So there is something else. As we are winding down, I want to ask you one last thing, and if anyone has any questions, feel free to throw them in the chat. I am monitoring that also, but I am wondering about advice that you have for people that are building rag based applications, because I feel like you've probably seen quite a few out there in the wild. And so what are some optimizations or some nice hacks that you've seen that have worked really well? Yeah. -``` +Diptanu Gon Choudhury: +So I think, first of all, there is a recent paper, like a rack survey paper. I really like it. Maybe you can have the link on the show notes if you have one. There was a recent survey paper, I really liked it, and it covers a lot of tips and tricks that people can use with Rag. But essentially, Rag is an information. Rag is like a two step process in its essence. One is the document selection process and the document reading process. Document selection is how do you retrieve the most important information out of million documents that might be there, and then the reading process is how do you jam them in the context of a model, and so that the model can kind of ground its generation based on the context. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Diptanu Gon Choudhury: +So I think the most tricky part here, and the part which has the most tips and tricks is the document selection part. And that is like a classic information retrieval problem. So I would suggest people doing a lot of experimentation around ranking algorithms, hitting different type of indexes, and refining the results by merging results from different indexes. One thing that always works for me is reducing the search space of the documents that I am selecting in a very systematic manner. So like using some kind of hybrid search where someone does the embedding lookup first, and then does the keyword lookup, or vice versa, or does lookups parallel and then merges results together? Those kind of things where the search space is narrowed down always works for me. -var client = new QdrantClient("localhost", 6334); +Demetrios: +So I think one of the Qdrant team members would love to know because I've been talking to them quite frequently about this, the evaluating of retrieval. Have you found any tricks or tips around that and evaluating the quality of what is retrieved? -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - optimizersConfig: new OptimizersConfigDiff { DefaultSegmentNumber = 2, MaxSegmentSize = 5000000 } -); +Diptanu Gon Choudhury: +So I haven't come across a golden one trick that fits every use case type thing like solution for evaluation. Evaluation is really hard. 
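The hybrid search approach Diptanu describes above, running an embedding lookup and a keyword-style lookup and merging the candidates, maps fairly directly onto Qdrant's Query API. Below is a minimal sketch, not the exact setup from the conversation: it assumes a collection created with a named dense vector "dense" and a named sparse vector "text", and the query values are placeholders.

```python
# Hybrid retrieval sketch: fetch candidates from a dense and a sparse
# (keyword-weight) index in one request and fuse them with reciprocal rank
# fusion, narrowing the search space before generation.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

dense_query = [0.1] * 384  # placeholder for a real query embedding
sparse_query = models.SparseVector(  # placeholder keyword weights (e.g. BM25/SPLADE)
    indices=[3, 17, 42], values=[0.8, 0.4, 0.2]
)

results = client.query_points(
    collection_name="docs",
    prefetch=[
        models.Prefetch(query=dense_query, using="dense", limit=50),
        models.Prefetch(query=sparse_query, using="text", limit=50),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),  # merge both candidate lists
    limit=10,
)
```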
There are open source projects like ragas who are trying to solve it, and everyone is trying to solve various, various aspects of evaluating like rag exactly. Some of them try to evaluate how accurate the results are, some people are trying to evaluate how diverse the answers are, and so on. I think the most important thing that our design partners care about is factual accuracy and factual accuracy. One process that has worked really well is like having a critique model. So let the generation model generate some data and then have a critique model go and try to find citations and look up how accurate the data is, how accurate the generation is, and then feed that back into the system. One another thing like going back to the previous point is what tricks can someone use for doing rag really well? I feel like people don't fine tune embedding models that much. -``` +Diptanu Gon Choudhury: +I think if people are using an embedding model, like sentence transformer or anything like off the shelf, they should look into fine tuning the embedding models on their data set that they are embedding. And I think a combination of fine tuning the embedding models and kind of like doing some factual accuracy checks lead to a long way in getting like rag working really well. -```go -import ( - "context" +Demetrios: +Yeah, it's an interesting one. And I'll probably leave it here on the extra model that is basically checking factual accuracy. You've always got these trade offs that you're playing with, right? And one of the trade offs is going to be, maybe you're making another LLM call, which could be more costly, but you're gaining trust or you're gaining confidence that what it's outputting is actually what it says it is. And it's actually factually correct, as you said. So it's like, what price can you put on trust? And we're going back to that whole thing that I saw on Chevy's website where they were saying that a Tesla is better. It's like that hopefully doesn't happen anymore as people deploy this stuff and they recognize that humans are cunning when it comes to playing around with chat bots. So this has been fascinating, man. I appreciate you coming on here and chatting me with it. - "github.com/qdrant/go-client/qdrant" -) +Demetrios: +I encourage everyone to go and either reach out to you on LinkedIn, I know you are on there, and we'll leave a link to your LinkedIn in the chat too. And if not, check out Tensorleg, check out indexify, and we will be in touch. Man, this was great. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Diptanu Gon Choudhury: +Yeah, same. It was really great chatting with you about this, Demetrius, and thanks for having me today. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - OptimizersConfig: &qdrant.OptimizersConfigDiff{ - DefaultSegmentNumber: qdrant.PtrOf(uint64(2)), - MaxSegmentSize: qdrant.PtrOf(uint64(5000000)), - }, -}) +Demetrios: +Cheers. I'll talk to you later. 
-``` +<|page-382-lllmstxt|> +# Qdrant x Dust: How Vector Search Helps Make Work Better with Stanislas Polu -## [Anchor](https://qdrant.tech/documentation/guides/optimize/\#summary) Summary +> *"We ultimately chose Qdrant due to its open-source nature, strong performance, being written in Rust, comprehensive documentation, and the feeling of control.”*\ +-- Stanislas Polu +> -By adjusting configurations like vector storage, quantization, and search parameters, you can optimize Qdrant for different use cases: +Stanislas Polu is the Co-Founder and an Engineer at Dust. He had previously sold a company to Stripe and spent 5 years there, seeing them grow from 80 to 3000 people. Then pivoted to research at OpenAI on large language models and mathematical reasoning capabilities. He started Dust 6 months ago to make work work better with LLMs. -- **Low Memory + High Speed:** Use vector quantization. -- **High Precision + Low Memory:** Store vectors and HNSW index on disk. -- **High Precision + High Speed:** Keep data in RAM, use quantization with re-scoring. -- **Latency vs. Throughput:** Adjust segment numbers based on the priority. -Choose the strategy that best fits your use case to get the most out of Qdrant’s performance capabilities. +***Listen to the episode on [Spotify](https://open.spotify.com/episode/2YgcSFjP7mKE0YpDGmSiq5?si=6BhlAMveSty4Yt7umPeHjA), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/1vKoiFAdorE).*** -##### Was this page useful? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No + -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/optimize.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. + -On this page: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/guides/optimize.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## **Top takeaways:** -× -[Powered by](https://qdrant.tech/) +Curious about the interplay of SaaS platforms and AI in improving productivity? Stanislas Polu dives into the intricacies of enterprise data management, the selective use of SaaS tools, and the role of customized AI assistants in streamlining workflows, all while sharing insights from his experiences at Stripe, OpenAI, and his latest venture, Dust. -<|page-189-lllmstxt|> -## cluster-scaling -- [Documentation](https://qdrant.tech/documentation/) -- [Cloud](https://qdrant.tech/documentation/cloud/) -- Scale Clusters -# [Anchor](https://qdrant.tech/documentation/cloud/cluster-scaling/\#scaling-qdrant-cloud-clusters) Scaling Qdrant Cloud Clusters +Here are 5 golden nuggets you'll unearth from tuning in: -The amount of data is always growing and at some point you might need to upgrade or downgrade the capacity of your cluster. -![Cluster Scaling](https://qdrant.tech/documentation/cloud/cluster-scaling.png) +1. **The SaaS Universe**: Stan will give you the lowdown on why jumping between different SaaS galaxies like Salesforce and Slack is crucial for your business data's gravitational pull. +2. **API Expansions**: Learn how pushing the boundaries of APIs to include global payment methods can alter the orbit of your company's growth. +3. 
**A Bot for Every Star**: Discover how creating targeted assistants over general ones can skyrocket team productivity across various use cases. +4. **Behind the Tech Telescope**: Stan discusses the decision-making behind opting for Qdrant for their database cosmos, including what triggered their switch. +5. **Integrating AI Stardust**: They're not just talking about Gen AI; they're actively guiding companies on how to leverage it effectively, placing practicality over flashiness. -There are different options for how it can be done. -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-scaling/\#vertical-scaling) Vertical Scaling +> Fun Fact: Stanislas Polu co-founded a company that was acquired by Stripe, providing him with the opportunity to work with Greg Brockman at Stripe. +> -Vertical scaling is the process of increasing the capacity of a cluster by adding or removing CPU, storage and memory resources on each database node. -You can start with a minimal cluster configuration of 2GB of RAM and resize it up to 64GB of RAM (or even more if desired) over the time step by step with the growing amount of data in your application. If your cluster consists of several nodes each node will need to be scaled to the same size. Please note that vertical cluster scaling will require a short downtime period to restart your cluster. In order to avoid a downtime you can make use of data replication, which can be configured on the collection level. Vertical scaling can be initiated on the cluster detail page via the button “scale”. +## Show notes: -If you want to scale your cluster down, the new, smaller memory size must be still sufficient to store all the data in the cluster. Otherwise, the database cluster could run out of memory and crash. Therefore, the new memory size must be at least as large as the current memory usage of the database cluster including a bit of buffer. Qdrant Cloud will automatically prevent you from scaling down the Qdrant database cluster with a too small memory size. -Note, that it is not possible to scale down the disk space of the cluster due to technical limitations of the underlying cloud providers. +00:00 Interview about an exciting career in AI technology.\ +06:20 Most workflows involve multiple SaaS applications.\ +09:16 Inquiring about history with Stripe and AI.\ +10:32 Stripe works on expanding worldwide payment methods.\ +14:10 Document insertion supports hierarchy for user experience.\ +18:29 Competing, yet friends in the same field.\ +21:45 Workspace solutions, marketplace, templates, and user feedback.\ +25:24 Avoid giving false hope; be accountable.\ +26:06 Model calls, external API calls, structured data.\ +30:19 Complex knobs, but powerful once understood. Excellent support.\ +33:01 Companies hire someone to support teams and find use cases. -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-scaling/\#horizontal-scaling) Horizontal Scaling -Vertical scaling can be an effective way to improve the performance of a cluster and extend the capacity, but it has some limitations. The main disadvantage of vertical scaling is that there are limits to how much a cluster can be expanded. At some point, adding more resources to a cluster can become impractical or cost-prohibitive. +## More Quotes from Stan: -In such cases, horizontal scaling may be a more effective solution. -Horizontal scaling, also known as horizontal expansion, is the process of increasing the capacity of a cluster by adding more nodes and distributing the load and data among them. 
The horizontal scaling at Qdrant starts on the collection level. You have to choose the number of shards you want to distribute your collection around while creating the collection. Please refer to the [sharding documentation](https://qdrant.tech/documentation/guides/distributed_deployment/#sharding) section for details. +*"You really want to narrow the data exactly where that information lies. And that's where we're really relying hard on Qdrant as well. So the kind of indexing capabilities on top of the vector search."*\ +-- Stanislas Polu -After that, you can configure, or change the amount of Qdrant database nodes within a cluster during cluster creation, or on the cluster detail page via “Scale” button. -Important: The number of shards means the maximum amount of nodes you can add to your cluster. In the beginning, all the shards can reside on one node. With the growing amount of data you can add nodes to your cluster and move shards to the dedicated nodes using the [cluster setup API](https://qdrant.tech/documentation/guides/distributed_deployment/#cluster-scaling). +*"I think the benchmarking was really about quality of models, answers in the context of ritual augmented generation. So it's not as much as performance, but obviously, performance matters and that's why we love using Qdrant.”*\ +-- Stanislas Polu -When scaling down horizontally, the cloud platform will automatically ensure that any shards that are present on the nodes to be deleted, are moved to the remaining nodes. -We will be glad to consult you on an optimal strategy for scaling. +*"The workspace assistant are like the admin vetted the assistant, and it's kind of pushed to everyone by default.”*\ +-- Stanislas Polu -[Let us know](https://qdrant.tech/documentation/support/) your needs and decide together on a proper solution. -## [Anchor](https://qdrant.tech/documentation/cloud/cluster-scaling/\#resharding) Resharding +## Transcript: +Demetrios: +All right, so, my man, I think people are going to want to know all about you. This is a conversation that we have had planned for a while. I'm excited to chat about what you have been up to. You've had quite the run around when it comes to doing some really cool stuff. You spent a lot of time at Stripe in the early days and I imagine you were doing, doing lots of fun ML initiatives and then you started researching on llms at OpenAI. And recently you are doing the entrepreneurial thing and following the trend of starting a company and getting really cool stuff out the door with AI. I think we should just start with background on yourself. What did I miss in that quick introduction? -_Available as of Qdrant v1.13.0_ -When creating a collection, it has a specific number of shards. The ideal number of shards might change as your cluster evolves. +Stanislas Polu: +Okay, sounds good. Yeah, perfect. Now you didn't miss too much. Maybe the only point is that starting the current company, Dust, with Gabrielle, my co founder, with whom we started a Company together twelve years or maybe 14 years ago. -Resharding allows you to change the number of shards in your existing collections, both up and down, without having to recreate the collection from scratch. -Resharding is a transparent process, meaning that the collection is still available while resharding is going on without having downtime. This allows you to scale from one node to any number of nodes and back, keeping your data perfectly distributed without compromise. 
+Stanislas Polu: +I'm very bad with years that eventually got acquired to stripe. So that's how we joined Stripe, the both of us, pretty early. Stripe was 80 people when we joined, all the way to 2500 people and got to meet with and walk with Greg Brockman there. And that's how I found my way to OpenAI after stripe when I started interested in myself, in research at OpenAI, even if I'm not a trained researcher. -To increase the number of shards (reshard up), use the [Update collection cluster setup API](https://api.qdrant.tech/master/api-reference/distributed/update-collection-cluster) to initiate the resharding process: -```http -POST /collections/{collection_name}/cluster -{ - "start_resharding": { - "direction": "up", - "shard_key": null - } -} +Stanislas Polu: +I did research on fate, doing research. +On larger good models, reasoning capabilities, and in particular larger models mathematical reasoning capabilities. +And from there. +18 months ago, kind of decided to leave OpenAI with the motivation. That is pretty simple. +It's that basically the hypothesis is that. +It was pre chattivity, but basically those large language models, they're already extremely capable and yet they are completely under deployed compared to the potential they have. And so while research remains a very active subject and it's going to be. +A tailwind for the whole ecosystem, there's. -``` -To decrease the number of shards (reshard down), you may specify the `"down"` direction. +Stanislas Polu: +Probably a lot of to be done at the product layer, and most of the locks between us and deploying that technology in the world is probably sitting. +At the product layer as it is sitting at the research layer. +And so that's kind of the hypothesis behind dust, is we try to explore at the product layer what it means to interface between models and humans, try to make them happier and augment them. +With superpowers in their daily jobs. -The current status of resharding is listed in the [collection cluster info](https://api.qdrant.tech/v-1-12-x/api-reference/distributed/collection-cluster-info) which can be fetched with: -```http -GET /collections/{collection_name}/cluster +Demetrios: +So you say product layer, can you go into what you mean by that a little bit more? -``` -We always recommend to run an ongoing resharding operation till the end. But, if at any point the resharding operation needs to be aborted, you can use: +Stanislas Polu: +Well, basically we have a motto at dust, which is no gpu before PMF. And so the idea is that while it's extremely exciting to train models. It's extremely exciting to fine tune and align models. There is a ton to be done. +Above the model, not only to use. +Them as best as possible, but also to really find the interaction interfaces that make sense for humans to leverage that technology. And so we basically don't train any models ourselves today. +There's many reasons to that. +The first one is as an early startup. It's a fascinating subject and fascinating exercise. As an early startup, it's actually a very big investment to go into training. +Models because even if the costs are. +Not necessarily big in terms of compute. +It'S still research and development and pretty. +Hard research and development. It's basically research. We understand pretraining pretty well. We don't understand fine tuning that well. We believe it's a better idea to. 
-```http -POST /collections/{collection_name}/cluster -{ - "abort_resharding": {} -} -``` +Stanislas Polu: +Really try to explore the product layer. +The image I use generally is that training a model is very sexy and it's exciting, but really you're building a small rock that will get submerged by the waves of bigger models coming in the future. And iterating and positioning yourself at the interface between humans and those models at. +The product layer is more akin to. +Building a surfboard that you will be. +Able to use to surf those same waves. -A few things to be aware of with regards to resharding: -- during resharding, performance of your cluster may be slightly reduced -- during resharding, reported point counts will not be accurate -- resharding may be a long running operation on huge collections -- you can only run one resharding operation per collection at a time +Demetrios: +I like that because I am a big surfer and I have a lot. -##### Was this page useful? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Stanislas Polu: +Of fun doing it. -Thank you for your feedback! 🙏 -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-scaling.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Demetrios: +Now tell me about are you going after verticals? Are you going after different areas in a market, a certain subset of the market? -On this page: -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud/cluster-scaling.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Stanislas Polu: +How do you look at that? Yeah. +Basically the idea is to look at productivity within the enterprise. So we're first focusing on internal use. +By teams, internal teams of that technology. +We're not at all going after external use. So backing products that embed AI or having on projects maybe exposed through our users to actual end customers. So we really focused on the internal use case. So the first thing you want to. +Do is obviously if you're interested in. +Productivity within enterprise, you definitely want to have the enterprise data, right? Because otherwise there's a ton that can be done with Chat GPT as an example. But there is so much more that can be done when you have context. +On the data that comes from the company you're in. +That's pretty much kind of the use. +Case we're focusing on, and we're making. +A bet, which is a crazy bet to answer your question, that there's actually value in being quite horizontal for now. So that comes with a lot of risks because an horizontal product is hard. -× -[Powered by](https://qdrant.tech/) +Stanislas Polu: +To read and it's hard to figure. +Out how to use it. But at the same time, the reality is that when you are somebody working in a team, even if you spend. +A lot of time on one particular. +Application, let's say Salesforce for sales, or GitHub for engineers, or intercom for customer support, the reality of most of your workflows do involve many SaaS, meaning that you spend a lot of time in Salesforce, but you also spend a lot of time in slack and notion. Maybe, or we all spend as engineers a lot of time in GitHub, but we also use notion and slack a ton or Google Drive or whatnot. Jira. 
-<|page-190-lllmstxt|> -## large-scale-search -- [Documentation](https://qdrant.tech/documentation/) -- [Database tutorials](https://qdrant.tech/documentation/database-tutorials/) -- Large Scale Search -# [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#upload-and-search-large-collections-cost-efficiently) Upload and Search Large collections cost-efficiently +Demetrios: +Good old Jira. Everybody loves spending time in Jira. -| Time: 2 days | Level: Advanced | | | -| --- | --- | --- | --- | -In this tutorial, we will describe an approach to upload, index, and search a large volume of data cost-efficiently, -on an example of the real-world dataset [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/). +Stanislas Polu: +Yeah. And so basically, following our users where. +They are requires us to have access to those different SaaS, which requires us. +To be somewhat horizontal. +We had a bunch of signals that. +Kind of confirms that position, and yet. +We'Re still very conscious that it's a risky position. As an example, when we are benchmarked against other solutions that are purely verticalized, there is many instances where we actually do a better job because we have. +Access to all the data that matters within the company. -The goal of this tutorial is to demonstrate what minimal amount of resources is required to index and search a large dataset, -while still maintaining a reasonable search latency and accuracy. -All relevant code snippets are available in the [GitHub repository](https://github.com/qdrant/laion-400m-benchmark). +Demetrios: +Now, there is something very difficult when you have access to all of the data, and that is the data leakage issue and the data access. Right. How are you trying to conquer that hard problem? -The recommended Qdrant version for this tutorial is `v1.13.5` and higher. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#dataset) Dataset +Stanislas Polu: +Yeah, so we're basically focusing to continue. +Answering your questions through that other question. +I think we're focusing on tech companies. +That are less than 1000 people. And if you think about most recent tech companies, less than 1000 people. +There's been a wave of openness within. -The dataset we will use is [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/), a collection of approximately 400 million vectors obtained from -images extracted from a Common Crawl dataset. Each vector is 512-dimensional and generated using a [CLIP](https://openai.com/blog/clip/) model. -Vectors are associated with a number of metadata fields, such as `url`, `caption`, `LICENSE`, etc. +Stanislas Polu: +Companies in terms of data access, meaning that it's becoming rare to see people actually relying on complex ACL for the internal data. You basically generally have silos. You have the exec silo with remuneration and ladders and whatnot. And this one is definitely not the. +Kind of data we're touching. +And then for the rest, you generally have a lot of data that is. +Accessible by every employee within your company. +So that's not a perfect answer, but that's really kind of the approach we're taking today. We give a lot of control on. -The overall payload size is approximately 200 GB, and the vectors are 400 GB. -The dataset is available in the form of 409 chunks, each containing approximately 1M vectors. 
-We will use the following [python script](https://github.com/qdrant/laion-400m-benchmark/blob/master/upload.py) to upload dataset chunks one by one. +Stanislas Polu: +Which data comes into dust, but once. +It'S into dust, and that control is pretty granular, meaning that you can select. +Specific slack channels, or you can select. +Specific notion pages, or you can select specific Google Drive subfolders. But once you decide to put it in dust, every dust user has access to this. And so we're really taking the silo. +Vision of the granular ACL story. +Obviously, if we were to go higher enterprise, that would become a very big issue, because I think larger are the enterprise, the more they rely on complex ackles. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#hardware) Hardware -After some initial experiments, we figured out a minimal hardware configuration for the task: +Demetrios: +And I have to ask about your history with stripe. Have you been focusing on specific financial pieces to this? First thing that comes to mind is what about all those e commerce companies that are living and breathing with stripe? Feels like they've got all kinds of use cases that they could leverage AI for, whether it is their supply chain or just getting better numbers, or getting answers that they have across all this disparate data. Have you looked at that at all? Is that informing any of your decisions that you're making these days? -- 8 CPU cores -- 64Gb RAM -- 650Gb Disk space -![Hardware configuration](https://qdrant.tech/documentation/tutorials/large-scale-search/hardware.png) +Stanislas Polu: +No, not quite. Not really. At stripe, when we joined, it was. +Very early, it was the quintessential curlb onechargers number 42. +42, 42. And that's pretty much what stripe was almost, I'm exaggerating, but not too much. So what I've been focusing at stripe. +Was really driven by my and our. +Perspective as european funders joining a quite. +Us centric company, which is, no, there. -Hardware configuration -This configuration is enough to index and explore the dataset in a single-user mode; latency is reasonable enough to build interactive graphs and navigate in the dashboard. +Stanislas Polu: +Is not credit card all over the world. Yes, there is also payment methods. And so most of my time spent at stripe was spent on trying to expand the API to not a couple us payment methods, but a variety of worldwide payment methods. So that requires kind of a change of paradigm from an API design, and that's where I spent most of my cycles +What I want to try. -Naturally, you might need more CPU cores and RAM for production-grade configurations. -It is important to ensure high network bandwidth for this experiment so you are running the client and server in the same region. +Demetrios: +Okay, the next question that I had is you talked about how benchmarking with the horizontal solution, surprisingly, has been more effective in certain use cases. I'm guessing that's why you got a little bit of love for [Qdrant](https://qdrant.tech/) and what we're doing here. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#uploading-and-indexing) Uploading and Indexing -We will use the following [python script](https://github.com/qdrant/laion-400m-benchmark/blob/master/upload.py) to upload dataset chunks one by one. 
+Stanislas Polu: +Yeah, I think the benchmarking was really about the quality of models' answers in the context of [retrieval augmented generation](https://qdrant.tech/articles/what-is-rag-in-ai/). So it's not as much about performance, but obviously performance matters, and that's why we love using Qdrant. But back to the main idea of what I mentioned. -```bash -export QDRANT_URL="https://xxxx-xxxx.xxxx.cloud.qdrant.io" -export QDRANT_API_KEY="xxxx-xxxx-xxxx-xxxx" -python upload.py +Stanislas Polu: +It's interesting because today the retrieval is noisy, because the embedders are not perfect, which is an interesting point. Sorry, I'm double-clicking, but I'll come back. The embedders are really not perfect. Really not perfect. So that's interesting. When Qdrant releases optimizations for [storage of vectors](https://qdrant.tech/documentation/concepts/storage/), they obviously come with warnings that you may have a loss of precision because of the compression, et cetera. And that's funny: in the retrieval-augmented generation world, it really doesn't matter. We take all the performance we can, because the loss of precision coming from compression of those vectors at the vector DB level is completely negligible compared to the imperfection of the embedders. -``` -This script will download chunks of the LAION dataset one by one and upload them to Qdrant. Intermediate data is not persisted on disk, so the script doesn’t require much disk space on the client side. +Stanislas Polu: +In terms of the capability to correctly embed text, they're extremely powerful, but they're far from being perfect. And so that's an interesting thing where you can really go as far as you want in terms of performance, because your error is dominated completely by the quality of your embeddings. Going back up, I think what's interesting is that the retrieval is noisy, mostly because of the embedders, and the models are not perfect. And so the reality is that more data in a RAG context is not necessarily better data, because the retrievals become noisy. The model kind of gets confused and it starts hallucinating stuff, et cetera. And so the right trade-off is that you want access to as much data as possible, but you want to give our users the ability to select very narrowly the data required for a given task.
-Let’s take a look at the collection configuration we used: -```python -client.create_collection( - QDRANT_COLLECTION_NAME, - vectors_config=models.VectorParams( - size=512, # CLIP model output size - distance=models.Distance.COSINE, # CLIP model uses cosine distance - datatype=models.Datatype.FLOAT16, # We only need 16 bits for float, otherwise disk usage would be 800Gb instead of 400Gb - on_disk=True # We don't need original vectors in RAM - ), - # Even though CLIP vectors don't work well with binary quantization, out of the box, - # we can rely on query-time oversampling to get more accurate results - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ) - ), - optimizers_config=models.OptimizersConfigDiff( - # Bigger size of segments are desired for faster search - # However it might be slower for indexing - max_segment_size=5_000_000, - ), - # Having larger M value is desirable for higher accuracy, - # but in our case we care more about memory usage - # We could still achieve reasonable accuracy even with M=6 + oversampling - hnsw_config=models.HnswConfigDiff( - m=6, # decrease M for lower memory usage - on_disk=False - ), - ) +Stanislas Polu: +And so that's kind of what our product does, is the ability to create assistants that are specialized to a given task. And most of the specification of an assistant is obviously a prompt, but also. +Saying, oh, I'm working on helping sales find interesting next leads. +And you really want to narrow the data exactly where that information lies. And that's where there, we're really relying. +Hard on Qdrants as well. +So the kind of indexing capabilities on. +Top of the [vector search](https://qdrant.tech/), where whenever. -``` -There are a few important points to note: +Stanislas Polu: +We insert the documents, we kind of try to insert an array of parents that reproduces the hierarchy of whatever that document is coming from, which lets us create a very nice user experience where when you create an assistant, you can say, oh, I'm going down two levels within notion, and I select that page and all of those children will come together. And that's just one string in our specification, because then rely on those parents that have been injected in Qdrant, and then the Qdrant search really works well with a simple query like this thing has to be in parents. -- We use `FLOAT16` datatype for vectors, which allows us to store vectors in half the size compared to `FLOAT32`. There are no significant accuracy losses for this dataset. -- We use `BinaryQuantization` with `always_ram=True` to enable query-time oversampling. This allows us to get an accurate and resource-efficient search, even though 512d CLIP vectors don’t work well with binary quantization out of the box. -- We use `HnswConfig` with `m=6` to reduce memory usage. We will look deeper into memory usage in the next section. -Goal of this configuration is to ensure that prefetch component of the search never needs to load data from disk, and at least a minimal version of vectors and vector index is always in RAM. -The second stage of the search can explicitly determine how many times we can afford to load data from a disk. +Stanislas Polu: +And you filter by that and it. -In our experiment, the upload process was going at 5000 points per second. -The indexation process was going in parallel with the upload and was happening at the rate of approximately 4000 points per second. 
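The "parents" trick Stan describes above is essentially a payload filter. A minimal sketch with the Qdrant Python client might look like the following; the collection name, vector size, and page identifiers are illustrative, not Dust's actual schema.

```python
# Store each chunk with the full list of its ancestors, then scope a search
# to any node in that hierarchy with a single match condition.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="chunks",
    vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE),
)

client.upsert(
    collection_name="chunks",
    points=[
        models.PointStruct(
            id=1,
            vector=[0.12, 0.34, 0.56, 0.78],  # placeholder embedding
            payload={
                "text": "Q3 sales review notes...",
                # ancestry from the page itself up to the workspace root
                "parents": ["notion-page-42", "notion-page-7", "notion-workspace"],
            },
        )
    ],
)

# "This thing has to be in parents": restrict retrieval to one subtree
hits = client.query_points(
    collection_name="chunks",
    query=[0.11, 0.33, 0.55, 0.77],  # placeholder query embedding
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="parents", match=models.MatchAny(any=["notion-page-7"]))
        ]
    ),
    limit=5,
)
```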
-![Upload and indexation process](https://qdrant.tech/documentation/tutorials/large-scale-search/upload_process.png) +Demetrios: +Feels like there's two levels to the evaluation that you can be doing with rags. One is the stuff you're retrieving and evaluating the retrieval, and then the other is the output that you're giving to the end user. How are you attacking both of those evaluation questions? -Upload and indexation process -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#memory-usage) Memory Usage +Stanislas Polu: +Yeah, so the truth in whole transparency. +Is that we don't, we're just too early. -After the upload and indexation process is finished, let’s take a detailed look at the memory usage of the Qdrant server. -![Memory usage](https://qdrant.tech/documentation/tutorials/large-scale-search/memory_usage.png) +Demetrios: +Well, I'm glad you're honest with us, Alicia. -Memory usage -On the high level, memory usage consists of 3 components: +Stanislas Polu: +This is great, we should, but the rate is that we have so many other product priorities that I think evaluating the quality of retrievals, evaluating the quality. +Of retrieval, augmented generation. +Good sense but good sense is hard to define, because good sense with three. +Years doing research in that domain is probably better sense. +Better good sense than good sense with no clue on the domain. But basically with good sense I think. +You can get very far and then. +You'Ll be optimizing at the margin. +And the reality is that if you. +Get far enough with good sense, and that everything seems to work reasonably well, then your priority is not necessarily on pushing 5% performance, whatever is the metric. -- System memory - 8.34Gb - this is memory reserved for internal systems and OS, it doesn’t depend on the dataset size. -- Data memory - 39.27Gb - this is a resident memory of qdrant process, it can’t be evicter and qdrant process will crash if it exceeds the limit. -- Cache memory - 14.54Gb - this is a disk cache qdrant uses. It is necessary for fast search but can be evicted if needed. -The most interest for us is Data and Cache memory. Let’s look what exactly is stored in these components. +Stanislas Polu: +But more like I have a million other products questions to solve. +That is the kind of ten people answer to your question. And as we grow, we'll probably make a priority, of course, of benchmarking that better. In terms of benchmarking that better. Extremely interesting question as well, because the. +Embedding benchmarks are what they are, and. +I think they are not necessarily always a good representation of the use case you'll have in your products. And so that's something you want to be cautious of. And. +It'S quite hard to benchmark your use case. +The kind of solutions you have and the ones that seems more plausible, whether it's spending like full years on that. -In our scenario, Qdrant uses memory to store the following components: -- Storing vectors -- Storing vector index -- Storing information about IDs and versions of points +Stanislas Polu: +Is probably to. +Evaluate the retrieval with another model, right? +It's like you take five different embedding models, you record a bunch of questions. +That comes from your product, you use your product data and you run those retrievals against those five different embedders, and. +Then you ask GPT four to raise. 
+That would be something that seems sensible and probably will get you another step forward and is not perfect, but it's.
+Probably really strong enough to go quite far.

-### [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#size-of-vectors) Size of vectors

-In our scenario, we store only quantized vectors in RAM, so it is relatively easy to calculate the required size:

+Stanislas Polu:
+And then the second question is evaluating.
+The end to end pipeline, which includes.
+Both the retrieval and the generation.
+And to be honest, again, it's a.
+Non-question today because GPT four is.
+Just so much above all the models.

-```text
-400_000_000 * 512d / 8 bits / 1024 (Kb) / 1024 (Mb) / 1024 (Gb) = 23.84Gb
-```

+Stanislas Polu:
+That there's no point evaluating them. If you accept using GPT four, just use GPT four. If you want to use open source models, then the question is more important. But if you are okay with using GPT four for many reasons, then there.
+Is no question at this stage.

-### [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#size-of-vector-index) Size of vector index

-Vector index is a bit more complicated, as it is not a simple matrix.

+Demetrios:
+So my next question there, because it sounds like you got a little bit of a French accent, you're somewhere in Europe. Are you in France?

-Internally, it is stored as a list of connections in a graph, and each connection is a 4-byte integer.
-The number of connections is defined by the `M` parameter of the HNSW index, and in our case, it is `6` on the high level and `2 x M` on level 0.

+Stanislas Polu:
+Yes, we're based in France and building the team from Paris.

-This gives us the following estimation:

-```text
-400_000_000 * (6 * 2) * 4 bytes / 1024 (Kb) / 1024 (Mb) / 1024 (Gb) = 17.881Gb

+Demetrios:
+So I was wondering if you were going to lean more towards the history of you working at OpenAI or the fraternity from your French group and go for your amis in.

-```
-In practice the size of index is a bit smaller due to the [compression](https://qdrant.tech/blog/qdrant-1.13.x/#hnsw-graph-compression) we implemented in Qdrant v1.13.0, but it is still a good estimation.

+Stanislas Polu:
+I mean, we are absolute BFF with Mistral. The fun story is that Guillaume Lample is a friend, because we were working on exactly the same subjects while I was at OpenAI and he was at Meta. So we were basically frenemies. We're competing against the same metrics and same goals, but grew a friendship out of that. Our platform is quite model agnostic, so.
+We support Mistral there.
+Then we do decide to set the defaults for our users, and we obviously set the defaults to GPT four today. I think it's the question of where.
+Today there's no question, but when the.
+Time comes where open source or non open source, it's not the question, but where other models kind of start catching.
+Up with GPT four, that's going to.

-The HNSW index in Qdrant is stored as a mmap, and it can be evicted from RAM if needed.
-So, the memory consumption of HNSW falls under the category of `Cache memory`.

-### [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#size-of-ids-and-versions) Size of IDs and versions

+Stanislas Polu:
+Be an interesting product question, and hopefully.
+Mistral will get there.
+I think that's definitely their goal, to be within reach of GPT four this year.
+And so that's going to be extremely exciting. Yeah.
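A minimal sketch of the retrieval-evaluation idea Stanislas outlines above (collection names and helper functions are hypothetical): run the same set of real product questions through several embedding models and let a stronger LLM act as the judge that rates each retrieved passage.

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

def judge_relevance(question: str, passage: str) -> int:
    """Ask a strong LLM (e.g. GPT-4) to rate passage relevance from 0 to 3.
    Left as a stub here; any chat-completion API could be plugged in."""
    raise NotImplementedError

def evaluate_embedder(collection: str, embed, questions: list[str], top_k: int = 5) -> float:
    """Average judge score over product questions for one embedding model,
    assuming each embedder indexed the same corpus into its own collection."""
    scores = []
    for question in questions:
        hits = client.query_points(
            collection_name=collection,
            query=embed(question),
            limit=top_k,
        ).points
        scores.extend(judge_relevance(question, hit.payload["text"]) for hit in hits)
    return sum(scores) / max(len(scores), 1)
```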
-Qdrant must store additional information about each point, such as ID and version. -This information is needed on each request, so it is very important to keep it in RAM for fast access. -Let’s take a look at Qdrant internals to understand how much memory is required for this information. +Demetrios: +So then you mentioned how you have a lot of other product considerations that you're looking at before you even think about evaluation. What are some of the other considerations? -```rust -// This is s simplified version of the IdTracker struct -// It omits all optimizations and small details, -// but gives a good estimation of memory usage -IdTracker { - // Mapping of internal id to version (u64), compressed to 4 bytes - // Required for versioning and conflict resolution between segments - internal_to_version, // 400M x 4 = 1.5Gb +Stanislas Polu: +Yeah, so as I mentioned a bit. +The main hypothesis is we're going to do company productivity or team productivity. We need the company data. That was kind of hypothesis number zero. It's not even an hypothesis, almost an axiom. And then our first product was a conversational assistance, like chat. GPT, that is general, and has access. +To everything, and realized that didn't work. +Quite well enough on a bunch of use cases, was kind of good on some use cases, but not great on many others. +And so that's where we made that. +First strong product, the hypothesis, which is. So we want to have many assistants. +Not one assistant, but many assistants, targeted to specific tasks. +And that's what we've been exploring since the end of the summer. And that hypothesis has been very strongly confirmed with our users. And so an example of issue that. +We have is, obviously, you want to. +Activate your product, so you want to make sure that people are creating assistance. So one thing that is much more important than the quality of rag is. +The ability of users to create personal assistance. +Before, it was only workspace assistance, and so only the admin or the builder could build it. And now we've basically, as an example, worked on having anybody can create the assistant. The assistant is scoped to themselves, they can publish it afterwards, et cetera. That's the kind of product questions that. +Are, to be honest, more important than rack rarity, at least for us. - // Mapping of external id to internal id, 4 bytes per point. - // Required to determine original point ID after search inside the segment - internal_to_external: Vec, // 400M x 16 = 6.4Gb - // Mapping of external id to internal id. For numeric ids it uses 8 bytes, - // UUIDs are stored as 16 bytes. - // Required to determine sequential point ID inside the segment - external_to_internal: Vec, // 400M x (8 + 4) = 4.5Gb -} +Demetrios: +All right, real quick, publish it for a greater user base or publish it for the internal company to be able to. -``` -In the v1.13.5 we introduced a [significant optimization](https://github.com/qdrant/qdrant/pull/6023) to reduce the memory usage of `IdTracker` by approximately 2 times. -So the total memory usage of `IdTracker` in our case is approximately `12.4Gb`. +Stanislas Polu: +Yeah, within the workspace. +Okay. -So total expected RAM usage of Qdrant server in our case is approximately `23.84Gb + 17.881Gb + 12.4Gb = 54.121Gb`, which is very close to the actual memory usage we observed: `39.27Gb + 14.54Gb = 53.81Gb`. -We had to apply some simplifications to the estimations, but they are good enough to understand the memory usage of the Qdrant server. 
+Demetrios:
+It's not like, oh, I could publish this for.

+Stanislas Polu:
+We're not going there yet. And there's plenty to do internally to each workspace.
+Before going there, though it's an interesting case because that's basically another big problem, is you have a horizontal platform, you can create an assistant, you're not an.
+Expert and you're like, okay, what should I do? And so that's the kind of blank page issue.

+Stanislas Polu:
+And so there having templates, inspiration, you can set that within the workspace, but you also want to have solutions for the new workspace that gets created. And maybe a marketplace is a good idea. Or having templates, et cetera, are also product questions that are much more important than the RAG performance. And finally, the users where Dust works really well, one example is Alan in.
+France, they are 600, and Dust is.
+Running there pretty healthily, and they've created.
+More than 200 assistants. And so another big product question is like, when you get traction within a company, people start getting flooded with assistants.
+And so how do they discover them? How do they know which one to use, et cetera? So that's kind of the kind of.
+Many examples of product questions that are very first order compared to other things.

+Demetrios:
+Because out of these 200 assistants, are you seeing a lot of people creating the same assistants?

+Stanislas Polu:
+That's a good question. So far it's been kind of driven by somebody internally that was responsible for trying to push gen AI within the company. And so I think there's not that.
+Much redundancy, which is interesting, but I.
+Think there's a long tail of stuff that are mostly explorations, but from our perspective, it's very hard to distinguish the two. Obviously, usage is a very strong signal.
+But yeah, displaying assistants by usage, pushing.
+The right assistants to the right user. This problem seems completely trivial compared to building an LLM, obviously. But still, when you add the product layer it requires a ton of work, and as a startup, that's where a lot of our resources go, and I think.
+It's the right thing to do.

+Demetrios:
+Yeah, I wonder if, and you probably have thought about this, but if it's almost like you can tag it with this product, or this assistant is in beta or alpha or this is in production, you can trust that this one is stable, that kind of thing.
-To do this, we need to perform a full-scan search for each vector in the dataset and store the results in a separate file. -Unfortunately, this process is very time-consuming and requires a lot of resources, so we had to limit the number of queries to 100, -we provide a ready-to-use [ground truth file](https://github.com/qdrant/laion-400m-benchmark/blob/master/expected.py) and the [script](https://github.com/qdrant/laion-400m-benchmark/blob/master/full_scan.py) to generate it (requires 512Gb RAM machine and about 20 hours of execution time). -Our ground truth file contains 100 queries, each with 50 results. The first 100 vectors of the dataset itself were used to generate queries. +Stanislas Polu: +Yeah. +So we have the concept of shared. +Assistant and the concept of workspace assistant. The workspace assistant are like the admin vetted the assistant, and it's kind of pushed to everyone by default. And then the published assistant is like, there's a gallery of assistant that you can visit, and there, the strongest signal is probably the usage metric. +Right? -### [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#search-query) Search Query -To precisely control the amount of oversampling, we will use the following search query: +Demetrios: +Yeah. So when you're talking about assistance, just so that I'm clear, it's not autonomous agents, is it? -```python -limit = 50 -rescore_limit = 1000 # oversampling factor is 20 +Stanislas Polu: +No. -query = vectors[query_id] # One of existing vectors -response = client.query_points( - collection_name=QDRANT_COLLECTION_NAME, - query=query, - limit=limit, - # Go to disk - search_params=models.SearchParams( - quantization=models.QuantizationSearchParams( - rescore=True, - ), - ), - # Prefetch is performed using only in-RAM data, - # so querying even large amount of data is fast - prefetch=models.Prefetch( - query=query, - limit=rescore_limit, - params=models.SearchParams( - quantization=models.QuantizationSearchParams( - # Avoid rescoring in prefetch - # We should do it explicitly on the second stage - rescore=False, - ), - ) - ) - ) +Stanislas Polu: +Yeah. So it's a great question. +We are really focusing on the one. +Step, trying to solve very nicely the one step thing. I have one granular task to achieve. +And I can get accelerated on that. +Task and maybe save a few minutes or maybe save a few tens of minutes on one specific thing, because the identity version of that is obviously the future. +But the reality is that current models, even GB four, are not that great at kind of chaining decisions of tool use in a way that is sustainable. +Beyond the demo effect. So while we are very hopeful for the future, it's not our core focus, because I think there's a lot of risk that it creates more deception than anything else. But it's obviously something that we are. +Targeting in the future as models get better. + + +Demetrios: +Yeah. And you don't want to burn people by making them think something's possible. And then they go and check up on it and they leave it in the agent's hands, and then next thing they know they're getting fired because they don't actually do the work that they said they were going to do. + + +Stanislas Polu: +Yeah. One thing that we don't do today. +Is we have kind of different ways. +To bring data into the assistant before it creates generation. And we're expanding that. One of the domain use case is the one based on Qdrant, which is. +The kind of retrieval one. 
+We also have kind of a workflow system where you can create an app.
+An LLM app, where you can make.

+Stanislas Polu:
+Multiple calls to a model, you can call external APIs and search. And another thing we're digging into is our structured data use case, which this time doesn't use Qdrant, where the idea is that semantic search is great, but it's really atrociously bad for quantitative questions.
+Basically, the typical use case is you.
+Have a big CSV somewhere and it gets chunked and then you do retrieval.
+And you get kind of disordered partial.
+Chunks, all of that.
+And on top of that, the models.
+Are really bad at counting stuff. And so you really get bullshit, you.

+Demetrios:
+Know better than anybody.

+Stanislas Polu:
+Yeah, exactly. Past life. And so garbage in, garbage out. Basically, we're looking into being able, whenever the data is structured, to actually store.

+It in a structured way and as needed.
+Just in time, generate an in memory SQL database so that the model can generate a SQL query to that data and get kind of a SQL.
+Answer and as a consequence hopefully be able to answer quantitative questions better.
+And finally, obviously the next step also is, as we integrated with those platforms Notion, Google Drive, Slack, et cetera, basically.
+There's some actions that we can take there.
+We're not going to take the actions, but I think it's interesting to have.
+The model prepare an action, meaning that here is the email I prepared, send.
+It or iterate with me on it, or here is the Slack message I prepared, or here is the edit to the Notion doc that I prepared.

-```
-As you can see, this query contains two stages:

+Stanislas Polu:
+This is still not agentic, it's closer.
+To taking action, but we definitely want.
+To keep the human in the loop.
+But obviously some stuff that are on our roadmap.
+And another thing that we don't support, which is one type of action would.
+Be the first we will be working on, is obviously code interpretation, which I think is one of the things that all users ask for because they use.
+It on ChatGPT.
+And so we'll be looking into that as well.

-- First stage is a prefetch, which is performed using only in-RAM data. It is very fast and allows us to get a large amount of candidates.
-- The second stage is a rescore, which is performed with full-size vectors stored on disks.

-By using 2-stage search we can precisely control the amount of data loaded from disk and ensure the balance between search speed and accuracy.

+Demetrios:
+What made you choose Qdrant?

-You can find the complete code of the search process in the [eval.py](https://github.com/qdrant/laion-400m-benchmark/blob/master/eval.py)

-## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#performance-tweak) Performance tweak

+Stanislas Polu:
+So the decision was made, if I.
+Remember correctly, something like February or March last year. And so the alternatives I looked into.
+Were Pinecone, Weaviate, some ClickHouse because Chroma was using ClickHouse at the time. But Chroma was.
+2000 lines of code.
+At the time as well.
+And so I was like, oh, Chroma, we're part of AI Grant. And Chroma is as an example also part of AI Grant. So I was like, oh well, let's look at Chroma.
+And however, what I'm describing is last.
+Year, but they were very early. And so it was definitely not something.
+That seemed like to make sense for us.
+So at the end it was between Pinecone, Weaviate and Qdrant.
+With Weaviate, you look at the doc, you're like, yeah, not possible.
+And then finally it's Qdrant and Pinecone. And I think we really appreciated obviously the open source nature of Qdrant. From.
+Playing with it, the very strong performance, the fact that it's written in Rust, the sanity of the documentation, and basically the feeling that because it's open source, we're using the hosted Qdrant Cloud solution. But it's not a question of paying.
+Or not paying, it's more a question.
+Of being able to feel like you have more control. And at the time, I think it was the moment where Pinecone had their massive fuck up, where they erased a gazillion databases from their users, and so we've been on Qdrant and I think it's.
+Been a two step process, really.

-One important performance tweak we found useful for this dataset is to enable [Async IO](https://qdrant.tech/articles/io_uring) in Qdrant.

-By default, Qdrant uses synchronous IO, which is good for in-memory datasets but can be a bottleneck when we want to read a lot of data from a disk.

+Stanislas Polu:
+It's very smooth to start, but also Qdrant at this stage comes with a.
+Lot of knobs to turn.
+And so as you start scaling, you at some point reach a point where.
+You need to start tweaking the knobs.
+Which I think is great because the knobs, there's a lot of knobs, so they are hard to understand, but once you understand them, you see the power of them. And the Qdrant team has been excellent there supporting us. And so I think we've reached that first level of scale where you have.
+To tweak the knobs, and we've reached.
+The second level of scale where we.
+Have to have multiple nodes.
+But so far it's been extremely smooth.
+And I think we've been able to.
+Do with Qdrant some stuff that really are possible only because of the very good performance of the database.
+As an example, we're not using your clustered setup. We have n number of independent nodes.
+And as we scale, we kind of.
+Reshuffle which users go on which nodes.
+As we need, trying to keep our largest users and most paying users on.
+Very well identified nodes. We have a kind of a garbage.
+Node for all the free users, as an example. Migrating even a very big collection from one node: one capability that we built is to say, oh, I have that collection over there. It's pretty big.
+I'm going to initiate it on another node.
+I'm going to set up shadow writing on both, and I'm going to migrate the data live. And that has been incredibly easy to do with Qdrant because crawling is fast, writing is fucking fast. And so even a pretty large collection.
+You can migrate it in a minute.

-Async IO (implemented with `io_uring`) allows to send parallel requests to the disk and saturate the disk bandwidth.
-This is exactly what we are looking for when performing large-scale re-scoring with original vectors.

+Stanislas Polu:
+And so it becomes really within the realm of being able to administrate your cluster with that in mind, which I.
+Think would have probably not been possible with the different systems.

-Instead of reading vectors one by one and waiting for the disk response 1000 times, we can send 1000 requests to the disk and wait for all of them to complete. This allows us to saturate the disk bandwidth and get faster results.

-To enable Async IO in Qdrant, you need to set the following environment variable:

+Demetrios:
+So it feels like when you are helping companies build out their assistants, are you going in there and giving them ideas on what they can do?
+ + +Stanislas Polu: +Yeah, we are at a stage where obviously we have to do that because. +I think the product basically starts to. +Have strong legs, but I think it's still very early and so there's still a lot to do on activation, as an example. And so we are in a mode today where we do what doesn't scale. +Basically, and we do spend some time. + + +Stanislas Polu: +With companies, obviously, because there's nowhere around that. But what we've seen also is that the users where it works the best and being on dust or anything else. +That is relative to having people adopt gen AI. +Within the company are companies where they. +Actually allocate resources to the problem, meaning that the companies where it works best. +Are the companies where there's somebody. Their role is really to go around the company, find, use cases, support the teams, et cetera. And in the case of companies using dust, this is kind of type of interface that is perfect for us because we provide them full support and we help them build whatever they think is. +Valuable for their team. + -```bash -QDRANT__STORAGE__PERFORMANCE__ASYNC_SCORER=true +Demetrios: +Are you also having to be the bearer of bad news and tell them like, yeah, I know you saw that demo on Twitter, but that is not actually possible or reliably possible? -``` -Or set parameter in config file: +Stanislas Polu: +Yeah, that's an interesting question. That's a good question. Not that much, because I think one of the big learning is that you take any company, even a pretty techy. +Company, pretty young company, and the reality. +Is that most of the people, they're not necessarily in the ecosystem, they just want shit done. And so they're really glad to have some shit being done by a computer. But they don't really necessarily say, oh, I want the latest shiniest thingy that. +I saw on Twitter. So we've been safe from that so far. -```yaml -storage: - performance: - async_scorer: true -``` +Demetrios: +Excellent. Well, man, this has been incredible. I really appreciate you coming on here and doing this. Thanks so much. And if anyone wants to check out dust, I encourage that they do. -In Qdrant Managed cloud Async IO can be enabled via `Advanced optimizations` section in cluster `Configuration` tab. -![Async IO configuration in Cloud](https://qdrant.tech/documentation/tutorials/large-scale-search/async_io.png) +Stanislas Polu: +It's dust. -Async IO configuration in Cloud -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#running-search-requests) Running search requests +Demetrios: +It's a bit of an interesting website. What is it? -Once all the preparations are done, we can run the search requests and evaluate the results. -You can find the full code of the search process in the [eval.py](https://github.com/qdrant/laion-400m-benchmark/blob/master/eval.py) +Stanislas Polu: +Dust TT. -This script will run 100 search requests with configured oversampling factor and compare the results with the ground truth. -```bash -python eval.py --rescore_limit 1000 +Demetrios: +That's it. That's what I was missing, dust. There you go. So if anybody wants to look into it, I encourage them to. And thanks so much for coming on here. -``` -In our request we achieved the following results: +Stanislas Polu: +Yeah. 
-| Rescore Limit | Precision@50 | Time per request | -| --- | --- | --- | -| 1000 | 75.2% | 0.7s | -| 5000 | 81.0% | 2.2s | -Additional experiments with `m=16` demonstrated that we can achieve `85%` precision with `rescore_limit=1000`, but they would require slightly more memory. +Stanislas Polu: +And Qdrant is the shit. -![Log of search evaluation](https://qdrant.tech/documentation/tutorials/large-scale-search/precision.png) -Log of search evaluation +Demetrios: +There we go. Awesome, dude. Well, this has been great. -## [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#conclusion) Conclusion -In this tutorial we demonstrated how to upload, index and search a large dataset in Qdrant cost-efficiently. -Binary quantization can be applied even on 512d vectors, if combined with query-time oversampling. +Stanislas Polu: +Yeah, thanks, Vintu. Have a good one. -Qdrant allows to precisely control where each part of storage is located, which allows to achieve a good balance between search speed and memory usage. +<|page-383-lllmstxt|> +Today, we are excited to announce our $28M Series A funding round, which is led by Spark Capital with participation from our existing investors Unusual Ventures and 42CAP. -### [Anchor](https://qdrant.tech/documentation/database-tutorials/large-scale-search/\#potential-improvements) Potential improvements +We have seen incredible user growth and support from our open-source community in the past two years - recently exceeding 5M downloads. This is a testament to our mission to build the most efficient, scalable, high-performance vector database on the market. We are excited to further accelerate this trajectory with our new partner and investor, Spark Capital, and the continued support of Unusual Ventures and 42CAP. This partnership uniquely positions us to empower enterprises with cutting edge vector search technology to build truly differentiating, next-gen AI applications at scale. -In this experiment, we investigated in detail which parts of the storage are responsible for memory usage and how to control them. +## The Emergence and Relevance of Vector Databases -One especially interesting part is the `VectorIndex` component, which is responsible for storing the graph of connections between vectors. +A paradigm shift is underway in the field of data management and information retrieval. Today, our world is increasingly dominated by complex, unstructured data like images, audio, video, and text. Traditional ways of retrieving data based on keyword matching are no longer sufficient. Vector databases are designed to handle complex high-dimensional data, unlocking the foundation for pivotal AI applications. They represent a new frontier in data management, in which complexity is not a barrier but an opportunity for innovation. -In our further research, we will investigate the possibility of making HNSW more disk-friendly so it can be offloaded to disk without significant performance losses. +The rise of generative AI in the last few years has shone a spotlight on vector databases, prized for their ability to power retrieval-augmented generation (RAG) applications. What we are seeing now, both within AI and beyond, is only the beginning of the opportunity for vector databases. Within our Qdrant community, we already see a multitude of unique solutions and applications leveraging our technology for multimodal search, anomaly detection, recommendation systems, complex data analysis, and more. -##### Was this page useful? 
+## What sets Qdrant apart? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +To meet the needs of the next generation of AI applications, Qdrant has always been built with four keys in mind: efficiency, scalability, performance, and flexibility. Our goal is to give our users unmatched speed and reliability, even when they are building massive-scale AI applications requiring the handling of billions of vectors. We did so by building Qdrant on Rust for performance, memory safety, and scale. Additionally, [our custom HNSW search algorithm](/articles/filtrable-hnsw/) and unique [filtering](/documentation/concepts/filtering/) capabilities consistently lead to [highest RPS](/benchmarks/), minimal latency, and high control with accuracy when running large-scale, high-dimensional operations. -Thank you for your feedback! 🙏 +Beyond performance, we provide our users with the most flexibility in cost savings and deployment options. A combination of cutting-edge efficiency features, like [built-in compression options](/documentation/guides/quantization/), [multitenancy](/documentation/guides/multiple-partitions/) and the ability to [offload data to disk](/documentation/concepts/storage/), dramatically reduce memory consumption. Committed to privacy and security, crucial for modern AI applications, Qdrant now also offers on-premise and hybrid SaaS solutions, meeting diverse enterprise needs in a data-sensitive world. This approach, coupled with our open-source foundation, builds trust and reliability with engineers and developers, making Qdrant a game-changer in the vector database domain. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/large-scale-search.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## What's next? -On this page: +We are incredibly excited about our next chapter to power the new generation of enterprise-grade AI applications. The support of our open-source community has led us to this stage and we’re committed to continuing to build the most advanced vector database on the market, but ultimately it’s up to you to decide! We invite you to [test out](https://cloud.qdrant.io/) Qdrant for your AI applications today. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/database-tutorials/large-scale-search.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +<|page-384-lllmstxt|> +Great news! We've expanded Qdrant's managed vector database offering — [Qdrant Cloud](https://cloud.qdrant.io/) — to be available on Microsoft Azure. +You can now effortlessly set up your environment on Azure, which reduces deployment time, so you can hit the ground running. -× +[Get started](https://cloud.qdrant.io/) -[Powered by](https://qdrant.tech/) +What this means for you: -<|page-191-lllmstxt|> -## modern-sparse-neural-retrieval -- [Articles](https://qdrant.tech/articles/) -- Modern Sparse Neural Retrieval: From Theory to Practice +- **Rapid application development**: Deploy your own cluster through the Qdrant Cloud Console within seconds and scale your resources as needed. +- **Billion vector scale**: Seamlessly grow and handle large-scale datasets with billions of vectors. 
Leverage Qdrant features like horizontal scaling and binary quantization with Microsoft Azure's scalable infrastructure. -[Back to Machine Learning](https://qdrant.tech/articles/machine-learning/) +**"With Qdrant, we found the missing piece to develop our own provider independent multimodal generative AI platform at enterprise scale."** -- Jeremy Teichmann (AI Squad Technical Lead & Generative AI Expert), Daly Singh (AI Squad Lead & Product Owner) - Bosch Digital. -# Modern Sparse Neural Retrieval: From Theory to Practice +Get started by [signing up for a Qdrant Cloud account](https://cloud.qdrant.io). And learn more about Qdrant Cloud in our [docs](/documentation/cloud/). -Evgeniya Sukhodolskaya + -· +<|page-385-lllmstxt|> +It's time for an update to Qdrant's benchmarks! -October 23, 2024 +We've compared how Qdrant performs against the other vector search engines to give you a thorough performance analysis. Let's get into what's new and what remains the same in our approach. -![Modern Sparse Neural Retrieval: From Theory to Practice](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/preview/title.jpg) +### What's Changed? -Finding enough time to study all the modern solutions while keeping your production running is rarely feasible. -Dense retrievers, hybrid retrievers, late interaction
 How do they work, and where do they fit best? -If only we could compare retrievers as easily as products on Amazon! +#### All engines have improved -We explored the most popular modern sparse neural retrieval models and broke them down for you. -By the end of this article, you’ll have a clear understanding of the current landscape in sparse neural retrieval and how to navigate through complex, math-heavy research papers with sky-high NDCG scores without getting overwhelmed. +Since the last time we ran our benchmarks, we received a bunch of suggestions on how to run other engines more efficiently, and we applied them. -[The first part](https://qdrant.tech/articles/modern-sparse-neural-retrieval/#sparse-neural-retrieval-evolution) of this article is theoretical, comparing different approaches used in -modern sparse neural retrieval. +This has resulted in significant improvements across all engines. As a result, we have achieved an impressive improvement of nearly four times in certain cases. You can view the previous benchmark results [here](/benchmarks/single-node-speed-benchmark-2022/). -[The second part](https://qdrant.tech/articles/modern-sparse-neural-retrieval/#splade-in-qdrant) is more practical, showing how the best model in modern sparse neural retrieval, `SPLADE++`, -can be used in Qdrant and recommendations on when to choose sparse neural retrieval for your solutions. +#### Introducing a New Dataset -## [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#sparse-neural-retrieval-as-if-keyword-based-retrievers-understood-meaning) Sparse Neural Retrieval: As If Keyword-Based Retrievers Understood Meaning +To ensure our benchmark aligns with the requirements of serving RAG applications at scale, the current most common use-case of vector databases, we have introduced a new dataset consisting of 1 million OpenAI embeddings. -**Keyword-based (lexical) retrievers** like BM25 provide a good explainability. -If a document matches a query, it’s easy to understand why: query terms are present in the document, -and if these are rare terms, they are more important for retrieval. +![rps vs precision benchmark - up and to the right is better](/blog/qdrant-updated-benchmarks-2024/rps-bench.png) +#### Separation of Latency vs RPS Cases -![Keyword-based (Lexical) Retrieval](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/LexicalRetrievers.png) +Different applications have distinct requirements when it comes to performance. To address this, we have made a clear separation between latency and requests-per-second (RPS) cases. -With their mechanism of exact term matching, they are super fast at retrieval. -A simple **inverted index**, which maps back from a term to a list of documents where this term occurs, saves time on checking millions of documents. +For example, a self-driving car's object recognition system aims to process requests as quickly as possible, while a web server focuses on serving multiple clients simultaneously. By simulating both scenarios and allowing configurations for 1 or 100 parallel readers, our benchmark provides a more accurate evaluation of search engine performance. -![Inverted Index](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/InvertedIndex.png) +![mean-time vs precision benchmark - down and to the right is better](/blog/qdrant-updated-benchmarks-2024/latency-bench.png) +### What Hasn't Changed? -Lexical retrievers are still a strong baseline in retrieval tasks. 
-However, by design, they’re unable to bridge **vocabulary** and **semantic mismatch** gaps. -Imagine searching for a “ _tasty cheese_” in an online store and not having a chance to get “ _Gouda_” or “ _Brie_” in your shopping basket. +#### Our Principles of Benchmarking -**Dense retrievers**, based on machine learning models which encode documents and queries in dense vector representations, -are capable of breaching this gap and finding you “ _a piece of Gouda_”. +At Qdrant all code stays open-source. We ensure our benchmarks are accessible for everyone, allowing you to run them on your own hardware. Your input matters to us, and contributions and sharing of best practices are welcome! -![Dense Retrieval](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/DenseRetrievers.png) -However, explainability here suffers: why is this query representation close to this document representation? -Why, searching for “ _cheese_”, we’re also offered “ _mouse traps_”? What does each number in this vector representation mean? -Which one of them is capturing the cheesiness? +Our benchmarks are strictly limited to open-source solutions, ensuring hardware parity and avoiding biases from external cloud components. -Without a solid understanding, balancing result quality and resource consumption becomes challenging. -Since, hypothetically, any document could match a query, relying on an inverted index with exact matching isn’t feasible. -This doesn’t mean dense retrievers are inherently slower. However, lexical retrieval has been around long enough to inspire several effective architectural choices, which are often worth reusing. -Sooner or later, there should have been somebody who would say, -“ _Wait, but what if I want something timeproof like BM25 but with semantic understanding?_” +We deliberately don't include libraries or algorithm implementations in our comparisons because our focus is squarely on vector databases. -## [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#sparse-neural-retrieval-evolution) Sparse Neural Retrieval Evolution +Why? -Imagine searching for a “ _flabbergasting murder_” story. -” _Flabbergasting_” is a rarely used word, so a keyword-based retriever, for example, BM25, will assign huge importance to it. -Consequently, there is a high chance that a text unrelated to any crimes but mentioning something “ _flabbergasting_” will pop up in the top results. +Because libraries like FAISS, while useful for experiments, don’t fully address the complexities of real-world production environments. They lack features like real-time updates, CRUD operations, high availability, scalability, and concurrent access – essentials in production scenarios. A vector search engine is not only its indexing algorithm, but its overall performance in production. -What if we could instead of relying on term frequency in a document as a proxy of term’s importance as it happens in BM25, -directly predict a term’s importance? The goal is for rare but non-impactful terms to be assigned a much smaller weight than important terms with the same frequency, while both would be equally treated in the BM25 scenario. -How can we determine if one term is more important than another? -Word impact is related to its meaning, and its meaning can be derived from its context (words which surround this particular word). -That’s how dense contextual embedding models come into the picture. 
- -All the sparse retrievers are based on the idea of taking a model which produces contextual dense vector representations for terms -and teaching it to produce sparse ones. Very often, -[Bidirectional Encoder Representations from the Transformers (BERT)](https://huggingface.co/docs/transformers/en/model_doc/bert) is used as a -base model, and a very simple trainable neural network is added on top of it to sparsify the representations out. -Training this small neural network is usually done by sampling from the [MS MARCO](https://microsoft.github.io/msmarco/) dataset a query, -relevant and irrelevant to it documents and shifting the parameters of the neural network in the direction of relevancy. +We use the same benchmark datasets as the [ann-benchmarks](https://github.com/erikbern/ann-benchmarks/#data-sets) project so you can compare our performance and accuracy against it. -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#the-pioneer-of-sparse-neural-retrieval) The Pioneer Of Sparse Neural Retrieval +### Detailed Report and Access -![Deep Contextualized Term Weighting (DeepCT)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/DeepCT.png) -The authors of one of the first sparse retrievers, the [`Deep Contextualized Term Weighting framework (DeepCT)`](https://arxiv.org/pdf/1910.10687), -predict an integer word’s impact value separately for each unique word in a document and a query. -They use a linear regression model on top of the contextual representations produced by the basic BERT model, the model’s output is rounded. +For an in-depth look at our latest benchmark results, we invite you to read the [detailed report](/benchmarks/). -When documents are uploaded into a database, the importance of words in a document is predicted by a trained linear regression model -and stored in the inverted index in the same way as term frequencies in BM25 retrievers. -Then, the retrieval process is identical to the BM25 one. -_**Why is DeepCT not a perfect solution?**_ To train linear regression, the authors needed to provide the true value ( **ground truth**) -of each word’s importance so the model could “see” what the right answer should be. -This score is hard to define in a way that it truly expresses the query-document relevancy. -Which score should have the most relevant word to a query when this word is taken from a five-page document? The second relevant? The third? +If you're interested in testing the benchmark yourself or want to contribute to its development, head over to our [benchmark repository](https://github.com/qdrant/vector-db-benchmark). We appreciate your support and involvement in improving the performance of vector databases. -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#sparse-neural-retrieval-on-relevance-objective) Sparse Neural Retrieval on Relevance Objective +<|page-386-lllmstxt|> +## Navigating challenges and innovations in search technologies -![DeepImpact](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/DeepImpact.png) -It’s much easier to define whether a document as a whole is relevant or irrelevant to a query. -That’s why the [`DeepImpact`](https://arxiv.org/pdf/2104.12016) Sparse Neural Retriever authors directly used the relevancy between a query and a document as a training objective. 
-They take BERT’s contextualized embeddings of the document’s words, transform them through a simple 2-layer neural network in a single scalar -score and sum these scores up for each word overlapping with a query. -The training objective is to make this score reflect the relevance between the query and the document. +We participated in a [podcast](#podcast-discussion-recap) on search technologies, specifically with retrieval-augmented generation (RAG) in language models. -_**Why is DeepImpact not a perfect solution?**_ -When converting texts into dense vector representations, -the BERT model does not work on a word level. Sometimes, it breaks the words into parts. -For example, the word “ _vector_” will be processed by BERT as one piece, but for some words that, for example, -BERT hasn’t seen before, it is going to cut the word in pieces -[as “Qdrant” turns to “Q”, “#dra” and “#nt”](https://huggingface.co/spaces/Xenova/the-tokenizer-playground) +RAG is a cutting-edge approach in natural language processing (NLP). It uses information retrieval and language generation models. We describe how it can enhance what AI can do to understand, retrieve, and generate human-like text. -The DeepImpact model (like the DeepCT model) takes the first piece BERT produces for a word and discards the rest. -However, what can one find searching for “ _Q_” instead of “ _Qdrant_”? +### More about RAG -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#know-thine-tokenization) Know Thine Tokenization +Think of RAG as a system that finds relevant knowledge from a vast database. It takes your query, finds the best available information, and then provides an answer. -![Term Independent Likelihood MoDEl v2 (TILDE v2)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/TILDEv2.png) -To solve the problems of DeepImpact’s architecture, the [`Term Independent Likelihood MoDEl (TILDEv2)`](https://arxiv.org/pdf/2108.08513) model generates -sparse encodings on a level of BERT’s representations, not on words level. Aside from that, its authors use the identical architecture -to the DeepImpact model. +RAG is the next step in NLP. It goes beyond the limits of traditional generation models by integrating retrieval mechanisms. With RAG, NLP can access external knowledge sources, databases, and documents. This ensures more accurate, contextually relevant, and informative output. -_**Why is TILDEv2 not a perfect solution?**_ -A single scalar importance score value might not be enough to capture all distinct meanings of a word. -**Homonyms** (pizza, cocktail, flower, and female name “ _Margherita_”) are one of the troublemakers in information retrieval. +With RAG, we can set up more precise language generation as well as better context understanding. RAG helps us incorporate real-world knowledge into AI-generated text. This can improve overall performance in tasks such as: -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#sparse-neural-retriever-which-understood-homonyms) Sparse Neural Retriever Which Understood Homonyms +- Answering questions +- Creating summaries +- Setting up conversations -![COntextualized Inverted List (COIL)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/COIL.png) +### The importance of evaluation for RAG and LLM -If one value for the term importance score is insufficient, we could describe the term’s importance in a vector form! 
-Authors of the [`COntextualized Inverted List (COIL)`](https://arxiv.org/pdf/2104.07186) model based their work on this idea. -Instead of squeezing 768-dimensional BERT’s contextualised embeddings into one value, -they down-project them (through the similar “relevance” training objective) to 32 dimensions. -Moreover, not to miss a detail, they also encode the query terms as vectors. +Evaluation is crucial for any application leveraging LLMs. It promotes confidence in the quality of the application. It also supports implementation of feedback and improvement loops. -For each vector representing a query token, COIL finds the closest match (using the maximum dot product) vector of the same token in a document. -So, for example, if we are searching for “ _Revolut bank _” and a document in a database has the sentence -“ _Vivid bank was moved to the bank of Amstel _”, out of two “banks”, -the first one will have a bigger value of a dot product with a “ _bank_” in the query, and it will count towards the final score. -The final relevancy score of a document is a sum of scores of query terms matched. - -_**Why is COIL not a perfect solution?**_ This way of defining the importance score captures deeper semantics; -more meaning comes with more values used to describe it. -However, storing 32-dimensional vectors for every term is far more expensive, -and an inverted index does not work as-is with this architecture. +### Unique challenges of evaluating RAG and LLM-based applications -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#back-to-the-roots) Back to the Roots +*Retrieval* is the key to Retrieval Augmented Generation, as it affects quality of the generated response. +Potential problems include: -![Universal COntextualized Inverted List (UniCOIL)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/UNICOIL.png)[`Universal COntextualized Inverted List (UniCOIL)`](https://arxiv.org/pdf/2106.14807), made by the authors of COIL as a follow-up, goes back to producing a scalar value as the importance score -rather than a vector, leaving unchanged all other COIL design decisions. +- Setting up a defined or expected set of documents, which can be a significant challenge. +- Measuring *subjectiveness*, which relates to how well the data fits or applies to a given domain or use case. -It optimizes resources consumption but the deep semantics understanding tied to COIL architecture is again lost. +### Podcast Discussion Recap -## [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#did-we-solve-the-vocabulary-mismatch-yet) Did we Solve the Vocabulary Mismatch Yet? +In the podcast, we addressed the following: -With the retrieval based on the exact matching, -however sophisticated the methods to predict term importance are, we can’t match relevant documents which have no query terms in them. -If you’re searching for “ _pizza_” in a book of recipes, you won’t find “ _Margherita_”. +- **Model evaluation(LLM)** - Understanding the model at the domain-level for the given use case, supporting required context length and terminology/concept understanding. +- **Ingestion pipeline evaluation** - Evaluating factors related to data ingestion and processing such as chunk strategies, chunk size, chunk overlap, and more. +- **Retrieval evaluation** - Understanding factors such as average precision, [Distributed cumulative gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) (DCG), as well as normalized DCG. 
+- **Generation evaluation(E2E)** - Establishing guardrails. Evaulating prompts. Evaluating the number of chunks needed to set up the context for generation. -A way to solve this problem is through the so-called **document expansion**. -Let’s append words which could be in a potential query searching for this document. -So, the “ _Margherita_” document becomes “ _Margherita pizza_”. Now, exact matching on “ _pizza_” will work! +### The recording -![Document Expansion](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/DocumentExpansion.png) +Thanks to the [DataTalks.Club](https://datatalks.club) for organizing [this podcast](https://www.youtube.com/watch?v=_fbe1QyJ1PY). -There are two types of document expansion that are used in sparse neural retrieval: -**external** (one model is responsible for expansion, another one for retrieval) and **internal** (all is done by a single model). +### Event Alert +If you're interested in a similar discussion, watch for the recording from the [following event](https://www.eventbrite.co.uk/e/the-evolution-of-genai-exploring-practical-applications-tickets-778359172237?aff=oddtdtcreator), organized by [DeepRec.ai](https://deeprec.ai). -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#external-document-expansion) External Document Expansion +### Further reading +- [Qdrant Blog](/blog/) -External document expansion uses a **generative model** (Mistral 7B, Chat-GPT, and Claude are all generative models, -generating words based on the input text) to compose additions to documents before converting them to sparse representations -and applying exact matching methods. +<|page-387-lllmstxt|> +# Optimizing Open Source Vector Search: Strategies from Andrey Vasnetsov at Qdrant -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#external-document-expansion-with-doct5query) External Document Expansion with docT5query +> *"For systems like Qdrant, scalability and performance in my opinion, is much more important than transactional consistency, so it should be treated as a search engine rather than database."*\ +-- Andrey Vasnetsov +> -![External Document Expansion with docT5query](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/docT5queryDocumentExpansion.png)[`docT5query`](https://github.com/castorini/docTTTTTquery) is the most used document expansion model. -It is based on the [Text-to-Text Transfer Transformer (T5)](https://huggingface.co/docs/transformers/en/model_doc/t5) model trained to -generate top-k possible queries for which the given document would be an answer. -These predicted short queries (up to ~50-60 words) can have repetitions in them, -so it also contributes to the frequency of the terms if the term frequency is considered by the retriever. +Discussing core differences between search engines and databases, Andrey underlined the importance of application needs and scalability in database selection for vector search tasks. -The problem with docT5query expansion is a very long inference time, as with any generative model: -it can generate only one token per run, and it spends a fair share of resources on it. +Andrey Vasnetsov, CTO at Qdrant is an enthusiast of [Open Source](https://qdrant.tech/), machine learning, and vector search. He works on Open Source projects related to [Vector Similarity Search](https://qdrant.tech/articles/vector-similarity-beyond-search/) and Similarity Learning. He prefers practical over theoretical, working demo over arXiv paper. 
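The DCG mentioned in the retrieval-evaluation bullet above stands for discounted cumulative gain (per the linked Wikipedia page). Once you have graded relevance judgments for the returned chunks, DCG and normalized DCG are straightforward to compute; a minimal sketch (the relevance grades in the example are made up):

```python
import math

def dcg(relevances: list[float]) -> float:
    """Discounted cumulative gain of results in the order they were returned."""
    return sum(rel / math.log2(pos + 2) for pos, rel in enumerate(relevances))

def ndcg(relevances: list[float]) -> float:
    """Normalized DCG: the actual ranking scored against the ideal (descending) ranking."""
    ideal = dcg(sorted(relevances, reverse=True))
    return dcg(relevances) / ideal if ideal > 0 else 0.0

# Example: graded judgments (0-3) for the top five retrieved chunks.
print(round(ndcg([3, 2, 3, 0, 1]), 3))
```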
-#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#external-document-expansion-with-term-independent-likelihood-model-tilde) External Document Expansion with Term Independent Likelihood MODel (TILDE) +***You can watch this episode on [YouTube](https://www.youtube.com/watch?v=bU38Ovdh3NY).*** -![External Document Expansion with Term Independent Likelihood MODel (TILDE)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/TILDEDocumentExpansion.png) + -[`Term Independent Likelihood MODel (TILDE)`](https://github.com/ielab/TILDE) is an external expansion method that reduces the passage expansion time compared to -docT5query by 98%. It uses the assumption that words in texts are independent of each other -(as if we were inserting in our speech words without paying attention to their order), which allows for the parallelisation of document expansion. +***This episode is part of the [ML⇄DB Seminar Series](https://db.cs.cmu.edu/seminar2023/#) (Machine Learning for Databases + Databases for Machine Learning) of the Carnegie Mellon University Database Research Group.*** -Instead of predicting queries, TILDE predicts the most likely terms to see next after reading a passage’s text -( **query likelihood paradigm**). TILDE takes the probability distribution of all tokens in a BERT vocabulary based on the document’s text -and appends top-k of them to the document without repetitions. +## **Top Takeaways:** -_**Problems of external document expansion:**_ External document expansion might not be feasible in many production scenarios where there’s not enough time or compute to expand each and every -document you want to store in a database and then additionally do all the calculations needed for retrievers. -To solve this problem, a generation of models was developed which do everything in one go, expanding documents “internally”. +Dive into the intricacies of [vector databases](https://qdrant.tech/articles/what-is-a-vector-database/) with Andrey as he unpacks Qdrant's approach to combining filtering and vector search, revealing how in-place filtering during graph traversal optimizes precision without sacrificing search exactness, even when scaling to billions of vectors. -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#internal-document-expansion) Internal Document Expansion +5 key insights you’ll learn: -Let’s assume we don’t care about the context of query terms, so we can treat them as independent words that we combine in random order to get -the result. Then, for each contextualized term in a document, we are free to pre-compute how this term affects every word in our vocabulary. +- 🧠 **The Strategy of Subgraphs:** Dive into how overlapping intervals and geo hash regions can enhance the precision and connectivity within vector search indices. -For each document, a vector of the vocabulary length is created. To fill this vector in, for each word in the vocabulary, it is checked if the -influence of any document term on it is big enough to consider it. Otherwise, the vocabulary word’s score in a document vector will be zero. -For example, by pre-computing vectors for the document “ _pizza Margherita_” on a vocabulary of 50,000 most used English words, -for this small document of two words, we will get a 50,000-dimensional vector of zeros, where non-zero values will be for a “ _pizza_”, “ _pizzeria_”, -“ _flower_”, “ _woman_”, “ _girl_”, “ _Margherita_”, “ _cocktail_” and “ _pizzaiolo_”. 
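As a rough illustration of the pre-computed internal expansion described above, the sketch below builds the sparse (indices, values) form of such a vocabulary-length document vector: only vocabulary words whose influence exceeds a threshold keep a non-zero score. The vocabulary, the scores, and the threshold are invented for this example; in the actual models these weights come from training, not from a hand-written table.

```python
# Toy internal-expansion scores for the document "pizza Margherita".
# In a real model these weights are produced by a trained network over the full vocabulary.
vocabulary = ["pizza", "pizzeria", "flower", "woman", "girl",
              "margherita", "cocktail", "pizzaiolo", "car", "bank"]
learned_scores = {
    "pizza": 2.1, "pizzeria": 1.4, "flower": 0.6, "woman": 0.5, "girl": 0.4,
    "margherita": 2.3, "cocktail": 0.7, "pizzaiolo": 1.1, "car": 0.02, "bank": 0.0,
}
threshold = 0.1  # influence below this is treated as zero

# Keep only the non-zero entries: this is the sparse (indices, values) representation.
indices = [i for i, word in enumerate(vocabulary) if learned_scores[word] > threshold]
values = [learned_scores[vocabulary[i]] for i in indices]

print(indices)  # positions of the expansion terms in the vocabulary
print(values)   # their importance scores; every other vocabulary word stays at zero
```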
+- đŸ› ïž **Engine vs Database:** Discover the differences between search engines and relational databases and why considering your application's needs is crucial for scalability. -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#sparse-neural-retriever-with-internal-document-expansion) Sparse Neural Retriever with Internal Document Expansion +- 🌐 **Combining Searches with Relational Data:** Get insights on integrating relational and vector search for improved efficiency and performance. -![Sparse Transformer Matching (SPARTA)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/SPARTA.png) +- 🚅 **Speed and Precision Tactics:** Uncover the techniques for controlling search precision and speed by tweaking the beam size in HNSW indices. -The authors of the [`Sparse Transformer Matching (SPARTA)`](https://arxiv.org/pdf/2009.13013) model use BERT’s model and BERT’s vocabulary (around 30,000 tokens). -For each token in BERT vocabulary, they find the maximum dot product between it and contextualized tokens in a document -and learn a threshold of a considerable (non-zero) effect. -Then, at the inference time, the only thing to be done is to sum up all scores of query tokens in that document. +- 🔗 **Connected Graph Challenges:** Learn about navigating the difficulties of maintaining a connected graph while filtering during search operations. -_**Why is SPARTA not a perfect solution?**_ Trained on the MS MARCO dataset, many sparse neural retrievers, including SPARTA, -show good results on MS MARCO test data, but when it comes to generalisation (working with other data), they -[could perform worse than BM25](https://arxiv.org/pdf/2307.10488). +> Fun Fact: [The Qdrant system](https://qdrant.tech/) is capable of in-place filtering during graph traversal, which is a novel approach compared to traditional post-filtering methods, ensuring the correct quantity of results that meet the filtering conditions. +> -### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#state-of-the-art-of-modern-sparse-neural-retrieval) State-of-the-Art of Modern Sparse Neural Retrieval +## Timestamps: -![Sparse Lexical and Expansion Model Plus Plus, (SPLADE++)](https://qdrant.tech/articles_data/modern-sparse-neural-retrieval/SPLADE++.png) -The authors of the [`Sparse Lexical and Expansion Model (SPLADE)]`](https://arxiv.org/pdf/2109.10086) family of models added dense model training tricks to the -internal document expansion idea, which made the retrieval quality noticeably better. +00:00 Search professional with expertise in vectors and engines.\ +09:59 Elasticsearch: scalable, weak consistency, prefer vector search.\ +12:53 Optimize data structures for faster processing efficiency.\ +21:41 Vector indexes require special treatment, like HNSW's proximity graph and greedy search.\ +23:16 HNSW index: approximate, precision control, CPU intensive.\ +30:06 Post-filtering inefficient, prefiltering costly.\ +34:01 Metadata-based filters; creating additional connecting links.\ +41:41 Vector dimension impacts comparison speed, indexing complexity high.\ +46:53 Overlapping intervals and subgraphs for precision.\ +53:18 Postgres limits scalability, additional indexing engines provide faster queries.\ +59:55 Embedding models for time series data explained.\ +01:02:01 Cheaper system for serving billion vectors. 
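Before the critique of SPARTA that follows, a small numerical sketch of the scoring idea described above may help: for every vocabulary token, take the maximum dot product against the contextualized document tokens, zero out anything below a threshold, and score a query by summing the entries of its tokens. The embeddings and the threshold here are random stand-ins, not the model's learned parameters.

```python
import numpy as np

rng = np.random.default_rng(0)
dim = 8
vocab = ["pizza", "bank", "music", "film"]           # stand-in vocabulary
vocab_embeddings = rng.normal(size=(len(vocab), dim))
doc_token_embeddings = rng.normal(size=(5, dim))     # contextualized tokens of one document
threshold = 0.5                                      # stands in for the learned cut-off

# For each vocabulary token, keep its best match against any document token.
scores = (vocab_embeddings @ doc_token_embeddings.T).max(axis=1)
doc_vector = np.where(scores > threshold, scores, 0.0)   # sparse document representation

# At query time, the document score is the sum of the entries for the query tokens.
query_tokens = ["music", "film"]
doc_score = sum(doc_vector[vocab.index(t)] for t in query_tokens)
print(doc_vector, doc_score)
```

Because the per-vocabulary scores are fixed once the document is indexed, query-time scoring reduces to a few lookups and a sum, which is what makes the approach compatible with an inverted index.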
-- The SPARTA model is not sparse enough by construction, so authors of the SPLADE family of models introduced explicit **sparsity regularisation**, -preventing the model from producing too many non-zero values. -- The SPARTA model mostly uses the BERT model as-is, without any additional neural network to capture the specifity of Information Retrieval problem, -so SPLADE models introduce a trainable neural network on top of BERT with a specific architecture choice to make it perfectly fit the task. -- SPLADE family of models, finally, uses **knowledge distillation**, which is learning from a bigger -(and therefore much slower, not-so-fit for production tasks) model how to predict good representations. +## More Quotes from Andrey: -One of the last versions of the SPLADE family of models is [`SPLADE++`](https://arxiv.org/pdf/2205.04733). +*"It allows us to compress vector to a level where a single dimension is represented by just a single bit, which gives total of 32 times compression for the vector."*\ +-- Andrey Vasnetsov on vector compression in AI -SPLADE++, opposed to SPARTA model, expands not only documents but also queries at inference time. -We’ll demonstrate this in the next section. +*"We build overlapping intervals and we build these subgraphs with additional links for those intervals. And also we can do the same with, let's say, location data where we have geocoordinates, so latitude, longitude, we encode it into geo hashes and basically build this additional graph for overlapping geo hash regions."*\ +-- Andrey Vasnetsov -## [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#splade-in-qdrant) SPLADE++ in Qdrant +*"We can further compress data using such techniques as delta encoding, as variable byte encoding, and so on. And this total effect, total combined effect of this optimization can make immutable data structures order of minute more efficient than mutable ones."*\ +-- Andrey Vasnetsov -In Qdrant, you can use [`SPLADE++`](https://arxiv.org/pdf/2205.04733) easily with our lightweight library for embeddings called [FastEmbed](https://qdrant.tech/documentation/fastembed/). +<|page-388-lllmstxt|> +> *"I really think it's something the technology is ready for and would really help this kind of embedding model jumping onto the text search projects.”*\ +-- NoĂ© Achache on the future of image embedding +> -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#setup) Setup +Exploring the depths of vector search? Want an analysis of its application in image search and document retrieval? NoĂ© got you covered. -Install `FastEmbed`. +NoĂ© Achache is a Lead Data Scientist at Sicara, where he worked on a wide range of projects mostly related to computer vision, prediction with structured data, and more recently LLMs. -```python -pip install fastembed +***Listen to the episode on [Spotify](https://open.spotify.com/episode/2YgcSFjP7mKE0YpDGmSiq5?si=6BhlAMveSty4Yt7umPeHjA), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/1vKoiFAdorE).*** -``` + -Import sparse text embedding models supported in FastEmbed. + -```python -from fastembed import SparseTextEmbedding +## **Top Takeaways:** -``` +Discover the efficacy of Dino V2 in image representation and the complexities of deploying vector databases, while navigating the challenges of fine-tuning and data safety in sensitive fields. -You can list all sparse text embedding models currently supported. 
+In this episode, Noe, shares insights on vector search from image search to retrieval augmented generation, emphasizing practical application in complex projects. -```python -SparseTextEmbedding.list_supported_models() +5 key insights you’ll learn: -``` +1. Cutting-edge Image Search: Learn about the advanced model Dino V2 and its efficacy in image representation, surpassing traditional feature transform methods. +2. Data Deduplication Strategies: Gain knowledge on the sophisticated process of deduplicating real estate listings, a vital task in managing extensive data collections. +3. Document Retrieval Techniques: Understand the challenges and solutions in retrieval augmented generation for document searches, including the use of multi-language embedding models. +4. Protection of Sensitive Medical Data: Delve into strategies for handling confidential medical information and the importance of data safety in health-related applications. +5. The Path Forward in Model Development: Hear Noe discuss the pressing need for new types of models to address the evolving needs within the industry. -Output with a list of supported models +> Fun Fact: The best-performing model NoĂ© mentions for image representation in his image search project is Dino V2, which interestingly didn't require fine-tuning to understand objects and patterns. +> -```bash -[{'model': 'prithivida/Splade_PP_en_v1',\ - 'vocab_size': 30522,\ - 'description': 'Independent Implementation of SPLADE++ Model for English',\ - 'size_in_GB': 0.532,\ - 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},\ - 'model_file': 'model.onnx'},\ - {'model': 'prithvida/Splade_PP_en_v1',\ - 'vocab_size': 30522,\ - 'description': 'Independent Implementation of SPLADE++ Model for English',\ - 'size_in_GB': 0.532,\ - 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},\ - 'model_file': 'model.onnx'},\ - {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',\ - 'vocab_size': 30522,\ - 'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',\ - 'size_in_GB': 0.09,\ - 'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'},\ - 'model_file': 'model.onnx',\ - 'additional_files': ['stopwords.txt'],\ - 'requires_idf': True},\ - {'model': 'Qdrant/bm25',\ - 'description': 'BM25 as sparse embeddings meant to be used with Qdrant',\ - 'size_in_GB': 0.01,\ - 'sources': {'hf': 'Qdrant/bm25'},\ - 'model_file': 'mock.file',\ - 'additional_files': ['arabic.txt',\ - 'azerbaijani.txt',\ - 'basque.txt',\ - 'bengali.txt',\ - 'catalan.txt',\ - 'chinese.txt',\ - 'danish.txt',\ - 'dutch.txt',\ - 'english.txt',\ - 'finnish.txt',\ - 'french.txt',\ - 'german.txt',\ - 'greek.txt',\ - 'hebrew.txt',\ - 'hinglish.txt',\ - 'hungarian.txt',\ - 'indonesian.txt',\ - 'italian.txt',\ - 'kazakh.txt',\ - 'nepali.txt',\ - 'norwegian.txt',\ - 'portuguese.txt',\ - 'romanian.txt',\ - 'russian.txt',\ - 'slovene.txt',\ - 'spanish.txt',\ - 'swedish.txt',\ - 'tajik.txt',\ - 'turkish.txt'],\ - 'requires_idf': True}] +## Show Notes: -``` +00:00 Relevant experience in vector DB projects and talks.\ +05:57 Match image features, not resilient to changes.\ +07:06 Compute crop vectors, and train to converge.\ +11:37 Simple training task, improve with hard examples.\ +15:25 Improving text embeddings using hard examples.\ +22:29 Future of image embedding for document search.\ +27:28 Efficient storage and retrieval process feature.\ +29:01 Models handle varied data; sparse vectors now possible.\ +35:59 Use memory, avoid disk for CI integration.\ +37:43 Challenging metadata 
filtering for vector databases and new models -Load SPLADE++. +## More Quotes from NoĂ©: -```python -sparse_model_name = "prithivida/Splade_PP_en_v1" -sparse_model = SparseTextEmbedding(model_name=sparse_model_name) +*"So basically what was great is that Dino manages to understand all objects and close patterns without fine tuning. So you can get an off the shelf model and get started very quickly and start bringing value very quickly without having to go through all the fine tuning processes.”*\ +-- NoĂ© Achache -``` +*"And at the end, the embeddings was not learning any very complex features, so it was not really improving it.”*\ +-- NoĂ© Achache -The model files will be fetched and downloaded, with progress showing. +*"When using an API model, it's much faster to use it in asynchronous mode like the embedding equation went something like ten times or 100 times faster. So it was definitely, it changed a lot of things.”*\ +-- NoĂ© Achache -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#embed-data) Embed data +## Transcript: +Demetrios: +Noe. Great to have you here everyone. We are back for another vector space talks and today we are joined by my man Noe, who is the lead data scientist at Sicara, and if you do not know, he is working on a wide range of projects, mostly related to computer vision. Vision. And today we are talking about navigating the complexities of vector search. We're going to get some practical insights from diverse projects in image search and everyone's favorite topic these days, retrieval augmented generation, aka rags. So noe, I think you got something for us. You got something planned for us here? -We will use a toy movie description dataset. +Noe Acache: +Yeah, I do. I can share them. -Movie description dataset - -```python -descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.",\ - "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.",\ - "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.",\ - "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.",\ - "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.",\ - "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.",\ - "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. 
Only a young female therapist, Paprika, can stop it.",\ - "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.",\ - "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.",\ - "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.",\ - "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).",\ - "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.",\ - "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.",\ - "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.",\ - "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.",\ - "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.",\ - "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.",\ - "Story of 40-man Turkish task force who must defend a relay station.",\ - "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.",\ - "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."] +Demetrios: +All right, well, I'm very happy to have you on here, man. I appreciate you doing this. And let's get you sharing your screen so we can start rocking, rolling. -``` +Noe Acache: +Okay. Can you see my screen? -Embed movie descriptions with SPLADE++. +Demetrios: +Yeah. Awesome. -```python -sparse_descriptions = list(sparse_model.embed(descriptions)) +Noe Acache: +Great. Thank you, Demetrius, for the great introduction. I just completed quickly. So as you may have guessed, I'm french. I'm a lead data scientist at Sicara. So Secura is a service company helping its clients in data engineering and data science, so building projects for them. Before being there, I worked at realtics on optical character recognition, and I'm now working mostly on, as you said, computer vision and also Gen AI. So I'm leading the geni side and I've been there for more than three years. -``` +Noe Acache: +So some relevant experience on vector DB is why I'm here today, because I did four projects, four vector soft projects, and I also wrote an article on how to choose your database in 2023, your vector database. And I did some related talks in other conferences like Pydata, DVC, all the geni meetups of London and Paris. So what are we going to talk about today? First, an overview of the vector search projects. Just to give you an idea of the kind of projects we can do with vector search. Then we will dive into the specificities of the image search project and then into the specificities of the text search project. 
So here are the four projects. So two in image search, two in text search. The first one is about matching objects in videos to sell them afterwards. -You can check how a sparse vector generated by SPLADE++ looks in Qdrant. +Noe Acache: +So basically you have a video. We first detect the object. So like it can be a lamp, it can be a piece of clothes, anything, we classify it and then we compare it to a large selection of similar objects to retrieve the most similar one to a large collection of sellable objects. The second one is about deduplicating real estate adverts. So when agencies want to sell a property, like sometimes you have several agencies coming to take pictures of the same good. So you have different pictures of the same good. And the idea of this project was to match the different pictures of the same good, the same profile. -```python -sparse_descriptions[0] +Demetrios: +I've seen that dude. I have been a victim of that. When I did a little house shopping back like five years ago, it would be the same house in many different ones, and sometimes you wouldn't know because it was different photos. So I love that you were thinking about it that way. Sorry to interrupt. -``` +Noe Acache: +Yeah, so to be fair, it was the idea of my client. So basically I talk about it a bit later with aggregating all the adverts and trying to deduplicate them. And then the last two projects are about drugs retrieval, augmented generation. So the idea to be able to ask questions to your documentation. The first one was for my company's documentation and the second one was for a medical company. So different kind of complexities. So now we know all about this project, let's dive into them. So regarding the image search project, to compute representations of the images, the best performing model from the benchmark, and also from my experience, is currently Dino V two. -It is stored as **indices** of BERT tokens, weights of which are non-zero, and **values** of these weights. -```bash -SparseEmbedding( - values=array([1.57449973, 0.90787691, ..., 1.21796167, 1.1321187]), - indices=array([ 1040, 2001, ..., 28667, 29137]) -) +Noe Acache: +So a model developed by meta that you may have seen, which is using visual transformer. And what's amazing about it is that using the attention map, you can actually segment what's important in the picture, although you haven't told it specifically what's important. And as a human, it will learn to focus on the dog, on this picture and do not take into consideration the noisy background. So when I say best performing model, I'm talking about comparing to other architecture like Resnet efficient nets models, an approach I haven't tried, which also seems interesting. If anyone tried it for similar project, please reach out afterwards. I'll be happy to talk about it. Is sift for feature transform something about feature transform. It's basically a more traditional method without learned features through machine learning, as in you don't train the model, but it's more traditional methods. -``` +Noe Acache: +And you basically detect the different features in an image and then try to find the same features in an image which is supposed to post to be the same. All the blue line trying to match the different features. Of course it's made to match image with exactly the same content, so it wouldn't really work. Probably not work in the first use case, because we are trying to match similar clothes, but which are not exactly the same one. 
And also it's known to be not very resilient with the changes of angles when it changes too much, et cetera. So it may not be very good as well for the second use case, but again, I haven't tried it, so just leaving it here on the side. Just a quick word about how Dino works in case you're interested. So it's a vision transformer and it's trade in an unsupervised way, as in you don't have any labels provided, so you just take pictures and you first extract small crops and large crops and you augment them. -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#upload-embeddings-to-qdrant) Upload Embeddings to Qdrant +Noe Acache: +And then you're going to use the model to compute vectors, representations of each of these crops. And since they all represent the same image, they should all be the same. So then you can compute a loss to see how they diverge and to basically train them to become the same. So this is how it works and how it works. And the difference between the second version is just that they use more data sets and the distillation method to have a very performant model, which is also very fast to run regarding the first use case. So, matching objects in videos to sellable items for people who use Google lengths before, it's quite similar, where in Google lens you can take a picture of something and then it will try to find similar objects to buy. So again, you have a video and then you detect one of the objects in the video, put it and compare it to a vector database which contains a lot of objects which are similar for the representation. And then it will output the most similar lamp here. -Install `qdrant-client` +Noe Acache: +Now we're going to try to analyze how this project went regarding the positive outcomes and the changes we faced. So basically what was great is that Dino manages to understand all objects and close patterns without fine tuning. So you can get an off the shelf model and get started very quickly and start bringing value very quickly without having to go through all the fine tuning processes. And it also manages to focus on the object without segmentation. What I mean here is that we're going to get a box of the object, and in this box there will be a very noisy background which may disturb the matching process. And since Dino really manages to focus on the object, that's important on the image. It doesn't really matter that we don't segmentate perfectly the image. Regarding the vector database, this project started a while ago, and I think we chose the vector database something like a year and a half ago. -```python -pip install qdrant-client +Noe Acache: +And so it was before all the vector database hype. And at the time, the most famous one was Milvos, the only famous one actually. And we went for an on premise development deployment. And actually our main learning is that the DevOps team really struggled to deploy it, because basically it's made of a lot of pods. And the documentations about how these pods are supposed to interact together is not really perfect. And it was really buggy at this time. So the clients lost a lot of time and money in this deployment. The challenges, other challenges we faced is that we noticed that the matching wasn't very resilient to large distortions. -``` +Noe Acache: +So for furnitures like lamps, it's fine. But let's say you have a trouser and a person walking. So the trouser won't exactly have the same shape. 
And since you haven't trained your model to specifically know, it shouldn't focus on the movements. It will encode this movement. And then in the matching, instead of matching trouser, which looks similar, it will just match trouser where in the product picture the person will be working as well, which is not really what we want. And the other challenges we faced is that we tried to fine tune the model, but our first fine tuning wasn't very good because we tried to take an open source model and, and get the labels it had, like on different furnitures, clothes, et cetera, to basically train a model to classify the different classes and then remove the classification layer to just keep the embedding parts. The thing is that the labels were not specific enough. -Qdrant Client has a simple in-memory mode that allows you to experiment locally on small data volumes. -Alternatively, you could use for experiments [a free tier cluster](https://qdrant.tech/documentation/cloud/create-cluster/#create-a-cluster) -in Qdrant Cloud. +Noe Acache: +So the training task was quite simple. And at the end, the embeddings was not learning any very complex features, so it was not really improving it. So jumping onto the areas of improvement, knowing all of that, the first thing I would do if I had to do it again will be to use the managed milboss for a better fine tuning, it would be to labyd hard examples, hard pairs. So, for instance, you know that when you have a matching pair where the similarity score is not too high or not too low, you know, it's where the model kind of struggles and you will find some good matching and also some mistakes. So it's where it kind of is interesting to level to then be able to fine tune your model and make it learn more complex things according to your tasks. Another possibility for fine tuning will be some sort of multilabel classification. So for instance, if you consider tab close, you could say, all right, those disclose contain buttons. It have a color, it have stripes. -```python -from qdrant_client import QdrantClient, models -qdrant_client = QdrantClient(":memory:") # Qdrant is running from RAM. +Noe Acache: +And for all of these categories, you'll get a score between zero and one. And concatenating all these scores together, you can get an embedding which you can put in a vector database for your vector search. It's kind of hard to scale because you need to do a specific model and labeling for each type of object. And I really wonder how Google lens does because their algorithm work very well. So are they working more like with this kind of functioning or this kind of functioning? So if anyone had any thought on that or any idea, again, I'd be happy to talk about it afterwards. And finally, I feel like we made a lot of advancements in multimodal training, trying to combine text inputs with image. We've made input to build some kind of complex embeddings. And how great would it be to have an image embeding you could guide with text. -``` +Noe Acache: +So you could just like when creating an embedding of your image, just say, all right, here, I don't care about the movements, I only care about the features on the object, for instance. And then it will learn an embedding according to your task without any fine tuning. I really feel like with the current state of the arts we are able to do this. I mean, we need to do it, but the technology is ready. 
-Now, let’s create a [collection](https://qdrant.tech/documentation/concepts/collections/) in which could upload our sparse SPLADE++ embeddings. +Demetrios: +Can I ask a few questions before you jump into the second use case? -For that, we will use the [sparse vectors](https://qdrant.tech/documentation/concepts/vectors/#sparse-vectors) representation supported in Qdrant. +Noe Acache: +Yes. -```python -qdrant_client.create_collection( - collection_name="movies", - vectors_config={}, - sparse_vectors_config={ - "film_description": models.SparseVectorParams(), - }, -) +Demetrios: +What other models were you looking at besides the dyno one? -``` +Noe Acache: +I said here, compared to Resnet, efficient nets and these kind of architectures. -To make this collection human-readable, let’s save movie metadata (name, description and movie’s length) together with an embeddings. +Demetrios: +Maybe this was too early, or maybe it's not actually valuable. Was that like segment anything? Did that come into the play? -Movie metadata +Noe Acache: +So segment anything? I don't think they redo embeddings. It's really about segmentation. So here I was just showing the segmentation part because it's a cool outcome of the model and it shows that the model works well here we are really here to build a representation of the image we cannot really play with segment anything for the matching, to my knowledge, at least. -```python -metadata = [{"movie_name": "The Passion of Joan of Arc", "movie_watch_time_min": 114, "movie_description": "In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions."},\ -{"movie_name": "Sherlock Jr.", "movie_watch_time_min": 45, "movie_description": "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch."},\ -{"movie_name": "Heat", "movie_watch_time_min": 170, "movie_description": "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist."},\ -{"movie_name": "Kagemusha", "movie_watch_time_min": 162, "movie_description": "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place."},\ -{"movie_name": "Kubo and the Two Strings", "movie_watch_time_min": 101, "movie_description": "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past."},\ -{"movie_name": "Sardar Udham", "movie_watch_time_min": 164, "movie_description": "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre."},\ -{"movie_name": "Paprika", "movie_watch_time_min": 90, "movie_description": "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. 
Only a young female therapist, Paprika, can stop it."},\ -{"movie_name": "After Hours", "movie_watch_time_min": 97, "movie_description": "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop."},\ -{"movie_name": "Udta Punjab", "movie_watch_time_min": 148, "movie_description": "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline."},\ -{"movie_name": "Philomena", "movie_watch_time_min": 98, "movie_description": "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent."},\ -{"movie_name": "Neon Genesis Evangelion: The End of Evangelion", "movie_watch_time_min": 87, "movie_description": "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995)."},\ -{"movie_name": "The Dirty Dozen", "movie_watch_time_min": 150, "movie_description": "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers."},\ -{"movie_name": "Toy Story 3", "movie_watch_time_min": 103, "movie_description": "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home."},\ -{"movie_name": "Edge of Tomorrow", "movie_watch_time_min": 113, "movie_description": "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies."},\ -{"movie_name": "Some Like It Hot", "movie_watch_time_min": 121, "movie_description": "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in."},\ -{"movie_name": "Snow White and the Seven Dwarfs", "movie_watch_time_min": 83, "movie_description": "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household."},\ -{"movie_name": "It Happened One Night", "movie_watch_time_min": 105, "movie_description": "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops."},\ -{"movie_name": "Nefes: Vatan Sagolsun", "movie_watch_time_min": 128, "movie_description": "Story of 40-man Turkish task force who must defend a relay station."},\ -{"movie_name": "This Is Spinal Tap", "movie_watch_time_min": 82, "movie_description": "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour."},\ -{"movie_name": "Let the Right One In", "movie_watch_time_min": 114, "movie_description": "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."}] +Demetrios: +And then on the next slide where you talked about things you would do differently, or the last slide, I guess the areas of improvement you mentioned label hard examples for fine tuning. And I feel like, yeah, there's one way of doing it, which is you hand picking the different embeddings that you think are going to be hard. 
And then there's another one where I think there's tools out there now that can kind of show you where there are different embeddings that aren't doing so well or that are more edge cases. -``` +Noe Acache: +Which tools are you talking about? -Upload embedded descriptions with movie metadata into the collection. +Demetrios: +I don't remember the names, but I definitely have seen demos online about how it'll give you a 3d space and you can kind of explore the different embeddings and explore what's going on I. -```python -qdrant_client.upsert( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload=metadata[idx],\ - vector={\ - "film_description": models.SparseVector(\ - indices=vector.indices,\ - values=vector.values\ - )\ - },\ - )\ - for idx, vector in enumerate(sparse_descriptions)\ - ], -) +Noe Acache: +Know exactly what you're talking about. So tensorboard embeddings is a good tool for that. I could actually demo it afterwards. -``` +Demetrios: +Yeah, I don't want to get you off track. That's something that came to mind if. -Implicitly generate sparse vectors (Click to expand) +Noe Acache: +You'Re talking about the same tool. Turns out embedding. So basically you have an embedding of like 1000 dimensions and it just reduces it to free dimensions. And so you can visualize it in a 3d space and you can see how close your embeddings are from each other. -```python -qdrant_client.upsert( - collection_name="movies", - points=[\ - models.PointStruct(\ - id=idx,\ - payload=metadata[idx],\ - vector={\ - "film_description": models.Document(\ - text=description, model=sparse_model_name\ - )\ - },\ - )\ - for idx, description in enumerate(descriptions)\ - ], -) +Demetrios: +Yeah, exactly. -``` +Noe Acache: +But it's really for visualization purposes, not really for training purposes. -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#querying) Querying +Demetrios: +Yeah, okay, I see. -Let’s query our collection! +Noe Acache: +Talking about the same thing. -```python -query_embedding = list(sparse_model.embed("A movie about music"))[0] +Demetrios: +Yeah, I think that sounds like what I'm talking about. So good to know on both of these. And you're shooting me straight on it. Mike is asking a question in here, like text embedding, would that allow you to include an image with alternate text? -response = qdrant_client.query_points( - collection_name="movies", - query=models.SparseVector(indices=query_embedding.indices, values=query_embedding.values), - using="film_description", - limit=1, - with_vectors=True, - with_payload=True -) -print(response) +Noe Acache: +An image with alternate text? I'm not sure the question. -``` +Demetrios: +So it sounds like a way to meet regulatory accessibility requirements if you have. I think it was probably around where you were talking about the multimodal and text to guide the embeddings and potentially would having that allow you to include an image with alternate text? -Implicitly generate sparse vectors (Click to expand) +Noe Acache: +The idea is not to. I feel like the question is about inserting text within the image. It's what I understand. My idea was just if you could create an embedding that could combine a text inputs and the image inputs, and basically it would be trained in such a way that the text would basically be used as a guidance of the image to only encode the parts of the image which are required for your task to not be disturbed by the noisy. 
-```python -response = qdrant_client.query_points( - collection_name="movies", - query=models.Document(text="A movie about music", model=sparse_model_name), - using="film_description", - limit=1, - with_vectors=True, - with_payload=True, -) -print(response) +Demetrios: +Okay. Yeah. All right, Mike, let us know if that answers the question or if you have more. Yes. He's saying, yeah, inserting text with image for people who can't see. -``` +Noe Acache: +Okay, cool. -Output looks like this: +Demetrios: +Yeah, right on. So I'll let you keep cruising and I'll try not to derail it again. But that was great. It was just so pertinent. I wanted to stop you and ask some questions. -```bash -points=[ScoredPoint(\ - id=18,\ - version=0,\ - score=9.6779785,\ - payload={\ - 'movie_name': 'This Is Spinal Tap',\ - 'movie_watch_time_min': 82,\ - 'movie_description': "Spinal Tap, one of England's loudest bands,\ - is chronicled by film director Marty DiBergi on what proves to be a fateful tour."\ - },\ - vector={\ - 'film_description': SparseVector(\ - indices=[1010, 2001, ..., 25316, 25517],\ - values=[0.49717945, 0.19760133, ..., 1.2124698, 0.58689135])\ - },\ - shard_key=None,\ - order_value=None\ -)] +Noe Acache: +Larry, let's just move in. So second use case is about deduplicating real estate adverts. So as I was saying, you have two agencies coming to take different pictures of the same property. And the thing is that they may not put exactly the same price or the same surface or the same location. So you cannot just match them with metadata. So what our client was doing beforehand, and he kind of built a huge if machine, which is like, all right, if the location is not too far and if the surface is not too far. And the price, and it was just like very complex rules. And at the end there were a lot of edge cases. -``` +Noe Acache: +It was very hard to maintain. So it was like, let's just do a simpler solution just based on images. So it was basically the task to match images of the same properties. Again on the positive outcomes is that the dino really managed to understand the patterns of the properties without any fine tuning. And it was resilient to read different angles of the same room. So like on the pictures I shown, I just showed, the model was quite good at identifying. It was from the same property. Here we used cudrant for this project was a bit more recent. -As you can see, there are no overlapping words in the query and a description of a found movie, -even though the answer fits the query, and yet we’re working with **exact matching**. +Noe Acache: +We leveraged a lot the metadata filtering because of course we can still use the metadata even it's not perfect just to say, all right, only search vectors, which are a price which is more or less 10% this price. The surface is more or less 10% the surface, et cetera, et cetera. And indexing of this metadata. Otherwise the search is really slowed down. So we had 15 million vectors and without this indexing, the search could take up to 20, 30 seconds. And with indexing it was like in a split second. So it was a killer feature for us. And we use quantization as well to save costs because the task was not too hard. -This is possible due to the **internal expansion** of the query and the document that SPLADE++ does. +Noe Acache: +Since using the metadata we managed to every time reduce the task down to a search of 1000 vector. So it wasn't too annoying to quantize the vectors. 
And at the end for 15 million vectors, it was only $275 per month, which with the village version, which is very decent. The challenges we faced was really about bathrooms and empty rooms because all bathrooms kind of look similar. They have very similar features and same for empty rooms since there is kind of nothing in them, just windows. The model would often put high similarity scores between two bathroom of different properties and same for the empty rooms. So again, the method to overcome this thing will be to label harpers. So example were like two images where the model would think they are similar to actually tell the model no, they are not similar to allow it to improve its performance. -#### [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#internal-expansion-by-splade) Internal Expansion by SPLADE++ +Noe Acache: +And again, same thing on the future of image embedding. I really think it's something the technology is ready for and would really help this kind of embedding model jumping onto the text search projects. So the principle of retribution generation for those of you who are not familiar with it is just you take some documents, you have an embedding model here, an embedding model trained on text and not on images, which will output representations from these documents, put it in a vector database, and then when a user will ask a question over the documentation, it will create an embedding of the request and retrieve the most similar documents. And afterwards we usually pass it to an LLM, which will generate an answer. But here in this talk, we won't focus on the overall product, but really on the vector search part. So the two projects was one, as I told you, a rack for my nutrition company, so endosion with around a few hundred thousand of pages, and the second one was for medical companies, so for the doctors. So it was really about the documentation search rather than the LLM, because you cannot output any mistake. The model we used was OpenAI Ada two. -Let’s check how did SPLADE++ expand the query and the document we got as an answer. +Noe Acache: +Why? Mostly because for the first use case it's multilingual and it was off the shelf, very easy to use, so we did not spend a lot of time on this project. So using an API model made it just much faster. Also it was multilingual, approved by the community, et cetera. For the second use case, we're still working on it. So since we use GPT four afterwards, because it's currently the best LLM, it was also easier to use adatu to start with, but we may use a better one afterwards because as I'm saying, it's not the best one if you refer to the MTAB. So the massive text embedding benchmark made by hugging face, which basically gathers a lot of embeddings benchmark such as retrieval for instance, and so classified the different model for these benchmarks. The M tab is not perfect because it's not taking into account cross language capabilities. All the benchmarks are just for one language and it's not as well taking into account most of the languages, like it's only considering English, Polish and Chinese. -For that, we will need to use the HuggingFace library called [Tokenizers](https://huggingface.co/docs/tokenizers/en/index). -With it, we will be able to decode back to human-readable format **indices** of words in a vocabulary SPLADE++ uses. +Noe Acache: +And also it's probably biased for models trained on close source data sets. 
So like most of the best performing models are currently closed source APIs and hence closed source data sets, and so we don't know how they've been trained. So they probably trained themselves on these data sets. At least if I were them, it's what I would do. So I assume they did it to gain some points in these data sets. -Firstly we will need to install this library. +Demetrios: +So both of these rags are mainly with documents that are in French? -```python -pip install tokenizers +Noe Acache: +Yes. So this one is French and English, and this one is French only. -``` +Demetrios: +Okay. Yeah, that's why the multilingual is super important for these use cases. -Then, let’s write a function which will decode SPLADE++ sparse embeddings and return words SPLADE++ uses for encoding the input. +Noe Acache: +Exactly. Again, for this one there are models for French working much better than other two, so we may change it afterwards, but right now the performance we have is decent. Since both projects are very similar, I'll jump into the conclusion for both of them together. So Ada two is good for understanding diverse context, wide range of documentation, medical contents, technical content, et cetera, without any fine tuning. The cross language works quite well, so we can ask questions in English and retrieve documents in French and the other way around. And also, quick note, because I did not do it from the start, is that when using an API model, it's much faster to use it in asynchronous mode like the embedding equation went something like ten times or 100 times faster. So it was definitely, it changed a lot of things. Again, here we use cudrant mostly to leverage the free tier so they have a free version. -We would like to return them in the descending order based on the weight ( **impact score**), SPLADE++ assigned them. +Noe Acache: +So you can pop it in a second, get the free version, and using the feature which allows to put the vectors on disk instead of storing them on ram, which makes it a bit slower, you can easily support few hundred thousand of vectors and with a very decent response time. The challenge we faced is that mostly for the notion, so like mostly in notion, we have a lot of pages which are just a title because they are empty, et cetera. And so when pages have just a title, the content is so small that it will be very similar actually to a question. So often the documents were retrieved were document with very little content, which was a bit frustrating. Chunking appropriately was also tough. Basically, if you want your retrieval process to work well, you have to divide your documents the right way to create the embeddings. So you can use matrix rules, but basically you need to divide your documents in content which semantically makes sense and it's not always trivial. And also for the rag, for the medical company, sometimes we are asking questions about a specific drug and it's just not under our search is just not retrieving the good documents, which is very frustrating because a basic search would. -```python -from tokenizers import Tokenizer +Noe Acache: +So to handle these changes, a good option would be to use models handing differently question and documents like Bg or cohere. Basically they use the same model but trained differently on long documents and questions which allow them to map them differently in the space. 
And my guess is that using such model documents, which are only a title, et cetera, will not be as close as the question as they are right now because they will be considered differently. So I hope it will help this problem. Again, it's just a guess, maybe I'm wrong. Heap research so for the keyword problem I was mentioning here, so in the recent release, Cudran just enabled sparse vectors which make actually TFEdev vectors possible. The TFEDEF vectors are vectors which are based on keywords, but basically there is one number per possible word in the data sets, and a lot of zeros, so storing them as a normal vector will make the vector search very expensive. But as a sparse vector it's much better. -tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1') +Noe Acache: +And so you can build a debrief search combining the TFDF search for keyword search and the other search for semantic search to get the best of both worlds and overcome this issue. And finally, I'm actually quite surprised that with all the work that is going on, generative AI and rag, nobody has started working on a model to help with chunking. It's like one of the biggest challenge, and I feel like it's quite doable to have a model which will our model, or some kind of algorithm which will understand the structure of your documentation and understand why it semantically makes sense to chunk your documents. Dude, so good. -def get_tokens_and_weights(sparse_embedding, tokenizer): - token_weight_dict = {} - for i in range(len(sparse_embedding.indices)): - token = tokenizer.decode([sparse_embedding.indices[i]]) - weight = sparse_embedding.values[i] - token_weight_dict[token] = weight +Demetrios: +I got questions coming up. Don't go anywhere. Actually, it's not just me. Tom's also got some questions, so I'm going to just blame it on Tom, throw him under the bus. Rag with medical company seems like a dangerous use case. You can work to eliminate hallucinations and other security safety concerns, but you can't make sure that they're completely eliminated, right? You can only kind of make sure they're eliminated. And so how did you go about handling these concerns? - # Sort the dictionary by weights - token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)) - return token_weight_dict +Noe Acache: +This is a very good question. This is why I mentioned this project is mostly about the document search. Basically what we do is that we use chainlit, which is a very good tool for chatting, and then you can put a react front in front of it to make it very custom. And so when the user asks a question, we provide the LLM answer more like as a second thought, like something the doctor could consider as a fagon thought. But what's the most important is that we directly put the, instead of just citing the sources, we put the HTML of the pages the source is based on, and what bring the most value is really these HTML pages. And so we know the answer may have some problems. The fact is, based on documents, hallucinations are almost eliminated. Like, we don't notice any hallucinations, but of course they can happen. -``` +Noe Acache: +So it's really the way, it's really a product problem rather than an algorithm problem, an algorithmic problem, yeah. The documents retrieved rather than the LLM answer. -Firstly, we apply our function to the query. +Demetrios: +Yeah, makes sense. My question around it is a lot of times in the medical space, the data that is being thrown around is super sensitive. 
Right. And you have a lot of Pii. How do you navigate that? Are you just not touching that? -```python -query_embedding = list(sparse_model.embed("A movie about music"))[0] -print(get_tokens_and_weights(query_embedding, tokenizer)) +Noe Acache: +So basically we work with a provider in front which has public documentation. So it's public documentation. There is no PII. -``` +Demetrios: +Okay, cool. So it's not like some of it. -That’s how SPLADE++ expanded the query: +Noe Acache: +Is private, but still there is no PII in the documents. -```bash -{ - "music": 2.764289617538452, - "movie": 2.674748420715332, - "film": 2.3489091396331787, - "musical": 2.276120901107788, - "about": 2.124547004699707, - "movies": 1.3825485706329346, - "song": 1.2893378734588623, - "genre": 0.9066758751869202, - "songs": 0.8926399946212769, - "a": 0.8900706768035889, - "musicians": 0.5638002157211304, - "sound": 0.49310919642448425, - "musician": 0.46415239572525024, - "drama": 0.462990403175354, - "tv": 0.4398191571235657, - "book": 0.38950803875923157, - "documentary": 0.3758136034011841, - "hollywood": 0.29099565744400024, - "story": 0.2697228491306305, - "nature": 0.25306591391563416, - "concerning": 0.205053448677063, - "game": 0.1546829640865326, - "rock": 0.11775632947683334, - "definition": 0.08842901140451431, - "love": 0.08636035025119781, - "soundtrack": 0.06807517260313034, - "religion": 0.053535860031843185, - "filmed": 0.025964470580220222, - "sounds": 0.0004048719711136073 -} +Demetrios: +Yeah, because I think that's another really incredibly hard problem is like, oh yeah, we're just sending all this sensitive information over to the IDA model to create embeddings with it. And then we also pass it through Chat GPT before we get it back. And next thing you know, that is the data that was used to train GPT five. And you can say things like create an unlimited poem and get that out of it. So it's super sketchy, right? -``` +Noe Acache: +Yeah, of course, one way to overcome that is to, for instance, for the notion project, it's our private documentation. We use Ada over Azure, which guarantees data safety. So it's quite a good workaround. And when you have to work with different level of security, if you deal with PII, a good way is to play with metadata. Depending on the security level of the person who has the question, you play with the metadata to output only some kind of documents. The database metadata. -Then, we apply our function to the answer. +Demetrios: +Excellent. Well, don't let me stop you. I know you had some conclusionary thoughts there. -```python -query_embedding = list(sparse_model.embed("A movie about music"))[0] +Noe Acache: +No, sorry, I was about to conclude anyway. So just to wrap it up, so we got some good models without any fine tuning. With the model, we tried to overcome them, to overcome these limitations we still faced. For MS search, fine tuning is required at the moment. There's no really any other way to overcome it otherwise. While for tech search, fine tuning is not really necessary, it's more like tricks which are required about using eBrid search, using better models, et cetera. So two kind of approaches, Qdrant really made a lot of things easy. For instance, I love the feature where you can use the database as a disk file. 
-response = qdrant_client.query_points(
-    collection_name="movies",
-    query=models.SparseVector(indices=query_embedding.indices, values=query_embedding.values),
-    using="film_description",
-    limit=1,
-    with_vectors=True,
-    with_payload=True
-)
+Noe Acache:
+You can even also use it in memory for CI integration and stuff. But for all my experimentations, et cetera, I just use it as a disk file because it's much easier to play with. I just like this feature. And then it allows you to use the same tool for your experiments and in production. When I was playing with Milvus, I had to use different tools for experimentation and for the database in production, which was making the technical stack a bit more complex. Sparse vectors for TF-IDF, as I was mentioning, which allow you to search based on keywords to make your retrieval much better. Managed deployment: again, we really struggled with the deployment of the, I mean, the DevOps team really struggled with the deployment of Milvus. And I feel like in most cases, except if you have some security requirements, it will be much cheaper to use the managed deployments rather than paying dev costs.
-print(get_tokens_and_weights(response.points[0].vector['film_description'], tokenizer))
+Noe Acache:
+And also with the free cloud and the vectors you get there, you can really do a lot of, or at least start, a lot of projects. And finally, the metadata filtering and indexing. So by the way, we fell into a small trap with indexing. It's recommended to index on your metadata before adding your vectors. Otherwise your performance may be impacted, so you may not retrieve the good vectors that you need. So it's an interesting thing to take into consideration.
-```
+Noe Acache:
+I know that metadata filtering is something quite hard to do for a vector database, so I don't really know how it works, but I assume there is a good reason for that. And finally, as I was mentioning before, in my view, new types of models are needed to answer industrial needs. So the models we were talking about: text guidance to make better image embeddings, and automatic chunking, like some kind of algorithm or model which will automatically chunk your documents appropriately. So thank you very much. If you still have questions, I'm happy to answer them. Here are my social media. If you want to reach out to me afterwards, all my writing and talks are gathered here if you're interested.
-Implicitly generate sparse vectors (Click to expand)
+Demetrios:
+Oh, I like how you did that. There is one question from Tom again, asking about if you did anything to handle images and tables within the documentation when you were doing those RAGs.
-```python
-response = qdrant_client.query_points(
-    collection_name="movies",
-    query=models.Document(text="A movie about music", model=sparse_model_name),
-    using="film_description",
-    limit=1,
-    with_vectors=True,
-    with_payload=True,
-)
+Noe Acache:
+No, I did not do anything for the images and for the tables. It depends: when they are well structured, I kept them, because the model manages to understand them. But for instance, we did a small PoC for the medical company where we tried to integrate some external data source, which was a PDF, and we wanted to use it as HTML to be able to display the HTML, as I explained, directly in the answer. So we converted the PDF to HTML and in this conversion, the tables were absolutely unreadable, even after cleaning. So we did not include them in this case.
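Noe's note above about creating payload indexes on your metadata before inserting vectors, and about using metadata to gate which documents different users can retrieve, roughly translates to the sketch below. The collection name, field names, and values are made-up placeholders, not the project's actual schema.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="medical_docs",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

# Create the payload index before inserting points, as recommended,
# so filtered searches stay fast and accurate.
client.create_payload_index(
    collection_name="medical_docs",
    field_name="security_level",
    field_schema=models.PayloadSchemaType.KEYWORD,
)

client.upsert(
    collection_name="medical_docs",
    points=[
        models.PointStruct(
            id=1,
            vector=[0.1] * 384,  # placeholder embedding
            payload={"security_level": "public", "source": "provider-docs"},
        )
    ],
)

# At query time, the caller's clearance decides which documents are even considered.
results = client.query_points(
    collection_name="medical_docs",
    query=[0.1] * 384,  # placeholder query embedding
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="security_level",
                match=models.MatchValue(value="public"),
            )
        ]
    ),
    limit=5,
)
```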
-print(get_tokens_and_weights(response.points[0].vector["film_description"], tokenizer)) +Demetrios: +Great. Well, dude, thank you so much for coming on here. And thank you all for joining us for yet another vector space talk. If you would like to come on to the vector space talk and share what you've been up to and drop some knowledge bombs on the rest of us, we'd love to have you. So please reach out to me. And I think that is it for today. Noe, this was awesome, man. I really appreciate you doing this. -``` +Noe Acache: +Thank you, Demetrius. Have a nice day. -And that’s how SPLADE++ expanded the answer. - -```python -{'spinal': 2.6548674, 'tap': 2.534881, 'marty': 2.223297, '##berg': 2.0402722, -'##ful': 2.0030282, 'fate': 1.935915, 'loud': 1.8381964, 'spine': 1.7507898, -'di': 1.6161551, 'bands': 1.5897619, 'band': 1.589473, 'uk': 1.5385966, 'tour': 1.4758654, -'chronicle': 1.4577943, 'director': 1.4423795, 'england': 1.4301306, '##est': 1.3025658, -'taps': 1.2124698, 'film': 1.1069428, '##berger': 1.1044296, 'tapping': 1.0424755, 'best': 1.0327196, -'louder': 0.9229055, 'music': 0.9056678, 'directors': 0.8887502, 'movie': 0.870712, 'directing': 0.8396196, -'sound': 0.83609974, 'genre': 0.803052, 'dave': 0.80212915, 'wrote': 0.7849579, 'hottest': 0.7594193, 'filmed': 0.750105, -'english': 0.72807616, 'who': 0.69502294, 'tours': 0.6833075, 'club': 0.6375339, 'vertebrae': 0.58689135, 'chronicles': 0.57296354, -'dance': 0.57278687, 'song': 0.50987065, ',': 0.49717945, 'british': 0.4971719, 'writer': 0.495709, 'directed': 0.4875775, -'cork': 0.475757, '##i': 0.47122696, '##band': 0.46837863, 'most': 0.44112885, '##liest': 0.44084555, 'destiny': 0.4264851, -'prove': 0.41789067, 'is': 0.40306947, 'famous': 0.40230379, 'hop': 0.3897451, 'noise': 0.38770816, '##iest': 0.3737782, -'comedy': 0.36903998, 'sport': 0.35883865, 'quiet': 0.3552795, 'detail': 0.3397654, 'fastest': 0.30345848, 'filmmaker': 0.3013101, -'festival': 0.28146765, '##st': 0.28040633, 'tram': 0.27373192, 'well': 0.2599603, 'documentary': 0.24368097, 'beat': 0.22953634, -'direction': 0.22925079, 'hardest': 0.22293334, 'strongest': 0.2018861, 'was': 0.19760133, 'oldest': 0.19532987, -'byron': 0.19360808, 'worst': 0.18397793, 'touring': 0.17598206, 'rock': 0.17319143, 'clubs': 0.16090117, -'popular': 0.15969758, 'toured': 0.15917331, 'trick': 0.1530599, 'celebrity': 0.14458777, 'musical': 0.13888633, -'filming': 0.1363699, 'culture': 0.13616633, 'groups': 0.1340591, 'ski': 0.13049376, 'venue': 0.12992987, -'style': 0.12853126, 'history': 0.12696269, 'massage': 0.11969914, 'theatre': 0.11673525, 'sounds': 0.108338095, -'visit': 0.10516077, 'editing': 0.078659914, 'death': 0.066746496, 'massachusetts': 0.055702563, 'stuart': 0.0447934, -'romantic': 0.041140396, 'pamela': 0.03561337, 'what': 0.016409796, 'smallest': 0.010815808, 'orchestra': 0.0020691194} +Demetrios: +We'll see you all later. Bye. -``` +<|page-389-lllmstxt|> +# How to Superpower Your Semantic Search Using a Vector Database with Nicolas Mauti -Due to the expansion both the query and the document overlap in “ _music_”, “ _film_”, “ _sounds_”, -and others, so **exact matching** works. 
+> *"We found a trade off between performance and precision in Qdrant’s that were better for us than what we can found on Elasticsearch.”*\ +> -- Nicolas Mauti +> -## [Anchor](https://qdrant.tech/articles/modern-sparse-neural-retrieval/\#key-takeaways-when-to-choose-sparse-neural-models-for-retrieval) Key Takeaways: When to Choose Sparse Neural Models for Retrieval +Want precision & performance in freelancer search? Malt's move to the Qdrant database is a masterstroke, offering geospatial filtering & seamless scaling. How did Nicolas Mauti and the team at Malt identify the need to transition to a retriever-ranker architecture for their freelancer matching app? -Sparse Neural Retrieval makes sense: +Nicolas Mauti, a computer science graduate from INSA Lyon Engineering School, transitioned from software development to the data domain. Joining Malt in 2021 as a data scientist, he specialized in recommender systems and NLP models within a freelancers-and-companies marketplace. Evolving into an MLOps Engineer, Nicolas adeptly combines data science, development, and ops knowledge to enhance model development tools and processes at Malt. Additionally, he has served as a part-time teacher in a French engineering school since 2020. Notably, in 2023, Nicolas successfully deployed Qdrant at scale within Malt, contributing to the implementation of a new matching system. -- In areas where keyword matching is crucial but BM25 is insufficient for initial retrieval, semantic matching (e.g., synonyms, homonyms) adds significant value. This is especially true in fields such as medicine, academia, law, and e-commerce, where brand names and serial numbers play a critical role. Dense retrievers tend to return many false positives, while sparse neural retrieval helps narrow down these false positives. +***Listen to the episode on [Spotify](https://open.spotify.com/episode/5aTPXqa7GMjekUfD8aAXWG?si=otJ_CpQNScqTK5cYq2zBow), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/OSZSingUYBM).*** -- Sparse neural retrieval can be a valuable option for scaling, especially when working with large datasets. It leverages exact matching using an inverted index, which can be fast depending on the nature of your data. + -- If you’re using traditional retrieval systems, sparse neural retrieval is compatible with them and helps bridge the semantic gap. + +## **Top Takeaways:** -##### Was this page useful? +Dive into the intricacies of [semantic search](https://qdrant.tech/documentation/tutorials/search-beginners/) enhancement with Nicolas Mauti, MLOps Engineer at Malt. Discover how Nicolas and his team at Malt revolutionize the way freelancers connect with projects. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +In this episode, Nicolas delves into enhancing semantics search at Malt by implementing a retriever-ranker architecture with multilingual transformer-based models, improving freelancer-project matching through a transition to [Qdrant](https://qdrant.tech/) that reduced latency from 10 seconds to 1 second and bolstering the platform's overall performance and scaling capabilities. -Thank you for your feedback! 🙏 +5 Keys to Learning from the Episode: -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/modern-sparse-neural-retrieval.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +1. **Performance Enhancement Tactics**: Understand the technical challenges Malt faced due to increased latency brought about by their expansion to over half a million freelancers and the solutions they enacted. +2. **Advanced Matchmaking Architecture**: Learn about the retriever-ranker model adopted by Malt, which incorporates semantic searching alongside a KNN search for better efficacy in pairing projects with freelancers. +3. **Cutting-Edge Model Training**: Uncover the deployment of a multilingual transformer-based encoder that effectively creates high-fidelity embeddings to streamline the matchmaking process. +4. **Database Selection Process**: Mauti discusses the factors that shaped Malt's choice of database systems, facilitating a balance between high performance and accurate filtering capabilities. +5. **Operational Improvements**: Gain knowledge of the significant strides Malt made post-deployment, including a remarkable reduction in application latency and its positive effects on scalability and matching quality. -On this page: +> Fun Fact: Malt employs a multilingual transformer-based encoder model to generate 384-dimensional embeddings, which improved their semantic search capability. +> -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/modern-sparse-neural-retrieval.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +## Show Notes: -× +00:00 Matching app experiencing major performance issues.\ +04:56 Filtering freelancers and adopting retriever-ranker architecture.\ +09:20 Multilingual encoder model for adapting semantic space.\ +10:52 Review, retrain, categorize, and organize freelancers' responses.\ +16:30 Trouble with geospatial filtering databases\ +17:37 Benchmarking performance and precision of search algorithms.\ +21:11 Deployed in Kubernetes. Stored in Git repository, synchronized with Argo CD.\ +27:08 Improved latency quickly, validated architecture, aligned steps.\ +28:46 Invitation to discuss work using specific methods. -[Powered by](https://qdrant.tech/) +## More Quotes from Nicolas: -<|page-192-lllmstxt|> -## storage -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Storage +*"And so GitHub's approach is basic idea that your git repository is your source of truth regarding what you must have in your Kubernetes clusters.”*\ +-- Nicolas Mauti -# [Anchor](https://qdrant.tech/documentation/concepts/storage/\#storage) Storage +*"And so we can see that our space seems to be well organized, where the tech freelancer are close to each other and the graphic designer for example, are far from the tech family.”*\ +-- Nicolas Mauti -All data within one collection is divided into segments. -Each segment has its independent vector and payload storage as well as indexes. +*"And also one thing that interested us is that it's multilingual. And as Malt is a European company, we have to have to model a multilingual model.”*\ +-- Nicolas Mauti -Data stored in segments usually do not overlap. -However, storing the same point in different segments will not cause problems since the search contains a deduplication mechanism. +## Transcript: +Demetrios: +We're live. 
We are live in the flesh. Nicholas, it's great to have you here, dude. And welcome to all those vector space explorers out there. We are back with another vector space talks. Today we're going to be talking all about how to superpower your semantics search with my man Nicholas, an ML ops engineer at Malt, in case you do not know what Malt is doing. They are pairing up, they're making a marketplace. They are connecting freelancers and companies. -The segments consist of vector and payload storages, vector and payload [indexes](https://qdrant.tech/documentation/concepts/indexing/), and id mapper, which stores the relationship between internal and external ids. +Demetrios: +And Nicholas, you're doing a lot of stuff with recommender systems, right? -A segment can be `appendable` or `non-appendable` depending on the type of storage and index used. -You can freely add, delete and query data in the `appendable` segment. -With `non-appendable` segment can only read and delete data. +Nicolas Mauti: +Yeah, exactly. -The configuration of the segments in the collection can be different and independent of one another, but at least one \`appendable’ segment must be present in a collection. +Demetrios: +I love that. Well, as I mentioned, I am in an interesting spot because I'm trying to take in all the vitamin D I can while I'm listening to your talk. Everybody that is out there listening with us, get involved. Let us know where you're calling in from or watching from. And also feel free to drop questions in the chat as we go along. And if need be, I will jump in and stop Nicholas. But I know you got a little presentation for us, man you want to get into. -## [Anchor](https://qdrant.tech/documentation/concepts/storage/\#vector-storage) Vector storage +Nicolas Mauti: +Thanks for the, thanks for the introduction and hello, everyone. And thanks for the invitation to this talk, of course. So let's start. Let's do it. -Depending on the requirements of the application, Qdrant can use one of the data storage options. -The choice has to be made between the search speed and the size of the RAM used. +Demetrios: +I love it. Superpowers. -**In-memory storage** \- Stores all vectors in RAM, has the highest speed since disk access is required only for persistence. +Nicolas Mauti: +Yeah, we will have superpowers at the end of this presentation. So, yeah, hello, everyone. So I think the introduction was already done and perfectly done by Dimitrios. So I'm Nicola and yeah, I'm working as an Mlaps engineer at Malt. And also I'm a part time teacher in a french engineering school where I teach some mlaps course. So let's dig in today's subjects. So in fact, as Dimitrio said, malt is a marketplace and so our goal is to match on one side freelancers. And those freelancers have a lot of attributes, for example, a description, some skills and some awesome skills. -**Memmap storage** \- Creates a virtual address space associated with the file on disk. [Wiki](https://en.wikipedia.org/wiki/Memory-mapped_file). -Mmapped files are not directly loaded into RAM. Instead, they use page cache to access the contents of the file. -This scheme allows flexible use of available memory. With sufficient RAM, it is almost as fast as in-memory storage. +Nicolas Mauti: +And they also have some preferences and also some attributes that are not specifically semantics. And so it will be a key point of our topics today. And on other sides we have what we call projects that are submitted by companies. 
And this project also have a lot of attributes, for example, description, also some skills and need to find and also some preferences. And so our goal at the end is to perform a match between these two entities. And so for that we add a matching app in production already. And so in fact, we had a major issue with this application is performance of this application because the application becomes very slow. The p 50 latency was around 10 seconds. -### [Anchor](https://qdrant.tech/documentation/concepts/storage/\#configuring-memmap-storage) Configuring Memmap storage +Nicolas Mauti: +And what you have to keep from this is that if your latency, because became too high, you won't be able to perform certain scenarios. Sometimes you want some synchronous scenario where you fill your project and then you want to have directly your freelancers that match this project. And so if it takes too much time, you won't be able to have that. And so you will have to have some asynchronous scenario with email or stuff like that. And it's not very a good user experience. And also this problem were amplified by the exponential growth of the platform. Absolutely, we are growing. And so to give you some numbers, when I arrived two years ago, we had two time less freelancers. -There are two ways to configure the usage of memmap(also known as on-disk) storage: +Nicolas Mauti: +And today, and today we have around 600,000 freelancers in your base. So it's growing. And so with this grow, we had some, several issue. And something we have to keep in mind about this matching app. And so it's not only semantic app, is that we have two things in these apps that are not semantic. We have what we call art filters. And so art filters are art rules defined by the project team at Malt. And so these rules are hard and we have to respect them. -- Set up `on_disk` option for the vectors in the collection create API: +Nicolas Mauti: +For example, the question is hard rule at malt we have a local approach, and so we want to provide freelancers that are next to the project. And so for that we have to filter the freelancers and to have art filters for that and to be sure that we respect these rules. And on the other side, as you said, demetrius, we are talking about Rexis system here. And so in a rexy system, you also have to take into account some other parameters, for example, the preferences of the freelancers and also the activity on the platform of the freelancer, for example. And so in our system, we have to keep this in mind and to have this working. And so if we do a big picture of how our system worked, we had an API with some alphilter at the beginning, then ML model that was mainly semantic and then some rescoring function with other parameters. And so we decided to rework this architecture and to adopt a retriever ranker architecture. And so in this architecture, you will have your pool of freelancers. -_Available as of v1.2.0_ +Nicolas Mauti: +So here is your wall databases, so your 600,000 freelancers. And then you will have a first step that is called the retrieval, where we will constrict a subsets of your freelancers. And then you can apply your wrong kill algorithm. That is basically our current application. 
And so the first step will be semantic, it will be fast, and it must be fast because you have to perform a quick selection of your most interesting freelancers, and it's built for recall, because at this step you want to be sure that you have all your relevant freelancers selected and you don't want to exclude at this step some relevant freelancer, because the ranking won't be able to take back these freelancers. And on the other side, the ranking can contain more features, not only semantics, and it is less constrained in time. And if your retrieval part is always giving you a fixed number of freelancers, your ranking doesn't have to scale because you will always have the same number of freelancers as input. And this one is built for precision.
-httppythontypescriptrustjavacsharpgo
+Nicolas Mauti:
+At this point you don't want to keep non-relevant freelancers and you have to be able to rank them and you have to be state of the art for this part. So let's focus on the first part. That's what will interest us today. So for the first part, in fact, we have to build this semantic space where freelancers that are close regarding their skills or their jobs are close in this space too. And so for that we will build this semantic space. And so then when we receive a project, we will just have to project this project into our space. And after that you will just have to do a search, a KNN search, for k-nearest neighbor search. And in practice we are not doing a KNN search because it's too expensive, but an ANN search, for approximate nearest neighbors.
-```http
-PUT /collections/{collection_name}
-{
-    "vectors": {
-        "size": 768,
-        "distance": "Cosine",
-        "on_disk": true
-    }
-}
+Nicolas Mauti:
+Keep this in mind, it will be interesting in our next slides. And so, to get this semantic space and to get this search, we need two things. The first one is a model, because we need a model to compute some vectors and to project our opportunities and our projects and our freelancers into this space. And on the other side, you will have to have a tool to operate this semantic space, so to store the vectors and also to perform the search. So for the first part, for the model, I will give you some quick info about how we built it. So for this part, it was more on the data scientist side. So the data scientists started from an E5 model.
-```
+Nicolas Mauti:
+And so the E5 model will give you a common knowledge about the language. And also one thing that interested us is that it's multilingual. And as Malt is a European company, we had to have a multilingual model. And on top of that we built our own encoder model based on a transformer architecture. And so this model will be in charge of being adapted to Malt's use case and transforming this very generic semantic space into a semantic space that is used for skills and jobs. And this model is also able to take into account the structure of a freelancer profile, because you have a description, a job, some skills, some experiences. And so this model is capable of taking this into account. And regarding the training, we used some past interactions on the platform to train it.
-```python
-from qdrant_client import QdrantClient, models
+Nicolas Mauti:
+So when a freelancer receives a project, he can accept it or not. And so we use that to train this model. And so at the end we get some embeddings with 384 dimensions.
-client = QdrantClient(url="http://localhost:6333")
+Demetrios:
+One question from my side, sorry to stop you right now.
Do you do any type of reviews or feedback and add that into the model? -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams( - size=768, distance=models.Distance.COSINE, on_disk=True - ), -) +Nicolas Mauti: +Yeah. In fact we continue to have some response about our freelancers. And so we also review them, sometimes manually because sometimes the response are not so good or we don't have exactly what we want or stuff like that, so we can review them. And also we are retraining the model regularly, so this way we can include new feedback from our freelancers. So now we have our model and if we want to see how it looks. So here I draw some ponds and color them by the category of our freelancer. So on the platform the freelancer can have category, for example tech or graphic or soon designer or this kind of category. And so we can see that our space seems to be well organized, where the tech freelancer are close to each other and the graphic designer for example, are far from the tech family. -``` +Nicolas Mauti: +So it seems to be well organized. And so now we have a good model. So okay, now we have our model, we have to find a way to operate it, so to store this vector and to perform our search. And so for that, Vectordb seems to be the good candidate. But if you follow the news, you can see that vectordb is very trendy and there is plenty of actor on the market. And so it could be hard to find your loved one. And so I will try to give you the criteria we had and why we choose Qdrant at the end. So our first criteria were performances. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Nicolas Mauti: +So I think I already talked about this ponds, but yeah, we needed performances. The second ones was about inn quality. As I said before, we cannot do a KnN search, brute force search each time. And so we have to find a way to approximate but to be close enough and to be good enough on these points. And so otherwise we won't be leveraged the performance of our model. And the last one, and I didn't talk a lot about this before, is filtering. Filtering is a big problem for us because we have a lot of filters, of art filters, as I said before. And so if we think about my architecture, we can say, okay, so filtering is not a problem. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Nicolas Mauti: +You can just have a three step process and do filtering, semantic search and then ranking, or do semantic search, filtering and then ranking. But in both cases, you will have some troubles if you do that. The first one is if you want to apply prefiltering. So filtering, semantic search, ranking. If you do that, in fact, you will have, so we'll have this kind of architecture. And if you do that, you will have, in fact, to flag each freelancers before asking the [vector database](https://qdrant.tech/articles/what-is-a-vector-database/) and performing a search, you will have to flag each freelancer whether there could be selected or not. And so with that, you will basically create a binary mask on your freelancers pool. And as the number of freelancers you have will grow, your binary namask will also grow. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - on_disk: true, - }, -}); +Nicolas Mauti: +And so it's not very scalable. And regarding the performance, it will be degraded as your freelancer base grow. And also you will have another problem. 
A lot of [vector databases](https://qdrant.tech/articles/what-is-a-vector-database/), and Qdrant is one of them, use the HNSW algorithm to do your ANN search. And this kind of algorithm is based on a graph. And so if you do that, you will deactivate some nodes in your graph, and so your graph will become disconnected and you won't be able to navigate in your graph. And so the quality of your matching will degrade. So it's definitely not a good idea to apply prefiltering.
-```
+Nicolas Mauti:
+So, no, if we go to post-filtering here, I think the issue is more clear. You will have this kind of architecture. And so, in fact, if you do that, you will have to retrieve a lot of freelancers from your [vector database](https://qdrant.tech/articles/what-is-a-vector-database/). If you apply some very aggressive filtering and you exclude a lot of freelancers with your filtering, you will have to ask for a lot of freelancers from your vector database and so your performance will be impacted. So filtering is a problem. So we cannot do pre-filtering or post-filtering. So we had to find a database that does filtering and semantic matching and search at the same time. And so Qdrant is one of them; you have other ones on the market.
-```rust
-use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder};
-use qdrant_client::Qdrant;
+Nicolas Mauti:
+But in our case, we had one filter that caused us a lot of trouble. And this filter is the geospatial filtering, and few databases handle this filtering, and I think Qdrant is one of them that supports it. But there are not a lot of databases that support it. And we absolutely needed that because we have a local approach and we want to be sure that we recommend freelancers next to the project. And so now that I said all of that, we had three candidates that we tested and we benchmarked them. We had Elasticsearch, pgvector, that is an extension of PostgreSQL, and Qdrant. And on this slide you can see Pinecone for example, and Pinecone was excluded because of the lack of geospatial filtering. And so we benchmarked them regarding the QPS.
-let client = Qdrant::from_url("http://localhost:6334").build()?;
+Nicolas Mauti:
+So, queries per second. So this one is for performance, and you can see that Qdrant was far ahead of the others. And we also benchmarked them regarding the precision. How we computed the precision: for the precision we used a corpus that is called Texmex, and the Texmex corpus provides 1 million vectors and 1000 queries. And for each query you have your ground truth of the closest vectors. They used brute-force KNN for that. And so we stored these vectors in our databases, we ran the queries and we checked how many vectors we found that were in the ground truth. And so that gives you a measure of the precision of your ANN algorithm. For this metric, you could see that Elasticsearch was a little bit better than Qdrant, but in fact we were able to tune a little bit the parameters of the HNSW algorithm and indexes. And at the end we found a better trade off, and we found a trade off between performance and precision in Qdrant that was better for us than what we could find on Elasticsearch.
-client
-    .create_collection(
-        CreateCollectionBuilder::new("{collection_name}")
-            .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)),
-    )
-    .await?;
+Nicolas Mauti:
+So at the end we decided to go with Qdrant. So now, I think, we have all we need: we have our model and we have our tool to operate it, to operate our model.
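As an illustration of the retriever-ranker split and of applying hard filters inside the ANN search itself (rather than pre- or post-filtering), here is a rough sketch. The collection name, the `location` payload field, the radius, and the toy rerank function are all assumptions for the example, not Malt's actual code.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

project_embedding = [0.1] * 384  # placeholder: output of the project encoder

# Retrieval step: ANN search with the geospatial hard filter applied during the
# search, returning a fixed-size candidate set so the ranker never has to scale.
candidates = client.query_points(
    collection_name="freelancers",
    query=project_embedding,
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="location",
                geo_radius=models.GeoRadius(
                    center=models.GeoPoint(lat=48.8566, lon=2.3522),
                    radius=50_000.0,  # meters
                ),
            )
        ]
    ),
    limit=200,
    with_payload=True,
).points

# Ranking step: a slower, feature-rich model re-scores only the candidates.
def rank(candidate) -> float:
    # Toy scoring: combine vector similarity with a non-semantic signal.
    return 0.8 * candidate.score + 0.2 * candidate.payload.get("activity_score", 0.0)

top_matches = sorted(candidates, key=rank, reverse=True)[:20]
```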
So a final part of this presentation will be about the deployment. I will talk about it a little bit because I think it's interesting and it's also part of my job as an MLOps engineer. So regarding the deployment, first we decided to deploy Qdrant in a cluster configuration. We decided to start with three nodes, and then we have our collection. So collections are where all your vectors are stored in Qdrant; it's like a table in SQL or an index in Elasticsearch. And so we decided to split our collection between the three nodes.
-```
+Nicolas Mauti:
+So it's what we call shards. So you have a shard of the collection on each node, and then for each shard you have one replica. So the replica is basically a copy of a shard that is living on another node than the primary shard. So this way you have a copy on another node. And so this way, if we operate in normal conditions, your query will be split across your three nodes, and so you will have your response accordingly. But what is interesting is that if we lose one node, for example this one, because we are performing a rolling upgrade or because Kubernetes kills pods all the time, we will still be able to operate because we have the replica to get our data. And so this configuration is very robust and so we are very happy with it. And regarding the deployment.
-```java
-import io.qdrant.client.QdrantClient;
-import io.qdrant.client.QdrantGrpcClient;
-import io.qdrant.client.grpc.Collections.Distance;
-import io.qdrant.client.grpc.Collections.VectorParams;
+Nicolas Mauti:
+So as I said, we deployed it in Kubernetes. So we use the Qdrant Helm chart, the official Helm chart provided by Qdrant. In fact we subcharted it because we needed some additional components in our cluster and some custom configuration. So I didn't talk about this, but Helm charts are just a bunch of YAML files that describe the Kubernetes objects you will need in your cluster to operate your database, in our case, and it's a collection of files and templates to do that. And when you have that, at Malt we are using what we call a GitOps approach. And so the GitOps approach is basically the idea that your git repository is your ground truth regarding what you must have in your Kubernetes clusters. And so we store these files and these Helm charts in git, and then we have a tool that is called Argo CD that will pull our git repository from time to time and it will check the differences between what we have in git and what is living in our cluster. And it will then synchronize what we have in git directly into our cluster, either automatically or manually.
-QdrantClient client =
-    new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build());
+Nicolas Mauti:
+So this is a very good approach to collaborate and to be sure that what we have in git is what you have in your cluster, and to know what you have in your cluster by just looking at your git repository. And I think that's pretty much all. I have one last slide, I think, that will interest you. It's about the outcome of the project, because we did all that at Malt. We built this architecture with our first phase with Qdrant that does the semantic matching and that applies all the filtering we have. And in the second part we keep our old ranking system. And so if we look at the latency of our app, at the p50 latency of our app, so it's the whole app with the two steps and with the filters, the semantic matching and the ranking. As you can see, we started an A/B test in mid-October.
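The three-node setup Nicolas describes, with the collection split into shards and one replica per shard, corresponds roughly to the collection settings sketched below. The numbers and collection name are illustrative assumptions; the cluster itself would be provisioned separately, for example via the Helm chart he mentions.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://qdrant.internal:6333")  # hypothetical cluster endpoint

client.create_collection(
    collection_name="freelancers",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    shard_number=3,        # one shard per node in a three-node cluster
    replication_factor=2,  # primary plus one replica, so losing a node is survivable
)
```

With a replication factor of 2, queries fan out across the nodes under normal conditions, and a rolling upgrade or an evicted pod leaves at least one copy of every shard available.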
-client - .createCollectionAsync( - "{collection_name}", - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .get(); +Nicolas Mauti: +Before that it was around 10 seconds latency, as I said at the beginning of the talk. And so we already saw a huge drop in the application and we decided to go full in December and we can see another big drop. And so we were around 10 seconds and now we are around 1 second and alpha. So we divided the latency of more than five times. And so it's a very good news for us because first it's more scalable because the retriever is very scalable and with the cluster deployment of Qdrants, if we need, we can add more nodes and we will be able to scale this phase. And after that we have a fixed number of freelancers that go into the matching part. And so the matching part doesn't have to scale. No. -``` +Nicolas Mauti: +And the other good news is that now that we are able to scale and we have a fixed size, after our first parts, we are able to build more complex and better matching model and we will be able to improve the quality of our matching because now we are able to scale and to be able to handle more freelancers. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Demetrios: +That's incredible. -var client = new QdrantClient("localhost", 6334); +Nicolas Mauti: +Yeah, sure. It was a very good news for us. And so that's all. And so maybe you have plenty of question and maybe we can go with that. -await client.CreateCollectionAsync( - "{collection_name}", - new VectorParams - { - Size = 768, - Distance = Distance.Cosine, - OnDisk = true - } -); +Demetrios: +All right, first off, I want to give a shout out in case there are freelancers that are watching this or looking at this, now is a great time to just join Malt, I think. It seems like it's getting better every day. So I know there's questions that will come through and trickle in, but we've already got one from Luis. What's happening, Luis? He's asking what library or service were you using for Ann before considering Qdrant, in fact. -``` +Nicolas Mauti: +So before that we didn't add any library or service or we were not doing any ann search or [semantic searc](https://qdrant.tech/documentation/tutorials/search-beginners/) in the way we are doing it right now. We just had one model when we passed the freelancers and the project at the same time in the model, and we got relevancy scoring at the end. And so that's why it was also so slow because you had to constrict each pair and send each pair to your model. And so right now we don't have to do that and so it's much better. -```go -import ( - "context" +Demetrios: +Yeah, that makes sense. One question from my side is it took you, I think you said in October you started with the A B test and then in December you rolled it out. What was that last slide that you had? - "github.com/qdrant/go-client/qdrant" -) +Nicolas Mauti: +Yeah, that's exactly that. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Demetrios: +Why the hesitation? Why did it take you from October to December to go down? What was the part that you weren't sure about? Because it feels like you saw a huge drop right there and then why did you wait until December? 
-client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), -}) +Nicolas Mauti: +Yeah, regarding the latency and regarding the drop of the latency, the result was very clear very quickly. I think maybe one week after that, we were convinced that the latency was better. First, our idea was to validate the architecture, but the second reason was to be sure that we didn't degrade the quality of the matching because we have a two step process. And the risk is that the two model doesn't agree with each other. And so if the intersection of your first step and the second step is not good enough, you will just have some empty result at the end because your first part will select a part of freelancer and the second step, you select another part and so your intersection is empty. And so our goal was to assess that the two steps were aligned and so that we didn't degrade the quality of the matching. And regarding the volume of projects we have, we had to wait for approximately two months. -``` +Demetrios: +It makes complete sense. Well, man, I really appreciate this. And can you go back to the slide where you show how people can get in touch with you if they want to reach out and talk more? I encourage everyone to do that. And thanks so much, Nicholas. This is great, man. -This will create a collection with all vectors immediately stored in memmap storage. -This is the recommended way, in case your Qdrant instance operates with fast disks and you are working with large collections. +Nicolas Mauti: +Thanks. -- Set up `memmap_threshold` option. This option will set the threshold after which the segment will be converted to memmap storage. +Demetrios: +All right, everyone. By the way, in case you want to join us and talk about what you're working on and how you're using Qdrant or what you're doing in the semantic space or [semantic search](https://qdrant.tech/documentation/tutorials/search-beginners/) or vector space, all that fun stuff, hit us up. We would love to have you on here. One last question for you, Nicola. Something came through. What indexing method do you use? Is it good for using OpenAI embeddings? -There are two ways to do this: +Nicolas Mauti: +So in our case, we have our own model to build the embeddings. -1. You can set the threshold globally in the [configuration file](https://qdrant.tech/documentation/guides/configuration/). The parameter is called `memmap_threshold` (previously `memmap_threshold_kb`). -2. You can set the threshold for each collection separately during [creation](https://qdrant.tech/documentation/concepts/collections/#create-collection) or [update](https://qdrant.tech/documentation/concepts/collections/#update-collection-parameters). +Demetrios: +Yeah, I remember you saying that at the beginning, actually. All right, cool. Well, man, thanks a lot and we will see everyone next week for another one of these vector space talks. Thank you all for joining and take care. Care. Thanks. -httppythontypescriptrustjavacsharpgo +<|page-390-lllmstxt|> +> *"There are 10 billion search queries a day, estimated half of them go unanswered. 
Because people don't actually use search as what we used.”*\ +> -- Hamza Farooq +> -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine" - }, - "optimizers_config": { - "memmap_threshold": 20000 - } -} +How do you think Hamza's background in machine learning and previous experiences at Google and Walmart Labs have influenced his approach to building LLM-powered applications? -``` +Hamza Farooq, an accomplished educator and AI enthusiast, is the founder of Traversaal.ai. His journey is marked by a relentless passion for AI exploration, particularly in building Large Language Models. As an adjunct professor at UCLA Anderson, Hamza shapes the future of AI by teaching cutting-edge technology courses. At Traversaal.ai, he empowers businesses with domain-specific AI solutions, focusing on conversational search and recommendation systems to deliver personalized experiences. With a diverse career spanning academia, industry, and entrepreneurship, Hamza brings a wealth of experience from time at Google. His overarching goal is to bridge the gap between AI innovation and real-world applications, introducing transformative solutions to the market. Hamza eagerly anticipates the dynamic challenges and opportunities in the ever-evolving field of AI and machine learning. -```python -from qdrant_client import QdrantClient, models +***Listen to the episode on [Spotify](https://open.spotify.com/episode/1oh31JA2XsqzuZhCUQVNN8?si=viPPgxiZR0agFhz1QlimSA), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/0N9ozwgmEQM).*** -client = QdrantClient(url="http://localhost:6333") + -client.create_collection( - collection_name="{collection_name}", - vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE), - optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000), -) + -``` +## Top Takeaways: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +UX specialist? Your expertise in designing seamless user experiences for GenAI products is guaranteed to be in high demand. Let's elevate the user interface for next-gen technology! -const client = new QdrantClient({ host: "localhost", port: 6333 }); +In this episode, Hamza presents the future of large language models and complex search, discussing real-world applications and the challenges of implementing these technologies in production. -client.createCollection("{collection_name}", { - vectors: { - size: 768, - distance: "Cosine", - }, - optimizers_config: { - memmap_threshold: 20000, - }, -}); +5 Keys to Learning from the Episode: -``` +1. **Complex Search** - Discover how LLMs are revolutionizing the way we interact with search engines and enhancing the search experience beyond basic queries. +2. **Conversational Search and Personalization** - Explore the potential of conversational search and personalized recommendations using open-source LLMs, bringing a whole new level of user engagement. +3. **Challenges and Solutions** - Uncover the downtime challenges faced by LLM services and learn the strategies deployed to mitigate these issues for seamless operation. +4. **Traversal AI's Unique Approach** - Learn how Traversal AI has created a unified platform with a myriad of applications, simplifying the integration of LLMs and domain-specific search. +5. 
**The Importance of User Experience (UX)** - Understand the unparalleled significance of UX professionals in shaping the future of Gen AI products, and how they play a pivotal role in enhancing user interactions with LLM-powered applications. -```rust -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, OptimizersConfigDiffBuilder, VectorParamsBuilder, -}; -use qdrant_client::Qdrant; +> Fun Fact: User experience (UX) designers are anticipated to be crucial in the development of AI-powered products as they bridge the gap between user interaction and the technical aspects of the AI systems. +> -let client = Qdrant::from_url("http://localhost:6334").build()?; +## Show Notes: -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine)) - .optimizers_config(OptimizersConfigDiffBuilder::default().memmap_threshold(20000)), - ) - .await?; +00:00 Teaching GPU AI with open source products.\ +06:40 Complex search leads to conversational search implementation.\ +07:52 Generating personalized travel itineraries with ease.\ +12:02 Maxwell's talk highlights challenges in search technology.\ +16:01 Balancing preferences and trade-offs in travel.\ +17:45 Beta mode, selective, personalized database.\ +22:15 Applications needed: chatbot, knowledge retrieval, recommendation, job matching, copilot\ +23:59 Challenges for UX in developing gen AI. -``` +## More Quotes from Hamza: -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.OptimizersConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +*"Ux people are going to be more rare who can work on gen AI products than product managers and tech people, because for tech people, they can follow and understand code and they can watch videos, business people, they're learning GPT prompting and so on and so forth. But the UX people, there's literally no teaching guide except for a Chat GPT interface. So this user experience, they are going to be, their worth is going to be inequal in gold.”*\ +-- Hamza Farooq -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +*"Usually they don't come to us and say we need a pine cone or we need a quadrant or we need a local llama, they say, this is the problem you're trying to solve. And we are coming from a problem solving initiative from our company is that we got this. You don't have to hire three ML engineers and two NLP research scientists and three people from here for the cost of two people. We can do an entire end to end implementation. 
Because what we have is 80% product which is built and we can tune the 20% to what you need.”*\ +-- Hamza Farooq -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .build()) - .build()) - .setOptimizersConfig( - OptimizersConfigDiff.newBuilder().setMemmapThreshold(20000).build()) - .build()) - .get(); +*"Imagine you're trying to book a hotel, and you also get an article from New York Times that says, this is why this is a great, or a blogger that you follow and it sort of shows up in your. That is the strength that we have been powering, that you don't need to wait or you don't need to depend anymore on just the company's website itself. You can use the entire Internet to come up with an arsenal.”*\ +-- Hamza Farooq -``` +## Transcript: +Demetrios: +Yes, we are live. So what is going on? Hamza, it's great to have you here for this edition of the Vector Space Talks. +Let's first start with this. Everybody that is here with us right now, great to have you. +Let us know where you're dialing in from in the chat and feel free over the course of the next 20 - 25 minutes to ask any questions as they. +Come up in the chat. +I'll be monitoring it and maybe jumping. +In in case we need to stop. +Hunts at any moment. And if you or anybody you know would like to come and give a presentation on our vector space talks, we are very open to that. Reach out to me either on discord or LinkedIn or your preferred method of communication. +Maybe it's carrier Pigeon. +Whatever it may be, I am here and ready to hear your pitch about. +What you want to talk about. It's always cool hearing about how people are building with Qdrant or what they. +Are building in this space. So without further ado, let's jump into this with my man Hamza. Great to have you here, dude. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Hamza Farooq: Thank you for having me. It's an honor. -var client = new QdrantClient("localhost", 6334); +Demetrios: You say that now. Just wait. You don't know me that well. I guess that's the only thing. So let's just say this. You're doing some incredible stuff. +You're the founder of Traversaal.ai. +You have been building large language models in the past, and you're also a professor at UCLA. You're doing all kinds of stuff. +And that is why I think it. +Is my honor to have you here with us today. I know you've got all kinds of fun stuff that you want to get. +Into, and it's really about building llm powered applications in production. +You have some slides for us, I believe. So I'm going to kick it over. +To you, let you start rocking, and in case anything comes up, I'll jump. +In and stop you from going too. +Far down the road. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine }, - optimizersConfig: new OptimizersConfigDiff { MemmapThreshold = 20000 } -); +Hamza Farooq: +Awesome. Thank you for that. I really like your joke of the carrier pigeon. Is it a geni carrier pigeon with multiple areas and h 100 attached to it? -``` +Demetrios: +Exactly. Those are the expensive carrier pigeons. That's the premium version. I am not quite that GPU rich yet. -```go -import ( - "context" +Hamza Farooq: +Absolutely. All right. I think that's a great segue. 
I usually tell people that I'm going to teach you all how to be a GPU poor AI gap person, and my job is to basically teach everyone, or the thesis of my organization is also, how can we build powerful solutions, LLM powered solutions by using open source products and open source llms and architectures so that we can stretch the dollar as much as possible. That's been my thesis and I have always pushed for open source because they've done some great job over there and they are coming in close to pretty much at par of what the industry standard is. But I digress. Let's start with my overall presentation. I'm here to talk about the future of search and copilots and just the overall experience which we are looking with llms. - "github.com/qdrant/go-client/qdrant" -) +Hamza Farooq: +So I know you gave a background about me. I am a founder at Traversaal.ai. Previously I was at Google and Walmart Labs. I have quite a few years of experience in machine learning. In fact, my first job in 2007 was working for SaaS and I was implementing trees for identifying fraud, for fraud detection. And I did not know that was honestly data science, but we were implementing that. I have had the experience of teaching at multiple universities and that sort of experience has really helped me do better at what I do, because when you can teach something, you actually truly understand that. All right, so why are we here? Why are we really here? I have a very strong mean game. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Hamza Farooq: +So we started almost a year ago, Char GPT came into our lives and almost all of a sudden we started using it. And I think in January, February, March, it was just an explosion of usage. And now we know all the different things that have been going on and we've seen peripheration of a lot of startups that have come in this space. Some of them are wrappers, some of them have done a lot, have a lot more motor. There are many, many different ways that we have been using it. I don't think we even know how many ways we can use charge GBT, but most often it's just been text generation, one form or the other. And that is what the focus has been. But if we look deeper, the llms that we know, they also can help us with a very important part, something which is called complex search. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - }), - OptimizersConfig: &qdrant.OptimizersConfigDiff{ - MaxSegmentSize: qdrant.PtrOf(uint64(20000)), - }, -}) +Hamza Farooq: +And complex search is basically when we converse with a search system to actually give a much longer query of how we would talk to a human being. And that is something that has been missing for the longest time in our interfacing with any kind of search engine. Google has always been at the forefront of giving the best form of search for us all. But imagine if you were to look at any other e commerce websites other than Amazon. Imagine you go to Nike.com, you go to gap, you go to Banana Republic. What you see is that their search is really basic and this is an opportunity for a lot of companies to actually create a great search experience for the users with a multi tier engagement model. So you basically make a request. 
I would like to buy a Nike blue t shirt specially designed for golf with all these features which I need and at a reasonable price point. -``` +Hamza Farooq: +It shows you a set of results and then from that you can actually converse more to it and say, hey, can you remove five or six or reduce this by a certain degree? That is the power of what we have at hand with complex search. And complex search is becoming quickly a great segue to why we need to implement conversational search. We would need to implement large language models in our ecosystem so that we can understand the context of what users have been asking. So I'll show you a great example of sort of know complex search that TripAdvisor has been. Last week in one of my classes at Stanford, we had head of AI from Trivia Advisor come in and he took us through an experience of a new way of planning your trips. So I'll share this example. So if you go to the website, you can use AI and you can actually select a city. So let's say I'm going to select London for that matter. -The rule of thumb to set the memmap threshold parameter is simple: +Hamza Farooq: +And I can say I'm going to go for a few days, I do next and I'm going to go with my partner now at the back end. This is just building up a version of complex search and I want to see attractions, great food, hidden gems. I basically just want to see almost everything. And then when I hit submit, the great thing what it does is that it sort of becomes a starting point for something that would have taken me quite a while to put it together, sort of takes all my information and generates an itinerary. Now see what's different about this. It has actual data about places where I can stay, things I can do literally day by day, and it's there for you free of cost generated within 10 seconds. This is an experience that did not exist before. You would have to build this by yourself and what you would usually do is you would go to chat. -- if you have a balanced use scenario - set memmap threshold the same as `indexing_threshold` (default is 20000). In this case the optimizer will not make any extra runs and will optimize all thresholds at once. -- if you have a high write load and low RAM - set memmap threshold lower than `indexing_threshold` to e.g. 10000. In this case the optimizer will convert the segments to memmap storage first and will only apply indexing after that. +Hamza Farooq: +GPT if you've started this year, you would say seven day itinerary to London and it would identify a few things over here. However, you see it has able to integrate the ability to book, the ability to actually see those restaurants all in one place. That is something that has not been done before. And this is the truest form of taking complex search and putting that into production and sort of create a great experience for the user so that they can understand what they can select. They can highlight and sort of interact with it. Going to pause here. Is there any question or I can help answer anything? -In addition, you can use memmap storage not only for vectors, but also for HNSW index. -To enable this, you need to set the `hnsw_config.on_disk` parameter to `true` during collection [creation](https://qdrant.tech/documentation/concepts/collections/#create-a-collection) or [updating](https://qdrant.tech/documentation/concepts/collections/#update-collection-parameters). +Demetrios: +No. -httppythontypescriptrustjavacsharpgo +Demetrios: +Man, this is awesome though. 
I didn't even realize that this is already live, but it's 100% what a travel agent would be doing. And now you've got that at your fingertips. -```http -PUT /collections/{collection_name} -{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - }, - "hnsw_config": { - "on_disk": true - } -} +Hamza Farooq: +So they have built a user experience which takes 10 seconds to build. Now, was it really happening in the back end? You have this macro task that I want to plan a vacation in Paris, I want to plan a vacation to London. And what web agents or auto agents or whatever you want to call them, they are recursively breaking down tasks into subtasks. And when you reach to an individual atomic subtask, it is able to divide it into actions which can be taken. So there's a task decomposition and a task recognition scene that is going on. And from that, for instance, Stripadvisor is able to build something of individual actions. And then it makes one interface for you where you can see everything ready to go. And that's the part that I have always been very interested in. -``` +Hamza Farooq: +Whenever we go to Amazon or anything for search, we just do one tier search. We basically say, I want to buy a jeans, I want to buy a shirt, I want to buy. It's an atomic thing. Do you want to get a flight? Do you want to get an accommodation? Imagine if you could do, I would like to go to Tokyo or what kind of gear do I need? What kind of overall grade do I need to go to a glacier? And it can identify all the different subtasks that are involved in it and then eventually show you the action. Well, it's all good that it exists, but the biggest thing is that it's actually difficult to build complex search. Google can get away with it. Amazon can get away with it. But if you imagine how do we make sure that it's available to the larger masses? It's available to just about any company for that matter, if they want to build that experience at this point. -```python -from qdrant_client import QdrantClient, models +Hamza Farooq: +This is from a talk that was given by Maxwell a couple of months ago. There are 10 billion search queries a day, estimated half of them go unanswered. Because people don't actually use search as what we used. Because again, also because of GPT coming in and the way we have been conversing with our products, our search is getting more coherent, as we would expect it to be. We would talk to a person and it's great for finding a website for more complex questions or tasks. It often falls too short because a lot of companies, 99.99% companies, I think they are just stuck on elasticsearch because it's cheaper to run it, it's easier, it's out of the box, and a lot of companies do not want to spend the money or they don't have the people to help them build that as a product, as an SDK that is available and they can implement and starts working for them. And the biggest thing is that there are complex search is not just one query, it's multiple queries, sessions or deep, which requires deep engagement with search. And what I mean by deep engagement is imagine when you go to Google right now, you put in a search, you can give feedback on your search, but there's nothing that you can do that it can unless you start a new search all over again. -client = QdrantClient(url="http://localhost:6333") +Hamza Farooq: +In perplexity, you can ask follow up questions, but it's also a bit of a broken experience because you can't really reduce as you would do with Jarvis in Ironman. 
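Hamza's point above about web agents recursively breaking a macro task ("plan a vacation to London") into subtasks until each one maps to an atomic action is, at its core, a simple recursive procedure. The sketch below is only an illustration of that idea, not any particular product's planner: the `decompose` and `is_atomic` helpers are hypothetical stand-ins for what a real system would delegate to an LLM or an action catalog.

```python
from dataclasses import dataclass, field

@dataclass
class Task:
    description: str
    subtasks: list["Task"] = field(default_factory=list)

def is_atomic(task: Task) -> bool:
    # Stand-in heuristic; a real system would ask an LLM or check an action catalog.
    return len(task.description.split()) < 6

def decompose(task: Task, depth: int = 0) -> Task:
    # Recursively split a task until it is atomic (or a depth limit is reached).
    if depth >= 2 or is_atomic(task):
        return task
    # Stand-in decomposition; a real planner would generate these subtasks with an LLM.
    for part in ("find flights", "find hotels", "plan daily itinerary"):
        task.subtasks.append(decompose(Task(f"{part}: {task.description}"), depth + 1))
    return task

plan = decompose(Task("plan a one-week trip to London with my partner"))
```

Each leaf of the resulting tree corresponds to one concrete action the interface can then surface to the user, which is the "task decomposition plus task recognition" loop described above.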
So imagine there's a human aspect to it. And let me show you another example of a copilot system, let's say. So this is an example of a copilot which we have been working on.

-client.create_collection(
- collection_name="{collection_name}",
- vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE, on_disk=True),
- hnsw_config=models.HnswConfigDiff(on_disk=True),
-)

+Demetrios:
+There is a question, there's actually two really good questions that came through, so I'm going to stop you before you get into this cool copilot. Carlos was asking, what about downtime when it comes to these LLM services?

-```

+Hamza Farooq:
+I think the downtime, this is the perfect question. If you have a production-level system running on ChatGPT, you're going to learn within five days that you can't run a production system on ChatGPT and you need to host it by yourself. And then you start with Hugging Face, and then you realize Hugging Face can also go down. So you basically go to Bedrock, or you go to AWS or GCP and host your LLM over there. So essentially it's all fun with demos to show, oh my God, it works beautifully. But consistently, if you have an SLA of 99.9% uptime, you need to deploy it in an architecture with redundancies so that it's up and running. And the eventual solution is to have dedicated support for it.

-```typescript
-import { QdrantClient } from "@qdrant/js-client-rest";

+Hamza Farooq:
+It could be through Azure OpenAI, I think, but I think even Azure OpenAI tends to go down with OpenAI's outages.

-const client = new QdrantClient({ host: "localhost", port: 6333 });

+Demetrios:
+It's a little bit better, but it's not 100%, that is for sure.

-client.createCollection("{collection_name}", {
- vectors: {
- size: 768,
- distance: "Cosine",
- on_disk: true,
- },
- hnsw_config: {
- on_disk: true,
- },
-});

+Hamza Farooq:
+Can I just give you an example? Recently we came across a new thing: the token speed also varies with the day and with the time of day. So the token generation. And another thing that we found out about GPT Instruct. Instruct was great, amazing. But it's leaking the data. Even in a RAG solution, it's leaking the data. So you have to go back to the 16k then.

-```

+Hamza Farooq:
+It's really slow. So to generate an answer can take up to three minutes.

-```rust
-use qdrant_client::qdrant::{
- CreateCollectionBuilder, Distance, HnswConfigDiffBuilder,
- VectorParamsBuilder,
-};
-use qdrant_client::Qdrant;

+Demetrios:
+Yeah. So it's almost this catch-22. What do you prefer, leaked data or slow speeds? There's always trade-offs, folks. There's always trade-offs. So Mike has another question coming through in the chat. And Carlos, thanks for that awesome question. Mike is asking, though, I presume you could modify the search itinerary with something like, I prefer Italian restaurants when possible. And I was thinking about that when it comes to, so to add on to what Mike is saying, it's almost like every single piece of your travel or your itinerary would be prefaced with, oh, I like my flights at night, or I like to sit in the aisle row, and I don't want to pay over X amount, but I'm cool if we go anytime in December, et cetera, et cetera.

-let client = Qdrant::from_url("http://localhost:6334").build()?;

+Demetrios:
+And then once you get there, I like to go into hotels that are around this part of this city. I think you get what I'm going at, but the preference list for each of these can just get really detailed.
And you can preference all of these different searches with what you were talking about. -client - .create_collection( - CreateCollectionBuilder::new("{collection_name}") - .vectors_config(VectorParamsBuilder::new(768, Distance::Cosine).on_disk(true)) - .hnsw_config(HnswConfigDiffBuilder::default().on_disk(true)), - ) - .await?; +Hamza Farooq: +Absolutely. So I think that's a great point. And I will tell you about a company that we have been closely working with. It's called Tripsby or Tripspy AI, and we actually help build them the ecosystem where you can have personalized recommendations with private discovery. It's pretty much everything that you just said. I prefer at this time, I prefer this. I prefer this. And it sort of takes audio and text, and you can converse it through WhatsApp, you can converse it through different ways. -``` +Hamza Farooq: +They are still in the beta mode, and they go selectively, but literally, they have built this, they have taken a lot more personalization into play, and because the database is all the same, it's Ahmedius who gives out, if I'm pronouncing correct, they give out the database for hotels or restaurants or availability, and then you can build things on top of it. So they have gone ahead and built something, but with more user expectation. Imagine you're trying to book a hotel, and you also get an article from New York Times that says, this is why this is a great, or a blogger that you follow and it sort of shows up in your. That is the strength that we have been powering, that you don't need to wait or you don't need to depend anymore on just the company's website itself. You can use the entire Internet to come up with an arsenal. -```java -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Collections.CreateCollection; -import io.qdrant.client.grpc.Collections.Distance; -import io.qdrant.client.grpc.Collections.HnswConfigDiff; -import io.qdrant.client.grpc.Collections.VectorParams; -import io.qdrant.client.grpc.Collections.VectorsConfig; +Demetrios: +Yeah. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Demetrios: +And your ability. I think another example of this would be how I love to watch TikTok videos and some of the stuff that pops up on my TikTok feed is like Amazon finds you need to know about, and it's talking about different cool things you can buy on Amazon. If Amazon knew that I was liking that on TikTok, it would probably show it to me next time I'm on Amazon. -client - .createCollectionAsync( - CreateCollection.newBuilder() - .setCollectionName("{collection_name}") - .setVectorsConfig( - VectorsConfig.newBuilder() - .setParams( - VectorParams.newBuilder() - .setSize(768) - .setDistance(Distance.Cosine) - .setOnDisk(true) - .build()) - .build()) - .setHnswConfig(HnswConfigDiff.newBuilder().setOnDisk(true).build()) - .build()) - .get(); +Hamza Farooq: +Yeah, I mean, that's what cookies are, right? Yeah. It's a conspiracy theory that you're talking about a product and it shows up on. -``` +Demetrios: +Exactly. Well, so, okay. This website that you're showing is absolutely incredible. Carlos had a follow up question before we jump into the next piece, which is around the quality of these open source models and how you deal with that, because it does seem that OpenAI, the GPT-3 four, is still quite a. 
-```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Hamza Farooq: +Bit ahead these days, and that's the silver bullet you have to buy. So what we suggest is have open llms as a backup. So at a point in time, I know it will be subpar, but something subpar might be a little better than breakdown of your complete system. And that's what we have been employed, we have deployed. What we've done is that when we're building large scale products, we basically tend to put an ecosystem behind or a backup behind, which is like, if the token rate is not what we want, if it's not working, it's taking too long, we automatically switch to a redundant version, which is open source. It does perform. Like, for instance, even right now, perplexity is running a lot of things on open source llms now instead of just GPT wrappers. -var client = new QdrantClient("localhost", 6334); +Demetrios: +Yeah. Gives you more control. So I didn't want to derail this too much more. I know we're kind of running low on time, so feel free to jump back into it and talk fast. -await client.CreateCollectionAsync( - collectionName: "{collection_name}", - vectorsConfig: new VectorParams { Size = 768, Distance = Distance.Cosine, OnDisk = true }, - hnswConfig: new HnswConfigDiff { OnDisk = true } -); +Demetrios: +Yeah. -``` +Hamza Farooq: +So can you give me a time check? How are we doing? -```go -import ( - "context" +Demetrios: +Yeah, we've got about six to eight minutes left. - "github.com/qdrant/go-client/qdrant" -) +Hamza Farooq: +Okay, so I'll cover one important thing of why I built my company, Traversaal.ai. This is a great slide to see what everyone is doing everywhere. Everyone is doing so many different things. They're looking into different products for each different thing. You can pick one thing. Imagine the concern with this is that you actually have to think about every single product that you have to pick up because you have to meticulously go through, oh, for this I need this. For this I need this. For this I need this. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Hamza Farooq: +All what we have done is that we have created one platform which has everything under one roof. And I'll show you with a very simple example. This is our website. We call ourselves one platform with multiple applications. And in this what we have is we have any kind of data format, pretty much that you have any kind of integrations which you need, for example, any applications. And I'll zoom in a little bit. And if you need domain specific search. So basically, if you're looking for Internet search to come in any kind of llms that are in the market, and vector databases, you see Qdrant right here. -client.CreateCollection(context.Background(), &qdrant.CreateCollection{ - CollectionName: "{collection_name}", - VectorsConfig: qdrant.NewVectorsConfig(&qdrant.VectorParams{ - Size: 768, - Distance: qdrant.Distance_Cosine, - OnDisk: qdrant.PtrOf(true), - }), - HnswConfig: &qdrant.HnswConfigDiff{ - OnDisk: qdrant.PtrOf(true), - }, -}) +Hamza Farooq: +And what kind of applications that are needed? Do you need a chatbot? You need a knowledge retrieval system, you need recommendation system? You need something which is a job matching tool or a copilot. So if you've built a one stop shop where a lot of times when a customer comes in, usually they don't come to us and say we need a pine cone or we need a Qdrant or we need a local llama, they say, this is the problem you're trying to solve. 
And we are coming from a problem solving initiative from our company is that we got this. You don't have to hire three ML engineers and two NLP research scientists and three people from here for the cost of two people. We can do an entire end to end implementation. Because what we have is 80% product which is built and we can tune the 20% to what you need. And that is such a powerful thing that once they start trusting us, and the best way to have them trust me is they can come to my class on maven, they can come to my class in Stanford, they come to my class in UCLA, or they can. -``` +Demetrios: +Listen to this podcast and sort of. -## [Anchor](https://qdrant.tech/documentation/concepts/storage/\#payload-storage) Payload storage +Hamza Farooq: +It adds credibility to what we have been doing with them. Sorry, stop sharing what we have been doing with them and sort of just goes in that direction that we can do these things pretty fast and we tend to update. I want to just cover one slide. At the end of the day, this is the main slide. Right now. All engineers and product managers think of, oh, llms and Gen AI and this and that. I think one thing we don't talk about is UX experience. I just showed you a UX experience on Tripadvisor. -Qdrant supports two types of payload storages: InMemory and OnDisk. +Hamza Farooq: +It's so easy to explain, right? Like you're like, oh, I know how to use it and you can already find problems with it, which means that they've done a great job thinking about a user experience. I predict one main thing. Ux people are going to be more rare who can work on gen AI products than product managers and tech people, because for tech people, they can follow and understand code and they can watch videos, business people, they're learning GPT prompting and so on and so forth. But the UX people, there's literally no teaching guide except for a Chat GPT interface. So this user experience, they are going to be, their worth is going to be inequal in gold. Not bitcoin, but gold. It's basically because they will have to build user experiences because we can't imagine right now what it will look like. -InMemory payload storage is organized in the same way as in-memory vectors. -The payload data is loaded into RAM at service startup while disk and [Gridstore](https://qdrant.tech/articles/gridstore-key-value-storage/) are used for persistence only. -This type of storage works quite fast, but it may require a lot of space to keep all the data in RAM, especially if the payload has large values attached - abstracts of text or even images. +Demetrios: +Yeah, I 100% agree with that, actually. -In the case of large payload values, it might be better to use OnDisk payload storage. -This type of storage will read and write payload directly to RocksDB, so it won’t require any significant amount of RAM to store. -The downside, however, is the access latency. -If you need to query vectors with some payload-based conditions - checking values stored on disk might take too much time. -In this scenario, we recommend creating a payload index for each field used in filtering conditions to avoid disk access. -Once you create the field index, Qdrant will preserve all values of the indexed field in RAM regardless of the payload storage type. +Demetrios: +I. 
-You can specify the desired type of payload storage with [configuration file](https://qdrant.tech/documentation/guides/configuration/) or with collection parameter `on_disk_payload` during [creation](https://qdrant.tech/documentation/concepts/collections/#create-collection) of the collection. +Demetrios: +Imagine you have seen some of the work from Linus Lee from notion and how notion is trying to add in the clicks. Instead of having to always chat with the LLM, you can just point and click and give it things that you want to do. I noticed with the demo that you shared, it was very much that, like, you're highlighting things that you like to do and you're narrowing that search and you're giving it more context without having to type in. I like italian food and I don't like meatballs or whatever it may be. -## [Anchor](https://qdrant.tech/documentation/concepts/storage/\#versioning) Versioning +Hamza Farooq: +Yes. -To ensure data integrity, Qdrant performs all data changes in 2 stages. -In the first step, the data is written to the Write-ahead-log(WAL), which orders all operations and assigns them a sequential number. +Demetrios: +So that's incredible. -Once a change has been added to the WAL, it will not be lost even if a power loss occurs. -Then the changes go into the segments. -Each segment stores the last version of the change applied to it as well as the version of each individual point. -If the new change has a sequential number less than the current version of the point, the updater will ignore the change. -This mechanism allows Qdrant to safely and efficiently restore the storage from the WAL in case of an abnormal shutdown. +Demetrios: +This is perfect, man. -##### Was this page useful? +Demetrios: +And so for anyone that wants to continue the conversation with you, you are on LinkedIn. We will leave a link to your LinkedIn. And you're also teaching on Maven. You're teaching in Stanford, UCLA, all this fun stuff. It's been great having you here. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Demetrios: +I'm very excited and I hope to have you back because it's amazing seeing what you're building and how you're building it. -Thank you for your feedback! 🙏 +Hamza Farooq: +Awesome. I think, again, it's a pleasure and an honor and thank you for letting. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/storage.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Demetrios: +Me speak about the UX part a. -On this page: +Hamza Farooq: +Lot because when you go to your customers, you realize that you need the UX and all those different things. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/storage.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Demetrios: +Oh, yeah, it's so true. It is so true. Well, everyone that is out there watching. -× +Demetrios: +Us, thank you for joining and we will see you next time. Next week we'll be back for another. -[Powered by](https://qdrant.tech/) +Demetrios: +Session of these vector talks and I am pleased to have you again. 
-<|page-193-lllmstxt|> -## indexing-optimization -- [Articles](https://qdrant.tech/articles/) -- Optimizing Memory for Bulk Uploads +Demetrios: +Reach out to me if you want to join us. -[Back to Vector Search Manuals](https://qdrant.tech/articles/vector-search-manuals/) +Demetrios: +You want to give a talk? I'll see you all later. Have a good one. -# Optimizing Memory for Bulk Uploads +Hamza Farooq: +Thank you. Bye. -Sabrina Aquino +<|page-391-lllmstxt|> +> *"When we were building proof of concept for this solution, we initially started with Postgres. But after some experimentation, we realized that it basically does not perform very well in terms of recall and speed... then we came to know that Qdrant performs a lot better as compared to other solutions that existed at the moment.”*\ +> -- Rishabh Bhardwaj +> -· +How does the HNSW (Hierarchical Navigable Small World) algorithm benefit the solution built by Rishabh? -February 13, 2025 +Rhishabh, a Data Engineer at HRS Group, excels in designing, developing, and maintaining data pipelines and infrastructure crucial for data-driven decision-making processes. With extensive experience, Rhishabh brings a profound understanding of data engineering principles and best practices to the role. Proficient in SQL, Python, Airflow, ETL tools, and cloud platforms like AWS and Azure, Rhishabh has a proven track record of delivering high-quality data solutions that align with business needs. Collaborating closely with data analysts, scientists, and stakeholders at HRS Group, Rhishabh ensures the provision of valuable data and insights for informed decision-making. -![Optimizing Memory for Bulk Uploads](https://qdrant.tech/articles_data/indexing-optimization/preview/title.jpg) +***Listen to the episode on [Spotify](https://open.spotify.com/episode/3IMIZljXqgYBqt671eaR9b?si=HUV6iwzIRByLLyHmroWTFA), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/tDWhMAOyrcE).*** -# [Anchor](https://qdrant.tech/articles/indexing-optimization/\#optimizing-memory-consumption-during-bulk-uploads) Optimizing Memory Consumption During Bulk Uploads + -Efficient memory management is a constant challenge when you’re dealing with **large-scale vector data**. In high-volume ingestion scenarios, even seemingly minor configuration choices can significantly impact stability and performance. + -Let’s take a look at the best practices and recommendations to help you optimize memory usage during bulk uploads in Qdrant. We’ll cover scenarios with both **dense** and **sparse** vectors, helping your deployments remain performant even under high load and avoiding out-of-memory errors. +## **Top Takeaways:** -## [Anchor](https://qdrant.tech/articles/indexing-optimization/\#indexing-for-dense-vs-sparse-vectors) Indexing for dense vs. sparse vectors +Data inconsistency, duplication, and real-time processing challenges? Rishabh Bhardwaj, Data Engineer at HRS Group has the solution! -**Dense vectors** +In this episode, Rishabh dives into the nitty-gritty of creating a high-performance hotel matching solution with Qdrant, covering everything from data inconsistency challenges to the speed and accuracy enhancements achieved through the HNSW algorithm. -Qdrant employs an **HNSW-based index** for fast similarity search on dense vectors. By default, HNSW is built or updated once the number of **unindexed** vectors in a segment exceeds a set `indexing_threshold`. 
Although it delivers excellent query speed, building or updating the HNSW graph can be **resource-intensive** if it occurs frequently or across many small segments. +5 Keys to Learning from the Episode: -**Sparse vectors** +1. Discover the importance of data consistency and the challenges it poses when dealing with multiple sources and languages. +2. Learn how Qdrant, an open-source vector database, outperformed other solutions and provided an efficient solution for high-speed matching. +3. Explore the unique modification of the HNSW algorithm in Qdrant and how it optimized the performance of the solution. +4. Dive into the crucial role of geofiltering and how it ensures accurate matching based on hotel locations. +5. Gain insights into the considerations surrounding GDPR compliance and the secure handling of hotel data. -Sparse vectors use an **inverted index**. This index is updated at the **time of upsertion**, meaning you cannot disable or postpone it for sparse vectors. In most cases, its overhead is smaller than that of building an HNSW graph, but you should still be aware that each upsert triggers a sparse index update. +> Fun Fact: Did you know that Rishabh and his team experimented with multiple transformer models to find the best fit for their entity resolution use case? Ultimately, they found that the Mini LM model struck the perfect balance between speed and accuracy. Talk about a winning combination! +> -## [Anchor](https://qdrant.tech/articles/indexing-optimization/\#bulk-upload-configuration-for-dense-vectors) Bulk upload configuration for dense vectors +## Show Notes: -When performing high-volume vector ingestion, you have **two primary options** for handling indexing overhead. You should choose one depending on your specific workload and memory constraints: +02:24 Data from different sources is inconsistent and complex.\ +05:03 Using Postgres for proof, switched to Qdrant for better results\ +09:16 Geofiltering is crucial for validating our matches.\ +11:46 Insights on performance metrics and benchmarks.\ +16:22 We experimented with different values and found the desired number.\ +19:54 We experimented with different models and found the best one.\ +21:01 API gateway connects multiple clients for entity resolution.\ +24:31 Multiple languages supported, using transcript API for accuracy. -- **Disable HNSW indexing** +## More Quotes from Rishabh: -To reduce memory and CPU pressure during bulk ingestion, you can **disable HNSW indexing entirely** by setting `"m": 0`. -For dense vectors, the `m` parameter defines how many edges each node in the HNSW graph can have. -This way, no dense vector index will be built, preventing unnecessary CPU usage during ingestion. +*"One of the major challenges is the data inconsistency.”*\ +-- Rishabh Bhardwaj -**Figure 1:** A description of three key HNSW parameters. +*"So the only thing of how to know that which model would work for us is to again experiment with the models on our own data sets. 
But after doing those experiments, we realized that this is the best model that offers the best balance between speed and accuracy cool of the embeddings.”*\ +-- Rishabh Bhardwaj -![](https://qdrant.tech/articles_data/indexing-optimization/hnsw-parameters.png) +*"Qdrant basically optimizes a lot using for the compute resources and this also helped us to scale the whole infrastructure in a really efficient manner.”*\ +-- Rishabh Bhardwaj -```json -PATCH /collections/your_collection -{ - "hnsw_config": { - "m": 0 - } -} +## Transcript: +Demetrios: +Hello, fellow travelers in vector space. Dare, I call you astronauts? Today we've got an incredible conversation coming up with Rishabh, and I am happy that you all have joined us. Rishabh, it's great to have you here, man. How you doing? -``` +Rishabh Bhardwaj: +Thanks for having me, Demetrios. I'm doing really great. -**After ingestion is complete**, you can **re-enable HNSW** by setting `m` back to a production value (commonly 16 or 32). -Remember that search won’t use HNSW until the index is built, so search performance may be slower during this period. +Demetrios: +Cool. I love hearing that. And I know you are in India. It is a little bit late there, so I appreciate you taking the time to come on the Vector space talks with us today. You've got a lot of stuff that you're going to be talking about. For anybody that does not know you, you are a data engineer at Hrs Group, and you're responsible for designing, developing, and maintaining data pipelines and infrastructure that supports the company. I am excited because today we're going to be talking about building a high performance hotel matching solution with Qdrant. Of course, there's a little kicker there. -- **Disabling optimizations completely** +Demetrios: +We want to get into how you did that and how you leveraged Qdrant. Let's talk about it, man. Let's get into it. I want to know give us a quick overview of what exactly this is. I gave the title, but I think you can tell us a little bit more about building this high performance hotel matching solution. -The `indexing_threshold` tells Qdrant how many unindexed dense vectors can accumulate in a segment before building the HNSW graph. Setting `"indexing_threshold"=0` defers indexing entirely, keeping **ingestion speed at maximum**. However, this means uploaded vectors are not moved to disk while uploading, which can lead to **high RAM usage**. +Rishabh Bhardwaj: +Definitely. So to start with, a brief description about the project. So we have some data in our internal databases, and we ingest a lot of data on a regular basis from different sources. So Hrs is basically a global tech company focused on business travel, and we have one of the most used hotel booking portals in Europe. So one of the major things that is important for customer satisfaction is the content that we provide them on our portals. Right. So the issue or the key challenges that we have is basically with the data itself that we ingest from different sources. One of the major challenges is the data inconsistency. -```json -PATCH /collections/your_collection -{ - "optimizer_config": { - "indexing_threshold": 0 - } -} +Rishabh Bhardwaj: +So different sources provide data in different formats, not only in different formats. It comes in multiple languages as well. So almost all the languages being used across Europe and also other parts of the world as well. 
So, Majorly, the data is coming across 20 different languages, and it makes it really difficult to consolidate and analyze this data. And this inconsistency in data often leads to many errors in data interpretation and decision making as well. Also, there is a challenge of data duplication, so the same piece of information can be represented differently across various sources, which could then again lead to data redundancy. And identifying and resolving these duplicates is again a significant challenge. Then the last challenge I can think about is that this data processing happens in real time. -``` +Rishabh Bhardwaj: +So we have a constant influx of data from multiple sources, and processing and updating this information in real time is a really daunting task. Yeah. -After bulk ingestion, set `indexing_threshold` to a positive value to ensure vectors are indexed and searchable via HNSW. **Vectors will not be searchable via HNSW until indexing is performed.** +Demetrios: +And when you are talking about this data duplication, are you saying things like, it's the same information in French and German? Or is it something like it's the same column, just a different way in like, a table? -Small thresholds (e.g., 100) mean more frequent indexing, which can still be costly if many segments exist. Larger thresholds (e.g., 10000) delay indexing to batch more vectors at once, potentially using more RAM at the moment of index build, but fewer builds overall. +Rishabh Bhardwaj: +Actually, it is both the cases, so the same entities can be coming in multiple languages. And then again, second thing also wow. -Between these two approaches, we generally recommend disabling HNSW ( `"m"=0`) during bulk ingestion to keep memory usage predictable. Using `indexing_threshold=0` can be an alternative, but only if your system has enough memory to accommodate the unindexed vectors in RAM. +Demetrios: +All right, cool. Well, that sets the scene for us. Now, I feel like you brought some slides along. Feel free to share those whenever you want. I'm going to fire away the first question and ask about this. I'm going to go straight into Qdrant questions and ask you to elaborate on how the unique modification of Qdrant of the HNSW algorithm benefits your solution. So what are you doing there? How are you leveraging that? And how also to add another layer to this question, this ridiculously long question that I'm starting to get myself into, how do you handle geo filtering based on longitude and latitude? So, to summarize my lengthy question, let's just start with the HNSW algorithm. How does that benefit your solution? -* * * +Rishabh Bhardwaj: +Sure. So to begin with, I will give you a little backstory. So when we were building proof of concept for this solution, we initially started with Postgres, because we had some Postgres databases lying around in development environments, and we just wanted to try out and build a proof of concept. So we installed an extension called Pgvector. And at that point of time, it used to have IVF Flat indexing approach. But after some experimentation, we realized that it basically does not perform very well in terms of recall and speed. Basically, if we want to increase the speed, then we would suffer a lot on basis of recall. Then we started looking for native vector databases in the market, and then we saw some benchmarks and we came to know that Qdrant performs a lot better as compared to other solutions that existed at the moment. 
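The bulk-ingestion guidance quoted from the indexing-optimization article above (disable HNSW with `"m": 0`, defer `indexing_threshold`, then restore both once the upload finishes) can also be driven from the Python client instead of raw REST calls. This is a minimal sketch, assuming a qdrant-client version whose `update_collection` accepts `hnsw_config` and `optimizers_config`; the collection name and restored values are placeholders taken from the article's examples, not a prescribed configuration.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Before a bulk upload: skip HNSW graph construction and defer optimizer indexing.
client.update_collection(
    collection_name="bulk_demo",  # placeholder collection name
    hnsw_config=models.HnswConfigDiff(m=0),
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=0),
)

# ... upload points here ...

# After the upload: restore production values so the HNSW index gets built.
client.update_collection(
    collection_name="bulk_demo",
    hnsw_config=models.HnswConfigDiff(m=16),
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000),
)
```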
-## [Anchor](https://qdrant.tech/articles/indexing-optimization/\#on-disk-storage-in-qdrant) On-Disk storage in Qdrant +Rishabh Bhardwaj: +And also, it was open source and really easy to host and use. We just needed to deploy a docker image in EC two instance and we can really start using it. -By default, Qdrant keeps **vectors**, **payload data**, and **indexes** in memory to ensure low-latency queries. However, in large-scale or memory-constrained scenarios, you can configure some or all of them to be stored on-disk. This helps reduce RAM usage at the cost of potential increases in query latency, particularly for cold reads. +Demetrios: +Did you guys do your own benchmarks too? Or was that just like, you looked, you saw, you were like, all right, let's give this thing a spin. -**When to use on-disk**: +Rishabh Bhardwaj: +So while deciding initially we just looked at the publicly available benchmarks, but later on, when we started using Qdrant, we did our own benchmarks internally. Nice. -- You have **very large** or **rarely used** payload data or indexes, and freeing up RAM is worth potential I/O overhead. -- Your dataset doesn’t fit comfortably in available memory. -- You want to reduce memory pressure. -- You can tolerate slower queries if it ensures the system remains stable under heavy loads. +Demetrios: +All right. -* * * +Rishabh Bhardwaj: +We just deployed a docker image of Qdrant in one of the EC Two instances and started experimenting with it. Very soon we realized that the HNSW indexing algorithm that it uses to build the indexing for the vectors, it was really efficient. We noticed that as compared to the PG Vector IVF Flat approach, it was around 16 times faster. And it didn't mean that it was not that accurate. It was actually 5% more accurate as compared to the previous results. So hold up. -## [Anchor](https://qdrant.tech/articles/indexing-optimization/\#memmap-storage-and-segmentation) Memmap storage and segmentation +Demetrios: +16 times faster and 5% more accurate. And just so everybody out there listening knows we're not paying you to say this, right? -Qdrant uses **memory-mapped files** (segments) to store data on-disk. Rather than loading all vectors into RAM, Qdrant maps each segment into its address space, paging data in and out on demand. This helps keep the active RAM footprint lower, because data can be paged out if memory pressure is high. But each segment still incurs overhead (metadata, page table entries, etc.). +Rishabh Bhardwaj: +No, not at all. -During **high-volume ingestion**, you can accumulate dozens of small segments. Qdrant’s **optimizer** can later merge these into fewer, larger segments, reducing per-segment overhead and lowering total memory usage. +Demetrios: +All right, keep going. I like it. -When you create a collection with `"on_disk": true`, Qdrant will store newly inserted vectors in memmap storage from the start. For example: +Rishabh Bhardwaj: +Yeah. So initially, during the experimentations, we begin with the default values for the HNSW algorithm that Qdrant ships with. And these benchmarks that I just told you about, it was based on those parameters. But as our use cases evolved, we also experimented on multiple values of basically M and EF construct that Qdrant allow us to specify in the indexing algorithm. -```json -PATCH /collections/your_collection -{ - "vectors": { - "on_disk": true - } -} +Demetrios: +Right. 
-``` +Rishabh Bhardwaj: +So also the other thing is, Qdrant also provides the functionality to specify those parameters while making the search as well. So it does not mean if we build the index initially, we only have to use those specifications. We can again specify them during the search as well. -This approach immediately places all incoming vectors on disk, which can be very efficient in case of bulk ingestion. +Demetrios: +Okay. -However, **vector data and indexes are stored separately**, so enabling `on_disk` for vectors does not automatically store their indexes on disk. To fully optimize memory usage, you may need to configure **both vector storage and index storage** independently. +Rishabh Bhardwaj: +Yeah. So some use cases we have requires 100% accuracy. It means we do not need to worry about speed at all in those use cases. But there are some use cases in which speed is really important when we need to match, like, a million scale data set. In those use cases, speed is really important, and we can adjust a little bit on the accuracy part. So, yeah, this configuration that Qdrant provides for indexing really benefited us in our approach. -For dense vectors, you can enable on-disk storage for both the **vector data** and the **HNSW index**: +Demetrios: +Okay, so then layer into that all the fun with how you're handling geofiltering. -```json -PATCH /collections/your_collection -{ - "vectors": { - "on_disk": true - }, - "hnsw_config": { - "on_disk": true - } -} +Rishabh Bhardwaj: +So geofiltering is also a very important feature in our solution because the entities that we are dealing with in our data majorly consist of hotel entities. Right. And hotel entities often comes with the geocordinates. So even if we match it using one of the Embedding models, then we also need to make sure that whatever the model has matched with a certain cosine similarity is also true. So in order to validate that, we use geofiltering, which also comes in stacked with Qdrant. So we provide geocordinate data from our internal databases, and then we match it from what we get from multiple sources as well. And it also has a radius parameter, which we can provide to tune in. How much radius do we want to take in account in order for this to be filterable? -``` +Demetrios: +Yeah. Makes sense. I would imagine that knowing where the hotel location is is probably a very big piece of the puzzle that you're serving up for people. So as you were doing this, what are some things that came up that were really important? I know you talked about working with Europe. There's a lot of GDPR concerns. Was there, like, privacy considerations that you had to address? Was there security considerations when it comes to handling hotel data? Vector, Embeddings, how did you manage all that stuff? -For sparse vectors, you need to enable `on_disk` for both the vector data and the sparse index separately: +Rishabh Bhardwaj: +So GDP compliance? Yes. It does play a very important role in this whole solution. -```json -PATCH /collections/your_collection -{ - "sparse_vectors": { - "text": { - "on_disk": true, - "index": { - "on_disk": true - } - } - } -} +Demetrios: +That was meant to be a thumbs up. I don't know what happened there. Keep going. Sorry, I derailed that. -``` +Rishabh Bhardwaj: +No worries. Yes. So GDPR compliance is also one of the key factors that we take in account while building this solution to make sure that nothing goes out of the compliance. 
We basically deployed Qdrant inside a private EC two instance, and it is also protected by an API key. And also we have built custom authentication workflows using Microsoft Azure SSO. -* * * +Demetrios: +I see. So there are a few things that I also want to ask, but I do want to open it up. There are people that are listening, watching live. If anyone wants to ask any questions in the chat, feel free to throw something in there and I will ask away. In the meantime, while people are typing in what they want to talk to you about, can you talk to us about any insights into the performance metrics? And really, these benchmarks that you did where you saw it was, I think you said, 16 times faster and then 5% more accurate. What did that look like? What benchmarks did you do? How did you benchmark it? All that fun stuff. And what are some things to keep in mind if others out there want to benchmark? And I guess you were just benchmarking it against Pgvector, right? -## [Anchor](https://qdrant.tech/articles/indexing-optimization/\#best-practices-for-high-volume-vector-ingestion)**Best practices for high-volume vector ingestion** +Rishabh Bhardwaj: +Yes, we did. -Bulk ingestion can lead to high memory consumption and even out-of-memory (OOM) errors. **If you’re experiencing out-of-memory errors with your current setup**, scaling up temporarily (increasing available RAM) will provide a buffer while you adjust Qdrant’s configuration for more a efficient data ingestion. +Demetrios: +Okay, cool. -The key here is to control indexing overhead. Let’s walk through the best practices for high-volume vector ingestion in a constrained-memory environment. +Rishabh Bhardwaj: +So for benchmarking, we have some data sets that are already matched to some entities. This was done partially by humans and partially by other algorithms that we use for matching in the past. And it is already consolidated data sets, which we again used for benchmarking purposes. Then the benchmarks that I specified were only against PG vector, and we did not benchmark it any further because the speed and the accuracy that Qdrant provides, I think it is already covering our use case and it is way more faster than we thought the solution could be. So right now we did not benchmark against any other vector database or any other solution. -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#1-store-vector-data-on-disk-immediately) 1\. Store vector data on disk immediately +Demetrios: +Makes sense just to also get an idea in my head kind of jumping all over the place, so forgive me. The semantic components of the hotel, was it text descriptions or images or a little bit of both? Everything? -The most effective way to reduce memory usage is to store vector data on disk right from the start using `on_disk: true`. This prevents RAM from being overloaded with raw vectors before optimization kicks in. +Rishabh Bhardwaj: +Yes. So semantic comes just from the descriptions of the hotels, and right now it does not include the images. But in future use cases, we are also considering using images as well to calculate the semantic similarity between two entities. -```json -PATCH /collections/your_collection -{ - "vectors": { - "on_disk": true - } -} +Demetrios: +Nice. Okay, cool. Good. I am a visual guy. You got slides for us too, right? If I'm not mistaken? Do you want to share those or do you want me to keep hitting you with questions? 
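Rishabh mentions above that their Qdrant instance runs inside a private EC2 host and is protected by an API key. For readers who want the same baseline, this is a minimal connection sketch; the URL and the environment-variable name are placeholders, not details from the episode.

```python
import os

from qdrant_client import QdrantClient

# Hypothetical private endpoint; in a setup like Rishabh's this would be the EC2 host inside the VPC.
client = QdrantClient(
    url="https://qdrant.internal.example.com:6333",
    api_key=os.environ["QDRANT_API_KEY"],  # placeholder variable name
)

print(client.get_collections())
```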
We have something from Brad in the chat and maybe before you share any slides, is there a map visualization as part of the application UI? Can you speak to what you used? -``` +Rishabh Bhardwaj: +If so, not right now, but this is actually a great idea and we will try to build it as soon as possible. -Previously, vector data had to be held in RAM until optimizers could move it to disk, which caused significant memory pressure. Now, by writing vectors to disk directly, memory overhead is significantly reduced, making bulk ingestion much more efficient. +Demetrios: +Yeah, it makes sense. Where you have the drag and you can see like within this area, you have X amount of hotels, and these are what they look like, et cetera, et cetera. -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#2-disable-hnsw-for-dense-vectors-m0) 2\. Disable HNSW for dense vectors ( `m=0`) +Rishabh Bhardwaj: +Yes, definitely. -During an **initial bulk load**, you can **disable** dense indexing by setting `"m": 0.` This ensures Qdrant won’t build an HNSW graph for incoming vectors, avoiding unnecessary memory and CPU usage. +Demetrios: +Awesome. All right, so, yeah, feel free to share any slides you have, otherwise I can hit you with another question in the meantime, which is I'm wondering about the configurations you used for the HNSW index in Qdrant and what were the number of edges per node and the number of neighbors to consider during the index building. All of that fun stuff that goes into the nitty gritty of it. -```json -PATCH /collections/your_collection -{ - "hnsw_config": { - "m": 0 - }, - "optimizer_config": { - "indexing_threshold": 10000 - } -} +Rishabh Bhardwaj: +So should I go with the slide first or should I answer your question first? -``` +Demetrios: +Probably answer the question so we don't get too far off track, and then we can hit up your slides. And the slides, I'm sure, will prompt many other questions from my side and the audience's side. -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#3-let-the-optimizer-run-after-bulk-uploads) 3\. Let the optimizer run **after** bulk uploads +Rishabh Bhardwaj: +So, for HNSW configuration, we have specified the value of M, which is, I think, basically the layers as 64, and the value for EF construct is 256. -Qdrant’s optimizers continuously restructure data to improve search efficiency. However, during a bulk upload, this can lead to excessive data movement and overhead as segments are constantly reorganized while new data is still arriving. +Demetrios: +And how did you go about that? -To avoid this, **upload all data first**, then allow the optimizer to process everything in one go. This minimizes redundant operations and ensures a more efficient segment structure. +Rishabh Bhardwaj: +So we did some again, benchmarks based on the single model that we have selected, which is mini LM, L six, V two. I will talk about it later also. But we basically experimented with different values of M and EF construct, and we came to this number that this is the value that we want to go ahead with. And also when I said that in some cases, indexing is not required at all, speed is not required at all, we want to make sure that whatever we are matching is 100% accurate. In that case, the Python client for Qdrant also provides a parameter called exact, and if we specify it as true, then it basically does not use indexing and it makes a full search on the whole vector collection, basically. 
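Putting the numbers from this part of the conversation together: M = 64 and ef_construct = 256 at index time, a geo-radius condition on the hotel coordinates, and the `exact` flag for the accuracy-first use cases. The sketch below uses the Python client; the collection name, the `location` payload field, the 384-dimension size (matching the MiniLM-L6-v2 model mentioned later), the query vector, and the 1 km radius are assumptions for illustration, not details confirmed in the episode.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Index-time HNSW settings mentioned by Rishabh: m=64, ef_construct=256.
client.create_collection(
    collection_name="hotels",  # placeholder
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    hnsw_config=models.HnswConfigDiff(m=64, ef_construct=256),
)

# Search restricted to hotels within ~1 km of a coordinate (geo-radius filter),
# with exact=True to bypass the HNSW index when recall matters more than speed.
hits = client.search(
    collection_name="hotels",
    query_vector=[0.0] * 384,  # stand-in embedding
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="location",  # assumed payload field holding lat/lon
                geo_radius=models.GeoRadius(
                    center=models.GeoPoint(lat=51.5074, lon=-0.1278),
                    radius=1000.0,  # meters
                ),
            )
        ]
    ),
    search_params=models.SearchParams(exact=True),
    limit=10,
)
```

Dropping `exact=True` (or passing `models.SearchParams(hnsw_ef=...)` instead) switches back to the approximate, speed-oriented mode discussed earlier.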
-### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#4-wait-for-indexation-to-clear-up-memory)**4\. Wait for indexation to clear up memory** +Demetrios: +Okay, so there's something for me that's pretty fascinating there on these different use cases. What else differs in the different ones? Because you have certain needs for speed or accuracy. It seems like those are the main trade offs that you're working with. What differs in the way that you set things up? -Before performing additional operations, **allow Qdrant to finish any ongoing indexing**. Large indexing jobs can keep memory usage high until they fully complete. +Rishabh Bhardwaj: +So in some cases so there are some internal databases that need to have hotel entities in a very sophisticated manner. It means it should not contain even a single duplicate entity. In those cases, accuracy is the most important thing we look at, and in some cases, for data analytics and consolidation purposes, we want speed more, but the accuracy should not be that much in value. -Monitor Qdrant logs or metrics to confirm when indexing finishes—once that happens, memory consumption should drop as intermediate data structures are freed. +Demetrios: +So what does that look like in practice? Because you mentioned okay, when we are looking for the accuracy, we make sure that it comes through all of the different records. Right. Are there any other things in practice that you did differently? -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#5-re-enable-hnsw-post-ingestion) 5\. Re-enable HNSW post-ingestion +Rishabh Bhardwaj: +Not really. Nothing I can think of right now. -After the ingestion phase is over and memory usage has stabilized, re-enable HNSW for dense vectors by setting `m` back to a production value (commonly `16` or `32`): +Demetrios: +Okay, if anything comes up yeah, I'll remind you, but hit us with the slides, man. What do you got for the visual learners out there? -```json -PATCH /collections/your_collection -{ - "hnsw_config": { - "m": 16 - } -} +Rishabh Bhardwaj: +Sure. So I have an architecture diagram of what the solution looks like right now. So, this is the current architecture that we have in production. So, as I mentioned, we have deployed the Qdrant vector database in an EC Two, private EC Two instance hosted inside a VPC. And then we have some batch jobs running, which basically create Embeddings. And the source data basically first comes into S three buckets into a data lake. We do a little bit of preprocessing data cleaning and then it goes through a batch process of generating the Embeddings using the Mini LM model, mini LML six, V two. And this model is basically hosted in a SageMaker serverless inference endpoint, which allows us to not worry about servers and we can scale it as much as we want. -``` +Rishabh Bhardwaj: +And it really helps us to build the Embeddings in a really fast manner. -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#5-enable-quantization) 5\. Enable quantization +Demetrios: +Why did you choose that model? Did you go through different models or was it just this one worked well enough and you went with it? -If you had planned to store all dense vectors on disk, be aware that searches can slow down drastically due to frequent disk I/O while memory pressure is high. A more balanced approach is **scalar quantization**: compress vectors (e.g., to `int8`) so they fit in RAM without occupying as much space as full floating-point values. 
+Rishabh Bhardwaj: +No, actually this was, I think the third or the fourth model that we tried out with. So what happens right now is if, let's say we want to perform a task such as sentence similarity and we go to the Internet and we try to find a model, it is really hard to see which model would perform best in our use case. So the only thing of how to know that which model would work for us is to again experiment with the models on our own data sets. So we did a lot of experiments. We used, I think, Mpnet model and a lot of multilingual models as well. But after doing those experiments, we realized that this is the best model that offers the best balance between speed and accuracy cool of the Embeddings. So we have deployed it in a serverless inference endpoint in SageMaker. And once we generate the Embeddings in a glue job, we then store them into the vector database Qdrant. -```json -PATCH /collections/your_collection -{ - "quantization_config": { - "scalar": { - "type": "int8", - "always_ram": true - } - } -} +Rishabh Bhardwaj: +Then this part here is what goes on in the real time scenario. So, we have multiple clients, basically multiple application that would connect to an API gateway. We have exposed this API gateway in such a way that multiple clients can connect to it and they can use this entity resolution service according to their use cases. And we take in different parameters. Some are mandatory, some are not mandatory, and then they can use it based on their use case. The API gateway is connected to a lambda function which basically performs search on Qdrant vector database using the same Embeddings that can be generated from the same model that we hosted in the serverless inference endpoint. So, yeah, this is how the diagram looks right now. It did not used to look like this sometime back, but we have evolved it, developed it, and now we have got to this point where it is really scalable because most of the infrastructure that we have used here is serverless and it can be scaled up to any number of requests that you want. -``` +Demetrios: +What did you have before that was the MVP. -Quantized vectors remain **in-memory** yet consume less space, preserving much of the performance advantage of RAM-based search. Learn more about [vector quantization](https://qdrant.tech/articles/what-is-vector-quantization/). +Rishabh Bhardwaj: +So instead of this one, we had a real time inference endpoint which basically limited us to some number of requests that we had preset earlier while deploying the model. So this was one of the bottlenecks and then lambda function was always there, I think this one and also I think in place of this Qdrant vector database, as I mentioned, we had Postgres. So yeah, that was also a limitation because it used to use a lot of compute capacity within the EC two instance as compared to Qdrant. Qdrant basically optimizes a lot using for the compute resources and this also helped us to scale the whole infrastructure in a really efficient manner. -### [Anchor](https://qdrant.tech/articles/indexing-optimization/\#conclusion) Conclusion +Demetrios: +Awesome. Cool. This is fascinating. From my side, I love seeing what you've done and how you went about iterating on the architecture and starting off with something that you had up and running and then optimizing it. 
So this project has been how long has it been in the making and what has the time to market been like that first MVP from zero to one and now it feels like you're going to one to infinity by making it optimized. What's the time frames been here? -High-volume vector ingestion can place significant memory demands on Qdrant, especially if dense vectors are indexed in real time. By following these tips, you can substantially reduce the risk of out-of-memory errors and maintain stable performance in a memory-limited environment. +Rishabh Bhardwaj: +I think we started this in the month of May this year. Now it's like five to six months already. So the first working solution that we built was in around one and a half months and then from there onwards we have tried to iterate it to make it better and better. -As always, monitor your system’s behavior. Review logs, watch metrics, and keep an eye on memory usage. Each workload is different, so it’s wise to fine-tune Qdrant’s parameters according to your hardware and data scale. +Demetrios: +Cool. Very cool. Some great questions come through in the chat. Do you have multiple language support for hotel names? If so, did you see any issues with such mappings? -##### Was this page useful? +Rishabh Bhardwaj: +Yes, we do have support for multiple languages and we do not do it using currently using the multilingual models because what we realized is the multilingual models are built on journal sentences and not based it is not trained on entities like names, hotel names and traveler names, et cetera. So when we experimented with the multilingual models it did not provide much satisfactory results. So we used transcript API from Google and it is able to basically translate a lot of languages across that we have across the data and it really gives satisfactory results in terms of entity resolution. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Demetrios: +Awesome. What other transformers were considered for the evaluation? -Thank you for your feedback! 🙏 +Rishabh Bhardwaj: +The ones I remember from top of my head are Mpnet, then there is a Chinese model called Text to VEC, Shiba something and Bert uncased, if I remember correctly. Yeah, these were some of the models. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/indexing-optimization.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Demetrios: +That we considered and nothing stood out that worked that well or was it just that you had to make trade offs on all of them? -On this page: +Rishabh Bhardwaj: +So in terms of accuracy, Mpnet was a little bit better than Mini LM but then again it was a lot slower than the Mini LM model. It was around five times slower than the Mini LM model, so it was not a big trade off to give up with. So we decided to go ahead with Mini LM. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/indexing-optimization.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Demetrios: +Awesome. Well, dude, this has been pretty enlightening. I really appreciate you coming on here and doing this. If anyone else has any questions for you, we'll leave all your information on where to get in touch in the chat. Rishabh, thank you so much. This is super cool. 
I appreciate you coming on here. Anyone that's listening, if you want to come onto the vector space talks, feel free to reach out to me and I'll make it happen. -× +Demetrios: +This is really cool to see the different work that people are doing and how you all are evolving the game, man. I really appreciate this. -[Powered by](https://qdrant.tech/) +Rishabh Bhardwaj: +Thank you, Demetrios. Thank you for inviting inviting me and have a nice day. -<|page-194-lllmstxt|> -## cloud-rbac -- [Documentation](https://qdrant.tech/documentation/) -- Cloud RBAC +<|page-392-lllmstxt|> +> *"When things are actually similar or how we define similarity. They are close to each other and if they are not, they're far from each other. This is what a model or embedding model tries to do.”*\ +>-- Nirant Kasliwal -# [Anchor](https://qdrant.tech/documentation/cloud-rbac/\#cloud-rbac) Cloud RBAC +Heard about FastEmbed? It's a game-changer. Nirant shares tricks on how to improve your embedding models. You might want to give it a shot! -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/\#about-cloud-rbac) About Cloud RBAC +Nirant Kasliwal, the creator and maintainer of FastEmbed, has made notable contributions to the Finetuning Cookbook at OpenAI Cookbook. His contributions extend to the field of Natural Language Processing (NLP), with over 5,000 copies of the NLP book sold. -Qdrant Cloud enables you to manage permissions for your cloud resources with greater precision within the Qdrant Cloud console. This feature ensures that only authorized users have access to sensitive data and capabilities, covering the following areas: +***Listen to the episode on [Spotify](https://open.spotify.com/episode/4QWCyu28SlURZfS2qCeGKf?si=GDHxoOSQQ_W_UVz4IzzC_A), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/e67jLAx_F2A).*** -- Billing -- Identity and Access Management -- Clusters\* -- Hybrid Cloud -- Account Configuration + -_Note: Current permissions control access to ALL clusters. Per Cluster permissions will be in a future release._ + -> 💡 You can access this in **Access Management > User & Role Management** _if enabled._ +## **Top Takeaways:** -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/\#guides) Guides +Nirant Kasliwal, AI Engineer at Qdrant joins us on Vector Space Talks to dive into FastEmbed, a lightning-quick method for generating embeddings. -- [Role Management](https://qdrant.tech/documentation/cloud-rbac/role-management/) -- [User Management](https://qdrant.tech/documentation/cloud-rbac/user-management/) +In this episode, Nirant shares insights, tips, and innovative ways to enhance embedding generation. -## [Anchor](https://qdrant.tech/documentation/cloud-rbac/\#reference) Reference +5 Keys to Learning from the Episode: -- [Permission List](https://qdrant.tech/documentation/cloud-rbac/permission-reference/) +1. Nirant introduces some hacker tricks for improving embedding models - you won't want to miss these! +2. Learn how quantized embedding models can enhance CPU performance. +3. Get an insight into future plans for GPU-friendly quantized models. +4. Understand how to select default models in Qdrant based on MTEB benchmark, and how to calibrate them for domain-specific tasks. +5. Find out how Fast Embed, a Python library created by Nirant, can solve common challenges in embedding creation and enhance the speed and efficiency of your workloads. -##### Was this page useful? 
+> Fun Fact: The largest header or adapter used in production is only about 400-500 KBs -- proof that bigger doesn't always mean better! +> -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +## Show Notes: -Thank you for your feedback! 🙏 +00:00 Nirant discusses FastEmbed at Vector Space Talks.\ +05:00 Tokens are expensive and slow in open air.\ +08:40 FastEmbed is fast and lightweight.\ +09:49 Supporting multimodal embedding is our plan.\ +15:21 No findings. Enhancing model downloads and performance.\ +16:59 Embed creation on your own compute, not cloud. Control and simplicity are prioritized.\ +21:06 Qdrant is fast for embedding similarity search.\ +24:07 Engineer's mindset: make informed guesses, set budgets.\ +26:11 Optimize embeddings with questions and linear layers.\ +29:55 Fast, cheap inference using mixed precision embeddings. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/_index.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +## More Quotes from Nirant: -On this page: +*"There is the academic way of looking at and then there is the engineer way of looking at it, and then there is the hacker way of looking at it. And I will give you all these three answers in that order.”*\ +-- Nirant Kasliwal -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/cloud-rbac/_index.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +*"The engineer's mindset now tells you that the best way to build something is to make an informed guess about what workload or challenges you're going to foresee. Right. Like a civil engineer builds a bridge around how many cars they expect, they're obviously not going to build a bridge to carry a shipload, for instance, or a plane load, which are very different.”*\ +-- Nirant Kasliwal -× +*"I think the more correct way to look at it is that we use the CPU better.”*\ +-- Nirant Kasliwal -[Powered by](https://qdrant.tech/) +## Transcript: +Demetrios: +Welcome back, everyone, to another vector space talks. Today we've got my man Nirant coming to us talking about FastEmbed. For those, if this is your first time at our vector space talks, we like to showcase some of the cool stuff that the community in Qdrant is doing, the Qdrant community is doing. And we also like to show off some of the cool stuff that Qdrant itself is coming out with. And this is one of those times that we are showing off what Qdrant itself came out with with FastEmbed. And we've got my man Nirant around here somewhere. I am going to bring him on stage and I will welcome him by saying Nirant a little bit about his bio, we could say. So, Naran, what's going on, dude? Let me introduce you real fast before we get cracking. -<|page-195-lllmstxt|> -## case-study-dust-v2 -0 +Demetrios: +And you are a man that wears many hats. You're currently working on the Devrel team at Qdrant, right? I like that shirt that you got there. And you have worked with ML models and embeddings since 2017. That is wild. You are also the creator and maintainer of fast embed. So you're the perfect guy to talk to about this very topic that we are doing today. Now, if anyone has questions, feel free to throw them into the chat and I will ask Nirant as he's going through it. 
I will also take this moment to encourage anyone who is watching to come and join us in Discord, if you are not already there, for the Qdrant Discord.

-# How Dust Scaled to 5,000+ Data Sources with Qdrant

+Demetrios:
+And secondly, I will encourage you, if you have something that you've been doing with Qdrant or in the vector database space, or in the AI application space, and you want to show it off, we would love to have you talk at the Vector Space Talks. So without further ado, Nirant, my man, I'm going to kick it over to you, and I am going to start it off with: what are the challenges with embedding creation today?

-Daniel Azoulai

+Nirant Kasliwal:
+I think embedding creation is not a standalone problem, as you might first think. That's a first thought, that it's a standalone problem. It's actually two problems. One is classic compute: how do you take any media? So you can make embeddings from practically any form of media: text, images, video. In theory, you could make it from a bunch of things. So I recently saw somebody use soup as a metaphor. You can make soup from almost anything, so you can make embeddings from almost anything. Now, what do we want to do, though? Embeddings are ultimately a form of compression.

-·

+Nirant Kasliwal:
+So now we want to make sure that the compression captures something of interest to us. In this case, we want to make sure that embeddings capture some form of meaning of, let's say, text or images. And when we do that, what does that capture mean? We want that when things are actually similar, or whatever our definition of similarity is, they are close to each other, and if they are not, they're far from each other. This is what a model or embedding model tries to do, basically. The model itself is quite often trained and built in a way which retains its ability to learn new things, and you can separate similar embeddings faster, and all of those. But when we actually use this in production, we don't need all of those capabilities; we don't need the train-time capabilities.

-April 29, 2025

+Nirant Kasliwal:
+And that means that all the extra compute and features and everything that you have stored for training time are wasted in production. So that's almost like saying that every time I have to speak to you, I start over with: hello, I'm Nirant and I'm a human being. It's extremely infuriating, but we do this all the time with embeddings, and that is what FastEmbed primarily tries to fix. We see embeddings from the lens of production, and we say: how can we make a Python library which is built for speed, efficiency and accuracy? Those are the core ethos in that sense. And I think people really find this relatable as a problem area. So you can see this on our GitHub issues. For instance, somebody says that, oh yeah, it actually does what it says, and yes, that's a good thing. So for 8 million tokens we took about 3 hours on a MacBook Pro M1, while some other Ollama embedding took over two days.

-![How Dust Scaled to 5,000+ Data Sources with Qdrant](https://qdrant.tech/blog/case-study-dust-v2/preview/title.jpg)

+Nirant Kasliwal:
+You can expect what 8 million tokens would cost on OpenAI and how slow it would be, given that they frequently rate limit you. So for context, we made a 1 million embedding set, which was a little more than, well, a lot more than 1 million tokens, and that took us several hundred of us. It was not expensive, but it was very slow. So as a batch process, if you want to embed a large data set, it's very slow.
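+For reference, a minimal sketch of the FastEmbed batch path being contrasted here. The document strings are invented, and the class shown, TextEmbedding, is the entry point exposed by recent fastembed releases.
+
+```python
+from fastembed import TextEmbedding
+
+documents = [
+    "Qdrant is a vector database written in Rust.",
+    "FastEmbed generates embeddings quickly on CPU.",
+]
+
+model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")  # the small English default
+
+# embed() returns a generator, so large corpora are processed lazily, batch by batch
+embeddings = list(model.embed(documents))
+print(len(embeddings), embeddings[0].shape)  # 2 vectors, 384 dimensions each
+```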
I think the more colorful version of this somebody wrote on LinkedIn, Prithvira wrote on LinkedIn that your embeddings will go and I love that idea that we have optimized speed so that it just goes fast. That's the idea. So what do we I mean let's put names to these things, right? So one is we want it to be fast and light. And I'll explain what do we mean by light? We want recall to be fast, right? I mean, that's what we started with that what are embedding we want to be make sure that similar things are similar. -On this page: +Nirant Kasliwal: +That's what we call recall. We often confuse this with accuracy but in retrieval sense we'll call it recall. We want to make sure it's still easy to use, right? Like there is no reason for this to get complicated. And we are fast, I mean we are very fast. And part of that is let's say we use BGE small En, the English model only. And let's say this is all in tokens per second and the token is model specific. So for instance, the way BGE would count a token might be different from how OpenAI might count a token because the tokenizers are slightly different and they have been trained on slightly different corporates. So that's the idea. -- [Share on X](https://twitter.com/intent/tweet?url=https%3A%2F%2Fqdrant.tech%2Fblog%2Fcase-study-dust-v2%2F&text=How%20Dust%20Scaled%20to%205,000+%20Data%20Sources%20with%20Qdrant "x") -- [Share on LinkedIn](https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fqdrant.tech%2Fblog%2Fcase-study-dust-v2%2F "LinkedIn") +Nirant Kasliwal: +I would love you to try this so that I can actually brag about you trying it. -## [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#inside-dusts-vector-stack-overhaul-scaling-to-5000-data-sources-with-qdrant) Inside Dust’s Vector Stack Overhaul: Scaling to 5,000+ Data Sources with Qdrant +Demetrios: +What was the fine print on that slide? Benchmarks are my second most liked way to brag. What's your first most liked way to brag? -![How Dust Scaled to 5,000+ Data Sources with Qdrant](https://qdrant.tech/blog/case-study-dust-v2/case-study-dust-v2-v2-bento-dark.jpg) +Nirant Kasliwal: +The best way is that when somebody tells me that they're using it. -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#the-challenge-scaling-ai-infrastructure-for-thousands-of-data-sources) The Challenge: Scaling AI Infrastructure for Thousands of Data Sources +Demetrios: +There we go. So I guess that's an easy way to get people to try and use it. -Dust, an OS for AI-native companies enabling users to build AI agents powered by actions and company knowledge, faced a set of growing technical hurdles as it scaled its operations. The company’s core product enables users to give AI agents secure access to internal and external data resources, enabling enhanced workflows and faster access to information. However, this mission hit bottlenecks when their infrastructure began to strain under the weight of thousands of data sources and increasingly demanding user queries. +Nirant Kasliwal: +Yeah, I would love it if you try it. Tell us how it went for you, where it's working, where it's broken, all of that. I love it if you report issue then say I will even appreciate it if you yell at me because that means you're not ignoring me. -Initially, Dust employed a strategy of creating a separate vector collection per data source, which rapidly became unsustainable. As the number of data sources ballooned beyond 5,000, the platform began experiencing significant performance degradation. 
RAM consumption skyrocketed, and vector search performance slowed dramatically, especially as the memory-mapped vectors spilled onto disk storage. At one point, they were managing nearly a thousand collections simultaneously and processing over a million vector upsert and delete operations in a single cycle. +Demetrios: +That's it. There we go. Bug reports are good to throw off your mojo. Keep it rolling. -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#evaluation-and-selection-why-dust-chose-qdrant) Evaluation and Selection: Why Dust Chose Qdrant +Nirant Kasliwal: +So we said fast and light. So what does light mean? So you will see a lot of these Embedding servers have really large image sizes. When I say image, I mean typically or docker image that can typically go to a few GPS. For instance, in case of sentence transformers, which somebody's checked out with Transformers the package and PyTorch, you get a docker image of roughly five GB. The Ram consumption is not that high by the way. Right. The size is quite large and of that the model is just 400 MB. So your dependencies are very large. -The Dust team explored several popular vector databases. While each had merits, none met all of Dust’s increasingly complex needs. Some providers’ developer experience didn’t align with their workflows, and others lacked the deployment flexibility required. Dust needed a solution capable of handling multi-tenancy at scale, embedding model flexibility, efficient memory usage, and deep configurability. +Nirant Kasliwal: +And every time you do this on, let's say an AWS Lambda, or let's say if you want to do horizontal scaling, your cold start times can go in several minutes. That is very slow and very inefficient if you are working in a workload which is very spiky. And if you were to think about it, people have more queries than, let's say your corpus quite often. So for instance, let's say you are in customer support for an ecommerce food delivery app. Bulk of your order volume will be around lunch and dinner timing. So that's a very spiky load. Similarly, ecommerce companies, which are even in fashion quite often see that people check in on their orders every evening and for instance when they leave from office or when they get home. And that's another spike. -Qdrant stood out thanks to its open-source Rust foundation, giving Dust the control they needed over memory, performance, and customization. Its intuitive API and strong developer community also made the integration experience more seamless. Critically, Qdrant’s design allowed Dust to consolidate their fragmented architecture—replacing thousands of individual collections with a few shared, multi-tenant ones powered by robust sharding and payload filtering. +Nirant Kasliwal: +So whenever you have a spiky load, you want to be able to scale horizontally and you want to be able to do it fast. And that speed comes from being able to be light. And that is why Fast Embed is very light. So you will see here that we call out that Fast Embed is just half a GB versus five GB. So on the extreme cases, this could be a ten x difference in your docker, image sizes and even Ram consumptions recall how good or bad are these embeddings? Right? So we said we are making them fast but do we sacrifice how much performance do we trade off for that? So we did a cosine similarity test with our default embeddings which was VG small en initially and now 1.5 and they're pretty robust. We don't sacrifice a lot of performance. Everyone with me? 
I need some audio to you. -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#implementation-highlights-advanced-architecture-with-qdrant) Implementation Highlights: Advanced Architecture with Qdrant +Demetrios: +I'm totally with you. There is a question that came through the chat if this is the moment to ask it. -One of the most impactful features Dust adopted was scalar quantization. This reduced vector storage size by a factor of four, enabling the team to keep data in memory rather than falling back to slower disk storage. This shift alone led to dramatic latency improvements. Where queries in large collections once took 5 to 10 seconds, they now returned in under a second. Even in collections with over a million vectors and heavy payloads, search responses consistently clocked in well below the one-second mark. +Nirant Kasliwal: +Yes, please go for it. -Dust also built a custom `DustQdrantClient` to manage all vector-related operations. This client abstracted away differences between cluster versions, embedding models, and sharding logic, simplifying ongoing development. Their infrastructure runs in Google Cloud Platform, with Qdrant deployed in isolated VPCs that communicate with Dust’s core APIs using secure authentication. The architecture is replicated across two major regions—US and EU—ensuring both high availability and compliance with data residency laws. +Demetrios: +All right it's from a little bit back like a few slides ago. So I'm just warning you. Are there any plans to support audio or image sources in fast embed? -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#results-faster-performance-lower-costs-better-user-experience) Results: Faster Performance, Lower Costs, Better User Experience +Nirant Kasliwal: +If there is a request for that we do have a plan to support multimodal embedding. We would love to do that. If there's specific model within those, let's say you want Clip or Seglip or a specific audio model, please mention that either on that discord or our GitHub so that we can plan accordingly. So yeah, that's the idea. We need specific suggestions so that we keep adding it. We don't want to have too many models because then that creates confusion for our end users and that is why we take opinated stance and that is actually a good segue. Why do we prioritize that? We want this package to be easy to use so we're always going to try and make the best default choice for you. So this is a very Linux way of saying that we do one thing and we try to do that one thing really well. -The impact of Qdrant was felt immediately. Search latency was slashed from multi-second averages to sub-second responsiveness. Collections that once consumed over 30 GB of RAM were optimized to run efficiently at a quarter of that size. The shift to in-memory quantized vectors, while keeping original vectors on disk for fallback, proved to be the perfect hybrid model for balancing performance and resource usage. +Nirant Kasliwal: +And here, let's say for instance, if you were to look at Qdrant client it's just passing everything as you would. So docs is a list of strings, metadata is a list of dictionaries and IDs again is a list of IDs valid IDs as per the Qdrant Client spec. And the search is also very straightforward. The entire search query is basically just two params. You could even see a very familiar integration which is let's say langchain. I think most people here would have looked at this in some shape or form earlier. 
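+For readers following along, here is a small sketch of the two calls referred to on the slide, QdrantClient.add() and QdrantClient.query() from the qdrant-client FastEmbed integration (installed with `pip install "qdrant-client[fastembed]"`). The collection name, documents, and metadata are made up.
+
+```python
+from qdrant_client import QdrantClient
+
+client = QdrantClient(":memory:")  # or point it at a running Qdrant instance
+
+docs = [
+    "FastEmbed is a lightweight embedding library.",
+    "Qdrant supports payload filtering and quantization.",
+]
+metadata = [{"source": "docs"}, {"source": "blog"}]
+ids = [1, 2]
+
+# add() embeds the documents with the default FastEmbed model and upserts them
+client.add(collection_name="demo", documents=docs, metadata=metadata, ids=ids)
+
+# query() embeds the query text with the same model and searches the collection
+hits = client.query(collection_name="demo", query_text="What is FastEmbed?", limit=2)
+for hit in hits:
+    print(hit.id, hit.score, hit.document)
+```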
This is also very familiar and very straightforward. And under the hood what are we doing is just this one line. -These backend improvements directly translated into user-facing gains. Dust’s AI agents became more responsive and reliable. Even as customers loaded larger and more complex datasets, the system continued to deliver consistent performance. The platform’s ability to scale without degrading UX marked a turning point, empowering Dust to expand its customer base with confidence. +Nirant Kasliwal: +We have a dot embed which is a generator and we call a list on that so that we actually get a list of embeddings. You will notice that we have a passage and query keys here which means that our retrieval model which we have used as default here, takes these into account that if there is a passage and a query they need to be mapped together and a question and answer context is captured in the model training itself. The other caveat is that we pass on the token limits or context windows from the embedding model creators themselves. So in the case of this model, which is BGE base, that is 512 BGE tokens. -The move to a multi-embedding-model architecture also paid dividends. By grouping data sources by embedder, Dust enabled smoother migrations and more efficient model experimentation. Qdrant’s flexibility let them evolve their architecture without reindexing massive datasets or disrupting end-user functionality. +Demetrios: +One thing on this, we had Neil's from Cohere on last week and he was talking about Cohere's embed version three, I think, or V three, he was calling it. How does this play with that? Does it is it supported or no? -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#lessons-learned-and-roadmap) Lessons Learned and Roadmap +Nirant Kasliwal: +As of now, we only support models which are open source so that we can serve those models directly. Embed V three is cloud only at the moment, so that is why it is not supported yet. But that said, we are not opposed to it. In case there's a requirement for that, we are happy to support that so that people can use it seamlessly with Qdrant and fast embed does the heavy lifting of passing it to Qdrant, structuring the schema and all of those for you. So that's perfectly fair. As I ask, if we have folks who would love to try coherent embed V three, we'd use that. Also, I think Nils called out that coherent embed V three is compatible with binary quantization. And I think that's the only embedding which officially supports that. -As they scaled, Dust uncovered a critical insight: users tend to ask more structured, analytical questions when they know a database is involved—queries better suited to SQL than vector search. This prompted the team to pair Qdrant with a text-to-SQL system, blending unstructured and structured query capabilities for a more versatile agent. +Nirant Kasliwal: +Okay, we are binary quantization aware and they've been trained for it. Like compression awareness is, I think, what it was called. So Qdrant supports that. So please of that might be worth it because it saves about 30 x in memory costs. So that's quite powerful. -Looking forward, Qdrant remains a foundational pillar of Dust’s product roadmap. They’re building multi-region sharding for more granular data residency, scaling their clusters both vertically and horizontally, and supporting newer embedding models from providers like OpenAI and Mistral. 
Future collections will be organized by embedder, with tenant-aware sharding and index optimizations tailored to each use case. +Demetrios: +Excellent. -### [Anchor](https://qdrant.tech/blog/case-study-dust-v2/\#a-new-tier-of-performance-scalability-and-architectural-flexibility) A new tier of performance, scalability, and architectural flexibility +Nirant Kasliwal: +All right, so behind the scenes, I think this is my favorite part of this. It's also very short. We do literally two things. Why are we fast? We use ONNX runtime as of now, our configurations are such that it runs on CPU and we are still very fast. And that's because of all the multiple processing and ONNX runtime itself at some point in the future. We also want to support GPUs. We had some configuration issues on different Nvidia configurations. As the GPU changes, the OnX runtime does not seamlessly change the GPU. -By adopting Qdrant, Dust unlocked a new tier of performance, scalability, and architectural flexibility. Their platform is now equipped to support millions of vectors, operate efficiently across regions, and deliver low-latency search, even at enterprise scale. For teams building sophisticated AI agents, Qdrant provides not just a vector database—but the infrastructure backbone to grow with confidence. +Nirant Kasliwal: +So that is why we do not allow that as a provider. But you can pass that. It's not prohibited, it's just not a default. We want to make sure your default is always available and will be available in the happy path, always. And we quantize the models for you. So when we quantize, what it means is we do a bunch of tricks supported by a huge shout out to hugging faces optimum. So we do a bunch of optimizations in the quantization, which is we compress some activations, for instance, gelu. We also do some graph optimizations and we don't really do a lot of dropping the bits, which is let's say 32 to 16 or 64 to 32 kind of quantization only where required. -### Get Started with Qdrant Free +Nirant Kasliwal: +Most of these gains come from the graph optimizations themselves. So there are different modes which optimum itself calls out. And if there are folks interested in that, happy to share docs and details around that. Yeah, that's about it. Those are the two things which we do from which we get bulk of these speed gains. And I think this goes back to the question which you opened with. Yes, we do want to support multimodal. We are looking at how we can do an on and export of Clip, which is as robust as Clip. -[Get Started](https://cloud.qdrant.io/signup) +Nirant Kasliwal: +So far we have not found anything. I've spent some time looking at this, the quality of life upgrades. So far, most of our model downloads have been through Google cloud storage hosted by Qdrant. We want to support hugging Face hub so that we can launch new models much, much faster. So we will do that soon. And the next thing is, as I called out, we always want to take performance as a first class citizen. So we are looking at how we can allow you to change or adapt frozen Embeddings, let's say open a Embedding or any other model to your specific domain. So maybe a separate toolkit within Fast Embed which is optional and not a part of the default path, because this is not something which you will use all the time. -![](https://qdrant.tech/img/rocket.svg) +Nirant Kasliwal: +We want to make sure that your training and experience parts are separate. So we will do that. Yeah, that's it. Fast and sweet. -Up! +Demetrios: +Amazing. 
Like FastEmbed.

-<|page-196-lllmstxt|>
-## qdrant-0-11-release
-- [Articles](https://qdrant.tech/articles/)
-- Introducing Qdrant 0.11

+Nirant Kasliwal:
+Yes.

-[Back to Qdrant Articles](https://qdrant.tech/articles/)

+Demetrios:
+There was somebody that talked about how you need to be good at your puns, and that might be the best thing, the best brag-worthy stuff you've got. There's also a question coming through that I want to ask you. Is it true that when we use Qdrant client add, Fast Embed is included? We don't have to do it?

-# Introducing Qdrant 0.11

+Nirant Kasliwal:
+What do you mean by do it? As in, you don't have to specify a FastEmbed model?

-Kacper Ɓukawski

+Demetrios:
+Yeah, I think it's more just like you don't have to add it on to Qdrant in any way, or this is completely separated.

-·

+Nirant Kasliwal:
+So this is client side. You own all your data, and even when you compress it and send it to us, all the embedding creation happens on your own compute. This embedding creation does not happen on Qdrant Cloud, it happens on your own compute. It's consistent with the idea that you should have as much control as possible. This is also why, as of now at least, FastEmbed is not a dedicated server. We do not want you to be running two different docker images for Qdrant and FastEmbed, or, let's say, two different ports for Qdrant and, sorry, Qdrant and FastEmbed in the same docker image or server. So, yeah, that is more chaos than we would like.

-October 26, 2022

+Demetrios:
+Yeah, and I think, if I understood it, I understood that question a little bit differently, where it's just like this comes with Qdrant out of the box.

-![Introducing Qdrant 0.11](https://qdrant.tech/articles_data/qdrant-0-11-release/preview/title.jpg)

+Nirant Kasliwal:
+Yes, I think that's a good way to look at it. We set all the defaults for you, we select good practices for you, and that should work in a vast majority of cases, based on the MTEB benchmark, but we cannot guarantee that it will work for every scenario. Let's say our default model is picked for English, and it's mostly tested on open-domain, open-web data. So, for instance, if you're doing something domain specific, like medical or legal, it might not work that well. So that is where you might want to still make your own embeddings. So that's the edge case here.

-We are excited to [announce the release of Qdrant v0.11](https://github.com/qdrant/qdrant/releases/tag/v0.11.0), -which introduces a number of new features and improvements.

+Demetrios:
+What are some of the other knobs that you might want to be turning when you're looking at using this?

-## [Anchor](https://qdrant.tech/articles/qdrant-0-11-release/\#replication) Replication

+Nirant Kasliwal:
+With Qdrant or without Qdrant?

-One of the key features in this release is replication support, which allows Qdrant to provide a high availability -setup with distributed deployment out of the box. This, combined with sharding, enables you to horizontally scale -both the size of your collections and the throughput of your cluster. This means that you can use Qdrant to handle -large amounts of data without sacrificing performance or reliability.

+Demetrios:
+With Qdrant.

-## [Anchor](https://qdrant.tech/articles/qdrant-0-11-release/\#administration-api) Administration API

+Nirant Kasliwal:
+So one thing, which I mean, one is definitely try the different models which we support. We support a reasonable range of models, including a few multilingual ones.
Second is while we take care of this when you do use with Qdrants. So, for instance, let's say this is how you would have to manually specify, let's say, passage or query. When you do this, let's say add and query. What we do, we add the passage and query keys while creating the Embeddings for you. So this is taken care of. So whatever is your best practices for the Embedding model, make sure you use it when you're using it with Qdrant or just in isolation as well. -Another new feature is the administration API, which allows you to disable write operations to the service. This is -useful in situations where search availability is more critical than updates, and can help prevent issues like memory -usage watermarks from affecting your searches. +Nirant Kasliwal: +So that is one knob. The second is, I think it's very commonly recommended, we would recommend that you start with some evaluation, like have maybe let's even just five sentences to begin with and see if they're actually close to each other. And as a very important shout out in Embedding retrieval, when we use Embedding for retrieval or vector similarity search, it's the relative ordering which matters. So, for instance, we cannot say that zero nine is always good. It could also mean that the best match is, let's say, 0.6 in your domain. So there is no absolute cut off for threshold in terms of match. So sometimes people assume that we should set a minimum threshold so that we get no noise. So I would suggest that you calibrate that for your queries and domain. -## [Anchor](https://qdrant.tech/articles/qdrant-0-11-release/\#exact-search) Exact search +Nirant Kasliwal: +And you don't need a lot of queries. Even if you just, let's say, start with five to ten questions, which you handwrite based on your understanding of the domain, you will do a lot better than just picking a threshold at random. -We have also added the ability to report indexed payload points in the info API, which allows you to verify that -payload values were properly formatted for indexing. In addition, we have introduced a new `exact` search parameter -that allows you to force exact searches of vectors, even if an ANN index is built. This can be useful for validating -the accuracy of your HNSW configuration. +Demetrios: +This is good to know. Okay, thanks for that. So there's a question coming through in the chat from Shreya asking how is the latency in comparison to elasticsearch? -## [Anchor](https://qdrant.tech/articles/qdrant-0-11-release/\#backward-compatibility) Backward compatibility +Nirant Kasliwal: +Elasticsearch? I believe that's a Qdrant benchmark question and I'm not sure how is elastics HNSW index, because I think that will be the fair comparison. I also believe elastics HNSW index puts some limitations on how many vectors they can store with the payload. So it's not an apples to apples comparison. It's almost like comparing, let's say, a single page with the entire book, because that's typically the ratio from what I remember I also might be a few months outdated on this, but I think the intent behind that question is, is Qdrant fast enough for what Qdrant does? It is definitely fast is, which is embedding similarity search. So for that, it's exceptionally fast. It's written in Rust and Twitter for all C. Similar tweets uses this at really large scale. They run a Qdrant instance. 
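+A hedged sketch of the calibration idea mentioned a little further up: embed a handful of handwritten questions together with their known-good passages, look at the scores, and derive a domain-specific threshold from them rather than assuming a fixed cutoff like 0.9. The query_embed()/passage_embed() calls are fastembed's way of applying the query and passage prefixes Nirant refers to; the data is invented.
+
+```python
+import numpy as np
+from fastembed import TextEmbedding
+
+model = TextEmbedding("BAAI/bge-small-en-v1.5")
+
+questions = [
+    "Which quantization modes does Qdrant offer?",
+    "What runtime does FastEmbed use for inference?",
+]
+known_good_passages = [
+    "Qdrant supports scalar, product and binary quantization.",
+    "FastEmbed runs models through ONNX Runtime on CPU.",
+]
+
+query_vecs = list(model.query_embed(questions))
+passage_vecs = list(model.passage_embed(known_good_passages))
+
+# Cosine similarity of each handwritten question against its known-correct passage;
+# the observed range of "good" scores becomes the threshold for this domain.
+for q, p in zip(query_vecs, passage_vecs):
+    score = float(np.dot(q, p) / (np.linalg.norm(q) * np.linalg.norm(p)))
+    print(round(score, 3))
+```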
-This release is backward compatible with v0.10.5 storage in single node deployment, but unfortunately, distributed -deployment is not compatible with previous versions due to the large number of changes required for the replica set -implementation. However, clients are tested for backward compatibility with the v0.10.x service. +Nirant Kasliwal: +So I think if a Twitter scale company, which probably does about anywhere between two and 5 million tweets a day, if they can embed and use Qdrant to serve that similarity search, I think most people should be okay with that latency and throughput requirements. -##### Was this page useful? +Demetrios: +It's also in the name. I mean, you called it Fast Embed for a reason, right? -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Nirant Kasliwal: +Yes. -Thank you for your feedback! 🙏 +Demetrios: +So there's another question that I've got coming through and it's around the model selection and embedding size. And given the variety of models and the embedding sizes available, how do you determine the most suitable models and embedding sizes? You kind of got into this on how yeah, one thing that you can do to turn the knobs are choosing a different model. But how do you go about choosing which model is better? There. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-0-11-release.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Nirant Kasliwal: +There is the academic way of looking at and then there is the engineer way of looking at it, and then there is the hacker way of looking at it. And I will give you all these three answers in that order. So the academic and the gold standard way of doing this would probably look something like this. You will go at a known benchmark, which might be, let's say, something like Kilt K-I-L-T or multilingual text embedding benchmark, also known as MTEB or Beer, which is beir one of these three benchmarks. And you will look at their retrieval section and see which one of those marks very close to whatever is your domain or your problem area, basically. So, for instance, let's say you're working in Pharmacology, the ODS that a customer support retrieval task is relevant to. You are near zero unless you are specifically in, I don't know, a Pharmacology subscription app. So that is where you would start. -On this page: +Nirant Kasliwal: +This will typically take anywhere between two to 20 hours, depending on how familiar you are with these data sets already. But it's not going to take you, let's say, a month to do this. So just to put a rough order of magnitude, once you have that, you try to take whatever is the best model on that subdomain data set and you see how does it work within your domain and you launch from there. At that point, you switch into the engineer's mindset. The engineer's mindset now tells you that the best way to build something is to make an informed guess about what workload or challenges you're going to foresee. Right. Like a civil engineer builds a bridge around how many cars they expect, they're obviously not going to build a bridge to carry a ship load, for instance, or a plane load, which are very different. 
So you start with that and you say, okay, this is the number of requests which I expect, this is what my budget is, and your budget will quite often be, let's say, in terms of latency budgets, compute and memory budgets. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/qdrant-0-11-release.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Nirant Kasliwal: +So for instance, one of the reasons I mentioned binary quantization and product quantization is with something like binary quantization you can get 98% recall, but with 30 to 40 x memory savings because it discards all the extraneous bits and just keeps the zero or one bit of the embedding itself. And Qdrant has already measured it for you. So we know that it works for OpenAI and Cohere embeddings for sure. So you might want to use that to just massively scale while keeping your budgets as an engineer. Now, in order to do this, you need to have some sense of three numbers, right? What are your latency requirements, your cost requirements, and your performance requirement. Now, for the performance, which is where engineers are most unfamiliar with, I will give the hacker answer, which is this. -× +Demetrios: +Is what I was waiting for. Man, so excited for this one, exactly this. Please tell us the hacker answer. -[Powered by](https://qdrant.tech/) +Nirant Kasliwal: +The hacker answer is this there are two tricks which I will share. One is write ten questions, figure out the best answer, and see which model gets as many of those ten, right? The second is most embedding models which are larger or equivalent to 768 embeddings, can be optimized and improved by adding a small linear head over it. So for instance, I can take the Open AI embedding, which is 1536 embedding, take my text, pass it through that, and for my own domain, adapt the Open A embedding by adding two or three layers of linear functions, basically, right? Y is equals to MX plus C or Ax plus B y is equals to C, something like that. So it's very simple, you can do it on NumPy, you don't need Torch for it because it's very small. The header or adapter size will typically be in this range of few KBS to be maybe a megabyte, maybe. I think the largest I have used in production is about 400 500 KBS. That's about it. And that will improve your recall several, several times. -<|page-197-lllmstxt|> -## langchain-integration -- [Articles](https://qdrant.tech/articles/) -- Using LangChain for Question Answering with Qdrant +Nirant Kasliwal: +So that's one, that's two tricks. And a third bonus hacker trick is if you're using an LLM, sometimes what you can do is take a question and rewrite it with a prompt and make embeddings from both, and pull candidates from both. And then with Qdrant Async, you can fire both these queries async so that you're not blocked, and then use the answer of both the original question which the user gave and the one which you rewrote using the LLM and see select the results which are there in both, or figure some other combination method. Also, so most Kagglers would be familiar with the idea of ensembling. This is the way to do query inference time ensembling, that's awesome. -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) +Demetrios: +Okay, dude, I'm not going to lie, that was a lot more than I was expecting for that answer. -# Using LangChain for Question Answering with Qdrant +Nirant Kasliwal: +Got into the weeds of retrieval there. Sorry. 
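+A bare-bones NumPy sketch of the linear-adapter trick described above. The weights here are random placeholders standing in for a head fit offline on your own (question, passage) pairs; the point is only to show the shapes and the on-disk size involved, not a training recipe.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+d_in, d_out = 1536, 64   # e.g. an OpenAI-sized embedding projected down
+
+# Learned offline on domain data; 1536 x 64 float32 weights are roughly 400 KB,
+# in line with the adapter sizes mentioned above.
+W = rng.normal(scale=0.02, size=(d_in, d_out))
+b = np.zeros(d_out)
+
+
+def adapt(frozen_embedding: np.ndarray) -> np.ndarray:
+    """Apply the linear head (y = xW + b) and re-normalize for cosine search."""
+    y = frozen_embedding @ W + b
+    return y / np.linalg.norm(y)
+
+
+adapted = adapt(rng.normal(size=d_in))  # stand-in for a real frozen embedding
+print(adapted.shape)                    # (64,)
+```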
-Kacper Ɓukawski +Demetrios: +I like it though. I appreciate it. So what about when it comes to the know, we had Andre V, the CTO of Qdrant on here a few weeks ago. He was talking about binary quantization. But then when it comes to quantizing embedding models, in the docs you mentioned like quantized embedding models for fast CPU generation. Can you explain a little bit more about what quantized embedding models are and how they enhance the CPU performance? -· +Nirant Kasliwal: +So it's a shorthand to say that they optimize CPU performance. I think the more correct way to look at it is that we use the CPU better. But let's talk about optimization or quantization, which we do here, right? So most of what we do is from optimum and the way optimum call set up is they call these levels. So you can basically go from let's say level zero, which is there are no optimizations to let's say 99 where there's a bunch of extra optimizations happening. And these are different flags which you can switch. And here are some examples which I remember. So for instance, there is a norm layer which you can fuse with the previous operation. Then there are different attention layers which you can fuse with the previous one because you're not going to update them anymore, right? So what we do in training is we update them. -January 31, 2023 +Nirant Kasliwal: +You know that you're not going to update them because you're using them for inference. So let's say when somebody asks a question, you want that to be converted into an embedding as fast as possible and as cheaply as possible. So you can discard all these extra information which you are most likely to not going to use. So there's a bunch of those things and obviously you can use mixed precision, which most people have heard of with projects, let's say like lounge CPP that you can use FP 16 mixed precision or a bunch of these things. Let's say if you are doing GPU only. So some of these things like FP 16 work better on GPU. The CPU part of that claim comes from how ONNX the runtime which we use allows you to optimize whatever CPU instruction set you are using. So as an example with intel you can say, okay, I'm going to use the Vino instruction set or the optimization. -![Using LangChain for Question Answering with Qdrant](https://qdrant.tech/articles_data/langchain-integration/preview/title.jpg) +Nirant Kasliwal: +So when we do quantize it, we do quantization right now with CPUs in mind. So what we would want to do at some point in the future is give you a GPU friendly quantized model and we can do a device check and say, okay, we can see that a GPU is available and download the GPU friendly model first for you. Awesome. Does that answer the. Question. -# [Anchor](https://qdrant.tech/articles/langchain-integration/\#streamlining-question-answering-simplifying-integration-with-langchain-and-qdrant) Streamlining Question Answering: Simplifying Integration with LangChain and Qdrant +Demetrios: +I mean, for me, yeah, but we'll see what the chat says. -Building applications with Large Language Models doesn’t have to be complicated. A lot has been going on recently to simplify the development, -so you can utilize already pre-trained models and support even complex pipelines with a few lines of code. [LangChain](https://langchain.readthedocs.io/) -provides unified interfaces to different libraries, so you can avoid writing boilerplate code and focus on the value you want to bring. +Nirant Kasliwal: +Yes, let's do that. 
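+For context, here is a hedged sketch of the kind of Optimum/ONNX Runtime steps being described, not necessarily the exact recipe FastEmbed ships: export a model to ONNX, apply graph optimizations (the "level 99" style configuration), then dynamically quantize it for a specific CPU instruction set. The model id and output paths are placeholders.
+
+```python
+from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer, ORTQuantizer
+from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
+
+model_id = "BAAI/bge-small-en-v1.5"  # placeholder model
+
+# Export the model to ONNX
+onnx_model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
+
+# Graph optimizations: fused layer norms, fused attention, GELU approximation, etc.
+optimizer = ORTOptimizer.from_pretrained(onnx_model)
+optimizer.optimize(
+    save_dir="bge-small-optimized",
+    optimization_config=OptimizationConfig(optimization_level=99),
+)
+
+# Dynamic int8 quantization targeting a specific CPU instruction set
+quantizer = ORTQuantizer.from_pretrained("bge-small-optimized", file_name="model_optimized.onnx")
+quantizer.quantize(
+    save_dir="bge-small-quantized",
+    quantization_config=AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False),
+)
+```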
-## [Anchor](https://qdrant.tech/articles/langchain-integration/\#why-use-qdrant-for-question-answering-with-langchain) Why Use Qdrant for Question Answering with LangChain? +Demetrios: +What everybody says there. Dude, this has been great. I really appreciate you coming and walking through everything we need to know, not only about fast embed, but I think about embeddings in general. All right, I will see you later. Thank you so much, Naran. Thank you, everyone, for coming out. If you want to present, please let us know. Hit us up, because we would love to have you at our vector space talks. -It has been reported millions of times recently, but let’s say that again. ChatGPT-like models struggle with generating factual statements if no context -is provided. They have some general knowledge but cannot guarantee to produce a valid answer consistently. Thus, it is better to provide some facts we -know are actual, so it can just choose the valid parts and extract them from all the provided contextual data to give a comprehensive answer. [Vector database,\\ -such as Qdrant](https://qdrant.tech/), is of great help here, as their ability to perform a [semantic search](https://qdrant.tech/documentation/tutorials/search-beginners/) over a huge knowledge base is crucial to preselect some possibly valid -documents, so they can be provided into the LLM. That’s also one of the **chains** implemented in [LangChain](https://qdrant.tech/documentation/frameworks/langchain/), which is called `VectorDBQA`. And Qdrant got -integrated with the library, so it might be used to build it effortlessly. +<|page-393-lllmstxt|> +> *"Was it possible to somehow maybe find a way to transfer this feeling that we have this vibe and get the help of AI to understand what exactly we need at that moment in terms of songs?”*\ +> -- Filip Makraduli +> -### [Anchor](https://qdrant.tech/articles/langchain-integration/\#the-two-model-approach) The Two-Model Approach +Imagine if the recommendation system could understand spoken instructions or hummed melodies. This would greatly impact the user experience and accuracy of the recommendations. -Surprisingly enough, there will be two models required to set things up. First of all, we need an embedding model that will convert the set of facts into -vectors, and store those into Qdrant. That’s an identical process to any other semantic search application. We’re going to use one of the -`SentenceTransformers` models, so it can be hosted locally. The embeddings created by that model will be put into Qdrant and used to retrieve the most -similar documents, given the query. +Filip Makraduli, an electrical engineering graduate from Skopje, Macedonia, expanded his academic horizons with a Master's in Biomedical Data Science from Imperial College London. -However, when we receive a query, there are two steps involved. First of all, we ask Qdrant to provide the most relevant documents and simply combine all -of them into a single text. Then, we build a prompt to the LLM (in our case [OpenAI](https://openai.com/)), including those documents as a context, of course together with the -question asked. So the input to the LLM looks like the following: +Currently a part of the Digital and Technology team at Marks and Spencer (M&S), he delves into retail data science, contributing to various ML and AI projects. His expertise spans causal ML, XGBoost models, NLP, and generative AI, with a current focus on improving outfit recommendation systems. 
-```text -Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. -It's as certain as 2 + 2 = 4 -... +Filip is not only professionally engaged but also passionate about tech startups, entrepreneurship, and ML research, evident in his interest in Qdrant, a startup he admires. -Question: How much is 2 + 2? -Helpful Answer: +***Listen to the episode on [Spotify](https://open.spotify.com/episode/6a517GfyUQLuXwFRxvwtp5?si=ywXPY_1RRU-qsMt9qrRS6w), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/WIBtZa7mcCs).*** -``` + -There might be several context documents combined, and it is solely up to LLM to choose the right piece of content. But our expectation is, the model should -respond with just `4`. + -## [Anchor](https://qdrant.tech/articles/langchain-integration/\#why-do-we-need-two-different-models) Why do we need two different models? +## **Top Takeaways:** -Both solve some different tasks. The first model performs feature extraction, by converting the text into vectors, while -the second one helps in text generation or summarization. Disclaimer: This is not the only way to solve that task with LangChain. Such a chain is called `stuff` -in the library nomenclature. +Take a look at the song vibe recommender system created by Filip Makraduli. Find out how it works! -![](https://qdrant.tech/articles_data/langchain-integration/flow-diagram.png) +Filip discusses how AI can assist in finding the perfect songs for any mood. He takes us through his unique approach, using human language and AI models to capture the essence of a song and generate personalized recommendations. -Enough theory! This sounds like a pretty complex application, as it involves several systems. But with LangChain, it might be implemented in just a few lines -of code, thanks to the recent integration with [Qdrant](https://qdrant.tech/). We’re not even going to work directly with `QdrantClient`, as everything is already done in the background -by LangChain. If you want to get into the source code right away, all the processing is available as a -[Google Colab notebook](https://colab.research.google.com/drive/19RxxkZdnq_YqBH5kBV10Rt0Rax-kminD?usp=sharing). +Here are 5 key things you'll learn from this video: -## [Anchor](https://qdrant.tech/articles/langchain-integration/\#how-to-implement-question-answering-with-langchain-and-qdrant) How to Implement Question Answering with LangChain and Qdrant +1. How AI can help us understand and capture the vibe and feeling of a song +2. The use of language to transfer the experience and feeling of a song +3. The role of data sets and descriptions in building unconventional song recommendation systems +4. The importance of encoding text and using sentence transformers to generate song embeddings +5. How vector spaces and cosine similarity search are used to generate song recommendations -### [Anchor](https://qdrant.tech/articles/langchain-integration/\#step-1-configuration) Step 1: Configuration +> Fun Fact: Filip actually created a Spotify playlist in real-time during the video, based on the vibe and mood Demetrios described, showing just how powerful and interactive this AI music recommendation system can be! +> -A journey of a thousand miles begins with a single step, in our case with the configuration of all the services. We’ll be using [Qdrant Cloud](https://cloud.qdrant.io/), -so we need an API key. 
The same is for OpenAI - the API key has to be obtained from their website. +## Show Notes: -![](https://qdrant.tech/articles_data/langchain-integration/code-configuration.png) +01:25 Using AI to capture desired music vibes.\ +06:17 Faster and accurate model.\ +10:07 Sentence embedding model maps song descriptions.\ +14:32 Improving recommendations, user personalization in music.\ +15:49 Qdrant Python client creates user recommendations.\ +21:26 Questions about getting better embeddings for songs.\ +25:04 Contextual information for personalized walking recommendations.\ +26:00 Need predictions, voice input, and music options. -### [Anchor](https://qdrant.tech/articles/langchain-integration/\#step-2-building-the-knowledge-base) Step 2: Building the knowledge base +## More Quotes from Filip: -We also need some facts from which the answers will be generated. There is plenty of public datasets available, and -[Natural Questions](https://ai.google.com/research/NaturalQuestions/visualization) is one of them. It consists of the whole HTML content of the websites they were -scraped from. That means we need some preprocessing to extract plain text content. As a result, we’re going to have two lists of strings - one for questions and -the other one for the answers. +*"When you log in with Spotify, you could get recommendations related to your taste on Spotify or on whatever app you listen your music on.”*\ +-- Filip Makraduli -The answers have to be vectorized with the first of our models. The `sentence-transformers/all-mpnet-base-v2` is one of the possibilities, but there are some -other options available. LangChain will handle that part of the process in a single function call. +*"Once the user writes a query and the query mentions, like some kind of a mood, for example, I feel happy and it's a sunny day and so on, you would get the similarity to the song that has this kind of language explanations and language intricacies in its description.”*\ +-- Filip Makraduli -![](https://qdrant.tech/articles_data/langchain-integration/code-qdrant.png) +*"I've explored Qdrant and as I said with Spotify web API there are a lot of things to be done with these specific user-created recommendations.”*\ +-- Filip Makraduli -### [Anchor](https://qdrant.tech/articles/langchain-integration/\#step-3-setting-up-qa-with-qdrant-in-a-loop) Step 3: Setting up QA with Qdrant in a loop +## Transcript: +Demetrios: +So for those who do not know, you are going to be talking to us about when the music we listen to does not match our vibe. And can we get AI to help us on that? And you're currently working as a data scientist at Marks and Spencer. I know you got some slides to share, right? So I'll let you share your screen. We can kick off the slides and then we'll have a little presentation and I'll be back on to answer some questions. And if Neil's is still around at the end, which I don't think he will be able to hang around, but we'll see, we can pull him back on and have a little discussion at the end of the. -`VectorDBQA` is a chain that performs the process described above. So it, first of all, loads some facts from Qdrant and then feeds them into OpenAI LLM which -should analyze them to find the answer to a given question. The only last thing to do before using it is to put things together, also with a single function call. +Filip Makraduli: +That's. That's great. All right, cool. I'll share my screen. -![](https://qdrant.tech/articles_data/langchain-integration/code-vectordbqa.png) +Demetrios: +Right on. 
-## [Anchor](https://qdrant.tech/articles/langchain-integration/\#step-4-testing-out-the-chain) Step 4: Testing out the chain +Filip Makraduli: +Yeah. -And that’s it! We can put some queries, and LangChain will perform all the required processing to find the answer in the provided context. +Demetrios: +There we go. -![](https://qdrant.tech/articles_data/langchain-integration/code-answering.png) +Filip Makraduli: +Yeah. So I had to use this slide because it was really well done as an introductory slide. Thank you. Yeah. Thank you also for making it so. Yeah, the idea was, and kind of the inspiration with music, we all listen to it. It's part of our lives in many ways. Sometimes it's like the gym. -```text -> what kind of music is scott joplin most famous for - Scott Joplin is most famous for composing ragtime music. +Filip Makraduli: +We're ready to go, we're all hyped up, ready to do a workout, and then we click play. But the music and the playlist we get, it's just not what exactly we're looking for at that point. Or if we try to work for a few hours and try to get concentrated and try to code for hours, we can do the same and then we click play, but it's not what we're looking for again. So my inspiration was here. Was it possible to somehow maybe find a way to transfer this feeling that we have this vibe and get the help of AI to understand what exactly we need at that moment in terms of songs. So the obvious first question is how do we even capture a vibe and feel of a song? So initially, one approach that's popular and that works quite well is basically using a data set that has a lot of features. So Spotify has one data set like this and there are many others open source ones which include different features like loudness, key tempo, different kind of details related to the acoustics, the melody and so on. And this would work. -> who died from the band faith no more - Chuck Mosley +Filip Makraduli: +And this is kind of a way that a lot of song recommendation systems are built. However, what I wanted to do was maybe try a different approach in a way. Try to have a more unconventional recommender system, let's say. So what I did here was I tried to concentrate just on language. So my idea was, okay, is it possible to use human language to transfer this experience, this feeling that we have, and just use that and try to maybe encapsulate these features of songs. And instead of having a data set, just have descriptions of songs or sentences that explain different aspects of a song. So, as I said, this is a bit of a less traditional approach, and it's more of kind of testing the waters, but it worked to a decent extent. So what I did was, first I created a data set where I queried a large language model. -> when does maggie come on grey's anatomy - Maggie first appears in season 10, episode 1, which aired on September 26, 2013. +Filip Makraduli: +So I tried with llama and chat GPT, both. And the idea was to ask targeted questions, for example, like, what movie character does this song make you feel like? Or what's the tempo like? So, different questions that would help us understand maybe in what situation we would listen to this song, how will it make us feel like? And so on. And the idea was, as I said, again, to only use song names as queries for this large language model. So not have the full data sets with multiple features, but just song name, and kind of use this pretrained ability of all these LLMs to get this info that I was looking for. 
So an example of the generated data was this. So this song called Deep Sea Creature. And we have, like, a small description of the song. So it says a heavy, dark, mysterious vibe.
-> can't take my eyes off you lyrics meaning - I don't know.
+Filip Makraduli:
+It will make you feel like you're descending into the unknown and so on. So a bit of a darker choice here, but that's the general idea. So trying to maybe do a bit of prompt engineering in a way to get the right features of a song, but through human language. So that was the first step. So the next step was how to encode this text. So all of this kind of querying reminds me of sentences. And this led me to sentence transformers and Sentence-BERT. And the usual issue with kind of doing this sentence similarity in the past was this, what I have highlighted here.
-> who lasted the longest on alone season 2 - David McIntyre lasted the longest on Alone season 2, with a total of 66 days.
+Filip Makraduli:
+So this is actually a quote from a paper that Nils published a few years ago. So, basically, the way that this similarity was done was using cross encoders in the past, and that worked well, but it was really slow and unscalable. So Nils and his colleague created this kind of model, which helped scale this and make this a lot quicker, but also keep a lot of the accuracy. So BERT and RoBERTa were used, but they were not, as I said, quite scalable or useful for larger applications. So that's how Sentence-BERT was created. So the idea here was that there would be, like, a Siamese network that would train the model so that there could be, like, two BERT models, and then the training would be done using this like zero, one and two tags, where kind of the sentences would be compared, whether there is entailment, neutrality or contradiction. So how similar these sentences are to each other. And by training a model like this and doing mean pooling, in the end, the model performed quite well and was able to kind of encapsulate these language intricacies of sentences.
-```
+Filip Makraduli:
+So I decided to use and try out sentence transformers for my use case, and that was the encoding bit. So we have the model, we encode the text, and we have the embedding. So now the question is, how do we actually generate the recommendations? How is the similarity performed? So the similarity was done using vector spaces and cosine similarity search here. There were multiple ways of doing this. First, I tried things with a flat index and I tried Qdrant and I tried FAISS. So I've worked with both. And with the flat index, it was good. It works well.
-The great thing about such a setup is that the knowledge base might be easily extended with some new facts and those will be included in the prompts
-sent to LLM later on. Of course, assuming their similarity to the given question will be in the top results returned by Qdrant.
+Filip Makraduli:
+It's quick for small number of examples, small number of songs, but there is an issue when scaling. So once the vector indices get bigger, there might be a problem. So one popular kind of index architecture is this one here on the left. So hierarchical, navigable, small world graphs. So the idea here is that you wouldn't have to kind of go through all of the examples, but search through the examples in different layers, so that the search for similarities is quicker. And this is a really popular approach.
And Qdrant have done a really good customizable version of this, which is quite useful, I think, for very large scales of application. And this graph here illustrates kind of well what the idea is.
-If you want to run the chain on your own, the simplest way to reproduce it is to open the
-[Google Colab notebook](https://colab.research.google.com/drive/19RxxkZdnq_YqBH5kBV10Rt0Rax-kminD?usp=sharing).
+Filip Makraduli:
+So there is the sentence in this example. It's like a striped blue shirt made from cotton, and then there is the network or the encoder. So in my case, this sentence is the song description, the neural network is the sentence transformer in my case. And then these embeddings are generated, which are then mapped into this vector space, and then this vector space is queried and the cosine similarity is found, and the recommendations are generated in this way, so that once the user writes a query and the query mentions, like some kind of a mood, for example, I feel happy and it's a sunny day and so on, you would get the similarity to the song that has this kind of language explanations and language intricacies in its description. And there are a lot of ways of doing this, as Nils mentioned, especially with different embedding models and doing context related search. So this is an interesting area for improvement, even in my use case. And the quick screenshot looks like this. So for example, the mood that the user wrote, it's a bit rainy, but I feel like I need a long walk in London.
-##### Was this page useful?
+Filip Makraduli:
+And these are the top five suggested songs. This is also available on Streamlit. In the end I'll share links of everything and also after that you can click create a Spotify playlist and this playlist will be saved in your Spotify account. As you can see here, it says playlist generated earlier today. So yeah, I tried this, it worked. I will try the live demo a bit later. Hopefully it works again. But this is in beta currently so you won't be able to try it at home because Spotify needs to approve my app first and go through that process so that then I can do this part fully.
-![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg)
-Yes
-![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg)
-No
+Filip Makraduli:
+And the front end bit, as I mentioned, was done in Streamlit. So why Streamlit? I like the caching bit. So of course this general part where it's really easy and quick to do a lot of data dashboarding and data applications to test out models, that's quite nice. But these caching options that they have help a lot with, like, loading models from Hugging Face, or if you're loading models from somewhere, or if you're loading different databases. So if you're combining models and data. In my case I had a binary file of the index and also the model. So it was quite useful and quick to do these things and to be able to try things out quickly. So this is kind of the step by step outline of everything I've mentioned and the whole project.
-Thank you for your feedback! 🙏
+Filip Makraduli:
+So the first step is encoding these descriptions into embeddings. Then these vector embeddings are mapped into a vector space. Examples here with how I've used Qdrant for this, which was quite nice. I feel like the developer experience is really good for scalable purposes. It's really useful. So if the number of songs keeps increasing it's quite good. And then you query and get the most similar embeddings back.
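In code, that outline looks roughly like the following; the model name, example descriptions, and query are assumptions, and the flat (brute-force) variant mentioned earlier is shown rather than an HNSW index:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Song descriptions generated earlier (illustrative examples)
descriptions = [
    "A heavy, dark, mysterious vibe, like descending into the unknown.",
    "An upbeat, sunny track for a long walk through the city.",
]

model = SentenceTransformer("all-MiniLM-L6-v2")  # any sentence-transformers model

# Encode once; normalized vectors let a dot product act as cosine similarity
song_vectors = model.encode(descriptions, normalize_embeddings=True)

query = "It's a bit rainy, but I feel like I need a long walk in London"
query_vector = model.encode([query], normalize_embeddings=True)[0]

# Flat search: score every song against the query and keep the top five
scores = song_vectors @ query_vector
top_k = np.argsort(scores)[::-1][:5]
print([descriptions[i] for i in top_k])
```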
The front is done with Streamlit and the Spotify API to save the playlists on the Spotify account. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/langchain-integration.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +Filip Makraduli: +All of these steps can be improved and tweaked in certain ways and I will talk a bit about that too. So a lot more to be done. So now there are 2000 songs, but as I've mentioned, in this vector space, the more songs that are there, the more representative this recommendations would be. So this is something I'm currently exploring and doing, generating, filtering and user specific personalization. So once maybe you log in with Spotify, you could get recommendations related to your taste on Spotify or on whatever app you listen your music on. And referring to the talk that Niels had a lot of potential for better models and embeddings and embedding models. So also the contrastive learning bits or the contents aware querying, that could be useful too. And a vector database because currently I'm using a binary file. -On this page: +Filip Makraduli: +But I've explored Qdrant and as I said with Spotify web API there are a lot of things to be done with this specific user created recommendations. So with Qdrant, the Python client is quite good. The getting started helps a lot. So I wrote a bit of code. I think for production use cases it's really great. So for my use case here, as you can see on the right, I just read the text from a column and then I encode with the model. So the sentence transformer is the model that I encode with. And there is this collections that they're so called in Qdrant that are kind of like this vector spaces that you can create and you can also do different things with them, which I think one of the more helpful ones is the payload one and the batch one. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/articles/langchain-integration.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Filip Makraduli: +So you can batch things in terms of how many vectors will go to the server per single request. And also the payload helps if you want to add extra context. So maybe I want to filter by genres. I can add useful information to the vector embedding. So this is quite a cool feature that I'm planning on using. And another potential way of doing this and kind of combining things is using audio waves too, lyrics and descriptions and combining all of this as embeddings and then going through the similar process. So that's something that I'm looking to do also. And yeah, you also might have noticed that I'm a data scientist at Marks and Spencer and I just wanted to say that there are a lot of interesting ML and data related stuff going on there. -× +Filip Makraduli: +So a lot of teams that work on very interesting use cases, like in recommender systems, personalization of offers different stuff about forecasting. There is a lot going on with causal ML and yeah, the digital and tech department is quite well developed and I think it's a fun place to explore if you're interested in retail data science use cases. So yeah, thank you for your attention. I'll try the demo. So this is the QR code with the repo and all the useful links. You can contact me on LinkedIn. 
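A minimal sketch of the Qdrant flow described above, assuming a CSV with title, genre, and description columns and an illustrative collection name:

```python
import csv
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
client = QdrantClient(url="http://localhost:6333")

# Assumed CSV layout: "title", "genre", "description"
with open("songs.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

client.create_collection(
    collection_name="songs",
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

# Payload fields allow filtering (e.g. by genre) later;
# batch_size controls how many vectors go to the server per request
client.upload_collection(
    collection_name="songs",
    vectors=model.encode([row["description"] for row in rows]),
    payload=[{"title": row["title"], "genre": row["genre"]} for row in rows],
    batch_size=64,
)

hits = client.search(
    collection_name="songs",
    query_vector=model.encode("I feel happy and it's a sunny day").tolist(),
    limit=5,
)
```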
This is the screenshot of the repo and you have the link in the QR code. The name of the repo is song Vibe. -[Powered by](https://qdrant.tech/) +Filip Makraduli: +A friend of mine said that that wasn't a great name of a repo. Maybe he was right. But yeah, here we are. I'll just try to do the demo quickly and then we can step back to the. -<|page-198-lllmstxt|> -## rag-customer-support-cohere-airbyte-aws -- [Documentation](https://qdrant.tech/documentation/) -- [Examples](https://qdrant.tech/documentation/examples/) -- Question-Answering System for AI Customer Support +Demetrios: +I love dude, I got to say, when you said you can just automatically create the Spotify playlist, that made me. -# [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#question-answering-system-for-ai-customer-support) Question-Answering System for AI Customer Support +Filip Makraduli: +Go like, oh, yes, let's see if it works locally. Do you have any suggestion what mood are you in? -| Time: 120 min | Level: Advanced | | | -| --- | --- | --- | --- | +Demetrios: +I was hoping you would ask me, man. I am in a bit of an esoteric mood and I want female kind of like Gaelic voices, but not Gaelic music, just Gaelic voices and lots of harmonies, heavy harmonies. -Maintaining top-notch customer service is vital to business success. As your operation expands, so does the influx of customer queries. Many of these queries are repetitive, making automation a time-saving solution. -Your support team’s expertise is typically kept private, but you can still use AI to automate responses securely. +Filip Makraduli: +Also. -In this tutorial we will setup a private AI service that answers customer support queries with high accuracy and effectiveness. By leveraging Cohere’s powerful models (deployed to [AWS](https://cohere.com/deployment-options/aws)) with Qdrant Hybrid Cloud, you can create a fully private customer support system. Data synchronization, facilitated by [Airbyte](https://airbyte.com/), will complete the setup. +Demetrios: +You didn't realize you're asking a musician. Let's see what we got. -![Architecture diagram](https://qdrant.tech/documentation/examples/customer-support-cohere-airbyte/architecture-diagram.png) +Filip Makraduli: +Let's see if this works in 2000 songs. Okay, so these are the results. Okay, yeah, you'd have to playlist. Let's see. -## [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#system-design) System design +Demetrios: +Yeah, can you make the playlist public and then I'll just go find it right now. Here we go. -The history of past interactions with your customers is not a static dataset. It is constantly evolving, as new -questions are coming in. You probably have a ticketing system that stores all the interactions, or use a different way -to communicate with your customers. No matter what is the communication channel, you need to bring the correct answers -to the selected Large Language Model, and have an established way to do it in a continuous manner. Thus, we will build -an ingestion pipeline and then a Retrieval Augmented Generation application that will use the data. +Filip Makraduli: +Let's see. Okay, yeah, open in. Spotify playlist created now. Okay, cool. I can also rename it. What do you want to name the playlist? 
-- **Dataset:** a [set of Frequently Asked Questions from Qdrant\\ -users](https://qdrant.tech/documentation/faq/qdrant-fundamentals/) as an incrementally updated Excel sheet -- **Embedding model:** Cohere `embed-multilingual-v3.0`, to support different languages with the same pipeline -- **Knowledge base:** Qdrant, running in Hybrid Cloud mode -- **Ingestion pipeline:** [Airbyte](https://airbyte.com/), loading the data into Qdrant -- **Large Language Model:** Cohere [Command-R](https://docs.cohere.com/docs/command-r) -- **RAG:** Cohere [RAG](https://docs.cohere.com/docs/retrieval-augmented-generation-rag) using our knowledge base -through a custom connector +Demetrios: +Esoteric Gaelic Harmonies. That's what I think we got to go with AI. Well, I mean, maybe we could just put maybe in parenthes. -All the selected components are compatible with the [AWS](https://aws.amazon.com/) infrastructure. Thanks to Cohere models’ availability, you can build a fully private customer support system completely isolates data within your infrastructure. Also, if you have AWS credits, you can now use them without spending additional money on the models or -semantic search layer. +Filip Makraduli: +Yeah. So I'll share this later with you. Excellent. But yeah, basically that was it. -### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#data-ingestion) Data ingestion +Demetrios: +It worked. Ten out of ten for it. Working. That is also very cool. -Building a RAG starts with a well-curated dataset. In your specific case you may prefer loading the data directly from -a ticketing system, such as [Zendesk Support](https://airbyte.com/connectors/zendesk-support), -[Freshdesk](https://airbyte.com/connectors/freshdesk), or maybe integrate it with a shared inbox. However, in case of -customer questions quality over quantity is the key. There should be a conscious decision on what data to include in the -knowledge base, so we do not confuse the model with possibly irrelevant information. We’ll assume there is an [Excel\\ -sheet](https://docs.airbyte.com/integrations/sources/file) available over HTTP/FTP that Airbyte can access and load into -Qdrant in an incremental manner. +Filip Makraduli: +Live demo working. That's good. So now doing the infinite screen, which I have stopped now. -### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#cohere--qdrant-connector-for-rag) Cohere <> Qdrant Connector for RAG +Demetrios: +Yeah, classic, dude. Well, I've got some questions coming through and the chat has been active too. So I'll ask a few of the questions in the chat for a minute. But before I ask those questions in the chat, one thing that I was thinking about when you were talking about how to, like, the next step is getting better embeddings. And so was there a reason that you just went with the song title and then did you check, you said there was 2000 songs or how many songs? So did you do anything to check the output of the descriptions of these songs? -Cohere RAG relies on [connectors](https://docs.cohere.com/docs/connectors) which brings additional context to the model. -The connector is a web service that implements a specific interface, and exposes its data through HTTP API. With that -setup, the Large Language Model becomes responsible for communicating with the connectors, so building a prompt with the -context is not needed anymore. 
+Filip Makraduli: +Yeah, so I didn't do like a systematic testing in terms of like, oh, yeah, the output is structured in this way. But yeah, I checked it roughly went through a few songs and they seemed like, I mean, of course you could add more info, but they seemed okay. So I was like, okay, let me try kind of whether this works. And, yeah, the descriptions were nice. -### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#answering-bot) Answering bot +Demetrios: +Awesome. Yeah. So that kind of goes into one of the questions that mornie's asking. Let me see. Are you going to team this up with other methods, like collaborative filtering, content embeddings and stuff like that. -Finally, we want to automate the responses and send them automatically when we are sure that the model is confident -enough. Again, the way such an application should be created strongly depends on the system you are using within the -customer support team. If it exposes a way to set up a webhook whenever a new question is coming in, you can create a -web service and use it to automate the responses. In general, our bot should be created specifically for the platform -you use, so we’ll just cover the general idea here and build a simple CLI tool. +Filip Makraduli: +Yeah, I was thinking about this different kind of styles, but I feel like I want to first try different things related to embeddings and language just because I feel like with the other things, with the other ways of doing these recommendations, other companies and other solutions have done a really great job there. So I wanted to try something different to see whether that could work as well or maybe to a similar degree. So that's why I went towards this approach rather than collaborative filtering. -## [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#prerequisites) Prerequisites +Demetrios: +Yeah, it kind of felt like you wanted to test the boundaries and see if something like this, which seems a little far fetched, is actually possible. And it seems like I would give it a yes. -### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#cohere-models-on-aws) Cohere models on AWS +Filip Makraduli: +It wasn't that far fetched, actually, once you see it working. -One of the possible ways to deploy Cohere models on AWS is to use AWS SageMaker. Cohere’s website has [a detailed\\ -guide on how to deploy the models in that way](https://docs.cohere.com/docs/amazon-sagemaker-setup-guide), so you can -follow the steps described there to set up your own instance. +Demetrios: +Yeah, totally. Another question is coming through is asking, is it possible to merge the current mood so the vibe that you're looking for with your musical preferences? -### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#qdrant-hybrid-cloud-on-aws) Qdrant Hybrid Cloud on AWS +Filip Makraduli: +Yeah. So I was thinking of that when we're doing this, the playlist creation that I did for you, there is a way to get your top ten songs or your other playlists and so on from Spotify. So my idea of kind of capturing this added element was through Spotify like that. But of course it could be that you could enter that in your own profile in the app or so on. So one idea would be how would you capture that preferences of the user once you have the user there. So you'd need some data of the preferences of the user. 
So that's the problem. But of course it is possible. -Our documentation covers the deployment of Qdrant on AWS as a Hybrid Cloud Environment, so you can follow the steps described -there to set up your own instance. The deployment process is quite straightforward, and you can have your Qdrant cluster -up and running in a few minutes. +Demetrios: +You know what I'd lOve? Like in your example, you put that, I feel like going for a walk or it's raining, but I still feel like going through for a long walk in London. Right. You could probably just get that information from me, like what is the weather around me, where am I located? All that kind of stuff. So I don't have to give you that context. You just add those kind of contextual things, especially weather. And I get the feeling that that would be another unlock too. Unless you're like, you are the exact opposite of a sunny day on a sunny day. And it's like, why does it keep playing this happy music? I told you I was sad. -Once you perform all the steps, your Qdrant cluster should be running on a specific URL. You will need this URL and the -API key to interact with Qdrant, so let’s store them both in the environment variables: +Filip Makraduli: +Yeah. You're predicting not just the songs, but the mood also. -shellpython +Demetrios: +Yeah, totally. -```shell -export QDRANT_URL="https://qdrant.example.com" -export QDRANT_API_KEY="your-api-key" +Filip Makraduli: +You don't have to type anything, just open the website and you get everything. -``` +Demetrios: +Exactly. Yeah. Give me a few predictions just right off the bat and then maybe later we can figure it out. The other thing that I was thinking, could be a nice add on. I mean, the infinite feature request, I don't think you realized you were going to get so many feature requests from me, but let it be known that if you come on here and I like your app, you'll probably get some feature requests from me. So I was thinking about how it would be great if I could just talk to it instead of typing it in, right? And I could just explain my mood or explain my feeling and even top that off with a few melodies that are going on in my head, or a few singers or songwriters or songs that I really want, something like this, but not this song, and then also add that kind of thing, do the. -```python -import os +Filip Makraduli: +Humming sound a bit and you play your melody and then you get. -os.environ["QDRANT_URL"] = "https://qdrant.example.com" -os.environ["QDRANT_API_KEY"] = "your-api-key" +Demetrios: +Except I hum out of tune, so I don't think that would work very well. I get a lot of random songs, that's for sure. It would probably be just about as accurate as your recommendation engine is right now. Yeah. Well, this is awesome, man. I really appreciate you coming on here. I'm just going to make sure that there's no other questions that came through the chat. No, looks like we're good. -``` +Demetrios: +And for everyone out there that is listening, if you want to come on and talk about anything cool that you have built with Qdrant, or how you're using Qdrant, or different ways that you would like Qdrant to be better, or things that you enjoy, whatever it may be, we'd love to have you on here. And I think that is it. We're going to call it a day for the vector space talks, number two. We'll see you all later. Philip, thanks so much for coming on. It's. 
-### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#airbyte-open-source) Airbyte Open Source +<|page-394-lllmstxt|> +> *"Everything changed when we actually tried binary quantization with OpenAI model.”*\ +> -- Andrey Vasnetsov -Airbyte is an open-source data integration platform that helps you replicate your data in your warehouses, lakes, and -databases. You can install it on your infrastructure and use it to load the data into Qdrant. The installation process is described in the [official documentation](https://docs.airbyte.com/deploying-airbyte/). -Please follow the instructions to set up your own instance. +Ever wonder why we need quantization for vector indexes? Andrey Vasnetsov explains the complexities and challenges of searching through proximity graphs. Binary quantization reduces storage size and boosts speed by 30x, but not all models are compatible. -#### [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#setting-up-the-connection) Setting up the connection +Andrey worked as a Machine Learning Engineer most of his career. He prefers practical over theoretical, working demo over arXiv paper. He is currently working as the CTO at Qdrant a Vector Similarity Search Engine, which can be used for semantic search, similarity matching of text, images or even videos, and also recommendations. -Once you have an Airbyte up and running, you can configure the connection to load the data from the respective source -into Qdrant. The configuration will require setting up the source and destination connectors. In this tutorial we will -use the following connectors: +***Listen to the episode on [Spotify](https://open.spotify.com/episode/7dPOm3x4rDBwSFkGZuwaMq?si=Ip77WCa_RCCYebeHX6DTMQ), Apple Podcast, Podcast addicts, Castbox. You can also watch this episode on [YouTube](https://youtu.be/4aUq5VnR_VI).*** -- **Source:** [File](https://docs.airbyte.com/integrations/sources/file) to load the data from an Excel sheet -- **Destination:** [Qdrant](https://docs.airbyte.com/integrations/destinations/qdrant) to load the data into Qdrant + -Airbyte UI will guide you through the process of setting up the source and destination and connecting them. Here is how -the configuration of the source might look like: + -![Airbyte source configuration](https://qdrant.tech/documentation/examples/customer-support-cohere-airbyte/airbyte-excel-source.png) +## Top Takeaways: -Qdrant is our target destination, so we need to set up the connection to it. We need to specify which fields should be -included to generate the embeddings. In our case it makes complete sense to embed just the questions, as we are going -to look for similar questions asked in the past and provide the answers. +Discover how oversampling optimizes precision in real-time, enhancing the accuracy without altering stored data structures in our very first episode of the Vector Space Talks by Qdrant, with none other than the CTO of Qdrant, Andrey Vasnetsov. -![Airbyte destination configuration](https://qdrant.tech/documentation/examples/customer-support-cohere-airbyte/airbyte-qdrant-destination.png) +In this episode, Andrey shares invaluable insights into the world of binary quantization and its profound impact on Vector Space technology. -Once we have the destination set up, we can finally configure a connection. The connection will define the schedule -of the data synchronization. 
+5 Keys to Learning from the Episode: -![Airbyte connection configuration](https://qdrant.tech/documentation/examples/customer-support-cohere-airbyte/airbyte-connection.png) +1. The necessity of quantization and the complex challenges it helps to overcome. +2. The transformative effects of binary quantization on processing speed and storage size reduction. +3. A detailed exploration of oversampling and its real-time precision control in query search. +4. Understanding the simplicity and effectiveness of binary quantization, especially when compared to more intricate quantization methods. +5. The ongoing research and potential impact of binary quantization on future models. -Airbyte should now be ready to accept any data updates from the source and load them into Qdrant. You can monitor the -progress of the synchronization in the UI. +> Fun Fact: Binary quantization can deliver processing speeds over 30 times faster than traditional quantization methods, which is a revolutionary advancement in Vector Space technology. +> -## [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#rag-connector) RAG connector +## Show Notes: -One of our previous tutorials, guides you step-by-step on [implementing custom connector for Cohere\\ -RAG](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/documentation/examples/cohere-rag-connector/) with Cohere Embed v3 and Qdrant. You can just point it to use your Hybrid Cloud -Qdrant instance running on AWS. Created connector might be deployed to Amazon Web Services in various ways, even in a -[Serverless](https://aws.amazon.com/serverless/) manner using [AWS\\ -Lambda](https://aws.amazon.com/lambda/?c=ser&sec=srv). +00:00 Overview of HNSW vector index.\ +03:57 Efficient storage needed for large vector sizes.\ +07:49 Oversampling controls precision in real-time search.\ +12:21 Comparison of vectors using dot production.\ +15:20 Experimenting with models, OpenAI has compatibility.\ +18:29 Qdrant architecture doesn't support removing original vectors. -In general, RAG connector has to expose a single endpoint that will accept POST requests with `query` parameter and -return the matching documents as JSON document with a specific structure. Our FastAPI implementation created [in the\\ -related tutorial](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/documentation/examples/cohere-rag-connector/) is a perfect fit for this task. The only difference is that you -should point it to the Cohere models and Qdrant running on AWS infrastructure. +## More Quotes from Andrey: -> Our connector is a lightweight web service that exposes a single endpoint and glues the Cohere embedding model with -> our Qdrant Hybrid Cloud instance. Thus, it perfectly fits the serverless architecture, requiring no additional -> infrastructure to run. +*"Inside Qdrant we use HNSW vector Index, which is essentially a proximity graph. You can imagine it as a number of vertices where each vertex is representing one vector and links between those vertices representing nearest neighbors.”*\ +-- Andrey Vasnetsov -You can also run the connector as another service within your [Kubernetes cluster running on AWS\\ -(EKS)](https://aws.amazon.com/eks/), or by launching an [EC2](https://aws.amazon.com/ec2/) compute instance. This step -is dependent on the way you deploy your other services, so we’ll leave it to you to decide how to run the connector. 
+*"The main idea is that we convert the float point elements of the vector into binary representation. So, it's either zero or one, depending if the original element is positive or negative.”*\ +-- Andrey Vasnetsov -Eventually, the web service should be available under a specific URL, and it’s a good practice to store it in the -environment variable, so the other services can easily access it. +*"We tried most popular open source models, and unfortunately they are not as good compatible with binary quantization as OpenAI.”*\ +-- Andrey Vasnetsov -shellpython +## Transcript: +Demetrios: +Okay, welcome everyone. This is the first and inaugural vector space talks, and who better to kick it off than the CTO of Qdrant himself? Andrey V. Happy to introduce you and hear all about this binary quantization that you're going to be talking about. I've got some questions for you, and I know there are some questions that came through in the chat. And the funny thing about this is that we recorded it live on Discord yesterday. But the thing about Discord is you cannot trust the recordings on there. And so we only got the audio and we wanted to make this more visual for those of you that are watching on YouTube. Hence here we are recording it again. -```shell -export RAG_CONNECTOR_URL="https://rag-connector.example.com/search" +Demetrios: +And so I'll lead us through some questions for you, Andrey. And I have one thing that I ask everyone who is listening to this, and that is if you want to give a talk and you want to showcase either how you're using Qdrant, how you've built a rag, how you have different features or challenges that you've overcome with your AI, landscape or ecosystem or stack that you've set up, please reach out to myself and I will get you on here and we can showcase what you've done and you can give a talk for the vector space talk. So without further ado, let's jump into this, Andrey, we're talking about binary quantization, but let's maybe start a step back. Why do we need any quantization at all? Why not just use original vectors? -``` +Andrey Vasnetsov: +Yep. Hello, everyone. Hello Demetrios. And it's a good question, and I think in order to answer it, I need to first give a short overview of what is vector index, how it works and what challenges it possess. So, inside Qdrant we use so called HNSW vector Index, which is essentially a proximity graph. You can imagine it as a number of vertices where each vertex is representing one vector and links between those vertices representing nearest neighbors. So in order to search through this graph, what you actually need to do is do a greedy deep depth first search, and you can tune the precision of your search with the beam size of the greedy search process. But this structure of the index actually has its own challenges and first of all, its index building complexity. -```python -os.environ["RAG_CONNECTOR_URL"] = "https://rag-connector.example.com/search" +Andrey Vasnetsov: +Inserting one vector into the index is as complicated as searching for one vector in the graph. And the graph structure overall have also its own limitations. It requires a lot of random reads where you can go in any direction. It's not easy to predict which path the graph will take. The search process will take in advance. So unlike traditional indexes in traditional databases, like binary trees, like inverted indexes, where we can pretty much serialize everything. 
In HNSW it's always random reads and it's actually always sequential reads, because you need to go from one vertex to another in a sequential manner. And this actually creates a very strict requirement for underlying storage of vectors. -``` +Andrey Vasnetsov: +It had to have a very low latency and it have to support this randomly spatter. So basically we can only do it efficiently if we store all the vectors either in very fast solid state disks or if we use actual RAM to store everything. And RAM is not cheap these days, especially considering that the size of vectors increases with each new version of the model. And for example, OpenAI model is already more than 1000 dimensions. So you can imagine one vector is already 6 data, no matter how long your text is, and it's just becoming more and more expensive with the advancements of new models and so on. So in order to actually fight this, in order to compensate for the growth of data requirement, what we propose to do, and what we already did with different other quantization techniques is we actually compress vectors into quantized vector storage, which is usually much more compact for the in memory representation. For example, on one of the previous releases we have scalar quantization and product quantization, which can compress up to 64 times the size of the vector. And we only keep in fast storage these compressed vectors. -## [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#customer-interface) Customer interface +Andrey Vasnetsov: +We retrieve them and get a list of candidates which will later rescore using the original vectors. And the benefit here is this reordering or rescoring process actually doesn't require any kind of sequential or random access to data, because we already know all the IDs we need to rescore, and we can efficiently read it from the disk using asynchronous I O, for example, and even leverage the advantage of very cheap network mounted disks. And that's the main benefit of quantization. -At this part we have all the data loaded into Qdrant, and the RAG connector is ready to serve the relevant context. The -last missing piece is the customer interface, that will call the Command model to create the answer. Such a system -should be built specifically for the platform you use and integrated into its workflow, but we will build the strong -foundation for it and show how to use it in a simple CLI tool. +Demetrios: +I have a few questions off the back of this one, being just a quick thing, and I'm wondering if we can double benefit by using this binary quantization, but also if we're using smaller models that aren't the GBTs, will that help? -> Our application does not have to connect to Qdrant anymore, as the model will connect to the RAG connector directly. +Andrey Vasnetsov: +Right. So not all models are as big as OpenAI, but what we see, the trend in this area, the trend of development of different models, indicates that they will become bigger and bigger over time. Just because we want to store more information inside vectors, we want to have larger context, we want to have more detailed information, more detailed separation and so on. This trend is obvious if like five years ago the usual size of the vector was 100 dimensions now the usual size is 700 dimensions, so it's basically. -First of all, we have to create a connection to Cohere services through the Cohere SDK. +Demetrios: +Preparing for the future while also optimizing for today. 
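The setup Andrey describes — compressed vectors kept in fast storage, with the original vectors left on disk and used only for rescoring — can be sketched with the Qdrant Python client roughly as follows; the collection name, vector size, and oversampling factor are illustrative:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="quantization-demo",  # illustrative name
    vectors_config=models.VectorParams(
        size=1536,                         # e.g. OpenAI-sized embeddings
        distance=models.Distance.COSINE,
        on_disk=True,                      # originals stay on cheap disk storage
    ),
    quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True),  # compressed copies in RAM
    ),
)

# At query time, oversample candidates from the quantized index and
# rescore them with the original vectors read from disk.
hits = client.search(
    collection_name="quantization-demo",
    query_vector=[0.1] * 1536,             # placeholder query embedding
    limit=10,
    search_params=models.SearchParams(
        quantization=models.QuantizationSearchParams(rescore=True, oversampling=2.0)
    ),
)
```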
-```python -import cohere +Andrey Vasnetsov: +Right? -# Create a Cohere client pointing to the AWS instance -cohere_client = cohere.Client(...) +Demetrios: +Yeah. Okay, so you mentioned on here oversampling. Can you go into that a little bit more and explain to me what that is? -``` +Andrey Vasnetsov: +Yeah, so oversampling is a special technique we use to control precision of the search in real time, in query time. And the thing is, we can internally retrieve from quantized storage a bit more vectors than we actually need. And when we do rescoring with original vectors, we assign more precise score. And therefore from this overselection, we can pick only those vectors which are actually good for the user. And that's how we can basically control accuracy without rebuilding index, without changing any kind of parameters inside the stored data structures. But we can do it real time in just one parameter change of the search query itself. -Next, our connector should be registered. **Please make sure to do it once, and store the id of the connector in the** -**environment variable or in any other way that will be accessible to the application.** +Demetrios: +I see, okay, so basically this is the quantization. And now let's dive into the binary quantization and how it works. -```python -import os +Andrey Vasnetsov: +Right, so binary quantization is actually very simple. The main idea that we convert the float point elements of the vector into binary representation. So it's either zero or one, depending if the original element is positive or negative. And by doing this we can approximate dot production or cosine similarity, whatever metric you use to compare vectors with just hemming distance, and hemming distance is turned to be very simple to compute. It uses only two most optimized CPU instructions ever. It's Pixor and Popcount. Instead of complicated float point subprocessor, you only need those tool. It works with any register you have, and it's very fast. -connector_response = cohere_client.connectors.create( - name="customer-support", - url=os.environ["RAG_CONNECTOR_URL"], -) +Andrey Vasnetsov: +It uses very few CPU cycles to actually produce a result. That's why binary quantization is over 30 times faster than regular product. And it actually solves the problem of complicated index building, because this computation of dot products is the main source of computational requirements for HNSW. -# The id returned by the API should be stored for future use -connector_id = connector_response.connector.id +Demetrios: +So if I'm understanding this correctly, it's basically taking all of these numbers that are on the left, which can be, yes, decimal numbers. -``` +Andrey Vasnetsov: +On the left you can see original vector and it converts it in binary representation. And of course it does lose a lot of precision in the process. But because first we have very large vector and second, we have oversampling feature, we can compensate for this loss of accuracy and still have benefit in both speed and the size of the storage. -Finally, we can create a prompt and get the answer from the model. Additionally, we define which of the connectors -should be used to provide the context, as we may have multiple connectors and want to use specific ones, depending on -some conditions. Let’s start with asking a question. +Demetrios: +So if I'm understanding this correctly, it's basically saying binary quantization on its own probably isn't the best thing that you would want to do. 
But since you have these other features that will help counterbalance the loss in accuracy. You get the speed from the binary quantization and you get the accuracy from these other features. -```python -query = "Why Qdrant does not return my vectors?" +Andrey Vasnetsov: +Right. So the speed boost is so overwhelming that it doesn't really matter how much over sampling is going to be, we will still benefit from that. -``` +Demetrios: +Yeah. And how much faster is it? You said that, what, over 30 times faster? -Now we can send the query to the model, get the response, and possibly send it back to the customer. +Andrey Vasnetsov: +Over 30 times and some benchmarks is about 40 times faster. -```python -response = cohere_client.chat( - message=query, - connectors=[\ - cohere.ChatConnector(id=connector_id),\ - ], - model="command-r", -) +Demetrios: +Wow. Yeah, that's huge. And so then on the bottom here you have dot product versus hammering distance. And then there's. Yeah, hamming. Sorry, I'm inventing words over here on your slide. Can you explain what's going on there? -print(response.text) +Andrey Vasnetsov: +Right, so dot production is the metric we usually use in comparing a pair of vectors. It's basically the same as cosine similarity, but this normalization on top. So internally, both cosine and dot production actually doing only dot production, that's usual metric we use. And in order to do this operation, we first need to multiply each pair of elements to the same element of the other vector and then add all these multiplications in one number. It's going to be our score instead of this in binary quantization, in binary vector, we do XOR operation and then count number of ones. So basically, Hemming distance is an approximation of dot production in this binary space. -``` +Demetrios: +Excellent. Okay, so then it looks simple enough, right? Why are you implementing it now after much more complicated product quantization? -The output should be the answer to the question, generated by the model, for example: +Andrey Vasnetsov: +It's actually a great question. And the answer to this is binary questization looked too simple to be true, too good to be true. And we thought like this, we tried different things with open source models that didn't work really well. But everything changed when we actually tried binary quantization with OpenAI model. And it turned out that OpenAI model has very good compatibility with this type of quantization. Unfortunately, not every model have as good compatibility as OpenAI. And to be honest, it's not yet absolutely clear for us what makes models compatible and whatnot. We do know that it correlates with number of dimensions, but it is not the only factor. -> Qdrant is set up by default to minimize network traffic and therefore doesn’t return vectors in search results. However, you can make Qdrant return your vectors by setting the ‘with\_vector’ parameter of the Search/Scroll function to true. +Andrey Vasnetsov: +So there is some secret source which exists and we should find it, which should enable models to be compatible with binary quantization. And I think it's actually a future of this space because the benefits of this hemming distance benefits of binary quantization is so great that it makes sense to incorporate these tricks on the learning process of the model to make them more compatible. -Customer support should not be fully automated, as some completely new issues might require human intervention. 
We -should play with prompt engineering and expect the model to provide the answer with a certain confidence level. If the -confidence is too low, we should not send the answer automatically but present it to the support team for review. +Demetrios: +Well, you mentioned that OpenAI's model is one that obviously works well with binary quantization, but there are models that don't work well with it, which models have not been very good. -## [Anchor](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/\#wrapping-up) Wrapping up +Andrey Vasnetsov: +So right now we are in the process of experimenting with different models. We tried most popular open source models, and unfortunately they are not as good compatible with binary quantization as OpenAI. We also tried different closed source models, for example Cohere AI, which is on the same level of compatibility with binary quantization as OpenAI, but they actually have much larger dimensionality. So instead of 1500 they have 4000. And it's not yet clear if only dimensionality makes this model compatible. Or there is something else in training process, but there are open source models which are getting close to OpenAI 1000 dimensions, but they are not nearly as good as Openi in terms of this compression compatibility. -This tutorial shows how to build a fully private customer support system using Cohere models, Qdrant Hybrid Cloud, and -Airbyte, which runs on AWS infrastructure. You can ensure your data does not leave your premises and focus on providing -the best customer support experience without bothering your team with repetitive tasks. +Demetrios: +So let that be something that hopefully the community can help us figure out. Why is it that this works incredibly well with these closed source models, but not with the open source models? Maybe there is something that we're missing there. -##### Was this page useful? +Andrey Vasnetsov: +Not all closed source models are compatible as well, so some of them work similar as open source, but a few works well. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +Demetrios: +Interesting. Okay, so is there a plan to implement other quantization methods, like four bit quantization or even compressing two floats into one bit? -Thank you for your feedback! 🙏 +Andrey Vasnetsov: +Right, so our choice of quantization is mostly defined by available CPU instructions we can apply to perform those computations. In case of binary quantization, it's straightforward and very simple. That's why we like binary quantization so much. In case of, for example, four bit quantization, it is not as clear which operation we should use. It's not yet clear. Would it be efficient to convert into four bits and then apply multiplication of four bits? So this would require additional investigation, and I cannot say that we have immediate plans to do so because still the binary quincellation field is not yet explored on 100% and we think it's a lot more potential with this than currently unlocked. -We are sorry to hear that. 😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-customer-support-cohere-airbyte-aws.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. 
+Demetrios: +Yeah, there's some low hanging fruits still on the binary quantization field, so tackle those first and then move your way over to four bit and all that fun stuff. Last question that I've got for you is can we remove original vectors and only keep quantized ones in order to save disk space? -On this page: +Andrey Vasnetsov: +Right? So unfortunately Qdrant architecture is not designed and not expecting this type of behavior for several reasons. First of all, removing of the original vectors will compromise some features like oversampling, like segment building. And actually removing of those original vectors will only be compatible with some types of quantization for example, it won't be compatible with scalar quantization because in this case we won't be able to rebuild index to do maintenance of the system. And in order to maintain, how would you say, consistency of the API, consistency of the engine, we decided to enforce always enforced storing of the original vectors. But the good news is that you can always keep original vectors on just disk storage. It's very cheap. Usually it's ten times or even more times cheaper than RAM, and it already gives you great advantage in terms of price. That's answer excellent. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/examples/rag-customer-support-cohere-airbyte-aws.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +Demetrios: +Well man, I think that's about it from this end, and it feels like it's a perfect spot to end it. As I mentioned before, if anyone wants to come and present at our vector space talks, we're going to be doing these, hopefully biweekly, maybe weekly, if we can find enough people. And so this is an open invitation for you, and if you come present, I promise I will send you some swag. That is my promise to you. And if you're listening after the fact and you have any questions, come into discord on the Qdrant. Discord. And ask myself or Andrey any of the questions that you may have as you're listening to this talk about binary quantization. We will catch you all later. -× +Demetrios: +See ya, have a great day. Take care. -[Powered by](https://qdrant.tech/) +<|page-395-lllmstxt|> +Building powerful applications with Qdrant starts with loading vector representations into the system. Traditionally, this involves scraping or extracting data from sources, performing operations such as cleaning, chunking, and generating embeddings, and finally loading it into Qdrant. While this process can be complex, Unstructured.io includes Qdrant as an ingestion destination. -<|page-199-lllmstxt|> -## explore -- [Documentation](https://qdrant.tech/documentation/) -- [Concepts](https://qdrant.tech/documentation/concepts/) -- Explore +In this blog post, we'll demonstrate how to load data into Qdrant from the channels of a Discord server. You can use a similar process for the [20+ vetted data sources](https://unstructured-io.github.io/unstructured/ingest/source_connectors.html) supported by Unstructured. + +### Prerequisites -# [Anchor](https://qdrant.tech/documentation/concepts/explore/\#explore-the-data) Explore the data +- A running Qdrant instance. Refer to our [Quickstart guide](/documentation/quick-start/) to set up an instance. +- A Discord bot token. Generate one [here](https://discord.com/developers/applications) after adding the bot to your server. +- Unstructured CLI with the required extras. 
For more information, see the Discord [Getting Started guide](https://discord.com/developers/docs/getting-started). Install it with the following command: -After mastering the concepts in [search](https://qdrant.tech/documentation/concepts/search/), you can start exploring your data in other ways. Qdrant provides a stack of APIs that allow you to find similar vectors in a different fashion, as well as to find the most dissimilar ones. These are useful tools for recommendation systems, data exploration, and data cleaning. +```bash +pip install unstructured[discord,local-inference,qdrant] +``` -## [Anchor](https://qdrant.tech/documentation/concepts/explore/\#recommendation-api) Recommendation API +Once you have the prerequisites in place, let's begin the data ingestion. -In addition to the regular search, Qdrant also allows you to search based on multiple positive and negative examples. The API is called _**recommend**_, and the examples can be point IDs, so that you can leverage the already encoded objects; and, as of v1.6, you can also use raw vectors as input, so that you can create your vectors on the fly without uploading them as points. +### Retrieving Data from Discord -REST API - API Schema definition is available [here](https://api.qdrant.tech/api-reference/search/recommend-points) +To generate structured data from Discord using the Unstructured CLI, run the following command with the [channel IDs](https://www.pythondiscord.com/pages/guides/pydis-guides/contributing/obtaining-discord-ids/): -httppythontypescriptrustjavacsharpgo +```bash +unstructured-ingest \ + discord \ + --channels \ + --token "" \ + --output-dir "discord-output" +``` -```http -POST /collections/{collection_name}/points/query -{ - "query": { - "recommend": { - "positive": [100, 231], - "negative": [718, [0.2, 0.3, 0.4, 0.5]], - "strategy": "average_vector" - } - }, - "filter": { - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ] - } -} +This command downloads and structures the data in the `"discord-output"` directory. + +For a complete list of options supported by this source, run: +```bash +unstructured-ingest discord --help ``` -```python -from qdrant_client import QdrantClient, models +### Ingesting into Qdrant -client = QdrantClient(url="http://localhost:6333") +Before loading the data, set up a collection with the information you need for the following REST call. In this example we use a local Huggingface model generating 384-dimensional embeddings. You can create a Qdrant [API key](/documentation/cloud/authentication/#create-api-keys) and set names for your Qdrant [collections](/documentation/concepts/collections/). 
-client.query_points( - collection_name="{collection_name}", - query=models.RecommendQuery( - recommend=models.RecommendInput( - positive=[100, 231], - negative=[718, [0.2, 0.3, 0.4, 0.5]], - strategy=models.RecommendStrategy.AVERAGE_VECTOR, - ) - ), - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(\ - value="London",\ - ),\ - )\ - ] - ), - limit=3, -) +We set up the collection with the following command: +```bash +curl -X PUT \ + /collections/ \ + -H 'Content-Type: application/json' \ + -H 'api-key: ' \ + -d '{ + "vectors": { + "size": 384, + "distance": "Cosine" + } +}' ``` -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +You should receive a response similar to: -const client = new QdrantClient({ host: "localhost", port: 6333 }); +```console +{"result":true,"status":"ok","time":0.196235768} +``` -client.query("{collection_name}", { - query: { - recommend: { - positive: [100, 231], - negative: [718, [0.2, 0.3, 0.4, 0.5]], - strategy: "average_vector" - } - }, - filter: { - must: [\ - {\ - key: "city",\ - match: {\ - value: "London",\ - },\ - },\ - ], - }, - limit: 3 -}); +To ingest the Discord data into Qdrant, run: +```bash +unstructured-ingest \ + local \ + --input-path "discord-output" \ + --embedding-provider "langchain-huggingface" \ + qdrant \ + --collection-name "" \ + --api-key "" \ + --location "" ``` -```rust -use qdrant_client::qdrant::{ - Condition, Filter, QueryPointsBuilder, RecommendInputBuilder, RecommendStrategy, -}; -use qdrant_client::Qdrant; - -let client = Qdrant::from_url("http://localhost:6334").build()?; +This command loads structured Discord data into Qdrant with sensible defaults. You can configure the data fields for which embeddings are generated in the command options. Qdrant ingestion also supports partitioning and chunking of your data, configurable directly from the CLI. Learn more about it in the [Unstructured documentation](https://unstructured-io.github.io/unstructured/core.html). -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query( - RecommendInputBuilder::default() - .add_positive(100) - .add_positive(231) - .add_positive(vec![0.2, 0.3, 0.4, 0.5]) - .add_negative(718) - .strategy(RecommendStrategy::AverageVector) - .build(), - ) - .limit(3) - .filter(Filter::must([Condition::matches(\ - "city",\ - "London".to_string(),\ - )])), - ) - .await?; +To list all the supported options of the Qdrant ingestion destination, run: +```bash +unstructured-ingest local qdrant --help ``` -```java -import java.util.List; +Unstructured can also be used programmatically or via the hosted API. Refer to the [Unstructured Reference Manual](https://unstructured-io.github.io/unstructured/introduction.html). -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.RecommendInput; -import io.qdrant.client.grpc.Points.RecommendStrategy; -import io.qdrant.client.grpc.Points.Filter; +For more information about the Qdrant ingest destination, review how Unstructured.io configures their [Qdrant](https://unstructured-io.github.io/unstructured/ingest/destination_connectors/qdrant.html) interface. -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.recommend; +<|page-396-lllmstxt|> +n8n (pronounced n-eight-n) helps you connect any app with an API. 
You can then manipulate its data with little or no code. With the Qdrant node on n8n, you can build AI-powered workflows visually. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Let's go through the process of building a workflow. We'll build a chat with a codebase service. -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(recommend(RecommendInput.newBuilder() - .addAllPositive(List.of(vectorInput(100), vectorInput(200), vectorInput(100.0f, 231.0f))) - .addAllNegative(List.of(vectorInput(718), vectorInput(0.2f, 0.3f, 0.4f, 0.5f))) - .setStrategy(RecommendStrategy.AverageVector) - .build())) - .setFilter(Filter.newBuilder().addMust(matchKeyword("city", "London"))) - .setLimit(3) - .build()).get(); +## Prerequisites -``` +- A running Qdrant instance. If you need one, use our [Quick start guide](/documentation/quick-start/) to set it up. +- An OpenAI API Key. Retrieve your key from the [OpenAI API page](https://platform.openai.com/account/api-keys) for your account. +- A GitHub access token. If you need to generate one, start at the [GitHub Personal access tokens page](https://github.com/settings/tokens/). -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +## Building the App -var client = new QdrantClient("localhost", 6334); +Our workflow has two components. Refer to the [n8n quick start guide](https://docs.n8n.io/workflows/create/) to get acquainted with workflow semantics. -await client.QueryAsync( - collectionName: "{collection_name}", - query: new RecommendInput { - Positive = { 100, 231 }, - Negative = { 718 } - }, - filter: MatchKeyword("city", "London"), - limit: 3 -); +- A workflow to ingest a GitHub repository into Qdrant +- A workflow for a chat service with the ingested documents -``` +#### Workflow 1: GitHub Repository Ingestion into Qdrant -```go -import ( - "context" +![GitHub to Qdrant workflow](/blog/qdrant-n8n/load-demo.gif) - "github.com/qdrant/go-client/qdrant" -) +For this workflow, we'll use the following nodes: -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +- [Qdrant Vector Store - Insert](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.vectorstoreqdrant/#insert-documents): Configure with [Qdrant credentials](https://docs.n8n.io/integrations/builtin/credentials/qdrant/) and a collection name. If the collection doesn't exist, it's automatically created with the appropriate configurations. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ - Positive: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - qdrant.NewVectorInputID(qdrant.NewIDNum(231)), - }, - Negative: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - }), - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, - }, -}) +- [GitHub Document Loader](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.documentgithubloader/): Configure the GitHub access token, repository name, and branch. In this example, we'll use [qdrant/demo-food-discovery@main](https://github.com/qdrant/demo-food-discovery). 
+
+- [Embeddings OpenAI](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.embeddingsopenai/): Configure with OpenAI credentials and the embedding model options. We use the [text-embedding-ada-002](https://platform.openai.com/docs/models/embeddings) model.
+
+- [Recursive Character Text Splitter](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.textsplitterrecursivecharactertextsplitter/): Configure the [text splitter options](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.textsplitterrecursivecharactertextsplitter/#node-parameters). We use the defaults in this example.
+
+Connect the workflow to a manual trigger. Click "Test Workflow" to run it. You should be able to see the progress in real time as the data is fetched from GitHub, transformed into vectors, and loaded into Qdrant.
+
+#### Workflow 2: Chat Service with Ingested Documents
+
+![Chat workflow](/blog/qdrant-n8n/chat.png)
+
+The workflow uses the following nodes:
-```
+- [Qdrant Vector Store - Retrieve](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.vectorstoreqdrant/#retrieve-documents-for-agentchain): Configure with [Qdrant credentials](https://docs.n8n.io/integrations/builtin/credentials/qdrant/) and the name of the collection the data was loaded into in workflow 1.
-Example result of this API would be
+- [Retrieval Q&A Chain](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.chainretrievalqa/): Configure with default values.
-```json
-{
- "result": [\
- { "id": 10, "score": 0.81 },\
- { "id": 14, "score": 0.75 },\
- { "id": 11, "score": 0.73 }\
- ],
- "status": "ok",
- "time": 0.001
-}
+- [Embeddings OpenAI](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.embeddingsopenai/): Configure with OpenAI credentials and the embedding model options. We use the [text-embedding-ada-002](https://platform.openai.com/docs/models/embeddings) model.
-```
+- [OpenAI Chat Model](https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.lmchatopenai/): Configure with OpenAI credentials and the chat model name. We use [gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) for the demo.
-The algorithm used to get the recommendations is selected from the available `strategy` options. Each of them has its own strengths and weaknesses, so experiment and choose the one that works best for your case.
+Once configured, hit the "Chat" button to initiate the chat interface and begin a conversation with your codebase.
-### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#average-vector-strategy) Average vector strategy
+![Chat demo](/blog/qdrant-n8n/chat-demo.png)
-The default and first strategy added to Qdrant is called `average_vector`. It preprocesses the input examples to create a single vector that is used for the search. Since the preprocessing step happens very fast, the performance of this strategy is on-par with regular search. The intuition behind this kind of recommendation is that each vector component represents an independent feature of the data, so, by averaging the examples, we should get a good recommendation.
+To embed the chat in your applications, consider using the [@n8n/chat](https://www.npmjs.com/package/@n8n/chat) package. Additionally, n8n supports scheduled workflows and can be triggered by events across various applications.
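+If you want to double-check from outside n8n what the ingestion workflow actually wrote, a small sketch with the Qdrant Python client can count the stored points and show a sample payload (the URL, API key, and collection name are placeholders for the values configured in the Qdrant nodes):
+
+```python
+from qdrant_client import QdrantClient
+
+# Placeholders: the same instance and collection configured in the Qdrant nodes.
+client = QdrantClient(url="http://localhost:6333", api_key="<qdrant-api-key>")
+
+# How many points did the ingestion workflow create?
+print(client.count(collection_name="<collection-name>", exact=True))
+
+# Peek at a single ingested chunk and its metadata.
+points, _ = client.scroll(collection_name="<collection-name>", limit=1, with_payload=True)
+print(points[0].payload if points else "The collection is empty.")
+```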
-The way to produce the searching vector is by first averaging all the positive and negative examples separately, and then combining them into a single vector using the following formula: +## Further reading -```rust -avg_positive + avg_positive - avg_negative +- [n8n Documentation](https://docs.n8n.io/) +- [n8n Qdrant Node documentation](https://docs.n8n.io/integrations/builtin/cluster-nodes/root-nodes/n8n-nodes-langchain.vectorstoreqdrant/#qdrant-vector-store) -``` +<|page-397-lllmstxt|> + -In the case of not having any negative examples, the search vector will simply be equal to `avg_positive`. +Andrey Vasnetsov, Co-founder and CTO at Qdrant has shared about vector search and applications with Learn NLP Academy.  -This is the default strategy that’s going to be set implicitly, but you can explicitly define it by setting `"strategy": "average_vector"` in the recommendation request. + -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#best-score-strategy) Best score strategy +He covered the following topics: -_Available as of v1.6.0_ +* Qdrant search engine and Quaterion similarity learning framework; +* Similarity learning to multimodal settings; +* Elastic search embeddings vs vector search engines; +* Support for multiple embeddings; +* Fundraising and VC discussions; +* Vision for vector search evolution; +* Finetuning for out of domain. -A new strategy introduced in v1.6, is called `best_score`. It is based on the idea that the best way to find similar vectors is to find the ones that are closer to a positive example, while avoiding the ones that are closer to a negative one. -The way it works is that each candidate is measured against every example, then we select the best positive and best negative scores. The final score is chosen with this step formula: + -```rust -// Sigmoid function to normalize the score between 0 and 1 -let sigmoid = |x| 0.5 * (1.0 + (x / (1.0 + x.abs()))); +<|page-398-lllmstxt|> +For the second edition of our Vector Space Talks we were joined by none other than Cohere’s Head of Machine Learning Nils Reimers. -let score = if best_positive_score > best_negative_score { - sigmoid(best_positive_score) -} else { - -sigmoid(best_negative_score) -}; +## Key Takeaways -``` +Let's dive right into the five key takeaways from Nils' talk: -Since we are computing similarities to every example at each step of the search, the performance of this strategy will be linearly impacted by the amount of examples. This means that the more examples you provide, the slower the search will be. However, this strategy can be very powerful and should be more embedding-agnostic. +1. Content Quality Estimation: Nils explained how embeddings have traditionally focused on measuring topic match, but content quality is just as important. He demonstrated how their model can differentiate between informative and non-informative documents. -To use this algorithm, you need to set `"strategy": "best_score"` in the recommendation request. +2. Compression-Aware Training: He shared how they've tackled the challenge of reducing the memory footprint of embeddings, making it more cost-effective to run vector databases on platforms like [Qdrant](https://cloud.qdrant.io/login). -#### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#using-only-negative-examples) Using only negative examples +3. Reinforcement Learning from Human Feedback: Nils revealed how they've borrowed a technique from reinforcement learning and applied it to their embedding models. 
This allows the model to learn preferences based on human feedback, resulting in highly informative responses. -A beneficial side-effect of `best_score` strategy is that you can use it with only negative examples. This will allow you to find the most dissimilar vectors to the ones you provide. This can be useful for finding outliers in your data, or for finding the most dissimilar vectors to a given one. +4. Evaluating Embedding Quality: Nils emphasized the importance of evaluating embedding quality in relative terms rather than looking at individual vectors. It's all about understanding the context and how embeddings relate to each other. -Combining negative-only examples with filtering can be a powerful tool for data exploration and cleaning. +5. New Features in the Pipeline: Lastly, Nils gave us a sneak peek at some exciting features they're developing, including input type support for Langchain and improved compression techniques. -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#sum-scores-strategy) Sum scores strategy +Now, here's a fun fact from the episode: Did you know that the content quality estimation model *can't* differentiate between true and fake statements? It's a challenging task, and the model relies on the information present in its pretraining data. -Another strategy for using multiple query vectors simultaneously is to just sum their scores against the candidates. In qdrant, this is called `sum_scores` strategy. +We loved having Nils as our guest, check out the full talk below. If you or anyone you know would like to come on the Vector Space Talks -This strategy was used in [this paper](https://arxiv.org/abs/2210.10695) by [UKP Lab](http://www.ukp.tu-darmstadt.de/), [hessian.ai](https://hessian.ai/) and [cohere.ai](https://cohere.ai/) to incorporate relevance feedback into a subsequent search. In the paper this boosted the nDCG@20 performance by 5.6% points when using 2-8 positive feedback documents. + -The formula that this strategy implements is +<|page-399-lllmstxt|> +The partnership between Pienso and Qdrant is set to revolutionize interactive deep learning, making it practical, efficient, and scalable for global customers. Pienso's low-code platform provides a streamlined and user-friendly process for deep learning tasks. This exceptional level of convenience is augmented by Qdrant’s scalable and cost-efficient high vector computation capabilities, which enable reliable retrieval of similar vectors from high-dimensional spaces. -si=∑vq∈Q+s(vq,vi)−∑vq∈Q−s(vq,vi) +Together, Pienso and Qdrant will empower enterprises to harness the full potential of generative AI on a large scale. By combining the technologies of both companies, organizations will be able to train their own large language models and leverage them for downstream tasks that demand data sovereignty and model autonomy. This collaboration will help customers unlock new possibilities and achieve advanced AI-driven solutions. +Strengthening LLM Performance -where Q+ is the set of positive examples, Q− is the set of negative examples, and s(vq,vi) is the score of the vector vq against the vector vi +Qdrant enhances the accuracy of large language models (LLMs) by offering an alternative to relying solely on patterns identified during the training phase. By integrating with Qdrant, Pienso will empower customer LLMs with dynamic long-term storage, which will ultimately enable them to generate concrete and factual responses. 
Qdrant effectively preserves the extensive context windows managed by advanced LLMs, allowing for a broader analysis of the conversation or document at hand. By leveraging this extended context, LLMs can achieve a more comprehensive understanding and produce contextually relevant outputs. -As with `best_score`, this strategy also allows using only negative examples. +## Joint Dedication to Scalability, Efficiency and Reliability -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#multiple-vectors) Multiple vectors +> “Every commercial generative AI use case we encounter benefits from faster training and inference, whether mining customer interactions for next best actions or sifting clinical data to speed a therapeutic through trial and patent processes.” - Birago Jones, CEO, Pienso -_Available as of v0.10.0_ +Pienso chose Qdrant for its exceptional LLM interoperability, recognizing the potential it offers in maximizing the power of large language models and interactive deep learning for large enterprises. Qdrant excels in efficient nearest neighbor search, which is an expensive and computationally demanding task. Our ability to store and search high-dimensional vectors with remarkable performance and precision will offer a significant peace of mind to Pienso’s customers. Through intelligent indexing and partitioning techniques, Qdrant will significantly boost the speed of these searches, accelerating both training and inference processes for users. -If the collection was created with multiple vectors, the name of the vector should be specified in the recommendation request: +### Scalability: Preparing for Sustained Growth in Data Volumes -httppythontypescriptrustjavacsharpgo +Qdrant's distributed deployment mode plays a vital role in empowering large enterprises dealing with massive data volumes. It ensures that increasing data volumes do not hinder performance but rather enrich the model's capabilities, making scalability a seamless process. Moreover, Qdrant is well-suited for Pienso’s enterprise customers as it operates best on bare metal infrastructure, enabling them to maintain complete control over their data sovereignty and autonomous LLM regimes. This ensures that enterprises can maintain their full span of control while leveraging the scalability and performance benefits of Qdrant's solution. -```http -POST /collections/{collection_name}/points/query -{ - "query": { - "recommend": { - "positive": [100, 231], - "negative": [718] - } - }, - "using": "image", - "limit": 10 -} +### Efficiency: Maximizing the Customer Value Proposition -``` +Qdrant's storage efficiency delivers cost savings on hardware while ensuring a responsive system even with extensive data sets. In an independent benchmark stress test, Pienso discovered that Qdrant could efficiently store 128 million documents, consuming a mere 20.4GB of storage and only 1.25GB of memory. This storage efficiency not only minimizes hardware expenses for Pienso’s customers, but also ensures optimal performance, making Qdrant an ideal solution for managing large-scale data with ease and efficiency. 
-```python -client.query_points( - collection_name="{collection_name}", - query=models.RecommendQuery( - recommend=models.RecommendInput( - positive=[100, 231], - negative=[718], - ) - ), - using="image", - limit=10, -) +### Reliability: Fast Performance in a Secure Environment -``` +Qdrant's utilization of Rust, coupled with its memmap storage and write-ahead logging, offers users a powerful combination of high-performance operations, robust data protection, and enhanced data safety measures. Our memmap storage feature offers Pienso fast performance comparable to in-memory storage. In the context of machine learning, where rapid data access and retrieval are crucial for training and inference tasks, this capability proves invaluable. Furthermore, our write-ahead logging (WAL), is critical to ensuring changes are logged before being applied to the database. This approach adds additional layers of data safety, further safeguarding the integrity of the stored information. -```typescript -client.query("{collection_name}", { - query: { - recommend: { - positive: [100, 231], - negative: [718], - } - }, - using: "image", - limit: 10 -}); +> “We chose Qdrant because it's fast to query, has a small memory footprint and allows for instantaneous setup of a new vector collection that is going to be queried. Other solutions we evaluated had long bootstrap times and also long collection initialization times {..} This partnership comes at a great time, because it allows Pienso to use Qdrant to its maximum potential, giving our customers a seamless experience while they explore and get meaningful insights about their data.” - Felipe Balduino Cassar, Senior Software Engineer, Pienso -``` +## What's Next? -```rust -use qdrant_client::qdrant::{QueryPointsBuilder, RecommendInputBuilder}; +Pienso and Qdrant are dedicated to jointly develop the most reliable customer offering for the long term. Our partnership will deliver a combination of no-code/low-code interactive deep learning with efficient vector computation engineered for open source models and libraries. -client - .query( - QueryPointsBuilder::new("{collection_name}") - .query( - RecommendInputBuilder::default() - .add_positive(100) - .add_positive(231) - .add_negative(718) - .build(), - ) - .limit(10) - .using("image"), - ) - .await?; +**To learn more about how we plan on achieving this, join the founders for a [technical fireside chat at 09:30 PST Thursday, 20th July on Discord](https://discord.gg/Vnvg3fHE?event=1128331722270969909).** -``` +![founders chat](/case-studies/pienso/founderschat.png) -```java -import java.util.List; +<|page-400-lllmstxt|> +Founded in early 2021, [bloop](https://bloop.ai/) was one of the first companies to tackle semantic +search for codebases. A fast, reliable Vector Search Database is a core component of a semantic +search engine, and bloop surveyed the field of available solutions and even considered building +their own. They found Qdrant to be the top contender and now use it in production. -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.RecommendInput; +This document is intended as a guide for people who intend to introduce semantic search to a novel +field and want to find out if Qdrant is a good solution for their use case. 
-import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.recommend; +## About bloop -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(recommend(RecommendInput.newBuilder() - .addAllPositive(List.of(vectorInput(100), vectorInput(231))) - .addAllNegative(List.of(vectorInput(718))) - .build())) - .setUsing("image") - .setLimit(10) - .build()).get(); +![](/case-studies/bloop/screenshot.png) -``` +[bloop](https://bloop.ai/) is a fast code-search engine that combines semantic search, regex search +and precise code navigation into a single lightweight desktop application that can be run locally. It +helps developers understand and navigate large codebases, enabling them to discover internal libraries, +reuse code and avoid dependency bloat. bloop’s chat interface explains complex concepts in simple +language so that engineers can spend less time crawling through code to understand what it does, and +more time shipping features and fixing bugs. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +![](/case-studies/bloop/bloop-logo.png) -var client = new QdrantClient("localhost", 6334); +bloop’s mission is to make software engineers autonomous and semantic code search is the cornerstone +of that vision. The project is maintained by a group of Rust and Typescript engineers and ML researchers. +It leverages many prominent nascent technologies, such as [Tauri](http://tauri.app), [tantivy](https://docs.rs/tantivy), +[Qdrant](https://github.com/qdrant/qdrant) and [Anthropic](https://www.anthropic.com/). -await client.QueryAsync( - collectionName: "{collection_name}", - query: new RecommendInput { - Positive = { 100, 231 }, - Negative = { 718 } - }, - usingVector: "image", - limit: 10 -); +## About Qdrant -``` +![](/case-studies/bloop/qdrant-logo.png) -```go -import ( - "context" +Qdrant is an open-source Vector Search Database written in Rust . It deploys as an API service providing +a search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders +can be turned into full-fledged applications for matching, searching, recommending, and many more solutions +to make the most of unstructured data. It is easy to use, deploy and scale, blazing fast and is accurate +simultaneously. - "github.com/qdrant/go-client/qdrant" -) +Qdrant was founded in 2021 in Berlin by Andre Zayarni and Andrey Vasnetsov with the mission to power the +next generation of AI applications with advanced and high-performant [vector similarity](https://qdrant.tech/articles/vector-similarity-beyond-search/) search technology. +Their flagship product is the vector search database which is available as an open source +https://github.com/qdrant/qdrant or managed cloud solution https://cloud.qdrant.io/. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +## The Problem -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ - Positive: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - qdrant.NewVectorInputID(qdrant.NewIDNum(231)), - }, - Negative: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - }), - Using: qdrant.PtrOf("image"), -}) +Firstly, what is semantic search? It’s finding relevant information by comparing meaning, rather than +simply measuring the textual overlap between queries and documents. 
We compare meaning by comparing +*embeddings* - these are vector representations of text that are generated by a neural network. Each document’s +embedding denotes a position in a *latent* space, so to search you embed the query and find its nearest document +vectors in that space. -``` +![](/case-studies/bloop/vector-space.png) -Parameter `using` specifies which stored vectors to use for the recommendation. +Why is semantic search so useful for code? As engineers, we often don’t know - or forget - the precise terms +needed to find what we’re looking for. Semantic search enables us to find things without knowing the exact +terminology. For example, if an engineer wanted to understand “*What library is used for payment processing?*” +a semantic code search engine would be able to retrieve results containing “*Stripe*” or “*PayPal*”. A traditional +lexical search engine would not. -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#lookup-vectors-from-another-collection) Lookup vectors from another collection +One peculiarity of this problem is that the **usefulness of the solution increases with the size of the code +base** – if you only have one code file, you’ll be able to search it quickly, but you’ll easily get lost in +thousands, let alone millions of lines of code. Once a codebase reaches a certain size, it is no longer +possible for a single engineer to have read every single line, and so navigating large codebases becomes +extremely cumbersome. -_Available as of v0.11.6_ +In software engineering, we’re always dealing with complexity. Programming languages, frameworks and tools +have been developed that allow us to modularize, abstract and compile code into libraries for reuse. Yet we +still hit limits: Abstractions are still leaky, and while there have been great advances in reducing incidental +complexity, there is still plenty of intrinsic complexity[^1] in the problems we solve, and with software eating +the world, the growth of complexity to tackle has outrun our ability to contain it. Semantic code search helps +us navigate these inevitably complex systems. -If you have collections with vectors of the same dimensionality, -and you want to look for recommendations in one collection based on the vectors of another collection, -you can use the `lookup_from` parameter. +But semantic search shouldn’t come at the cost of speed. Search should still feel instantaneous, even when +searching a codebase as large as Rust (which has over 2.8 million lines of code!). Qdrant gives bloop excellent +semantic search performance whilst using a reasonable amount of resources, so they can handle concurrent search +requests. -It might be useful, e.g. in the item-to-user recommendations scenario. -Where user and item embeddings, although having the same vector parameters (distance type and dimensionality), are usually stored in different collections. +## The Upshot -httppythontypescriptrustjavacsharpgo +[bloop](https://bloop.ai/) are really happy with how Qdrant has slotted into their semantic code search engine: +it’s performant and reliable, even for large codebases. And it’s written in Rust(!) with an easy to integrate +qdrant-client crate. In short, Qdrant has helped keep bloop’s code search fast, accurate and reliable. 
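+For readers evaluating the same approach, the loop described above (embed the documents, embed the query, look up the nearest vectors) takes only a few lines with the Qdrant Python client. The sketch below is purely illustrative and is not bloop's actual Rust-based implementation; the collection name, vector size, payloads, and vectors are made up:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # illustrative local instance
+
+client.create_collection(
+    collection_name="code-chunks",
+    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
+)
+
+# In a real system these vectors come from a neural encoder; here they are stand-ins.
+client.upsert(
+    collection_name="code-chunks",
+    points=[
+        models.PointStruct(id=1, vector=[0.1] * 192 + [0.9] * 192, payload={"path": "billing/stripe.rs"}),
+        models.PointStruct(id=2, vector=[0.9] * 192 + [0.1] * 192, payload={"path": "auth/session.rs"}),
+    ],
+)
+
+# "Embed the query and find its nearest document vectors in that space."
+query_vector = [0.2] * 192 + [0.8] * 192  # stand-in for an embedded query like "payment processing library"
+result = client.query_points(collection_name="code-chunks", query=query_vector, limit=1)
+print(result.points[0].payload)  # -> {"path": "billing/stripe.rs"}
+```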
-```http -POST /collections/{collection_name}/points/query -{ - "query": { - "recommend": { - "positive": [100, 231], - "negative": [718] - } - }, - "limit": 10, - "lookup_from": { - "collection": "{external_collection_name}", - "vector": "{external_vector_name}" - } -} +#### Footnotes: -``` +[^1]: Incidental complexity is the sort of complexity arising from weaknesses in our processes and tools, whereas + intrinsic complexity is the sort that we face when trying to describe, let alone solve the problem. -```python -client.query_points( - collection_name="{collection_name}", - query=models.RecommendQuery( - recommend=models.RecommendInput( - positive=[100, 231], - negative=[718], - ) - ), - using="image", - limit=10, - lookup_from=models.LookupLocation( - collection="{external_collection_name}", vector="{external_vector_name}" - ), -) +<|page-401-lllmstxt|> +The processor architecture is a thing that the end-user typically does not care much about, as long as all the applications they use run smoothly. If you use a PC then chances are you have an x86-based device, while your smartphone rather runs on an ARM processor. In 2020 Apple introduced their ARM-based M1 chip which is used in modern Mac devices, including notebooks. The main differences between those two architectures are the set of supported instructions and energy consumption. ARM’s processors have a way better energy efficiency and are cheaper than their x86 counterparts. That’s why they became available as an affordable alternative in the hosting providers, including the cloud. -``` +![](/blog/from_cms/1_seaglc6jih2qknoshqbf1q.webp "An image generated by Stable Diffusion with a query “two computer processors fightning against each other”") -```typescript -client.query("{collection_name}", { - query: { - recommend: { - positive: [100, 231], - negative: [718], - } - }, - using: "image", - limit: 10, - lookup_from: { - collection: "{external_collection_name}", - vector: "{external_vector_name}" - } -}); +In order to make an application available for ARM users, it has to be compiled for that platform. Otherwise, it has to be emulated by the device, which gives an additional overhead and reduces its performance. We decided to provide the [Docker images](https://hub.docker.com/r/qdrant/qdrant/) targeted especially at ARM users. Of course, using a limited set of processor instructions may impact the performance of your vector search, and that’s why we decided to test both architectures using a similar setup. -``` +## Test environments -```rust -use qdrant_client::qdrant::{LookupLocationBuilder, QueryPointsBuilder, RecommendInputBuilder}; +AWS offers ARM-based EC2 instances that are 20% cheaper than the x86 corresponding alternatives with a similar configuration. That estimate has been done for the eu-central-1 region (Frankfurt) and R6g/R6i instance families. For the purposes of this comparison, we used an r6i.large instance (Intel Xeon) and compared it to r6g.large one (AWS Graviton2). Both setups have 2 vCPUs and 16 GB of memory available and these were the smallest comparable instances available. 
-client - .query( - QueryPointsBuilder::new("{collection_name}") - .query( - RecommendInputBuilder::default() - .add_positive(100) - .add_positive(231) - .add_negative(718) - .build(), - ) - .limit(10) - .using("image") - .lookup_from( - LookupLocationBuilder::new("{external_collection_name}") - .vector_name("{external_vector_name}"), - ), - ) - .await?; +## The results -``` +For the purposes of this test, we created some random vectors which were compared with cosine distance. -```java -import java.util.List; +### Vector search -import io.qdrant.client.grpc.Points.LookupLocation; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.RecommendInput; +During our experiments, we performed 1000 search operations for both ARM64 and x86-based setups. We didn’t measure the network overhead, only the time measurements returned by the engine in the API response. The chart below shows the distribution of that time, separately for each architecture. -import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.recommend; +![](/blog/from_cms/1_zvuef4ri6ztqjzbsocqj_w.webp "The latency distribution of search requests: arm vs x86") -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(recommend(RecommendInput.newBuilder() - .addAllPositive(List.of(vectorInput(100), vectorInput(231))) - .addAllNegative(List.of(vectorInput(718))) - .build())) - .setUsing("image") - .setLimit(10) - .setLookupFrom( - LookupLocation.newBuilder() - .setCollectionName("{external_collection_name}") - .setVectorName("{external_vector_name}") - .build()) - .build()).get(); +It seems that ARM64 might be an interesting alternative if you are on a budget. It is 10% slower on average, and 20% slower on the median, but the performance is more consistent. It seems like it won’t be randomly 2 times slower than the average, unlike x86. That makes ARM64 a cost-effective way of setting up vector search with Qdrant, keeping in mind it’s 20% cheaper on AWS. You do get less for less, but surprisingly more than expected. -``` +<|page-402-lllmstxt|> +Recently we've become a member of the NVIDIA Inception. It is a program that helps boost the evolution of technology startups through access to their cutting-edge technology and experts, connects startups with venture capitalists, and provides marketing support. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +Along with the various opportunities it gives, we are the most excited about GPU support since it is an essential feature in Qdrant's roadmap. +Stay tuned for our new updates. -var client = new QdrantClient("localhost", 6334); +<|page-403-lllmstxt|> +# Benchmarking Vector Databases -await client.QueryAsync( - collectionName: "{collection_name}", - query: new RecommendInput { - Positive = { 100, 231 }, - Negative = { 718 } - }, - usingVector: "image", - limit: 10, - lookupFrom: new LookupLocation - { - CollectionName = "{external_collection_name}", - VectorName = "{external_vector_name}", - } -); +At Qdrant, performance is the top-most priority. We always make sure that we use system resources efficiently so you get the **fastest and most accurate results at the cheapest cloud costs**. 
So all of our decisions from [choosing Rust](/articles/why-rust/), [io optimisations](/articles/io_uring/), [serverless support](/articles/serverless/), [binary quantization](/articles/binary-quantization/), to our [fastembed library](/articles/fastembed/) are all based on our principle. In this article, we will compare how Qdrant performs against the other vector search engines. -``` +Here are the principles we followed while designing these benchmarks: -```go -import ( - "context" +- We do comparative benchmarks, which means we focus on **relative numbers** rather than absolute numbers. +- We use affordable hardware, so that you can reproduce the results easily. +- We run benchmarks on the same exact machines to avoid any possible hardware bias. +- All the benchmarks are [open-sourced](https://github.com/qdrant/vector-db-benchmark), so you can contribute and improve them. - "github.com/qdrant/go-client/qdrant" -) +
+ Scenarios we tested -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +1. Upload & Search benchmark on single node [Benchmark](/benchmarks/single-node-speed-benchmark/) +2. Filtered search benchmark - [Benchmark](/benchmarks/#filtered-search-benchmark) +3. Memory consumption benchmark - Coming soon +4. Cluster mode benchmark - Coming soon -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ - Positive: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - qdrant.NewVectorInputID(qdrant.NewIDNum(231)), - }, - Negative: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - }), - Using: qdrant.PtrOf("image"), - LookupFrom: &qdrant.LookupLocation{ - CollectionName: "{external_collection_name}", - VectorName: qdrant.PtrOf("{external_vector_name}"), - }, -}) +
-``` +Some of our experiment design decisions are described in the [F.A.Q Section](/benchmarks/#benchmarks-faq). +Reach out to us on our [Discord channel](https://qdrant.to/discord) if you want to discuss anything related Qdrant or these benchmarks. -Vectors are retrieved from the external collection by ids provided in the `positive` and `negative` lists. -These vectors then used to perform the recommendation in the current collection, comparing against the “using” or default vector. +<|page-404-lllmstxt|> +## Observations -## [Anchor](https://qdrant.tech/documentation/concepts/explore/\#batch-recommendation-api) Batch recommendation API +Most of the engines have improved since [our last run](/benchmarks/single-node-speed-benchmark-2022/). Both life and software have trade-offs but some clearly do better: -_Available as of v0.10.0_ +* **`Qdrant` achives highest RPS and lowest latencies in almost all the scenarios, no matter the precision threshold and the metric we choose.** It has also shown 4x RPS gains on one of the datasets. +* `Elasticsearch` has become considerably fast for many cases but it's very slow in terms of indexing time. It can be 10x slower when storing 10M+ vectors of 96 dimensions! (32mins vs 5.5 hrs) +* `Milvus` is the fastest when it comes to indexing time and maintains good precision. However, it's not on-par with others when it comes to RPS or latency when you have higher dimension embeddings or more number of vectors. +* `Redis` is able to achieve good RPS but mostly for lower precision. It also achieved low latency with single thread, however its latency goes up quickly with more parallel requests. Part of this speed gain comes from their custom protocol. +* `Weaviate` has improved the least since our last run. -Similar to the batch search API in terms of usage and advantages, it enables the batching of recommendation requests. +## How to read the results -httppythontypescriptrustjavacsharpgo +- Choose the dataset and the metric you want to check. +- Select a precision threshold that would be satisfactory for your usecase. This is important because ANN search is all about trading precision for speed. This means in any vector search benchmark, **two results must be compared only when you have similar precision**. However most benchmarks miss this critical aspect. +- The table is sorted by the value of the selected metric (RPS / Latency / p95 latency / Index time), and the first entry is always the winner of the category 🏆 -```http -POST /collections/{collection_name}/query/batch -{ - "searches": [\ - {\ - "query": {\ - "recommend": {\ - "positive": [100, 231],\ - "negative": [718]\ - }\ - },\ - "filter": {\ - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ]\ - },\ - "limit": 10\ - },\ - {\ - "query": {\ - "recommend": {\ - "positive": [200, 67],\ - "negative": [300]\ - }\ - },\ - "filter": {\ - "must": [\ - {\ - "key": "city",\ - "match": {\ - "value": "London"\ - }\ - }\ - ]\ - },\ - "limit": 10\ - }\ - ] -} +### Latency vs RPS -``` +In our benchmark we test two main search usage scenarios that arise in practice. -```python -from qdrant_client import QdrantClient, models +- **Requests-per-Second (RPS)**: Serve more requests per second in exchange of individual requests taking longer (i.e. higher latency). This is a typical scenario for a web application, where multiple users are searching at the same time. 
+To simulate this scenario, we run client requests in parallel with multiple threads and measure how many requests the engine can handle per second. +- **Latency**: React quickly to individual requests rather than serving more requests in parallel. This is a typical scenario for applications where server response time is critical. Self-driving cars, manufacturing robots, and other real-time systems are good examples of such applications. +To simulate this scenario, we run client in a single thread and measure how long each request takes. -client = QdrantClient(url="http://localhost:6333") -filter_ = models.Filter( - must=[\ - models.FieldCondition(\ - key="city",\ - match=models.MatchValue(\ - value="London",\ - ),\ - )\ - ] -) +### Tested datasets -recommend_queries = [\ - models.QueryRequest(\ - query=models.RecommendQuery(\ - recommend=models.RecommendInput(positive=[100, 231], negative=[718])\ - ),\ - filter=filter_,\ - limit=3,\ - ),\ - models.QueryRequest(\ - query=models.RecommendQuery(\ - recommend=models.RecommendInput(positive=[200, 67], negative=[300])\ - ),\ - filter=filter_,\ - limit=3,\ - ),\ -] +Our [benchmark tool](https://github.com/qdrant/vector-db-benchmark) is inspired by [github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/). We used the following datasets to test the performance of the engines on ANN Search tasks: -client.query_batch_points( - collection_name="{collection_name}", requests=recommend_queries -) +
-``` +| Datasets | # Vectors | Dimensions | Distance | +|---------------------------------------------------------------------------------------------------|-----------|------------|-------------------| +| [dbpedia-openai-1M-angular](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) | 1M | 1536 | cosine | +| [deep-image-96-angular](http://sites.skoltech.ru/compvision/noimi/) | 10M | 96 | cosine | +| [gist-960-euclidean](http://corpus-texmex.irisa.fr/) | 1M | 960 | euclidean | +| [glove-100-angular](https://nlp.stanford.edu/projects/glove/) | 1.2M | 100 | cosine | -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +
-const client = new QdrantClient({ host: "localhost", port: 6333 }); +### Setup -const filter = { - must: [\ - {\ - key: "city",\ - match: {\ - value: "London",\ - },\ - },\ - ], -}; +{{< figure src=/benchmarks/client-server.png caption="Benchmarks configuration" width=70% >}} -const searches = [\ - {\ - query: {\ - recommend: {\ - positive: [100, 231],\ - negative: [718]\ - }\ - },\ - filter,\ - limit: 3,\ - },\ - {\ - query: {\ - recommend: {\ - positive: [200, 67],\ - negative: [300]\ - }\ - },\ - filter,\ - limit: 3,\ - },\ -]; -client.queryBatch("{collection_name}", { - searches, -}); +- This was our setup for this experiment: + - Client: 8 vcpus, 16 GiB memory, 64GiB storage (`Standard D8ls v5` on Azure Cloud) + - Server: 8 vcpus, 32 GiB memory, 64GiB storage (`Standard D8s v3` on Azure Cloud) +- The Python client uploads data to the server, waits for all required indexes to be constructed, and then performs searches with configured number of threads. We repeat this process with different configurations for each engine, and then select the best one for a given precision. +- We ran all the engines in docker and limited their memory to 25GB. This was used to ensure fairness by avoiding the case of some engine configs being too greedy with RAM usage. This 25 GB limit is completely fair because even to serve the largest `dbpedia-openai-1M-1536-angular` dataset, one hardly needs `1M * 1536 * 4bytes * 1.5 = 8.6GB` of RAM (including vectors + index). Hence, we decided to provide all the engines with ~3x the requirement. -``` +Please note that some of the configs of some engines crashed on some datasets because of the 25 GB memory limit. That's why you might see fewer points for some engines on choosing higher precision thresholds. -```rust -use qdrant_client::qdrant::{ - Condition, Filter, QueryBatchPointsBuilder, QueryPointsBuilder, - RecommendInputBuilder, -}; -use qdrant_client::Qdrant; +<|page-405-lllmstxt|> +This is an archived version of Single node benchmarks. Please refer to the new version [here](/benchmarks/single-node-speed-benchmark/). -let client = Qdrant::from_url("http://localhost:6334").build()?; +<|page-406-lllmstxt|> +# Filtered search benchmark -let filter = Filter::must([Condition::matches("city", "London".to_string())]); +Applying filters to search results brings a whole new level of complexity. +It is no longer enough to apply one algorithm to plain data. With filtering, it becomes a matter of the _cross-integration_ of the different indices. -let recommend_queries = vec![\ - QueryPointsBuilder::new("{collection_name}")\ - .query(\ - RecommendInputBuilder::default()\ - .add_positive(100)\ - .add_positive(231)\ - .add_negative(718)\ - .build(),\ - )\ - .filter(filter.clone())\ - .build(),\ - QueryPointsBuilder::new("{collection_name}")\ - .query(\ - RecommendInputBuilder::default()\ - .add_positive(200)\ - .add_positive(67)\ - .add_negative(300)\ - .build(),\ - )\ - .filter(filter)\ - .build(),\ -]; +To measure how well different search engines perform in this scenario, we have prepared a set of **Filtered ANN Benchmark Datasets** - + https://github.com/qdrant/ann-filtering-benchmark-datasets -client - .query_batch(QueryBatchPointsBuilder::new( - "{collection_name}", - recommend_queries, - )) - .await?; -``` +It is similar to the ones used in the [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks/) but enriched with payload metadata and pre-generated filtering requests. 
It includes synthetic and real-world datasets with various filters, from keywords to geo-spatial queries. -```java -import java.util.List; +### Why filtering is not trivial? -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.QueryPoints; -import io.qdrant.client.grpc.Points.RecommendInput; +Not many ANN algorithms are compatible with filtering. +HNSW is one of the few of them, but search engines approach its integration in different ways: -import static io.qdrant.client.ConditionFactory.matchKeyword; -import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.recommend; +- Some use **post-filtering**, which applies filters after ANN search. It doesn't scale well as it either loses results or requires many candidates on the first stage. +- Others use **pre-filtering**, which requires a binary mask of the whole dataset to be passed into the ANN algorithm. It is also not scalable, as the mask size grows linearly with the dataset size. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +On top of it, there is also a problem with search accuracy. +It appears if too many vectors are filtered out, so the HNSW graph becomes disconnected. -Filter filter = Filter.newBuilder().addMust(matchKeyword("city", "London")).build(); +Qdrant uses a different approach, not requiring pre- or post-filtering while addressing the accuracy problem. +Read more about the Qdrant approach in our [Filtrable HNSW](/articles/filtrable-hnsw/) article. -List recommendQueries = List.of( - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(recommend( - RecommendInput.newBuilder() - .addAllPositive(List.of(vectorInput(100), vectorInput(231))) - .addAllNegative(List.of(vectorInput(731))) - .build())) - .setFilter(filter) - .setLimit(3) - .build(), - QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(recommend( - RecommendInput.newBuilder() - .addAllPositive(List.of(vectorInput(200), vectorInput(67))) - .addAllNegative(List.of(vectorInput(300))) - .build())) - .setFilter(filter) - .setLimit(3) - .build()); +<|page-407-lllmstxt|> +## Filtered Results -client.queryBatchAsync("{collection_name}", recommendQueries).get(); +As you can see from the charts, there are three main patterns: -``` +- **Speed boost** - for some engines/queries, the filtered search is faster than the unfiltered one. It might happen if the filter is restrictive enough, to completely avoid the usage of the vector index. -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +- **Speed downturn** - some engines struggle to keep high RPS, it might be related to the requirement of building a filtering mask for the dataset, as described above. -var client = new QdrantClient("localhost", 6334); +- **Accuracy collapse** - some engines are loosing accuracy dramatically under some filters. It is related to the fact that the HNSW graph becomes disconnected, and the search becomes unreliable. -var filter = MatchKeyword("city", "london"); +Qdrant avoids all these problems and also benefits from the speed boost, as it implements an advanced [query planning strategy](/documentation/search/#query-planning). 
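+To make the scenario concrete, the kind of request these filtered datasets exercise looks roughly like the following with the Qdrant Python client; the collection name, payload field, and query vector are placeholders rather than the benchmark's actual code:
+
+```python
+from qdrant_client import QdrantClient, models
+
+client = QdrantClient(url="http://localhost:6333")  # placeholder instance
+
+# An ANN query constrained by a payload condition: the engine has to combine the
+# vector index with the payload index instead of filtering purely before or after.
+hits = client.query_points(
+    collection_name="<collection-name>",
+    query=[0.2, 0.1, 0.9, 0.7],  # placeholder query vector
+    query_filter=models.Filter(
+        must=[models.FieldCondition(key="color", match=models.MatchValue(value="red"))]
+    ),
+    limit=10,
+)
+```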
-await client.QueryBatchAsync( - collectionName: "{collection_name}", - queries: - [\ - new QueryPoints()\ - {\ - CollectionName = "{collection_name}",\ - Query = new RecommendInput {\ - Positive = { 100, 231 },\ - Negative = { 718 },\ - },\ - Limit = 3,\ - Filter = filter,\ - },\ - new QueryPoints()\ - {\ - CollectionName = "{collection_name}",\ - Query = new RecommendInput {\ - Positive = { 200, 67 },\ - Negative = { 300 },\ - },\ - Limit = 3,\ - Filter = filter,\ - }\ - ] -); + -``` +<|page-408-lllmstxt|> +# Benchmarks F.A.Q. -```go -import ( - "context" +## Are we biased? - "github.com/qdrant/go-client/qdrant" -) +Probably, yes. Even if we try to be objective, we are not experts in using all the existing vector databases. +We build Qdrant and know the most about it. +Due to that, we could have missed some important tweaks in different vector search engines. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +However, we tried our best, kept scrolling the docs up and down, experimented with combinations of different configurations, and gave all of them an equal chance to stand out. If you believe you can do it better than us, our **benchmarks are fully [open-sourced](https://github.com/qdrant/vector-db-benchmark), and contributions are welcome**! -filter := qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("city", "London"), - }, -} -client.QueryBatch(context.Background(), &qdrant.QueryBatchPoints{ - CollectionName: "{collection_name}", - QueryPoints: []*qdrant.QueryPoints{ - { - CollectionName: "{collection_name}", - Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ - Positive: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - qdrant.NewVectorInputID(qdrant.NewIDNum(231)), - }, - Negative: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - }, - ), - Filter: &filter, - }, - { - CollectionName: "{collection_name}", - Query: qdrant.NewQueryRecommend(&qdrant.RecommendInput{ - Positive: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(200)), - qdrant.NewVectorInputID(qdrant.NewIDNum(67)), - }, - Negative: []*qdrant.VectorInput{ - qdrant.NewVectorInputID(qdrant.NewIDNum(300)), - }, - }, - ), - Filter: &filter, - }, - }, -}, -) -``` +## What do we measure? -The result of this API contains one array per recommendation requests. +There are several factors considered while deciding on which database to use. +Of course, some of them support a different subset of functionalities, and those might be a key factor to make the decision. +But in general, we all care about the search precision, speed, and resources required to achieve it. -```json -{ - "result": [\ - [\ - { "id": 10, "score": 0.81 },\ - { "id": 14, "score": 0.75 },\ - { "id": 11, "score": 0.73 }\ - ],\ - [\ - { "id": 1, "score": 0.92 },\ - { "id": 3, "score": 0.89 },\ - { "id": 9, "score": 0.75 }\ - ]\ - ], - "status": "ok", - "time": 0.001 -} +There is one important thing - **the speed of the vector databases should to be compared only if they achieve the same precision**. Otherwise, they could maximize the speed factors by providing inaccurate results, which everybody would rather avoid. Thus, our benchmark results are compared only at a specific search precision threshold. -``` +## How we select hardware? + +In our experiments, we are not focusing on the absolute values of the metrics but rather on a relative comparison of different engines. 
+What is important is the fact we used the same machine for all the tests. +It was just wiped off between launching different engines. -## [Anchor](https://qdrant.tech/documentation/concepts/explore/\#discovery-api) Discovery API +We selected an average machine, which you can easily rent from almost any cloud provider. No extra quota or custom configuration is required. -_Available as of v1.7_ -REST API Schema definition available [here](https://api.qdrant.tech/api-reference/search/discover-points) +## Why you are not comparing with FAISS or Annoy? -In this API, Qdrant introduces the concept of `context`, which is used for splitting the space. Context is a set of positive-negative pairs, and each pair divides the space into positive and negative zones. In that mode, the search operation prefers points based on how many positive zones they belong to (or how much they avoid negative zones). +Libraries like FAISS provide a great tool to do experiments with vector search. But they are far away from real usage in production environments. +If you are using FAISS in production, in the best case, you never need to update it in real-time. In the worst case, you have to create your custom wrapper around it to support CRUD, high availability, horizontal scalability, concurrent access, and so on. -The interface for providing context is similar to the recommendation API (ids or raw vectors). Still, in this case, they need to be provided in the form of positive-negative pairs. +Some vector search engines even use FAISS under the hood, but a search engine is much more than just an indexing algorithm. -Discovery API lets you do two new types of search: +We do, however, use the same benchmark datasets as the famous [ann-benchmarks project](https://github.com/erikbern/ann-benchmarks), so you can align your expectations for any practical reasons. -- **Discovery search**: Uses the context (the pairs of positive-negative vectors) and a target to return the points more similar to the target, but constrained by the context. -- **Context search**: Using only the context pairs, get the points that live in the best zone, where loss is minimized -The way positive and negative examples should be arranged in the context pairs is completely up to you. So you can have the flexibility of trying out different permutation techniques based on your model and data. -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#discovery-search) Discovery search +### Why we decided to test with the Python client -This type of search works specially well for combining multimodal, vector-constrained searches. Qdrant already has extensive support for filters, which constrain the search based on its payload, but using discovery search, you can also constrain the vector space in which the search is performed. +There is no consensus when it comes to the best technology to run benchmarks. You’re free to choose Go, Java or Rust-based systems. But there are two main reasons for us to use Python for this: +1. While generating embeddings you're most likely going to use Python and python based ML frameworks. +2. Based on GitHub stars, python clients are one of the most popular clients across all the engines. + +From the user’s perspective, the crucial thing is the latency perceived while using a specific library - in most cases a Python client. +Nobody can and even should redefine the whole technology stack, just because of using a specific search tool. 
+That’s why we decided to focus primarily on official Python libraries, provided by the database authors. +Those may use some different protocols under the hood, but at the end of the day, we do not care how the data is transferred, as long as it ends up in the target location. -![Discovery search](https://qdrant.tech/docs/discovery-search.png) -The formula for the discovery score can be expressed as: +## What about closed-source SaaS platforms? + +There are some vector databases available as SaaS only so that we couldn’t test them on the same machine as the rest of the systems. +That makes the comparison unfair. That’s why we purely focused on testing the Open Source vector databases, so everybody may reproduce the benchmarks easily. + +This is not the final list, and we’ll continue benchmarking as many different engines as possible. + +## How to reproduce the benchmark? + +The source code is available on [Github](https://github.com/qdrant/vector-db-benchmark) and has a `README.md` file describing the process of running the benchmark for a specific engine. + +## How to contribute? -rank(v+,v−)={1,s(v+)≄s(v−)−1,s(v+) +# Bug Bounty Program Overview +We prioritize user trust and adhere to the highest privacy and security standards. This is why we actively invite security experts to identify vulnerabilities and commit to collaborating with them to resolve issues swiftly and effectively. +Qdrant values the security research community and supports the responsible disclosure of vulnerabilities in our products and services. Through our bug bounty program, we reward researchers who help enhance the security of our platform. + +## Responsible Disclosure Program Rules +- Include detailed, reproducible steps in your reports. We will not reward issues you cannot reproduce. +- Submit one vulnerability per report unless you need to chain multiple vulnerabilities to demonstrate impact. +- In cases of duplicate reports, we will reward only the first reproducible report. +- We will consider vulnerabilities stemming from the same root cause as a single issue and award only one bounty. +- We strictly prohibit social engineering attacks (e.g., phishing, vishing, smishing). +- Interact only with accounts you own or have explicit permission to access. Do not test using Qdrant employee accounts or internal tools. +- Before you run automated scanners, please check with us first. 
-httppythontypescriptrustjavacsharpgo +### In Scope +The Bug Bounty program covers the following areas: +- *.cloud.qdrant.io Qdrant Cloud Application +- [qdrant.tech](http://qdrant.tech/) Website -```http -POST /collections/{collection_name}/points/query -{ - "query": { - "discover": { - "target": [0.2, 0.1, 0.9, 0.7], - "context": [\ - {\ - "positive": 100,\ - "negative": 718\ - },\ - {\ - "positive": 200,\ - "negative": 300\ - }\ - ] - } - }, - "limit": 10 -} +In most cases, we will only reward the following types of vulnerabilities: +- Arbitrary code execution and OS Command Injection +- Stored Cross-Site Scripting (Stored XSS) +- SQL injection +- File Upload +- Authentication bypass and privilege escalation (authentication / authorization circumvention) +- Significant Sensitive Data Exposure +- Server-Side Request Forgery (SSRF) +- Critical Business Logic Flaws -``` +### Out of Scope +We always exclude the following areas: +- Findings related to intended functionality or accepted business risks +- Qdrant support system on https://support.qdrant.io +- Third-party applications or websites +- Staging or test environments +- Social engineering attacks +- DoS/DDoS attacks +- User/email enumeration +- Brute-force attacks +- Physical security issues +- Reports from automated tools or scanners +- Generic information disclosure, such as the `Server` or `X-Powered-By` headers +- Email security: DMARC, DKIM, SPF, etc. +- Spamming that rate limiting techniques can prevent +- Missing DNSSEC +- CSRF for Login, Logout and Signup pages +- Cross-site scripting that requires full control of a http header, such as Referer, Host etc. +- Clickjacking and Tabnabbing -```python -from qdrant_client import QdrantClient, models +## Severity Levels and Rewards +- We assess reported bugs based on their risk and other relevant factors; our response may take some time. +- We tend to award higher rewards for submissions that include detailed remediation steps or recommendations. +- We determine bounty amounts based on multiple factors, including the vulnerability’s impact, the ease of exploitation, and the quality of the report. Please note that we may not award a bounty for very low-risk issues. +- We use the CVSS v4 framework to evaluate the criticality of issues and ensure a consistent risk assessment. +- We aim to reward similar vulnerabilities with comparable compensation; however, we also consider factors such as the time and effort required to discover the issue. Keep in mind that we may not match previous compensations for future reports. -client = QdrantClient(url="http://localhost:6333") +## Disclosure Policy +Contact us at our [Bug Bounty Program Support Portal](https://get.support.qdrant.io/servicedesk/customer/portal/35) to report vulnerabilities. Our security team will provide an initial response within 5 business days and triage the issue within 5-7 business days. We vary fix implementation timelines based on severity, and we process bounty payments after verifying the fix. -discover_queries = [\ - models.QueryRequest(\ - query=models.DiscoverQuery(\ - discover=models.DiscoverInput(\ - target=[0.2, 0.1, 0.9, 0.7],\ - context=[\ - models.ContextPair(\ - positive=100,\ - negative=718,\ - ),\ - models.ContextPair(\ - positive=200,\ - negative=300,\ - ),\ - ],\ - )\ - ),\ - limit=10,\ - ),\ -] +Follow these guidelines when disclosing vulnerabilities to us: +- Report any potential security vulnerabilities immediately upon discovery, as we commit to resolving issues swiftly. 
+- Maintain strict confidentiality regarding discovered vulnerabilities. Obtain explicit authorization from the Qdrant security team before publicly disclosing any vulnerabilities. +- Exercise caution to prevent data loss, privacy breaches, or service disruptions while conducting security research. +- Limit testing to your own accounts or those for which you have received explicit permission. Report any accidental access to unauthorized data immediately. +- **Safe Harbor:** We support ethical security research and promise not to initiate legal action against researchers who report vulnerabilities in good faith and comply with this disclosure policy. Ensure that your testing remains non-disruptive and respects the outlined guidelines so you qualify for Safe Harbor protections. -client.query_batch_points( - collection_name="{collection_name}", requests=discover_queries -) +### Contact +For questions about the program or to report security issues, contact: +- Web Portal: [Bug Bounty Program Support Portal](https://get.support.qdrant.io/servicedesk/customer/portal/35) +- Email: [security@qdrant.com](mailto:security@qdrant.com) +- PGP Key Fingerprint: [07E3 6646 E0D0 A3BF 0AFC B302 26C5 016B 97EB 804B](/misc/qdrant-security-public-key.asc) -``` +<|page-410-lllmstxt|> +Icons made by [srip](https://www.flaticon.com/authors/srip) from [flaticon.com](https://www.flaticon.com/) -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Email Marketing Vector created by [storyset](https://de.freepik.com/vektoren/geschaeft) from [freepik.com](https://www.freepik.com/) -const client = new QdrantClient({ host: "localhost", port: 6333 }); +<|page-411-lllmstxt|> +# Impressum -client.query("{collection_name}", { - query: { - discover: { - target: [0.2, 0.1, 0.9, 0.7], - context: [\ - {\ - positive: 100,\ - negative: 718,\ - },\ - {\ - positive: 200,\ - negative: 300,\ - },\ - ], - } - }, - limit: 10, -}); +Angaben gemĂ€ĂŸ § 5 TMG -``` +Qdrant Solutions GmbH -```rust -use qdrant_client::qdrant::{ContextInputBuilder, DiscoverInputBuilder, QueryPointsBuilder}; -use qdrant_client::Qdrant; +Chausseestraße 86 +10115 Berlin -client - .query( - QueryPointsBuilder::new("{collection_name}").query( - DiscoverInputBuilder::new( - vec![0.2, 0.1, 0.9, 0.7], - ContextInputBuilder::default() - .add_pair(100, 718) - .add_pair(200, 300), - ) - .build(), - ), - ) - .await?; +#### Vertreten durch: -``` +AndrĂ© Zayarni -```java -import java.util.List; +#### Kontakt: -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.ContextInput; -import io.qdrant.client.grpc.Points.ContextInputPair; -import io.qdrant.client.grpc.Points.DiscoverInput; -import io.qdrant.client.grpc.Points.QueryPoints; +Telefon: +49 30 120 201 01 -import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.discover; +E-Mail: info@qdrant.com -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +#### Registereintrag: -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(discover(DiscoverInput.newBuilder() - .setTarget(vectorInput(0.2f, 0.1f, 0.9f, 0.7f)) - .setContext(ContextInput.newBuilder() - .addAllPairs(List.of( - ContextInputPair.newBuilder() - .setPositive(vectorInput(100)) - .setNegative(vectorInput(718)) - .build(), - ContextInputPair.newBuilder() - .setPositive(vectorInput(200)) - .setNegative(vectorInput(300)) - .build())) - 
.build()) - .build())) - .setLimit(10) - .build()).get(); +Eintragung im Registergericht: Berlin Charlottenburg +Registernummer: HRB 235335 B -``` -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +#### Umsatzsteuer-ID: +Umsatzsteuer-Identifikationsnummer gemĂ€ĂŸ §27a Umsatzsteuergesetz: DE347779324 -var client = new QdrantClient("localhost", 6334); -await client.QueryAsync( - collectionName: "{collection_name}", - query: new DiscoverInput { - Target = new float[] { 0.2f, 0.1f, 0.9f, 0.7f }, - Context = new ContextInput { - Pairs = { - new ContextInputPair { - Positive = 100, - Negative = 718 - }, - new ContextInputPair { - Positive = 200, - Negative = 300 - }, - } - }, - }, - limit: 10 -); +### Verantwortlich fĂŒr den Inhalt nach § 55 Abs. 2 RStV: -``` +AndrĂ© Zayarni +Chausseestraße 86 +10115 Berlin -```go -import ( - "context" +## Haftungsausschluss: - "github.com/qdrant/go-client/qdrant" -) +### Haftung fĂŒr Inhalte -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Die Inhalte unserer Seiten wurden mit grĂ¶ĂŸter Sorgfalt erstellt. FĂŒr die Richtigkeit, VollstĂ€ndigkeit und AktualitĂ€t der Inhalte können wir jedoch keine GewĂ€hr ĂŒbernehmen. Als Diensteanbieter sind wir gemĂ€ĂŸ § 7 Abs.1 TMG fĂŒr eigene Inhalte auf diesen Seiten nach den allgemeinen Gesetzen verantwortlich. Nach §§ 8 bis 10 TMG sind wir als Diensteanbieter jedoch nicht verpflichtet, ĂŒbermittelte oder gespeicherte fremde Informationen zu ĂŒberwachen oder nach UmstĂ€nden zu forschen, die auf eine rechtswidrige TĂ€tigkeit hinweisen. Verpflichtungen zur Entfernung oder Sperrung der Nutzung von Informationen nach den allgemeinen Gesetzen bleiben hiervon unberĂŒhrt. Eine diesbezĂŒgliche Haftung ist jedoch erst ab dem Zeitpunkt der Kenntnis einer konkreten Rechtsverletzung möglich. Bei Bekanntwerden von entsprechenden Rechtsverletzungen werden wir diese Inhalte umgehend entfernen. -client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryDiscover(&qdrant.DiscoverInput{ - Target: qdrant.NewVectorInput(0.2, 0.1, 0.9, 0.7), - Context: &qdrant.ContextInput{ - Pairs: []*qdrant.ContextInputPair{ - { - Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - { - Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(200)), - Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(300)), - }, - }, - }, - }), -}) +### Haftung fĂŒr Links -``` +Unser Angebot enthĂ€lt Links zu externen Webseiten Dritter, auf deren Inhalte wir keinen Einfluss haben. Deshalb können wir fĂŒr diese fremden Inhalte auch keine GewĂ€hr ĂŒbernehmen. FĂŒr die Inhalte der verlinkten Seiten ist stets der jeweilige Anbieter oder Betreiber der Seiten verantwortlich. Die verlinkten Seiten wurden zum Zeitpunkt der Verlinkung auf mögliche RechtsverstĂ¶ĂŸe ĂŒberprĂŒft. Rechtswidrige Inhalte waren zum Zeitpunkt der Verlinkung nicht erkennbar. Eine permanente inhaltliche Kontrolle der verlinkten Seiten ist jedoch ohne konkrete Anhaltspunkte einer Rechtsverletzung nicht zumutbar. Bei Bekanntwerden von Rechtsverletzungen werden wir derartige Links umgehend entfernen. -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#context-search) Context search +### Datenschutz -Conversely, in the absence of a target, a rigid integer-by-integer function doesn’t provide much guidance for the search when utilizing a proximity graph like HNSW. 
Instead, context search employs a function derived from the [triplet-loss](https://qdrant.tech/articles/triplet-loss/) concept, which is usually applied during model training. For context search, this function is adapted to steer the search towards areas with fewer negative examples. +Die Nutzung unserer Webseite ist in der Regel ohne Angabe personenbezogener Daten möglich. Soweit auf unseren Seiten personenbezogene Daten (beispielsweise Name, Anschrift oder eMail-Adressen) erhoben werden, erfolgt dies, soweit möglich, stets auf freiwilliger Basis. Diese Daten werden ohne Ihre ausdrĂŒckliche Zustimmung nicht an Dritte weitergegeben. +Wir weisen darauf hin, dass die DatenĂŒbertragung im Internet (z.B. bei der Kommunikation per E-Mail) SicherheitslĂŒcken aufweisen kann. Ein lĂŒckenloser Schutz der Daten vor dem Zugriff durch Dritte ist nicht möglich. +Der Nutzung von im Rahmen der Impressumspflicht veröffentlichten Kontaktdaten durch Dritte zur Übersendung von nicht ausdrĂŒcklich angeforderter Werbung und Informationsmaterialien wird hiermit ausdrĂŒcklich widersprochen. Die Betreiber der Seiten behalten sich ausdrĂŒcklich rechtliche Schritte im Falle der unverlangten Zusendung von Werbeinformationen, etwa durch Spam-Mails, vor. -![Context search](https://qdrant.tech/docs/context-search.png) -We can directly associate the score function to a loss function, where 0.0 is the maximum score a point can have, which means it is only in positive areas. As soon as a point exists closer to a negative example, its loss will simply be the difference of the positive and negative similarities. +### Google Analytics -context score=∑min(s(vi+)−s(vi−),0.0) +Diese Website benutzt Google Analytics, einen Webanalysedienst der Google Inc. (''Google''). Google Analytics verwendet sog. ''Cookies'', Textdateien, die auf Ihrem Computer gespeichert werden und die eine Analyse der Benutzung der Website durch Sie ermöglicht. Die durch den Cookie erzeugten Informationen ĂŒber Ihre Benutzung dieser Website (einschließlich Ihrer IP-Adresse) wird an einen Server von Google in den USA ĂŒbertragen und dort gespeichert. Google wird diese Informationen benutzen, um Ihre Nutzung der Website auszuwerten, um Reports ĂŒber die WebsiteaktivitĂ€ten fĂŒr die Websitebetreiber zusammenzustellen und um weitere mit der Websitenutzung und der Internetnutzung verbundene Dienstleistungen zu erbringen. Auch wird Google diese Informationen gegebenenfalls an Dritte ĂŒbertragen, sofern dies gesetzlich vorgeschrieben oder soweit Dritte diese Daten im Auftrag von Google verarbeiten. Google wird in keinem Fall Ihre IP-Adresse mit anderen Daten der Google in Verbindung bringen. Sie können die Installation der Cookies durch eine entsprechende Einstellung Ihrer Browser Software verhindern; wir weisen Sie jedoch darauf hin, dass Sie in diesem Fall gegebenenfalls nicht sĂ€mtliche Funktionen dieser Website voll umfĂ€nglich nutzen können. Durch die Nutzung dieser Website erklĂ€ren Sie sich mit der Bearbeitung der ĂŒber Sie erhobenen Daten durch Google in der zuvor beschriebenen Art und Weise und zu dem zuvor benannten Zweck einverstanden. -Where vi+ and vi− are the positive and negative examples of each pair, and s(v) is the similarity function. +<|page-412-lllmstxt|> +# Privacy Policy -Using this kind of search, you can expect the output to not necessarily be around a single point, but rather, to be any point that isn’t closer to a negative example, which creates a constrained diverse result. 
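To make the loss-style score above concrete, here is a minimal sketch (not the engine's implementation) of how the context score could be computed for a single candidate point, assuming a similarity function `sim` and the context pairs given as raw positive/negative vectors:

```python
# A minimal sketch, not Qdrant's internal implementation: compute the
# context score for one candidate, following the formula
#   context score = sum_i min(s(v_i+) - s(v_i-), 0.0)
# `sim` and the pair representation here are illustrative assumptions.
from typing import Callable, Sequence

def context_score(
    candidate: Sequence[float],
    pairs: Sequence[tuple[Sequence[float], Sequence[float]]],
    sim: Callable[[Sequence[float], Sequence[float]], float],
) -> float:
    # Each pair contributes at most 0.0; being closer to the negative
    # example than to the positive one adds a penalty equal to the difference.
    return sum(
        min(sim(candidate, positive) - sim(candidate, negative), 0.0)
        for positive, negative in pairs
    )
```

A point that lies only in positive zones keeps the maximum score of 0.0, which is why the result set tends to be diverse rather than clustered around one target.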
So, even when the API is not called [`recommend`](https://qdrant.tech/documentation/concepts/explore/#recommendation-api), recommendation systems can also use this approach and adapt it for their specific use-cases. +## **1\. Introduction** -Example: +In the following, we provide information about the collection of personal data when using: -httppythontypescriptrustjavacsharpgo +* our website ([https://qdrant.tech](https://qdrant.tech)) +* our Cloud Panel (https://cloud.qdrant.io/) +* Qdrant’s social media profiles. -```http -POST /collections/{collection_name}/points/query -{ - "query": { - "context": [\ - {\ - "positive": 100,\ - "negative": 718\ - },\ - {\ - "positive": 200,\ - "negative": 300\ - }\ - ] - }, - "limit": 10 -} +Personal data is any data that can be related to a specific natural person, such as their name or IP address. -``` +### **1.1. Contact details** -```python -from qdrant_client import QdrantClient, models +The controller within the meaning of Art. 4 para. 7 EU General Data Protection Regulation (GDPR) is Qdrant Solutions GmbH, Chausseestraße 86, 10115 Berlin, Germany, email: info@qdrant.com. We are legally represented by AndrĂ© Zayarni. -client = QdrantClient(url="http://localhost:6333") +Our data protection officer can be reached via heyData GmbH, SchĂŒtzenstraße 5, 10117 Berlin, [www.heydata.eu](https://www.heydata.eu), E-Mail: datenschutz@heydata.eu. -discover_queries = [\ - models.QueryRequest(\ - query=models.ContextQuery(\ - context=[\ - models.ContextPair(\ - positive=100,\ - negative=718,\ - ),\ - models.ContextPair(\ - positive=200,\ - negative=300,\ - ),\ - ],\ - ),\ - limit=10,\ - ),\ -] +### **1.2. Scope of data processing, processing purposes and legal bases** -client.query_batch_points( - collection_name="{collection_name}", requests=discover_queries -) +We detail the scope of data processing, processing purposes and legal bases below. In principle, the following come into consideration as the legal basis for data processing: -``` +* Art. 6 para. 1 s. 1 lit. a GDPR serves as our legal basis for processing operations for which we obtain consent. +* Art. 6 para. 1 s. 1 lit. b GDPR is the legal basis insofar as the processing of personal data is necessary for the performance of a contract, e.g. if a site visitor purchases a product from us or we perform a service for him. This legal basis also applies to processing that is necessary for pre-contractual measures, such as in the case of inquiries about our products or services. +* Art. 6 para. 1 s. 1 lit. c GDPR applies if we fulfill a legal obligation by processing personal data, as may be the case, for example, in tax law. +* Art. 6 para. 1 s. 1 lit. f GDPR serves as the legal basis when we can rely on legitimate interests to process personal data, e.g. for cookies that are necessary for the technical operation of our website. -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +### **1.3. Data processing outside the EEA** -const client = new QdrantClient({ host: "localhost", port: 6333 }); +Insofar as we transfer data to service providers or other third parties outside the EEA, the security of the data during the transfer is guaranteed by adequacy decisions of the EU Commission, insofar as they exist (e.g. for Great Britain, Canada and Israel) (Art. 45 para. 3 GDPR). 
-client.query("{collection_name}", { - query: { - context: [\ - {\ - positive: 100,\ - negative: 718,\ - },\ - {\ - positive: 200,\ - negative: 300,\ - },\ - ] - }, - limit: 10, -}); +In the case of data transfer to service providers in the USA, the legal basis for the data transfer is an adequacy decision of the EU Commission if the service provider has also certified itself under the EU US Data Privacy Framework. -``` +In other cases (e.g. if no adequacy decision exists), the legal basis for the data transfer are usually, i.e. unless we indicate otherwise, standard contractual clauses. These are a set of rules adopted by the EU Commission and are part of the contract with the respective third party. According to Art. 46 para. 2 lit. b GDPR, they ensure the security of the data transfer. Many of the providers have given contractual guarantees that go beyond the standard contractual clauses to protect the data. These include, for example, guarantees regarding the encryption of data or regarding an obligation on the part of the third party to notify data subjects if law enforcement agencies wish to access the respective data. -```rust -use qdrant_client::qdrant::{ContextInputBuilder, QueryPointsBuilder}; -use qdrant_client::Qdrant; +### **1.4. Storage duration** -let client = Qdrant::from_url("http://localhost:6334").build()?; +Unless expressly stated in this privacy policy, the data stored by us will be deleted as soon as they are no longer required for their intended purpose and no legal obligations to retain data conflict with the deletion. If the data are not deleted because they are required for other and legally permissible purposes, their processing is restricted, i.e. the data are blocked and not processed for other purposes. This applies, for example, to data that must be retained for commercial or tax law reasons. -client - .query( - QueryPointsBuilder::new("{collection_name}").query( - ContextInputBuilder::default() - .add_pair(100, 718) - .add_pair(200, 300) - .build(), - ), - ) - .await?; +### **1.5. Rights of data subjects** -``` +Data subjects have the following rights against us with regard to their personal data: -```java -import java.util.List; +* Right of access, +* Right to correction or deletion, +* Right to limit processing, +* Right to object to the processing, +* Right to data transferability, +* Right to revoke a given consent at any time. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.ContextInput; -import io.qdrant.client.grpc.Points.ContextInputPair; -import io.qdrant.client.grpc.Points.QueryPoints; +Data subjects also have the right to complain to a data protection supervisory authority about the processing of their personal data. Contact details of the data protection supervisory authorities are available at https://www.bfdi.bund.de/EN/Service/Anschriften/Laender/Laender-node.html. -import static io.qdrant.client.VectorInputFactory.vectorInput; -import static io.qdrant.client.QueryFactory.context; +### **1.6. Obligation to provide data** -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +Within the scope of the business or other relationship, customers, prospective customers or third parties need to provide us with personal data that is necessary for the establishment, execution and termination of a business or other relationship or that we are legally obliged to collect. 
Without this data, we will generally have to refuse to conclude the contract or to provide a service or will no longer be able to perform an existing contract or other relationship. -client.queryAsync(QueryPoints.newBuilder() - .setCollectionName("{collection_name}") - .setQuery(context(ContextInput.newBuilder() - .addAllPairs(List.of( - ContextInputPair.newBuilder() - .setPositive(vectorInput(100)) - .setNegative(vectorInput(718)) - .build(), - ContextInputPair.newBuilder() - .setPositive(vectorInput(200)) - .setNegative(vectorInput(300)) - .build())) - .build())) - .setLimit(10) - .build()).get(); +Mandatory data are marked as such. -``` +### **1.7. No automatic decision making in individual cases** -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; +As a matter of principle, we do not use a fully automated decision-making process in accordance with article 22 GDPR to establish and implement the business or other relationship. Should we use these procedures in individual cases, we will inform of this separately if this is required by law. -var client = new QdrantClient("localhost", 6334); +### **1.8. Making contact** -await client.QueryAsync( - collectionName: "{collection_name}", - query: new ContextInput { - Pairs = { - new ContextInputPair { - Positive = 100, - Negative = 718 - }, - new ContextInputPair { - Positive = 200, - Negative = 300 - }, - } - }, - limit: 10 -); +When contacting us, e.g. by e-mail or telephone, the data provided to us (e.g. names and e-mail addresses) will be stored by us in order to answer questions. The legal basis for the processing is our legitimate interest (Art. 6 para. 1 s. 1 lit. f GDPR) to answer inquiries directed to us. We delete the data accruing in this context after the storage is no longer necessary or restrict the processing if there are legal retention obligations. -``` +### **1.9. Customer surveys** -```go -import ( - "context" +From time to time, we conduct customer surveys to get to know our customers and their wishes better. In doing so, we collect the data requested in each case. It is our legitimate interest to get to know our customers and their wishes better, so that the legal basis for the associated data processing is Art. 6 para. 1 s. 1 lit f GDPR. We delete the data accruing in this context after the storage is no longer necessary, or restrict the processing if there are legal retention obligations. - "github.com/qdrant/go-client/qdrant" -) +### **1.10. Educational Resources** -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +Occasionally, we offer educational resources via our website or in other ways, for example, in the form of webinars, livestreams, as well as downloadable content such as ebooks and white papers. We process the data requested in these cases in order to perform the webinar or delivery of the requested resources. Afterwards, we delete the data accruing in this context after the storage is no longer necessary or restrict the processing if there are legal retention obligations. It is our legitimate interest to offer educational resources to attract customers or to interact with our existing customers. The legal basis for data processing is Art. 6 para. 1 s. 1 lit. f GDPR. 
-client.Query(context.Background(), &qdrant.QueryPoints{ - CollectionName: "{collection_name}", - Query: qdrant.NewQueryContext(&qdrant.ContextInput{ - Pairs: []*qdrant.ContextInputPair{ - { - Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(100)), - Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(718)), - }, - { - Positive: qdrant.NewVectorInputID(qdrant.NewIDNum(200)), - Negative: qdrant.NewVectorInputID(qdrant.NewIDNum(300)), - }, - }, - }), -}) +We also offer materials to download. In each of the cases, we process the data requested in order to assess results of the survey, provide access to the webinar or livestream, or provide the guide to download. -``` +If a consent is asked, then the legal basis for the processing is Art. 6 para. 1 s. 1 lit. a GDPR. The processing is based on consent. Data subjects may revoke their consent at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. -## [Anchor](https://qdrant.tech/documentation/concepts/explore/\#distance-matrix) Distance Matrix +## **2\. Newsletter** -_Available as of v1.12.0_ +We reserve the right to inform customers who have already used services from us or purchased goods from time to time by e-mail or other means about our offers, if they have not objected to this. The legal basis for this data processing is Art. 6 para. 1 s. 1 lit. f GDPR. Our legitimate interest is to conduct direct advertising (recital 47 GDPR). Customers can object to the use of their e-mail address for advertising purposes at any time without incurring additional costs, for example via the link at the end of each e-mail or by sending an e-mail to our above-mentioned e-mail address. -The distance matrix API allows to calculate the distance between sampled pairs of vectors and to return the result as a sparse matrix. +Interested parties have the option to subscribe to a free newsletter. We process the data provided during registration exclusively for sending the newsletter. Subscription takes place by selecting the corresponding field on our website, by ticking the corresponding field in a paper document or by another clear action, whereby interested parties declare their consent to the processing of their data, so that the legal basis is Art. 6 para. p. 1 lit. a GDPR. Consent can be revoked at any time, e.g. by clicking the corresponding link in the newsletter or notifying our e-mail address given above. The processing of the data until revocation remains lawful even in the event of revocation. -Such API enables new data exploration use cases such as clustering similar vectors, visualization of connections or dimension reduction. +Based on the consent of the recipients (Art. 6 para. 1 s. 1 lit. a GDPR), we also measure the opening and click-through rate of our newsletters to understand what is relevant for our audience. -The API input request consists of the following parameters: +We send newsletters with the tool HubSpot of the provider HubSpot, Inc., 25 1st Street Cambridge, MA 0214, USA. The provider processes content, usage, meta/communication data and contact data in the process in the EU. Further information is available in the provider's privacy policy at https://legal.hubspot.com/privacy-policy. 
-- `sample`: the number of vectors to sample -- `limit`: the number of scores to return per sample -- `filter`: the filter to apply to constraint the samples +We send product information, including upcoming maintenance windows, downtime notifications, product alerts such as if a cluster’s payment failed or is running out of resources, with the tool Mailjet of the provider Mailjet GmbH, Friedrichstraße 68, 10117 Berlin. The provider processes content, usage, meta/communication data and contact data in the process in the EU. Further information is available in the provider's privacy policy at [https://www.mailjet.com/privacy-policy/](https://www.mailjet.com/privacy-policy/). -Let’s have a look at a basic example with `sample=100`, `limit=10`: +We send product information, including upcoming maintenance windows, downtime notifications, product alerts such as if a cluster’s payment failed or is running out of resources, with the tool HubSpot of the provider HubSpot, Inc., 25 1st Street Cambridge, MA 0214, USA. The provider processes content, usage, meta/communication data and contact data in the process in the EU. Further information is available in the provider's privacy policy at [https://legal.hubspot.com/privacy-policy](https://legal.hubspot.com/privacy-policy). -The engine starts by selecting `100` random points from the collection, then for each of the selected points, it will compute the top `10` closest points **within** the samples. +## **3\. Data processing on our website** -This will results in a total of 1000 scores represented as a sparse matrix for efficient processing. +### **3.1. Notice for website visitors from Germany** -The distance matrix API offers two output formats to ease the integration with different tools. +Our website stores information in the terminal equipment of website visitors (e.g. cookies) or accesses information that is already stored in the terminal equipment (e.g. IP addresses). What information this is in detail can be found in the following sections. -### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#pairwise-format) Pairwise format +This storage and access is based on the following provisions: -Returns the distance matrix as a list of pairs of point `ids` with their respective score. +* Insofar as this storage or access is absolutely necessary for us to provide the service of our website expressly requested by website visitors (e.g., to carry out a chatbot used by the website visitor or to ensure the IT security of our website), it is carried out on the basis of Section 25 para. 2 no. 2 of the German Telecommunications Digital Services Data Protection Act (Telekommunikation-Digitale-Dienste-Datenschutzgesetz, "TDDDG"). +* Otherwise, this storage or access takes place on the basis of the website visitor's consent (Section 25 para. 1 TDDDG). -httppythontypescriptrustjavacsharpgo +The subsequent data processing is carried out in accordance with the following sections and on the basis of the provisions of the GDPR. -```http -POST /collections/{collection_name}/points/search/matrix/pairs -{ - "sample": 10, - "limit": 2, - "filter": { - "must": { - "key": "color", - "match": { "value": "red" } - } - } -} +### **3.2. Informative use of our website** -``` +During the informative use of the website, i.e. when site visitors do not separately transmit information to us, we collect the personal data that the browser transmits to our server in order to ensure the stability and security of our website. 
This is our legitimate interest, so that the legal basis is Art. 6 para. 1 s. 1 lit. f GDPR. -```python -from qdrant_client import QdrantClient, models +These data are: -client = QdrantClient(url="http://localhost:6333") +* IP address +* Date and time of the request +* Time zone difference to Greenwich Mean Time (GMT) +* Content of the request (specific page) +* Access status/HTTP status code +* Amount of data transferred in each case +* Website from which the request comes +* Browser +* Operating system and its interface +* Language and version of the browser software. -client.search_matrix_pairs( - collection_name="{collection_name}", - sample=10, - limit=2, - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="color", match=models.MatchValue(value="red")\ - ),\ - ] - ), -) +This data is also stored in log files. They are deleted when their storage is no longer necessary, at the latest after 14 days. -``` +### **3.3. Web hosting and provision of the website** -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +Our website is hosted by Netlify. The provider is Netlify, Inc., 44 Montgomery Street, Suite 300, San Francisco, California 94104, USA. In doing so, the provider processes the personal data transmitted via the website, e.g. content, usage, meta/communication data or contact data in the USA. Further information can be found in the provider's privacy policy at https://www.netlify.com/privacy/. -const client = new QdrantClient({ host: "localhost", port: 6333 }); +It is our legitimate interest to provide a website, so the legal basis of the described data processing is Art. 6 para. 1 s. 1 lit. f GDPR. -client.searchMatrixPairs("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, - sample: 10, - limit: 2, -}); +The legal basis of the transfer to a country outside the EEA are standard contractual clauses. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed by standard data protection clauses (Art. 46 para. 2 lit. c GDPR) adopted by the EU Commission in accordance with the examination procedure under Art. 93 para. 2 of the GDPR, which we have agreed to with the provider. -``` +We use the content delivery network Netlify for our website. The provider is Netlify, Inc., 44 Montgomery Street, Suite 300, San Francisco, California 94104, USA. The provider thereby processes the personal data transmitted via the website, e.g. content, usage, meta/communication data or contact data in the USA. Further information can be found in the provider's privacy policy at https://www.netlify.com/privacy/. -```rust -use qdrant_client::qdrant::{Condition, Filter, SearchMatrixPointsBuilder}; -use qdrant_client::Qdrant; +We have a legitimate interest in using sufficient storage and delivery capacity to ensure optimal data throughput even during large peak loads. Therefore, the legal basis of the described data processing is Art. 6 para. 1 s. 1 lit. f GDPR. -client - .search_matrix_pairs( - SearchMatrixPointsBuilder::new("collection_name") - .filter(Filter::must(vec![Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .sample(10) - .limit(2), - ) - .await?; +Legal basis of the transfer to a country outside the EEA are standard contractual clauses. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed by standard data protection clauses (Art. 46 para. 2 lit. 
c GDPR) adopted by the EU Commission in accordance with the examination procedure under Art. 93 para. 2 of the GDPR, which we have agreed to with the provider. -``` +#### 3.3.1. Cloud Providers: -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +Depending on the operating environment of our solution, we store information about the cloud provider used in the log files: -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.SearchMatrixPoints; +- **Managed Cloud:** If our solution is operated in a cloud environment managed by us, we store which cloud provider (e.g., AWS, GCP, Azure) the cluster is running on. +- **Hybrid Cloud:** In a hybrid cloud environment on the customer's infrastructure, we also store which cloud provider is detected (e.g., AWS, GCP, Azure, DigitalOcean, and others). +- **Private Cloud:** If the operation takes place in a customer's private cloud, we do not store any data in this context regarding the cloud provider. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +### **3.4. Contact form** -client - .searchMatrixPairsAsync( - Points.SearchMatrixPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) - .setSample(10) - .setLimit(2) - .build()) - .get(); +When contacting us via the contact forms on our website, we store the data requested there and the content of the message. +The legal basis for the processing is our legitimate interest in answering inquiries directed to us. The legal basis for the processing is therefore Art. 6 para. 1 s. 1 lit. f GDPR. +We delete the data accruing in this context after the storage is no longer necessary or restrict the processing if there are legal retention obligations. -``` +### **3.5. Vacant positions** -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +We publish positions that are vacant in our company on our website, on pages linked to the website or on third-party websites. -var client = new QdrantClient("localhost", 6334); +The processing of the data provided as part of the application is carried out for the purpose of implementing the application process. Insofar as this is necessary for our decision to establish an employment relationship, the legal basis is Art. 88 para. GDPR in conjunction with Sec. 26 para. 1 of the German Data Protection Act (Bundesdatenschutzgesetz). We have marked the data required to carry out the application process accordingly or refer to them. If applicants do not provide this data, we cannot process the application. +Further data is voluntary and not required for an application. If applicants provide further information, the basis is their consent (Art. 6 para. 1 s. 1 lit. a GDPR). -await client.SearchMatrixPairsAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("color", "red"), - sample: 10, - limit: 2 -); +We ask applicants to refrain from providing information on political opinions, religious beliefs and similarly sensitive data in their CV and cover letter. They are not required for an application. If applicants nevertheless provide such information, we cannot prevent their processing as part of the processing of the resume or cover letter. Their processing is then also based on the consent of the applicants (Art. 9 para. 2 lit. a GDPR). 
-``` +Finally, we process the applicants' data for further application procedures if they have given us their consent to do so. In this case, the legal basis is Art. 6 para. 1 s. 1 lit. a GDPR. -```go -import ( - "context" +We pass on the applicants' data to the responsible employees in the HR department, to our data processors in the area of recruiting and to the employees otherwise involved in the application process. - "github.com/qdrant/go-client/qdrant" -) +If we enter into an employment relationship with the applicant following the application process, we delete the data only after the employment relationship has ended. Otherwise, we delete the data no later than six months after rejecting an applicant. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +If applicants have given us their consent to use their data for further application procedures as well, we will not delete their data until one year after receiving the application. -sample := uint64(10) -limit := uint64(2) -res, err := client.SearchMatrixPairs(ctx, &qdrant.SearchMatrixPoints{ - CollectionName: "{collection_name}", - Sample: &sample, - Limit: &limit, - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, -}) +### **3.6. Customer account** -``` +Site visitors can open a customer account on our website. We process the data requested in this context based on the consent of the site visitor. Legal basis for the processing is Art. 6 para. 1 s. 1 lit. a GDPR. -Returns +The consent may be revoked at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. If the consent is revoked we will delete the data insofar as we are not obliged or have a right to retain it further. -```json -{ - "result": { - "pairs": [\ - {"a": 1, "b": 3, "score": 1.4063001},\ - {"a": 1, "b": 4, "score": 1.2531},\ - {"a": 2, "b": 1, "score": 1.1550001},\ - {"a": 2, "b": 8, "score": 1.1359},\ - {"a": 3, "b": 1, "score": 1.4063001},\ - {"a": 3, "b": 4, "score": 1.2218001},\ - {"a": 4, "b": 1, "score": 1.2531},\ - {"a": 4, "b": 3, "score": 1.2218001},\ - {"a": 5, "b": 3, "score": 0.70239997},\ - {"a": 5, "b": 1, "score": 0.6146},\ - {"a": 6, "b": 3, "score": 0.6353},\ - {"a": 6, "b": 4, "score": 0.5093},\ - {"a": 7, "b": 3, "score": 1.0990001},\ - {"a": 7, "b": 1, "score": 1.0349001},\ - {"a": 8, "b": 2, "score": 1.1359},\ - {"a": 8, "b": 3, "score": 1.0553}\ - ] - } -} +### **3.7. Single-sign on** -``` +Users can log in to our website using one or more single sign-on methods. In doing so, they use the login data already created for a provider. The prerequisite is that the user is already registered with the respective provider. When a user logs in using a single sign-on procedure, we receive information from the provider that the user is logged in to the provider and the provider receives information that the user is using the single sign-on procedure on our website. Depending on the user's settings in his account on the provider's site, additional information may be provided to us by the provider. The legal basis for this processing is Art. 6 para. 1 sentence 1 lit. f GDPR. We have a legitimate interest in providing users with a simple log-in option. At the same time, the interests of the users are safeguarded, as use is only voluntary. 
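The pairwise response shown above maps naturally onto a sparse adjacency structure, which is what the data-exploration use cases (clustering, visualization, dimensionality reduction) typically expect. Below is a minimal sketch, assuming SciPy is available and the response has the `{"result": {"pairs": [{"a": ..., "b": ..., "score": ...}]}}` shape from the example; the helper name is hypothetical and not part of any Qdrant client:

```python
# Hypothetical helper (not part of the Qdrant client API): convert the
# pairwise distance-matrix response into a SciPy sparse matrix.
from scipy.sparse import coo_matrix

def pairs_to_sparse(response: dict) -> tuple[coo_matrix, list[int]]:
    pairs = response["result"]["pairs"]
    # Collect all point ids and map them to consecutive matrix indices.
    ids = sorted({p["a"] for p in pairs} | {p["b"] for p in pairs})
    index = {point_id: i for i, point_id in enumerate(ids)}
    rows = [index[p["a"]] for p in pairs]
    cols = [index[p["b"]] for p in pairs]
    scores = [p["score"] for p in pairs]
    matrix = coo_matrix((scores, (rows, cols)), shape=(len(ids), len(ids)))
    return matrix, ids
```

The resulting matrix can then be fed to standard tooling (e.g. graph clustering or a 2D embedding) without any extra round trips to the engine.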
-### [Anchor](https://qdrant.tech/documentation/concepts/explore/\#offset-format) Offset format +Providers of the offered method(s) are: -Returns the distance matrix as a four arrays: +* Google Ireland Limited, Gordon House, Barrow Street, Dublin 4, Irland (privacy policy: https://policies.google.com/privacy) -- `offsets_row` and `offsets_col`, represent the positions of non-zero distance values in the matrix. -- `scores` contains the distance values. -- `ids` contains the point ids corresponding to the distance values. +* GitHub B.V., Vijzelstraat 68-72, 1017 HL Amsterdam, Netherlands -httppythontypescriptrustjavacsharpgo +### **3.8. Offer of services** -```http -POST /collections/{collection_name}/points/search/matrix/offsets -{ - "sample": 10, - "limit": 2, - "filter": { - "must": { - "key": "color", - "match": { "value": "red" } - } - } -} +We offer services via our website. In doing so, we process the following data as part of the ordering process: -``` +* First and last name +* E-mail address -```python -from qdrant_client import QdrantClient, models +The processing of the data is carried out for the performance of the contract concluded with the respective site visitor (Art. 6 para. 1 s. 1 lit. b GDPR). -client = QdrantClient(url="http://localhost:6333") +### **3.9. Payment processors** -client.search_matrix_offsets( - collection_name="{collection_name}", - sample=10, - limit=2, - query_filter=models.Filter( - must=[\ - models.FieldCondition(\ - key="color", match=models.MatchValue(value="red")\ - ),\ - ] - ), -) +For the processing of payments, we use payment processors who are themselves data controllers within the meaning of Art. 4 No. 7 GDPR. Insofar as they receive data and payment data entered by us in the ordering process, we thereby fulfill the contract concluded with our customers (Art. 6 para. 1 s. 1 lit. b GDPR). -``` +These payment processors are: -```typescript -import { QdrantClient } from "@qdrant/js-client-rest"; +* Stripe Payments Europe, Ltd., Ireland -const client = new QdrantClient({ host: "localhost", port: 6333 }); +### **3.10. Third parties** -client.searchMatrixOffsets("{collection_name}", { - filter: { - must: [\ - {\ - key: "color",\ - match: {\ - value: "red",\ - },\ - },\ - ], - }, - sample: 10, - limit: 2, -}); +#### **3.10.1. ​HubSpot​** -``` +We use HubSpot to manage leads, for landing pages, marketing automations, forms on the website, and for analytics. The provider is HubSpot, Inc., 25 1st Street Cambridge, MA 0214, USA. The provider processes usage data (e.g. web pages visited, interest in content, access times), content data (e.g. entries in online forms), and meta/communication data (e.g. device information, IP addresses) in the EU. -```rust -use qdrant_client::qdrant::{Condition, Filter, SearchMatrixPointsBuilder}; -use qdrant_client::Qdrant; +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. f GDPR. We have a legitimate interest in managing data in a simple and inexpensive way. -client - .search_matrix_offsets( - SearchMatrixPointsBuilder::new("collection_name") - .filter(Filter::must(vec![Condition::matches(\ - "color",\ - "red".to_string(),\ - )])) - .sample(10) - .limit(2), - ) - .await?; +The data will be deleted when the purpose for which it was collected no longer applies and there is no obligation to retain it. Further information is available in the provider's privacy policy at https://legal.hubspot.com/de/privacy-policy. -``` +#### **3.10.2. 
​Segment​** -```java -import static io.qdrant.client.ConditionFactory.matchKeyword; +We use Segment for analytics. The provider is Segment.io, Inc., 100 California Street Suite 700 San Francisco, CA 94111, USA. The provider processes usage data (e.g. web pages visited, interest in content, access times) and meta/communication data (e.g. device information, IP addresses) in the USA. -import io.qdrant.client.QdrantClient; -import io.qdrant.client.QdrantGrpcClient; -import io.qdrant.client.grpc.Points.Filter; -import io.qdrant.client.grpc.Points.SearchMatrixPoints; +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. a GDPR. The processing is based on consent. Data subjects may revoke their consent at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. -QdrantClient client = - new QdrantClient(QdrantGrpcClient.newBuilder("localhost", 6334, false).build()); +The legal basis for the transfer to a country outside the EEA are standard contractual clauses. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed by standard data protection clauses (Art. 46 para. 2 lit. c GDPR) adopted by the EU Commission in accordance with the examination procedure under Art. 93 para. 2 of the GDPR, which we have agreed to with the provider. -client - .searchMatrixOffsetsAsync( - SearchMatrixPoints.newBuilder() - .setCollectionName("{collection_name}") - .setFilter(Filter.newBuilder().addMust(matchKeyword("color", "red")).build()) - .setSample(10) - .setLimit(2) - .build()) - .get(); +We delete the data when the purpose for which it was collected no longer applies. Further information is available in the provider's privacy policy at https://segment.com/legal/privacy/. -``` +#### **3.10.3. heyData** -```csharp -using Qdrant.Client; -using Qdrant.Client.Grpc; -using static Qdrant.Client.Grpc.Conditions; +We have integrated a data protection seal on our website. The provider is heyData GmbH, SchĂŒtzenstraße 5, 10117 Berlin, Germany. The provider processes meta/communication data (e.g. IP addresses) in the EU. -var client = new QdrantClient("localhost", 6334); +The legal basis of the processing is Art. 6 para. 1 s. 1 lit. f GDPR. We have a legitimate interest in providing website visitors with confirmation of our data privacy compliance. At the same time, the provider has a legitimate interest in ensuring that only customers with existing contracts use its seals, which is why a mere image copy of the certificate is not a viable alternative as confirmation. -await client.SearchMatrixOffsetsAsync( - collectionName: "{collection_name}", - filter: MatchKeyword("color", "red"), - sample: 10, - limit: 2 -); +As the data is masked after collection, there is no possibility to identify website visitors. Further information is available in the privacy policy of the provider at [https://heydata.eu/en/privacy-policy](https://heydata.eu/datenschutzerklaerung). -``` +#### **3.10.4. ​Google Analytics​** -```go -import ( - "context" +We use Google Analytics for analytics. The provider is Google Ireland Limited, Gordon House, Barrow Street, Dublin 4, Dublin, Ireland. The provider processes usage data (e.g. web pages visited, interest in content, access times) and meta/communication data (e.g. device information, IP addresses) in the USA. - "github.com/qdrant/go-client/qdrant" -) +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. 
a GDPR. The processing is based on consent. Data subjects may revoke their consent at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. -client, err := qdrant.NewClient(&qdrant.Config{ - Host: "localhost", - Port: 6334, -}) +The legal basis for the transfer to a country outside the EEA are standard contractual clauses. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed by standard data protection clauses (Art. 46 para. 2 lit. c GDPR) adopted by the EU Commission in accordance with the examination procedure under Art. 93 para. 2 of the GDPR, which we have agreed to with the provider. -sample := uint64(10) -limit := uint64(2) -res, err := client.SearchMatrixOffsets(ctx, &qdrant.SearchMatrixPoints{ - CollectionName: "{collection_name}", - Sample: &sample, - Limit: &limit, - Filter: &qdrant.Filter{ - Must: []*qdrant.Condition{ - qdrant.NewMatch("color", "red"), - }, - }, -}) +The data will be deleted when the purpose for which it was collected no longer applies and there is no obligation to retain it. Further information is available in the provider's privacy policy at [https://policies.google.com/privacy?hl=en-US](https://policies.google.com/privacy?hl=en-US). -``` +#### **3.10.5. ​Google Tag Manager​** -Returns +We use Google Tag Manager for analytics and for advertisement. The provider is Google Ireland Limited, Gordon House, Barrow Street, Dublin 4, Ireland. The provider processes usage data (e.g. web pages visited, interest in content, access times) in the USA. -```json -{ - "result": { - "offsets_row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7], - "offsets_col": [2, 3, 0, 7, 0, 3, 0, 2, 2, 0, 2, 3, 2, 0, 1, 2], - "scores": [\ - 1.4063001, 1.2531, 1.1550001, 1.1359, 1.4063001,\ - 1.2218001, 1.2531, 1.2218001, 0.70239997, 0.6146, 0.6353,\ - 0.5093, 1.0990001, 1.0349001, 1.1359, 1.0553\ - ], - "ids": [1, 2, 3, 4, 5, 6, 7, 8] - } -} +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. a GDPR. The processing is based on consent. Data subjects may revoke their consent at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. -``` +The legal basis for the transfer to a country outside the EEA are adequacy decision. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed because the EU Commission has decided as part of an adequacy decision in accordance with Art. 45 para. 3 GDPR that the third country ensures an adequate level of protection. -##### Was this page useful? +We delete the data when the purpose for which it was collected no longer applies. Further information is available in the provider's privacy policy at https://policies.google.com/privacy?hl=en-US. -![Thumb up icon](https://qdrant.tech/icons/outline/thumb-up.svg) -Yes -![Thumb down icon](https://qdrant.tech/icons/outline/thumb-down.svg) -No +#### **3.10.6. ​Mixpanel​** -Thank you for your feedback! 🙏 +We use Mixpanel for analytics. The provider is Mixpanel, Inc., One Front Street, Floor 28, San Francisco, CA 94111, USA. The provider processes contact data (e.g. e-mail addresses, telephone numbers), meta/communication data (e.g. device information, IP addresses), and master data (e.g. names, addresses) in the USA. -We are sorry to hear that. 
😔 You can [edit](https://qdrant.tech/github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/explore.md) this page on GitHub, or [create](https://github.com/qdrant/landing_page/issues/new/choose) a GitHub issue. +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. a GDPR. The processing is based on consent. Data subjects may revoke their consent at any time by contacting us, for example, using the contact details provided in our privacy policy. The revocation does not affect the lawfulness of the processing until the revocation. -On this page: +The legal basis for the transfer to a country outside the EEA are adequacy decision. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed because the EU Commission has decided as part of an adequacy decision in accordance with Art. 45 para. 3 GDPR that the third country ensures an adequate level of protection. -- [Edit on Github](https://github.com/qdrant/landing_page/tree/master/qdrant-landing/content/documentation/concepts/explore.md) -- [Create an issue](https://github.com/qdrant/landing_page/issues/new/choose) +The data will be deleted when the purpose for which it was collected no longer applies and there is no obligation to retain it. Further information is available in the provider's privacy policy at [https://mixpanel.com/legal/privacy-policy/](https://mixpanel.com/legal/privacy-policy/). -× +#### **3.10.7. ​OneTrust​** -[Powered by](https://qdrant.tech/) +We use OneTrust to manage consents. The provider is OneTrust Technology Limited, Atlanta, GA, 1200 Abernathy Rd NE, Building 600, Atlanta, GA 30328, USA. The provider processes meta/communication data (e.g. device information, IP addresses) in the USA. -<|page-200-lllmstxt|> -## chatgpt-plugin -- [Articles](https://qdrant.tech/articles/) -- Extending ChatGPT with a Qdrant-based knowledge base - -[Back to Practical Examples](https://qdrant.tech/articles/practicle-examples/) - -# Extending ChatGPT with a Qdrant-based knowledge base - -Kacper Ɓukawski - -· - -March 23, 2023 - -![Extending ChatGPT with a Qdrant-based knowledge base](https://qdrant.tech/articles_data/chatgpt-plugin/preview/title.jpg) - -In recent months, ChatGPT has revolutionised the way we communicate, learn, and interact -with technology. Our social platforms got flooded with prompts, responses to them, whole -articles and countless other examples of using Large Language Models to generate content -unrecognisable from the one written by a human. - -Despite their numerous benefits, these models have flaws, as evidenced by the phenomenon -of hallucination - the generation of incorrect or nonsensical information in response to -user input. This issue, which can compromise the reliability and credibility of -AI-generated content, has become a growing concern among researchers and users alike. -Those concerns started another wave of entirely new libraries, such as Langchain, trying -to overcome those issues, for example, by combining tools like vector databases to bring -the required context into the prompts. And that is, so far, the best way to incorporate -new and rapidly changing knowledge into the neural model. So good that OpenAI decided to -introduce a way to extend the model capabilities with external plugins at the model level. -These plugins, designed to enhance the model’s performance, serve as modular extensions -that seamlessly interface with the core system. 
By adding a knowledge base plugin to -ChatGPT, we can effectively provide the AI with a curated, trustworthy source of -information, ensuring that the generated content is more accurate and relevant. Qdrant -may act as a vector database where all the facts will be stored and served to the model -upon request. - -If you’d like to ask ChatGPT questions about your data sources, such as files, notes, or -emails, starting with the official [ChatGPT retrieval plugin repository](https://github.com/openai/chatgpt-retrieval-plugin) -is the easiest way. Qdrant is already integrated, so that you can use it right away. In -the following sections, we will guide you through setting up the knowledge base using -Qdrant and demonstrate how this powerful combination can significantly improve ChatGPT’s -performance and output quality. - -## [Anchor](https://qdrant.tech/articles/chatgpt-plugin/\#implementing-a-knowledge-base-with-qdrant) Implementing a knowledge base with Qdrant - -The official ChatGPT retrieval plugin uses a vector database to build your knowledge base. -Your documents are chunked and vectorized with the OpenAI’s text-embedding-ada-002 model -to be stored in Qdrant. That enables semantic search capabilities. So, whenever ChatGPT -thinks it might be relevant to check the knowledge base, it forms a query and sends it -to the plugin to incorporate the results into its response. You can now modify the -knowledge base, and ChatGPT will always know the most recent facts. No model fine-tuning -is required. Let’s implement that for your documents. In our case, this will be Qdrant’s -documentation, so you can ask even technical questions about Qdrant directly in ChatGPT. - -Everything starts with cloning the plugin’s repository. +The legal basis for the processing is Art. 6 para. 1 s. 1 lit. f GDPR. We have a legitimate interest in managing the consent of website visitors to cookies in a simple manner. -```bash -git clone git@github.com:openai/chatgpt-retrieval-plugin.git +The transfer of personal data to a country outside the EEA takes place on the legal basis standard contractual clauses. The security of the data transferred to the third country (i.e. a country outside the EEA) is guaranteed by standard data protection clauses (Art. 46 para. 2 lit. c GDPR) adopted by the EU Commission in accordance with the examination procedure under Art. 93 para. 2 of the GDPR, which we have agreed to with the provider. -``` +The data will be deleted when the purpose for which it was collected no longer applies and there is no obligation to retain it. Further information is available in the provider's privacy policy at https://www.onetrust.com/privacy-notice/. -Please use your favourite IDE to open the project once cloned. +## **4\. Data processing on our Cloud Panel** -### [Anchor](https://qdrant.tech/articles/chatgpt-plugin/\#prerequisites) Prerequisites +### **4.1. Processing of data by means of log files** -You’ll need to ensure three things before we start: +When the Qdrant Cloud Service is called up, so-called log files are stored on the basis of Art. 6 para. 1 letter f) of the GDPR, in which certain access data are stored. The thereby stored data set contains the following data: -1. Create an OpenAI API key, so you can use their embeddings model programmatically. If -you already have an account, you can generate one at [https://platform.openai.com/account/api-keys](https://platform.openai.com/account/api-keys). -Otherwise, registering an account might be required. -2. Run a Qdrant instance. 
The instance has to be reachable from the outside, so you either need to launch it on-premise or use the [Qdrant Cloud](https://cloud.qdrant.io/) offering. A free 1GB cluster is available, which might be enough in many cases. We'll use the cloud.
-3. Since ChatGPT will interact with your service through the network, you must deploy it, making it possible to connect from the Internet. Unfortunately, localhost is not an option, but any provider, such as Heroku or fly.io, will work perfectly. We will use [fly.io](https://fly.io/), so please register an account. You may also need to install the flyctl tool for the deployment. The process is described on the homepage of fly.io.
+* the IP address,
+* the date,
+* the time,
+* which file was accessed,
+* the status,
+* the request that your browser has made to the server,
+* the amount of data transferred,
+* the Internet page from which you came to the requested page (referrer URL), as well as
+* the product and version information of the browser used, your operating system, and the country from which the request was made.
+* Customer ID, Region
+* Account created/deleted
+* Cluster status (created/deleted/amount of cluster)
+* Cloud provider
+* Authentication type
+* Payment information ID
+* RAM (booked amount and its changes, paid or free)
+* Deployment type (hybrid cloud, managed cloud, etc.)
-### [Anchor](https://qdrant.tech/articles/chatgpt-plugin/\#configuration) Configuration
+The temporary storage of this data is technically necessary in order to be able to trace back errors and security incidents. IP addresses are generally only stored for a maximum of 90 days and then deleted.
-The retrieval plugin is a FastAPI-based application, and its default functionality might be enough in most cases. However, some configuration is required so ChatGPT knows how and when to use it. Before that, we can start setting up Fly.io, as we need to know the service's hostname to configure it fully.
+Our legitimate interest in the further processing of your data is outlined below: We continue to store the log files in anonymized form after deletion of the IP address. We can use this data for statistical evaluations, e.g. to find out on which days and at which times the Qdrant Cloud Service is particularly popular and how much data volume is generated on the Qdrant Cloud Service. In addition, the log files may enable us to detect errors, e.g. faulty links or program errors. Thus, we can use the log files for the further development of the Qdrant Cloud Service.
-First, let's log in to the Fly CLI:
+We reserve the right to use log files before deleting the IP address to identify you in the event that certain facts give rise to the suspicion that users are using the Qdrant Cloud Service and/or individual services in violation of the law or the Cloud Service Agreement. In the event of such suspicion, IP addresses may have to be stored longer than usual or forwarded to investigating authorities. However, we will immediately delete the IP addresses as soon as they are no longer needed or further investigations appear futile.
-```bash
-flyctl auth login
+### **4.2. Registration for and use of the Qdrant Cloud Service**
-```
+To use the Qdrant Cloud Service, your registration is required. The legal basis for the processing of your data is Art. 6 para. 1 letter b) of the GDPR, insofar as we require your data for the establishment and implementation of the contract for the use of the Qdrant Cloud Service.
In the context of registration and profile creation, we process data as follows:
-That will open the browser, so you can simply provide the credentials, and all the further commands will be executed with your account. If you have never used fly.io, you may need to provide your credit card details before running any instance, but there is a Hobby Plan you won't be charged for.
+#### **4.2.1. Registration**
-Let's launch the application now, but without deploying it yet. We'll get the hostname assigned and have all the details to fill in the configuration. The retrieval plugin uses TCP port 8080, so we need to configure fly.io to redirect all the traffic to it.
+Registration only requires you to provide an email address.
-```bash
-flyctl launch --no-deploy --internal-port 8080
+After you input your email address to log in, you will receive an email with an authorization code.
-```
+For registration, you can also use your login data from Github or Google, provided you have an active account with these services. By means of this so-called single sign-on procedure, we want to make it easier for you to register and log in to the Qdrant Cloud Service. This way, you do not have to remember any further access and login data for your use of the Qdrant Cloud Service. If you use a single sign-on procedure, we receive the information from the relevant provider that you have released for transmission. The legal basis for processing by us is your express consent pursuant to Art. 6 para. 1 letter a) of the GDPR. This information may be, in particular, your name, your e-mail address, the user ID with the provider concerned and, if applicable, a profile picture.
-We'll be prompted about the application name and the region it should be deployed to. Please choose whatever works best for you. After that, we should see the hostname of the newly created application:
+We would like to point out that, in accordance with the data protection conditions and terms of use of the providers, there may also be a transfer of further data when consent is given if this has been marked as “public” in your privacy settings or otherwise approved by you for transfer for the purposes of the single sign-on procedure. However, of the data transmitted to us, we only process the data that is necessary for registration and login to the Qdrant Cloud Service (Art. 6 para. 1 letter b) of the GDPR); we delete any further data transmitted to us immediately upon receipt.
-```text
-...
-Hostname: your-application-name.fly.dev
-...
+For the purpose and scope of data transmission in the context of the use of single sign-on procedures and the further processing and use of your data by the providers, as well as your rights in this regard and setting options for protecting your privacy, please refer to the data protection notices of the providers concerned:
-```
+* Google: Google Ireland Limited, Gordon House, Barrow Street, Dublin 4, Ireland; https://policies.google.com/privacy
+* Github: Github B.V., Prins Bernhardplein 200, Amsterdam, 1097JB, The Netherlands: https://docs.github.com/en/site-policy/privacy-policies/github-privacy-statement
-Let's note it down. We'll need it for the configuration of the service. But we're going to start with setting all the application secrets:
+To further secure the registration process and to offer the single sign-on procedure, we also use the “Auth0” service of the provider Auth0, Inc., 10800 NE 8th Street, Suite 600, Bellevue, WA 98004, U.S.A (“Auth0”).
Auth0 and its subcontractors act for us as processors (Art. 28 of the GDPR) and process data solely for the purposes specified by us. In some cases, data may be transferred to and processed in countries outside the EU or the European Economic Area for this purpose (“Third Countries”). We have entered into an agreement with Auth0 which contains the standard contractual clauses pursuant to the EU Commission's Implementing Decision (EU) 2021/914 of 04.06.2021. Auth0 has also taken supplementary security measures, in particular implemented comprehensive encryption mechanisms, to ensure an adequate level of data protection even when processing your data in the U.S., and Auth0 has committed itself to the principles established under the EU-US Data Privacy Framework. The EU-US Data Privacy Framework has been acknowledged by the EU Commission as an adequate data transfer mechanism with respect to data transfers from the EU to the United States (Art. 45 of the GDPR). -```bash -flyctl secrets set DATASTORE=qdrant \ - OPENAI_API_KEY= \ - QDRANT_URL=https://.aws.cloud.qdrant.io \ - QDRANT_API_KEY= \ - BEARER_TOKEN=eyJhbGciOiJIUzI1NiJ9.e30.ZRrHA1JJJW8opsbCGfG_HACGpVUMN_a9IV7pAx_Zmeo +#### **4.2.2. Use of the Qdrant Cloud Service** -``` +Upon registration or receipt of an invitation e-mail, you may, at your sole discretion, create a user account to access and use the Qdrant Cloud Service. Using the Qdrant Cloud Service requires adherence to the terms and conditions of the agreement concluded between us and our customer. As set out therein in further detail, your account is a personal account, and only you are allowed to use the Qdrant Cloud Service under your user account. Thus, we will process your personal data that (a) you submit in the course of the registration or account creation procedure, and (b) we collect or generate in connection with your use of the Qdrant Cloud Service (including without limitation, any information related to your computer, server, or laptop that is part of your company’s systems or network and that accesses, is managed or tracked by, or is registered to access, the Qdrant Cloud Service. We will process such personal data for the purposes of entering and maintaining a contractual relationship with our customer (Art. 6 para. 1 letter b of the GDPR), surveilling your compliance with and enforcing the agreement, ensuring system availability, IT and data security, all these purposes and processing activities serving and being required for our legitimate interest to run and constantly improve the Qdrant Cloud Service for the benefit of our customers, yourself as a user and ourselves (Art. 6 para. 1 letter f of the GDPR). -The secrets will be staged for the first deployment. There is an example of a minimal -Bearer token generated by [https://jwt.io/](https://jwt.io/). **Please adjust the token and do not expose** -**it publicly, but you can keep the same value for the demo.** +Please note, however, that we will not use for our own business purposes any data (including your personal data), information or material you provide, submit or upload to the Qdrant Cloud Service unless: (a) to support our customer’s and your use of the Qdrant Cloud Service and prevent or address service or technical problems; (b) in order to create aggregated data in accordance with our agreement with our customer; or (c) as our customer expressly permits in writing. In this respect, we have entered into an agreement in accordance with Art. 28 of the GDPR with our customer. 
Inasmuch as this data processing agreement allows us to create aggregated data, your personal data will be anonymized such that it does not include any identifying information of, or reasonably permit the identification of, our customer or any individual (including yourself).
-Right now, let's dive into the application config files. You can optionally provide your own icon and keep it as the `.well-known/logo.png` file, but there are two additional files we're going to modify.
+#### **4.2.3. Storage Periods**
-The `.well-known/openapi.yaml` file describes the exposed API in the OpenAPI format. Lines 3 to 5 might be filled with the application title and description, but the essential part is setting the server URL at which the application will run. Eventually, the top part of the file should look like the following:
+If you, upon registration or receipt of an invitation email, decide to use and subscribe to the Qdrant Cloud Service, we will process your personal data as described above and store such personal data for as long as it is required for the respective purposes. However, we shall delete your personal data upon termination or expiry of the agreement between us and our customer at the latest. This does not apply, and we will be under no obligation to delete your personal data, if and inasmuch as we are under a statutory retention obligation, in which case we will delete your personal data as soon as such obligation has expired.
-```yaml
-openapi: 3.0.0
-info:
-  title: Qdrant Plugin API
-  version: 1.0.0
-  description: Plugin for searching through the Qdrant doc
 -servers: - - url: https://your-application-name.fly.dev -... +### **4.3. Payment** -``` +The use of the Qdrant Cloud Service may be subject to a fee. For billing purposes, we may use data from contact persons within the company. This data, along with other billing information, is also transmitted to our service provider Stripe, Inc., 510 Townsend Street, San Francisco, CA 94103 USA (“Stripe”). Stripe is represented in the EU by Stripe Payments Europe, Ltd., The One Building, 1 Lower Grand Canal Street, Dublin, D02 HD59 Ireland (Art. 27 GDPR), but nonetheless also processes data in the U.S.A. We have entered into an agreement with Stripe that includes the standard contractual clauses pursuant to the EU Commission’s Implementing Decision (EU) 2021/914 of 04 June 2021\. Stripe has also taken supplementary security measures to ensure an adequate level of data protection when processing your data in the U.S., and Stripe has committed itself to the principles established under the EU-US Data Privacy Framework. The EU-US Data Privacy Framework has been acknowledged by the EU Commission as an adequate data transfer mechanism with respect to data transfers from the EU to the United States (Art. 45 of the GDPR). -There is another file in the same directory, and that’s the most crucial piece to -configure. It contains the description of the plugin we’re implementing, and ChatGPT -uses this description to determine if it should communicate with our knowledge base. -The file is called `.well-known/ai-plugin.json`, and let’s edit it before we finally -deploy the app. There are various properties we need to fill in: +### **4.4. Links to other websites** -| **Property** | **Meaning** | **Example** | -| --- | --- | --- | -| `name_for_model` | Name of the plugin for the ChatGPT model | _qdrant_ | -| `name_for_human` | Human-friendly model name, to be displayed in ChatGPT UI | _Qdrant Documentation Plugin_ | -| `description_for_model` | Description of the purpose of the plugin, so ChatGPT knows in what cases it should be using it to answer a question. | _Plugin for searching through the Qdrant documentation to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be related to Qdrant vector database or semantic vector search_ | -| `description_for_human` | Short description of the plugin, also to be displayed in the ChatGPT UI. | _Search through Qdrant docs_ | -| `auth` | Authorization scheme used by the application. By default, the bearer token has to be configured. | `{"type": "user_http", "authorization_type": "bearer"}` | -| `api.url` | Link to the OpenAPI schema definition. Please adjust based on your application URL. | _[https://your-application-name.fly.dev/.well-known/openapi.yaml](https://your-application-name.fly.dev/.well-known/openapi.yaml)_ | -| `logo_url` | Link to the application logo. Please adjust based on your application URL. | _[https://your-application-name.fly.dev/.well-known/logo.png](https://your-application-name.fly.dev/.well-known/logo.png)_ | +Our Qdrant Cloud Service may contain links to websites of other providers. We point out that this information on data protection applies exclusively to the websites and other offers of Qdrant. When accessing the websites of other providers, please check the data protection information stored there. We have no influence on and cannot control that such other providers comply with the applicable data protection provisions at all times and in full. 
-A complete file may look as follows: +### **4.5. Categories of recipients of data; data transfers to a third country** -```json -{ - "schema_version": "v1", - "name_for_model": "qdrant", - "name_for_human": "Qdrant Documentation Plugin", - "description_for_model": "Plugin for searching through the Qdrant documentation to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be related to Qdrant vector database or semantic vector search", - "description_for_human": "Search through Qdrant docs", - "auth": { - "type": "user_http", - "authorization_type": "bearer" - }, - "api": { - "type": "openapi", - "url": "https://your-application-name.fly.dev/.well-known/openapi.yaml", - "has_user_authentication": false - }, - "logo_url": "https://your-application-name.fly.dev/.well-known/logo.png", - "contact_email": "email@domain.com", - "legal_info_url": "email@domain.com" -} +We have commissioned various service providers who process data of the users of the Qdrant Cloud Service on our behalf. These include, for example, cloud providers for software that we use, or email service providers, but also our host provider on whose servers the Qdrant Cloud Service is operated. As a matter of principle, we carefully select all service providers and oblige them to maintain the protection of personal data. Data is not transferred to third countries unless expressly described otherwise herein. -``` +### **4.6. Encryption** -That was the last step before running the final command. The command that will deploy -the application on the server: +If you enter data on the Qdrant Cloud Service, this data is transmitted via the Internet using SSL encryption. We secure our Qdrant Cloud Service and other systems in an appropriate manner (Art. 24, 32 of the GDPR) by technical and organizational measures against loss, destruction, access, modification, or distribution of your data by unauthorized persons. -```bash -flyctl deploy +### **4.7. Your rights** -``` +#### **4.7.1. Rights as a data subject** -The command will build the image using the Dockerfile and deploy the service at a given -URL. Once the command is finished, the service should be running on the hostname we got -previously: +Pursuant to Art. 15 of the GDPR, you have the right to request information free of charge about the personal data that has been stored about you. In accordance with Art. 16, 17, and 18 of the GDPR, you also have the right to correct incorrect data and to restrict the processing or deletion of your personal data. All these rights exist in each case under the legal conditions or to the extent provided by law. -```text -https://your-application-name.fly.dev +You are also entitled, under the conditions set out in Art. 20 of the GDPR, to receive the personal data relating to you that has been stored in a structured, common, and machine-readable format and to transmit this data to another person responsible or to have it transmitted by us. -``` +#### **4.7.2. In particular: Your right to object** -## [Anchor](https://qdrant.tech/articles/chatgpt-plugin/\#integration-with-chatgpt) Integration with ChatGPT +In addition, pursuant to Art. 21 para. 1 of the GDPR, you have the right to object to the processing of personal data concerning you which is carried out on the basis of Art. 6 para. 1 letter f) of the GDPR, including profiling, on grounds relating to your particular situation. We will comply with this objection insofar as the legal requirements for its assertion are met. 
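Before pointing ChatGPT at the service, it is worth checking that the plugin metadata is actually reachable over the public Internet. This is a quick sanity check added for convenience rather than a step from the original walkthrough; it only assumes the `your-application-name.fly.dev` hostname used throughout this article:

```bash
# ChatGPT discovers and validates the plugin through these two files,
# so both must be publicly reachable once the fly.io deployment has finished.
curl -sf https://your-application-name.fly.dev/.well-known/ai-plugin.json
curl -sf https://your-application-name.fly.dev/.well-known/openapi.yaml
```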
-Once we have deployed the service, we can point ChatGPT to it, so the model knows how to connect. When you open the ChatGPT UI, you should see a dropdown with a Plugins tab included:
+If your personal data is processed for direct marketing purposes, you have the right to object at any time to the processing of your data for such marketing, including profiling, insofar as it is related to such direct marketing, in accordance with Art. 21 para. 2 of the GDPR. In such a case, we will no longer use your personal data for the purposes of direct marketing.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-1.png)
+#### **4.7.3. Contact address for exercising your rights**
-Once selected, you should be able to choose one of them or check the plugin store:
+Please address any requests regarding your personal data to the contact details provided at the beginning of this privacy policy.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-2.png)
+#### **4.7.4. Right of appeal to the supervisory authority**
-There are some premade plugins available, but there's also a possibility to install your own plugin by clicking on the “ _Develop your own plugin_” option in the bottom right corner:
+You also have the right to lodge a complaint with a data protection supervisory authority about our processing of personal data.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-3.png)
+### **4.8. Duration of storage and routine deletion**
-We need to confirm our plugin is ready, but since we relied on the official retrieval plugin from OpenAI, this should be all fine:
+Unless otherwise expressly stated in this Privacy Policy, we process and store personal data only for the period of time necessary to achieve the purpose of the processing, or for as long as provided for by laws or regulations to which we are subject.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-4.png)
+If the purpose of storage no longer applies or if a legally prescribed storage period expires, the personal data will be routinely restricted in its processing or deleted in accordance with the statutory provisions.
-After clicking on “ _My manifest is ready_”, we can already point ChatGPT to our newly created service:
+## **5\. Data processing on social media platforms**
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-5.png)
+We are represented in social media networks in order to present our organization and our services there. The operators of these networks regularly process their users' data for advertising purposes. Among other things, they create user profiles from their online behavior, which are used, for example, to show advertising on the pages of the networks and elsewhere on the Internet that corresponds to the interests of the users. To this end, the operators of the networks store information on user behavior in cookies on the users' computers. Furthermore, it cannot be ruled out that the operators merge this information with other data. Users can obtain further information and instructions on how to object to processing by the site operators in the data protection declarations of the respective operators listed below. It is also possible that the operators or their servers are located in non-EU countries, so that they process data there. This may result in risks for users, e.g. because it is more difficult to enforce their rights or because government agencies access the data.
-A successful plugin installation should end up with the following information: +If users of the networks contact us via our profiles, we process the data provided to us in order to respond to the inquiries. This is our legitimate interest, so that the legal basis is Art. 6 para. 1 s. 1 lit. f GDPR. -![](https://qdrant.tech/articles_data/chatgpt-plugin/step-6.png) +### **5.1. YouTube** -There is a name and a description of the plugin we provided. Let’s click on “ _Done_” and -return to the “ _Plugin store_” window again. There is another option we need to choose in -the bottom right corner: +We maintain a profile on YouTube. The operator is Google Ireland Limited Gordon House, Barrow Street Dublin 4\. Ireland. The privacy policy is available here: [https://policies.google.com/privacy?hl=de](https://policies.google.com/privacy?hl=de). -![](https://qdrant.tech/articles_data/chatgpt-plugin/step-7.png) +### **5.2. X (formerly Twitter)** -Our plugin is not officially verified, but we can, of course, use it freely. The -installation requires just the service URL: +We maintain a profile on X. The operator is Twitter Inc, 1355 Market Street, Suite 900, San Francisco, CA 94103, USA. The privacy policy is available here: https://twitter.com/de/privacy. One way to object to data processing is via the settings for advertisements: https://twitter.com/personalization. -![](https://qdrant.tech/articles_data/chatgpt-plugin/step-8.png) +### **5.3. LinkedIn** -OpenAI cannot guarantee the plugin provides factual information, so there is a warning -we need to accept: +We maintain a profile on LinkedIn. The operator is LinkedIn Ireland Unlimited Company, Wilton Place, Dublin 2, Ireland. The privacy policy is available here: [https://www.linkedin.com/legal/privacy-policy](https://www.linkedin.com/legal/privacy-policy). +One way to object to data processing is via the settings for advertisements: [https://www.linkedin.com/psettings/guest-controls/retargeting-opt-out](https://www.linkedin.com/psettings/guest-controls/retargeting-opt-out). -![](https://qdrant.tech/articles_data/chatgpt-plugin/step-9.png) +### **5.4. Facebook** -Finally, we need to provide the Bearer token again: +We maintain a profile on Facebook. The operator is Meta Platforms Ireland Ltd., 4 Grand Canal Square, Grand Canal Harbour, Dublin 2, Ireland. The privacy policy is available here: https://www.facebook.com/policy.php. A possibility to object to data processing arises via settings for advertisements: https://www.facebook.com/settings?tab=ads. +We are joint controllers for processing the data of visitors to our profile on the basis of an agreement within the meaning of Art. 26 GDPR with Facebook. Facebook explains exactly what data is processed at [https://www.facebook.com/legal/terms/information_about_page_insights_data](https://www.facebook.com/legal/terms/information_about_page_insights_data). Data subjects can exercise their rights both against us and against Facebook. However, according to our agreement with Facebook, we are obliged to forward requests to Facebook. Data subjects will therefore receive a faster response if they contact Facebook directly. -![](https://qdrant.tech/articles_data/chatgpt-plugin/step-10.png) +## **6\. Changes to this privacy policy** -Our plugin is now ready to be tested. 
Since there is no data inside the knowledge base, extracting any facts is impossible, but we're going to put some data in using the Swagger UI exposed by our service at [https://your-application-name.fly.dev/docs](https://your-application-name.fly.dev/docs). We need to authorize first, and then call the upsert method with some docs (a curl equivalent of the same call is sketched a bit further below). For demo purposes, we can just put in a single document extracted from the Qdrant documentation to see whether the integration works properly:
+We reserve the right to change this privacy policy with effect for the future. A current version is always available here.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-11.png)
+## **7\. Questions and comments**
-We can come back to the ChatGPT UI and send a prompt, but we need to make sure the plugin is selected:
+If you have any questions or comments regarding this privacy policy, please feel free to contact us using the contact information provided above.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-12.png)
+<|page-413-lllmstxt|>
+## Terms and Conditions
+Last updated: December 10, 2021
-Now, if our prompt seems somehow related to the plugin description provided, the model will automatically form a query and send it to the HTTP API. The query will get vectorized by our app, and then used to find some relevant documents that will be used as context to generate the response.
+Please read these terms and conditions carefully before using Our Service.
-![](https://qdrant.tech/articles_data/chatgpt-plugin/step-13.png)
+### Interpretation and Definitions
+#### Interpretation
+The words of which the initial letter is capitalized have meanings defined under the following conditions. The following definitions shall have the same meaning regardless of whether they appear in singular or in plural.
-We now have a powerful language model that can interact with our knowledge base to return not only grammatically correct but also factual information. And this is how your interactions with the model may start to look:
+#### Definitions
+For the purposes of these Terms and Conditions:
-[ChatGPT Plugin with Qdrant Vector Database](https://www.youtube.com/watch?v=fQUGuHEYeog)
+* **Affiliate** means an entity that controls, is controlled by or is under common control with a party, where "control" means ownership of 50% or more of the shares, equity interest or other securities entitled to vote for election of directors or other managing authority.
+* **Country** refers to: Berlin, Germany
+* **Company** (referred to as either "the Company", "We", "Us" or "Our" in this Agreement) refers to Qdrant Solutions GmbH, Chausseestraße 86, 10115 Berlin.
+* **Device** means any device that can access the Service such as a computer, a cellphone or a digital tablet.
+* **Service** refers to the Website.
+* **Terms and Conditions** (also referred to as "Terms") mean these Terms and Conditions that form the entire agreement between You and the Company regarding the use of the Service. This Terms and Conditions agreement has been created with the help of the Terms and Conditions Generator.
+* **Third-party Social Media Service** means any services or content (including data, information, products or services) provided by a third-party that may be displayed, included or made available by the Service.
+* **Website** refers to Qdrant, accessible from https://qdrant.tech
+* **You** means the individual accessing or using the Service, or the company, or other legal entity on behalf of which such individual is accessing or using the Service, as applicable.
+### Acknowledgment
+These are the Terms and Conditions governing the use of this Service and the agreement that operates between You and the Company. These Terms and Conditions set out the rights and obligations of all users regarding the use of the Service.
+Your access to and use of the Service is conditioned on Your acceptance of and compliance with these Terms and Conditions. These Terms and Conditions apply to all visitors, users and others who access or use the Service.
+By accessing or using the Service You agree to be bound by these Terms and Conditions. If You disagree with any part of these Terms and Conditions then You may not access the Service.
+You represent that you are over the age of 18. The Company does not permit those under 18 to use the Service.
+Your access to and use of the Service is also conditioned on Your acceptance of and compliance with the Privacy Policy of the Company. Our Privacy Policy describes Our policies and procedures on the collection, use and disclosure of Your personal information when You use the Application or the Website and tells You about Your privacy rights and how the law protects You. Please read Our Privacy Policy carefully before using Our Service.
+### Links to Other Websites
+Our Service may contain links to third-party web sites or services that are not owned or controlled by the Company.
+The Company has no control over, and assumes no responsibility for, the content, privacy policies, or practices of any third party web sites or services. You further acknowledge and agree that the Company shall not be responsible or liable, directly or indirectly, for any damage or loss caused or alleged to be caused by or in connection with the use of or reliance on any such content, goods or services available on or through any such web sites or services.
+We strongly advise You to read the terms and conditions and privacy policies of any third-party web sites or services that You visit.
+### Termination
+We may terminate or suspend Your access immediately, without prior notice or liability, for any reason whatsoever, including without limitation if You breach these Terms and Conditions.
+Upon termination, Your right to use the Service will cease immediately.
+### Limitation of Liability
+Notwithstanding any damages that You might incur, the entire liability of the Company and any of its suppliers under any provision of this Terms and Your exclusive remedy for all of the foregoing shall be limited to the amount actually paid by You through the Service or 100 USD if You haven't purchased anything through the Service.
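For reference, the upsert performed through the Swagger UI above can also be issued from the command line. This is a hedged sketch rather than part of the original article: the `/upsert` endpoint and the `documents` payload shape follow the official chatgpt-retrieval-plugin API, the hostname is the fly.io one used throughout, and `$BEARER_TOKEN` is assumed to hold the same token configured via `flyctl secrets`:

```bash
# Add a single document to the knowledge base, equivalent to the Swagger UI
# call shown in the screenshot. Adjust the text and metadata to your data.
curl -X POST "https://your-application-name.fly.dev/upsert" \
  -H "Authorization: Bearer $BEARER_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
        "documents": [
          {
            "text": "Qdrant is a vector similarity search engine and vector database ...",
            "metadata": {"source": "file", "url": "https://qdrant.tech/documentation/"}
          }
        ]
      }'
```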
+To the maximum extent permitted by applicable law, in no event shall the Company or its suppliers be liable for any special, incidental, indirect, or consequential damages whatsoever (including, but not limited to, damages for loss of profits, loss of data or other information, for business interruption, for personal injury, loss of privacy arising out of or in any way related to the use of or inability to use the Service, third-party software and/or third-party hardware used with the Service, or otherwise in connection with any provision of this Terms), even if the Company or any supplier has been advised of the possibility of such damages and even if the remedy fails of its essential purpose.
+Some states do not allow the exclusion of implied warranties or limitation of liability for incidental or consequential damages, which means that some of the above limitations may not apply. In these states, each party's liability will be limited to the greatest extent permitted by law.
+### "AS IS" and "AS AVAILABLE" Disclaimer
+The Service is provided to You "AS IS" and "AS AVAILABLE" and with all faults and defects without warranty of any kind. To the maximum extent permitted under applicable law, the Company, on its own behalf and on behalf of its Affiliates and its and their respective licensors and service providers, expressly disclaims all warranties, whether express, implied, statutory or otherwise, with respect to the Service, including all implied warranties of merchantability, fitness for a particular purpose, title and non-infringement, and warranties that may arise out of course of dealing, course of performance, usage or trade practice. Without limitation to the foregoing, the Company provides no warranty or undertaking, and makes no representation of any kind that the Service will meet Your requirements, achieve any intended results, be compatible or work with any other software, applications, systems or services, operate without interruption, meet any performance or reliability standards or be error free or that any errors or defects can or will be corrected.
+Without limiting the foregoing, neither the Company nor any of the company's provider makes any representation or warranty of any kind, express or implied: (i) as to the operation or availability of the Service, or the information, content, and materials or products included thereon; (ii) that the Service will be uninterrupted or error-free; (iii) as to the accuracy, reliability, or currency of any information or content provided through the Service; or (iv) that the Service, its servers, the content, or e-mails sent from or on behalf of the Company are free of viruses, scripts, trojan horses, worms, malware, timebombs or other harmful components.
+Some jurisdictions do not allow the exclusion of certain types of warranties or limitations on applicable statutory rights of a consumer, so some or all of the above exclusions and limitations may not apply to You. But in such a case the exclusions and limitations set forth in this section shall be applied to the greatest extent enforceable under applicable law.
+### Governing Law
+The laws of the Country, excluding its conflicts of law rules, shall govern this Terms and Your use of the Service.
Your use of the Application may also be subject to other local, state, national, or international laws.
+### Disputes Resolution
+If You have any concern or dispute about the Service, You agree to first try to resolve the dispute informally by contacting the Company.
-However, a single document is not enough to enable the full power of the plugin. If you want to add more documents that you have collected, there are already some scripts available in the `scripts/` directory that allow converting JSON, JSON lines or even zip archives (a direct query call against the service, sketched below, is a quick way to verify retrieval once the data is in).
+### For European Union (EU) Users
+If You are a European Union consumer, you will benefit from any mandatory provisions of the law of the country in which you are resident.
+### United States Legal Compliance
+You represent and warrant that (i) You are not located in a country that is subject to the United States government embargo, or that has been designated by the United States government as a "terrorist supporting" country, and (ii) You are not listed on any United States government list of prohibited or restricted parties.
+### Severability and Waiver
+#### Severability
+If any provision of these Terms is held to be unenforceable or invalid, such provision will be changed and interpreted to accomplish the objectives of such provision to the greatest extent possible under applicable law and the remaining provisions will continue in full force and effect.
+#### Waiver
+Except as provided herein, the failure to exercise a right or to require performance of an obligation under this Terms shall not affect a party's ability to exercise such right or require such performance at any time thereafter nor shall the waiver of a breach constitute a waiver of any subsequent breach.
+### Translation Interpretation
+These Terms and Conditions may have been translated if We have made them available to You on our Service. You agree that the original English text shall prevail in the case of a dispute.
+### Changes to These Terms and Conditions
+We reserve the right, at Our sole discretion, to modify or replace these Terms at any time. If a revision is material We will make reasonable efforts to provide at least 30 days' notice prior to any new terms taking effect. What constitutes a material change will be determined at Our sole discretion.
+By continuing to access or use Our Service after those revisions become effective, You agree to be bound by the revised terms. If You do not agree to the new terms, in whole or in part, please stop using the website and the Service.
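It can also help to confirm that retrieval works end to end without involving ChatGPT at all. The sketch below makes the same assumptions as the upsert example above (official retrieval-plugin `/query` endpoint, fly.io hostname, `$BEARER_TOKEN` exported locally); the response should contain the previously upserted document together with a similarity score:

```bash
# Query the knowledge base directly, bypassing ChatGPT.
curl -X POST "https://your-application-name.fly.dev/query" \
  -H "Authorization: Bearer $BEARER_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"queries": [{"query": "What is Qdrant?", "top_k": 3}]}'
```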
-× +### Contact Us +If you have any questions about these Terms and Conditions, You can contact us: -[Powered by](https://qdrant.tech/) +By email: info@qdrant.com diff --git a/qdrant-landing/static/llms.txt b/qdrant-landing/static/llms.txt index 3e840d5d31..0c074b81da 100644 --- a/qdrant-landing/static/llms.txt +++ b/qdrant-landing/static/llms.txt @@ -1,64 +1,417 @@ # https://qdrant.tech/ llms.txt ## Overall Summary > Qdrant is a cutting-edge platform focused on delivering exceptional performance and efficiency in vector similarity search. As a robust vector database, it specializes in managing, searching, and retrieving high-dimensional vector data, essential for enhancing AI applications, machine learning, and modern search engines. With a suite of powerful features such as state-of-the-art hybrid search capabilities, retrieval-augmented generation (RAG) applications, and dense and sparse vector support, Qdrant stands out as an industry leader. Its offerings include managed cloud services, enabling users to harness the robust functionality of Qdrant without the burden of maintaining infrastructure. The platform supports advanced data security measures and seamless integrations with popular platforms and frameworks, catering to diverse data handling and analytic needs. Additionally, Qdrant offers comprehensive solutions for complex searching requirements through its innovative Query API and multivector representations, allowing for precise matching and enhanced retrieval quality. With its commitment to open-source principles and continuous innovation, Qdrant tailors solutions to meet both small-scale projects and enterprise-level demands efficiently, helping organizations unlock profound insights from their unstructured data and optimize their AI capabilities. + ## Page Links -- [Private Cloud Backups](https://qdrant.tech/documentation/private-cloud/backups): Learn how to create and manage backups in Qdrant's private cloud. -- [Qdrant Managed Cloud](https://qdrant.tech/documentation/cloud): Explore Qdrant's Managed Cloud for efficient, scalable, and reliable database solutions. -- [Qdrant API Interfaces](https://qdrant.tech/documentation/interfaces): Explore Qdrant's API offerings and client libraries for seamless integration. -- [Single Node Speed Benchmark](https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022): Explore 2022 single node speed benchmarks comparing Qdrant and other engines. -- [Multivector Representations Guide](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations): Learn to efficiently use Qdrant's multivector representations for improved document retrieval. -- [Qdrant Cloud API](https://qdrant.tech/documentation/cloud-api): Explore Qdrant's powerful Cloud API for automation and resource management. -- [Private Cloud Configuration](https://qdrant.tech/documentation/private-cloud/configuration): Explore Qdrant's private cloud configuration options for efficient deployment and management. -- [Understanding Sparse Vectors](https://qdrant.tech/articles/sparse-vectors): Explore sparse vectors for efficient vector-based hybrid search with Qdrant insights. -- [Late Interaction Models](https://qdrant.tech/articles/late-interaction-models): Explore adapting embedding models for enhanced retrieval performance with Qdrant's innovative solutions. -- [Platform Integrations Overview](https://qdrant.tech/documentation/platforms): Explore various platform integrations to enhance your Qdrant experience and capabilities. 
-- [Common Errors Guide](https://qdrant.tech/documentation/guides/common-errors): Discover solutions for common Qdrant errors to enhance your database experience. -- [Understanding Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database): Explore vector databases for unstructured data management and advanced analytics with Qdrant. -- [Comprehensive Data Management](https://qdrant.tech/documentation/data-management): Explore Qdrant's data management integrations for streamlined data processing and transformation. -- [Qdrant Collections Guide](https://qdrant.tech/documentation/concepts/collections): Explore Qdrant's collections for efficient vector management and search optimization. -- [Understanding Qdrant Points](https://qdrant.tech/documentation/concepts/points): Learn how to create and manage points central to Qdrant's vector search technology. -- [Qdrant Cloud Authentication](https://qdrant.tech/documentation/cloud/authentication): Learn how to manage API keys and secure access in Qdrant Cloud. -- [Food Discovery Demo](https://qdrant.tech/articles/food-discovery-demo): Explore Qdrant's open-source food discovery demo for innovative image-based search solutions. -- [Capacity Planning Guide](https://qdrant.tech/documentation/guides/capacity-planning): Optimize your Qdrant cluster with effective RAM and disk storage strategies. -- [Machine Learning Insights](https://qdrant.tech/articles/machine-learning): Discover machine learning techniques and Qdrant's powerful vector search capabilities. -- [Qdrant Internals Overview](https://qdrant.tech/articles/qdrant-internals): Explore Qdrant's vector search engine architecture and components for improved performance. -- [Qdrant Operator Configuration](https://qdrant.tech/documentation/hybrid-cloud/operator-configuration): Explore advanced configuration options for the Qdrant Operator in hybrid cloud environments. -- [Qdrant Installation Guide](https://qdrant.tech/documentation/guides/installation): Explore requirements and options for installing Qdrant efficiently and securely. -- [Qdrant Cloud RBAC Permissions](https://qdrant.tech/documentation/cloud-rbac/permission-reference): Explore Qdrant's documentation for managing cloud permissions effectively and securely. -- [Qdrant Snapshots Overview](https://qdrant.tech/documentation/concepts/snapshots): Learn about snapshot creation and management for data protection in Qdrant. -- [Q&A with Similarity Learning](https://qdrant.tech/articles/faq-question-answering): Explore how Qdrant improves machine learning with efficient question-answering and similarity learning solutions. -- [Understanding Vector Search](https://qdrant.tech/documentation/overview/vector-search): Explore how Qdrant enhances vector search for efficient information retrieval and project integration. -- [Vector Database Benchmarks](https://qdrant.tech/benchmarks): Explore Qdrant's superior benchmarks for vector databases, ensuring efficient, fast, and accurate results. -- [Qdrant Concepts Overview](https://qdrant.tech/documentation/concepts): Discover essential AI concepts with Qdrant's comprehensive and user-friendly documentation. -- [Indexing with Qdrant](https://qdrant.tech/documentation/concepts/indexing): Learn effective indexing strategies for optimized vector and traditional searches in Qdrant. -- [Practice Datasets Overview](https://qdrant.tech/documentation/datasets): Explore ready-made datasets for practical use with Qdrant's advanced embedding technology. 
-- [Hybrid Search Simplified](https://qdrant.tech/articles/hybrid-search): Enhance your retrieval systems with Qdrant's new Query API for hybrid search. -- [Local Quickstart Guide](https://qdrant.tech/documentation/quickstart): Quickly set up Qdrant locally, create collections, and manage vector data effectively. -- [Metric Learning Insights](https://qdrant.tech/articles/metric-learning-tips): Explore essential tips and tricks for effective metric learning from Qdrant experts. -- [Qdrant Cluster Monitoring](https://qdrant.tech/documentation/cloud/cluster-monitoring): Monitor your Qdrant Cloud clusters with metrics, logs, and alerts for optimal performance. -- [Efficient Layer Recycling](https://qdrant.tech/articles/embedding-recycler): Discover layer recycling techniques to enhance model training speed and efficiency. -- [Data Privacy Solutions](https://qdrant.tech/articles/data-privacy): Learn how Qdrant enhances data privacy with role-based access control and security strategies. -- [Data Ingestion Guide](https://qdrant.tech/documentation/data-ingestion-beginners): Learn how to ingest data into Qdrant for effective semantic search solutions. -- [Immutable Data Structures](https://qdrant.tech/articles/immutable-data-structures): Explore Qdrant's insights on immutable data structures for optimized performance. -- [Understanding Vector Embeddings](https://qdrant.tech/articles/what-are-embeddings): Explore how vector embeddings enhance search and personalized experiences using Qdrant technology. -- [RAG Chatbot Tutorial](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama): Learn to build private RAG chatbots with Qdrant and Vultr for secure data handling. -- [RAG and GenAI Insights](https://qdrant.tech/articles/rag-and-genai): Explore RAG techniques with Qdrant for advanced AI agents and data retrieval solutions. -- [Medical Chatbot Example](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot): Learn to build a reliable medical chatbot using Qdrant and DSPy technologies. -- [Framework Integrations Overview](https://qdrant.tech/documentation/frameworks): Explore Qdrant's comprehensive frameworks for developing innovative AI-powered applications. -- [RAG Analysis Insights](https://qdrant.tech/articles/rag-is-dead): Explore the relevance of vector databases in today’s AI landscape with Qdrant. -- [Memory Consumption Insights](https://qdrant.tech/articles/memory-consumption): Learn to accurately measure RAM needs and optimize Qdrant for efficiency. -- [Distance-Based Exploration](https://qdrant.tech/articles/distance-based-exploration): Discover hidden data structures effortlessly with Qdrant's Distance Matrix API. -- [GPU Support Guide](https://qdrant.tech/documentation/guides/running-with-gpu): Learn to run Qdrant with GPU support for enhanced performance and efficiency. -- [Scaling PDF Retrieval](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale): Learn efficient PDF retrieval using Qdrant and Vision Large Language Models. -- [FastEmbed Semantic Search Guide](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search): Learn to implement FastEmbed with Qdrant for efficient vector searches. -- [Multitenancy & Partitioning](https://qdrant.tech/documentation/guides/multiple-partitions): Learn how to configure multitenancy and partitioning with Qdrant for efficiency. 
-- [Qdrant's Seed Funding News](https://qdrant.tech/articles/seed-round): Discover Qdrant's innovative vector databases and their recent $7.5M seed funding success. -- [Vector Search Concepts](https://qdrant.tech/documentation/concepts/search): Explore Qdrant's powerful vector search capabilities, including similarity and query APIs. -- [Hybrid Cloud Cluster Creation](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation): Learn to create and configure a Qdrant cluster in your hybrid cloud environment. -- [Enhancing Semantic Search](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality): Learn to measure and improve retrieval quality in Qdrant's semantic search. -- [Advanced Filtering Techniques](https://qdrant.tech/documentation/concepts/filtering): Explore Qdrant's powerful filtering features for precise vector searches and retrieval. -- [Create Qdrant Snapshots](https://qdrant.tech/documentation/database-tutorials/create-snapshot): Learn to create and restore snapshots for efficient data management in Qdrant. -- [Qdrant Storage Overview](https://qdrant.tech/documentation/concepts/storage): Discover how Qdrant manages data storage segments for efficient vector handling. -- [Qdrant 0.11 Release](https://qdrant.tech/articles/qdrant-0-11-release): Discover key features and improvements in Qdrant v0.11 for enhanced performance. -- [AI Customer Support Guide](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws): Setup private AI for customer support using Qdrant, Cohere, and Airbyte seamlessly. -- [Explore Qdrant APIs](https://qdrant.tech/documentation/concepts/explore): Discover Qdrant's powerful APIs for innovative data exploration and recommendation. +- [Concepts](https://qdrant.tech/documentation/concepts/): Qdrant's glossary introduces key AI concepts, such as Collections (sets of searchable points), Payloads (associated vector data), and Points (vector records). It also covers tools like Search, Explore, Filtering, Hybrid Queries, and processes like Indexing, Optimization, and Snapshots for efficient storage and retrieval in vector databases. +- [Collections](https://qdrant.tech/documentation/concepts/collections/): A collection in Qdrant is a structured set of points (vectors) that support searching, with specific dimensionality and distance metrics like Dot product, Cosine similarity, and Euclidean distance. Collections can be customized with various configurations (e.g., multitenancy, on-disk storage, sparse/dense vectors, and indexing options), allowing flexibility in structure, storage, and performance tuning, including features like multiple named vectors and uint8 embeddings. +- [Points](https://qdrant.tech/documentation/concepts/points/): Qdrant uses "points" as its core data structure, which consist of vectors and optional payloads, and these are searchable within collections via vector similarity. It supports dense, sparse, and named vectors, allows batch uploads with idempotent APIs, and provides methods for modifying points, including updating and deleting vectors or payloads, while ensuring data consistency with a write-ahead-log mechanism. +- [Vectors](https://qdrant.tech/documentation/concepts/vectors/): Qdrant's vector search engine uses vectors (or embeddings) to represent object similarity in vector space, with dense vectors (fixed-length numerical arrays) as the most common type, while also supporting sparse vectors, multivectors, and named vectors for diverse use cases. 
By offering options like Float32, Float16, and Uint8 data types, Qdrant optimizes memory use and precision, enabling tasks like similarity search, clustering, and cross-modal representation of data. +- [Payload](https://qdrant.tech/documentation/concepts/payload/): Qdrant enables storing additional metadata, called "payloads," alongside vectors in JSON format and supports various data types like integers, floats, keywords, geo-coordinates, and datetimes. It offers flexible methods for payload management, including setting, overwriting, clearing, deleting, and indexing payloads, which enhance customization and search performance in dynamic datasets. +- [Search](https://qdrant.tech/documentation/concepts/search/): Similarity search utilizes vector-based methods to find objects close in meaning or attributes (like texts, images, or music) in vector space, with Qdrant's `Query API` supporting various search types such as k-NN, filtering, recommendations, and hybrid queries. Key features include customizable metrics (e.g., cosine similarity), filtering by conditions like scores or payloads, efficient approximate and exact search options, and capabilities for handling both dense and sparse vectors, with batch search available for multiple queries in one request. +- [Explore](https://qdrant.tech/documentation/concepts/explore/): Qdrant offers advanced APIs for data exploration, including the **Recommendation API** and **Discovery API**, which support flexible search strategies like average vector, best score, and sum scores, allowing for recommendations based on positive/negative examples or dissimilarity to outliers. These tools, along with multi-vector support, batch processing, and inter-collection lookup, enable powerful use cases such as recommendations, data cleaning, and embedding-agnostic searches optimized by strategies tailored to specific needs and performance trade-offs. +- [Hybrid Queries #required](https://qdrant.tech/documentation/concepts/hybrid-queries/): Qdrant's `Query API` enables advanced search techniques like hybrid and multi-stage queries by combining multiple queries or stages for more precise results. Key features include prefetching for nested queries, result fusion methods (`rrf` and `dbsf`), multi-stage re-scoring, diversity-focused Maximal Marginal Relevance (MMR), and customizable score boosting to incorporate business logic. +- [Filtering](https://qdrant.tech/documentation/concepts/filtering/): Qdrant enables advanced filtering for vector search by allowing the use of logical clauses like `AND`, `OR`, and `NOT`, alongside conditions such as `Match`, `Match Any`, `Match Except`, and nested filters, to refine searches based on payload attributes or point IDs. These filters facilitate complex queries addressing both business-specific requirements, like availability or pricing, and structured JSON data, such as nested fields or arrays. +- [Optimizer](https://qdrant.tech/documentation/concepts/optimizer/): Qdrant employs three key optimizers to enhance data storage and retrieval efficiency: Vacuum Optimizer removes accumulated deleted records to free memory and maintain performance, Merge Optimizer consolidates small segments to improve search efficiency while adhering to size thresholds, and Indexing Optimizer dynamically enables indexes and memmap storage based on data size, optimizing resource usage. All optimizer behaviors are configurable through a configuration file or dynamically adjustable at the collection level. 
+- [Storage](https://qdrant.tech/documentation/concepts/storage/): Qdrant stores data in segments, which handle vector and payload storage along with indexing, and can be either appendable (supporting additions, deletions, and queries) or non-appendable (read-only). Storage options include in-memory (high speed but RAM-intensive) and memmap (disk-based with flexible memory use), with configurable thresholds for optimization, while payload storage can be in-memory for speed or on-disk for lower RAM usage, supported by indexing to reduce latency during queries. +- [Indexing](https://qdrant.tech/documentation/concepts/indexing/): Qdrant combines vector and traditional indexes to optimize vector search with filters. It offers payload indexes for various data types, parameterized and on-disk options for performance and memory efficiency, and specialized indexing (tenant and principal indexes) to enhance multi-tenant and time-based searches. +- [Snapshots](https://qdrant.tech/documentation/concepts/snapshots/): Snapshots in Qdrant are `tar` archive files capturing the data and configuration of a specific collection on a specific node, requiring separate snapshots for each node in a distributed setup. They facilitate data archiving, replication, and recovery, with multiple restoration methods available, such as via URL, file upload, or start-up parameter, and support full storage snapshots for single-node deployments. +- [Local Quickstart](https://qdrant.tech/documentation/quickstart/): To get started with Qdrant locally, install and run its Docker container, then use the supported client libraries like Python, JavaScript, or others to initialize a connection, create a collection, and add vectors with associated payloads. Ensure security measures are in place as Qdrant runs without encryption or authentication by default. +- [Distance-based data exploration](https://qdrant.tech/articles/distance-based-exploration/): To uncover hidden structures in unstructured high-dimensional datasets, tools like Qdrant’s Distance Matrix API simplify distance calculations, which can then be used with methods such as UMAP for dimensionality reduction or KMeans for clustering. These approaches enable visualizing data relationships and patterns effectively, supporting deeper data exploration without requiring computationally intensive operations. +- [Modern Sparse Neural Retrieval: From Theory to Practice](https://qdrant.tech/articles/modern-sparse-neural-retrieval/): The article explores modern sparse neural retrieval methods that aim to combine the precision of keyword-based approaches like BM25 with the semantic understanding of dense retrievers. It compares key models like DeepCT, DeepImpact, and TILDEv2, detailing their evolution, strengths, and limitations, while providing practical guidance on using SPLADE++ in Qdrant for optimized retrieval solutions. +- [Qdrant Summer of Code 2024 - ONNX Cross Encoders in Python](https://qdrant.tech/articles/cross-encoder-integration-gsoc/): Huong (Celine) Hoang's Summer of Code 2024 internship at Qdrant focused on integrating cross-encoders into the lightweight FastEmbed library to enhance re-ranking capabilities for more context-aware search applications. Through overcoming challenges like tokenization, ONNX model integration, and testing, she successfully expanded FastEmbed's functionality while ensuring user-friendliness and scalability, with future improvements aimed at model support and optimization. 
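Building on the Search and Filtering entries above, the sketch below shows one way to combine a payload filter with a nearest-neighbor query via `query_points`. It is illustrative only, assumes qdrant-client 1.10 or newer, and reuses the hypothetical `demo` collection from the previous sketch.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Return the closest points whose payload matches city == "Berlin".
hits = client.query_points(
    collection_name="demo",
    query=[0.2, 0.1, 0.9, 0.7],  # query vector; dimensionality must match the collection
    query_filter=models.Filter(
        must=[models.FieldCondition(key="city", match=models.MatchValue(value="Berlin"))]
    ),
    limit=3,
).points

for point in hits:
    print(point.id, point.score, point.payload)
```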
+- [An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/): A **vector database** is a specialized system designed to manage unstructured data by converting it into vectors—numerical representations that capture context and semantics—enabling advanced similarity searches and analysis that traditional databases cannot perform. Unlike conventional structured databases, vector databases excel in applications such as recommendation systems, anomaly detection, and semantic search, leveraging metadata and filtering for more refined results. +- [What is Vector Quantization?](https://qdrant.tech/articles/what-is-vector-quantization/): Vector quantization is a data compression technique that reduces memory usage for high-dimensional data while retaining essential information, enabling faster and more efficient storage and search operations, particularly in large datasets. Methods like Scalar and Binary Quantization significantly shrink vector sizes (e.g., Binary Quantization reduces memory by 32x), with Binary Quantization providing the most substantial speed gains for distance computations using optimized binary operations like XOR and Popcount. +- [Vector Search Resource Optimization Guide](https://qdrant.tech/articles/vector-search-resource-optimization/): This guide provides strategies for optimizing vector database performance and resource efficiency, focusing on techniques like indexing (using HNSW), parameter tuning for balancing speed and accuracy, and data compression via quantization (e.g., reducing 32-bit float vectors to 8-bit representations). These methods aim to help users scale applications cost-effectively while tailoring performance to specific use cases. +- [A Complete Guide to Filtering in Vector Search](https://qdrant.tech/articles/vector-search-filtering/): Qdrant's filtered vector search combines semantic similarity and metadata constraints to improve search precision and efficiency, particularly in applications like e-commerce. By using methods like pre-filtering, post-filtering, and Qdrant's novel filterable HNSW index, it addresses inefficiencies in traditional search approaches, balancing accuracy and computational resources for datasets of varying sizes and metadata cardinality. +- [Qdrant Internals: Immutable Data Structures](https://qdrant.tech/articles/immutable-data-structures/): The article discusses the trade-offs and performance considerations of different data structures based on hardware efficiency, highlighting that no single structure is universally optimal. It emphasizes the benefits of immutability, such as optimized memory allocation and faster access, and introduces advanced techniques like minimal perfect hash functions to reduce hash collisions and improve performance in real-world systems like vector databases (e.g., Qdrant). +- [miniCOIL: on the Road to Usable Sparse Neural Retrieval](https://qdrant.tech/articles/minicoil/): Sparse neural retrieval bridges the gap between term-based and dense retrieval by combining lightweight, explainable methods like BM25 with the ability to capture word meanings, but prior models faced challenges in domain adaptability and efficiency. To address this, Qdrant developed **miniCOIL**, a sparse neural retriever inspired by Contextualized Inverted Lists (COIL), designed to produce lightweight, semantically-aware representations while maintaining robust out-of-domain performance. 
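Several entries above (modern sparse neural retrieval, miniCOIL) revolve around sparse vectors. As a rough, hedged illustration of the API shape only, a collection can carry a named sparse vector and be queried with explicit indices and weights; the collection name, vector name, and values below are placeholders.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# "sparse_demo" and the "text" sparse vector name are placeholders.
client.create_collection(
    collection_name="sparse_demo",
    vectors_config={},  # dense vectors omitted for brevity
    sparse_vectors_config={"text": models.SparseVectorParams()},
)

# Query with explicit token indices and weights (e.g. produced by a SPLADE-style model).
results = client.query_points(
    collection_name="sparse_demo",
    query=models.SparseVector(indices=[17, 42, 91], values=[0.6, 0.3, 0.1]),
    using="text",
    limit=5,
)
```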
+- [Relevance Feedback in Informational Retrieval](https://qdrant.tech/articles/search-feedback-loop/): A well-defined search query significantly aids information retrieval, but users often struggle to articulate precise queries, making relevance feedback a critical tool for refining results iteratively. Despite decades of research, integrating relevance feedback into modern vector search systems remains underutilized due to challenges like user reluctance, computational costs, and the early-stage development of neural search techniques, prompting a need to explore approaches like query and scoring refinement for better adoption of these methods. +- [Built for Vector Search](https://qdrant.tech/articles/dedicated-vector-search/): Vector databases are specialized tools designed for efficient high-dimensional data searches, where general-purpose databases struggle with scalability and performance due to vectors' heavy storage and computational demands. With BASE-oriented architectures prioritizing availability and scalability, vector databases leverage fixed-size embeddings for fast indexing and retrieval, making them ideal for applications requiring high-throughput, low-latency search, unlike ACID-compliant systems built for strict consistency. +- [Any* Embedding Model Can Become a Late Interaction Model... If You Give It a Chance!](https://qdrant.tech/articles/late-interaction-models/): Qdrant 1.10's multi-vector support allows traditional dense embedding models to be adapted for late interaction retrieval by leveraging output token embeddings, achieving competitive or superior performance compared to specialized models like ColBERT while maintaining efficiency. Experimental results show that using output token embeddings enhances retrieval quality across multiple datasets, suggesting significant optimization potential for advanced search systems. +- [Optimizing Memory for Bulk Uploads](https://qdrant.tech/articles/indexing-optimization/): Efficient memory management during bulk ingestion in Qdrant can be achieved by temporarily disabling dense vector indexing (`m=0`), deferring HNSW graph construction, and enabling on-disk storage (`on_disk: true`) for vectors and indexes to reduce RAM usage. These practices help avoid out-of-memory errors and maintain performance, particularly in large-scale datasets or memory-constrained environments. +- [Introducing Gridstore: Qdrant's Custom Key-Value Store](https://qdrant.tech/articles/gridstore-key-value-storage/): Qdrant built its own storage engine, Gridstore, after encountering limitations with RocksDB, such as compaction-induced latency and configuration complexity, as well as a need for tailored features like efficient handling of sequential keys and variable-sized data. Gridstore's architecture consists of three layers—Data, Mask, and Gaps—that optimize fast retrieval, space reuse, and scalable updates, while ensuring data integrity through a Write-Ahead Log for crash recovery. +- [What is Agentic RAG? Building Agents with Qdrant](https://qdrant.tech/articles/agentic-rag/): Standard Retrieval-Augmented Generation (RAG) follows a linear flow of retrieving documents and generating responses, while Agentic RAG integrates agents, which use large language models (LLMs) to make decisions and take non-linear, multi-step actions, such as querying databases, improving queries, or self-correcting. 
While tools like LangGraph facilitate building such agentic systems by enabling cyclic workflows and decision-making, their practical applications often depend on use cases, with limitations like higher cost and latency compared to simpler RAG systems. +- [Hybrid Search Revamped - Building with Qdrant's Query API](https://qdrant.tech/articles/hybrid-search/): Qdrant 1.10 introduces a powerful new Query API that simplifies building hybrid search systems by enabling server-side integration of dense, sparse, and multivector search strategies, as well as supporting complex search pipelines and late interaction models like ColBERT for efficient reranking. Unlike simple linear combinations of scores, Qdrant employs advanced methods such as Reciprocal Rank Fusion and layered reranking to improve retrieval quality, emphasizing data-driven experimentation and effectiveness evaluation. +- [What is RAG: Understanding Retrieval-Augmented Generation](https://qdrant.tech/articles/what-is-rag-in-ai/): Retrieval-augmented generation (RAG) enhances Large Language Models (LLMs) by integrating external information retrieval, enabling them to access and generate responses based on relevant data from a vector database rather than relying solely on pre-trained knowledge. The RAG architecture includes a "retriever" using vector similarity techniques (dense or sparse) to identify relevant data and a "generator" to create accurate, non-hallucinated outputs, with hybrid search approaches often improving results further. +- [BM42: New Baseline for Hybrid Search](https://qdrant.tech/articles/bm42/): BM25 has remained a robust and widely used search algorithm due to its simplicity and focus on term importance within a corpus (via IDF), though its limitations in handling modern Retrieval-Augmented Generation (RAG) use cases with shorter, fixed-length documents have reduced its effectiveness. Despite alternatives like SPLADE showing promise, challenges such as inefficient token expansion, domain dependency, and high computational costs persist, motivating efforts like Qdrant's approach to combine BM25's interpretability with transformer-based semantic insights. +- [Qdrant 1.8.0: Enhanced Search Capabilities for Better Results](https://qdrant.tech/articles/qdrant-1.8.x/): Qdrant 1.8.0 introduces significant performance enhancements, including up to 16x faster hybrid search with sparse vectors, dynamic CPU resource allocation for optimized indexing, and improved text data indexing that reduces RAM usage by 10%. Additional features include ordering search results by metadata, datetime payload support, collection existence checks, and enhanced nested field modification capabilities, offering a more efficient and scalable search experience. +- [Optimizing RAG Through an Evaluation-Based Methodology](https://qdrant.tech/articles/rapid-rag-optimization-with-qdrant-and-quotient/): AI, particularly through Retrieval Augmented Generation (RAG) methods that integrate vector databases like Qdrant and evaluation tools like Quotient, significantly improves knowledge management by enhancing the accuracy, relevance, and reliability of generated responses in organizational contexts. Key findings show that optimizing document retrieval strategies, adapting to query needs, and careful evaluation of LLM outputs are essential to reducing hallucinations and ensuring effective RAG solutions. +- [Is RAG Dead? 
The Role of Vector Databases in Vector Search | Qdrant](https://qdrant.tech/articles/rag-is-dead/): Despite advances like larger LLM context windows (e.g., Gemini 1.5’s 10M tokens), Retrieval Augmented Generation (RAG) and vector databases remain essential for efficiency, accuracy, and cost-effectiveness, especially in enterprise applications. Relying solely on increased context is computationally expensive and less precise, whereas vector search offers faster, cheaper, and more targeted information retrieval, making it indispensable for scalable AI systems. +- [Optimizing OpenAI Embeddings: Enhance Efficiency with Qdrant's Binary Quantization](https://qdrant.tech/articles/binary-quantization-openai/): OpenAI Ada-003 embeddings provide advanced performance for NLP tasks but are large and resource-intensive; the article details how Qdrant's Binary Quantization addresses this by significantly reducing storage requirements and accelerating real-time search while maintaining strong accuracy. Experimental results demonstrate that features like rescoring further boost accuracy—especially for higher-dimensional models—making binary quantization a practical solution for efficient and scalable use of OpenAI embeddings in real-world applications. +- [How to Implement Multitenancy and Custom Sharding in Qdrant](https://qdrant.tech/articles/multitenancy/): Qdrant enables scalable machine learning deployments by supporting multitenancy and custom sharding, allowing data from multiple customers to be efficiently isolated and managed within a single cluster. These features improve performance, reduce costs, enable precise data placement (such as by region or time), and provide multiple levels of data isolation, making Qdrant suitable for complex, compliance-sensitive, and large-scale applications. +- [Data Privacy with Qdrant: Implementing Role-Based Access Control (RBAC)](https://qdrant.tech/articles/data-privacy/): Vector databases often contain sensitive proprietary data and are subject to strict compliance regulations, making robust security measures—such as encryption, role-based access control, and deployment flexibility—essential to prevent data breaches and meet legal requirements. Qdrant addresses these needs by offering features like API key and JWT-based authentication, TLS encryption, and granular access control, enabling enterprises to build secure, compliant, and data-sovereign vector search solutions. +- [Discovery needs context](https://qdrant.tech/articles/discovery-search/): Discovery search in Qdrant 1.7 enables more controlled exploration of vector spaces by allowing users to define contextual boundaries using pairs of positive and negative vectors, either with or without a specific target point. This approach supports advanced search and filtering applications—like refining image or recommendation results—by partitioning the search space and incorporating positive/negative feedback dynamically. +- [What are Vector Embeddings? - Revolutionize Your Search Experience](https://qdrant.tech/articles/what-are-embeddings/): Embeddings are numerical vector representations created by deep learning models to capture the meaning and context of complex data like text, images, or audio, enabling efficient processing, search, and personalized recommendations by measuring semantic similarity within high-dimensional spaces.
Advanced models such as BERT and GPT generate context-aware embeddings for nuanced understanding, and these vectors can be integrated with APIs for applications like search, recommendations, and retrieval augmented generation. +- [What is a Sparse Vector? How to Achieve Vector-based Hybrid Search](https://qdrant.tech/articles/sparse-vectors/): Sparse vectors represent documents by assigning non-zero weights only to select tokens (words or subwords), making them efficient and interpretable for tasks like search and ranking, whereas dense vectors assign non-zero values to all elements, capturing more nuanced relationships but at a higher computational cost. Methods like SPLADE use neural networks to improve upon traditional sparse models like BM25, enabling better handling of specialized terms and large datasets, though sparse vectors still struggle with capturing deeper semantic relationships compared to dense embeddings. +- [Qdrant 1.7.0 has just landed!](https://qdrant.tech/articles/qdrant-1.7.x/): Qdrant 1.7.0 introduces major new features including support for sparse vectors for keyword-based search, a Discovery API for advanced vector exploration, user-defined sharding for more flexible data organization, and snapshot-based shard transfer for efficient cluster scaling. The release also includes various minor improvements and bug fixes to enhance overall performance. +- [Deliver Better Recommendations with Qdrant’s new API](https://qdrant.tech/articles/new-recommendation-api/): Qdrant 1.6 introduces a more flexible Recommendation API, enabling users to mix IDs and vector embeddings as positive and negative examples, and to choose between the traditional `average_vector` and the new `best_score` strategies for tailored recommendations. The update enhances semantic and recommendation search capabilities by allowing more control over how vectors are combined and how candidate points are evaluated during approximate nearest neighbor searches using the HNSW graph structure. +- [Vector Search as a dedicated service](https://qdrant.tech/articles/dedicated-service/): The article discusses the debate over whether to use specialized vector databases or existing database plugins for storing embeddings in LLM applications, highlighting that vector databases (better termed "search engines") are architecturally distinct for scalability and search speed. It argues for dedicated vector solutions over integrated plugins, debunking misconceptions about data duplication, synchronization complexity, costs, and maintenance, and emphasizing the flexibility, performance, and manageability advantages of specialized vector search engines. +- [FastEmbed: Qdrant's Efficient Python Library for Embedding Generation](https://qdrant.tech/articles/fastembed/): FastEmbed is a Python library designed to simplify and accelerate the creation of text embeddings by providing easy-to-use workflows, quantized models, minimal dependencies, and ONNXRuntime integration, making it faster and more resource-efficient than traditional frameworks like PyTorch Transformers. It supports a select set of high-quality, quantized transformer models optimized for CPU usage, offering significant speed and installation advantages for production NLP applications. 
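The "Optimizing Memory for Bulk Uploads" entry above mentions disabling HNSW construction (`m=0`) and keeping vectors on disk during ingestion. A hedged sketch of what that configuration might look like with qdrant-client follows; the collection name, vector size, and `m=16` value are placeholders, not prescriptions from the linked article.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Placeholder collection; sizes and thresholds are illustrative.
client.create_collection(
    collection_name="bulk_demo",
    vectors_config=models.VectorParams(
        size=768,
        distance=models.Distance.COSINE,
        on_disk=True,  # keep raw vectors on disk during ingestion
    ),
    hnsw_config=models.HnswConfigDiff(m=0),  # defer HNSW graph construction
)

# ... bulk upsert happens here ...

# Re-enable dense indexing once ingestion is complete.
client.update_collection(
    collection_name="bulk_demo",
    hnsw_config=models.HnswConfigDiff(m=16),
)
```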
+- [Google Summer of Code 2023 - Polygon Geo Filter for Qdrant Vector Database](https://qdrant.tech/articles/geo-polygon-filter-gsoc/): As a Google Summer of Code 2023 participant, Zein Wen worked on enhancing Qdrant's geo filtering capabilities by adding a Geo Polygon Filter, allowing users to refine vector database queries with complex geographic boundaries for greater flexibility and real-world applicability. The project involved addressing technical challenges in geometry computation and API design, focusing on efficiency, usability, and user experience, while providing valuable personal growth and open-source collaboration experience. +- [Binary Quantization - Vector Search, 40x Faster](https://qdrant.tech/articles/binary-quantization/): Qdrant’s new binary quantization (BQ) feature significantly reduces memory usage and boosts retrieval speeds—up to 40x—by converting high-dimensional vectors into binary embeddings, with recall accuracy customizable during search. While BQ is ideal for large vectors (over 1024 dimensions), its compression is most effective at scale, and it supports a flexible trade-off between speed and accuracy with minimal user intervention. +- [Food Discovery Demo](https://qdrant.tech/articles/food-discovery-demo/): Qdrant's open-source Food Discovery Demo uses semantic image search powered by the CLIP model and the Qdrant vector database to help users explore food options even without explicit queries, offering features like random dish suggestions, text-based search, and personalized recommendations through user feedback (including negative-only feedback). The system's architecture includes a FastAPI backend, React frontend, and supports diverse search modes on a dataset of over 2M dishes, with CLIP embeddings enabling effective cross-modal search and feedback-driven result refinement. +- [Google Summer of Code 2023 - Web UI for Visualization and Exploration](https://qdrant.tech/articles/web-ui-gsoc/): Kartik Gupta participated in Google Summer of Code 2023, where he developed a user-friendly web-based UI for Qdrant, a vector search engine, completing six major milestones including UI design, data exploration, and advanced query features while overcoming technical and project management challenges. His experience deepened his skills in vector data visualization, optimization techniques like web workers, and collaborative problem-solving, and highlighted areas for future improvement such as enhanced autocomplete and expanded data visualization. +- [Qdrant Summer of Code 2024 - WASM based Dimension Reduction](https://qdrant.tech/articles/dimension-reduction-qsoc/): Jishan Bhattacharya interned at Qdrant, focusing on optimizing high-dimensional vector visualization by rewriting the t-SNE algorithm in Rust with WebAssembly, introducing multi-threading, and implementing the Barnes-Hut approximation for significant speed improvements. Key challenges included handling large datasets, optimizing data transfer and rendering, and future improvements were identified, such as faster payload parsing, more efficient data handling, and adopting a WebGL-based chart library. +- [Semantic Search As You Type](https://qdrant.tech/articles/search-as-you-type/): The author optimized semantic search-as-you-type for their website by switching from a Python-based Qdrant implementation to one in Rust, using ONNX Runtime for embeddings and a prefix cache to greatly accelerate short queries—achieving more than a 10x speedup in some cases. 
Further quality improvements include prioritized, parallelized searches for exact matches and semantic relevance, with batch requests for efficiency and deduplication, resulting in a faster and more responsive search experience. +- [Vector Similarity: Going Beyond Full-Text Search | Qdrant](https://qdrant.tech/articles/vector-similarity-beyond-search/): Vector similarity search enables advanced data exploration beyond traditional full-text search by allowing semantic comparisons, cross-modal retrieval, dissimilarity and diversity searches, outlier detection, and enhanced recommendations, functions not possible or limited in keyword-based search. Vector databases leverage these capabilities to unlock new insights from unstructured data, though they require dedicated design patterns and should be used distinctly from full-text search for optimal results. +- [Serverless Semantic Search](https://qdrant.tech/articles/serverless/): This guide demonstrates how to create a free, non-commercial semantic search engine for your website or app by combining Rust, AWS Lambda, an embedding provider (like Cohere), and a Qdrant instance, with step-by-step setup instructions for each tool and service. It covers deploying a Rust-based Lambda function, safely managing API keys for embedding services, and integrating all components to achieve semantic search functionality. +- [Introducing Qdrant 1.3.0](https://qdrant.tech/articles/qdrant-1.3.x/): Qdrant 1.3.0 introduces major updates including an asynchronous I/O interface for faster disk operations, oversampling for improved quantization accuracy, a grouping API for cross-collection lookups, a new Web UI dashboard, and support for specifying a temporary directory for snapshots. These enhancements focus on boosting performance, storage efficiency, and user experience, while addressing key user feature requests. +- [Qdrant under the hood: io_uring](https://qdrant.tech/articles/io_uring/): Qdrant version 1.3.0 adds an io_uring-based async storage backend on Linux, significantly improving disk IO throughput and reducing CPU overhead compared to traditional mmap-based IO, especially for large datasets and quantization workloads. Benchmark results show a dramatic reduction in processing time (e.g., from 43s to 12s) and higher IOPS using io_uring, demonstrating its effectiveness for high-performance, IO-bound scenarios on modern Linux systems. +- [Product Quantization in Vector Search | Qdrant](https://qdrant.tech/articles/product-quantization/): Product Quantization, introduced in Qdrant 1.2.0, is a customizable technique that significantly reduces memory usage in vector search by dividing vectors into chunks, clustering them, and storing centroid IDs, achieving higher compression rates than Scalar Quantization at the cost of some search precision and potentially slower indexing/search times. It is particularly beneficial for low-RAM environments or high-dimensional vectors, but Scalar Quantization may be preferable when accuracy and speed are more critical. +- [Scalar Quantization: Background, Practices & More | Qdrant](https://qdrant.tech/articles/scalar-quantization/): Scalar quantization compresses high-dimensional vector embeddings by converting float32 values to int8, significantly reducing memory usage (by up to 75%) and improving performance with minimal loss in search precision. Benchmarks show that scalar quantization in Qdrant yields reduced search latency and indexing time, making it highly effective for large-scale vector databases. 
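Several of the quantization entries above (scalar, binary, product) boil down to a `quantization_config` on the collection. As an illustrative example only, int8 scalar quantization could be enabled like this; the collection name, vector size, and parameter values are placeholders rather than recommendations from the linked articles.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Illustrative collection with int8 scalar quantization enabled.
client.create_collection(
    collection_name="quantized_demo",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    quantization_config=models.ScalarQuantization(
        scalar=models.ScalarQuantizationConfig(
            type=models.ScalarType.INT8,
            quantile=0.99,    # clip outliers before quantizing
            always_ram=True,  # keep the compressed vectors in RAM
        )
    ),
)
```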
+- [On Unstructured Data, Vector Databases, New AI Age, and Our Seed Round.](https://qdrant.tech/articles/seed-round/): Qdrant, an open-source vector database company, has raised $7.5M in seed funding led by Unusual Ventures to advance its high-performance solutions for managing unstructured data and powering AI applications. Their focus is on making billion-scale vector search affordable and accessible, leveraging innovations like Scalar Quantization and a robust technical stack to serve modern AI use cases, with strong support from the developer community. +- [Using LangChain for Question Answering with Qdrant](https://qdrant.tech/articles/langchain-integration/): LangChain's integration with Qdrant enables streamlined question answering by combining semantic search—using vector embeddings stored in Qdrant—with large language models that generate answers from retrieved context, all implemented in just a few lines of code. This approach leverages two models (one for embeddings, one for language generation) and simplifies complex application pipelines, making it easy to build, extend, and maintain QA systems. +- [Minimal RAM you need to serve a million vectors](https://qdrant.tech/articles/memory-consumption/): Measuring memory usage with tools like `htop` can be inaccurate due to memory pre-allocation, unfreed deallocations, shared memory in forked processes, and disk cache, so the best way to determine actual RAM requirements is to experimentally limit process memory and observe functionality. Benchmarks with Qdrant show that, depending on configuration, serving 1 million vectors may require as little as 1.2GB RAM in-memory, or down to 135MB using memory-mapped files—with the trade-off of slower search speed as memory allocation decreases. +- [Question Answering as a Service with Cohere and Qdrant](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/): Bi-encoders efficiently power semantic Question Answering systems by using the same neural model to generate vector embeddings for both questions and answers, enabling similarity-based retrieval via tools like Cohere’s co.embed API and Qdrant’s vector database without maintaining your own infrastructure. This setup streamlines deploying scalable semantic search, and can be readily implemented using public datasets (like pubmed_qa) and SaaS solutions for embedding generation and vector storage. +- [Introducing Qdrant 1.2.x](https://qdrant.tech/articles/qdrant-1.2.x/): Qdrant 1.2 introduces significant new features including Product Quantization for up to 64x memory reduction, optional named vectors, server-side grouping requests, and advanced nested filters, alongside important changes like recovery mode, appendable mmap, and enhanced security with API key and TLS support. The update also encourages user feedback and detailed documentation is available for all these enhancements. +- [Finding errors in datasets with Similarity Search](https://qdrant.tech/articles/dataset-quality/): Modern applications rely on accurate data categorization, but manual and model-based approaches to labeling are prone to costly errors that affect user experience and business outcomes. Distance-based methods like similarity and diversity search, which use embeddings to identify outliers or misplaced items, can help improve dataset quality, especially in contexts like online furniture marketplaces, though combining multiple techniques may be necessary for best results. 
+- [Q&A with Similarity Learning](https://qdrant.tech/articles/faq-question-answering/): Traditional classification methods for question-answering systems require extensive data labeling and frequent retraining, making them inefficient for dynamic tasks like customer support automation. Instead, similarity learning with embeddings—further improved through fine-tuning using frameworks like Quaterion—offers a more flexible and scalable solution by matching questions and answers based on vector similarity rather than fixed classes. +- [Why Rust?](https://qdrant.tech/articles/why-rust/): Qdrant is built in Rust due to its superior memory safety, performance, and control compared to languages like Java, Scala, Python, and Go, resulting in efficient, robust, and maintainable cloud services. Rust's strict type system minimizes bugs, its growing ecosystem and community support innovation, and its adoption by major tech companies further validates Qdrant’s technology choice. +- [Layer Recycling and Fine-tuning Efficiency](https://qdrant.tech/articles/embedding-recycler/): Layer recycling, which involves caching outputs from frozen layers to avoid redundant computation during training and inference, can achieve significant speedups (~83%) with minimal performance loss, but its effectiveness and optimal percentage of layers to recycle are task-dependent. Experiments with Quaterion show that recycling 50% of layers often closely matches full fine-tuning performance, though results vary with task type and dataset size. +- [Fine Tuning Similar Cars Search](https://qdrant.tech/articles/cars-recognition/): Supervised classification is not suitable for tasks where class categories vary, are incomplete, or cannot be fully enumerated, which can be addressed by similarity learning, though it introduces challenges like larger batch sizes, complex loss functions, and different training/inference architectures. The Quaterion framework, built on PyTorch Lightning, facilitates similarity learning with modules for model training and inference, built-in loss functions, dataset handling, and an efficient workflow, as exemplified in a tutorial using the Stanford Cars dataset with novel-class generalization. +- [Metric Learning Tips & Tricks](https://qdrant.tech/articles/metric-learning-tips/): The article explains how metric learning, specifically the representation-based approach, can be used to train object matching models without labeled data by leveraging relative similarities and self-supervised datasets, enabling more scalable and flexible solutions than traditional classification—especially when dealing with a large or growing number of classes. Key methods include using embeddings, vector databases, and hard negative mining to effectively match objects (e.g., job positions and candidates) in production environments without manual labeling. +- [Metric Learning for Anomaly Detection](https://qdrant.tech/articles/detecting-coffee-anomalies/): Anomaly detection is challenging due to scarce and evolving data, making traditional supervised approaches costly and hard to maintain; however, a metric learning approach leveraging autoencoders and as few as 200 labeled samples achieved performance comparable to supervised classification trained on ~30,000 samples, offering a more efficient and scalable solution. This method allows for easier adaptation, reduced labeling requirements, and practical production deployment using vector search engines. 
+- [Triplet Loss - Advanced Intro](https://qdrant.tech/articles/triplet-loss/): Triplet Loss, introduced in the FaceNet paper, is a supervised metric learning loss that ensures an anchor sample is closer to a positive (same label) than to a negative (different label) by at least a margin, and is more tolerant of intra-class variance compared to Contrastive Loss. Efficient training with Triplet Loss relies on strategies like online triplet mining and computational tricks such as distance matrices and masking to select useful triplets and speed up calculations. +- [Neural Search 101: A Complete Guide and Step-by-Step Tutorial](https://qdrant.tech/articles/neural-search-tutorial/): Neural search uses neural networks to generate vector embeddings of queries and documents, enabling searches based on meaning rather than just keywords, which improves search performance for fuzzy or imprecise queries such as images, audio, or long text. The guide explains how to build a neural search system step-by-step, using pre-trained models for embedding (like sentence-transformers), and a vector search engine (like Qdrant) to store and retrieve relevant items. +- [Filtrable HNSW](https://qdrant.tech/articles/filtrable-hnsw/): Standard vector similarity libraries like Annoy, FAISS, and NMSLib offer fast approximate nearest neighbor search but have limited options for incorporating constraints during search. By modifying the HNSW algorithm to apply filters—such as categorical, numerical, or geographical constraints—during the search phase, it is possible to maintain graph connectivity and efficient search performance, with specific strategies suggested for each filtering type. +- [Introducing Qdrant 0.11](https://qdrant.tech/articles/qdrant-0-11-release/): Qdrant v0.11 introduces key features such as replication support for high availability and scalability, an administration API to control write operations, and an exact search parameter for improved validation, while maintaining backward compatibility for single node deployments but not for distributed setups. These enhancements focus on performance, reliability, and management capabilities. +- [Qdrant 0.10 released](https://qdrant.tech/articles/qdrant-0-10-release/): Qdrant 0.10 introduces major performance improvements and key features including support for storing multiple vectors per object in a single collection, batch vector search via a single API call, built-in ARM support with optimized Docker images, and new full-text filtering capabilities alongside keyword filters. These enhancements simplify multi-vector management, reduce network overhead, broaden platform compatibility, and improve search flexibility. +- [Vector Search in constant time](https://qdrant.tech/articles/quantum-quantization/): Quantum quantization leverages quantum computing to convert float32 vectors into entangled qbit vectors, enabling highly efficient vector search in artificial neural networks. By applying Grover's algorithm and a technique called transposition, this approach allows vector search on arbitrarily large databases to be performed in constant time. +- [How to choose an embedding model](https://qdrant.tech/articles/how-to-choose-an-embedding-model/): Choosing the right embedding model for vector search depends on multiple factors including search quality, language coverage, resource requirements, tokenizer compatibility, model size, sequence length, and support for optimization techniques, with no universal best model for all scenarios. 
The article emphasizes the importance of evaluating models on domain-specific data and tasks, understanding tokenizer effects, and building relevant ground truth datasets to make informed decisions. +- [Vector Search in Production](https://qdrant.tech/articles/vector-search-production/): Running vector search in production requires careful management of system resources (especially memory), optimizing indexing and search parameters, and leveraging strategies like data quantization to maintain high performance and reliability under real-world loads. Default configurations are often inadequate, so tuning settings, indexing relevant metadata, balancing speed and accuracy, and properly offloading or compressing data are essential for a resilient and efficient search infrastructure. +- [Semantic Cache: Accelerating AI with Lightning-Fast Data Retrieval](https://qdrant.tech/articles/semantic-cache-ai-data-retrieval/): Semantic cache is an AI optimization technique that stores answers to semantically similar queries—rather than just exact matches—to improve retrieval speed and reduce computational costs, especially in applications like Retrieval-Augmented Generation (RAG) systems. By leveraging vector databases like Qdrant, semantic cache retrieves responses for questions with similar meaning (even if phrased differently), providing scalability and cost-efficiency for question-answering and chatbot applications. +- [Full-text filter and index are already available!](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/): Qdrant, an efficient vector database, now supports full-text filters in addition to keyword filtering, enabling more advanced search capabilities that can be combined with other filter types. Using a full-text index on filtered fields significantly improves query performance, offers customizable text tokenization options, and allows case-insensitive searches, making frequent queries more efficient. +- [Optimizing Semantic Search by Managing Multiple Vectors](https://qdrant.tech/articles/storing-multiple-vectors-per-object-in-qdrant/): Qdrant 0.10 introduced support for storing multiple vectors per object within a single collection, enabling efficient semantic search across various data types (e.g., image and text) without duplicating payloads. By configuring named vectors and using pretrained models to generate embeddings, users can flexibly search using different vector types for richer, more customizable search results. +- [Mastering Batch Search for Vector Optimization](https://qdrant.tech/articles/batch-vector-search-with-qdrant/): Qdrant 0.10.0 introduces a batch search feature that allows multiple vector searches in a single API call, greatly reducing network overhead and simplifying application code. Benchmarks showed that using batch search and multiprocessing together can speed up vector search operations by over 30% compared to sequential requests. +- [Vultr and Qdrant Hybrid Cloud Support Next-Gen AI Projects](https://qdrant.tech/blog/hybrid-cloud-vultr/): Qdrant and Vultr have partnered to offer a flexible, scalable, and secure deployment of Qdrant Hybrid Cloud on Vultr's global infrastructure, enabling rapid and customizable development of AI and ML applications with seamless vector search and centralized management. This integration streamlines deployment, optimizes performance and costs, and ensures data privacy while supporting global reach and compliance. 
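The "Mastering Batch Search" entry above describes issuing several searches in one API call. A minimal sketch, assuming qdrant-client 1.10+ where batched queries go through `query_batch_points`; the collection name and query vectors are placeholders.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Two searches issued in a single request against a hypothetical "demo" collection.
results = client.query_batch_points(
    collection_name="demo",
    requests=[
        models.QueryRequest(query=[0.1, 0.2, 0.3, 0.4], limit=3, with_payload=True),
        models.QueryRequest(query=[0.9, 0.1, 0.1, 0.2], limit=3, with_payload=True),
    ],
)
```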
+- [STACKIT and Qdrant Hybrid Cloud for Best Data Privacy](https://qdrant.tech/blog/hybrid-cloud-stackit/): Qdrant and STACKIT have launched Qdrant Hybrid Cloud, enabling developers to deploy a fully managed, scalable vector database on German data centers via STACKIT, ensuring full data privacy, GDPR compliance, and seamless integration. This solution empowers businesses to build secure, AI-driven applications—such as contract management platforms—while maintaining complete control over their sensitive data. +- [Qdrant Hybrid Cloud and Scaleway Empower GenAI](https://qdrant.tech/blog/hybrid-cloud-scaleway/): Qdrant and Scaleway have partnered to launch Qdrant Hybrid Cloud, a fully managed, scalable vector database that enables developers—especially in startups and Europe—to easily deploy advanced AI applications like RAG on Scaleway's secure and sustainable infrastructure while maintaining data sovereignty. This integration offers rapid setup, intuitive management, and AI-focused resources, making scalable vector search and modern AI workloads accessible and efficient for a broad range of users. +- [Red Hat OpenShift and Qdrant Hybrid Cloud Offer Seamless and Scalable AI](https://qdrant.tech/blog/hybrid-cloud-red-hat-openshift/): Qdrant has partnered with Red Hat to make its Hybrid Cloud vector database available on Red Hat OpenShift, allowing enterprises to easily deploy, scale, and securely manage AI-powered vector search workloads across hybrid cloud environments while maintaining full data sovereignty. This integration leverages OpenShift’s scalability, automation, and security features to simplify AI infrastructure for use cases like retrieval augmented generation and recommendation systems. +- [Qdrant and OVHcloud Bring Vector Search to All Enterprises](https://qdrant.tech/blog/hybrid-cloud-ovhcloud/): Qdrant Hybrid Cloud is now available as a fully managed vector database on OVHcloud, enabling European businesses to deploy AI-driven solutions quickly and securely within their existing, GDPR-compliant infrastructure while maintaining data sovereignty. The seamless integration offers simple setup, strong security, open ecosystem compatibility, and cost-efficient performance, with a tutorial provided to demonstrate building scalable recommendation systems entirely within OVHcloud. +- [New RAG Horizons with Qdrant Hybrid Cloud and LlamaIndex](https://qdrant.tech/blog/hybrid-cloud-llamaindex/): LlamaIndex and Qdrant have partnered to launch Qdrant Hybrid Cloud, enabling engineers and scientists to securely and flexibly develop, scale, and deploy advanced GenAI applications using Kubernetes-based architecture and robust vector search capabilities. This integration allows seamless, open-source, and scalable deployments, supports advanced hybrid semantic search, and includes comprehensive resources such as tutorials for building context-augmented AI solutions like document retrieval systems. +- [Developing Advanced RAG Systems with Qdrant Hybrid Cloud and LangChain](https://qdrant.tech/blog/hybrid-cloud-langchain/): LangChain and Qdrant have launched Qdrant Hybrid Cloud, a Kubernetes-based solution that enables secure, scalable deployment of GenAI applications with advanced vector search, seamless integration, and robust RAG (Retrieval-Augmented Generation) capabilities in any environment. 
This collaboration empowers organizations to efficiently build and deploy AI products, including complex QA systems, with open-source compatibility, enterprise features, and comprehensive documentation and tutorials. +- [Cutting-Edge GenAI with Jina AI and Qdrant Hybrid Cloud](https://qdrant.tech/blog/hybrid-cloud-jinaai/): Qdrant and Jina AI have partnered to launch Qdrant Hybrid Cloud, a Kubernetes-native solution that integrates Jina AI's advanced embedding models and APIs to enable rapid, flexible, and secure deployment of scalable Generative AI applications, particularly for Retrieval Augmented Generation (RAG) use cases. This collaboration provides seamless deployment, scalable and secure vector search, and cost efficiency, with comprehensive tutorials and documentation to help users quickly build and deploy AI-powered solutions. +- [Qdrant Hybrid Cloud and Haystack for Enterprise RAG](https://qdrant.tech/blog/hybrid-cloud-haystack/): Qdrant and Haystack have expanded their integration with the launch of Qdrant Hybrid Cloud, enabling developers to easily deploy production-ready, customizable Retrieval-Augmented Generation (RAG) AI applications with full data control in any environment. Their open-source, modular tools—now optimized with Haystack 2.0 and features like Hayhooks—simplify building, customizing, and deploying enterprise-grade AI solutions, with comprehensive tutorials and documentation available for rapid adoption. +- [Qdrant Hybrid Cloud and DigitalOcean for Scalable and Secure AI Solutions](https://qdrant.tech/blog/hybrid-cloud-digitalocean/): DigitalOcean and Qdrant have integrated to offer a managed vector database solution—Qdrant Hybrid Cloud—on DigitalOcean Kubernetes, enabling developers to efficiently deploy advanced AI applications while maintaining data privacy and control within their own infrastructure. This integration streamlines deployment, enhances flexibility, and makes it easy to build scalable AI-powered services such as semantic search and recommendation systems. +- [Enhance AI Data Sovereignty with Aleph Alpha and Qdrant Hybrid Cloud](https://qdrant.tech/blog/hybrid-cloud-aleph-alpha/): Aleph Alpha and Qdrant have partnered to offer a hybrid cloud vector database solution that prioritizes data sovereignty, security, and compliance—particularly for European enterprises—enabling organizations to securely deploy scalable, AI-powered applications within their own infrastructure. Their joint platform integrates advanced AI models, seamless scalability, and regional regulatory alignment, with resources and tutorials available to help businesses build secure, region-specific AI systems. +- [Elevate Your Data With Airbyte and Qdrant Hybrid Cloud](https://qdrant.tech/blog/hybrid-cloud-airbyte/): Airbyte and Qdrant have partnered to launch Qdrant Hybrid Cloud, the first managed vector database deployable on any environment, integrating Airbyte's data ingestion capabilities with Qdrant’s advanced indexing and search for scalable, private, and flexible AI data infrastructure. This collaboration enables seamless, up-to-date data pipelines for advanced GenAI and Retrieval Augmented Generation (RAG) applications, offering enterprise users flexible deployment, cost control, and open-source compatibility. 
+- [How PortfolioMind Delivered Real-Time Crypto Intelligence with Qdrant](https://qdrant.tech/blog/case-study-portfolio-mind/): PortfolioMind leveraged Qdrant’s multivector search capabilities to transform real-time user interactions and diverse crypto data into dynamic, personalized research insights, resulting in a 70% reduction in query latency, a 58% increase in interaction relevance, and a 22% improvement in user retention. Future enhancements include mapping cross-user interests, tracking changes in curiosity over time, and optimizing onboarding for new users. +- [Qdrant Edge: Vector Search for Embedded AI](https://qdrant.tech/blog/qdrant-edge/): Qdrant Edge is a lightweight, embedded vector search engine designed for on-device AI applications, enabling low-latency, multimodal search and retrieval in resource-constrained or offline environments such as robotics, mobile devices, and IoT agents. Now in private beta, it offers core Qdrant features re-architected for edge deployments, supporting local inference and decision-making where traditional cloud-based vector stores are unsuitable. +- [Qdrant for Research: The Story Behind ETH & Stanford’s MIRIAD Dataset](https://qdrant.tech/blog/miriad-qdrant/): Researchers from ETH Zurich and Stanford released MIRIAD, an open-source dataset containing 5.8 million high-quality, literature-grounded medical question-answer pairs designed to improve the reliability and reduce hallucinations in medical AI applications by serving as a structured knowledge base for Retrieval Augmented Generation (RAG). Built from 900,000 curated medical papers using a multi-stage filtering process and leveraging Qdrant for scalable retrieval, MIRIAD demonstrated significant improvements in benchmark accuracy and hallucination detection, and is freely available for the community with interactive exploration tools. +- [Qdrant 1.15 - Smarter Quantization & better Text Filtering](https://qdrant.tech/blog/qdrant-1.15.x/): Qdrant 1.15.0 introduces new 1.5- and 2-bit quantization modes and asymmetric quantization for improved vector compression and search accuracy, along with major text index upgrades including multilingual tokenization, configurable stopwords, stemming, and phrase matching. Additional optimizations include HNSW healing for faster index rebuilding and migration to Gridstore for increased data ingestion speed. +- [Qdrant joins AI Agent category on AWS Marketplace to accelerate Agentic AI development](https://qdrant.tech/blog/ai-agents-aws-marketplace/): Qdrant’s high-performance vector search engine is now available in the new AWS Marketplace AI Agents and Tools category, enabling customers to easily discover, purchase, and deploy enterprise AI agent solutions—such as real-time, step-aware reasoning and context retrieval—directly within their AWS environments. This integration streamlines procurement, enhances agent performance with advanced vector-native capabilities, and offers both managed SaaS and hybrid deployment options for scalability and compliance. +- [How &AI scaled global legal retrieval with Qdrant](https://qdrant.tech/blog/case-study-and-ai/): &AI uses Qdrant’s scalable vector database to power fast, accurate semantic search across billions of global patent documents, enabling legal professionals to efficiently invalidate patents and streamline litigation workflows. 
By focusing on retrieval over generation and leveraging cloud-native, multi-tenant infrastructure, &AI dramatically reduced DevOps overhead while delivering sub-second search performance and robust data privacy. +- [Introducing Qdrant Cloud Inference](https://qdrant.tech/blog/qdrant-cloud-inference-launch/): Qdrant Cloud Inference lets users generate, store, and index embeddings for text and images in a single API call within the Qdrant Cloud environment, eliminating the need for separate services and reducing complexity, latency, and costs for AI applications such as RAG, hybrid, and multimodal search. It supports multiple curated models, offers generous free monthly token allowances to paid users, is live on major cloud providers in US regions, and enables simplified, faster development with unified billing and easy activation. +- [Announcing Vector Space Day 2025 in Berlin](https://qdrant.tech/blog/vector-space-day-2025/): Vector Space Day 2025, hosted by Qdrant on September 26 in Berlin, is a full-day event for engineers, researchers, and AI builders to explore advancements in vector search, RAG pipelines, AI memory, and context engineering through talks, panels, workshops, and networking with industry leaders and emerging voices. The event features sessions on topics like vector databases, semantic search, agentic AI, and includes an after-party, with tickets available for €50 and participation from major organizations such as AWS and Microsoft. +- [How Pento modeled aesthetic taste with Qdrant](https://qdrant.tech/blog/case-study-pento/): The article describes a novel art recommendation system that uses Qdrant’s vector search and clustering (via HDBSCAN) to model individuals' evolving, multi-faceted artistic tastes, representing each user as a dynamic set of weighted clusters built from their artwork interactions. By embedding artworks and user preferences in semantic space, scoring clusters by recency and strength, and prioritizing the most representative clusters, the system connects users with others who share specific, nuanced tastes rather than simply suggesting popular items. +- [How Alhena AI unified its AI stack and improved ecommerce conversions with Qdrant](https://qdrant.tech/blog/case-study-alhena/): Alhena AI unified its previously fragmented vector search infrastructure by migrating to Qdrant Cloud, consolidating multiple backends into a single scalable, high-performance system. This enabled faster onboarding, reduced retrieval latency by 90%, improved ecommerce recommendations and conversions, simplified global deployments, and supported rapid scaling and innovation through features like multitenancy and hybrid search. +- [How GoodData turbocharged AI analytics with Qdrant](https://qdrant.tech/blog/case-study-gooddata/): GoodData has evolved from a traditional BI platform into an AI-powered, API-first analytics solution focused on accelerating insights and providing real-time, conversational data experiences, leveraging Qdrant’s scalable vector database for high-performance, multilingual semantic search and embedding updates. This transition enables fast, efficient AI-driven analytics at enterprise scale, positioning GoodData as a foundation for next-generation, embedded AI applications and advanced personalization features. 
+- [The Hitchhiker's Guide to Vector Search](https://qdrant.tech/blog/hitchhikers-guide/): Clelia, an Open Source Engineer at LlamaIndex, shares practical tips for building effective Retrieval Augmented Generation (RAG) pipelines, emphasizing the importance of clean text extraction, meaningful chunking, careful selection of dense and sparse embeddings (including hybrid search), and optimization strategies like semantic caching and binary quantization. The post draws from her hands-on experience to help improve the reliability, efficiency, and scalability of vector search-powered applications. +- [How FAZ unlocked 75 years of journalism with Qdrant](https://qdrant.tech/blog/case-study-faz/): Frankfurter Allgemeine Zeitung (FAZ) built a powerful semantic search system leveraging Qdrant and Azure OpenAI embeddings to make its 75-year archive of journalism deeply searchable, with robust support for rich metadata, rapid updates, and sub-second performance at scale. FAZ is now evolving toward a hybrid search architecture combining semantic, symbolic, and filtered queries to further enhance editorial research and user experience. +- [GraphRAG: How Lettria Unlocked 20% Accuracy Gains with Qdrant and Neo4j](https://qdrant.tech/blog/case-study-lettria-v2/): Lettria achieved a 20% increase in document retrieval accuracy for regulated industries by integrating Qdrant's vector search with Neo4j's knowledge graphs in a hybrid system, enabling more precise and explainable outputs than traditional vector-only RAG methods. Their solution addressed complex data ingestion, ontology generation, and transactional syncing challenges, ensuring reliability and consistency across both systems. +- [Vector Data Migration Tool](https://qdrant.tech/blog/beta-database-migration-tool/): Qdrant has released a beta Vector Data Migration Tool that enables easy, live batch migration of data between Qdrant instances or from other vector database providers with a single command, without needing node-specific snapshots. The tool supports flexible configuration changes during migration and can run as a container on any machine with access to the source and target databases. +- [How Lawme Scaled AI Legal Assistants and Significantly Cut Costs with Qdrant](https://qdrant.tech/blog/case-study-lawme/): Lawme.ai, a LegalTech startup, replaced their initial PGVector solution with Qdrant's vector database to better manage AI-driven legal workflows, resulting in a 75% reduction in infrastructure costs, faster query performance at scale, and enhanced compliance with strict data residency requirements. This migration enabled Lawme to confidently expand their AI legal assistant services while maintaining high security, operational simplicity, and meeting demanding client expectations. +- [How ConvoSearch Boosted Revenue for D2C Brands with Qdrant](https://qdrant.tech/blog/case-study-convosearch/): ConvoSearch, an AI-powered recommendation engine for e-commerce brands, switched from Pinecone to Qdrant for faster, more customizable vector search and robust metadata handling, reducing query latency from 50–100ms to 10ms and enabling real-time, hyper-personalized recommendations. This transition led to immediate business results, including a median 23–24% revenue uplift across clients and a 60% increase for The Closet Lover, while improving scalability and customer engagement. 
+- [LegalTech Builder's Guide: Navigating Strategic Decisions with Vector Search](https://qdrant.tech/blog/legal-tech-builders-guide/): LegalTech applications require highly accurate, nuanced search capabilities that traditional keyword methods cannot deliver, making vector search solutions like Qdrant essential for precision, scalability, and cost-efficiency. Qdrant addresses complex legal search needs with features like filterable HNSW, hybrid search, token-level reranking, domain-specific ranking logic, efficient scaling via GPU and quantization, and flexible deployment options for secure and responsive LegalTech development. +- [Qdrant Achieves SOC 2 Type II and HIPAA Certifications](https://qdrant.tech/blog/soc-2-type-ii-hipaa/): Qdrant has renewed its SOC 2 Type II certification and achieved HIPAA compliance, demonstrating a strong commitment to enterprise-grade security, confidentiality, and regulatory standards. These certifications, alongside advanced features like Single Sign-On, RBAC, and real-time monitoring, ensure the secure management of sensitive data for enterprises, including those in healthcare. +- [​​Introducing the Official Qdrant Node for n8n](https://qdrant.tech/blog/n8n-node/): Qdrant has released an official, fully-featured node for n8n, allowing users to integrate Qdrant’s advanced semantic and hybrid search capabilities directly into their n8n workflows without needing HTTP nodes, available for both cloud and self-hosted instances from version 1.94.0. This integration streamlines building robust solutions—such as RAG systems and agentic pipelines—by supporting features like batch upserts, hybrid queries, reranking, and customizable rescoring, with easy installation and ample community resources. +- [Qdrant + DataTalks.Club: Free 10-Week Course on LLM Applications](https://qdrant.tech/blog/datatalks-course/): A free 10-week online course, created in partnership with Alexey Grigorev and DataTalks.Club, teaches you how to build AI systems that answer questions about your knowledge base using LLMs, RAG, and vector search, with hands-on guidance from Qdrant experts. The course covers foundational and advanced topics, including semantic similarity search, hybrid and multi-vector search, reranking, and evaluation, and is open to everyone. +- [How Qovery Accelerated Developer Autonomy with Qdrant](https://qdrant.tech/blog/case-study-qovery/): Qovery integrated Qdrant’s high-performance vector database to power its AI-driven DevOps Copilot, enabling developers to autonomously manage complex infrastructure tasks in real time with minimal reliance on specialized DevOps expertise. This seamless integration accelerated deployment speeds, reduced operational overhead, and allowed Qovery to scale its services efficiently while focusing on continuous innovation. +- [How Tripadvisor Drives 2 to 3x More Revenue with Qdrant-Powered AI](https://qdrant.tech/blog/case-study-tripadvisor/): Tripadvisor is transforming its platform by leveraging Qdrant’s vector database and generative AI to turn its vast, unstructured user data into highly personalized, conversational travel experiences, such as the AI-powered Trip Planner. This shift has led to a 2–3x increase in revenue from users engaging with the AI features, and Tripadvisor plans to expand vector search and AI integration across its services. 
+- [Precision at Scale: How Aracor Accelerated Legal Due Diligence with Hybrid Vector Search](https://qdrant.tech/blog/case-study-aracor/): Aracor leveraged Qdrant’s scalable vector search technology to automate and dramatically accelerate legal due diligence workflows, achieving 90% faster processing, 70% reduced document turnaround time, and higher accuracy in legal data handling. Qdrant’s open-source, hybrid search capabilities enabled Aracor to efficiently process vast legal documents, deliver precise results, and continually innovate for complex, high-volume legal tasks. +- [How Garden Scaled Patent Intelligence with Qdrant](https://qdrant.tech/blog/case-study-garden-intel/): Garden, a startup revolutionizing patent analysis with large-scale AI, adopted Qdrant’s filterable vector search to efficiently manage and search hundreds of millions of patent vectors, enabling sub-100ms query latency, 10× cost reduction, and expansion of their addressable patent corpus from 20M to over 200M patents. This upgrade unlocked new business opportunities, such as high-confidence infringement detection, allowing clients to receive comprehensive analyses in minutes. +- [Exploring Qdrant Cloud Just Got Easier](https://qdrant.tech/blog/product-ui-changes/): Qdrant Cloud has introduced major user experience improvements, including streamlined login, effortless cluster creation with a free tier, enhanced cluster management tools (such as real-time monitoring, easy scaling, and improved UI), and a revamped Get Started page with comprehensive guides, tutorials, and community support to help users build and deploy vector search applications more easily. These updates aim to make the entire process—from initial setup to production deployment—simpler and more accessible for developers and teams. +- [How Pariti Doubled Its Fill Rate with Qdrant](https://qdrant.tech/blog/case-study-pariti/): Pariti transformed its talent-matching process by implementing Qdrant’s vector search, reducing résumé vetting time by 70%, boosting fill rates from 20% to 48%, and quadrupling candidate interview success—all while supporting rapid scalability and zero downtime. The new system enabled instant, accurate candidate shortlists from a 70,000-strong database, freeing analysts for higher-value work and positioning Pariti for further growth and client-facing innovation. +- [How Dust Scaled to 5,000+ Data Sources with Qdrant](https://qdrant.tech/blog/case-study-dust-v2/): Dust scaled its AI platform to handle over 5,000 data sources by migrating to Qdrant, leveraging features like multi-tenant collections and scalar quantization to dramatically improve search latency, reduce memory usage, and increase architectural flexibility. This overhaul resulted in a faster, more reliable, and cost-efficient system, enabling Dust’s AI agents to deliver consistently high performance even as data complexity and customer demand grew. +- [How SayOne Enhanced Government AI Services with Qdrant](https://qdrant.tech/blog/case-study-sayone/): SayOne switched from Pinecone to Qdrant for its government AI projects due to Qdrant’s superior performance, lower latency, easier deployment, better data sovereignty, and advanced capabilities, which proved crucial for meeting strict government requirements. As a result, SayOne achieved faster, more secure, and scalable AI solutions, leading to improved productivity and successful deployments across global public sector clients. 
+- [Beyond Multimodal Vectors: Hotel Search With Superlinked and Qdrant](https://qdrant.tech/blog/superlinked-multimodal-search/): Superlinked and Qdrant enable intelligent, multimodal search by converting natural language queries into specialized vector embeddings across different data types (e.g., text, price, rating), allowing for precise, customizable results in complex domains like hotel search. By leveraging distinct vector spaces and automatic query weighting, users can interact naturally while the system dynamically interprets preferences and efficiently retrieves relevant results. +- [Qdrant 1.14 - Reranking Support & Extensive Resource Optimizations](https://qdrant.tech/blog/qdrant-1.14.x/): Qdrant 1.14.0 introduces a Score-Boosting Reranker, enabling flexible result ranking by combining vector-based similarity with business logic (like metadata, recency, or location), along with incremental HNSW indexing for faster updates, optimized batch search via parallel processing, and improved CPU/disk resource and memory usage for large datasets. These updates make vector search more customized, efficient, and scalable for diverse real-world applications. +- [Pathwork Optimizes Life Insurance Underwriting with Precision Vector Search](https://qdrant.tech/blog/case-study-pathwork/): Pathwork revolutionized life insurance underwriting by implementing an AI-powered tool with Qdrant's vector search, leading to major accuracy gains (halving mean squared error) and faster processing (cutting latency from 9 to 2 seconds), which resulted in rapid user growth and significant improvements in underwriting precision and efficiency. Future plans include deeper carrier integration and continued advancements in accuracy and scalability, positioning Pathwork as an industry leader. +- [How Lyzr Supercharged AI Agent Performance with Qdrant](https://qdrant.tech/blog/case-study-lyzr/): Lyzr improved the performance, scalability, and cost-efficiency of its AI agent platform by switching from Weaviate and Pinecone to Qdrant, achieving over 90% faster query times, 2x faster data ingestion, and a 30% reduction in infrastructure costs while maintaining stability under heavy, real-world loads. This upgrade enabled Lyzr to handle increased agent concurrency and growing data demands, resulting in more accurate, low-latency retrieval for clients like NTT Data and NPD. +- [How Mixpeek Uses Qdrant for Efficient Multimodal Feature Stores](https://qdrant.tech/blog/case-study-mixpeek/): Mixpeek, a multimodal data processing platform, switched from MongoDB and Postgres to Qdrant as its feature store to support advanced hybrid retrieval, multi-vector indexing, and late interaction techniques, resulting in 40% faster query times and 80% less retriever code. Qdrant's vector-specialized capabilities streamlined feature extraction, improved scalability, and enabled efficient taxonomy, clustering, and multimodal retrieval for Mixpeek’s workflows. +- [Satellite Vector Broadcasting: Near-Zero Latency Retrieval from Space](https://qdrant.tech/blog/satellite-vector-broadcasting/): Qdrant has launched Satellite Vector Broadcasting, a groundbreaking system that uses a proprietary CubeSat constellation and inter-satellite laser relays to deliver near-zero latency vector search by processing queries in space rather than on Earth. 
This technology enables ultra-fast retrieval—experimental CubeSat swarms achieve 4 ms average latency—and introduces innovative features like Broadcast-to-Index Protocol, dynamic clustering, and plans for planetary-scale search expansion. +- [HubSpot & Qdrant: Scaling an Intelligent AI Assistant](https://qdrant.tech/blog/case-study-hubspot/): HubSpot selected Qdrant as its vector database to power Breeze AI, enabling highly personalized, real-time responses and scalable, efficient retrieval of relevant data to meet growing user demands. This integration has accelerated development, improved customer engagement, and positioned HubSpot’s AI infrastructure for future advancements and continued scalability. +- [Vibe Coding RAG with our MCP server](https://qdrant.tech/blog/webinar-vibe-coding-rag/): The webinar tested popular AI coding assistants—Cursor, GitHub Copilot, Aider, and Claude Code—by using “vibe coding,” a development approach that relies on AI for implementation while developers focus on desired outcomes and context. By integrating these tools with the Model Context Protocol and a Qdrant semantic memory server, the session demonstrated the potential and limitations of AI agents in handling complex tasks like Retrieval Augmented Generation, showing that context-aware coding is increasingly feasible though tool selection depends on specific needs. +- [How Deutsche Telekom Built a Multi-Agent Enterprise Platform Leveraging Qdrant](https://qdrant.tech/blog/case-study-deutsche-telekom/): Deutsche Telekom developed LMOS, an open-source, scalable multi-agent AI PaaS, to efficiently power sales and customer service chatbots across ten European countries, addressing complex challenges in data segregation, context sharing, and agent collaboration. To ensure seamless integration, high performance, and operational simplicity, they chose Qdrant as their vector database, enabling over 2 million AI-driven conversations and setting a new standard for enterprise AI deployment. +- [Introducing Qdrant Cloud’s New Enterprise-Ready Vector Search](https://qdrant.tech/blog/enterprise-vector-search/): Qdrant Cloud introduces enterprise-grade features—including a Cloud API with Terraform support, Cloud RBAC, Single Sign-On (SSO), granular Database API Keys, and advanced monitoring integrations—to help developers securely manage, scale, and monitor AI workloads with fine-grained access control and real-time observability. These enhancements simplify authentication and management, enforce security policies, and provide deep visibility into database performance, making Qdrant Cloud ready for compliant, large-scale AI applications. +- [Metadata automation and optimization - Reece Griffiths | Vector Space Talks](https://qdrant.tech/blog/metadata-deasy-labs/): Metadata is crucial for optimizing vector search and retrieval-augmented generation, enabling improved segmentation, enrichment, filtering, and access control, which dramatically boosts retrieval accuracy. Reece Griffiths, CEO of Deasy Labs, highlights how automating and scaling metadata extraction and classification through LLMs can bridge the gap to high-performance search systems. 
+- [How to Build Intelligent Agentic RAG with CrewAI and Qdrant](https://qdrant.tech/blog/webinar-crewai-qdrant-obsidian/): The live session demonstrated how to integrate CrewAI, Qdrant, and Obsidian to create an agentic RAG (Retrieval-Augmented Generation) system for semi-automating email responses by connecting to Gmail and using Obsidian notes as an up-to-date knowledge base. The system leverages CrewAI’s multi-agent framework and custom Qdrant integration to manage entity and short-term memory, automate knowledge updates, and generate contextually relevant email drafts. +- [Qdrant 1.13 - GPU Indexing, Strict Mode & New Storage Engine](https://qdrant.tech/blog/qdrant-1.13.x/): Qdrant 1.13.0 introduces GPU-accelerated, vendor-agnostic HNSW indexing for dramatically faster vector search on any major GPU, along with Strict Mode for operational control that enforces limits on resource usage and prevents system overload in multi-tenant or distributed environments. Additional features include HNSW graph compression, named vector filtering, and custom storage for efficient handling of payloads and sparse vectors. +- [Voiceflow & Qdrant: Powering No-Code AI Agent Creation with Scalable Vector Search](https://qdrant.tech/blog/case-study-voiceflow/): Voiceflow enables enterprises to build AI agents through a no-code, customizable platform, and after evaluating vector databases for their managed RAG solution, they switched from Pinecone to Qdrant due to its advanced scaling, security, metadata tagging, and responsive support. Leveraging Qdrant's features—such as quantization, robust infrastructure, and efficient metadata management—Voiceflow achieved enhanced scalability, performance, and operational efficiency, and plans to continue expanding filtering and customization capabilities for its users. +- [Building a Facial Recognition System with Qdrant](https://qdrant.tech/blog/facial-recognition/): The Twin Celebrity app matches users to their celebrity look-alike by analyzing selfies with a ResNet-based FaceNet model, storing facial embeddings in the Qdrant vector database, and delivering results via a Streamlit interface. The project demonstrates a scalable embedding search workflow using tools like ZenML for pipelines and MTCNN for face alignment, with practical deployment and optimization tips for developers. +- [Optimizing ColPali for Retrieval at Scale, 13x Faster Results](https://qdrant.tech/blog/colpali-qdrant-optimization/): ColPali, a precise document retrieval tool for visually rich PDFs, faced major computational challenges when scaling due to the generation of over a thousand vectors per page; this was overcome by introducing a two-stage retrieval process using row-wise pooling to reduce vectors followed by reranking, resulting in a 13x speedup with minimal loss in precision (NDCG@20 = 0.952, Recall@20 = 0.917) when using mean pooling. The experiment confirms that mean pooling effectively maintains ColPali’s accuracy while drastically improving efficiency, and further optimizations—like column pooling or quantization—are being explored. +- [Best Practices in RAG Evaluation: A Comprehensive Guide](https://qdrant.tech/blog/rag-evaluation-guide/): This guide covers how to evaluate RAG (Retrieval-Augmented Generation) systems for accuracy and quality by assessing search precision, recall, contextual relevance, and response accuracy, and highlights the importance of using proper data ingestion, embedding models, and optimized retrieval strategies. 
It also introduces three popular evaluation frameworks—Ragas, Quotient AI, and Arize Phoenix—to measure and visualize key system metrics, and discusses common issues that affect RAG performance and their solutions. +- [Empowering QA.tech’s Testing Agents with Real-Time Precision and Scale](https://qdrant.tech/blog/case-study-qatech/): QA.tech, specializing in AI-driven automated web application testing, developed browser-based testing agents that require fast, real-time analysis and decision-making supported by custom embeddings. After hitting scalability and performance limits with pgvector, QA.tech adopted Qdrant for its efficient, batchable, and multi-embedding vector search capabilities, enabling responsive high-velocity action handling and reliable AI agent performance. +- [Advanced Retrieval with ColPali & Qdrant Vector Database](https://qdrant.tech/blog/qdrant-colpali/): ColPali is a multimodal document retrieval approach that leverages Vision Language Models (VLMs) to generate multi-vector embeddings directly from document images, effectively capturing both visual and textual information and outperforming traditional text/OCR-based methods, particularly on complex, visually rich documents. Integrated with Qdrant and optimized using Binary Quantization, ColPali enables significantly faster and more efficient search without compromising accuracy, as demonstrated on challenging datasets. +- [How Sprinklr Leverages Qdrant to Enhance AI-Driven Customer Experience Solutions](https://qdrant.tech/blog/case-study-sprinklr/): Sprinklr, a leader in unified customer experience management, adopted Qdrant as its vector database to power AI-driven applications such as FAQ bots and advanced analytics, citing Qdrant's speed, developer-friendly customization, cost efficiency, and superior write and latency performance over alternatives like Elasticsearch. This transition led to a 30% reduction in retrieval infrastructure costs, improved developer efficiency, and enabled scalable, real-time AI capabilities across Sprinklr’s product suites. +- [Qdrant 1.12 - Distance Matrix, Facet Counting & On-Disk Indexing](https://qdrant.tech/blog/qdrant-1.12.x/): Qdrant 1.12.0 introduces a Distance Matrix API for efficient pairwise vector distance calculations, a GUI for visual data exploration, a Faceting API for dynamic metadata aggregation, and support for storing both text and geo indexes on disk to improve memory efficiency. These features enhance clustering tasks, enable flexible field-based aggregation, offer improved dataset visualization, and reduce in-memory resource requirements. +- [New DeepLearning.AI Course on Retrieval Optimization: From Tokenization to Vector Quantization](https://qdrant.tech/blog/qdrant-deeplearning-ai-course/): DeepLearning.AI, in collaboration with Qdrant, has launched a free, beginner-friendly, one-hour online course led by Kacper Łukawski, focused on retrieval optimization techniques such as tokenization, vector quantization, and enhancing vector search in applications. The course is designed for anyone with basic Python knowledge and aims to equip learners with practical skills to build and optimize Retrieval-Augmented Generation (RAG) applications. 
+- [Introducing Qdrant for Startups](https://qdrant.tech/blog/qdrant-for-startups-launch/): Qdrant for Startups is a new initiative providing early-stage AI startups with discounted Qdrant Cloud services, technical guidance, co-marketing opportunities, partner tool discounts (such as Hugging Face and LlamaIndex), and community support to help them scale AI-driven products efficiently. Eligible pre-seed to Series A startups can apply online, with selection based on innovation potential and alignment with Qdrant’s capabilities. +- [Qdrant and Shakudo: Secure & Performant Vector Search in VPC Environments](https://qdrant.tech/blog/case-study-shakudo/): Qdrant has partnered with Shakudo to offer Qdrant Hybrid Cloud as a fully managed, high-performance vector database within Shakudo’s virtual private cloud (VPC) deployments, enabling enterprises to maintain data sovereignty and privacy while achieving scalable, low-latency vector search for AI applications. This integration delivers seamless compatibility with existing data stacks, Kubernetes-based automation, and robust security, making it ideal for enterprises needing efficient, compliant, and flexible AI infrastructure. +- [Data-Driven RAG Evaluation: Testing Qdrant Apps with Relari AI](https://qdrant.tech/blog/qdrant-relari/): Qdrant and Relari have partnered to simplify the evaluation of Retrieval-Augmented Generation (RAG) systems by combining Qdrant’s vector database capabilities with Relari’s experiment and metrics tools, enabling fast, data-driven, and iterative RAG performance assessments using methods like Top-K Parameter Optimization and Auto Prompt Optimization. This approach streamlines development by allowing developers to easily benchmark and improve their RAG applications with reliable metrics and real-world datasets, as demonstrated with a GitLab legal policies case study. +- [Nyris & Qdrant: How Vectors are the Future of Visual Search](https://qdrant.tech/blog/case-study-nyris/): Nyris, founded in 2015, delivers advanced visual search technology for enterprises—enabling rapid and accurate identification of products and spare parts via images, synthetic data, and CAD-generated visuals, with prominent clients including IKEA and Trumpf. To achieve high-performance, scalable, and accurate vector search, Nyris selected Qdrant as their dedicated vector search engine for its speed, flexibility, cost-effectiveness, data sovereignty, and innovative features, positioning themselves at the forefront of next-generation product search. +- [Kern AI & Qdrant: Precision AI Solutions for Finance and Insurance](https://qdrant.tech/blog/case-study-kern/): Kern AI developed a low-code, data-centric AI platform that helps financial and insurance companies, such as Markel Insurance SE, drastically reduce customer service response times by leveraging advanced vector search and integration of large language models; using Qdrant's open-source vector database, Kern AI achieved <1% hallucination rates and streamlined claims management and support workflows. The company plans to further expand its use of Qdrant to enhance data handling and chatbot accuracy across the industry. 
+- [Qdrant 1.11 - The Vector Stronghold: Optimizing Data Structures for Scale and Efficiency](https://qdrant.tech/blog/qdrant-1.11.x/): Qdrant 1.11.0 introduces major improvements in memory usage and segment optimization, including defragmented multitenant storage, on-disk payload indexing for less frequently used data, and new query features such as GroupBy, random sampling, and hybrid search fusion. The release also offers enhanced web UI tools for search evaluation and graph exploration, providing better scalability and efficiency for large, multi-tenant datasets. +- [Kairoswealth & Qdrant: Transforming Wealth Management with AI-Driven Insights and Scalable Vector Search](https://qdrant.tech/blog/case-study-kairoswealth/): Kairoswealth, a comprehensive wealth management platform, adopted Qdrant as its vector database to overcome challenges with scalability, performance, and memory efficiency in use cases such as product recommendations and regulatory report generation. The switch enabled consistent high performance, reduced infrastructure costs, access to advanced features, and strong support, positioning Kairoswealth to drive further innovation with AI in the financial sector. +- [Qdrant 1.10 - Universal Query, Built-in IDF & ColBERT Support](https://qdrant.tech/blog/qdrant-1.10.x/): Qdrant 1.10.0 introduces a universal Query API that unifies all search types (including Hybrid Search and ColBERT multivector support) into a single endpoint, allowing users to flexibly combine query strategies with simple parameters. The update also adds built-in IDF support for search and indexing, automating TF-IDF/BM25 calculations within the engine and removing the need for external computation or updates when documents change. +- [Community Highlights #1](https://qdrant.tech/blog/community-highlights-1/): The first edition of Community Highlights showcases outstanding projects and guides on vector search technologies, celebrates Pavan Kumar as Creator of the Month for his multiple contributions, and recognizes the top three most active community members. Community members are also invited to join the upcoming Office Hours on Discord for learning and networking. +- [Response to CVE-2024-3829: Arbitrary file upload vulnerability](https://qdrant.tech/blog/cve-2024-3829-response/): A security vulnerability (CVE-2024-3829) affecting Qdrant versions before v1.9.0 allows attackers to upload arbitrary files that could enable remote code execution, but is mitigated in cloud deployments due to read-only filesystem and authentication. Users should confirm their Qdrant version and upgrade to at least v1.9.0 following appropriate installation or update procedures, though no action is required for cloud users. +- [Qdrant Attains SOC 2 Type II Audit Report](https://qdrant.tech/blog/qdrant-soc2-type2-audit/): Qdrant has successfully completed the SOC 2 Type II audit, demonstrating effective security, availability, and confidentiality controls for customer data from January 1 to April 7, 2024, with no exceptions noted. The company will continue annual audits to maintain compliance and underscores its commitment to top-tier data protection. +- [Introducing Qdrant Stars: Join Our Ambassador Program!](https://qdrant.tech/blog/qdrant-stars-announcement/): Qdrant has launched "Qdrant Stars," an ambassador program to honor and support impactful users advancing AI and vector search through projects, educational efforts, and community engagement. 
The inaugural group includes diverse experts—such as Robert Caulk, Joshua Mo, Nicholas Khami, Owen Colegrove, Kameshwara Pavan Kumar Mantha, and Niranjan Akella—who contribute through innovative applications, content creation, and fostering community collaboration. +- [Intel’s New CPU Powers Faster Vector Search](https://qdrant.tech/blog/qdrant-cpu-intel-benchmark/): Intel’s 5th generation Xeon processors, particularly "Emerald Rapids," significantly boost vector search performance and energy efficiency compared to previous models, making them ideal for enterprise-scale AI/ML workloads with vector databases like Qdrant. Qdrant, optimized for these new CPUs, enables efficient, large-scale semantic search for modern AI applications and recommends the latest Intel CPUs for cost-effective, high-performance deployments. +- [QSoC 2024: Announcing Our Interns!](https://qdrant.tech/blog/qsoc24-interns-announcement/): Qdrant has selected Jishan Bhattacharya and Celine Hoang as interns for its inaugural Summer of Code program; Jishan will develop a WASM-based dimension reduction visualization in Rust, while Celine will port advanced ranking models to ONNX in Python. These projects aim to improve Qdrant's data visualization capabilities and model support for applications like recommendation engines and search. +- [Are You Vendor Locked?](https://qdrant.tech/blog/are-you-vendor-locked/): Vendor lock-in—particularly around expensive AI hardware like GPUs—is a growing concern as cloud costs and feature differentiation complicate infrastructure decisions; to remain flexible and manage costs, businesses should embrace cloud-agnostic solutions like Kubernetes and consider hybrid cloud options, such as Qdrant Hybrid Cloud, which enable seamless migration and integration across cloud providers. Ultimately, balancing speed, cost, and flexibility is essential, and leveraging open and standardized tools can help organizations avoid costly dependencies. +- [Visua and Qdrant: Vector Search in Computer Vision](https://qdrant.tech/blog/case-study-visua/): VISUA, a leader in computer vision data analysis, adopted Qdrant's vector database to scale and automate its quality control and anomaly detection processes, achieving 40x faster query speeds and 10x greater scalability in data review and reinforcement learning. This integration not only improved operational efficiency but also enabled VISUA to explore new applications such as content moderation and expanded copyright infringement detection. +- [Qdrant 1.9.0 - Heighten Your Security With Role-Based Access Control Support](https://qdrant.tech/blog/qdrant-1.9.x/): Qdrant 1.9.0 introduces key enterprise features such as granular access control via JSON Web Tokens, much faster shard transfer for improved recovery, and native support for uint8 embeddings for significant memory and performance gains. Additional improvements include better write performance, enhanced sparse vector search, and various optimizations for more stable, secure, and efficient large-scale deployments. +- [Qdrant's Trusted Partners for Hybrid Cloud Deployment](https://qdrant.tech/blog/hybrid-cloud-launch-partners/): Qdrant Hybrid Cloud is a managed vector database that can be deployed in any environment—including cloud, on-premise, or edge—providing developers with seamless integration into modern AI stacks, full data sovereignty, and support from major industry partners. 
Comprehensive tutorials and documentation demonstrate its flexibility in building secure, production-ready AI applications across diverse platforms and use cases. +- [Qdrant Hybrid Cloud: the First Managed Vector Database You Can Run Anywhere](https://qdrant.tech/blog/hybrid-cloud/): Qdrant has launched Qdrant Hybrid Cloud, the first managed, Kubernetes-native vector database that can be deployed across any environment (cloud, on-premise, or edge), offering organizations unparalleled data sovereignty, privacy, and deployment flexibility. Through partnerships with major cloud providers and AI tool leaders, plus features like database isolation, scalable and secure architecture, and effortless setup, Qdrant Hybrid Cloud empowers businesses to easily build and scale AI applications with full control over their data. +- [Advancements and Challenges in RAG Systems - Syed Asad | Vector Space Talks](https://qdrant.tech/blog/rag-advancements-challenges/): Syed Asad, an AI/ML expert at Kiwi Tech, highlights that while many vector databases are scalable, their lack of user-friendliness sets them back, which is why he prefers Qdrant for vector search operations. In his podcast episode, Syed discusses advancements and challenges in AI, particularly in retrieval-augmented generation (RAG) systems, and shares insights on topics like multimodal AI, semantic job matching, privacy concerns, and building engaging, multimedia-rich applications. +- [Building Search/RAG for an OpenAPI spec - Nick Khami | Vector Space Talks](https://qdrant.tech/blog/building-search-rag-open-api/): Nick Khami, founder of Trieve, emphasized how Trieve and Qdrant make building precise search and recommendation systems over Open API specs simple and efficient, highlighting the advantages of Qdrant's group-based system. The episode explores key topics such as leveraging Open API specs, simplifying vector search infrastructure, and enhancing search relevance and analytics for diverse applications. +- [Iveta Lohovska on Gen AI and Vector Search | Qdrant](https://qdrant.tech/blog/gen-ai-and-vector-search/): Iveta Lohovska, Chief Technologist at HPE, highlights the importance of trustworthiness, transparency, and explainability in generative AI and vector search, especially for high-stakes sectors like government and energy, emphasizing rigorous data sourcing, source traceability, and data privacy. She also discusses the challenges of enterprise adoption, the need for on-premises control, and the evolving landscape of open-source and specialized AI model development. +- [Teaching Vector Databases at Scale - Alfredo Deza | Vector Space Talks](https://qdrant.tech/blog/teaching-vector-db-at-scale/): Alfredo Deza, a developer advocate and professor, chooses Qdrant for teaching vector databases because it is easy to set up and straightforward to use, which simplifies both teaching and student learning of complex topics. Drawing on his background as an Olympic athlete, Alfredo emphasizes the value of consistency and simplicity in both education and technology, advocating for engaging materials and up-to-date platforms to enhance the AI learning experience. +- [How to meow on the long tail with Cheshire Cat AI? 
- Piero and Nicola |](https://qdrant.tech/blog/meow-with-cheshire-cat/): Cheshire Cat AI, an open-source framework with strong community support, primarily uses Qdrant as its default vector database in multiple forms (file, container, cloud), leveraging features like quantization to enhance search accuracy and performance while optimizing memory usage. Founders Piero Savastano and Nicola Procopio highlighted Cheshire Cat’s plugin ecosystem, growing international adoption, and active Discord community, as well as its future plans for a cloud version and user-generated plugin marketplace. +- [Response to CVE-2024-2221: Arbitrary file upload vulnerability](https://qdrant.tech/blog/cve-2024-2221-response/): A security vulnerability (CVE-2024-2221) affecting Qdrant versions before 1.9.0 allows attackers to upload arbitrary files and potentially execute remote code, but this does not significantly impact cloud deployments due to read-only filesystems and enabled authentication. Users should check their Qdrant version and upgrade to at least v1.9.0 to ensure protection against this vulnerability. +- [Introducing FastLLM: Qdrant’s Revolutionary LLM](https://qdrant.tech/blog/fastllm-announcement/): Qdrant has launched FastLLM (FLLM), a lightweight Language Model optimized for Retrieval Augmented Generation (RAG) applications, boasting a groundbreaking 1 billion token context window and seamless integration with Qdrant for large-scale AI-driven content generation. Featuring a mixture-of-experts architecture with 1 trillion parameters and industry-leading performance benchmarks, FastLLM is now available in Early Access for developers. +- [VirtualBrain: Best RAG to unleash the real power of AI - Guillaume](https://qdrant.tech/blog/virtualbrain-best-rag/): Guillaume Marquis, CTO and Co-Founder of VirtualBrain, highlights the necessity of using a fast, scalable vector database like Qdrant to efficiently handle large-scale document retrieval and AI tasks for commercial proposal drafting. He emphasizes the importance of open source tools, system scalability, precision in search, and user experience in minimizing hallucinations, positioning VirtualBrain as an innovative solution for deep AI-powered data work. +- [Talk with YouTube without paying a cent - Francesco Saverio Zuppichini |](https://qdrant.tech/blog/youtube-without-paying-cent/): Francesco Saverio Zuppichini, a Senior Full Stack Machine Learning Engineer, highlights Qdrant as his favorite vector database because it is private, user-controlled, and easy to run locally, which supports better data privacy and ownership. In a recent podcast episode, he discusses building RAGs from YouTube content, practical coding tips, and criteria for selecting effective software tools. +- [Qdrant is Now Available on Azure Marketplace!](https://qdrant.tech/blog/azure-marketplace/): Qdrant, a high-performance and scalable open-source vector database, is now officially available on Azure Marketplace, enabling seamless integration with Azure’s ecosystem for rapid, enterprise-scale AI applications such as retrieval-augmented generation (RAG) and vector search. This partnership allows users to quickly deploy Qdrant clusters on Azure for efficient handling of billion-vector datasets, low-latency querying, and flexible scaling, all within Microsoft’s trusted cloud infrastructure. 
+- [Production-scale RAG for Real-Time News Distillation - Robert Caulk |](https://qdrant.tech/blog/real-time-news-distillation-rag/): Robert Caulk, founder of Emergent Methods, discusses how his team leverages open-source tools like Qdrant to model and distill over a million news articles daily, aiming to reduce media bias and improve news awareness through scalable context engineering and advanced semantic search. He highlights the importance of modular, robust infrastructure and startups’ agility in adopting new technologies for more reliable and efficient real-time news distribution. +- [Insight Generation Platform for LifeScience Corporation - Hooman](https://qdrant.tech/blog/insight-generation-platform/): Hooman Sedghamiz, a leader in AI/ML at Bayer AG, discusses the rapid growth in vector databases and highlights a lack of innovation in data pipelines supporting retrieval augmented generation, while emphasizing the importance of real-time, cost-effective, and customized evaluation methods for maintaining high-quality AI chatbot interactions. The episode delves into strategies for ensuring chatbot integrity, efficient evaluation using smaller models, tailored metrics, and advances in large language models, offering insights from Sedghamiz's extensive experience advancing AI in life sciences. +- [The challenges in using LLM-as-a-Judge - Sourabh Agrawal | Vector Space Talks](https://qdrant.tech/blog/llm-as-a-judge/): Sourabh Agrawal emphasizes that using expensive models like GPT-4 for evaluating LLM outputs is not cost-effective, advocating for smaller, cheaper models instead. As CEO of UpTrain AI, he is developing an open-source tool to systematically evaluate, test, and monitor LLM applications by providing automated insights, root-cause analysis, and suggestions for improvement. +- [Vector Search for Content-Based Video Recommendation - Gladys and Samuel](https://qdrant.tech/blog/vector-search-vector-recommendation/): Dailymotion's Machine Learning Engineers chose Qdrant as their vector search engine for its ability to handle technical requirements, rapid neighbor search, user-friendly Python API, and cost-effective implementation, helping them efficiently scale their video recommendation system and improve performance with features like handling low-signal content. Additionally, Qdrant’s responsive support and streaming integration enabled Dailymotion to overcome cold start issues, resulting in a threefold increase in click-through rates for new or less-interacted videos. +- [Integrating Qdrant and LangChain for Advanced Vector Similarity Search](https://qdrant.tech/blog/using-qdrant-and-langchain/): Qdrant and LangChain simplify building scalable, production-ready AI applications by combining vector search (RAG) for long-term memory with unified development interfaces, enhancing retrieval accuracy and reducing hallucinations in LLM-based systems. This integration streamlines workflows for use cases like chatbots, recommendation systems, and data analysis, enabling developers to focus on value rather than infrastructure complexity. +- [IrisAgent and Qdrant: Redefining Customer Support with AI](https://qdrant.tech/blog/iris-agent-qdrant/): IrisAgent, founded by Palak Dalal Bhatia, leverages advanced AI and generative AI to automate and optimize customer support by integrating cross-functional data, detecting customer intent, and improving response efficiency. 
Their adoption of the open-source vector database Qdrant further enhances their AI pipeline’s performance, scalability, and data security, supporting future innovations such as automated knowledge base content generation for improved self-service and customer experience. +- [Dailymotion's Journey to Crafting the Ultimate Content-Driven Video Recommendation Engine with Qdrant Vector Database](https://qdrant.tech/blog/case-study-dailymotion/): Dailymotion improved its video recommendation engine by implementing a content-based system using Qdrant’s vector database, enabling fast, scalable, and diverse recommendations from over 420 million videos. This solution overcame previous limitations—like slow, popularity-biased recommendations—by leveraging advanced vector embeddings and efficient similarity search, resulting in better real-time user experiences across their platforms. +- [Qdrant vs Pinecone: Vector Databases for AI Apps](https://qdrant.tech/blog/comparing-qdrant-vs-pinecone-vector-databases/): Qdrant and Pinecone are leading vector databases designed for storing and searching high-dimensional data critical to modern AI applications, addressing limitations of traditional databases in scalability, performance, and search efficiency. Qdrant, in particular, stands out with its Rust-based architecture, advanced similarity search, flexible deployment, strong security, and integration with popular machine learning frameworks, making it well-suited for enterprise-scale AI workloads. +- [What is Vector Similarity? Understanding its Role in AI Applications.](https://qdrant.tech/blog/what-is-vector-similarity/): Vector similarity enables AI applications to understand and retrieve relevant information from unstructured data (like text, images, or audio) by converting it into high-dimensional vectors and comparing their proximity using metrics like cosine similarity or Euclidean distance. This approach powers advanced search, recommendation systems, image and text analysis, and natural language processing by quantifying how closely related different data points are within vector space. +- [DSPy vs LangChain: A Comprehensive Framework Comparison](https://qdrant.tech/blog/dspy-vs-langchain/): The rise of powerful LLMs and vector stores has led to frameworks like LangChain, which simplify building complex AI applications by providing modular components for model interfacing, retrieval, and composition, supporting rapid prototyping and integration with various data sources and models. However, while LangChain streamlines many aspects, prompt engineering and multi-stage reasoning still require expertise, a challenge recent frameworks like DSPy aim to address by automating prompt optimization. +- [Qdrant Summer of Code 24](https://qdrant.tech/blog/qdrant-summer-of-code-24/): Qdrant, although not accepted into Google Summer of Code 2024, is launching its own Qdrant Summer of Code program following GSoC's timeline and rules, offering stipends for contributors to work on projects in Rust and Python, such as WASM-based visualization, efficient BM25 ranking, ONNX cross encoders, ranking fusion algorithms, and distributed system testing. Interested contributors can apply by email starting March 18th, with full project details available on Qdrant's Notion page. 
+- [Dust and Qdrant: Using AI to Unlock Company Knowledge and Drive Employee Productivity](https://qdrant.tech/blog/dust-and-qdrant/): Dust, a French company co-founded by ex-OpenAI engineer Stanislas Polu, empowers businesses to build context-aware AI assistants using large language models (LLMs) through retrieval augmented generation (RAG), addressing the challenge of diverse, fluid enterprise use cases and limited tuning data by integrating company knowledge from various SaaS tools. By using Qdrant as their open-source vector database, Dust efficiently scaled their platform, reduced costs by 2x with advanced memory management and quantization, and maintained high performance and accuracy in AI-powered team productivity tools. +- [The Bitter Lesson of Retrieval in Generative Language Model Workflows -](https://qdrant.tech/blog/bitter-lesson-generative-language-model/): Dr. Mikko Lehtimäki, co-founder and chief data scientist at Softlandia, discusses the "bitter lesson" that machine learning approaches leveraging large-scale data and computation eventually outperform hand-crafted solutions, and how this principle shapes the development of Yokot AI's retrieval-augmented generation workflows. He emphasizes the importance of efficient data handling, especially re-ranking, for improving large language model outputs, and shares practical insights from Softlandia’s innovative LLM platform. +- [Indexify Unveiled - Diptanu Gon Choudhury | Vector Space Talks](https://qdrant.tech/blog/indexify-content-extraction-engine/): Diptanu Gon Choudhury, founder of Tensorlake, is developing Indexify, an open-source engine that enables scalable, near-real-time structured extraction from unstructured data to support AI agent-driven workflows and improve knowledge bases. In a recent podcast, he discusses how innovative data infrastructure like Indexify is transforming enterprise AI applications by enabling real-time indexing, efficient agent workflows, and optimized experiences for developers and call centers. +- [Unlocking AI Potential: Insights from Stanislas Polu](https://qdrant.tech/blog/qdrant-x-dust-vector-search/): Stanislas Polu, co-founder of Dust and former Stripe and OpenAI engineer, discusses how his company uses Qdrant’s open-source, Rust-based vector database for its strong performance and control, enabling efficient enterprise data management and the deployment of AI-powered productivity assistants across SaaS platforms. He emphasizes prioritizing product-layer innovation—focusing on practical integration of large language models through tailored AI assistants and workflows—over model training to better augment and streamline daily work. +- [Announcing Qdrant's $28M Series A Funding Round](https://qdrant.tech/blog/series-a-funding-round/): Qdrant has raised $28M in Series A funding led by Spark Capital to accelerate its mission of building a highly efficient, scalable, and high-performance vector database, already demonstrated by rapid user growth and over 5 million downloads. With advanced features like custom search algorithms, flexible deployment options, and a strong focus on performance, Qdrant aims to empower enterprises to build cutting-edge AI applications using its open-source platform. +- [Introducing Qdrant Cloud on Microsoft Azure](https://qdrant.tech/blog/qdrant-cloud-on-microsoft-azure/): Qdrant Cloud, the managed vector database, is now available on Microsoft Azure, enabling users to quickly deploy scalable clusters and manage billion-vector datasets with ease. 
This expansion supports rapid application development and enterprise-scale AI solutions on Azure’s infrastructure. +- [Qdrant Updated Benchmarks 2024](https://qdrant.tech/blog/qdrant-benchmarks-2024/): Qdrant has updated its open-source benchmark to better compare vector search engines, featuring notable performance improvements across all engines, a new dataset with 1 million OpenAI embeddings, and separate evaluations for latency and requests-per-second. The benchmarking principles remain unchanged, focusing on real-world production features and unbiased, transparent testing of open-source vector databases. +- [Navigating challenges and innovations in search technologies](https://qdrant.tech/blog/navigating-challenges-innovations/): The podcast discussed retrieval-augmented generation (RAG) as an innovative approach in search technologies that combines information retrieval with language generation to improve context understanding and output accuracy in NLP tasks. It also emphasized the importance and challenges of evaluating RAG and LLM-based applications, including domain-specific model performance, data processing, retrieval quality, and generation evaluation. +- [Optimizing an Open Source Vector Database with Andrey Vasnetsov](https://qdrant.tech/blog/open-source-vector-search-engine-vector-database/): Andrey Vasnetsov, CTO at Qdrant, emphasizes that for vector search systems like Qdrant, scalability and search performance are more important than transactional consistency, advocating a search engine approach over a traditional database mindset. He outlines strategies such as in-place filtering during graph traversal, utilizing subgraphs and overlapping intervals, tuning beam size in HNSW indices, and combining vector with relational search to optimize precision and efficiency at scale. +- [Vector Search Complexities: Insights from Projects in Image Search and](https://qdrant.tech/blog/vector-image-search-rag/): Noé Achache, Lead Data Scientist at Sicara, emphasizes the readiness and value of image embedding models like Dino V2 for advancing text and image search projects, highlighting their strong out-of-the-box performance without the need for fine-tuning. In a podcast episode, he shares practical insights on deploying vector search across diverse applications such as image search, real estate data deduplication, document retrieval, and handling sensitive medical information, while also addressing challenges like data safety and the importance of new model development. +- [How to Superpower Your Semantic Search Using a Vector Database](https://qdrant.tech/blog/semantic-search-vector-database/): Nicolas Mauti and the team at Malt improved their freelancer matching platform by adopting a retriever-ranker architecture with multilingual transformer-based models and transitioning to the Qdrant vector database, which dramatically increased performance—reducing search latency from 10 to 1 second—and enabled precise, scalable semantic search with geospatial filtering. Their choice balanced performance and accuracy better than Elasticsearch, significantly enhancing Malt's ability to connect freelancers and projects. 
+- [Building LLM Powered Applications in Production - Hamza Farooq | Vector](https://qdrant.tech/blog/llm-complex-search-copilot/): Hamza Farooq’s experience at Google and Walmart Labs, combined with his expertise in machine learning, has shaped his practical approach to building LLM-powered applications by focusing on real-world business problems, efficient open-source implementations, and user-centered conversational search and recommendation solutions. At Traversaal.ai, he leverages this background to deliver domain-specific AI products that bridge the gap between innovative technology and seamless user experiences. +- [Building a High-Performance Entity Matching Solution with Qdrant -](https://qdrant.tech/blog/entity-matching-qdrant/): Rishabh Bhardwaj and his team built a high-performance hotel matching solution, initially experimenting with Postgres but ultimately choosing Qdrant for its superior speed and recall. The HNSW algorithm in Qdrant enhanced their solution by enabling fast and accurate entity matching, effectively handling large-scale, multilingual, and inconsistent hotel data. +- [FastEmbed: Fast & Lightweight Embedding Generation - Nirant Kasliwal |](https://qdrant.tech/blog/fast-embed-models/): Nirant Kasliwal, creator of FastEmbed and contributor to OpenAI's Finetuning Cookbook, shares practical tips to improve and speed up embedding creation—highlighting FastEmbed’s efficiency, support for CPU/GPU quantization, and solutions for common production challenges in Natural Language Processing (NLP). In this Vector Space Talks episode, Nirant discusses model selection, optimizing embeddings, and the library's ability to deliver fast, lightweight, and domain-adaptable embedding workflows. +- [When music just doesn't match our vibe, can AI help? - Filip Makraduli |](https://qdrant.tech/blog/human-language-ai-models/): Filip Makraduli, a data scientist with expertise in causal ML and generative AI, has developed an innovative music recommendation system that uses language-based song descriptions and AI models to match music to users’ moods or vibes. By leveraging sentence embeddings and vector similarity, his approach enables more personalized and context-aware song recommendations, demonstrated through real-time Spotify playlist creation. +- [Binary Quantization - Andrey Vasnetsov | Vector Space Talks](https://qdrant.tech/blog/binary-quantization/): Andrey Vasnetsov, CTO of Qdrant, explains that binary quantization dramatically improves vector search efficiency—reducing storage size and boosting speed by up to 30x—though compatibility varies across models, with OpenAI models performing best. The episode highlights the importance of quantization for handling large vector indexes, the simplicity and effectiveness of binary quantization, and practical strategies like oversampling to optimize search precision in real time. +- [Loading Unstructured.io Data into Qdrant from the Terminal](https://qdrant.tech/blog/qdrant-unstructured/): The blog post explains how to use Unstructured.io’s CLI to extract data from Discord channels, process it into structured form, and ingest it into a Qdrant vector database, including necessary prerequisites and example commands for each step. It highlights that this workflow simplifies data ingestion from Discord and over 20 other data sources directly into Qdrant, supporting customization via the CLI. 
+- [Chat with a codebase using Qdrant and N8N](https://qdrant.tech/blog/qdrant-n8n/): n8n allows you to visually build AI-powered workflows that connect apps with APIs and manipulate data with minimal coding; using the Qdrant node and OpenAI integration, you can ingest a GitHub repository into a vector database and create a chat service to interact with the codebase. The process involves setting up necessary accounts, configuring nodes for data ingestion and retrieval, and optionally embedding the resulting chat into applications using the @n8n/chat package. +- ["Vector search and applications" by Andrey Vasnetsov, CTO at Qdrant](https://qdrant.tech/blog/vector-search-and-applications-record/): Andrey Vasnetsov, Co-founder and CTO at Qdrant, discussed vector search and its applications with Learn NLP Academy, covering topics such as the Qdrant engine, Quaterion similarity learning, multimodal similarity, comparison with Elastic search, support for multiple embeddings, fundraising, and the future of vector search. He also addressed finetuning models for out-of-domain scenarios. +- [From Content Quality to Compression: The Evolution of Embedding Models](https://qdrant.tech/blog/cohere-embedding-v3/): Nils Reimers, Cohere’s Head of Machine Learning, discussed advancements in embeddings including content quality estimation, compression-aware training, reinforcement learning from human feedback, and the importance of evaluating embedding quality contextually, while also sharing upcoming features and noting current limitations in distinguishing truthfulness. He highlighted that these innovations enhance model usefulness and efficiency, but content quality models still struggle to discern true from fake statements due to reliance on pretraining data. +- [Pienso & Qdrant: Future Proofing Generative AI for Enterprise-Level Customers](https://qdrant.tech/blog/case-study-pienso/): Pienso and Qdrant are partnering to deliver scalable, efficient, and user-friendly interactive deep learning solutions by combining Pienso’s low-code platform with Qdrant’s high-performance vector storage and retrieval capabilities, enabling organizations to build and improve large language models while maintaining data sovereignty and model autonomy. This collaboration will enhance LLM performance through efficient context management, fast training and inference, and reliable, cost-effective storage, making it ideal for enterprise-scale AI-driven applications. +- [Powering Bloop semantic code search](https://qdrant.tech/blog/case-study-bloop/): bloop is a fast, local code-search engine founded in 2021 that enables semantic search of large codebases, helping developers understand and reuse code more efficiently; it relies on Qdrant, a high-performance, open-source vector search database, for fast, accurate semantic search at scale. The integration of Qdrant enables bloop to deliver reliable, instantaneous semantic search results even on very large codebases. +- [Qdrant supports ARM architecture!](https://qdrant.tech/blog/qdrant-supports-arm-architecture/): ARM-based processors, such as those used in AWS Graviton2 instances, offer better energy efficiency and are about 20% cheaper than x86 counterparts, though they are roughly 10-20% slower in vector search tasks but with more consistent performance. This makes ARM64 a cost-effective and reliable option for hosting vector search applications like Qdrant, despite a slight performance tradeoff. 
+- [Qdrant has joined NVIDIA Inception Program](https://qdrant.tech/blog/qdrant-joined-nvidia-inception-program/): We have joined the NVIDIA Inception program, which supports technology startups with advanced resources and connections. We are especially excited about gaining GPU support, a key feature in Qdrant's roadmap. +- [Bulk Upload Vectors](https://qdrant.tech/documentation/database-tutorials/bulk-upload/): To efficiently bulk upload vectors to a Qdrant collection, use the fastest available client (preferably Rust) and consider parallelizing uploads, while optimizing indexing settings: you can disable or defer HNSW indexing during ingestion (e.g., set `m:0` or `indexing_threshold:0`) for faster uploads and reduced memory usage, then re-enable or adjust indexing (e.g., set `m:16`) after data import for fast search performance. Choose the configuration that best balances upload speed, memory usage, and index availability for your needs. +- [Qdrant Fundamentals](https://qdrant.tech/documentation/faq/qdrant-fundamentals/): Qdrant supports dense vectors up to 65,535 dimensions, multiple vectors per data point, and flexible, real-time search and filtering features, with recommendations to optimize metadata and collection usage for performance. Compatibility requirements include careful version upgrades, primary CPU-based operation (with some GPU support), and a preference for multitenancy in single collections rather than many small collections. +- [Reranking in Semantic Search](https://qdrant.tech/documentation/search-precision/reranking-semantic-search/): Reranking in Retrieval-Augmented Generation (RAG) systems, especially with Qdrant, significantly improves the relevance and accuracy of search results by reordering top candidate documents based on relevance using specialized models such as cross-encoders, multi-vector rerankers like ColBERT, or large language models. This process enhances search precision, reduces information overload, and balances speed with relevance by first retrieving a broad set of results (for recall) and then refining them through reranking to surface the most contextually appropriate documents. +- [Role Management](https://qdrant.tech/documentation/cloud-rbac/role-management/): Qdrant Cloud's Role Management allows fine-grained access control through built-in and custom roles, where each role is defined by a set of permissions dictating user actions. Built-in roles cover common needs and cannot be modified, while custom roles can be created, edited, renamed, deleted, or duplicated to tailor access for specific resources. +- [Semantic Search 101](https://qdrant.tech/documentation/beginner-tutorials/search-beginners/): This beginner-friendly tutorial guides you through building a semantic search engine for science fiction books in just 5 minutes, using Python, Sentence Transformers for text embeddings, and Qdrant as a vector database for storage and querying. By following simple steps—installing required libraries, preparing your dataset, setting up storage, uploading data, and querying—you can create a search engine that delivers semantically relevant book recommendations beyond simple keyword matching. +- [Setup Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-setup/): To set up a Qdrant cluster in a Hybrid Cloud Environment, you need a compliant Kubernetes cluster with CSI block storage, enough resources, cluster-admin permissions, required CLI tools (kubectl and helm), and ensure the agent can connect to Qdrant Cloud. 
Installation involves configuring the environment via the Qdrant Cloud Console, generating and running an installation script, and optionally mirroring necessary container images and Helm charts to your own registry for advanced use cases. +- [Setup Private Cloud](https://qdrant.tech/documentation/private-cloud/private-cloud-setup/): Qdrant Private Cloud requires a standard-compliant Kubernetes cluster with CSI block storage (not NFS/S3), `cluster-admin` permissions, and CLI tools `kubectl` and `helm`; installation involves configuring your registry for required container images and charts, then deploying and managing via Helm commands, with operator permissions scoped by namespace. Uninstallation removes both the software and all managed Qdrant clusters. +- [Understanding Vector Search in Qdrant](https://qdrant.tech/documentation/overview/vector-search/): Traditional search relies on inverted indexes and sparse vectors based on keyword matching, but modern vector search uses dense embeddings from deep language models to capture semantic meaning and handle synonyms and multiple languages automatically. Qdrant is a vector database optimized for efficient and scalable similarity search by using advanced indexing (like HNSW graphs), enabling fast retrieval of semantically similar documents without calculating distances to every item. +- [Vector Quantization](https://qdrant.tech/documentation/): Qdrant is an open-source, AI-native vector database and semantic search engine that enables efficient extraction and search of information from unstructured data, featuring cloud and local deployment options, advanced filtering, hybrid search, multitenancy, sharding, access control, and performance-enhancing features like quantization and multivector support. The platform offers comprehensive guidebooks and quickstart resources to help developers rapidly build and scale search solutions. +- [Automate filtering with LLMs](https://qdrant.tech/documentation/search-precision/automate-filtering-with-llms/): LLMs can automate the generation of structured filters for vector search applications (like Qdrant), allowing conversational or voice interfaces to create precise search queries without traditional UI elements. By using libraries such as Instructor and providing information about available and indexed fields (including their types), you can restrict and validate LLM-generated filters to match your collection schema, improving accuracy and reliability. +- [Build a Neural Search Service](https://qdrant.tech/documentation/beginner-tutorials/neural-search/): This beginner tutorial guides you through building a neural search service that uses Sentence Transformers to encode company descriptions from startups-list.com into vectors, stores them in Qdrant (a vector database), and serves search queries via a FastAPI-based API. Main steps include data preparation and encoding, vector storage using Qdrant (run with Docker), data upload, and API implementation, with all necessary code and prerequisites provided. +- [Configuration](https://qdrant.tech/documentation/private-cloud/configuration/): The Qdrant Private Cloud Helm chart offers extensive customization for deploying and managing the Qdrant operator and database clusters in Kubernetes, including options for replica counts, image sources, security contexts, networking, metrics, and resource allocation. Key features like cluster management, scheduling, network policies, and integration with services such as metrics scraping are configurable via YAML parameters. 
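The "Bulk Upload Vectors" entry above describes deferring HNSW construction during ingestion (`m: 0`, `indexing_threshold: 0`) and re-enabling it afterwards. A minimal sketch of that pattern with the Python client, assuming a local instance on the default port and a hypothetical `bulk_demo` collection:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Create the collection with index building deferred: m=0 skips HNSW graph
# construction and indexing_threshold=0 postpones the optimizer during import.
client.create_collection(
    collection_name="bulk_demo",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    hnsw_config=models.HnswConfigDiff(m=0),
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=0),
)

# ... upload points in parallel batches here ...

# Re-enable indexing once the bulk import has finished.
client.update_collection(
    collection_name="bulk_demo",
    hnsw_config=models.HnswConfigDiff(m=16),
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000),
)
```

Re-enabling indexing only after the import lets the HNSW graph be built once over the full dataset instead of being rebuilt batch by batch.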
+- [Create & Restore Snapshots](https://qdrant.tech/documentation/database-tutorials/create-snapshot/): Qdrant collections, which store vectors and related data, can be efficiently backed up and restored using snapshots that capture all necessary data and structures; this process is especially important for large or distributed clusters and is done via HTTP endpoints on each cluster node. This tutorial provides step-by-step instructions for creating, downloading, and restoring collection snapshots, noting that the Python SDK does not support snapshot operations in local mode and requires either Qdrant via Docker or Qdrant Cloud. +- [Create a Cluster](https://qdrant.tech/documentation/hybrid-cloud/hybrid-cloud-cluster-creation/): To create a Qdrant cluster in a Hybrid Cloud environment, follow the standard cluster creation process, ensuring you select your Hybrid Cloud Environment and configure Kubernetes settings such as node selectors, tolerations, service types, and authentication using Kubernetes secrets. You can expose the cluster internally or externally via service types or ingress, configure API keys and TLS for security, and modify these settings anytime via the cluster detail page. +- [Data Ingestion for Beginners](https://qdrant.tech/documentation/data-ingestion-beginners/): This tutorial demonstrates how to build a data ingestion pipeline that extracts unstructured data (text and images) from an AWS S3 bucket, processes it with LangChain to generate embeddings using NLP models, and stores the embeddings in the Qdrant vector database for efficient semantic search. Prerequisites include an AWS and Qdrant account, a structured sample dataset, a Python environment with specific libraries, and secure storage of access keys, following step-by-step instructions for loading, processing, and embedding data. +- [Database Optimization](https://qdrant.tech/documentation/faq/database-optimization/): To optimize database performance and reduce memory usage, configure quantization or on-disk vector storage based on your needs, and choose machine configurations according to whether you prioritize speed (more RAM) or storage cost (faster disks). For better performance, ensure proper payload indexing, use fast local SSDs for on-disk storage, monitor query parameters, and set memory limits with Docker or Kubernetes if necessary. +- [How to Use Multivector Representations with Qdrant Effectively](https://qdrant.tech/documentation/advanced-tutorials/using-multivector-representations/): Multivector representations in Qdrant allow each document to be stored as multiple token-level vectors, enabling more precise query-document matching and accurate reranking, especially with models like ColBERT; however, to avoid high RAM and slow inserts, it's crucial to disable HNSW indexing for these multivectors and use them only during the reranking stage after initial dense vector retrieval. By combining Qdrant and FastEmbed, you can efficiently set up a scalable, production-ready ColBERT-style retrieval pipeline with fast retrieval and fine-grained reranking in a single workflow. 
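The multivector entry above recommends disabling HNSW for ColBERT-style token vectors and reserving them for the reranking stage. A rough illustration of such a collection config; the `colbert_demo` name, the named vectors, and the 384/128 dimensions are assumptions for the example:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="colbert_demo",
    vectors_config={
        # Dense vector used for the fast first-stage retrieval.
        "dense": models.VectorParams(size=384, distance=models.Distance.COSINE),
        # Token-level multivectors used only for reranking, so HNSW is disabled (m=0).
        "colbert": models.VectorParams(
            size=128,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
            hnsw_config=models.HnswConfigDiff(m=0),
        ),
    },
)
```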
+- [Large Scale Search](https://qdrant.tech/documentation/database-tutorials/large-scale-search/): This tutorial demonstrates how to efficiently upload, index, and search the massive LAION-400M dataset (400 million 512-dimensional vectors) on Qdrant using minimal hardware (8 CPUs, 64GB RAM, 650GB disk) by employing resource-saving configurations like FLOAT16 vectors, binary quantization, and optimized index parameters, achieving reasonable latency, accuracy, and throughput. Key considerations include memory usage breakdown, the necessity of high network bandwidth, and techniques ensuring vectors and indexes fit in RAM while leveraging on-disk storage and query-time optimizations. +- [Quickstart](https://qdrant.tech/documentation/fastembed/fastembed-quickstart/): To generate text embeddings with FastEmbed, install the package, load the default BAAI/bge-small-en-v1.5 model, and provide your documents as a list of strings; then, use the model’s `.embed()` method to obtain 384-dimensional vector representations of each document. You can print or visualize these embeddings, which are NumPy arrays, for further analysis. +- [Reranking in Hybrid Search](https://qdrant.tech/documentation/advanced-tutorials/reranking-hybrid-search/): Hybrid search in Qdrant combines dense, sparse (BM25), and late interaction (ColBERT) embeddings to deliver highly relevant search results, further refined through reranking for maximum relevance. Using the FastEmbed library, you can efficiently index documents with multiple embedding types and set up a multi-vector collection in Qdrant to support hybrid retrieval and advanced reranking workflows. +- [Search Through Your Codebase](https://qdrant.tech/documentation/advanced-tutorials/code-search/): This tutorial explains how to use Qdrant semantic search, along with NLP and code-specific embeddings, to navigate and find relevant code snippets within a codebase by chunking, preprocessing, and embedding code for effective semantic querying. It details codebase parsing, chunk extraction, conversion of code to natural language for general embeddings, and setting up an ingestion pipeline for semantic search using Qdrant. +- [Simple Agentic RAG System](https://qdrant.tech/documentation/agentic-rag-crewai-zoom/): This tutorial demonstrates how to build an Agentic RAG system using Qdrant for vector search and CrewAI for orchestrating modular agents, enabling deep analysis and insight extraction from meeting transcripts via a chat-like Streamlit interface. The workflow involves storing meeting data as vector embeddings in Qdrant, querying and analyzing this data with CrewAI agents and Anthropic Claude, and providing users with natural language insights, all supported by provided setup scripts and a ready-to-use GitHub repository. +- [User Management](https://qdrant.tech/documentation/cloud-rbac/user-management/): Users can be invited and assigned roles via the User Management section, with the option to select specific roles or use the default minimal-permission Base role; invites can be revoked before acceptance. Authorized users can update roles or remove active users through User Management after invitations have been accepted. +- [Agentic RAG With LangGraph](https://qdrant.tech/documentation/agentic-rag-langgraph/): Agentic RAG enhances traditional RAG systems by using AI agents—powered by LangGraph for workflow management and Qdrant for vector search—to dynamically select between multiple data sources or web search, enabling complex, multi-step information retrieval. 
The proposed system leverages OpenAI models, two Qdrant vector databases, and Brave Search, orchestrated via LangGraph, to flexibly answer questions about Hugging Face and Transformers documentation. +- [Build a Recommendation System with Collaborative Filtering](https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/): This guide explains how to build a movie recommendation system using collaborative filtering and Qdrant, leveraging user ratings as sparse vectors for efficient similarity search without the need for traditional model training. The process involves preparing normalized user-movie rating data, uploading it as sparse vectors to Qdrant, and retrieving personalized movie recommendations by finding users with similar preferences. +- [Configure, Scale & Update Clusters](https://qdrant.tech/documentation/hybrid-cloud/configure-scale-upgrade/): Qdrant Hybrid Cloud Clusters allow you to configure advanced settings, scale clusters both vertically and horizontally with features like automatic shard rebalancing and resharding, and update cluster versions at any time. For more details, refer to the specific documentation links provided. +- [FastEmbed & Qdrant](https://qdrant.tech/documentation/fastembed/fastembed-semantic-search/): FastEmbed can be seamlessly integrated with Qdrant for local, in-memory vector search by installing the Qdrant client, setting up a collection with model-specific vector parameters, uploading documents (with semantic embeddings handled implicitly), and querying for semantically similar results. This allows efficient semantic search, as shown by retrieving the most relevant document based on the query. +- [Load a HuggingFace Dataset](https://qdrant.tech/documentation/database-tutorials/huggingface-datasets/): Hugging Face offers the "arxiv-titles-instructorxl-embeddings" dataset, containing 2.25 million precomputed vector embeddings of paper titles with associated metadata, which can be loaded, streamed, and efficiently imported into Qdrant for semantic search applications. The dataset is large (over 16 GB), and Qdrant provides tools and batching strategies to facilitate its integration and use for building AI-powered search systems. +- [Managing a Cluster](https://qdrant.tech/documentation/private-cloud/qdrant-cluster-management/): To manage a Qdrant cluster, configure the QdrantCluster resource with unique IDs, required labels, resource settings, and optionally expose the service and enable security features like API key authentication and TLS, referencing Kubernetes secrets as needed. Scaling, upgrading, and secure communication (including inter-node TLS) are managed by editing cluster specs, while ensuring proper network controls and following upgrade/versioning guidelines. +- [Permission Reference](https://qdrant.tech/documentation/cloud-rbac/permission-reference/): The document lists and describes all permissions available in Qdrant Cloud across areas such as Identity and Access Management, Clusters (including API Keys, backups, and schedules), Hybrid Cloud, Payment & Billing, Account Management, and Profile. Enabling any `write:*` permission automatically enables the corresponding `read:*`, ensuring users maintain required access when creating or updating resources. 
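The FastEmbed entries above revolve around the default `BAAI/bge-small-en-v1.5` model producing 384-dimensional NumPy vectors. A small sketch of what that looks like in code (the document strings are placeholders):

```python
from fastembed import TextEmbedding

documents = [
    "Qdrant is a vector database built for similarity search.",
    "FastEmbed generates embeddings with lightweight ONNX models.",
]

# Loads the default BAAI/bge-small-en-v1.5 model on first use.
model = TextEmbedding("BAAI/bge-small-en-v1.5")

# embed() returns a generator of NumPy arrays, one 384-dimensional vector per document.
embeddings = list(model.embed(documents))
print(len(embeddings), embeddings[0].shape)  # 2 (384,)
```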
+- [Setup Hybrid Search with FastEmbed](https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/): This beginner-level tutorial demonstrates how to build and deploy a hybrid search service using FastEmbed for embedding generation, Qdrant as a vector database, and FastAPI for serving an API, enabling searches through a startup dataset with both dense (BERT-based) and sparse (full-text) embeddings. Key steps include preparing the dataset, running Qdrant in Docker, encoding and uploading data with FastEmbed, and creating a hybrid search API for querying similar startups. +- [What is Qdrant?](https://qdrant.tech/documentation/overview/): Vector databases, such as Qdrant, efficiently store and query high-dimensional vector representations (embeddings) generated by machine learning models, enabling fast similarity search for applications like recommendation systems and semantic search. Qdrant is a production-ready vector database offering flexible storage, efficient indexing, and support for multiple distance metrics, making it well-suited for handling large-scale, unstructured data in AI and machine learning workflows. +- [Agentic RAG Discord Bot with CAMEL-AI](https://qdrant.tech/documentation/agentic-rag-camelai-discord/): This tutorial guides you through building an agentic Retrieval-Augmented Generation (RAG) chatbot for Discord using CAMEL-AI, Qdrant, and OpenAI, which not only retrieves but actively reasons over relevant context for user queries. The workflow involves setting up the environment and APIs, scraping and embedding documentation into Qdrant, configuring intelligent multi-step reasoning with CAMEL-AI, and deploying the chatbot to Discord for dynamic, context-aware conversations. +- [Backups](https://qdrant.tech/documentation/private-cloud/backups/): To create backups in Qdrant, use the `QdrantClusterSnapshot` resource for one-time backups or `QdrantClusterScheduledSnapshot` for recurring backups, and restore using the `QdrantClusterRestore` resource. Ensure the `cluster-id` and `customer-id` labels on all resources match the associated QdrantCluster. +- [Build With Async API](https://qdrant.tech/documentation/database-tutorials/async-api/): Asynchronous programming in Python, supported by frameworks like FastAPI and tools such as Qdrant’s AsyncQdrantClient, enables efficient handling of IO-bound operations—like database interactions—by allowing multiple concurrent user requests without blocking server threads. To leverage the async API, use `async/await` syntax in an async context (e.g., with `asyncio.run`), and simply replace the synchronous client with `AsyncQdrantClient` while adding `await` to method calls, especially in multi-user web services. +- [Cloud Quickstart](https://qdrant.tech/documentation/cloud-quickstart/): To get started with Qdrant Cloud, sign up for an account, create a cluster to receive your API key, and access the Cluster UI dashboard. You can then authenticate and interact with your Qdrant instance using official SDKs or the interactive tutorial sandbox. +- [Configure the Qdrant Operator](https://qdrant.tech/documentation/hybrid-cloud/operator-configuration/): The Qdrant Operator offers a range of advanced configuration options for deployment in hybrid cloud environments, allowing customization of resource usage, scheduling, cluster management, storage, network policies, logging, and security through a structured YAML file. 
Key features include adjustable cluster and backup management, logging levels, pod scheduling constraints, ingress options, and support for monitoring and resource limits. +- [Logging & Monitoring](https://qdrant.tech/documentation/private-cloud/logging-monitoring/): Qdrant Private Cloud logging can be accessed via kubectl or any Kubernetes log management tool, with configurable log levels set in the QdrantCluster spec, and can be integrated into external log systems without special configuration. Monitoring is available through the Qdrant Cloud console or by scraping Prometheus/OpenTelemetry-compatible metrics from specific endpoints, with Grafana dashboards provided for visualization. +- [Measure Search Quality](https://qdrant.tech/documentation/beginner-tutorials/retrieval-quality/): Semantic search quality depends on both embedding quality and the performance of the Approximate Nearest Neighbors (ANN) algorithm. This tutorial demonstrates how to measure and improve retrieval quality in Qdrant by evaluating embeddings with benchmarks and comparing ANN results against exact kNN search using metrics like precision@k. +- [Networking, Logging & Monitoring](https://qdrant.tech/documentation/hybrid-cloud/networking-logging-monitoring/): Qdrant Hybrid Cloud provides configurable network policies for database clusters, allowing fine-grained ingress control, and supports flexible logging—accessible via kubectl or integrated log management systems—and customizable log levels. Monitoring is available through the Qdrant Cloud console and Prometheus/OpenTelemetry-compatible endpoints in each pod, with Grafana dashboards offered for visualization. +- [Scaling PDF Retrieval with Qdrant](https://qdrant.tech/documentation/advanced-tutorials/pdf-retrieval-at-scale/): The tutorial demonstrates a scalable approach to PDF retrieval using Qdrant and Vision Large Language Models (VLLMs) like ColPali and ColQwen2 by applying mean pooling to reduce multivector representations, enabling much faster indexing and retrieval while maintaining high retrieval quality. The process involves first-stage retrieval using mean-pooled vectors and reranking results with original multivectors, thus overcoming the computational challenges of large-scale PDF search. +- [Working with miniCOIL](https://qdrant.tech/documentation/fastembed/fastembed-minicoil/): miniCOIL is a sparse neural retriever that enhances BM25 keyword retrieval by leveraging the contextual meaning of keywords for scoring, improving search result relevance when exact keyword matches are required but with semantic awareness. It is integrated with Qdrant and FastEmbed, and outperforms BM25 by ranking documents with contextually appropriate keyword use, as shown in a sample query where miniCOIL surfaces medically relevant vector documents while BM25 does not. +- [API & SDKs](https://qdrant.tech/documentation/interfaces/): Qdrant provides official client libraries for Python, JavaScript/TypeScript, Rust, Go, .NET, and Java, or you can interact with its REST or gRPC APIs, with REST recommended for newcomers and prototyping, while gRPC is suited for advanced users needing optimal performance. The gRPC interface mirrors REST endpoints, requires explicit port exposure (default 6334), and is more complex but faster than REST. 
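The "Build With Async API" entry above amounts to swapping the synchronous client for `AsyncQdrantClient` and awaiting every call. A minimal sketch, assuming a local instance on the default port:

```python
import asyncio

from qdrant_client import AsyncQdrantClient


async def main() -> None:
    client = AsyncQdrantClient(url="http://localhost:6333")
    # Methods mirror the synchronous client, but each call must be awaited.
    collections = await client.get_collections()
    print(collections)


asyncio.run(main())
```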
+- [API Reference](https://qdrant.tech/documentation/private-cloud/api-reference/): The qdrant.io/v1 API package defines schemas for managing Qdrant clusters, regions, entities, and related resources, including their phases, statuses, configurations, and GPU settings. It standardizes resource types and operational states to support orchestration and automated management of Qdrant infrastructure components. +- [Changelog](https://qdrant.tech/documentation/private-cloud/changelog/): This changelog details progressive updates to Qdrant Kubernetes components, including new features such as metrics exporting, automatic shard balancing, enhanced performance and stability, support for GPU instances, configurable settings, improved scaling, and better peer management. Key additions include OpenTelemetry/Prometheus integration, P2P TLS, and automatic shard replication, as well as numerous bug fixes and optimizations across releases. +- [Deployment Platforms](https://qdrant.tech/documentation/hybrid-cloud/platform-deployment-options/): This page outlines how to deploy Qdrant Hybrid Cloud on various managed Kubernetes platforms—including Akamai (Linode), AWS, Civo, DigitalOcean, Gcore, and GCP—by following provider-specific prerequisites and Qdrant’s setup guide. It highlights recommended instance types, storage configurations, and the need for appropriate Kubernetes storage drivers and backup solutions per platform. +- [GraphRAG with Qdrant and Neo4j](https://qdrant.tech/documentation/examples/graphrag-qdrant-neo4j/): GraphRAG combines knowledge graphs and vector search to address the limitations of traditional Retrieval-Augmented Generation (RAG) by enabling large language models to connect complex relationships and provide more accurate, context-rich responses. This tutorial demonstrates building a GraphRAG system using Neo4j and Qdrant, outlining an architecture that leverages both graph and vector databases for efficient data ingestion, semantic search, and contextual response generation. +- [Installation](https://qdrant.tech/documentation/guides/installation/): Qdrant requires a 64-bit CPU (x86_64/amd64 or arm64), sufficient RAM and SSD/NVMe storage with block-level POSIX compatibility, and open network ports 6333, 6334, and 6335; it does not support 32-bit systems, NFS or object storage. Installation options include fully managed Qdrant Cloud for production, Kubernetes deployments (with Helm chart or Enterprise Operator), Docker for development, and careful configuration of storage, security, and backups is essential for self-hosted production setups. +- [Multilingual & Multimodal RAG with LlamaIndex](https://qdrant.tech/documentation/multimodal-search/): This tutorial demonstrates how to build a multilingual and multimodal (text and image) semantic search application using LlamaIndex’s vdr-2b-multi-v1 embedding model and Qdrant vector database, enabling effective text-to-image, image-to-text, and cross-lingual searches without OCR or complex preprocessing. By uploading shared embeddings of images and captions into Qdrant, users can perform flexible and accurate searches for various applications, such as e-commerce and media management, simply by querying with either text or images in multiple languages. 
+- [Working with SPLADE](https://qdrant.tech/documentation/fastembed/fastembed-splade/): SPLADE is a method for generating efficient, interpretable sparse text vectors useful for information retrieval, with practical guidance provided on setup, embedding documents, and interpreting token weights using the FastEmbed library. The approach enables contextual term expansion and token importance ranking, with design features like unnormalized weights and flexible handling of vocabulary and typos. +- [5 Minute RAG with Qdrant and DeepSeek](https://qdrant.tech/documentation/rag-deepseek/): This tutorial provides a quick guide to building a Retrieval-Augmented Generation (RAG) pipeline by storing vector embeddings in Qdrant and enriching Large Language Model prompts with DeepSeek, enhancing response accuracy through context retrieval. It covers setting up the environment, ingesting documents as embeddings in Qdrant, and using DeepSeek to test and improve LLM responses with retrieved knowledge. +- [Qdrant Web UI](https://qdrant.tech/documentation/web-ui/): The Qdrant Web UI provides an intuitive interface to manage both local and cloud Qdrant deployments, offering features such as REST API interaction, collection management, and an interactive tutorial. Access is via http://localhost:6333/dashboard for local setups or by appending :6333/dashboard to your cloud cluster URL. +- [Working with ColBERT](https://qdrant.tech/documentation/fastembed/fastembed-colbert/): ColBERT is an embedding model that generates multivector representations—one vector per token—which capture nuanced semantics for strong reranking performance, though at increased computational cost; it is best used for reranking smaller candidate sets rather than for initial large-scale retrieval. Using the `fastembed` library, you can generate these multivectors and upload them to Qdrant, which natively supports multivector storage and retrieval, for efficient experimentation on text datasets. +- [Automating Processes with Qdrant and n8n](https://qdrant.tech/documentation/qdrant-n8n/): This tutorial demonstrates how to integrate Qdrant with the n8n low-code automation platform to enable advanced workflows such as recommendations and big data analysis beyond basic RAG, including use cases like movie recommendation systems and large-scale unstructured data analysis leveraging vector search. It explains how to set up both cloud and local Qdrant instances in n8n, and showcases workflows for recommendation and anomaly detection, while highlighting the newly available official Qdrant n8n node for simplified integration. +- [Reranking with FastEmbed](https://qdrant.tech/documentation/fastembed/fastembed-rerankers/): A reranker is a precise but resource-intensive model used to improve search result relevance by reordering a small subset of documents retrieved with a faster method; FastEmbed supports multiple cross-encoder reranker models, including Jina Reranker v2, which can be used after initial retrieval (e.g., with all-MiniLM-L6-v2) for more accurate ranking. The tutorial demonstrates how to set up FastEmbed with Qdrant, index data, and apply reranking models on a sample movie description dataset. 
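For the SPLADE entry above, sparse embeddings can be produced through FastEmbed much like dense ones. An illustrative snippet; the `prithivida/Splade_PP_en_v1` model name is an assumption taken from common FastEmbed examples:

```python
from fastembed import SparseTextEmbedding

model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")

# Sparse embeddings come back as (indices, values) pairs rather than dense arrays.
(embedding,) = list(model.embed(["Vector search with sparse SPLADE representations"]))
print(embedding.indices[:5], embedding.values[:5])
```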
+- [Administration](https://qdrant.tech/documentation/guides/administration/): Qdrant provides administration tools to modify instance behavior at runtime, including a locking API to restrict write operations (not persistent across restarts and acting per node), a recovery mode for resolving out-of-memory issues by allowing only collection deletion while loading minimal metadata, and strict mode to dynamically limit inefficient or resource-heavy operations for better stability and predictability. These features can be enabled or configured via API or environment variables and are useful for managing access, recovery, and performance in distributed and unpredictable environments. +- [Running with GPU](https://qdrant.tech/documentation/guides/running-with-gpu/): Starting with v1.13.0, Qdrant supports GPU acceleration via dedicated Docker images for NVIDIA or AMD GPUs, which require specific drivers and container toolkits; GPU usage is configured through various settings, and activation for indexing needs to be explicitly enabled. Limitations include Linux x86_64-only Docker support and a 16GB per-GPU vector data size cap per indexing iteration. +- [Capacity Planning](https://qdrant.tech/documentation/guides/capacity-planning/): Capacity planning for your cluster involves balancing RAM and disk storage based on vector count, dimensions, payload data, and indexing needs, with estimated memory requirements calculated using rough formulas that add 50% for metadata and indexing overhead. For performance, store frequently accessed and indexed data in RAM, use disk for less-used payloads, and consider scaling disk space via the cluster UI as needed, always validating with real data for accuracy. +- [FastEmbed](https://qdrant.tech/documentation/fastembed/): FastEmbed is a lightweight Python library for fast and accurate embedding generation, supporting a variety of popular and multilingual models with easy integration into Qdrant for multimodal search. It is optimized for performance with minimal dependencies, making it suitable for serverless environments, and offers beginner-to-advanced guides for tasks like semantic search, sparse and multivector embeddings, and reranking. +- [Optimize Performance](https://qdrant.tech/documentation/guides/optimize/): Qdrant performance can be optimized in three main ways: use vector quantization for high-speed search with low memory usage, store vectors and the HNSW index on disk for high precision with low memory, or keep all data in RAM and use quantization with re-scoring for both high precision and high speed. Additionally, latency and throughput can be balanced by adjusting the number and size of segments based on workload priorities. +- [Getting Started](https://qdrant.tech/documentation/cloud-getting-started/): To get started with Qdrant Managed Cloud, create an account (with required payment details), plan your cluster resources, and follow the provided guides to deploy and manage clusters, including production-ready multi-node setups. Automation options are available via the Qdrant Cloud API and Terraform Provider. +- [Multitenancy](https://qdrant.tech/documentation/guides/multiple-partitions/): For most users, efficient multitenancy is achieved by using a single collection with payload-based partitioning (e.g., by `group_id`), though multiple collections may be used for greater user isolation if needed. 
To maximize performance, customize your HNSW index to bypass global indexing and organize data by tenant, but note that global (cross-tenant) queries will be slower. +- [Account Setup](https://qdrant.tech/documentation/cloud-account-setup/): Qdrant Cloud accounts can be registered via email, Google, GitHub, or enterprise SSO, with functionalities for inviting and managing users, switching and creating multiple accounts, and configuring settings such as permissions, light/dark mode, and account details. Enterprise SSO is available for Premium Tier customers, supporting various identity providers. +- [Cloud RBAC](https://qdrant.tech/documentation/cloud-rbac/): Cloud RBAC in Qdrant Cloud allows precise management of user permissions for key areas like billing, identity, clusters (currently all clusters together), hybrid cloud, and account configuration via the console. Access is available through "Access Management > User & Role Management," with more detailed permission controls planned for future releases. +- [Managed Cloud](https://qdrant.tech/documentation/cloud/): Qdrant Managed Cloud is a SaaS offering that provides managed Qdrant database clusters for fast and reliable similarity search without infrastructure maintenance, accessible via a Qdrant Cloud account and API key. It supports high availability, scaling, monitoring, backups, disaster recovery, and can operate natively on major cloud providers or as a Hybrid Cloud with your own infrastructure. +- [Hybrid Cloud](https://qdrant.tech/documentation/hybrid-cloud/): Qdrant Hybrid Cloud allows you to manage vector databases across cloud, on-premises, or edge environments using your own Kubernetes clusters, ensuring data stays within your infrastructure for maximum security, privacy, and cost efficiency. It operates through a Kubernetes Operator and Cloud Agent that manage Qdrant databases locally while only sending telemetry (not user data) to Qdrant Cloud, with no need to expose your cluster or provide external credentials. +- [Vector Search Basics](https://qdrant.tech/documentation/beginner-tutorials/): The Beginner Tutorials section offers step-by-step guides on building semantic, neural, and hybrid search engines using Qdrant and related tools, as well as measuring and improving retrieval quality. These tutorials are designed to help newcomers quickly get started with advanced search technologies. +- [Advanced Retrieval](https://qdrant.tech/documentation/advanced-tutorials/): The Advanced Tutorials section offers guides on building a movie recommendation system with collaborative filtering, developing a text/image multimodal search with Qdrant and FastEmbed, navigating codebases using semantic search, and optimizing large-scale PDF retrieval with ColPali/ColQwen integration. Each tutorial demonstrates advanced applications of Qdrant for various AI-powered search and retrieval tasks. +- [Private Cloud](https://qdrant.tech/documentation/private-cloud/): Qdrant Private Cloud enables easy deployment and management of Qdrant database clusters on any Kubernetes infrastructure, offering features like zero-downtime upgrades, flexible scaling, multi-AZ support, backup and disaster recovery, extended telemetry, and enterprise support. It operates independently from the Qdrant Cloud Management Console, providing full control within your own environment. 
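The Multitenancy entry above recommends a single collection partitioned by a payload field such as `group_id`. A sketch of how tenant scoping looks with the Python client; the `shared_collection` name, the 384-dimensional placeholder query, and the tenant value are invented for the example:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Index the tenant field so filtered queries stay fast.
client.create_payload_index(
    collection_name="shared_collection",
    field_name="group_id",
    field_schema=models.PayloadSchemaType.KEYWORD,
)

# Every query is restricted to a single tenant via a payload filter.
hits = client.query_points(
    collection_name="shared_collection",
    query=[0.1] * 384,  # placeholder query vector
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="group_id", match=models.MatchValue(value="tenant_1"))
        ]
    ),
    limit=5,
)
print(hits.points)
```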
+- [Billing & Payments](https://qdrant.tech/documentation/cloud-pricing-payments/): Qdrant Cloud clusters are billed monthly based on CPU, memory, and disk usage, with payment options via credit card (processed by Stripe) or through AWS, GCP, or Azure Marketplace subscriptions—all with identical pricing and managed through the Qdrant Cloud Console. Regardless of payment method, clusters are deployed on Qdrant-owned infrastructure; those seeking deployment in their own environment should use the Hybrid Cloud solution. +- [Building a Chain-of-Thought Medical Chatbot with Qdrant and DSPy](https://qdrant.tech/documentation/examples/qdrant-dspy-medicalbot/): The article outlines building a reliable medical chatbot using Qdrant for vector search and DSPy for reasoning, leveraging Retrieval-Augmented Generation (RAG) to ground answers in current, specialty-filtered medical literature, and employing guardrails to ensure only medical queries are answered. By utilizing both dense and ColBERT embeddings for document retrieval and reranking, the system provides accurate, context-aware, and up-to-date responses while minimizing hallucinations and irrelevant answers. +- [Data Management](https://qdrant.tech/documentation/data-management/): This content lists key data management integrations, summarizing each tool’s core function: they cover ETL/ELT processing, workflow orchestration, real-time and batch data streaming, AI memory integration, and large-scale analytics. Platforms like Airbyte, Airflow, Spark, and others enable efficient data ingestion, transformation, and processing across diverse sources. +- [Multitenancy with LlamaIndex](https://qdrant.tech/documentation/examples/llama-index-multitenancy/): To implement multitenancy with LlamaIndex and Qdrant, use a single vector collection partitioned by user-specific metadata (payload-based partitioning) and set up the vector store, embedding, and chunking configuration accordingly. Index documents with tenant-specific metadata, optimize search performance with payload indexing, and use metadata constraints during retrieval to ensure users only access their own data. +- [Using the Database](https://qdrant.tech/documentation/database-tutorials/): The database tutorials cover key topics such as bulk uploading vectors, large scale search, backup and restore via snapshots, integration with Hugging Face datasets, efficient Python usage with Qdrant’s async API, migration guidance, and insights on static embeddings. These resources provide practical instructions for managing and optimizing Qdrant collections. +- [Embeddings](https://qdrant.tech/documentation/embeddings/): Qdrant supports a wide range of text and multimodal dense vector embedding models and services without limitations, including popular providers like OpenAI, AWS Bedrock, Cohere, and any open-source embeddings from HuggingFace. Users can easily integrate and utilize numerous embedding models tailored for different languages, modalities, and environments. +- [Premium Tier](https://qdrant.tech/documentation/cloud-premium/): Qdrant Cloud Premium offers enhanced features such as 24/7 priority support, a 99.9% uptime SLA, Single Sign-On, VPC Private Links, and storage encryption with user-provided keys. These benefits are designed for customers needing advanced support, improved security, and higher reliability compared to the standard tier. 
+- [Create a Cluster](https://qdrant.tech/documentation/cloud/create-cluster/): Qdrant Cloud offers Free (single-node, limited resources, good for testing) and Standard (multi-node, dedicated resources, high availability, disaster recovery) clusters, which can be easily created and configured through the Cloud Console by specifying provider, region, and resources. For production-ready clusters, use at least 3 nodes with replication, enable regular backups, and configure sufficient sharding for scalability; clusters can be managed, scaled, or deleted as needed via the console. +- [Frameworks](https://qdrant.tech/documentation/frameworks/): A wide range of frameworks exist to support building, deploying, and orchestrating AI and LLM-powered applications, offering capabilities such as multi-agent workflows, data orchestration, evaluation, memory, security, and domain-specific solutions in various programming languages. These tools enable rapid development, customization, testing, and production readiness for modern AI-driven systems. +- [Observability](https://qdrant.tech/documentation/observability/): The document lists observability integrations, including OpenLIT for OpenTelemetry-native observability and evaluations for LLMs and vector databases, OpenLLMetry as a set of OpenTelemetry extensions for LLM applications, and Datadog as a cloud-based monitoring and analytics platform. These tools enhance monitoring, evaluation, and analytics for large language model systems. +- [Platforms](https://qdrant.tech/documentation/platforms/): The Platform Integrations section lists various platforms specializing in web scraping, workflow automation, API and backend creation, data operations, RAG (Retrieval Augmented Generation) interfaces, and privacy-focused document querying. These tools enable users to automate tasks, integrate applications, manage data, and build business solutions with low-code or customizable options. +- [Private Chatbot for Interactive Learning](https://qdrant.tech/documentation/examples/rag-chatbot-red-hat-openshift-haystack/): This tutorial describes how to build a secure, private chatbot for corporate training using a Retrieval-Augmented Generation (RAG) pipeline with open-source tools on Red Hat OpenShift, integrating Mistral-7B-Instruct for LLM, BAAI embedding model, Qdrant for vector storage, and Haystack for orchestration—all within a closed infrastructure to protect sensitive data. It details setting up the environment, deploying and connecting each component, and constructing indexing and search pipelines to enable fast, private retrieval and question-answering over proprietary learning materials. +- [Implement Cohere RAG connector](https://qdrant.tech/documentation/examples/cohere-rag-connector/): This tutorial explains how to implement a custom HTTP-based connector for Cohere's Retrieval Augmented Generation (RAG) using Qdrant as the vector database, enabling semantic search over custom data (such as personal notes) and leveraging features like inline citations. It provides step-by-step guidance on setting up a Qdrant collection, embedding data with Cohere's API, and exposing the data to Cohere's LLM through a FastAPI service. +- [Send Data to Qdrant](https://qdrant.tech/documentation/send-data/): You can send data to a Qdrant cluster using various methods, such as streaming with Kafka, integrating with Databricks via Spark, or orchestrating data pipelines with Airflow and Astronomer. For migration from other vector databases, refer to the dedicated Migration Guide. 
+- [Build Prototypes](https://qdrant.tech/documentation/examples/): The provided content lists a variety of end-to-end code samples and interactive notebooks demonstrating how to build systems like chatbots, search engines, recommendation engines, and information extraction tools using Qdrant combined with frameworks such as LlamaIndex, Cohere, LangChain, and others. These examples cover use cases including multitenancy, customer support, document search, contract management, media recommendations, and medical chatbots, and are supported by detailed notebooks for hands-on learning and experimentation. +- [Question-Answering System for AI Customer Support](https://qdrant.tech/documentation/examples/rag-customer-support-cohere-airbyte-aws/): This tutorial explains how to build a fully private, AI-powered customer support system using Cohere models deployed on AWS, Qdrant as a knowledge base, and Airbyte for continuous data ingestion from sources like Excel files, enabling efficient and automated responses to customer queries via a Retrieval Augmented Generation (RAG) approach. The system ensures secure handling of proprietary support data and leverages connectors for seamless integration, allowing for scalable, multilingual, and accurate query resolution. +- [Chat With Product PDF Manuals Using Hybrid Search](https://qdrant.tech/documentation/examples/hybrid-search-llamaindex-jinaai/): This tutorial explains how to build an advanced RAG-based chatbot that uses hybrid search to answer queries from product PDF manuals, combining components like Jina Embeddings, Qdrant vector database, Mixtral LLM, LlamaIndex, and LlamaParse for data ingestion, retrieval, and natural language response. It provides step-by-step instructions for deploying necessary infrastructure, preparing and parsing documents, configuring embeddings and language models, storing vectors in Qdrant, and assembling prompts for effective hybrid search and customer support applications. +- [Qdrant Cloud API](https://qdrant.tech/documentation/cloud-api/): The Qdrant Cloud API enables automation of the Qdrant Cloud platform through both a high-performance gRPC API and a flexible REST/JSON API, allowing management of accounts, clusters, and related resources; authentication is handled via management API keys. A Terraform provider is also available, and users are advised to transition from the deprecated OpenAPI endpoint, which will be removed after November 1st, 2025. +- [Infrastructure Tools](https://qdrant.tech/documentation/cloud-tools/): Pulumi and Terraform are infrastructure as code tools that enable users to create, deploy, and manage cloud infrastructure using human-readable configuration files. Both tools streamline the process of defining and automating cloud resources. +- [Region-Specific Contract Management System](https://qdrant.tech/documentation/examples/rag-contract-management-stackit-aleph-alpha/): The tutorial outlines how to build a secure, region-specific contract management system using Retrieval Augmented Generation (RAG), leveraging Aleph Alpha embeddings, Qdrant Hybrid Cloud, and the German STACKIT cloud to ensure data residency and access controls compliant with regulations like GDPR. The process covers secure document ingestion, vector storage with per-user access permissions, and streamlined semantic search capabilities, all integrated with business workflows using LangChain for efficient and compliant contract management. 
+- [Practice Datasets](https://qdrant.tech/documentation/datasets/): Qdrant provides ready-made datasets in snapshot format with pre-computed embeddings (e.g., Arxiv.org titles and abstracts, Wolt food descriptions) for easy import and use in semantic search and machine learning projects, with datasets available for download or from Hugging Face. These datasets save resources by eliminating the need to generate your own embeddings and are suitable for non-commercial and academic purposes. +- [Authentication](https://qdrant.tech/documentation/cloud/authentication/): Qdrant Managed Cloud allows you to create Database API keys with granular access control (available in clusters v1.11.0+), which can be configured, rotated, and used to securely authenticate access to your cluster via REST, gRPC, or official SDKs. It is recommended to use these keys over legacy Admin keys for improved security, and to always include the API key in your request headers when connecting to your cluster. +- [RAG System for Employee Onboarding](https://qdrant.tech/documentation/examples/natural-language-search-oracle-cloud-infrastructure-cohere-langchain/): The document outlines how to build a Retrieval-Augmented Generation (RAG) system for employee onboarding that enables natural language search and chat with company documentation by integrating Cohere language models, Qdrant vector database, and LangChain, all deployed on Oracle Cloud Infrastructure to ensure confidentiality and scalability. It provides step-by-step instructions for setting up the environment, indexing website content, and preparing the system for semantic search, focusing specifically on leveraging Oracle Cloud services and open-source tools. +- [Private RAG Information Extraction Engine](https://qdrant.tech/documentation/examples/rag-chatbot-vultr-dspy-ollama/): This tutorial demonstrates how to build a private, on-premise RAG-based information extraction engine using DSPy, running on Vultr Kubernetes with Ollama-hosted LLMs and Qdrant as a knowledge base, enabling secure processing and structuring of unstructured documents without exposing data to external services. Key components include DSPy for extraction, Qdrant Hybrid Cloud for retrieval, FastEmbed for embeddings, and the integration is suited for regulated industries requiring strict data privacy. +- [Movie Recommendation System](https://qdrant.tech/documentation/examples/recommendation-system-ovhcloud/): This tutorial demonstrates how to build a movie recommendation system using the MovieLens dataset, leveraging collaborative filtering and sparse vectors stored in the Qdrant vector database deployed on OVHcloud's managed Kubernetes for secure, scalable, and efficient similarity searches. Users' movie ratings are normalized and represented as sparse vectors, which are indexed to enable personalized recommendations by comparing user preferences and suggesting films liked by similar users. +- [Blog-Reading Chatbot with GPT-4o](https://qdrant.tech/documentation/examples/rag-chatbot-scaleway/): This tutorial guides users through building a Retrieval-Augmented Generation (RAG) chatbot that leverages GPT-4o, Qdrant Hybrid Cloud, and LangChain to enable semantic search and question-answering over blog content, with deployment instructions focused on maintaining data privacy on Scaleway-managed Kubernetes. Key steps include ingesting and chunking HTML blog data, indexing embeddings in Qdrant, and integrating GPT-4o via LangChain for answer generation. 
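The Authentication entry above notes that Database API keys must accompany every request. With the official Python SDK this is a constructor argument; the cluster URL and key below are placeholders:

```python
from qdrant_client import QdrantClient

client = QdrantClient(
    url="https://xyz-example.eu-central.aws.cloud.qdrant.io:6333",
    api_key="<your-database-api-key>",
)
print(client.get_collections())
```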
+- [Cluster Access](https://qdrant.tech/documentation/cloud/cluster-access/): After creating a Qdrant Cloud cluster and setting up an API key, you can access your cluster via the Cluster UI, REST API (port 6333), or GRPC API (port 6334), with traffic automatically load balanced across all healthy nodes. Node-specific endpoints are also available for monitoring or shard management and can be found in the cluster detail page. +- [Support](https://qdrant.tech/documentation/support/): Qdrant Cloud offers community support via Discord and dedicated support for paying customers through a Jira Service Management portal, where users can submit tickets detailing their issues and select severity levels to prioritize responses. Customers are encouraged to provide detailed information and use the support bundle script for efficient troubleshooting, with response times determined by their support tier and the severity of the issue. +- [Using Cloud Inference to Build Hybrid Search](https://qdrant.tech/documentation/tutorials-and-examples/cloud-inference-hybrid-search/): This tutorial demonstrates how to build a hybrid semantic search engine with Qdrant Cloud, using cloud inference to embed data, integrating dense semantic embeddings with sparse BM25 keywords, and leveraging Reciprocal Rank Fusion (RRF) for effective hybrid search and reranking. Key steps include installing the Qdrant client, initializing it with cloud inference, creating a collection with both dense and sparse vectors, uploading data, and running a vector search to retrieve and rank relevant results. +- [Monitoring Hybrid/Private Cloud with Prometheus and Grafana](https://qdrant.tech/documentation/tutorials-and-examples/hybrid-cloud-prometheus/): This tutorial explains how to set up Prometheus and Grafana in a Kubernetes cluster for monitoring Qdrant databases in hybrid or private cloud environments, including installation with Helm, configuring Prometheus scraping with ServiceMonitors, accessing Grafana, and importing a pre-built Qdrant dashboard for visualizing metrics. It assumes you have a running Kubernetes cluster with Qdrant deployed and provides step-by-step commands and links to example resources. +- [Qdrant on Databricks](https://qdrant.tech/documentation/send-data/databricks/): This tutorial demonstrates how to use Databricks with Qdrant by first vectorizing a dataset into dense and sparse embeddings using FastEmbed, then storing these embeddings in a Qdrant collection via the Qdrant Spark connector. The step-by-step process includes setting up the environment, preparing data, generating embeddings, constructing a new Spark DataFrame, and uploading the results to Qdrant for efficient vector search and analytics. +- [Semantic Querying with Airflow and Astronomer](https://qdrant.tech/documentation/send-data/qdrant-airflow-astronomer/): This tutorial demonstrates how to use Apache Airflow with the Astronomer platform and Qdrant provider to build a Python-based data pipeline that generates embeddings in parallel from a book dataset and performs semantic retrieval for recommendations. It guides users through setting up the project environment, configuring credentials, ingesting data, and writing a DAG to enable vector search and personalized book suggestions. +- [Tutorials & Examples](https://qdrant.tech/documentation/tutorials-and-examples/): The page provides a tutorial on using cloud inference to implement hybrid search. It includes an example demonstrating this approach. 
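The cloud inference hybrid search entry above combines dense and sparse (BM25) vectors and merges them with Reciprocal Rank Fusion. A rough sketch of the fusion step with the Query API, assuming a hypothetical `hybrid_demo` collection with named vectors `dense` and `bm25` and placeholder query representations:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Placeholder query representations; in practice these come from your embedding models.
dense_query = [0.1] * 384
sparse_query = models.SparseVector(indices=[17, 42], values=[0.8, 0.3])

results = client.query_points(
    collection_name="hybrid_demo",
    prefetch=[
        models.Prefetch(query=dense_query, using="dense", limit=20),
        models.Prefetch(query=sparse_query, using="bm25", limit=20),
    ],
    # Reciprocal Rank Fusion merges both candidate lists into a single ranking.
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=10,
)
print(results.points)
```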
+- [How to Setup Seamless Data Streaming with Kafka and Qdrant](https://qdrant.tech/documentation/send-data/data-streaming-kafka-qdrant/): The guide provides step-by-step instructions for setting up a real-time data streaming pipeline using Kafka (via Confluent), MongoDB, Azure Blob Storage, and Qdrant, with a focus on installing and configuring each component—including the Qdrant Kafka Sink Connector—for seamless integration and efficient vector data ingestion suited for semantic search and Retrieval-Augmented Generation (RAG) applications. By following this setup, users can achieve a scalable, fault-tolerant architecture that supports real-time data capture, processing, and high-performance vector search. +- [Scale Clusters](https://qdrant.tech/documentation/cloud/cluster-scaling/): Qdrant Cloud clusters can be scaled vertically (by increasing resources like CPU, memory, and storage for each node, which may require downtime) or horizontally (by adding nodes and distributing data using shards), and support resharding, which allows you to adjust the number of shards in a collection without downtime to optimize resource utilization as your needs change. The cloud platform automates shard balancing during horizontal scaling and supports transparent resharding on multi-node clusters, though performance may be temporarily reduced during the process. +- [Configure Clusters](https://qdrant.tech/documentation/cloud/configure-cluster/): Qdrant Cloud clusters can be customized with default collection settings, advanced performance options, client IP restrictions, configurable restart modes during maintenance, and automated or manual shard rebalancing strategies to optimize performance and security. These configurations are accessible from the Cluster Details page and allow users to tailor cluster behavior according to their specific requirements. +- [Monitor Clusters](https://qdrant.tech/documentation/cloud/cluster-monitoring/): Qdrant Cloud offers comprehensive cluster monitoring through easily accessible metrics, logs, and automated alerts in its console, while also supporting Prometheus-compatible endpoints (`/metrics` and `/sys_metrics`) for advanced monitoring and integration with tools like Grafana. Authentication via API keys is required for accessing node and system metrics, which include both database and operational infrastructure data, allowing for detailed cluster health and performance tracking. +- [Update Clusters](https://qdrant.tech/documentation/cloud/cluster-upgrades/): To update your Qdrant Cloud cluster to a new version, select the desired version on the Cluster Details page and click "Update." Multi-node clusters with a replication factor of 2 or more update with zero downtime, while single-node clusters or those with a replication factor of 1 will experience brief downtime. +- [Backup Clusters](https://qdrant.tech/documentation/cloud/backups/): Qdrant Cloud Clusters support both automatic and manual backups via the Cloud Dashboard or Snapshot API, allowing users to schedule, manage, and restore backups to protect against data loss or failure. Restoring from a backup resets the cluster to its previous state, and backup costs and mechanisms differ by cloud provider, with incremental backups for AWS/GCP and disk-usage-based costs for Azure. 
+- [Inference](https://qdrant.tech/documentation/cloud/inference/): Qdrant Managed Cloud offers built-in inference capabilities for generating vector embeddings from text and images using various machine learning models, with usage and billing managed through the cloud console; this feature is currently available only for paid clusters in US regions, and can be enabled or disabled per cluster. Inference can be accessed via Qdrant SDKs or APIs using special input objects, and the Python SDK supports seamless switching between local and cloud inference modes. +- [Distributed Deployment](https://qdrant.tech/documentation/guides/distributed_deployment/): Qdrant supports distributed deployment from v0.8.0, allowing multiple nodes to share data for improved scalability and resilience, with recommended production clusters having three or more nodes and replicated shards for optimal uptime and recovery. Distributed mode can be enabled via configuration (self-hosted) or automatically in Qdrant Cloud, but new nodes require manual collection replication or shard rebalancing, as data is not automatically redistributed. +- [Quantization](https://qdrant.tech/documentation/guides/quantization/): Quantization in Qdrant is an optional feature that reduces memory usage and accelerates search by compressing high-dimensional vectors using methods like scalar, binary, 1.5/2-bit, asymmetric, and product quantization, with each offering tradeoffs between speed, storage efficiency, and search accuracy. Scalar and binary quantization drastically compress vector data and speed up comparisons, while newer techniques (1.5/2-bit and asymmetric) further balance precision and performance, though all methods entail some loss of accuracy depending on parameters and data characteristics. +- [Monitoring & Telemetry](https://qdrant.tech/documentation/guides/monitoring/): Qdrant provides Prometheus/OpenMetrics-compatible endpoints for monitoring, including `/metrics` for node-level metrics and `/sys_metrics` for additional cloud-specific data, and recommends scraping each node individually in clusters to ensure metric consistency. It also offers a `/telemetry` endpoint for database state and Kubernetes health endpoints (`/healthz`, `/livez`, `/readyz`) for server status checks. +- [Configuration](https://qdrant.tech/documentation/guides/configuration/): Qdrant uses default configuration settings suitable for most users, but these can be customized using configuration files in various locations, environment-specific files, or environment variables (which have the highest priority). Configuration sources are loaded and merged in a specific order—with validation at startup—and settings from later sources override earlier ones; however, configuration changes are not allowed on Qdrant Cloud. +- [Security](https://qdrant.tech/documentation/guides/security/): Qdrant is unsecured by default and requires enabling security measures before production use, including API key authentication (with support for both full and read-only keys) and, for advanced control, granular access management via JWT-based RBAC, both of which should be combined with TLS for secure communication. Internal communication ports are not protected by these methods and must be restricted at the network level. 
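As a quick illustration of the monitoring endpoints listed above (`/metrics`, `/sys_metrics`, `/healthz`, `/livez`, `/readyz`), a small sketch that scrapes one node directly; the node URL and API key are placeholders, and the `api-key` header is only needed when authentication is enabled:

```python
# Minimal sketch: read Prometheus-compatible metrics and a health probe
# from a single Qdrant node. URL and key are placeholders.
import requests

NODE_URL = "https://node-0.xyz-example.cloud.qdrant.io:6333"
HEADERS = {"api-key": "<your-api-key>"}  # omit if authentication is disabled

metrics = requests.get(f"{NODE_URL}/metrics", headers=HEADERS, timeout=10)
metrics.raise_for_status()
print("\n".join(metrics.text.splitlines()[:5]))  # first few metric lines

ready = requests.get(f"{NODE_URL}/readyz", headers=HEADERS, timeout=10)
print("readiness:", ready.status_code)
```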
+- [Usage Statistics](https://qdrant.tech/documentation/guides/usage-statistics/): Qdrant's open-source container image collects anonymized system, performance, and critical error usage statistics by default to improve the engine, but users can disable this telemetry at any time and request deletion of collected data. No personally identifiable information, user data, or collection names are ever gathered, and several anonymization techniques are used; in Qdrant Cloud, telemetry is disabled by default. +- [Troubleshooting](https://qdrant.tech/documentation/guides/common-errors/): To resolve “Too many files open (OS error 24),” increase the open file limit using `ulimit` or Docker’s `--ulimit` flag. For collection WAL errors or gRPC issues with Python's `multiprocessing`, ensure each Qdrant node has its own storage and use the `forkserver` or `spawn` process start methods, or switch to the REST API or async client. +- [Migration to Qdrant](https://qdrant.tech/documentation/database-tutorials/migration/): The Qdrant Migration Tool streamlines data migration between Qdrant instances and from other vector databases (like Pinecone), supporting features such as live batch streaming, resuming interrupted migrations, and collection reconfiguration, and can be run via Docker without requiring direct connectivity. Unlike native snapshotting, it offers greater flexibility and reliability for cross-platform and cross-region migrations. +- [Static Embeddings. Should you pay attention?](https://qdrant.tech/documentation/database-tutorials/static-embeddings/): Static embeddings are experiencing a resurgence due to new techniques that significantly speed up vector creation and reduce model size with minimal quality loss—achieving up to 400-500x faster performance compared to transformer-based models—making them ideal for resource-constrained environments like mobile apps and IoT devices. While retrieval speed in vector databases like Qdrant remains the same, static embeddings offer substantial benefits in encoding speed and cost-efficiency, with further potential gains when combined with quantization methods. +- [OpenLLMetry](https://qdrant.tech/documentation/observability/openllmetry/): OpenLLMetry by Traceloop is a set of OpenTelemetry-based extensions that provide comprehensive observability for LLM applications, specifically supporting instrumentation of the `qdrant_client` Python library and trace export to various platforms. Users can enable tracing by installing and initializing either the Traceloop SDK or the standalone OpenTelemetry instrumentation package for Qdrant. +- [OpenLIT](https://qdrant.tech/documentation/observability/openlit/): OpenLIT is an OpenTelemetry-native observability tool for LLM applications that provides auto-instrumentation to monitor Qdrant, offering insights to improve database operations and application performance. Installation and integration require just two lines of code, with options to forward telemetry data to various backends, and further data visualization and integration are available through its documentation. +- [Airbyte](https://qdrant.tech/documentation/data-management/airbyte/): Airbyte is an open-source data integration platform that enables easy data replication between systems and can be used to ingest data into Qdrant for search engine functionality, supporting various sync modes and embedding configurations. 
To use Airbyte with Qdrant, you need running instances of both platforms, after which you can set up Qdrant as a destination within Airbyte, configure text processing and embeddings, and create source-to-destination connections through the UI. +- [Aleph Alpha](https://qdrant.tech/documentation/embeddings/aleph-alpha/): Aleph Alpha provides multimodal and multilingual embeddings for text and images in a shared latent space, accessible via their official Python client, which supports both synchronous and asynchronous usage. These embeddings can be generated from text or images and stored in Qdrant, with specific methods for each input type (`Prompt.from_image` for images and `Prompt.from_text` for text). +- [Apache Airflow](https://qdrant.tech/documentation/data-management/airflow/): Apache Airflow is an open-source Python platform for managing workflows, and it offers a Qdrant provider that enables easy integration with Qdrant databases through hooks and operators for tasks like data ingestion. To use this, install the `apache-airflow-providers-qdrant` package, configure a Qdrant connection in Airflow, and leverage the provided hook and operator classes for interacting with Qdrant within your workflows. +- [Apache Spark](https://qdrant.tech/documentation/data-management/spark/): Apache Spark is a distributed computing framework for big data processing, and the Qdrant-Spark connector allows Spark to use Qdrant as a storage destination by supporting various vector formats and payloads via configurable options. Installation involves downloading or building the JAR file, configuring it in your Spark or Databricks environment, and specifying the appropriate options when writing data. +- [Apify](https://qdrant.tech/documentation/platforms/apify/): Apify is a web scraping and browser automation platform with over 1,500 pre-built "Actors," including the Website Content Crawler, which can cleanly extract website data for AI and RAG applications and integrate it directly with Qdrant for vector storage. Programmatic usage via the Apify Python SDK allows users to automate data extraction and upload cleaned, chunked content as embeddings to a Qdrant collection for further AI model training or retrieval. +- [Autogen](https://qdrant.tech/documentation/frameworks/autogen/): Microsoft AutoGen is an open-source framework for building customizable, collaborative AI agent workflows—including multi-agent conversations and human participation—with support for integrating Qdrant for enhanced retrieval capabilities. It provides flexible installation and usage options, allowing developers to configure agents, connect to vector databases like Qdrant, and run sophisticated agent interactions for complex tasks. +- [AWS Bedrock](https://qdrant.tech/documentation/embeddings/bedrock/): You can integrate AWS Bedrock embeddings, such as the Titan Embeddings G1 - Text model, with Qdrant by configuring AWS credentials (region, access key ID, and secret key) and using sample Python or JavaScript code to generate and upload 1536-dimensional embeddings to your Qdrant collection. Detailed steps and code samples for both languages are provided, requiring the appropriate AWS and Qdrant client libraries. +- [AWS Lakechain](https://qdrant.tech/documentation/frameworks/lakechain/): AWS Lakechain is a CDK-based framework for building scalable, modular document processing pipelines on AWS, offering over 60 components and supporting integration with Qdrant for storing vector embeddings. 
The Qdrant storage connector can be easily configured via code to upload embeddings and, optionally, document text to a specified Qdrant collection, using an API key managed in AWS Secrets Manager. +- [BuildShip](https://qdrant.tech/documentation/platforms/buildship/): BuildShip is a low-code visual builder for creating APIs, scheduled jobs, and backend workflows with AI assistance, featuring integration with Qdrant for semantic search capabilities. The Qdrant integration provides nodes for adding, retrieving, deleting, and searching points within workflows. +- [CamelAI](https://qdrant.tech/documentation/frameworks/camel/): Camel is a Python framework for building and using LLM-based agents, supporting integration with Qdrant as a vector database for efficient data storage and retrieval via its `QdrantStorage` class and compatible retrievers. Users can configure Camel to ingest, store, and query semantically similar data, automate workflows with Auto Retriever, and find end-to-end usage examples in its documentation and cookbooks. +- [Cheshire Cat](https://qdrant.tech/documentation/frameworks/cheshire-cat/): Cheshire Cat is an open-source framework for building intelligent agents on Large Language Models, leveraging Qdrant as its default vector memory for efficient document retrieval and management. It is easily launched via Docker, configurable through a web-based admin portal, and offers extensive documentation and community support for further development. +- [CocoIndex](https://qdrant.tech/documentation/data-management/cocoindex/): CocoIndex is a high-performance, stateful ETL framework for AI data transformation with real-time incremental processing, using Postgres for metadata tracking and Qdrant as a built-in vector database for embeddings storage. Users can install CocoIndex via pip and set up Postgres with Docker Compose, and must configure collection details before exporting data to Qdrant. +- [cognee](https://qdrant.tech/documentation/data-management/cognee/): Cognee is a Python-based memory management tool for AI apps and agents that integrates Qdrant as a built-in vector database for storing and retrieving embeddings. It supports Python 3.8–3.12, is installable via common package managers, and enables users to add, generate, and query knowledge graphs with simple code and environment setup. +- [Cohere](https://qdrant.tech/documentation/embeddings/cohere/): Qdrant integrates with Cohere's co.embed API, allowing users to generate and store embeddings in Qdrant via Cohere's Python SDK, with special support for the advanced Embed v3 model that requires specifying an input type for tasks like search or classification. For semantic search applications, documents and queries should be embedded separately using appropriate input types, and v3 models support multiple similarity metrics that yield identical rankings. +- [Confluent Kafka](https://qdrant.tech/documentation/data-management/confluent/): Confluent Cloud, developed by the original creators of Apache Kafka, is a fully managed, cloud-native data streaming platform with robust features and integrations, including the Qdrant-Kafka Sink Connector that enables seamless streaming of data from Confluent Cloud to Qdrant. The connector supports various vector message formats and can be easily set up on Confluent Cloud for real-time data ingestion into Qdrant. 
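To make the Cohere entry above concrete, a small sketch (assuming the `cohere` and `qdrant-client` packages with placeholder keys and collection name) that embeds documents and a query with the separate `input_type` values the Embed v3 models require:

```python
# Minimal sketch: Cohere Embed v3 + Qdrant. API keys and the collection
# name are placeholders; v3 models require an explicit input_type.
import cohere
from qdrant_client import QdrantClient, models

co = cohere.Client("<cohere-api-key>")
qdrant = QdrantClient(url="http://localhost:6333")

docs = ["Qdrant is a vector database.", "Cohere provides embedding models."]
doc_vectors = co.embed(
    texts=docs, model="embed-english-v3.0", input_type="search_document"
).embeddings

qdrant.create_collection(
    collection_name="cohere-demo",
    vectors_config=models.VectorParams(
        size=len(doc_vectors[0]), distance=models.Distance.COSINE
    ),
)
qdrant.upsert(
    collection_name="cohere-demo",
    points=[
        models.PointStruct(id=i, vector=vector, payload={"text": text})
        for i, (vector, text) in enumerate(zip(doc_vectors, docs))
    ],
)

# Queries use a different input_type than documents.
query_vector = co.embed(
    texts=["What is Qdrant?"], model="embed-english-v3.0", input_type="search_query"
).embeddings[0]
print(qdrant.query_points("cohere-demo", query=query_vector, limit=1))
```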
+- [CrewAI](https://qdrant.tech/documentation/frameworks/crewai/): CrewAI is a framework that enables collaborative, role-playing AI agents to work on complex tasks using advanced memory systems—including short-term and entity memory—that can be integrated and enhanced with Qdrant for vector-based storage and retrieval. The guide explains how to set up CrewAI with Qdrant to manage agent memories using custom storage classes, allowing for more effective information recall and relationship mapping during agent workflows. +- [Dagster](https://qdrant.tech/documentation/frameworks/dagster/): Dagster is a Python framework for data orchestration that offers integrated lineage, observability, and strong testability. The `dagster-qdrant` library allows seamless integration with Qdrant's vector database to build and manage AI-driven data pipelines within Dagster, including vector searches and data management. +- [Datadog](https://qdrant.tech/documentation/observability/datadog/): Datadog is a cloud-based monitoring platform that provides real-time visibility and analytics for servers, databases, and applications, enabling businesses to detect and mitigate issues proactively. The Qdrant integration allows users to monitor Qdrant deployments in Datadog by collecting key performance and readiness metrics via the Datadog Agent, configurable through YAML settings and supporting options like authentication. +- [DeepEval](https://qdrant.tech/documentation/frameworks/deepeval/): DeepEval is an open-source framework by Confident AI for testing large language model (LLM) outputs using customizable metrics like answer relevancy, faithfulness, hallucination, and G-Eval, with support for both single-turn and conversational evaluation through test cases. It integrates with tools like Qdrant for evaluating RAG pipelines, and results can be monitored on the Confident AI Dashboard for scalable, dataset-driven assessments. +- [DLT](https://qdrant.tech/documentation/data-management/dlt/): DLT is an open-source Python library that simplifies loading messy data from various sources into live, structured datasets, now supporting Qdrant as a destination with features like schema inference, automated maintenance, and easy configuration. To use, install the library with Qdrant support, configure your credentials, define your data and pipeline, and run it to load data into Qdrant, enabling vector search and supporting all write dispositions. +- [Dynamiq](https://qdrant.tech/documentation/frameworks/dynamiq/): Dynamiq is a Gen AI framework that simplifies the development of AI-powered applications by orchestrating retrieval-augmented generation (RAG) and large language model (LLM) agents, with built-in support for Qdrant as a vector database for document storage and retrieval. It provides easy-to-use retriever and writer nodes for managing documents, along with tutorials and documentation for further guidance. +- [Feast](https://qdrant.tech/documentation/frameworks/feast/): Feast is an open-source feature store for managing and serving production ML features at scale, and it now supports Qdrant as an online vector store by installing Feast with the `qdrant` extra. Users can configure Qdrant in their Feast setup and retrieve document vectors for entity keys, with further details available in the Feast documentation. 
+- [FiftyOne](https://qdrant.tech/documentation/frameworks/fifty-one/): FiftyOne is an open-source Python toolkit that improves computer vision workflows by optimizing dataset quality and providing insights into models, now featuring native integration with Qdrant for image and text similarity search using vector embeddings. It can be installed via pip, and further information on Qdrant integration is available in the documentation. +- [Firebase Genkit](https://qdrant.tech/documentation/frameworks/genkit/): Firebase Genkit is a framework for building and deploying AI-powered applications, supporting features like semantic search and content generation, with server-side JavaScript/TypeScript support. The Qdrant-Genkit plugin enables integration of Qdrant for semantic data retrieval, allowing configuration of indexing and retrieval within Genkit apps. +- [Gemini](https://qdrant.tech/documentation/embeddings/gemini/): Qdrant is compatible with Google's Gemini Embedding Models, allowing users to embed, index, and search documents using various `task_type` settings for optimal semantic retrieval, and supports advanced features like Binary Quantization to significantly reduce storage requirements while maintaining high recall. The guide provides step-by-step instructions for integrating Gemini embeddings into Qdrant and highlights best practices for semantic search tasks. +- [Haystack](https://qdrant.tech/documentation/frameworks/haystack/): Haystack is a modular NLP framework for building advanced AI applications, and it now integrates with the externally maintained QdrantDocumentStore for efficient vector-based text storage and retrieval. The QdrantDocumentStore is available as a separate installable package, supports all Qdrant Python client configuration options, and enables custom collection settings, such as enabling scalar quantization. +- [HoneyHive](https://qdrant.tech/documentation/frameworks/honeyhive/): HoneyHive is an AI observability platform that equips developers with tools to debug, monitor, and evaluate Generative AI and Retrieval-Augmented Generation (RAG) pipelines, including integration with vector databases like Qdrant for tracing, performance evaluation, and parameter optimization. The platform offers Python-based tracing for embedding generation, document insertion, retrieval, and response generation in RAG workflows, helping teams iterate efficiently and deliver robust AI products. +- [InfinyOn Fluvio](https://qdrant.tech/documentation/data-management/fluvio/): Fluvio is an open-source, cloud-native data streaming platform written in Rust, enabling high-speed real-time processing across diverse infrastructures. Using the Qdrant Fluvio Connector, users can stream data from Fluvio topics to Qdrant collections with strong delivery guarantees and support for various vector formats, after installing both platforms and configuring the connector. +- [Jina Embeddings](https://qdrant.tech/documentation/embeddings/jina-embeddings/): Qdrant integrates seamlessly with Jina AI’s multilingual embedding models—including the recommended jina-embeddings-v4—which support various tasks (retrieval, code, text-matching) and offer configurable embedding dimensions and features like Matryoshka Representation Learning and Late Chunking; users can obtain a 10% discount on Jina AI APIs using the code QDRANT. Example code demonstrates how to generate and use both text and image embeddings from Jina API with Qdrant for retrieval tasks. 
+- [Keboola](https://qdrant.tech/documentation/platforms/keboola/): Keboola is a data operations platform that enables seamless integration, transformation, and management of data, including vector embeddings, through its Qdrant component for advanced AI and semantic search applications. To use this integration, you need accounts for both Keboola and Qdrant, after which you can orchestrate workflows such as building Retrieval Augmented Generation (RAG) systems directly within Keboola. +- [Kotaemon](https://qdrant.tech/documentation/platforms/kotaemon/): Kotaemon is an open-source, customizable RAG UI for document chat, supporting Qdrant as a vectorstore for document ingestion and retrieval. To use Qdrant, update the `flowsettings.py` configuration with your Qdrant details and restart Kotaemon. +- [Langchain](https://qdrant.tech/documentation/frameworks/langchain/): Langchain is a library that simplifies building LLM-based applications by unifying access to major embedding providers and vector stores like Qdrant, supporting dense, sparse, and hybrid document retrieval modes. Its Qdrant integration allows for easy storage and search of text embeddings, flexible deployment (in-memory, on-disk, or server), and multiple search strategies, minimizing boilerplate and focusing on business value. +- [Langchain4J](https://qdrant.tech/documentation/frameworks/langchain4j/): LangChain for Java (Langchain4J) is a community-created port of Langchain that enables building context-aware AI applications in Java, with support for using Qdrant as a vector store through the `langchain4j-qdrant` module. To use it, add the appropriate dependency to your project and configure the Qdrant connection with your collection name, host URL, and API key. +- [LangGraph](https://qdrant.tech/documentation/frameworks/langgraph/): LangGraph is a library for building stateful, multi-actor applications and agentic workflows, offering flexible flow control, cycles, persistence, and seamless integration with LangChain for advanced retrieval and memory features. It supports Python and JavaScript, allowing easy creation of retrieval nodes using tools like Qdrant and enabling integration into agent workflows. +- [LlamaIndex](https://qdrant.tech/documentation/frameworks/llama-index/): LlamaIndex serves as an interface that connects external data to Large Language Models, enabling users to augment LLMs with private data and simplifying data ingestion and indexing by integrating vector databases like Qdrant. Installation requires separate setup of both LlamaIndex and its Qdrant integration, after which LlamaIndex can interact with a Qdrant server via a `QdrantClient` instance. +- [Make.com](https://qdrant.tech/documentation/platforms/make/): Make.com is a no-code platform for automating tasks and workflows, including integration with Qdrant via specialized modules. To use Qdrant with Make, you need accounts on both services, set up a connection, and can then trigger actions and map data between modules within your workflows. +- [Mastra](https://qdrant.tech/documentation/frameworks/mastra/): Mastra is a TypeScript framework for building AI applications quickly, providing primitives such as workflows, agents, RAG, and integrations, with support for running locally or deploying to the cloud. It integrates with Qdrant as a vector store, offering functions for creating and managing indexes, upserting and querying vectors, and retrieving results with customizable options. 
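Following the Langchain entry above, a minimal sketch of storing and querying texts through the `langchain-qdrant` integration; the embedding model, URL, and collection name are placeholders, and exact class and keyword names can vary between LangChain versions:

```python
# Minimal sketch: LangChain + Qdrant similarity search.
# Assumes the langchain-qdrant and langchain-openai packages; class and
# keyword-argument names may differ slightly between versions.
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

store = QdrantVectorStore.from_texts(
    texts=[
        "Qdrant supports dense, sparse and hybrid retrieval modes.",
        "LangChain unifies access to embedding providers and vector stores.",
    ],
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url="http://localhost:6333",
    collection_name="langchain-demo",
)

results = store.similarity_search("Which retrieval modes are supported?", k=1)
print(results[0].page_content)
```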
+- [Mem0](https://qdrant.tech/documentation/frameworks/mem0/): Mem0 is a self-improving memory layer for LLM applications that personalizes AI by remembering user preferences, supporting multiple vector stores like Qdrant for efficient data management. It can be easily installed via pip and offers flexible configuration and advanced query capabilities for AI systems. +- [Microsoft NLWeb](https://qdrant.tech/documentation/frameworks/nlweb/): Microsoft's NLWeb is a framework for enabling natural language interfaces on websites using Schema.org, RSS, and the MCP protocol, supporting Qdrant as a vector store backend for embedding storage and retrieval. It includes default Qdrant integration, can be easily installed and configured, and allows users to load content via RSS feeds and query it using a web UI or REST API. +- [Mistral](https://qdrant.tech/documentation/embeddings/mistral/): Qdrant now supports integration with the Mistral Embedding models via the official Python SDK, allowing users to easily embed documents, store, and search them using Qdrant’s vector database. Additionally, Mistral Embeddings are compatible with binary quantization, significantly reducing storage size while maintaining high retrieval quality. +- [MixedBread](https://qdrant.tech/documentation/embeddings/mixedbread/): MixedBread provides versatile, state-of-the-art embedding models suitable for various search and Retrieval Augmented Generation (RAG) tasks, which can be easily integrated with Qdrant to enhance search solutions. Installation is straightforward via pip, and embeddings can be generated and stored in Qdrant collections using a simple Python workflow. +- [Mixpeek](https://qdrant.tech/documentation/embeddings/mixpeek/): Mixpeek enables chunking and embedding of videos for semantic search, with Qdrant providing storage and efficient retrieval of these embeddings; users process videos, create embeddings for each chunk, insert them into a Qdrant collection, and can then perform similarity searches using text or video queries. Required setup includes installing the Mixpeek and Qdrant clients, setting API keys, and ensuring compatibility with Python 3.7+. +- [N8N](https://qdrant.tech/documentation/platforms/n8n/): N8N is an automation platform that supports deep data integration, and with the official Qdrant node, users can add semantic search capabilities to their workflows by connecting to a Qdrant instance. The Qdrant node allows a wide range of operations on collections, points, vectors, searches, and payloads, facilitating advanced data handling and automation within N8N. +- [Neo4j GraphRAG](https://qdrant.tech/documentation/frameworks/neo4j-graphrag/): Neo4j GraphRAG is a Python package for building graph retrieval augmented generation (GraphRAG) applications with Neo4j, offering robust features, high performance, and native integration with Qdrant for vector search. It supports both direct vector queries and integration with Langchain embedding providers for automatic text vectorization. +- [Nomic](https://qdrant.tech/documentation/embeddings/nomic/): The `nomic-embed-text-v1` is an open-source text encoder model with an 8192-context length, accessible via the Nomic API/SDK, FastEmbed, or direct HTTP requests, and can be used for creating and searching vector embeddings in databases like Qdrant. The model requires setting the `task_type` parameter (`search_document` for documents, `search_query` for queries) and proper API token configuration to generate embeddings for upserts and searches. 
+- [Nvidia](https://qdrant.tech/documentation/embeddings/nvidia/): Qdrant integrates with Nvidia embeddings, allowing users to generate and search high-dimensional vector representations of text using the NV-Embed-QA model and an Nvidia API key. The process involves embedding documents, storing them as points in Qdrant, and performing similarity searches using the generated vectors. +- [Ollama](https://qdrant.tech/documentation/embeddings/ollama/): Ollama offers specialized embedding models suitable for niche RAG applications and can be integrated with Qdrant by generating embeddings from text and storing them as vectors in a Qdrant collection. The integration requires installing both `ollama` and `qdrant-client` packages, and a sample code demonstrates connecting to local servers, creating collections, and uploading embedded data. +- [OpenAI](https://qdrant.tech/documentation/embeddings/openai/): Qdrant integrates seamlessly with OpenAI's embedding models, allowing users to generate, store, and search high-dimensional document embeddings using an official Python client and simple code setup. Additionally, OpenAI embeddings can be effectively compressed with Qdrant's Binary Quantization, significantly reducing embedding size while maintaining high recall in search tasks. +- [Pipedream](https://qdrant.tech/documentation/platforms/pipedream/): Pipedream is a platform for building automated workflows by connecting various apps, data sources, and APIs, offering code-level customization; it can integrate with Qdrant via the Qdrant app to add vector search capabilities to workflows. To use Qdrant in Pipedream, you need a Qdrant instance and a Pipedream project, after which you can connect, authenticate, and use pre-built actions within your workflows. +- [Power Apps](https://qdrant.tech/documentation/platforms/powerapps/): Microsoft Power Apps is a rapid development platform for building custom business apps that can connect to various data sources, including using the Qdrant Connector for adding vector search capabilities. To use the Qdrant Connector, you need a Qdrant instance and a Power Apps account; once set up, you can integrate Qdrant actions into your Power Apps flows. +- [Prem AI](https://qdrant.tech/documentation/embeddings/premai/): PremAI is a unified platform for developing, fine-tuning, deploying, and monitoring generative AI models, compatible with Qdrant for vector storage and search. The documentation provides code examples in Python and TypeScript for installing SDKs, setting up clients, generating embeddings, storing them in Qdrant, and performing semantic searches. +- [PrivateGPT](https://qdrant.tech/documentation/platforms/privategpt/): PrivateGPT is a production-ready AI project enabling users to query their documents with Large Language Models (LLMs) offline, utilizing Qdrant as the default vectorstore for document ingestion and retrieval. Qdrant settings can be customized in the `settings.yaml` file, allowing configuration of connection details such as URL, API key, ports, and other options. +- [Pulumi](https://qdrant.tech/documentation/cloud-tools/pulumi/): Pulumi is an open source infrastructure as code tool that supports using Qdrant through a generated SDK, allowing users to configure, deploy, and manage Qdrant cloud infrastructure via Pulumi in various programming languages. Supported resources include managing clusters and API keys, as well as querying package, key, and cluster information, with setup requiring Pulumi installation and a Qdrant cloud API key. 
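For the OpenAI entry above, a minimal sketch (placeholder keys and URL, illustrative model name) that creates an embedding with the official Python client and upserts it into a Qdrant collection:

```python
# Minimal sketch: OpenAI embeddings stored in Qdrant. Keys, URL, and model
# name are placeholders/illustrative.
from openai import OpenAI
from qdrant_client import QdrantClient, models

openai_client = OpenAI(api_key="<openai-api-key>")
qdrant = QdrantClient(url="http://localhost:6333")

text = "Qdrant integrates with OpenAI embedding models."
embedding = openai_client.embeddings.create(
    model="text-embedding-3-small", input=text
).data[0].embedding

qdrant.create_collection(
    collection_name="openai-demo",
    vectors_config=models.VectorParams(
        size=len(embedding), distance=models.Distance.COSINE
    ),
)
qdrant.upsert(
    collection_name="openai-demo",
    points=[models.PointStruct(id=1, vector=embedding, payload={"text": text})],
)
print(qdrant.query_points("openai-demo", query=embedding, limit=1))
```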
+- [Redpanda Connect](https://qdrant.tech/documentation/data-management/redpanda/): Redpanda Connect is a declarative, data-agnostic streaming service that enables efficient, resilient, and stateless data processing with at-least-once delivery, configured via a YAML file organizing various input, processor, and output components. The Qdrant Output component allows streaming of vector data into Qdrant collections within Redpanda pipelines, and supports flexible batching, authentication, and payload mapping options. +- [Rig-rs](https://qdrant.tech/documentation/frameworks/rig-rs/): Rig-rs is a Rust library designed to simplify the creation of scalable, modular LLM-powered applications, with full support for language model completion, embeddings, and integration with Qdrant for semantic document storage and retrieval. It enables developers to ingest documents and perform vector-based search with minimal code using OpenAI embeddings and the Qdrant vector store. +- [Salesforce Mulesoft](https://qdrant.tech/documentation/platforms/mulesoft/): Salesforce MuleSoft Anypoint is an integration platform for connecting applications, data, and devices, while the open-source MAC Project adds AI capabilities (like LLMs and vector databases such as Qdrant) into the MuleSoft ecosystem via specialized connectors. The Mulesoft Vectors connector allows users to configure and perform operations such as adding, listing, querying, and removing documents in Qdrant collections within their MuleSoft projects. +- [Semantic-Router](https://qdrant.tech/documentation/frameworks/semantic-router/): Semantic-Router is a library for building decision-making layers in LLMs and agents using vector embeddings to semantically route tool-use decisions, with support for Qdrant as an index for route data storage and retrieval. Installation involves `pip install semantic-router[qdrant]`, and with minimal setup, routes can be defined and efficiently managed for different conversational contexts. +- [SmolAgents](https://qdrant.tech/documentation/frameworks/smolagents/): SmolAgents is a HuggingFace Python library for creating AI agents that use code-based tool orchestration via LLMs, shown to be more efficient and perform better on benchmarks than traditional dictionary-based approaches. It can be integrated with Qdrant for semantic search, as demonstrated with a movie recommendation agent that queries a vector database to retrieve relevant movie information. +- [Snowflake Models](https://qdrant.tech/documentation/embeddings/snowflake/): Qdrant supports integration with Snowflake’s text embedding models (available on HuggingFace), enabling users to generate and store vector embeddings of text documents for efficient search and retrieval. The provided examples show how to set up, embed, store, and search documents using the `snowflake-arctic-embed-s` model in both Python and TypeScript. +- [Spring AI](https://qdrant.tech/documentation/frameworks/spring-ai/): Spring AI is a Java framework offering Spring-style APIs for AI application development, supporting integration with Qdrant as a vector database through straightforward configuration in Spring Boot or Java code. The Qdrant vector store can be set up via application properties or a config bean, enabling use with any supported Spring AI embedding provider, and collections are auto-created if not pre-existing. 
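To illustrate the Snowflake Models entry above, a short sketch that loads the `snowflake-arctic-embed-s` checkpoint from Hugging Face with `sentence-transformers` (one possible runtime, chosen here as an assumption) and stores the vectors in Qdrant:

```python
# Minimal sketch: embed documents with a Snowflake arctic-embed model and
# upsert them into Qdrant. sentence-transformers is used as the embedding
# runtime; the local Qdrant URL and collection name are placeholders.
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

model = SentenceTransformer("Snowflake/snowflake-arctic-embed-s")
docs = ["Qdrant stores and searches high-dimensional vectors."]
vectors = model.encode(docs)

client = QdrantClient(url="http://localhost:6333")
client.create_collection(
    collection_name="arctic-demo",
    vectors_config=models.VectorParams(
        size=vectors.shape[1], distance=models.Distance.COSINE
    ),
)
client.upsert(
    collection_name="arctic-demo",
    points=[
        models.PointStruct(id=i, vector=vec.tolist(), payload={"text": doc})
        for i, (vec, doc) in enumerate(zip(vectors, docs))
    ],
)
print(client.count("arctic-demo"))
```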
+- [Stanford DSPy](https://qdrant.tech/documentation/frameworks/dspy/): Stanford DSPy is a Python framework that simplifies building advanced language model applications by integrating prompting, fine-tuning, reasoning, and retrieval (including with Qdrant as a retriever). It offers declarative modules, an automatic compiler, and a straightforward API for configuring and using retrieval-augmented generation (RAG) and other LM-powered workflows. +- [Sycamore](https://qdrant.tech/documentation/frameworks/sycamore/): Sycamore is an LLM-powered system for preparing, processing, and analyzing complex, unstructured documents, and it provides connectors to read from and write to Qdrant collections for managing document data. Writing and reading operations use the `docset.write.qdrant()` and `docset.read.qdrant()` functions, which accept various configuration parameters for customized interaction with Qdrant. +- [Terraform](https://qdrant.tech/documentation/cloud-tools/terraform/): HashiCorp Terraform is an infrastructure as code tool for managing both cloud and on-prem resources, and with the Qdrant Terraform Provider, users can automate the provisioning and management of Qdrant cloud clusters and related resources. To use the provider, you need a Terraform installation and a Qdrant API key, and it supports managing clusters, API keys, and retrieving related information. +- [Testcontainers](https://qdrant.tech/documentation/frameworks/testcontainers/): Testcontainers is a library that enables integration testing with real services in Docker containers, offering easy-to-use APIs in multiple languages. The Qdrant module allows developers to quickly spin up a Qdrant instance for end-to-end testing, with options to customize container configurations. +- [ToolJet](https://qdrant.tech/documentation/platforms/tooljet/): ToolJet is a low-code platform for building business applications that can connect to various data sources, including Qdrant, via a plugin in its marketplace. By configuring the Qdrant plugin, users can perform operations such as listing collections, viewing collection info, upserting, retrieving, deleting, and querying points within their Qdrant instance directly from ToolJet. +- [Twelve Labs](https://qdrant.tech/documentation/embeddings/twelvelabs/): Twelve Labs provides an Embed API that generates unified vector embeddings for videos, text, images, and audio, enabling cross-modal semantic search and advanced applications like sentiment analysis and recommendation systems. The guide demonstrates how to use the API with Python and Node SDKs to embed various content types, store and query them in Qdrant, and perform searches across different modalities. +- [txtai](https://qdrant.tech/documentation/frameworks/txtai/): txtai enables building semantic search applications using neural embeddings, and Qdrant can be used as an embedding backend by installing the additional qdrant-txtai package. More information and examples are available in the qdrant-txtai repository. +- [Unstructured](https://qdrant.tech/documentation/data-management/unstructured/): Unstructured is a library for preprocessing and structuring unstructured text documents for machine learning, supporting integration with Qdrant as an ingestion destination. It can be installed with Qdrant support and used via CLI or programmatically, allowing users to configure document processing, embedding, and upload workflows. 
+- [Upstage](https://qdrant.tech/documentation/embeddings/upstage/): Qdrant integrates with Upstage's Solar Embeddings API, allowing users to generate and store high-dimensional text embeddings (size 4096) for both documents and queries using dual models, and perform vector similarity search within a unified vector space. Authentication is performed via API key, and examples are provided for embedding, storing, and searching text data in both Python and TypeScript. +- [Vanna.AI](https://qdrant.tech/documentation/frameworks/vanna-ai/): Vanna.AI is a Python package that leverages retrieval-augmented generation (RAG) and LLMs to generate accurate SQL queries for your database, supporting integration with Qdrant as a vector store and various SQL databases. Users train the model with schema, documentation, and example queries, then ask natural language questions to receive relevant SQL statements. +- [VectaX - Mirror Security](https://qdrant.tech/documentation/frameworks/mirror-security/): VectaX by Mirror Security is an AI-driven access control and encryption system that secures vector embeddings through similarity-preserving encryption and fine-grained RBAC, enabling protected storage, search, and operations in vector databases like Qdrant. It allows integration via the Mirror SDK to encrypt vectors, define role-based policies, generate access keys, and ensure that only authorized users can decrypt and access specific vector data. +- [Vectorize.io](https://qdrant.tech/documentation/platforms/vectorize/): Vectorize.io is a SaaS platform that automates data extraction from multiple sources and enables rapid deployment and evaluation of real-time RAG pipelines for unstructured data, integrating seamlessly with Qdrant for vector storage and immediate processing of source updates. Setting up requires accounts with Vectorize and Qdrant, after which users can configure pipelines by connecting their vector database, selecting an embeddings provider, and choosing a data source. +- [VoltAgent](https://qdrant.tech/documentation/frameworks/voltagent/): VoltAgent is an open-source TypeScript framework for building AI agents with modular tool integration, LLM coordination, adaptable multi-agent architectures, and features an observability dashboard for monitoring and debugging. It supports seamless integration with Qdrant for vector search, enables embedding generation via OpenAI, and provides example implementations for retrieval-augmented generation workflows. +- [Voyage AI](https://qdrant.tech/documentation/embeddings/voyage/): Qdrant integrates with Voyage AI embeddings, allowing users to generate sentence embeddings (e.g., with the "voyage-large-2" model), store them as vectors in Qdrant collections, and perform vector searches for relevant documents. Setup involves obtaining a Voyage AI API key, embedding texts, converting embeddings to Qdrant points, inserting them into collections, and querying for similar documents. +- [How vector search should be benchmarked?](https://qdrant.tech/benchmarks/benchmarks-intro/): Qdrant prioritizes efficient resource usage to deliver fast, accurate, and cost-effective vector database performance, backing its design choices with comparative, reproducible, and open-sourced benchmarks against other vector search engines. Benchmark scenarios include single node upload & search, filtered search, with plans for memory consumption and cluster mode tests. 
+- [Single node benchmarks](https://qdrant.tech/benchmarks/single-node-speed-benchmark/): Qdrant outperformed other vector search engines by achieving the highest requests-per-second (RPS) and lowest latencies across almost all scenarios, while Elasticsearch improved speed but remained slow in indexing, Milvus excelled at indexing speed but lagged in RPS and latency for larger or high-dimensional datasets, Redis performed well at lower precision with latency increasing under more parallel requests, and Weaviate showed the least improvement since the last benchmark. Benchmarks were run fairly with each engine limited to 25GB RAM, using multiple datasets to evaluate both parallel (RPS) and single-request (latency) performance under varying precision thresholds. +- [Single node benchmarks (2022)](https://qdrant.tech/benchmarks/single-node-speed-benchmark-2022/): This is an archived version of Single node benchmarks. For the latest information, please refer to the new version at /benchmarks/single-node-speed-benchmark/. +- [Filtered search benchmark](https://qdrant.tech/benchmarks/filtered-search-intro/): Filtered search introduces significant complexity to search engines because most ANN algorithms struggle to efficiently incorporate filters, with existing solutions (pre-filtering and post-filtering) having scalability and accuracy issues. The Qdrant team has created benchmark datasets to test engines in this scenario and proposes an alternative approach that avoids the limitations of pre- and post-filtering while maintaining accuracy. +- [Benchmarks F.A.Q.](https://qdrant.tech/benchmarks/benchmark-faq/): The Qdrant team acknowledges potential bias in their open-source vector database benchmarks but strives for fairness by using consistent hardware, focusing on search precision, speed, and resource use at standardized precision thresholds, and testing primarily with official Python clients for reproducibility. They only test open-source databases (not closed SaaS platforms) and encourage community contributions and improvements via their public GitHub repository. +- [Bug Bounty Program](https://qdrant.tech/security/bug-bounty-program/): Qdrant’s Bug Bounty Program invites responsible disclosure of security vulnerabilities in its cloud application and website, rewarding researchers based on the impact and quality of their reports, while strictly prohibiting social engineering, unauthorized account access, and issues deemed out-of-scope. Submissions must be detailed and reproducible, follow ethical guidelines, and abide by disclosure confidentiality; all vulnerability reports are assessed using the CVSS v4 framework, and safe harbor protections apply when the program’s rules are followed. +- [Credits](https://qdrant.tech/legal/credits/): Icons are designed by srip and available on flaticon.com. The email marketing vector was created by storyset and can be found on freepik.com. +- [Impressum](https://qdrant.tech/legal/impressum/): Qdrant Solutions GmbH, represented by André Zayarni and based in Berlin, provides company and legal information, outlines data privacy practices, and disclaims liability for external links and content accuracy on their website. The site uses Google Analytics and warns users about potential data security risks, stating personal data is only collected voluntarily and not shared without consent.
+- [Privacy Policy](https://qdrant.tech/legal/privacy-policy/): Qdrant Solutions GmbH collects and processes personal data on its website, cloud panel, and social media for purposes including service provision, customer support, marketing, and legal compliance, based on relevant GDPR legal frameworks, while implementing safeguards for international data transfers and allowing data subjects rights such as access, correction, objection, and deletion. Data is retained only as necessary, and users can contact Qdrant for privacy concerns or to withdraw consent at any time. +- [Terms and Conditions](https://qdrant.tech/legal/terms_and_conditions/): By using the Qdrant website or service, users agree to be bound by these Terms and Conditions, which define the rights and obligations of all parties, limit the Company's liability, and provide services "as is" without warranties; users must be over 18, comply with the Privacy Policy, and acknowledge the Company is not responsible for third-party sites. Violation of these terms may result in immediate termination of access, and disputes should be resolved by contacting the Company first, with local laws applying as relevant.