Skip to content

Commit 082161d

Browse files
committed
This PR contains scripts used for the article Building a recipe search with Elasticsearch
1 parent e74f9b8 commit 082161d

File tree

9 files changed

+12681
-0
lines changed

9 files changed

+12681
-0
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Building a Recipe Search with Elasticsearch
2+
3+
This project demonstrates how to implement a semantic search using Elastic's
4+
ELSER and compare its results with a traditional lexical search. The setup is made practical and efficient by using a cluster created in Elastic Cloud, simplifying the use of ELSER and accelerating development.
5+
6+
> **Tip:** To learn more about Elastic Cloud and how to use it, visit: [https://www.elastic.co/pt/cloud](https://www.elastic.co/pt/cloud)
7+
8+
## Project Objectives
9+
10+
1. **Configure Elasticsearch infrastructure** to support semantic and lexical search indexes.
11+
2. **Data ingestion**: Use Python scripts to populate indexes with grocery product data.
12+
3. **Compare search types**: Perform searches and display the results for comparison.
13+
14+
## Prerequisites
15+
16+
- **Elasticsearch v8.15** (recommended): To support ELSER.
17+
- **Python 3.x**: Required to run the ingestion and search scripts.
18+
- **Python Libraries**: Required libraries are listed in the `requirements.txt` file.
19+
20+
To install the dependencies, use the following command:
21+
22+
```bash
23+
pip install -r requirements.txt
24+
```
25+
26+
## Creating the Indexes
27+
To create the semantic and lexical search indexes, run the following scripts:
28+
29+
### Semantic Index
30+
31+
```bash
32+
python infra.py
33+
```
34+
35+
### Lexical Index
36+
```bash
37+
python infra_lexical_index.py
38+
```
39+
40+
These scripts will automatically configure the indexes in Elasticsearch.
41+
42+
## Data Ingestion
43+
To ingest the grocery product data into the indexes, use the commands below:
44+
45+
### Ingest Data into the Semantic Index
46+
47+
```bash
48+
python ingestion.py
49+
```
50+
51+
### Ingest Data into the Lexical Index
52+
```bash
53+
python ingestion_lexical_index.py
54+
```
55+
56+
## Search
57+
To perform searches and obtain results from both the semantic and lexical searches,
58+
run the following command:
59+
60+
```bash
61+
python search.py
62+
```
63+
64+
This script performs searches in both indexes and displays the results in the console,
65+
making it easy to compare the two approaches.
66+
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import yaml
2+
from elasticsearch import Elasticsearch, AsyncElasticsearch
3+
4+
5+
class ElasticsearchConnection:
    """Builds Elasticsearch clients from a YAML config file.

    The config file must contain ``cloud_id`` and ``api_key`` keys for an
    Elastic Cloud deployment.
    """

    def __init__(self, config_file="config.yml"):
        # Load credentials once and reuse them for both client kinds.
        with open(config_file, 'r') as f:
            self._config = yaml.safe_load(f)
        self.client = Elasticsearch(
            cloud_id=self._config['cloud_id'],
            api_key=self._config['api_key']
        )

    def get_client(self):
        """Return the synchronous Elasticsearch client."""
        return self.client

    def get_async_client(self):
        """Return an AsyncElasticsearch client (created lazily, cached).

        Bug fix: this previously re-read the hardcoded "config.yml"
        (ignoring the path passed to __init__) and overwrote ``self.client``,
        so a later ``get_client()`` call returned the async client.
        """
        if not hasattr(self, '_async_client'):
            self._async_client = AsyncElasticsearch(
                cloud_id=self._config['cloud_id'],
                api_key=self._config['api_key'],
                # Long timeout: ELSER-backed bulk ingestion can be slow.
                request_timeout=240)
        return self._async_client

supporting-blog-content/building-a-recipe-search-with-elasticsearch/files/output.json

Lines changed: 12301 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from elasticsearch_connection import ElasticsearchConnection

# Synchronous client shared by the setup functions below.
client = ElasticsearchConnection().get_client()
4+
5+
6+
def create_index_embedding():
    """Create the "grocery-catalog-elser" index with an ELSER semantic field."""
    mapping_properties = {
        "id": {"type": "integer"},
        "name": {"type": "text"},
        "description": {
            "type": "text",
            # Copied into the semantic field so ELSER embeds the description.
            "copy_to": "description_embedding",
        },
        "category": {"type": "keyword"},
        "brand": {"type": "keyword"},
        "price": {"type": "float"},
        "unit": {"type": "keyword"},
        "description_embedding": {
            "type": "semantic_text",
            "inference_id": "elser_embeddings",
        },
    }
    response = client.indices.create(
        index="grocery-catalog-elser",
        mappings={"properties": mapping_properties},
    )
    print(response)
41+
42+
43+
def create_inference():
    """Register the ELSER sparse-embedding inference endpoint."""
    elser_config = {
        "service": "elser",
        "service_settings": {
            "num_allocations": 1,
            "num_threads": 1,
        },
    }
    response = client.inference.put(
        inference_id="elser_embeddings",
        task_type="sparse_embedding",
        body=elser_config,
    )
    print(response)
54+
55+
56+
if __name__ == '__main__':

    # The inference endpoint must exist before the index is created, because
    # the index mapping references inference_id "elser_embeddings".
    create_inference()

    create_index_embedding()
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from elasticsearch_connection import ElasticsearchConnection

# Synchronous client used to create the lexical index below.
client = ElasticsearchConnection().get_client()
4+
5+
6+
def create_index():
    """Create the plain-text "grocery-catalog" index used for lexical search."""
    mapping_properties = {
        "id": {"type": "integer"},
        "name": {"type": "text"},
        "description": {"type": "text"},
        "category": {"type": "keyword"},
        "brand": {"type": "keyword"},
        "price": {"type": "float"},
        "unit": {"type": "keyword"},
    }
    response = client.indices.create(
        index="grocery-catalog",
        mappings={"properties": mapping_properties},
    )
    print(response)
36+
37+
38+
if __name__ == '__main__':
    # Create the lexical index; no inference endpoint is needed here.
    create_index()
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import asyncio
import json

from elasticsearch import helpers

from elasticsearch_connection import ElasticsearchConnection

# Async client used for the bulk-indexing coroutines below.
async_client = ElasticsearchConnection().get_async_client()
9+
10+
11+
def partition_list(lst, chunk_size):
    """Split *lst* into consecutive chunks of at most *chunk_size* items."""
    return [
        lst[start:start + chunk_size]
        for start in range(0, len(lst), chunk_size)
    ]
13+
14+
15+
async def index_data():
    """Load files/output.json and bulk-index every document into the ELSER index.

    Fix: removed the needless ``global partitions`` declaration, which leaked
    a loop-local list into module scope for no reason.
    """
    with open('files/output.json', 'r') as file:
        data_json = json.load(file)

    # Wrap each source document in a bulk action targeting the ELSER index.
    documents = [
        {"_index": "grocery-catalog-elser", "_source": doc}
        for doc in data_json
    ]

    # Batches of 500 keep each bulk request a reasonable size.
    batches = partition_list(documents, 500)
    for i, batch in enumerate(batches):
        print(f"partition {i + 1}")
        await async_bulk_indexing(async_client, batch)
33+
34+
35+
async def async_bulk_indexing(client, documents):
    """Bulk-index *documents* with the async helper and report the outcome.

    NOTE(review): ``helpers.async_bulk`` returns (success_count, errors); the
    second value is a list of errors, not a count — the message below prints
    it as-is. Confirm whether a count was intended.
    """
    success, failed = await helpers.async_bulk(client, documents)
    print(f"Successfully indexed {success} documents. Failed to index {failed} documents.")
38+
39+
40+
async def main():
    """Run the ingestion end-to-end, then release the client's connections."""
    try:
        await index_data()
    finally:
        # Close the async client so aiohttp sessions don't leak warnings.
        await async_client.close()


if __name__ == '__main__':
    # asyncio.run() supersedes the deprecated get_event_loop() /
    # run_until_complete() pattern (deprecated for this use since 3.10).
    # The __main__ guard also makes the module importable without side effects.
    asyncio.run(main())
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import asyncio
import json

from elasticsearch import helpers

from elasticsearch_connection import ElasticsearchConnection

# Async client used for the bulk-indexing coroutines below.
async_client = ElasticsearchConnection().get_async_client()
9+
10+
11+
def partition_list(lst, chunk_size):
    """Split *lst* into consecutive chunks of at most *chunk_size* items."""
    return [
        lst[start:start + chunk_size]
        for start in range(0, len(lst), chunk_size)
    ]
13+
14+
15+
async def index_data():
    """Load files/output.json and bulk-index every document into the lexical index.

    Fix: removed the needless ``global partitions`` declaration, which leaked
    a loop-local list into module scope for no reason.
    """
    with open('files/output.json', 'r') as file:
        data_json = json.load(file)

    # Wrap each source document in a bulk action targeting the lexical index.
    documents = [
        {"_index": "grocery-catalog", "_source": doc}
        for doc in data_json
    ]

    # Batches of 500 keep each bulk request a reasonable size.
    batches = partition_list(documents, 500)
    for i, batch in enumerate(batches):
        print(f"partition {i + 1}")
        await async_bulk_indexing(async_client, batch)
33+
34+
35+
async def async_bulk_indexing(client, documents):
    """Bulk-index *documents* with the async helper and report the outcome.

    NOTE(review): ``helpers.async_bulk`` returns (success_count, errors); the
    second value is a list of errors, not a count — the message below prints
    it as-is. Confirm whether a count was intended.
    """
    success, failed = await helpers.async_bulk(client, documents)
    print(f"Successfully indexed {success} documents. Failed to index {failed} documents.")
38+
39+
40+
async def main():
    """Run the ingestion end-to-end, then release the client's connections."""
    try:
        await index_data()
    finally:
        # Close the async client so aiohttp sessions don't leak warnings.
        await async_client.close()


if __name__ == '__main__':
    # asyncio.run() supersedes the deprecated get_event_loop() /
    # run_until_complete() pattern (deprecated for this use since 3.10).
    # The __main__ guard also makes the module importable without side effects.
    asyncio.run(main())
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
elasticsearch
2+
aiohttp
3+
pyyaml
4+
pandas
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import pandas as pd

from elasticsearch_connection import ElasticsearchConnection

# Shared synchronous client for both search functions below.
es_client = ElasticsearchConnection().get_client()

# Demo query and number of hits requested per search.
term = "seafood for grilling"
size = 5
9+
10+
11+
def format_text(description, line_length=120):
    """Truncate *description* to its first *line_length* words.

    Note: despite the name, *line_length* counts whitespace-separated words,
    not characters. Longer texts are cut and suffixed with '...'.
    """
    words = description.split()
    if len(words) > line_length:
        return ' '.join(words[:line_length]) + '...'
    return description
17+
18+
19+
def search_semantic(term):
    """Run a semantic (ELSER) query and return the top hits as dicts.

    Each result dict carries 'score', a word-truncated 'name', and a
    word-truncated 'description'. The embedding field is excluded from
    the returned source to keep responses small.
    """
    response = es_client.search(
        index="grocery-catalog-elser",
        size=size,
        source_excludes="description_embedding",
        query={
            "semantic": {
                "field": "description_embedding",
                "query": term
            }
        })

    results = []
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        results.append({
            'score': hit["_score"],
            'name': format_text(source["name"], line_length=10),
            'description': format_text(source["description"]),
        })
    return results
44+
45+
46+
def search_lexical(term):
    """Run a lexical (BM25 multi_match) query and return the top hits as dicts.

    Bug fix: this previously searched "grocery-catalog-elser", so the
    dedicated lexical index ("grocery-catalog") built by
    infra_lexical_index.py / ingestion_lexical_index.py was never used in
    the comparison. The source_excludes for the embedding field was dropped
    because the lexical index has no such field.
    """
    result = []
    response = es_client.search(
        index="grocery-catalog",
        size=size,
        query={
            "multi_match": {
                "query": term,
                "fields": [
                    "name",
                    "description"]
            }
        }
    )

    for hit in response["hits"]["hits"]:
        score = hit["_score"]
        name = format_text(hit["_source"]["name"], line_length=10)
        description = hit["_source"]["description"]
        result.append({
            'score': score,
            'name': name,
            'description': description,
        })
    return result
72+
73+
74+
if __name__ == '__main__':
75+
rs1 = search_semantic(term)
76+
rs2 = search_lexical(term)
77+
78+
df1 = pd.DataFrame(rs1)[['name', 'score']] if rs1 else pd.DataFrame(columns=['name', 'score'])
79+
df2 = pd.DataFrame(rs2)[['name', 'score']] if rs2 else pd.DataFrame(columns=['name', 'score'])
80+
df1 = pd.DataFrame(rs1)[['name', 'score']] if rs1 else pd.DataFrame(columns=['name', 'score'])
81+
df1['Search Type'] = 'Semantic'
82+
83+
df2 = pd.DataFrame(rs2)[['name', 'score']] if rs2 else pd.DataFrame(columns(['name', 'score']))
84+
df2['Search Type'] = 'Lexical'
85+
86+
tabela = pd.concat([df1, df2], axis=0).reset_index(drop=True)
87+
88+
tabela = tabela[['Search Type', 'name', 'score']]
89+
90+
tabela.columns = ['Search Type', 'Name', 'Score']
91+
92+
tabela['Search Type'] = tabela['Search Type'].astype(str).str.ljust(0)
93+
tabela['Name'] = tabela['Name'].astype(str).str.ljust(15)
94+
tabela['Score'] = tabela['Score'].astype(str).str.ljust(5)
95+
96+
print(tabela.to_string(index=False))

0 commit comments

Comments
 (0)