Skip to content

Commit 08ed40e

Browse files
committed
Added Licence Header and fixed .gitingore file
Signed-off-by: hmumtazz <[email protected]>
1 parent db78131 commit 08ed40e

File tree

7 files changed

+291
-21
lines changed

7 files changed

+291
-21
lines changed

opensearch_py_ml/ml_commons/rag_pipeline/rag/.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ ml_commons/rag_pipeline/ingestion/
44
ml_commons/rag_pipeline/rag/config.ini
55
# Ignore virtual environment
66
.venv/
7-
# Or, specify the full path
8-
/Users/hmumtazz/.cursor-tutor/opensearch-py-ml/.venv/
7+
98

109
# Ignore Python cache files
1110
__pycache__/

opensearch_py_ml/ml_commons/rag_pipeline/rag/ingest.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,27 @@
1-
# ingest_class.py
1+
# SPDX-License-Identifier: Apache-2.0
2+
# The OpenSearch Contributors require contributions made to
3+
# this file be licensed under the Apache-2.0 license or a
4+
# compatible open source license.
5+
# Any modifications Copyright OpenSearch Contributors. See
6+
# GitHub history for details.
7+
8+
9+
# Licensed to Elasticsearch B.V. under one or more contributor
10+
# license agreements. See the NOTICE file distributed with
11+
# this work for additional information regarding copyright
12+
# ownership. Elasticsearch B.V. licenses this file to you under
13+
# the Apache License, Version 2.0 (the "License"); you may
14+
# not use this file except in compliance with the License.
15+
# You may obtain a copy of the License at
16+
#
17+
# http://www.apache.org/licenses/LICENSE-2.0
18+
#
19+
# Unless required by applicable law or agreed to in writing,
20+
# software distributed under the License is distributed on an
21+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
# KIND, either express or implied. See the License for the
23+
# specific language governing permissions and limitations
24+
# under the License.
225

326
import os
427
import glob
@@ -15,19 +38,19 @@
1538
import random
1639

1740

18-
from opensearch_class import OpenSearchClass
41+
from opensearch_connector import OpenSearchConnector
1942

2043
init(autoreset=True) # Initialize colorama
2144

22-
class IngestClass:
23-
EMBEDDING_MODEL_ID = 'amazon.titan-embed-text-v1'
45+
class Ingest:
46+
EMBEDDING_MODEL_ID = 'amazon.titan-embed-text-v2:0'
2447

2548
def __init__(self, config):
2649
self.config = config
2750
self.aws_region = config.get('region')
2851
self.index_name = config.get('index_name')
2952
self.bedrock_client = None
30-
self.opensearch = OpenSearchClass(config)
53+
self.opensearch = OpenSearchConnector(config)
3154

3255
def initialize_clients(self):
3356
try:
@@ -160,6 +183,7 @@ def process_and_ingest_data(self, file_paths: List[str]):
160183
for doc in all_documents:
161184
if 'embedding' in doc and doc['embedding'] is not None:
162185
action = {
186+
"_op_type": "index",
163187
"_index": self.index_name,
164188
"_source": {
165189
"nominee_text": doc['text'],
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# The OpenSearch Contributors require contributions made to
3+
# this file be licensed under the Apache-2.0 license or a
4+
# compatible open source license.
5+
# Any modifications Copyright OpenSearch Contributors. See
6+
# GitHub history for details.
7+
8+
9+
# Licensed to Elasticsearch B.V. under one or more contributor
10+
# license agreements. See the NOTICE file distributed with
11+
# this work for additional information regarding copyright
12+
# ownership. Elasticsearch B.V. licenses this file to you under
13+
# the Apache License, Version 2.0 (the "License"); you may
14+
# not use this file except in compliance with the License.
15+
# You may obtain a copy of the License at
16+
#
17+
# http://www.apache.org/licenses/LICENSE-2.0
18+
#
19+
# Unless required by applicable law or agreed to in writing,
20+
# software distributed under the License is distributed on an
21+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
# KIND, either express or implied. See the License for the
23+
# specific language governing permissions and limitations
24+
# under the License.
25+
26+
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, exceptions as opensearch_exceptions
27+
import boto3
28+
from urllib.parse import urlparse
29+
from opensearchpy import helpers as opensearch_helpers
30+
31+
class OpenSearchConnector:
32+
def __init__(self, config):
33+
self.config = config
34+
self.opensearch_client = None
35+
self.aws_region = config.get('region')
36+
self.index_name = config.get('index_name')
37+
self.is_serverless = config.get('is_serverless', 'False') == 'True'
38+
self.opensearch_endpoint = config.get('opensearch_endpoint')
39+
self.opensearch_username = config.get('opensearch_username')
40+
self.opensearch_password = config.get('opensearch_password')
41+
42+
def initialize_opensearch_client(self):
43+
if not self.opensearch_endpoint:
44+
print("OpenSearch endpoint not set. Please run setup first.")
45+
return False
46+
47+
parsed_url = urlparse(self.opensearch_endpoint)
48+
host = parsed_url.hostname
49+
port = parsed_url.port or 443
50+
51+
if self.is_serverless:
52+
credentials = boto3.Session().get_credentials()
53+
auth = AWSV4SignerAuth(credentials, self.aws_region, 'aoss')
54+
else:
55+
if not self.opensearch_username or not self.opensearch_password:
56+
print("OpenSearch username or password not set. Please run setup first.")
57+
return False
58+
auth = (self.opensearch_username, self.opensearch_password)
59+
60+
try:
61+
self.opensearch_client = OpenSearch(
62+
hosts=[{'host': host, 'port': port}],
63+
http_auth=auth,
64+
use_ssl=True,
65+
verify_certs=True,
66+
connection_class=RequestsHttpConnection,
67+
pool_maxsize=20
68+
)
69+
print(f"Initialized OpenSearch client with host: {host} and port: {port}")
70+
return True
71+
except Exception as ex:
72+
print(f"Error initializing OpenSearch client: {ex}")
73+
return False
74+
75+
def create_index(self, embedding_dimension, space_type):
76+
index_body = {
77+
"mappings": {
78+
"properties": {
79+
"nominee_text": {"type": "text"},
80+
"nominee_vector": {
81+
"type": "knn_vector",
82+
"dimension": embedding_dimension,
83+
"method": {
84+
"name": "hnsw",
85+
"space_type": space_type,
86+
"engine": "nmslib",
87+
"parameters": {"ef_construction": 512, "m": 16},
88+
},
89+
},
90+
}
91+
},
92+
"settings": {
93+
"index": {
94+
"number_of_shards": 2,
95+
"knn.algo_param": {"ef_search": 512},
96+
"knn": True,
97+
}
98+
},
99+
}
100+
try:
101+
self.opensearch_client.indices.create(index=self.index_name, body=index_body)
102+
print(f"KNN index '{self.index_name}' created successfully with dimension {embedding_dimension} and space type {space_type}.")
103+
except opensearch_exceptions.RequestError as e:
104+
if 'resource_already_exists_exception' in str(e).lower():
105+
print(f"Index '{self.index_name}' already exists.")
106+
else:
107+
print(f"Error creating index '{self.index_name}': {e}")
108+
109+
def verify_and_create_index(self, embedding_dimension, space_type):
110+
try:
111+
index_exists = self.opensearch_client.indices.exists(index=self.index_name)
112+
if index_exists:
113+
print(f"KNN index '{self.index_name}' already exists.")
114+
else:
115+
self.create_index(embedding_dimension, space_type)
116+
return True
117+
except Exception as ex:
118+
print(f"Error verifying or creating index: {ex}")
119+
return False
120+
121+
def bulk_index(self, actions):
122+
try:
123+
success_count, error_info = opensearch_helpers.bulk(self.opensearch_client, actions)
124+
error_count = len(error_info)
125+
print(f"Indexed {success_count} documents successfully. Failed to index {error_count} documents.")
126+
return success_count, error_count
127+
except Exception as e:
128+
print(f"Error during bulk indexing: {e}")
129+
return 0, len(actions)
130+
131+
def search(self, vector, k=5):
132+
try:
133+
response = self.opensearch_client.search(
134+
index=self.index_name,
135+
body={
136+
"size": k,
137+
"_source": ["nominee_text"],
138+
"query": {
139+
"knn": {
140+
"nominee_vector": {
141+
"vector": vector,
142+
"k": k
143+
}
144+
}
145+
}
146+
}
147+
)
148+
return response['hits']['hits']
149+
except Exception as e:
150+
print(f"Error during search: {e}")
151+
return []

opensearch_py_ml/ml_commons/rag_pipeline/rag/query.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,27 @@
1-
# query_class.py
1+
# SPDX-License-Identifier: Apache-2.0
2+
# The OpenSearch Contributors require contributions made to
3+
# this file be licensed under the Apache-2.0 license or a
4+
# compatible open source license.
5+
# Any modifications Copyright OpenSearch Contributors. See
6+
# GitHub history for details.
7+
8+
9+
# Licensed to Elasticsearch B.V. under one or more contributor
10+
# license agreements. See the NOTICE file distributed with
11+
# this work for additional information regarding copyright
12+
# ownership. Elasticsearch B.V. licenses this file to you under
13+
# the Apache License, Version 2.0 (the "License"); you may
14+
# not use this file except in compliance with the License.
15+
# You may obtain a copy of the License at
16+
#
17+
# http://www.apache.org/licenses/LICENSE-2.0
18+
#
19+
# Unless required by applicable law or agreed to in writing,
20+
# software distributed under the License is distributed on an
21+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
# KIND, either express or implied. See the License for the
23+
# specific language governing permissions and limitations
24+
# under the License.
225

326
import json
427
import tiktoken
@@ -8,20 +31,20 @@
831
import botocore
932
import time
1033
import random
11-
from opensearch_class import OpenSearchClass
34+
from opensearch_connector import OpenSearchConnector
1235

1336
init(autoreset=True) # Initialize colorama
1437

15-
class QueryClass:
16-
EMBEDDING_MODEL_ID = 'amazon.titan-embed-text-v1'
38+
class Query:
39+
EMBEDDING_MODEL_ID = 'amazon.titan-embed-text-v2:0'
1740
LLM_MODEL_ID = 'amazon.titan-text-express-v1'
1841

1942
def __init__(self, config):
2043
self.config = config
2144
self.aws_region = config.get('region')
2245
self.index_name = config.get('index_name')
2346
self.bedrock_client = None
24-
self.opensearch = OpenSearchClass(config)
47+
self.opensearch = OpenSearchConnector(config)
2548

2649
def initialize_clients(self):
2750
try:

opensearch_py_ml/ml_commons/rag_pipeline/rag/rag.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,28 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# The OpenSearch Contributors require contributions made to
3+
# this file be licensed under the Apache-2.0 license or a
4+
# compatible open source license.
5+
# Any modifications Copyright OpenSearch Contributors. See
6+
# GitHub history for details.
7+
8+
9+
# Licensed to Elasticsearch B.V. under one or more contributor
10+
# license agreements. See the NOTICE file distributed with
11+
# this work for additional information regarding copyright
12+
# ownership. Elasticsearch B.V. licenses this file to you under
13+
# the Apache License, Version 2.0 (the "License"); you may
14+
# not use this file except in compliance with the License.
15+
# You may obtain a copy of the License at
16+
#
17+
# http://www.apache.org/licenses/LICENSE-2.0
18+
#
19+
# Unless required by applicable law or agreed to in writing,
20+
# software distributed under the License is distributed on an
21+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
# KIND, either express or implied. See the License for the
23+
# specific language governing permissions and limitations
24+
# under the License.
25+
126
#!/usr/bin/env python3
227

328
"""
@@ -6,9 +31,9 @@
631

732
import argparse
833
import configparser
9-
from rag_setup import SetupClass
10-
from ingest import IngestClass
11-
from query import QueryClass
34+
from rag_setup import Setup
35+
from ingest import Ingest
36+
from query import Query
1237

1338
CONFIG_FILE = 'config.ini'
1439

@@ -35,7 +60,7 @@ def main():
3560
config = load_config()
3661

3762
if args.command == 'setup':
38-
setup = SetupClass()
63+
setup = Setup()
3964
setup.setup_command()
4065
save_config(setup.config)
4166
elif args.command == 'ingest':
@@ -48,7 +73,7 @@ def main():
4873
paths.append(path)
4974
else:
5075
paths = args.paths
51-
ingest = IngestClass(config)
76+
ingest = Ingest(config)
5277
ingest.ingest_command(paths)
5378
elif args.command == 'query':
5479
if not args.queries:
@@ -60,7 +85,7 @@ def main():
6085
queries.append(query)
6186
else:
6287
queries = args.queries
63-
query = QueryClass(config)
88+
query = Query(config)
6489
query.query_command(queries, num_results=args.num_results)
6590
else:
6691
parser.print_help()

opensearch_py_ml/ml_commons/rag_pipeline/rag/rag_setup.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,28 @@
1-
# setup_class.py
1+
# SPDX-License-Identifier: Apache-2.0
2+
# The OpenSearch Contributors require contributions made to
3+
# this file be licensed under the Apache-2.0 license or a
4+
# compatible open source license.
5+
# Any modifications Copyright OpenSearch Contributors. See
6+
# GitHub history for details.
7+
8+
9+
# Licensed to Elasticsearch B.V. under one or more contributor
10+
# license agreements. See the NOTICE file distributed with
11+
# this work for additional information regarding copyright
12+
# ownership. Elasticsearch B.V. licenses this file to you under
13+
# the Apache License, Version 2.0 (the "License"); you may
14+
# not use this file except in compliance with the License.
15+
# You may obtain a copy of the License at
16+
#
17+
# http://www.apache.org/licenses/LICENSE-2.0
18+
#
19+
# Unless required by applicable law or agreed to in writing,
20+
# software distributed under the License is distributed on an
21+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
# KIND, either express or implied. See the License for the
23+
# specific language governing permissions and limitations
24+
# under the License.
25+
226
import boto3
327
import botocore
428
from botocore.config import Config
@@ -13,7 +37,7 @@
1337
from urllib.parse import urlparse
1438
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
1539

16-
class SetupClass:
40+
class Setup:
1741
CONFIG_FILE = 'config.ini'
1842
SERVICE_AOSS = 'opensearchserverless'
1943
SERVICE_BEDROCK = 'bedrock-runtime'

0 commit comments

Comments
 (0)