1- import logging
2- import coloredlogs
31import json
42import argparse
53import boto3
64from utils import dataset , secret , opensearch
5+ from loguru import logger
6+ import sys
7+ import os
78
8- coloredlogs .install (fmt = '%(asctime)s %(levelname)s %(message)s' , datefmt = '%H:%M:%S' , level = 'INFO' )
9- logging .basicConfig (level = logging .INFO )
10- logger = logging .getLogger (__name__ )
9+
10+ # logger
11+ logger .remove ()
12+ logger .add (sys .stdout , level = os .getenv ("LOG_LEVEL" , "INFO" ))
1113
1214
1315def parse_args ():
@@ -42,7 +44,7 @@ def create_vector_embedding_with_bedrock(text, name, bedrock_client):
4244
4345
4446def main ():
45- logging .info ("Starting" )
47+ logger .info ("Starting" )
4648
4749 dataset_url = "https://huggingface.co/datasets/sentence-transformers/embedding-training-data/resolve/main/gooaq_pairs.jsonl.gz"
4850 early_stop_record_count = 100
@@ -52,29 +54,29 @@ def main():
5254 name = args .index
5355
5456 # Prepare OpenSearch index with vector embeddings index mapping
55- logging .info (f"recreating opensearch index: { args .recreate } , using early stop: { args .early_stop } to insert only { early_stop_record_count } records" )
56- logging .info ("Preparing OpenSearch Index" )
57+ logger .info (f"recreating opensearch index: { args .recreate } , using early stop: { args .early_stop } to insert only { early_stop_record_count } records" )
58+ logger .info ("Preparing OpenSearch Index" )
5759 opensearch_password = secret .get_secret (name , region )
5860 opensearch_client = opensearch .get_opensearch_cluster_client (name , opensearch_password , region )
5961
6062 # Check if to delete OpenSearch index with the argument passed to the script --recreate 1
6163 if args .recreate :
6264 response = opensearch .delete_opensearch_index (opensearch_client , name )
6365 if response :
64- logging .info ("OpenSearch index successfully deleted" )
66+ logger .info ("OpenSearch index successfully deleted" )
6567
66- logging .info (f"Checking if index { name } exists in OpenSearch cluster" )
68+ logger .info (f"Checking if index { name } exists in OpenSearch cluster" )
6769 exists = opensearch .check_opensearch_index (opensearch_client , name )
6870 if not exists :
69- logging .info ("Creating OpenSearch index" )
71+ logger .info ("Creating OpenSearch index" )
7072 success = opensearch .create_index (opensearch_client , name )
7173 if success :
72- logging .info ("Creating OpenSearch index mapping" )
74+ logger .info ("Creating OpenSearch index mapping" )
7375 success = opensearch .create_index_mapping (opensearch_client , name )
74- logging .info (f"OpenSearch Index mapping created" )
76+ logger .info (f"OpenSearch Index mapping created" )
7577
7678 # Download sample dataset from HuggingFace
77- logging .info ("Downloading dataset from HuggingFace" )
79+ logger .info ("Downloading dataset from HuggingFace" )
7880 compressed_file_path = dataset .download_dataset (dataset_url )
7981 if compressed_file_path is not None :
8082 file_path = dataset .decompress_dataset (compressed_file_path )
@@ -86,7 +88,7 @@ def main():
8688
8789 # Vector embedding using Amazon Bedrock Titan text embedding
8890 all_json_records = []
89- logging .info (f"Creating embeddings for records" )
91+ logger .info (f"Creating embeddings for records" )
9092
9193 # using the arg --early-stop
9294 i = 0
@@ -96,24 +98,24 @@ def main():
9698 if i > early_stop_record_count :
9799 # Bulk put all records to OpenSearch
98100 success , failed = opensearch .put_bulk_in_opensearch (all_json_records , opensearch_client )
99- logging .info (f"Documents saved { success } , documents failed to save { failed } " )
101+ logger .info (f"Documents saved { success } , documents failed to save { failed } " )
100102 break
101103 records_with_embedding = create_vector_embedding_with_bedrock (record , name , bedrock_client )
102- logging .info (f"Embedding for record { i } created" )
104+ logger .info (f"Embedding for record { i } created" )
103105 all_json_records .append (records_with_embedding )
104106 if i % 500 == 0 or i == len (all_records )- 1 :
105107 # Bulk put all records to OpenSearch
106108 success , failed = opensearch .put_bulk_in_opensearch (all_json_records , opensearch_client )
107109 all_json_records = []
108- logging .info (f"Documents saved { success } , documents failed to save { failed } " )
110+ logger .info (f"Documents saved { success } , documents failed to save { failed } " )
109111
110- logging .info ("Finished creating records using Amazon Bedrock Titan text embedding" )
112+ logger .info ("Finished creating records using Amazon Bedrock Titan text embedding" )
111113
112- logging .info ("Cleaning up" )
114+ logger .info ("Cleaning up" )
113115 dataset .delete_file (compressed_file_path )
114116 dataset .delete_file (file_path )
115117
116- logging .info ("Finished" )
118+ logger .info ("Finished" )
117119
118120if __name__ == "__main__" :
119121 main ()