|
11 | 11 | from haystack.components.preprocessors import DocumentSplitter |
12 | 12 | from haystack.components.preprocessors import DocumentCleaner |
13 | 13 |
|
| 14 | + |
| 15 | +from .logger import get_logger |
| 16 | + |
| 17 | +# Create logger instance from base logger config in `logger.py` |
| 18 | +logger = get_logger(__name__) |
| 19 | + |
14 | 20 | HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') |
15 | 21 |
|
16 | 22 | # comment out the following line to disable the embedding cache |
|
19 | 25 | top_k = 5 |
20 | 26 | input_documents = [] |
21 | 27 |
|
| 28 | +# TODO: Add the JSON strings as env variables |
22 | 29 | json_dir = 'json_input' |
23 | 30 | json_fname = 'excellent-articles_10.json' |
24 | 31 |
|
25 | 32 | json_fpath = os.path.join(json_dir, json_fname) |
26 | 33 |
|
27 | 34 | if os.path.isfile(json_fpath): |
28 | | - print(f'[INFO] Loading data from {json_fpath}') |
| 35 | + logger.info(f'Loading data from {json_fpath}') |
29 | 36 | with open(json_fpath, 'r') as finn: |
30 | 37 | json_obj = json.load(finn) |
31 | 38 |
|
32 | 39 | if isinstance(json_obj, dict): |
33 | | - for k, v in tqdm(json_obj.items()): |
34 | | - print(f"Loading {k}") |
35 | | - input_documents.append(Document(content=v, meta={"src": k})) |
36 | | - |
| 40 | + input_documents = [ |
| 41 | + Document( |
| 42 | + content=content_, |
| 43 | + meta={"src": url_} |
| 44 | + ) |
| 45 | + for url_, content_ in tqdm(json_obj.items()) |
| 46 | + ] |
37 | 47 | elif isinstance(json_obj, list): |
38 | | - for obj_ in tqdm(json_obj): |
39 | | - url = obj_['meta'] |
40 | | - content = obj_['content'] |
41 | | - input_documents.append( |
42 | | - Document( |
43 | | - content=content, |
44 | | - meta={'src': url} |
45 | | - ) |
| 48 | + input_documents = [ |
| 49 | + Document( |
| 50 | + content=obj_['content'], |
| 51 | + meta={'src': obj_['meta']} |
46 | 52 | ) |
| 53 | + for obj_ in tqdm(json_obj) |
| 54 | + ] |
47 | 55 | else: |
48 | 56 | input_documents = [ |
49 | 57 | Document( |
|
60 | 68 | ), |
61 | 69 | ] |
62 | 70 |
|
63 | | -splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=0) |
| 71 | +splitter = DocumentSplitter( |
| 72 | + split_by="sentence", |
| 73 | + split_length=5, |
| 74 | + split_overlap=0 |
| 75 | +) |
64 | 76 | input_documents = splitter.run(input_documents)['documents'] |
65 | 77 |
|
66 | 78 | cleaner = DocumentCleaner( |
67 | | - remove_empty_lines=True, |
68 | | - remove_extra_whitespaces=True, |
69 | | - remove_repeated_substrings=False) |
| 79 | + remove_empty_lines=True, |
| 80 | + remove_extra_whitespaces=True, |
| 81 | + remove_repeated_substrings=False |
| 82 | +) |
70 | 83 | input_documents = cleaner.run(input_documents)['documents'] |
71 | 84 |
|
72 | 85 |
|
|
78 | 91 |
|
79 | 92 | # https://huggingface.co/svalabs/german-gpl-adapted-covid |
80 | 93 | sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' |
81 | | -print(f'Sentence Transformer Name: {sentence_transformer_model}') |
| 94 | +logger.info(f'Sentence Transformer Name: {sentence_transformer_model}') |
82 | 95 |
|
83 | 96 | embedder = SentenceTransformersDocumentEmbedder( |
84 | 97 | model=sentence_transformer_model, |
|
87 | 100 |
|
88 | 101 |
|
89 | 102 | if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE): |
90 | | - print("[INFO] Loading embeddings from cache") |
| 103 | + logger.info('Loading embeddings from cache') |
91 | 104 |
|
92 | | - with open(EMBEDDING_CACHE_FILE, 'r') as f: |
93 | | - documentsDict = json.load(f) |
| 105 | + with open(EMBEDDING_CACHE_FILE, 'r') as f_in: |
| 106 | + documents_dict = json.load(f_in) |
94 | 107 | document_store.write_documents( |
95 | | - documents=[Document.from_dict(d) for d in documentsDict], |
| 108 | + documents=[Document.from_dict(d_) for d_ in documents_dict], |
96 | 109 | policy=DuplicatePolicy.OVERWRITE |
97 | 110 | ) |
98 | 111 |
|
99 | 112 | else: |
100 | | - print("[INFO] Generating embeddings") |
| 113 | +logger.info('Generating embeddings') |
101 | 114 |
|
102 | 115 | embedded = embedder.run(input_documents) |
103 | 116 | document_store.write_documents( |
|
106 | 119 | ) |
107 | 120 |
|
108 | 121 | if EMBEDDING_CACHE_FILE: |
109 | | - with open(EMBEDDING_CACHE_FILE, 'w') as f: |
110 | | - documentsDict = [Document.to_dict(d) for d in embedded['documents']] |
111 | | - json.dump(documentsDict, f) |
| 122 | + with open(EMBEDDING_CACHE_FILE, 'w') as f_out: |
| 123 | + documents_dict = [ |
| 124 | + Document.to_dict(d_) |
| 125 | + for d_ in embedded['documents'] |
| 126 | + ] |
| 127 | + json.dump(documents_dict, f_out) |
112 | 128 |
|
113 | 129 | retriever = InMemoryEmbeddingRetriever(document_store=document_store) |
114 | | - |
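
Note: the commit imports get_logger from a local logger.py that is not part of this diff. As a rough sketch only, a minimal module satisfying that import could look like the following (the function name matches the import; handler, level, and format string are assumptions, not taken from this repository):

    # logger.py -- hypothetical sketch; the real module may differ
    import logging
    import sys

    def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
        """Return a logger with one stream handler and a shared format."""
        logger = logging.getLogger(name)
        if not logger.handlers:  # avoid duplicate handlers on re-import
            handler = logging.StreamHandler(sys.stdout)
            handler.setFormatter(
                logging.Formatter('[%(levelname)s] %(name)s: %(message)s')
            )
            logger.addHandler(handler)
            logger.setLevel(level)
        return logger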
|
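For reference, the loading branch above accepts two JSON shapes. Hypothetical miniature examples of each (URLs and article text invented; only the structure is inferred from the code in this diff):

    # Shape 1: mapping of source URL -> article text
    json_obj = {
        "https://de.wikipedia.org/wiki/Beispiel": "Artikeltext ...",
    }

    # Shape 2: list of records with 'meta' (source URL) and 'content' keys
    json_obj = [
        {"meta": "https://de.wikipedia.org/wiki/Beispiel", "content": "Artikeltext ..."},
    ]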
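A typical way to query the resulting InMemoryEmbeddingRetriever, sketched against the Haystack 2.x API and reusing sentence_transformer_model, retriever, and top_k from the script above (the query string is an invented example, not part of this commit):

    from haystack.components.embedders import SentenceTransformersTextEmbedder

    # Embed the query with the same model used for the documents so that
    # query and document vectors share one embedding space.
    query_embedder = SentenceTransformersTextEmbedder(model=sentence_transformer_model)
    query_embedder.warm_up()
    query_embedding = query_embedder.run("Wie verbreitet sich das Virus?")['embedding']

    results = retriever.run(query_embedding=query_embedding, top_k=top_k)
    for doc_ in results['documents']:
        print(doc_.meta['src'], doc_.score)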