Skip to content

Commit a0ed859

Browse files
added new architecture_v3.py
1 parent 734f0c8 commit a0ed859

File tree

6 files changed

+1587
-73
lines changed

6 files changed

+1587
-73
lines changed

Feedback.docx

38.1 KB
Binary file not shown.

abbr.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import re,os
2+
import json
3+
from pathlib import Path
4+
from docx import Document
5+
from PyPDF2 import PdfReader
6+
7+
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are left out of the output.
    """
    pdf = PdfReader(file_path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)
15+
16+
17+
# ------------------------------
18+
# 2. Extract text from DOCX
19+
# ------------------------------
20+
def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.
    """
    kept = []
    for para in Document(file_path).paragraphs:
        if para.text.strip():
            kept.append(para.text)
    return "\n".join(kept)
23+
24+
def extract_abbreviations(text):
    """
    Collect ``abbreviation -> full form`` pairs written as ``ABBR (Full form)``.

    Recognised shapes include:
    - BP (Blood Pressure)
    - b.p. (Blood Pressure)
    - COPD (Chronic Obstructive Pulmonary Disease)

    Keys are lower-cased with dots removed ("b.p." -> "bp"). When the same
    key occurs more than once, the last occurrence wins. Whitespace inside
    the full form (including line breaks) is collapsed to single spaces.

    NOTE(review): the pattern accepts ANY 2-10 letter token in front of a
    parenthesis, so ordinary words such as "pressure (mmHg)" are captured
    as abbreviations too.

    Args:
        text: Raw text to scan.

    Returns:
        dict mapping each normalised abbreviation to its full form.
    """
    abbr_dict = {}

    # 2-10 letters/dots, optional whitespace, then a parenthesised full form.
    pattern = re.compile(r"\b([A-Za-z\.]{2,10})\s*\(([^)]+)\)")

    for match in pattern.finditer(text):
        abbr = match.group(1).replace(".", "").lower()
        # ``[^)]+`` also matches line breaks, so a definition wrapped across
        # lines would otherwise keep its newlines / runs of spaces.
        full_form = " ".join(match.group(2).split())

        # Removing dots can shrink the token below two characters ("a." -> "a").
        if len(abbr) >= 2 and full_form:
            abbr_dict[abbr] = full_form

    return abbr_dict
44+
45+
46+
# ------------------------------
47+
# 4. Build dictionary from multiple files
48+
# ------------------------------
49+
def build_abbreviation_dict(files, save_path="abbr_dict.json"):
    """Merge the abbreviations found in several documents and save them as JSON.

    Each entry of ``files`` is dispatched on its lower-cased extension:
    ``.pdf`` and ``.docx`` are read and scanned, anything else is reported
    and skipped. Results are merged in iteration order, so a later file
    overrides an earlier one for the same key. The merged mapping is always
    written to ``save_path`` (UTF-8, indented) and also returned.
    """
    merged = {}

    for source in files:
        print("processing :", source)
        suffix = Path(source).suffix.lower()

        # Guard clause: anything that is neither PDF nor DOCX is ignored.
        if suffix not in (".pdf", ".docx"):
            print(f"Skipping unsupported file: {source}")
            continue

        if suffix == ".pdf":
            raw_text = extract_text_from_pdf(source)
        else:
            raw_text = extract_text_from_docx(source)

        merged.update(extract_abbreviations(raw_text))

    # Persist the merged dictionary as human-readable JSON.
    with open(save_path, "w", encoding="utf-8") as out_file:
        json.dump(merged, out_file, indent=2, ensure_ascii=False)

    print(f"✅ Abbreviation dictionary saved to {save_path}")
    return merged
73+
74+
75+
# ------------------------------
76+
# 5. Expand abbreviations in query using dictionary
77+
# ------------------------------
78+
def expand_abbreviations_in_text(query, abbr_dict):
    """
    Replace abbreviations in ``query`` with their full form from ``abbr_dict``.

    The query is split on whitespace. For each token the surrounding
    punctuation is set aside, the remaining core is lower-cased with dots
    removed and looked up, and on a hit the punctuation is re-attached
    around the full form. So "COPD," and "(BP)" are expanded and a
    sentence-final "BP." keeps its period. If the core does not match, the
    whole token (lower-cased, dots removed) is tried as a key so that keys
    which themselves contain punctuation still match.

    Args:
        query: Text to expand.
        abbr_dict: Mapping of normalised abbreviation -> full form.

    Returns:
        The expanded text with tokens joined by single spaces.
    """
    # leading non-word chars, shortest possible core, trailing non-word chars
    token_re = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
    expanded = []

    for word in query.split():
        lead, core, trail = token_re.match(word).groups()
        key = core.lower().replace(".", "")

        if key in abbr_dict:
            # A dotted abbreviation ("b.p.") owns its final dot; do not carry
            # that dot over onto the expanded form.
            if "." in core and trail.startswith("."):
                trail = trail[1:]
            expanded.append(f"{lead}{abbr_dict[key]}{trail}")
            continue

        # Fallback: whole-token lookup, for keys that include punctuation.
        whole_key = word.lower().replace(".", "")
        if whole_key in abbr_dict:
            expanded.append(abbr_dict[whole_key])
        else:
            expanded.append(word)

    return " ".join(expanded)
93+
94+
# Driver: build the abbreviation dictionary from every file in the source
# folder, then expand a sample query with it.
# NOTE(review): these statements run whenever the module is imported; consider
# an `if __name__ == "__main__":` guard once it is confirmed that no importer
# relies on the module-level `abbr_dict`.
path = "D:/Documents/RHL-RAG-PROJECT/FILES"
# List the same directory the paths are built from (one literal instead of
# two spellings), and sort it: os.listdir order is arbitrary, and the merge
# in build_abbreviation_dict lets later files override earlier ones.
arr = [f"{path}/{i}" for i in sorted(os.listdir(path))]

abbr_dict = build_abbreviation_dict(arr)

query = "Patient has COPD and high BP."
expanded_query = expand_abbreviations_in_text(query, abbr_dict)

print("\nOriginal:", query)
print("Expanded:", expanded_query)

0 commit comments

Comments
 (0)