1+ import re ,os
2+ import json
3+ from pathlib import Path
4+ from docx import Document
5+ from PyPDF2 import PdfReader
6+
def extract_text_from_pdf(file_path):
    """Return the text of the PDF at *file_path* as a single string.

    Each page's ``extract_text()`` result is collected in page order; pages
    that yield a falsy result (``None`` or an empty string) are left out.
    The remaining page texts are joined with ``"\\n "``.
    """
    pages = PdfReader(file_path).pages
    extracted = (page.extract_text() for page in pages)
    # Drop pages for which no text could be extracted.
    return "\n ".join(chunk for chunk in extracted if chunk)
15+
16+
17+ # ------------------------------
18+ # 2. Extract text from DOCX
19+ # ------------------------------
def extract_text_from_docx(file_path):
    """Return the text of the DOCX at *file_path* as a single string.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined, in document order, with ``"\\n "``.
    """
    kept = []
    for paragraph in Document(file_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n ".join(kept)
23+
def extract_abbreviations(text):
    """Collect ``abbreviation -> full form`` pairs found in *text*.

    A pair is recognised when a run of 2-10 letters/dots is followed
    (optionally after whitespace) by a parenthesised phrase, e.g.::

        BP (Blood Pressure)
        b.p. (Blood Pressure)
        COPD (Chronic Obstructive Pulmonary Disease)

    Keys are normalised by removing dots and lower-casing, so ``BP`` and
    ``b.p.`` both map to ``"bp"``. When the same key occurs more than once,
    the last occurrence in *text* wins.
    """
    matcher = re.compile(r"\b([A-Za-z\.]{2,10})\s*\(([^)]+)\)")
    found = {}

    for raw_short, raw_long in matcher.findall(text):
        short = raw_short.replace(".", "").lower().strip()
        expansion = raw_long.strip()

        # Removing dots can shrink the key below two characters (e.g. "x.");
        # such candidates, and blank expansions, are discarded.
        if not expansion or len(short) < 2 or len(short) > 10:
            continue
        found[short] = expansion

    return found
44+
45+
46+ # ------------------------------
47+ # 4. Build dictionary from multiple files
48+ # ------------------------------
def build_abbreviation_dict(files, save_path="abbr_dict.json"):
    """Build one abbreviation dictionary from several documents and save it.

    Each entry of *files* is dispatched on its lower-cased suffix: ``.pdf``
    and ``.docx`` files are read and scanned for abbreviations, anything
    else is reported and skipped. Results are merged in iteration order, so
    on duplicate keys the entry from the later file replaces the earlier one.

    The merged dictionary is written to *save_path* as UTF-8 JSON (indent 2,
    non-ASCII characters kept as-is) and also returned.
    """
    merged = {}

    for file in files:
        print("processing :", file)
        suffix = Path(file).suffix.lower()

        if suffix not in (".pdf", ".docx"):
            print(f"Skipping unsupported file: { file } ")
            continue

        if suffix == ".pdf":
            text = extract_text_from_pdf(file)
        else:
            text = extract_text_from_docx(file)

        merged.update(extract_abbreviations(text))

    # Persist the merged result once all files have been processed.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"✅ Abbreviation dictionary saved to { save_path } ")
    return merged
73+
74+
75+ # ------------------------------
76+ # 5. Expand abbreviations in query using dictionary
77+ # ------------------------------
def expand_abbreviations_in_text(query, abbr_dict):
    """Replace known abbreviations in *query* with their full forms.

    *query* is split on whitespace and each token is looked up in
    *abbr_dict* (keys are expected to be lower-case with dots removed, as
    produced by ``extract_abbreviations``). Tokens are re-joined with single
    spaces, so runs of whitespace in *query* are collapsed.

    Lookup happens in two steps:

    1. The whole token, lower-cased and with dots removed, is tried first.
       On a hit the token is replaced by the full form alone (so ``"BP."``
       becomes ``"Blood Pressure"``).
    2. Otherwise leading/trailing non-word characters are peeled off and the
       remaining core is tried, so tokens such as ``"COPD,"``, ``"(BP)"`` or
       ``"BP?"`` are expanded too. The surrounding punctuation is put back
       around the full form, except for dots directly after the core, which
       are treated as part of the abbreviation (``"b.p.,"`` ->
       ``"Blood Pressure,"``).

    Tokens that match neither way are kept unchanged.
    """
    # prefix = leading punctuation, core = candidate abbreviation,
    # suffix = trailing punctuation. Always matches a whitespace-free token.
    edge_punct = re.compile(r"(\W*)(.*?)(\W*)")
    expanded = []

    for word in query.split():
        key = word.lower().replace(".", "")
        if key in abbr_dict:
            expanded.append(abbr_dict[key])
            continue

        # Fallback: ignore punctuation glued to either side of the token.
        prefix, core, suffix = edge_punct.fullmatch(word).groups()
        core_key = core.lower().replace(".", "")
        if core_key and core_key in abbr_dict:
            expanded.append(f"{prefix}{abbr_dict[core_key]}{suffix.lstrip('.')}")
        else:
            expanded.append(word)

    return " ".join(expanded)
93+
# Demo run: build the dictionary from every file in a local folder, then
# expand a sample query. Guarded so that importing this module for its
# functions does not trigger file processing.
if __name__ == "__main__":
    path = "D:/Documents/RHL-RAG-PROJECT/FILES"
    # Use a single directory constant and sort the listing: later files
    # overwrite earlier ones on duplicate keys, so a stable order keeps the
    # resulting dictionary reproducible.
    arr = [f"{path}/{i}" for i in sorted(os.listdir(path))]

    abbr_dict = build_abbreviation_dict(arr)

    query = "Patient has COPD and high BP."
    expanded_query = expand_abbreviations_in_text(query, abbr_dict)

    print("\n Original:", query)
    print("Expanded:", expanded_query)
0 commit comments