sceptix-club · Rishal14 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+/venv
diff --git a/acetonitrile_sds_with_subheadings.json b/acetonitrile_sds_with_subheadings.json
diff --git a/data_preprocessor.ipynb b/data_preprocessor.ipynb
diff --git a/prepocess.py b/prepocess.py
@@ -0,0 +1,77 @@
+import fitz 
+import json
+import re
+
+def extract_text_with_structure(pdf_path):
+    """Extract every non-empty text line of a PDF with basic style metadata.
+
+    Args:
+        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
+
+    Returns:
+        A list of dicts in reading order, one per kept line, with keys
+        ``"text"`` (stripped line text), ``"font_size"`` and ``"bold"``
+        (both taken from the line's first span) and ``"page"`` (1-based).
+        Lines that start with "Page <number>" are skipped.
+
+    NOTE(review): ``import fitz`` needs the PyMuPDF distribution; the PyPI
+    package literally named ``fitz`` is a different project - confirm the
+    environment installs PyMuPDF.
+    """
+    # PyMuPDF span flags are a bit field: 1 superscript, 2 italic, 4 serif,
+    # 8 monospace, 16 bold. Testing bit value 2 would detect italic text.
+    bold_flag = 16
+    page_marker = re.compile(r'^\s*Page\s+\d+')
+    extracted_data = []
+
+    # The context manager closes the document even if extraction fails.
+    with fitz.open(pdf_path) as doc:
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+
+            for block in page.get_text("dict")['blocks']:
+                # Blocks without a 'lines' key (e.g. images) hold no text.
+                for line in block.get('lines', ()):
+                    spans = line['spans']
+                    if not spans:
+                        # Nothing to read style information from.
+                        continue
+
+                    text_line = ''.join(span['text'] for span in spans).strip()
+                    # Drop blank lines and "Page X" markers.
+                    if not text_line or page_marker.match(text_line):
+                        continue
+
+                    first_span = spans[0]
+                    extracted_data.append({
+                        "text": text_line,
+                        "font_size": first_span['size'],
+                        "bold": bool(first_span['flags'] & bold_flag),
+                        "page": page_num + 1
+                    })
+
+    return extracted_data
+
+def process_extracted_text_with_subheadings(extracted_data):
+    """Group extracted lines into ``{section: {subheading: [lines]}}``.
+
+    Args:
+        extracted_data: Iterable of dicts with at least the keys ``"text"``
+            (str) and ``"bold"`` (truthy for bold lines).
+
+    Returns:
+        A dict keyed by section heading. A section heading is a line made of
+        a number, a dot, whitespace and then only letters/whitespace. Inside
+        a section, each bold line starts a subheading whose value is the list
+        of following non-bold lines; non-bold lines seen before any
+        subheading are collected under the key ``"content"``. Lines that
+        appear before the first section heading are ignored.
+    """
+    section_heading = re.compile(r'^\d+\.\s+[A-Za-z\s]+$')
+    json_output = {}
+    current_section = None
+    current_subheading = None
+
+    for item in extracted_data:
+        text = item['text']
+
+        if section_heading.match(text):
+            current_section = text
+            # setdefault (not assignment): a heading that shows up again
+            # must not wipe the content already collected under it.
+            json_output.setdefault(current_section, {})
+            current_subheading = None
+            continue
+
+        if current_section is None:
+            # No section opened yet; nowhere to attach this line.
+            continue
+
+        section = json_output[current_section]
+        if item['bold']:
+            # Bold text opens (or re-opens) a subheading in this section.
+            current_subheading = text
+            section.setdefault(current_subheading, [])
+        else:
+            # Body text goes to the open subheading, else to "content".
+            section.setdefault(current_subheading or "content", []).append(text)
+
+    return json_output
+
+def save_json_to_file(json_data, output_file):
+    """Write ``json_data`` to ``output_file`` as 4-space-indented JSON.
+
+    Args:
+        json_data: Any JSON-serializable object.
+        output_file: Destination path; an existing file is overwritten.
+
+    The file is always encoded as UTF-8 and non-ASCII characters are written
+    as-is rather than as ``\\uXXXX`` escapes, so the result does not depend
+    on the platform's default encoding and stays human-readable.
+    """
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(json_data, f, indent=4, ensure_ascii=False)
+
+def process_pdf_to_json_with_subheadings(pdf_path, output_file):
+    """Run the full pipeline: read the PDF, structure it, write the JSON.
+
+    Args:
+        pdf_path: Path of the PDF to convert.
+        output_file: Path of the JSON file to write.
+
+    Prints a confirmation message once the file has been saved.
+    """
+    structured = process_extracted_text_with_subheadings(
+        extract_text_with_structure(pdf_path)
+    )
+    save_json_to_file(structured, output_file)
+
+    print(f"PDF data successfully converted to JSON with subheadings and saved to {output_file}")
+
+# Example usage
+pdf_path = "./data/acetonitrile-hplc-grade-l (1).pdf"
+output_file = "acetonitrile_sds_with_subheadings.json"
+
+# Guarded so that importing this module does not start a conversion run;
+# executing the file as a script behaves exactly as before.
+if __name__ == "__main__":
+    process_pdf_to_json_with_subheadings(pdf_path, output_file)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,109 @@
+aiohttp==3.9.5
+aiohttp-retry==2.8.3
+aiosignal==1.3.1
+altgraph==0.17.4
+attrs==23.2.0
+beautifulsoup4==4.12.2
+blinker==1.8.2
+bs4==0.0.1
+cachetools==5.3.2
+certifi==2023.11.17
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.3.2
+ci-info==0.3.0
+click==8.1.7
+colorama==0.4.6
+configobj==5.0.9
+configparser==7.1.0
+cryptography==43.0.1
+cssselect==1.2.0
+cssutils==2.11.1
+decorator==5.1.1
+dnspython==2.6.1
+etelemetry==0.3.1
+filelock==3.16.1
+fitz==0.0.1.dev2
+Flask==3.0.3
+frozenlist==1.4.1
+future==1.0.0
+geocoder==1.38.1
+google-api-core==2.14.0
+google-api-python-client==2.109.0
+google-auth==2.24.0
+google-auth-httplib2==0.1.1
+google-auth-oauthlib==1.1.0
+googleapis-common-protos==1.61.0
+httplib2==0.22.0
+idna==3.6
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+looseversion==1.3.0
+lxml==4.9.3
+MarkupSafe==2.1.5
+more-itertools==10.3.0
+multidict==6.0.5
+networkx==3.3
+nibabel==5.2.1
+nipype==1.8.6
+numpy==2.0.0
+oauthlib==3.2.2
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.0
+pandas==2.2.2
+pathlib==1.0.1
+pdfminer.six==20231228
+pdfplumber==0.11.4
+pefile==2023.2.7
+pillow==10.3.0
+premailer==3.10.0
+protobuf==4.25.1
+prov==2.0.1
+psycopg2==2.9.9
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+PyAudio==0.2.14
+pycparser==2.22
+pydot==3.0.2
+pyinstaller==6.7.0
+pyinstaller-hooks-contrib==2024.6
+PyJWT==2.8.0
+pymongo==4.8.0
+pyparsing==3.1.1
+pypdfium2==4.30.0
+pyperclip==1.9.0
+PyQt5==5.15.10
+PyQt5-Qt5==5.15.2
+PyQt5-sip==12.13.0
+PyQtWebEngine==5.15.6
+PyQtWebEngine-Qt5==5.15.2
+pytesseract==0.3.10
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.0
+pytz==2024.1
+pywin32-ctypes==0.2.2
+pyxnat==1.6.2
+ratelim==0.1.6
+rdflib==6.3.2
+reportlab==4.2.2
+requests==2.31.0
+requests-oauthlib==1.3.1
+rsa==4.9
+scipy==1.14.1
+setuptools==70.0.0
+simplejson==3.19.3
+six==1.16.0
+soupsieve==2.5
+SpeechRecognition==3.10.4
+tk==0.1.0
+traits==6.3.2
+twilio==9.2.3
+typing_extensions==4.12.2
+tzdata==2024.1
+uritemplate==4.1.1
+urllib3==2.1.0
+Werkzeug==3.0.3
+yagmail==0.15.293
+yarl==1.9.4