sceptix-club · MacDsouza · Oct 2, 2024 · Oct 2, 2024 · Oct 4, 2024 · Oct 5, 2024
diff --git a/data.ipynb b/data.ipynb
@@ -0,0 +1,22 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pre.py b/pre.py
@@ -0,0 +1,99 @@
+import fitz  # PyMuPDF
+import json
+import re
+
+# Function to extract and clean PDF content
+def extract_pdf_content(pdf_path):
+    """Extract the cleaned text of every page of a PDF.
+
+    Args:
+        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
+
+    Returns:
+        A list with one dict per page, in page order. Each dict holds the
+        1-based ``"page_number"`` and the page's ``"content"`` after it has
+        been passed through ``clean_content``.
+    """
+    # Open the document as a context manager so the underlying file is
+    # closed even when text extraction raises part-way through.
+    with fitz.open(pdf_path) as pdf_document:
+        return [
+            {
+                "page_number": page_number,
+                "content": clean_content(page.get_text("text")),
+            }
+            for page_number, page in enumerate(pdf_document, start=1)
+        ]
+
+# Function to clean the content
+def clean_content(content):
+    """Return *content* without page counters and ruler lines.
+
+    Two patterns are deleted, in order: text of the form
+    ``Page <n> / <m>`` and one fixed-length run of underscores. The result
+    has its leading and trailing whitespace stripped.
+    """
+    unwanted_patterns = (
+        r'Page\s+\d+\s+/\s+\d+',
+        r'______________________________________________________________________________________________',
+    )
+    for pattern in unwanted_patterns:
+        content = re.sub(pattern, '', content)
+    return content.strip()
+
+# Function to structure the content into a JSON format
+def structure_content(pdf_content):
+    """Group extracted page text under the sixteen numbered section headings.
+
+    Args:
+        pdf_content: Iterable of dicts that each carry a ``"content"``
+            string (the text of one page), in reading order.
+
+    Returns:
+        A dict keyed by section name (``"Identification"`` through
+        ``"Other_Information"``, in heading order). Each value is the
+        section's non-blank lines joined by single spaces, or ``""`` when
+        the heading never appears. Text before the first recognised
+        heading is discarded. A section continues across page boundaries
+        until the next heading is seen.
+    """
+    section_titles = {
+        "1. Identification": "Identification",
+        "2. Hazard(s) identification": "Hazard_Identification",
+        "3. Composition/information on ingredients": "Composition",
+        "4. First-aid measures": "First_Aid_Measures",
+        "5. Fire-fighting measures": "Fire_Fighting_Measures",
+        "6. Accidental release measures": "Accidental_Release_Measures",
+        "7. Handling and storage": "Handling_and_Storage",
+        "8. Exposure controls/personal protection": "Exposure_Controls_Personal_Protection",
+        "9. Physical and chemical properties": "Physical_and_Chemical_Properties",
+        "10. Stability and reactivity": "Stability_and_Reactivity",
+        "11. Toxicological information": "Toxicological_Information",
+        "12. Ecological information": "Ecological_Information",
+        "13. Disposal considerations": "Disposal_Considerations",
+        "14. Transport information": "Transport_Information",
+        "15. Regulatory information": "Regulatory_Information",
+        "16. Other information": "Other_Information",
+    }
+
+    # Headings are compared case-insensitively and with runs of whitespace
+    # collapsed, so "2. HAZARD(S)  IDENTIFICATION" still starts a section
+    # instead of being folded into the previous one.
+    heading_lookup = {
+        title.casefold(): key for title, key in section_titles.items()
+    }
+
+    # The output keys are derived from the heading table so the two can
+    # never drift apart. Lines are collected in lists and joined once.
+    section_lines = {key: [] for key in section_titles.values()}
+
+    current_section = None
+    for page in pdf_content:
+        for raw_line in page["content"].split('\n'):
+            line = raw_line.strip()
+            if not line:
+                # Blank lines carry no text; skipping them avoids runs of
+                # spaces in the joined section.
+                continue
+            heading_key = heading_lookup.get(" ".join(line.split()).casefold())
+            if heading_key is not None:
+                current_section = heading_key
+            elif current_section is not None:
+                section_lines[current_section].append(line)
+
+    return {key: " ".join(lines) for key, lines in section_lines.items()}
+
+# Function to convert the structured data to JSON
+def convert_to_json(data):
+    """Serialise *data* to a JSON string indented by four spaces."""
+    pretty_json = json.dumps(data, indent=4)
+    return pretty_json
+
+# Function to save the JSON data to a file
+def save_json_to_file(json_data, output_path):
+    """Write an already-serialised JSON string to *output_path*.
+
+    Args:
+        json_data: The JSON text to write.
+        output_path: Destination file path. An existing file is
+            overwritten.
+    """
+    # Pin the encoding: without it open() uses the platform locale, so
+    # non-ASCII text could be written differently (or fail) per machine.
+    with open(output_path, 'w', encoding='utf-8') as json_file:
+        json_file.write(json_data)
+
+# Main function to execute the steps
+def main(pdf_path, output_json_path):
+    """Run the whole pipeline for one PDF.
+
+    Extracts the page text from *pdf_path*, groups it into sections,
+    serialises the result as JSON and writes it to *output_json_path*.
+    """
+    pages = extract_pdf_content(pdf_path)
+    sections = structure_content(pages)
+    save_json_to_file(convert_to_json(sections), output_json_path)
+
+# Example usage: runs at import time against the sample data sheet and
+# writes the structured result next to the script.
+pdf_path, output_json_path = 'data/acetone-acs-l (1).pdf', 'structured_output.json'
+main(pdf_path=pdf_path, output_json_path=output_json_path)
diff --git a/preprocessor.py b/preprocessor.py
@@ -0,0 +1,21 @@
+from unstructured.partition.pdf import partition_pdf
+import json
+
+# Parse the sample PDF; partition_pdf returns the document's content as a
+# List[Element], which is dumped to stdout for inspection.
+elements = partition_pdf(filename="data/acetone-acs-l (1).pdf")
+
+print(elements)
+
+# def convert_to_json(data):
+#     return json.dumps(data, indent=4)
+
+# def save_json_to_file(json_data, output_path):
+#     with open(output_path, 'w') as json_file:
+#         json_file.write(json_data)
+
+# def main(el, output_json_path):
+#     json_data = convert_to_json(el)
+#     save_json_to_file(json_data, output_json_path)
+
+# output_json_path = 'structured.json'
+# main(elements, output_json_path)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,228 @@
+absl-py==2.1.0
+anyio==3.5.0
+astunparse==1.6.3
+attrs==21.2.0
+Babel==2.8.0
+backoff==2.2.1
+bcc==0.18.0
+beautifulsoup4==4.10.0
+black==21.10b0
+blinker==1.4
+Brlapi==0.8.3
+certifi==2024.8.30
+cffi==1.17.1
+chardet==4.0.0
+charset-normalizer==3.3.2
+chrome-gnome-shell==0.0.0
+click==8.0.3
+colorama==0.4.4
+coloredlogs==15.0.1
+command-not-found==0.3
+commonmark==0.9.1
+contourpy==1.3.0
+cov-core==1.15.0
+coverage==6.2
+cryptography==43.0.1
+cupshelpers==1.0
+cycler==0.12.1
+dataclasses-json==0.6.7
+dbus-python==1.2.18
+deepdiff==8.0.1
+defer==1.0.6
+distlib==0.3.8
+distro==1.7.0
+emoji==2.13.2
+evdev==1.4.0
+execnet==1.9.0
+filelock==3.16.1
+filetype==1.2.0
+flake8==4.0.1
+flatbuffers==24.3.25
+fonttools==4.53.1
+fsspec==2024.9.0
+gast==0.6.0
+google-pasta==0.2.0
+gpg==1.16.0
+grpcio==1.66.1
+h11==0.13.0
+h5py==3.11.0
+hidpidaemon==18.4.6
+html5lib==1.1
+httpcore==1.0.6
+httplib2==0.20.2
+httpx==0.27.2
+huggingface-hub==0.25.1
+humanfriendly==10.0
+idna==3.10
+importlib-metadata==4.6.4
+iniconfig==1.1.1
+iopath==0.1.10
+isort==5.6.4
+jeepney==0.7.1
+Jinja2==3.0.3
+joblib==1.4.2
+jsonpath-python==1.0.6
+keras==3.5.0
+kernelstub==3.1.4
+keyring==23.5.0
+kiwisolver==1.4.7
+langdetect==1.0.9
+language-selector==0.1
+launchpadlib==1.10.16
+layoutparser==0.3.4
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+libclang==18.1.1
+livereload==2.6.3
+louis==3.20.0
+lxml==4.8.0
+macaroonbakery==1.3.1
+Markdown==3.3.6
+MarkupSafe==2.1.5
+marshmallow==3.22.0
+matplotlib==3.9.2
+mccabe==0.6.1
+mkdocs==1.1.2
+ml-dtypes==0.4.1
+more-itertools==8.10.0
+mpmath==1.3.0
+mypy==0.942
+mypy-extensions==1.0.0
+nala==0.11.1
+namex==0.0.8
+nest-asyncio==1.6.0
+netaddr==0.8.0
+netifaces==0.11.0
+networkx==3.3
+nltk==3.9.1
+nose2==0.9.2
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.77
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.0
+olefile==0.47
+onnx==1.17.0
+onnxruntime==1.19.2
+opencv-python==4.10.0.84
+opt-einsum==3.3.0
+optree==0.12.1
+orderly-set==5.2.2
+packaging==24.1
+pandas==2.2.3
+pathspec==0.9.0
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.4
+pexpect==4.8.0
+pi_heif==0.18.0
+pillow==10.4.0
+platformdirs==4.3.6
+pluggy==0.13.0
+pop-transition==1.1.2
+portalocker==2.10.1
+protobuf==4.25.5
+psutil==5.9.0
+ptyprocess==0.7.0
+py==1.10.0
+pycairo==1.20.1
+pycodestyle==2.8.0
+pycparser==2.22
+pycups==2.0.1
+pydbus==0.6.0
+pyflakes==2.4.0
+Pygments==2.11.2
+PyGObject==3.42.1
+pyinotify==0.9.6
+PyJWT==2.3.0
+pymacaroons==0.13.0
+PyMuPDF==1.24.10
+PyMuPDFb==1.24.10
+PyNaCl==1.5.0
+pyparsing==2.4.7
+pypdf==5.0.1
+pypdfium2==4.30.0
+pyRFC3339==1.1
+pytest==6.2.5
+pytest-cov==3.0.0
+pytest-forked==1.4.0
+pytest-sugar==0.9.4
+pytest-xdist==2.5.0
+python-apt==2.4.0+ubuntu4
+python-dateutil==2.9.0.post0
+python-debian==0.1.43+ubuntu1.1
+python-gnupg==0.4.8
+python-iso639==2024.4.27
+python-magic==0.4.27
+python-multipart==0.0.12
+python-oxmsg==0.0.1
+python-xlib==0.29
+pytz==2022.1
+pyxdg==0.27
+PyYAML==5.4.1
+RapidFuzz==3.10.0
+regex==2024.9.11
+repolib==2.2.1
+repoman==1.4.0
+requests==2.32.3
+requests-toolbelt==1.0.0
+rfc3986==1.5.0
+rich==11.2.0
+safetensors==0.4.5
+scipy==1.14.1
+screen-resolution-extra==0.0.0
+SecretStorage==3.3.1
+sessioninstaller==0.0.0
+shellingham==1.4.0
+six==1.16.0
+sniffio==1.2.0
+socksio==1.0.0
+soupsieve==2.3.1
+sympy==1.13.3
+system76driver==20.4.95
+systemd-python==234
+tabulate==0.9.0
+tensorboard==2.17.1
+tensorboard-data-server==0.7.2
+tensorflow==2.17.0
+tensorflow-io-gcs-filesystem==0.37.1
+termcolor==1.1.0
+timm==1.0.9
+tokenizers==0.20.0
+toml==0.10.2
+tomli==1.2.2
+torch==2.4.1
+torchvision==0.19.1
+tornado==6.1
+tqdm==4.66.5
+transformers==4.45.1
+triton==3.0.0
+typed-ast==1.4.3
+typer==0.4.0
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+ubuntu-drivers-common==0.0.0
+ufw==0.36.1
+unstructured==0.15.13
+unstructured-client==0.25.9
+unstructured-inference==0.7.37
+urllib3==2.2.3
+urwid==2.1.2
+virtualenv==20.26.6
+wadllib==1.3.6
+webencodings==0.5.1
+Werkzeug==3.0.4
+wrapt==1.16.0
+xdg==5
+xkit==0.0.0
+zipp==1.0.0