Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
99 changes: 99 additions & 0 deletions pre.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import fitz # PyMuPDF
import json
import re

# Function to extract and clean PDF content
def extract_pdf_content(pdf_path):
    """Extract the cleaned text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, in document order. Each dict holds
        "page_number" (1-based) and "content" (the page's plain text after
        it has been passed through clean_content).
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file handle) even if extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        return [
            {
                "page_number": page_number,
                "content": clean_content(page.get_text("text")),
            }
            for page_number, page in enumerate(pdf_document, start=1)
        ]

# Function to clean the content
def clean_content(content):
    """Strip page markers and horizontal rules from a page of text.

    Args:
        content: Raw text of a single page.

    Returns:
        The text with "Page N / M" markers and long underscore rules
        removed, and with leading/trailing whitespace stripped.
    """
    # Accept "Page 1/5" as well as "Page 1 / 5": whitespace around the
    # slash is optional.
    without_page_markers = re.sub(r'Page\s+\d+\s*/\s*\d+', '', content)
    # A separator rule is a long run of underscores. Match any run of ten
    # or more instead of one fixed-width literal, so rules of a different
    # length are removed completely rather than left behind (or half
    # removed). Short runs such as those in identifiers are kept.
    without_rules = re.sub(r'_{10,}', '', without_page_markers)
    return without_rules.strip()

# Function to group the page content into a dictionary keyed by section (it is serialized to JSON later)
def structure_content(pdf_content):
    """Group page text under the numbered section headings of the document.

    A line is treated as a heading when, after stripping surrounding
    whitespace, it equals one of the known section titles (compared
    case-insensitively). Every following non-blank line is added to that
    section until the next heading; sections may span page boundaries.
    Text that appears before the first heading is discarded.

    Args:
        pdf_content: Iterable of dicts that each have a "content" string,
            as produced by extract_pdf_content.

    Returns:
        A dict with one entry per known section (always all sixteen keys,
        in section order). Each value is the section's lines joined by
        single spaces, or "" when the section was not found.
    """
    section_titles = {
        "1. Identification": "Identification",
        "2. Hazard(s) identification": "Hazard_Identification",
        "3. Composition/information on ingredients": "Composition",
        "4. First-aid measures": "First_Aid_Measures",
        "5. Fire-fighting measures": "Fire_Fighting_Measures",
        "6. Accidental release measures": "Accidental_Release_Measures",
        "7. Handling and storage": "Handling_and_Storage",
        "8. Exposure controls/personal protection": "Exposure_Controls_Personal_Protection",
        "9. Physical and chemical properties": "Physical_and_Chemical_Properties",
        "10. Stability and reactivity": "Stability_and_Reactivity",
        "11. Toxicological information": "Toxicological_Information",
        "12. Ecological information": "Ecological_Information",
        "13. Disposal considerations": "Disposal_Considerations",
        "14. Transport information": "Transport_Information",
        "15. Regulatory information": "Regulatory_Information",
        "16. Other information": "Other_Information",
    }

    # Case-folded lookup so "1. IDENTIFICATION" is recognized as well.
    heading_lookup = {
        title.casefold(): key for title, key in section_titles.items()
    }
    # Collect lines per section and join once at the end; this avoids
    # repeated string concatenation and the stray trailing/double spaces
    # it produced.
    collected = {key: [] for key in section_titles.values()}

    current_section = None
    for page in pdf_content:
        for raw_line in page["content"].split('\n'):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no text; skipping them keeps the joined
                # output free of doubled spaces.
                continue
            heading_key = heading_lookup.get(line.casefold())
            if heading_key is not None:
                current_section = heading_key
            elif current_section is not None:
                collected[current_section].append(line)

    return {key: ' '.join(parts) for key, parts in collected.items()}

# Function to convert the structured data to JSON
def convert_to_json(data):
    """Serialize *data* to a JSON string indented by four spaces."""
    serialized = json.dumps(data, indent=4)
    return serialized

# Function to save the JSON data to a file
def save_json_to_file(json_data, output_path):
    """Write an already-serialized JSON string to a file.

    Args:
        json_data: The JSON text to write.
        output_path: Destination path; an existing file is overwritten.
    """
    # Pin the encoding: without it open() uses the locale's preferred
    # encoding, so the bytes written (and whether non-ASCII text can be
    # written at all) would vary from machine to machine.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

# Main function to execute the steps
def main(pdf_path, output_json_path):
    """Run the whole pipeline: read the PDF, group its text, write JSON.

    Args:
        pdf_path: Path of the PDF to process.
        output_json_path: Path of the JSON file to write.
    """
    pages = extract_pdf_content(pdf_path)
    sections = structure_content(pages)
    save_json_to_file(convert_to_json(sections), output_json_path)

# Example usage
pdf_path = 'data/acetone-acs-l (1).pdf'
output_json_path = 'structured_output.json'

# Only run the pipeline when this file is executed as a script, so that
# importing the module (e.g. to reuse its functions) has no side effects.
if __name__ == "__main__":
    main(pdf_path, output_json_path)
21 changes: 21 additions & 0 deletions preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from unstructured.partition.pdf import partition_pdf
import json

# Input document for this script.
PDF_PATH = "data/acetone-acs-l (1).pdf"

# partition_pdf returns a List[Element] for the content found in the pages
# of the parsed PDF document.
elements = partition_pdf(PDF_PATH)

print(elements)

# def convert_to_json(data):
# return json.dumps(data, indent=4)

# def save_json_to_file(json_data, output_path):
# with open(output_path, 'w') as json_file:
# json_file.write(json_data)

# def main(el, output_json_path):
# json_data = convert_to_json(el)
# save_json_to_file(json_data, output_json_path)

# output_json_path = 'structured.json'
# main(elements, output_json_path)
228 changes: 228 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
absl-py==2.1.0
anyio==3.5.0
astunparse==1.6.3
attrs==21.2.0
Babel==2.8.0
backoff==2.2.1
bcc==0.18.0
beautifulsoup4==4.10.0
black==21.10b0
blinker==1.4
Brlapi==0.8.3
certifi==2024.8.30
cffi==1.17.1
chardet==4.0.0
charset-normalizer==3.3.2
chrome-gnome-shell==0.0.0
click==8.0.3
colorama==0.4.4
coloredlogs==15.0.1
command-not-found==0.3
commonmark==0.9.1
contourpy==1.3.0
cov-core==1.15.0
coverage==6.2
cryptography==43.0.1
cupshelpers==1.0
cycler==0.12.1
dataclasses-json==0.6.7
dbus-python==1.2.18
deepdiff==8.0.1
defer==1.0.6
distlib==0.3.8
distro==1.7.0
emoji==2.13.2
evdev==1.4.0
execnet==1.9.0
filelock==3.16.1
filetype==1.2.0
flake8==4.0.1
flatbuffers==24.3.25
fonttools==4.53.1
fsspec==2024.9.0
gast==0.6.0
google-pasta==0.2.0
gpg==1.16.0
grpcio==1.66.1
h11==0.13.0
h5py==3.11.0
hidpidaemon==18.4.6
html5lib==1.1
httpcore==1.0.6
httplib2==0.20.2
httpx==0.27.2
huggingface-hub==0.25.1
humanfriendly==10.0
idna==3.10
importlib-metadata==4.6.4
iniconfig==1.1.1
iopath==0.1.10
isort==5.6.4
jeepney==0.7.1
Jinja2==3.0.3
joblib==1.4.2
jsonpath-python==1.0.6
keras==3.5.0
kernelstub==3.1.4
keyring==23.5.0
kiwisolver==1.4.7
langdetect==1.0.9
language-selector==0.1
launchpadlib==1.10.16
layoutparser==0.3.4
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
libclang==18.1.1
livereload==2.6.3
louis==3.20.0
lxml==4.8.0
macaroonbakery==1.3.1
Markdown==3.3.6
MarkupSafe==2.1.5
marshmallow==3.22.0
matplotlib==3.9.2
mccabe==0.6.1
mkdocs==1.1.2
ml-dtypes==0.4.1
more-itertools==8.10.0
mpmath==1.3.0
mypy==0.942
mypy-extensions==1.0.0
nala==0.11.1
namex==0.0.8
nest-asyncio==1.6.0
netaddr==0.8.0
netifaces==0.11.0
networkx==3.3
nltk==3.9.1
nose2==0.9.2
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.77
nvidia-nvtx-cu12==12.1.105
oauthlib==3.2.0
olefile==0.47
onnx==1.17.0
onnxruntime==1.19.2
opencv-python==4.10.0.84
opt-einsum==3.3.0
optree==0.12.1
orderly-set==5.2.2
packaging==24.1
pandas==2.2.3
pathspec==0.9.0
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pexpect==4.8.0
pi_heif==0.18.0
pillow==10.4.0
platformdirs==4.3.6
pluggy==0.13.0
pop-transition==1.1.2
portalocker==2.10.1
protobuf==4.25.5
psutil==5.9.0
ptyprocess==0.7.0
py==1.10.0
pycairo==1.20.1
pycodestyle==2.8.0
pycparser==2.22
pycups==2.0.1
pydbus==0.6.0
pyflakes==2.4.0
Pygments==2.11.2
PyGObject==3.42.1
pyinotify==0.9.6
PyJWT==2.3.0
pymacaroons==0.13.0
PyMuPDF==1.24.10
PyMuPDFb==1.24.10
PyNaCl==1.5.0
pyparsing==2.4.7
pypdf==5.0.1
pypdfium2==4.30.0
pyRFC3339==1.1
pytest==6.2.5
pytest-cov==3.0.0
pytest-forked==1.4.0
pytest-sugar==0.9.4
pytest-xdist==2.5.0
python-apt==2.4.0+ubuntu4
python-dateutil==2.9.0.post0
python-debian==0.1.43+ubuntu1.1
python-gnupg==0.4.8
python-iso639==2024.4.27
python-magic==0.4.27
python-multipart==0.0.12
python-oxmsg==0.0.1
python-xlib==0.29
pytz==2022.1
pyxdg==0.27
PyYAML==5.4.1
RapidFuzz==3.10.0
regex==2024.9.11
repolib==2.2.1
repoman==1.4.0
requests==2.32.3
requests-toolbelt==1.0.0
rfc3986==1.5.0
rich==11.2.0
safetensors==0.4.5
scipy==1.14.1
screen-resolution-extra==0.0.0
SecretStorage==3.3.1
sessioninstaller==0.0.0
shellingham==1.4.0
six==1.16.0
sniffio==1.2.0
socksio==1.0.0
soupsieve==2.3.1
sympy==1.13.3
system76driver==20.4.95
systemd-python==234
tabulate==0.9.0
tensorboard==2.17.1
tensorboard-data-server==0.7.2
tensorflow==2.17.0
tensorflow-io-gcs-filesystem==0.37.1
termcolor==1.1.0
timm==1.0.9
tokenizers==0.20.0
toml==0.10.2
tomli==1.2.2
torch==2.4.1
torchvision==0.19.1
tornado==6.1
tqdm==4.66.5
transformers==4.45.1
triton==3.0.0
typed-ast==1.4.3
typer==0.4.0
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.2
ubuntu-drivers-common==0.0.0
ufw==0.36.1
unstructured==0.15.13
unstructured-client==0.25.9
unstructured-inference==0.7.37
urllib3==2.2.3
urwid==2.1.2
virtualenv==20.26.6
wadllib==1.3.6
webencodings==0.5.1
Werkzeug==3.0.4
wrapt==1.16.0
xdg==5
xkit==0.0.0
zipp==1.0.0
Loading