Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/venv
730 changes: 730 additions & 0 deletions acetonitrile_sds_with_subheadings.json

Large diffs are not rendered by default.

682 changes: 682 additions & 0 deletions data_preprocessor.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions prepocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import fitz
import json
import re

def extract_text_with_structure(pdf_path):
    """Extract every non-empty text line of a PDF together with basic styling.

    Each page is read with ``page.get_text("dict")`` and every line of every
    block that has a ``'lines'`` entry is flattened into one record. Lines
    that start with "Page <number>" are treated as page furniture and dropped.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of dicts in document order, each with the keys ``"text"``
        (the stripped line text), ``"font_size"`` (size of the line's first
        span), ``"bold"`` (whether the first span is flagged bold) and
        ``"page"`` (1-based page number).
    """
    # In PyMuPDF span flags, bit 4 (value 16) marks bold text. Bit 1
    # (value 2) is italic, so testing `& 2` would report italic lines as bold.
    bold_flag = 1 << 4
    page_marker = re.compile(r'^\s*Page\s+\d+')
    extracted_data = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            for block in page.get_text("dict")['blocks']:
                # Blocks without a 'lines' entry have no text to extract.
                for line in block.get('lines', ()):
                    spans = line['spans']
                    if not spans:
                        # Nothing to read styling from; avoids IndexError.
                        continue

                    text_line = ''.join(span['text'] for span in spans)
                    if not text_line.strip() or page_marker.match(text_line):
                        continue

                    # Styling is taken from the first span of the line.
                    first_span = spans[0]
                    extracted_data.append({
                        "text": text_line.strip(),
                        "font_size": first_span['size'],
                        "bold": bool(first_span['flags'] & bold_flag),
                        "page": page_num + 1,
                    })

    return extracted_data

def process_extracted_text_with_subheadings(extracted_data):
    """Group extracted lines into sections and subheadings.

    A line that looks like ``"<number>. <letters and spaces>"`` opens a
    section. Inside a section, a bold line opens a subheading and every
    non-bold line is appended to the current subheading, or to the section's
    ``"content"`` list when no subheading is open. Lines that appear before
    the first section heading are ignored.

    A section title or subheading that occurs more than once keeps what was
    already collected under it; later lines are appended rather than the
    earlier ones being discarded.

    Args:
        extracted_data: Iterable of dicts with at least the keys ``"text"``
            (str) and ``"bold"`` (truthy for bold lines).

    Returns:
        A dict mapping section title -> dict mapping subheading (or
        ``"content"``) -> list of text lines.
    """
    section_heading = re.compile(r'^\d+\.\s+[A-Za-z\s]+$')
    json_output = {}
    current_section = None
    current_subheading = None

    for item in extracted_data:
        text = item['text']

        if section_heading.match(text):
            current_section = text
            # setdefault: a repeated heading must not wipe the section.
            json_output.setdefault(current_section, {})
            current_subheading = None
            continue

        if current_section is None:
            # No section opened yet, so there is nowhere to file this line.
            continue

        section = json_output[current_section]
        if item['bold']:
            # Bold text inside a section is treated as a subheading.
            current_subheading = text
            section.setdefault(current_subheading, [])
        else:
            bucket = current_subheading if current_subheading else "content"
            section.setdefault(bucket, []).append(text)

    return json_output

def save_json_to_file(json_data, output_file):
    """Write ``json_data`` to ``output_file`` as JSON indented by four spaces.

    The target file is opened in text write mode, so any existing content is
    replaced.
    """
    with open(output_file, 'w') as handle:
        json.dump(json_data, fp=handle, indent=4)

def process_pdf_to_json_with_subheadings(pdf_path, output_file):
    """Run the whole pipeline: extract PDF lines, structure them, save JSON.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_with_structure``.
        output_file: Path of the JSON file written by ``save_json_to_file``.
    """
    structured = process_extracted_text_with_subheadings(
        extract_text_with_structure(pdf_path)
    )
    save_json_to_file(structured, output_file)

    # Report completion on stdout once the file has been written.
    print(f"PDF data successfully converted to JSON with subheadings and saved to {output_file}")

# Example run: convert the bundled acetonitrile safety data sheet to JSON.
# NOTE(review): these statements execute whenever the module is loaded,
# including on import; consider an `if __name__ == "__main__":` guard.
pdf_path = "./data/acetonitrile-hplc-grade-l (1).pdf"
output_file = "acetonitrile_sds_with_subheadings.json"

process_pdf_to_json_with_subheadings(pdf_path=pdf_path, output_file=output_file)
109 changes: 109 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
aiohttp==3.9.5
aiohttp-retry==2.8.3
aiosignal==1.3.1
altgraph==0.17.4
attrs==23.2.0
beautifulsoup4==4.12.2
blinker==1.8.2
bs4==0.0.1
cachetools==5.3.2
certifi==2023.11.17
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.3.2
ci-info==0.3.0
click==8.1.7
colorama==0.4.6
configobj==5.0.9
configparser==7.1.0
cryptography==43.0.1
cssselect==1.2.0
cssutils==2.11.1
decorator==5.1.1
dnspython==2.6.1
etelemetry==0.3.1
filelock==3.16.1
fitz==0.0.1.dev2
Flask==3.0.3
frozenlist==1.4.1
future==1.0.0
geocoder==1.38.1
google-api-core==2.14.0
google-api-python-client==2.109.0
google-auth==2.24.0
google-auth-httplib2==0.1.1
google-auth-oauthlib==1.1.0
googleapis-common-protos==1.61.0
httplib2==0.22.0
idna==3.6
isodate==0.6.1
itsdangerous==2.2.0
Jinja2==3.1.4
looseversion==1.3.0
lxml==4.9.3
MarkupSafe==2.1.5
more-itertools==10.3.0
multidict==6.0.5
networkx==3.3
nibabel==5.2.1
nipype==1.8.6
numpy==2.0.0
oauthlib==3.2.2
opencv-python==4.10.0.84
opencv-python-headless==4.10.0.84
packaging==24.0
pandas==2.2.2
pathlib==1.0.1
pdfminer.six==20231228
pdfplumber==0.11.4
pefile==2023.2.7
pillow==10.3.0
premailer==3.10.0
protobuf==4.25.1
prov==2.0.1
psycopg2==2.9.9
pyasn1==0.5.1
pyasn1-modules==0.3.0
PyAudio==0.2.14
pycparser==2.22
pydot==3.0.2
pyinstaller==6.7.0
pyinstaller-hooks-contrib==2024.6
PyJWT==2.8.0
pymongo==4.8.0
pyparsing==3.1.1
pypdfium2==4.30.0
pyperclip==1.9.0
PyQt5==5.15.10
PyQt5-Qt5==5.15.2
PyQt5-sip==12.13.0
PyQtWebEngine==5.15.6
PyQtWebEngine-Qt5==5.15.2
pytesseract==0.3.10
python-dateutil==2.9.0.post0
python-dotenv==1.0.0
pytz==2024.1
pywin32-ctypes==0.2.2
pyxnat==1.6.2
ratelim==0.1.6
rdflib==6.3.2
reportlab==4.2.2
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
scipy==1.14.1
setuptools==70.0.0
simplejson==3.19.3
six==1.16.0
soupsieve==2.5
SpeechRecognition==3.10.4
tk==0.1.0
traits==6.3.2
twilio==9.2.3
typing_extensions==4.12.2
tzdata==2024.1
uritemplate==4.1.1
urllib3==2.1.0
Werkzeug==3.0.3
yagmail==0.15.293
yarl==1.9.4
Loading