"""Extract header-delimited text and raw table rows from PDFs and save them as JSON."""

import json
import re

import pandas as pd  # noqa: F401 - imported by the original cell; kept for interactive use

# Section headers used to split the cleaned text.  Every entry is a regular
# expression, so the literal dot in "Cat No." is escaped; unescaped it also
# matched strings such as "Cat Nos".
headers = [
    r"Product Name", r"Cat No\.", r"Company", r"Synonyms"
]

_PAGE_COUNTER = re.compile(r"Page \d+ of \d+")
# Non-greedy and DOTALL: removes from "Disclaimer" up to the first blank line,
# or to the end of the page text when there is none.
_DISCLAIMER = re.compile(r"(?s)Disclaimer.*?(\n\n|\Z)")


def _open_pdf(pdf_path):
    """Open *pdf_path* with pdfplumber, importing the library on first use."""
    # Deferred import: the pure-text helpers below stay importable without pdfplumber.
    import pdfplumber

    return pdfplumber.open(pdf_path)


def _clean_page_text(page_text):
    """Strip the page counter, the "Specification File" label and the disclaimer."""
    page_text = _PAGE_COUNTER.sub("", page_text)
    page_text = page_text.replace("Specification File", "")
    page_text = _DISCLAIMER.sub("", page_text)
    return page_text.strip()


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of *pdf_path*, joined with newlines.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    with _open_pdf(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(text for text in page_texts if text)


def segment_text_based_on_headers(text):
    """Map each header found in *text* to the text that follows it.

    Headers are matched case-insensitively against the module-level
    ``headers`` patterns.  Each key is the header exactly as it appears in
    *text*; its value is the stripped text up to the next header, or to the
    end of *text* for the last one.  Text before the first header is
    discarded, and when the same header spelling occurs more than once the
    last occurrence wins.
    """
    pattern = re.compile("|".join(headers), flags=re.IGNORECASE)
    matches = list(pattern.finditer(text))
    # A section ends where the next header starts; the final one runs to the end.
    section_ends = [match.start() for match in matches[1:]] + [len(text)]
    return {
        match.group().strip(): text[match.end():end].strip()
        for match, end in zip(matches, section_ends)
    }


def extract_table_data(pdf_path):
    """Return the rows of every table on every page of *pdf_path* as one flat list."""
    with _open_pdf(pdf_path) as pdf:
        return [
            row
            for page in pdf.pages
            for table in page.extract_tables()
            for row in table
        ]


def get_cleaned_text_remove_paragraph(pdf_path):
    """Return the cleaned text of *pdf_path*, pages separated by a blank line.

    Pages without text are skipped; the remaining ones have the page counter,
    the "Specification File" label and the disclaimer paragraph removed.
    """
    with _open_pdf(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n\n".join(_clean_page_text(text) for text in page_texts if text)


def save_pdf_data_to_json(pdf_text_path, pdf_table_path, json_file):
    """Write segmented text and table rows to *json_file*.

    The JSON object has two keys: ``"text_data"`` (header -> text, taken from
    *pdf_text_path*) and ``"table_data"`` (list of table rows, taken from
    *pdf_table_path*).
    """
    data = {
        "text_data": segment_text_based_on_headers(
            get_cleaned_text_remove_paragraph(pdf_text_path)
        ),
        "table_data": extract_table_data(pdf_table_path),
    }
    # Explicit encoding so the output does not depend on the platform default.
    with open(json_file, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


# In a notebook cell __name__ is "__main__", so the export still runs there;
# importing this code from elsewhere no longer triggers it.
if __name__ == "__main__":
    pdf_path_text = 'A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf'
    pdf_path_table = 'A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf'
    json_file = 'A:/dp/data_preprocessor/combined_data.json'

    save_pdf_data_to_json(pdf_path_text, pdf_path_table, json_file)
"""Dump page text and link annotations of a sample PDF with pdfplumber and PyMuPDF."""

import fitz  # PyMuPDF
import pdfplumber

DATA_DIR = "A:/dp/data_preprocessor/data"

# The original cell listed these paths as bare string statements after the
# assignment, where they had no effect.  They are kept as a named tuple so a
# different sample can be selected by index; only the first one is processed.
PDF_NAMES = (
    "chloroform-certified-acs-l.pdf",
    "citric-acid-gran-cert-acs-kg.pdf",
    "dichloromethane.pdf",
    "edta--ph--lt.pdf",
    "ethyl-acetate-cr-acs-l.pdf",
    "hexanes-acs-l.pdf",
    "hydrochloric-acid-n-ml.pdf",
    "methanol-cert-acshplc-l (1).pdf",
    "methanol-cert-acshplc-l.pdf",
    "nitric-acid-reagent-acs-l.pdf",
    "optima-propanol-ipa-optima.pdf",
    "phenol-liquid-cert-.pdf",
    "phosphoric-acid-acs--ml.pdf",
    "pot-hydroxide-cert-acs-kg.pdf",
    "pot-permanganate-cr-acs-kg.pdf",
    "sod-chloride-cert-acs-lb.pdf",
    "sodium-hydroxide-g.pdf",
    "sodium-phosphate-dib-purif-kg.pdf",
    "sulf-acid-sol-conc-in-cr-ml.pdf",
    "water-hplc-nowpak-l (1).pdf",
    "water-hplc-nowpak-l.pdf",
    "acetone-acs-l (1).pdf",
    "acetone-acs-l.pdf",
    "acetonitrile-hplc-grade-l (1).pdf",
    "acetonitrile-hplc-grade-l.pdf",
    "ammonium-hydroxide-acs-lb.pdf",
    "buffer-x-tbs-ph--ml.pdf",
)
PDF_DOCUMENTS = tuple(f"{DATA_DIR}/{name}" for name in PDF_NAMES)
pdf_document = PDF_DOCUMENTS[0]

# --- Page text via pdfplumber ---
with pdfplumber.open(pdf_document) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        print(text)

# --- Page text via PyMuPDF, each page preceded by a "Page N:" banner ---
# The context manager closes the document; the original left it open.
with fitz.open(pdf_document) as pdf:
    for page_num, page in enumerate(pdf, start=1):
        text = page.get_text("text")
        print(f"Page {page_num}:\n{text}\n")

# --- (notebook only) a `%%html` cell with a blank body followed here ---

# --- Link annotations per page via PyMuPDF ---
with fitz.open(pdf_document) as doc:
    for page in doc:
        link = page.get_links()
        print(link)
"""Text and table extraction experiments on product PDFs (tables2.ipynb)."""

import re

import pandas as pd

# Forward slashes throughout: two cells spelled these as 'A:\dp\data_...',
# which relies on invalid escape sequences being passed through unchanged.
CITRIC_ACID_PDF = 'A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf'
PHENOL_PDF = 'A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf'

# Section headers used to split the text.  Every entry is a regular
# expression, so the literal dot in "Cat No." is escaped; unescaped it also
# matched strings such as "Cat Nos".
headers = [
    r"Product Name", r"Cat No\.", r"Company", r"Synonyms"
]

_PAGE_COUNTER = re.compile(r"Page \d+ of \d+")
# Non-greedy and DOTALL: removes from "Disclaimer" up to the first blank line,
# or to the end of the page text when there is none.
_DISCLAIMER = re.compile(r"(?s)Disclaimer.*?(\n\n|\Z)")


def _open_pdf(pdf_path):
    """Open *pdf_path* with pdfplumber, importing the library on first use."""
    # Deferred import: the pure-text helpers below stay importable without pdfplumber.
    import pdfplumber

    return pdfplumber.open(pdf_path)


def _table_rows(pdf_path):
    """Return the rows of every table on every page of *pdf_path* as one flat list."""
    with _open_pdf(pdf_path) as pdf:
        return [
            row
            for page in pdf.pages
            for table in page.extract_tables()
            for row in table
        ]


def _first_two_columns(rows):
    """Return ``(row[0], row[1])`` for each row that has at least two cells."""
    return [(row[0], row[1]) for row in rows if len(row) >= 2]


def get_cleaned_text_remove_paragraph(pdf_path):
    """Return the cleaned text of *pdf_path*, pages separated by a blank line.

    The page counter ("Page N of M"), the "Specification File" label and the
    disclaimer paragraph are removed from every page.
    """
    all_text = []
    with _open_pdf(pdf_path) as pdf:
        for page in pdf.pages:
            # Substitute "" when extract_text() yields None so re.sub cannot raise TypeError.
            page_text = page.extract_text() or ""
            page_text = _PAGE_COUNTER.sub("", page_text)
            page_text = page_text.replace("Specification File", "")
            page_text = _DISCLAIMER.sub("", page_text)
            all_text.append(page_text.strip())
    return "\n\n".join(all_text)


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of *pdf_path*, joined with newlines.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    with _open_pdf(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(text for text in page_texts if text)


def segment_text_based_on_headers(text):
    """Map each header found in *text* to the text that follows it.

    Headers are matched case-insensitively against the module-level
    ``headers`` patterns.  Each key is the header exactly as it appears in
    *text*; its value is the stripped text up to the next header, or to the
    end of *text* for the last one.  When the same header spelling occurs
    more than once the last occurrence wins.
    """
    pattern = re.compile("|".join(headers), flags=re.IGNORECASE)
    matches = list(pattern.finditer(text))
    # A section ends where the next header starts; the final one runs to the end.
    section_ends = [match.start() for match in matches[1:]] + [len(text)]
    return {
        match.group().strip(): text[match.end():end].strip()
        for match, end in zip(matches, section_ends)
    }


def load_pdf_data(pdf_file):
    """Return ``(texts, labels)`` for *pdf_file*: section texts and their headers, in order."""
    segmented_text = segment_text_based_on_headers(extract_text_from_pdf(pdf_file))
    return list(segmented_text.values()), list(segmented_text.keys())


# In a notebook __name__ is "__main__", so the experiments below still run
# there; names assigned here remain module-level globals as before.
if __name__ == "__main__":
    import PyPDF2  # noqa: F401 - imported by the original cell
    from IPython.display import HTML, display  # IPython.core.display is the deprecated location
    from PyPDF2 import PdfReader

    # --- Page text via PyPDF2, one DataFrame row per page ---
    with open(CITRIC_ACID_PDF, 'rb') as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        # Read every page.  The original loop started at index 1, dropping the
        # first page while the 'pages' column below still counted from 1.
        # NOTE(review): restore a skip here if omitting page 1 was intentional.
        Texts = [page.extract_text() or "" for page in reader.pages]

    output = pd.DataFrame({
        'pages': range(1, len(Texts) + 1),
        'text': Texts,
    })
    output.head()

    # --- Split each page into lines and report the longest page, in lines ---
    output['split'] = output['text'].apply(lambda t: t.split('\n'))
    len_max = output['split'].apply(len).max()
    print(len_max)
    output.head()

    # --- Raw tables per page ---
    with _open_pdf(CITRIC_ACID_PDF) as pdf:
        for page in pdf.pages:
            print(page.extract_tables())

    # --- First two columns of every table row (printed) ---
    table_data = _first_two_columns(_table_rows(CITRIC_ACID_PDF))
    output = pd.DataFrame(table_data, columns=['Header', 'Value'])
    print(output.head())

    # --- First two columns of every table row (rich display) ---
    table_data = _first_two_columns(_table_rows(CITRIC_ACID_PDF))
    output = pd.DataFrame(table_data, columns=['Header', 'Value'])
    html_style = "\n\n"  # the style block is blank in the visible source
    display(HTML(html_style))
    display(output)

    # --- All table values, rows padded with None to a common width ---
    table_data = _table_rows(PHENOL_PDF)
    # default=0 keeps this from raising ValueError when the PDF has no tables.
    max_columns = max((len(row) for row in table_data), default=0)
    columns = [f'Column {i+1}' for i in range(max_columns)]
    output = pd.DataFrame(
        [row + [None] * (max_columns - len(row)) for row in table_data],
        columns=columns,
    )
    html_style = "\n\n"
    display(HTML(html_style))
    display(output)

    # --- Text with page counter, file label and disclaimer removed ---
    cleaned_text = get_cleaned_text_remove_paragraph(CITRIC_ACID_PDF)
    print(cleaned_text)

    # --- Header-segmented text as a DataFrame ---
    pdf_path = CITRIC_ACID_PDF
    texts, labels = load_pdf_data(pdf_path)
    df = pd.DataFrame({
        'Header': labels,
        'Text': texts,
    })
    print(df.head())
"""Render PDF pages to PNG images, preview them, and OCR one page with Tesseract (tess.ipynb)."""

import json
import os
import subprocess
import sys


def _pip_install(*packages):
    """Install *packages* into the interpreter that is running this code."""
    # `python -m pip` targets the current interpreter; a bare `pip install`
    # line only works through IPython's automagic.
    subprocess.run([sys.executable, "-m", "pip", "install", *packages])


# Install before importing: the imports below need these packages.
_pip_install("pymupdf", "pillow")

import fitz  # PyMuPDF  # noqa: E402
from PIL import Image  # noqa: E402


def convert_pdf_to_images(pdf_path, output_folder="output_images"):
    """Render every page of *pdf_path* to ``<output_folder>/page_<n>.png``.

    Pages are numbered from 1.  The folder is created when missing, and the
    path of each saved image is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document; the original left it open.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap()  # render the page to a pixel map
            image_path = f"{output_folder}/page_{page_num}.png"
            pix.save(image_path)
            print(f"Saved {image_path}")


# Usage
convert_pdf_to_images("/content/acetone-acs-l.pdf")

# --- Preview the first nine rendered pages in a 3x3 grid ---
import matplotlib.pyplot as plt  # noqa: E402

image_paths = [f'output_images/page_{n}.png' for n in range(1, 10)]

plt.figure(figsize=(15, 10))
for i, image_path in enumerate(image_paths, start=1):
    img = Image.open(image_path)

    plt.subplot(3, 3, i)  # adjust the grid size if the page count changes
    plt.imshow(img)
    plt.axis('off')
    plt.title(f'Image {i}')

plt.tight_layout()
plt.show()

# --- Install the Tesseract binary and its Python wrapper ---
# -y answers the confirmation prompt so the install cannot wait for input.
subprocess.run(["apt-get", "install", "-y", "tesseract-ocr"])
_pip_install("pytesseract")

# --- OCR one page image and save the text to Google Drive ---
import pytesseract  # noqa: E402
from google.colab import drive  # noqa: E402

drive.mount('/content/drive')

# NOTE(review): convert_pdf_to_images writes to output_images/page_1.png, while
# this reads /content/page_1.png - confirm which file is meant.
IMAGE_PATH = '/content/page_1.png'
img = Image.open(IMAGE_PATH)

result = pytesseract.image_to_string(img)

# Collapse the OCR output onto one line.
cleaned_result = result.replace('\n', ' ').strip()
print(cleaned_result)

json_file_path = '/content/drive/My Drive/extracted_text.json'

# Explicit encoding so the output does not depend on the platform default.
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump({"extracted_text": cleaned_result}, json_file, indent=4)

print(f"Extracted text saved to {json_file_path}")