diff --git a/json.ipynb b/json.ipynb
new file mode 100644
index 00000000..13d276ab
--- /dev/null
+++ b/json.ipynb
@@ -0,0 +1,100 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfplumber\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import json\n",
+ "\n",
+ "headers = [\n",
+ " r\"Product Name\", r\"Cat No.\", r\"Company\", r\"Synonyms\"\n",
+ "]\n",
+ "\n",
+ "def extract_text_from_pdf(pdf_path):\n",
+ " with pdfplumber.open(pdf_path) as pdf:\n",
+ " full_text = []\n",
+ " for page in pdf.pages:\n",
+ " page_text = page.extract_text()\n",
+ " if page_text:\n",
+ " full_text.append(page_text)\n",
+ " return \"\\n\".join(full_text)\n",
+ "\n",
+ "def segment_text_based_on_headers(text):\n",
+ " segments = re.split('|'.join(headers), text, flags=re.IGNORECASE)\n",
+ " headers_found = re.findall('|'.join(headers), text, flags=re.IGNORECASE)\n",
+ " \n",
+ " categorized_text = {}\n",
+ " for i, header in enumerate(headers_found):\n",
+ " categorized_text[header.strip()] = segments[i + 1].strip() if i + 1 < len(segments) else \"\"\n",
+ " \n",
+ " return categorized_text\n",
+ "\n",
+ "def extract_table_data(pdf_path):\n",
+ " table_data = []\n",
+ " with pdfplumber.open(pdf_path) as pdf:\n",
+ " for page in pdf.pages:\n",
+ " tables = page.extract_tables()\n",
+ " for table in tables:\n",
+ " table_data.extend(table)\n",
+ " return table_data\n",
+ "\n",
+ "def get_cleaned_text_remove_paragraph(pdf_path):\n",
+ " with pdfplumber.open(pdf_path) as pdf:\n",
+ " all_text = []\n",
+ " for page in pdf.pages:\n",
+ " page_text = page.extract_text()\n",
+ " if page_text:\n",
+ " page_text = re.sub(r\"Page \\d+ of \\d+\", \"\", page_text) \n",
+ " page_text = re.sub(r\"Specification File\", \"\", page_text)\n",
+ " page_text = re.sub(r\"(?s)Disclaimer.*?(\\n\\n|\\Z)\", \"\", page_text) \n",
+ " all_text.append(page_text.strip())\n",
+ " return \"\\n\\n\".join(all_text)\n",
+ "\n",
+ "def save_pdf_data_to_json(pdf_text_path, pdf_table_path, json_file):\n",
+ " cleaned_text = get_cleaned_text_remove_paragraph(pdf_text_path)\n",
+ " segmented_text = segment_text_based_on_headers(cleaned_text)\n",
+ " table_data = extract_table_data(pdf_table_path)\n",
+ " \n",
+ " data = {\n",
+ " \"text_data\": segmented_text,\n",
+ " \"table_data\": table_data\n",
+ " }\n",
+ " \n",
+ " with open(json_file, \"w\") as file:\n",
+ " json.dump(data, file, indent=4)\n",
+ "\n",
+ "pdf_path_text = 'A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf'\n",
+ "pdf_path_table = 'A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf'\n",
+ "json_file = 'A:/dp/data_preprocessor/combined_data.json'\n",
+ "\n",
+ "save_pdf_data_to_json(pdf_path_text, pdf_path_table, json_file)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/page_1.png b/page_1.png
new file mode 100644
index 00000000..8f3939ef
Binary files /dev/null and b/page_1.png differ
diff --git a/page_2.png b/page_2.png
new file mode 100644
index 00000000..f3b81a1a
Binary files /dev/null and b/page_2.png differ
diff --git a/page_3.png b/page_3.png
new file mode 100644
index 00000000..9e691cac
Binary files /dev/null and b/page_3.png differ
diff --git a/page_4.png b/page_4.png
new file mode 100644
index 00000000..b7ab3df0
Binary files /dev/null and b/page_4.png differ
diff --git a/page_5.png b/page_5.png
new file mode 100644
index 00000000..0f473877
Binary files /dev/null and b/page_5.png differ
diff --git a/page_6.png b/page_6.png
new file mode 100644
index 00000000..8a0614d0
Binary files /dev/null and b/page_6.png differ
diff --git a/page_7.png b/page_7.png
new file mode 100644
index 00000000..d8c9c818
Binary files /dev/null and b/page_7.png differ
diff --git a/page_8.png b/page_8.png
new file mode 100644
index 00000000..141e79ff
Binary files /dev/null and b/page_8.png differ
diff --git a/page_9.png b/page_9.png
new file mode 100644
index 00000000..157b05a7
Binary files /dev/null and b/page_9.png differ
diff --git a/tables2.ipynb b/tables2.ipynb
new file mode 100644
index 00000000..ca2c786a
--- /dev/null
+++ b/tables2.ipynb
@@ -0,0 +1,432 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfplumber\n",
+ "\n",
+ "with pdfplumber.open('A:/dp/data_preprocessor/data/chloroform-certified-acs-l.pdf') as pdf:\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " \n",
+ " text = page.extract_text()\n",
+ " print(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import fitz\n",
+ "pdf_document = \"A:/dp/data_preprocessor/data/chloroform-certified-acs-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/dichloromethane.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/edta--ph--lt.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/ethyl-acetate-cr-acs-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/hexanes-acs-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/hydrochloric-acid-n-ml.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/methanol-cert-acshplc-l (1).pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/methanol-cert-acshplc-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/nitric-acid-reagent-acs-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/optima-propanol-ipa-optima.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/phosphoric-acid-acs--ml.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/pot-hydroxide-cert-acs-kg.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/pot-permanganate-cr-acs-kg.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/sod-chloride-cert-acs-lb.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/sodium-hydroxide-g.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/sodium-phosphate-dib-purif-kg.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/sulf-acid-sol-conc-in-cr-ml.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/water-hplc-nowpak-l (1).pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/water-hplc-nowpak-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/acetone-acs-l (1).pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/acetone-acs-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/acetonitrile-hplc-grade-l (1).pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/acetonitrile-hplc-grade-l.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/ammonium-hydroxide-acs-lb.pdf\"\n",
+ "\"A:/dp/data_preprocessor/data/buffer-x-tbs-ph--ml.pdf\"\n",
+ "\n",
+ "\n",
+ "pdf = fitz.open(pdf_document)\n",
+ "\n",
+ "\n",
+ "for page_num in range(len(pdf)):\n",
+ " page = pdf.load_page(page_num) \n",
+ " text = page.get_text(\"text\") \n",
+ " print(f\"Page {page_num + 1}:\\n{text}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "%%html\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n",
+ "[]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import fitz\n",
+ "doc = fitz.open('A:/dp/data_preprocessor/data/chloroform-certified-acs-l.pdf')\n",
+ "for i in range(doc.page_count):\n",
+ " page = doc.load_page(i)\n",
+ " link = page.get_links()\n",
+ " print(link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import PyPDF2\n",
+ "import pandas as pd\n",
+ "from PyPDF2 import PdfReader\n",
+ "Texts = []\n",
+ "with open('A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf', 'rb') as file:\n",
+ " reader = PdfReader(file)\n",
+ " num_pages = len(reader.pages)\n",
+ " \n",
+ "\n",
+ " \n",
+ " for p in range(1, num_pages):\n",
+ " page = reader.pages[p] \n",
+ " Texts.append(page.extract_text()) \n",
+ "\n",
+ "\n",
+ "output=pd.DataFrame({\n",
+ " 'pages':range(1,len(Texts)+1),\n",
+ " 'text':Texts\n",
+ "})\n",
+ "output.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output['split']=output['text'].apply(lambda t:t.split('\\n'))\n",
+ "len_max=output['split'].apply(lambda t:len(t)).max()\n",
+ "print(len_max)\n",
+ "output.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfplumber\n",
+ "with pdfplumber.open('A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf') as pdf:\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " print(page.extract_tables())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfplumber\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "table_data = []\n",
+ "\n",
+ "\n",
+ "with pdfplumber.open('A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf') as pdf:\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " \n",
+ " tables = page.extract_tables()\n",
+ " \n",
+ " # Process each table found on the page\n",
+ " for table in tables:\n",
+ " for row in table:\n",
+ " \n",
+ " if len(row) >= 2: \n",
+ " key = row[0] \n",
+ " value = row[1] \n",
+ " table_data.append((key, value)) \n",
+ "\n",
+ "\n",
+ "output = pd.DataFrame(table_data, columns=['Header', 'Value'])\n",
+ "\n",
+ "\n",
+ "print(output.head()) \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#extracting only 2 columns\n",
+ "import pdfplumber\n",
+ "import pandas as pd\n",
+    "from IPython.display import display, HTML\n",
+ "\n",
+ "\n",
+ "table_data = []\n",
+ "\n",
+ "\n",
+ "with pdfplumber.open('A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf') as pdf:\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " \n",
+ " tables = page.extract_tables()\n",
+ " \n",
+ " \n",
+ " for table in tables:\n",
+ " for row in table:\n",
+ " \n",
+ " if len(row) >= 2: \n",
+ " key = row[0] \n",
+ " value = row[1] \n",
+ " table_data.append((key, value)) \n",
+ "\n",
+ "\n",
+ "output = pd.DataFrame(table_data, columns=['Header', 'Value'])\n",
+ "\n",
+ "\n",
+ "html_style = \"\"\"\n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "\n",
+ "display(HTML(html_style))\n",
+ "display(output)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#extracting all the table values and columns\n",
+ "import pdfplumber\n",
+ "import pandas as pd\n",
+    "from IPython.display import display, HTML\n",
+ "\n",
+ "\n",
+ "table_data = []\n",
+ "\n",
+ "\n",
+    "with pdfplumber.open('A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf') as pdf:\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " \n",
+ " tables = page.extract_tables()\n",
+ " \n",
+ " \n",
+ " for table in tables:\n",
+ " table_data.extend(table)\n",
+ "\n",
+ "max_columns = max(len(row) for row in table_data)\n",
+ "\n",
+ "\n",
+ "columns = [f'Column {i+1}' for i in range(max_columns)]\n",
+ "\n",
+ "\n",
+ "output = pd.DataFrame([row + [None] * (max_columns - len(row)) for row in table_data], columns=columns)\n",
+ "\n",
+ "\n",
+ "html_style = \"\"\"\n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "display(HTML(html_style))\n",
+ "display(output)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#to remove disclaimer paragraph, header ,footer \n",
+ "import pdfplumber\n",
+ "import re\n",
+ "\n",
+ "def get_cleaned_text_remove_paragraph(pdf_path):\n",
+ " with pdfplumber.open(pdf_path) as pdf:\n",
+ " all_text = []\n",
+ "\n",
+ " for page in pdf.pages:\n",
+ " \n",
+ " page_text = page.extract_text()\n",
+ "\n",
+ " page_text = re.sub(r\"Page \\d+ of \\d+\", \"\", page_text) \n",
+ " page_text = re.sub(r\"Specification File\", \"\", page_text) \n",
+ "\n",
+ " \n",
+ " page_text = re.sub(r\"(?s)Disclaimer.*?(\\n\\n|\\Z)\", \"\", page_text) \n",
+ " \n",
+ " \n",
+ " all_text.append(page_text.strip()) \n",
+ "\n",
+ "\n",
+ " return \"\\n\\n\".join(all_text)\n",
+ "\n",
+ "\n",
+ "cleaned_text = get_cleaned_text_remove_paragraph('A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf')\n",
+ "print(cleaned_text)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfplumber\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "\n",
+ "\n",
+ "headers = [\n",
+ " r\"Product Name\", r\"Cat No.\", r\"Company\", r\"Synonyms\"\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def extract_text_from_pdf(pdf_path):\n",
+ " with pdfplumber.open(pdf_path) as pdf:\n",
+ " full_text = []\n",
+ " \n",
+ " for page in pdf.pages:\n",
+ " page_text = page.extract_text()\n",
+ " if page_text:\n",
+ " full_text.append(page_text)\n",
+ " \n",
+ " return \"\\n\".join(full_text) \n",
+ "\n",
+ "\n",
+ "def segment_text_based_on_headers(text):\n",
+ " segments = re.split('|'.join(headers), text, flags=re.IGNORECASE)\n",
+ " headers_found = re.findall('|'.join(headers), text, flags=re.IGNORECASE)\n",
+ " \n",
+ "\n",
+ " categorized_text = {}\n",
+ " for i, header in enumerate(headers_found):\n",
+ " categorized_text[header.strip()] = segments[i + 1].strip() if i + 1 < len(segments) else \"\"\n",
+ " \n",
+ " return categorized_text\n",
+ "\n",
+ "\n",
+ "def load_pdf_data(pdf_file):\n",
+ " full_text = extract_text_from_pdf(pdf_file)\n",
+ " \n",
+ "\n",
+ " segmented_text = segment_text_based_on_headers(full_text)\n",
+ " \n",
+ "\n",
+ " texts = list(segmented_text.values()) \n",
+ " labels = list(segmented_text.keys()) \n",
+ " \n",
+ " return texts, labels\n",
+ "\n",
+ "\n",
+    "pdf_path = 'A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf'\n",
+ "\n",
+ "\n",
+ "texts, labels = load_pdf_data(pdf_path)\n",
+ "\n",
+ "\n",
+ "df = pd.DataFrame({\n",
+ " 'Header': labels,\n",
+ " 'Text': texts\n",
+ "})\n",
+ "\n",
+ "\n",
+ "print(df.head())\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tess.ipynb b/tess.ipynb
new file mode 100644
index 00000000..de292e1b
--- /dev/null
+++ b/tess.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "pip install pymupdf pillow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#pdf pages converted into images and saved\n",
+ "import fitz # PyMuPDF\n",
+ "from PIL import Image\n",
+ "\n",
+ "def convert_pdf_to_images(pdf_path, output_folder=\"output_images\"):\n",
+ " # Open the PDF\n",
+ " pdf_document = fitz.open(pdf_path)\n",
+ " \n",
+ " # Create output folder if it doesn't exist\n",
+ " import os\n",
+ " os.makedirs(output_folder, exist_ok=True)\n",
+ " \n",
+ " \n",
+ " for page_num in range(len(pdf_document)):\n",
+ " page = pdf_document.load_page(page_num) # Load page\n",
+ " pix = page.get_pixmap() # Render page to an image (pixel map)\n",
+ " \n",
+ " # Save the image\n",
+ " image_path = f\"{output_folder}/page_{page_num + 1}.png\"\n",
+ " pix.save(image_path)\n",
+ " print(f\"Saved {image_path}\")\n",
+ "\n",
+ "# Usage\n",
+ "convert_pdf_to_images(\"/content/acetone-acs-l.pdf\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from PIL import Image\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# List of image paths\n",
+ "image_paths = [\n",
+ " 'output_images/page_1.png',\n",
+ " 'output_images/page_2.png',\n",
+ " 'output_images/page_3.png',\n",
+ " 'output_images/page_4.png',\n",
+ " 'output_images/page_5.png',\n",
+ " 'output_images/page_6.png',\n",
+ " 'output_images/page_7.png',\n",
+ " 'output_images/page_8.png',\n",
+ " 'output_images/page_9.png'\n",
+ "]\n",
+ "\n",
+ "# Set up the plot\n",
+ "plt.figure(figsize=(15, 10))\n",
+ "\n",
+ "# Loop through each image and display it\n",
+ "for i, image_path in enumerate(image_paths, start=1):\n",
+ " img = Image.open(image_path)\n",
+ " \n",
+ " plt.subplot(3, 3, i) # Adjust the grid size if needed\n",
+ " plt.imshow(img)\n",
+ " plt.axis('off') # Turn off axis labels\n",
+ " plt.title(f'Image {i}') # Add a title for each image\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!apt-get install tesseract-ocr\n",
+ "!pip install pytesseract\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#using pytesseract to detect key value pairs\n",
+ "import pytesseract\n",
+ "from PIL import Image\n",
+ "import json\n",
+ "from google.colab import drive\n",
+ "\n",
+ "# Mount Google Drive\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# Load the image using PIL\n",
+ "IMAGE_PATH = '/content/page_1.png'\n",
+ "img = Image.open(IMAGE_PATH)\n",
+ "\n",
+ "# Perform OCR using Tesseract\n",
+ "result = pytesseract.image_to_string(img)\n",
+ "\n",
+ "# Remove newline characters from the extracted text\n",
+ "cleaned_result = result.replace('\\n', ' ').strip() # Replaces new lines with space and strips leading/trailing spaces\n",
+ "\n",
+ "# Print the cleaned extracted text\n",
+ "print(cleaned_result)\n",
+ "\n",
+ "# Specify the JSON file path in your Google Drive\n",
+ "json_file_path = '/content/drive/My Drive/extracted_text.json'\n",
+ "\n",
+ "# Save the cleaned extracted text to a JSON file\n",
+ "with open(json_file_path, 'w') as json_file:\n",
+ " json.dump({\"extracted_text\": cleaned_result}, json_file, indent=4)\n",
+ "\n",
+ "print(f\"Extracted text saved to {json_file_path}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}