|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Colab Edition\n", |
| 8 | + "\n", |
| 9 | + "<div align=\"center\">\n", |
| 10 | + "\n", |
| 11 | + "[](https://github.com/ipitio/ocr-pdf)\n", |
| 12 | + "\n", |
| 13 | + "<h1><a href=\"https://github.com/ipitio/ocr-pdf\" target=\"_blank\" rel=\"noopener noreferrer\">\n", |
| 14 | + " ocr2pdf\n", |
| 15 | + "</a></h1>\n", |
| 16 | + "\n", |
| 17 | + "**Convert images and scans to searchable PDFs!**\n", |
| 18 | + "\n", |
| 19 | + "---\n", |
| 20 | + "\n", |
| 21 | + "[Container image](https://github.com/ipitio/ocr-pdf/pkgs/container/ocr-pdf) [Publish workflow](https://github.com/ipitio/ocr-pdf/actions/workflows/publish.yml)\n", |
| 22 | + "\n", |
| 23 | + "</div>\n", |
| 24 | + "\n", |
| 25 | + "This notebook is meant to be run on [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb). It converts your files and can optionally save the results to the `ocr-pdf` folder in your [Drive](https://drive.google.com/drive/my-drive). See the project page linked above for more information.\n", |
| 26 | + "\n", |
| 27 | + "## Steps\n", |
| 28 | + "\n", |
| 29 | + "1. Make two new folders, one inside the other\n", |
| 30 | + " - The outer one can be named anything, say `pdf`\n", |
| 31 | + " - The inner one must be named `todo`\n", |
| 32 | + "2. Place your files in the `todo` folder\n", |
| 33 | + " - Files placed directly in `todo` are simply converted\n", |
| 34 | + " - Files inside subfolders of `todo` are converted and also merged in alphabetical order\n", |
| 35 | + "3. Share the outer `pdf` folder with this notebook\n", |
| 36 | + " - Zip the folder\n", |
| 37 | + " - Open this notebook in [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb)\n", |
| 38 | + " - Run the cell below to be prompted to connect Drive and upload the zip\n", |
| 39 | + "\n", |
| 40 | + "You'll be offered the converted (and merged) files to download locally (a single file directly, otherwise as a zip), whether or not Drive was connected.\n" |
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": null, |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [], |
| 48 | + "source": [ |
| 49 | + "import os\n", |
| 50 | + "import shutil\n", |
| 51 | + "from google.colab import drive, files\n", |
| 52 | + "\n", |
| 53 | + "# Connect to Drive; a declined or failed mount only disables the Drive copy\n", |
| 54 | + "try:\n", |
| 55 | + "    drive.mount(\"/content/drive\", force_remount=True)\n", |
| 56 | + "    use_drive = True\n", |
| 57 | + "except Exception as err:\n", |
| 58 | + "    print(f\"Drive not connected, results will only be offered for download: {err}\")\n", |
| 59 | + "    use_drive = False\n", |
| 60 | + "\n", |
| 61 | + "# Extract your PDFs\n", |
| 62 | + "files.upload()\n", |
| 63 | + "pdfs = [pdf for pdf in os.listdir() if pdf.endswith(\".zip\")]\n", |
| 64 | + "if not pdfs:\n", |
| 65 | + "    raise Exception(\"No ZIP file found\")\n", |
| 66 | + "\n", |
| 67 | + "# Transform them\n", |
| 68 | + "%pip install udocker\n", |
| 69 | + "!udocker --allow-root install\n", |
| 70 | + "\n", |
| 71 | + "for pdf in pdfs:\n", |
| 72 | + "    # pdf.zip must unpack to pdf/ (holding todo/); shell-style variables are not expanded inside Python strings, so build real paths\n", |
| 73 | + "    folder = os.path.abspath(os.path.splitext(pdf)[0])\n", |
| 74 | + "    done = os.path.join(folder, \"done\")\n", |
| 75 | + "    !unzip -o \"$pdf\" && rm -f \"$pdf\"\n", |
| 76 | + "    !udocker --allow-root run -v \"$folder\":/app/pdf ghcr.io/ipitio/ocr-pdf bash predict.sh pdf\n", |
| 77 | + "    converted = os.listdir(done) if os.path.isdir(done) else []\n", |
| 78 | + "\n", |
| 79 | + "    # And load\n", |
| 80 | + "    if use_drive and converted:\n", |
| 81 | + "        shutil.copytree(done, \"/content/drive/MyDrive/ocr-pdf\", dirs_exist_ok=True)\n", |
| 82 | + "\n", |
| 83 | + "    if len(converted) == 1 and os.path.isfile(os.path.join(done, converted[0])):\n", |
| 84 | + "        files.download(os.path.join(done, converted[0]))\n", |
| 85 | + "    elif converted:\n", |
| 86 | + "        files.download(shutil.make_archive(os.path.join(folder, \"converted\"), \"zip\", done))  # kept out of the working directory so a re-run does not pick it up as input\n", |
| 87 | + "    else:\n", |
| 88 | + "        print(\"No PDFs found\")" |
| 89 | + ] |
| 90 | + } |
| 91 | + ], |
| 92 | + "metadata": { |
| 93 | + "language_info": { |
| 94 | + "name": "python" |
| 95 | + } |
| 96 | + }, |
| 97 | + "nbformat": 4, |
| 98 | + "nbformat_minor": 2 |
| 99 | +} |
0 commit comments