|
26 | 26 | "\n", |
27 | 27 | "## Steps\n", |
28 | 28 | "\n", |
29 | | - "1. Make two new folders, one inside the other\n", |
30 | | - " - The outer one can be named anything, say `pdf`\n", |
31 | | - " - The inner one must be named `todo`\n", |
32 | | - "2. Place your files in the `todo` folder\n", |
33 | | - " - Those by themselves will just be converted\n", |
34 | | - " - Those inside subfolders will also be merged in alphabetical order\n", |
35 | | - "3. Share the outer `pdf` folder with this notebook\n", |
36 | | - " - Zip the folder\n", |
37 | | - " - Open this notebook in [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb)\n", |
38 | | - " - Run the cell below to be prompted to connect Drive and upload the zip\n", |
39 | | - "\n", |
40 | | - "You'll be offered a zip of the converted (and merged) files to download locally, whether or not Drive was connected\n" |
| 29 | + "To merge files, organize them into folders and zip each one. Name the files so that their alphabetical order is the order you want, as they will be merged in that order. If you'd like to add any options for [OCRmyPDF](https://ocrmypdf.readthedocs.io/en/latest), append them to the `run` line in the cell below. At the end, you'll be offered a zip of the converted (and merged) files to download locally, whether or not Drive was connected.\n", |
| 30 | + "\n", |
| 31 | + "1. Run the cell below to get prompted to connect Drive and upload your files and/or zipped folders\n" |
41 | 32 | ] |
42 | 33 | }, |
43 | 34 | { |
|
58 | 49 | "\n", |
59 | 50 | "# Extract your PDFs\n", |
60 | 51 | "files.upload()\n", |
61 | | - "\n", |
62 | | - "# Get the name of the zip file\n", |
63 | | - "pdfs = [pdf for pdf in os.listdir() if pdf.endswith(\".zip\")]\n", |
64 | | - "if len(pdfs) == 0:\n", |
65 | | - " raise Exception(\"No ZIP file found\")\n", |
| 52 | + "![ -d pdf ] || mkdir pdf\n", |
| 53 | + "![ -d pdf/todo ] || mkdir pdf/todo\n", |
| 54 | + "![ -d pdf/done ] || mkdir pdf/done\n", |
| 55 | + "!unzip -o \"*.zip\" -d pdf/todo 2>/dev/null\n", |
| 56 | + "!rm -f *.zip\n", |
| 57 | + "!mv *.* pdf/todo 2>/dev/null\n", |
66 | 58 | "\n", |
67 | 59 | "# Transform them\n", |
68 | 60 | "%pip install udocker\n", |
69 | 61 | "!udocker --allow-root install\n", |
70 | | - "\n", |
71 | | - "for pdf in pdfs:\n", |
72 | | - " !unzip -o \"$pdf\"\n", |
73 | | - " !rm -f \"$pdf\"\n", |
74 | | - " !udocker --allow-root run -v /content/\"$pdf\":/app/pdf ghcr.io/ipitio/ocr-pdf bash predict.sh pdf\n", |
75 | | - " converted = os.listdir(\"$pdf/done\")\n", |
76 | | - "\n", |
77 | | - " # And load\n", |
78 | | - " if drive and len(converted) > 0:\n", |
79 | | - " ![ -d \"drive/MyDrive/ocr-pdf\" ] || mkdir \"drive/MyDrive/ocr-pdf\"\n", |
80 | | - " !\\cp -r \"$pdf/done/\"* \"drive/MyDrive/ocr-pdf/\"\n", |
81 | | - "\n", |
82 | | - " if len(converted) == 1 and os.path.isfile(\"$pdf/done/\" + converted[0]):\n", |
83 | | - " files.download(\"$pdf/done/\" + converted[0])\n", |
84 | | - " elif len(converted) > 0:\n", |
85 | | - " !zip -r \"$pdf.zip\" \"$pdf/done\"\n", |
86 | | - " files.download(\"$pdf.zip\")\n", |
87 | | - " else:\n", |
88 | | - " print(\"No PDFs found\")" |
| 62 | + "!udocker --allow-root run -v /content/pdf:/ocr2pdf/pdf ghcr.io/ipitio/ocr-pdf bash predict.sh pdf\n", |
| 63 | + "converted = os.listdir(\"pdf/done\")\n", |
| 64 | + "\n", |
| 65 | + "# And load\n", |
| 66 | + "if drive and len(converted) > 0:\n", |
| 67 | + " ![ -d \"drive/MyDrive/ocr-pdf\" ] || mkdir \"drive/MyDrive/ocr-pdf\"\n", |
| 68 | + " !\\cp -r \"pdf/done/\"* \"drive/MyDrive/ocr-pdf/\"\n", |
| 69 | + "\n", |
| 70 | + "if len(converted) == 1 and os.path.isfile(\"pdf/done/\" + converted[0]):\n", |
| 71 | + " files.download(\"pdf/done/\" + converted[0])\n", |
| 72 | + "elif len(converted) > 0:\n", |
| 73 | + " !zip -r \"pdf.zip\" \"pdf/done\"\n", |
| 74 | + " files.download(\"pdf.zip\")\n", |
| 75 | + "else:\n", |
| 76 | + " print(\"No PDFs found\")" |
89 | 77 | ] |
90 | 78 | } |
91 | 79 | ], |
|
0 commit comments