|
import csv
import io
import json
import logging
import os
import re
import tempfile

from modules.corrections import JSON, connectDict, correctValue
from modules.gui import InstallError, PopupTag, mainGUI
from modules.imageScraper import imageScraper
from modules.sanity import checkBlankRow, sanityName
| 10 | + |
# Third-party packages are installed on demand with pip when they are missing.
# os.system returns a non-zero status on failure, in which case a per-user
# install is attempted. Every branch retries the import after installing so
# the names are actually bound for the rest of the module.

# if opencv isn't installed, it'll install it for you
try:
    import numpy as nm
    import cv2
except ImportError:
    if os.system("pip install opencv-python"):
        os.system("pip install --user opencv-python")
    import numpy as nm
    import cv2
try:
    from PIL import Image, ImageTk
except ModuleNotFoundError:
    # ModuleNotFoundError is a subclass of ImportError, so it must be listed
    # first: PIL itself is absent, install pillow and retry.
    if os.system("pip install pillow"):
        os.system("pip install --user pillow")
    from PIL import Image, ImageTk
except ImportError:
    # PIL is present but the names could not be imported from it; fall back
    # to the legacy top-level module layout.
    import Image
    import ImageTk

# if tesseract isn't installed, it'll install it for you
try:
    import pytesseract as tess
except ImportError:
    if os.system("pip install pytesseract"):
        os.system("pip install --user pytesseract")
    import pytesseract as tess
# installing pdf to image libraries
try:
    from pdf2image import convert_from_path
except ImportError:
    if os.system("pip install pdf2image"):
        os.system("pip install --user pdf2image")
    from pdf2image import convert_from_path
| 41 | + |
# Checking that external software is installed and ready to use.
# os.system returns a non-zero status when the command cannot be run.
# check if tesseract exists
if os.system("tesseract --help"):
    # Not reachable through PATH: fall back to the default Windows install
    # location before asking the user to install it.
    if os.path.exists("C:\\Program Files\\Tesseract-OCR\\tesseract.exe"):
        # Raw string with single backslashes; doubling them inside a raw
        # string would put literal "\\" pairs into the command path.
        tess.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
    else:
        InstallError(
            "Tesseract", "https://github.com/UB-Mannheim/tesseract/releases", "tesseract.exe").run()
# check if poppler exists
if os.system("pdfimages -help"):
    InstallError("Poppler", "https://poppler.freedesktop.org/",
                 "pdfimages.exe").run()
| 54 | + |
| 55 | + |
| 56 | +# Functions |
| 57 | + |
| 58 | + |
# Logging is limited to warnings unless "info" or "debug" is passed on the
# command line.
logging.getLogger().setLevel(logging.WARNING)
if "info" in os.sys.argv:
    logging.basicConfig(format="%(asctime)s: INFO %(message)s",
                        datefmt="%H:%M:%S", level=logging.INFO)
elif "debug" in os.sys.argv:
    logging.basicConfig(format="%(asctime)s: DEBUG %(message)s",
                        datefmt="%H:%M:%S", level=logging.DEBUG)
    # NOTE(review): the debug-folder handling is assumed to belong to the
    # "debug" branch only - confirm against the original indentation.
    # Both folders are (re)created unconditionally so a partially deleted
    # debugOutput tree cannot make later image dumps fail.
    os.makedirs("debugOutput/dictionary", exist_ok=True)
    os.makedirs("debugOutput/scrapper", exist_ok=True)
    # Remove images left over from a previous run without shelling out to
    # the Windows-only "del" command.
    for _debugRoot, _debugDirs, _debugFiles in os.walk("debugOutput"):
        for _debugName in _debugFiles:
            if _debugName.lower().endswith(".jpg"):
                try:
                    os.remove(os.path.join(_debugRoot, _debugName))
                except OSError:
                    # best effort: a locked file must not stop start-up
                    logging.debug("could not remove %s", _debugName)
| 71 | + |
# Load the alias database and hand it to the corrections module. The context
# manager closes the file even if parsing or connectDict raises.
with open("./aliases.json", "r") as JSONFile:
    connectDict(json.load(JSONFile))
JSONChange = False  # this is only used when the database is updated
mainDisplay = None  # replaced by the real GUI object at the bottom of the module
| 77 | + |
| 78 | + |
def debug(label: str, content: list):
    """Log a label at DEBUG level and print each item of content.

    The items are printed to stdout (not sent through logging), and only when
    the root logger's level is DEBUG or lower.
    @param label: heading written through logging.debug.
    @param content: iterable whose items are printed one per line.
    """
    logging.debug("%s:", label)
    if logging.getLogger().level > logging.DEBUG:
        return
    for item in content:
        print(item)
| 84 | + |
| 85 | + |
def debugImageDictionary(diction):
    """Dump statistics and every image of the scraped dictionary for debugging.

    Does nothing unless the root logger's level is INFO or lower. Otherwise a
    per-sheet table of sizes is logged and written to
    debugOutput/dictionaryStats.txt, and every date image (index 0 of a sheet)
    and table cell image (index 1 of a sheet) is written as a jpg.
    @param diction: sequence of sheets as indexed above.
    """
    # NOTE(review): the image dump is assumed to sit under the same level
    # check as the statistics - confirm against the original indentation.
    if logging.getLogger().level > logging.INFO:
        return

    statLines = ["Sheet | SheetLen | TableRow | TableCol\n"]
    for ind, page in enumerate(diction):
        table = page[1]
        statLines.append("{ind: 5d} | {slen: 8d} | {trow: 8d} | {tcol: 8d}\n".format(
            ind=ind, slen=len(page), trow=len(table), tcol=len(table[0])))
    debugOutput = "".join(statLines)
    logging.info(debugOutput)
    exportToFile("debugOutput/dictionaryStats.txt", debugOutput)

    for sheet, page in enumerate(diction):
        for dates, dateImage in enumerate(page[0]):
            datePath = "debugOutput/dictionary/sheet{sheet}date{date}.jpg".format(
                sheet=sheet, date=dates)
            cv2.imwrite(datePath, dateImage)
        for row, cells in enumerate(page[1]):
            for col, cellImage in enumerate(cells):
                cellPath = "debugOutput/dictionary/sheet{sheet}table{row}{col}.jpg".format(
                    sheet=sheet, row=row, col=col)
                cv2.imwrite(cellPath, cellImage)
| 102 | + |
| 103 | + |
def exportToFile(dir, content):
    """Write content to the file at dir, replacing anything already there.

    @param dir: path of the file to write (created if missing).
    @param content: string to write.
    """
    # the context manager guarantees the data is flushed and the handle closed
    with open(dir, "w") as outFile:
        outFile.write(content)
| 106 | + |
| 107 | + |
def appendToFile(dir, content):
    """Append content to the file at dir, creating the file if it is missing.

    @param dir: path of the file to append to.
    @param content: string to add at the end of the file.
    """
    # Append mode creates a missing file and never rewrites existing data, so
    # no read-back or catch-all exception handler is needed.
    with open(dir, "a") as outFile:
        outFile.write(content)
| 114 | + |
| 115 | + |
def TranslateDictionary(sheetsDict, gui=False, outputDict=None):
    """Phase two of plan: turn the image dictionary into a matrix of text.

    Every sheet is read as sheet[0] (date images) and sheet[1] (table of cell
    images whose first row and first column are dummies). Each cell is passed
    to correctValue, the sheet's dates are appended to every row, names are
    checked by sanityName, entries flagged as invalid are sent to the GUI for
    correction, and rows reported blank by checkBlankRow are dropped.

    @param sheetsDict: a matrix of images made from a table.
    @param gui: whether to update the progress bar and status label of the
        global GUI while working.
    @param outputDict: name of a module-level variable that receives the
        result instead of it being returned.
    @return a matrix of (text, ..., valid) entries per sheet and row, or None
        when outputDict is given.
    """
    global JSON
    global JSONChange
    results = [[] for x in sheetsDict]  # results the size of pages in dict
    # number of date entries appended to every row, tracked per sheet
    dateCounts = [0 for x in sheetsDict]

    # Counters and status texts are set up regardless of gui because the
    # logging calls below use them as well.
    sheetMax = len(sheetsDict)
    sheetInd = 0
    rowInd = 0
    textScan = "Scanning\tSheet: {sInd} of {sMax}\tRow: {rInd} of {rMax}"
    textSanitize = "Sanitizing\tSheet: {sInd} of {sMax}\tRow: {rInd} of {rMax}"

    if gui:
        # Getting max for progress bar (the dummy row of each table is skipped)
        progressMax = 1
        for sheet in sheetsDict:
            progressMax += len(sheet[1]) - 1
        mainDisplay.progressBar.configure(
            mode="determinate", maximum=progressMax)

    # compiled once; it does not change between sheets
    dformat = re.compile(r'\d{1,2}\/\d{1,2}\/(\d{4}|\d{2})')

    # Collecting data to database
    for sheet in range(len(sheetsDict)):
        sheetInd += 1
        rowInd = 0  # the status text counts rows within the current sheet
        rowMax = len(sheetsDict[sheet][1]) - 1

        # Collecting dates on page first; strings that look like a date go to
        # the front, everything else is kept at the back
        dates = []
        for date in sheetsDict[sheet][0]:
            dstr = tess.image_to_string(date).replace(
                "\n", "").replace(" ", "")
            if dformat.match(dstr):
                dates.insert(0, (dstr, 1, True))
            else:
                dates.append((dstr, 1, True))
        dateCounts[sheet] = len(dates)

        # | Full name | Time in | Time out | hours (possibly blank) | purpose | date | day (possibly blank) |
        # skips first row which is dummy
        for row in range(1, len(sheetsDict[sheet][1])):
            rowInd += 1
            if gui:
                mainDisplay.progressBar.step()
                mainDisplay.sheetStatus.configure(
                    text=textScan.format(sInd=sheetInd, sMax=sheetMax, rInd=rowInd, rMax=rowMax))
                mainDisplay.root.update_idletasks()
            results[sheet].append([None for x in range(5)])  # array of 5 slots
            # skip first col which is dummy
            for col in range(1, len(sheetsDict[sheet][1][row])):
                logging.info("Sheet[%d]: [%d, %d]", sheetInd, rowInd, col)
                results[sheet][row - 1][col - 1] = correctValue(
                    sheetsDict[sheet][1][row][col], col)
            results[sheet][-1].extend(dates)

    # NOTE(review): assumed to run once after all sheets were scanned.
    if logging.getLogger().level <= logging.DEBUG:
        for e in range(len(results)):
            debug("Results Sheet[" + str(e) + "]", results[e])

    # Checking names for repetitions
    results = sanityName(results)

    # Analysis
    for sheet in range(len(results)):
        # Iterating through results to see where errors occured
        for row in range(len(results[sheet])):
            # Only the table columns are checked, not the appended dates.
            # Counting from the front avoids the empty slice that [:-0]
            # produces when a sheet has no dates.
            dataCols = len(results[sheet][row]) - dateCounts[sheet]
            for col in range(dataCols):
                if gui:
                    mainDisplay.sheetStatus.configure(
                        text=textSanitize.format(sInd=sheet + 1, sMax=len(results), rInd=row + 1, rMax=len(results[sheet])))
                # "== False" is kept on purpose: the flag's exact type comes
                # from correctValue, which is defined elsewhere
                if results[sheet][row][col][2] == False:
                    results[sheet][row][col] = mainDisplay.requestCorrection(
                        sheetsDict[sheet][1][row + 1][col + 1], results[sheet][row][col][0])
                    # NOTE(review): the database update is assumed to happen
                    # only after a correction was requested - confirm against
                    # the original indentation.
                    if col + 1 in [1, 5]:
                        # if the name possibly entered in by the user doesnt
                        # exist in the database, add it
                        entered = results[sheet][row][col][0].lower()
                        if entered not in JSON["names"][str(col + 1)]:
                            JSONChange = True
                            JSON["names"][str(col + 1)].append(entered)

        # Checking if any rows are blank (backwards so pops keep indexes valid)
        for row in range(len(results[sheet]) - 1, -1, -1):
            if checkBlankRow(results[sheet][row]):
                results[sheet].pop(row)

    if outputDict is None:
        return results
    globals()[outputDict] = results.copy()
    return
| 216 | + |
| 217 | + |
def arrayToCsv(directory):
    """Take a matrix of entries and return it as a string in CSV format.

    @param directory: rows of entries; the first item of every entry is the
        text written to the CSV.
    @return one CSV line per non-empty row, followed by one extra blank line.
    """
    buffer = io.StringIO()
    # csv.writer quotes values containing commas, quotes or line breaks, which
    # would otherwise shift or split columns; "\n" keeps the line endings of
    # the previous hand-built output
    writer = csv.writer(buffer, lineterminator="\n")
    for row in directory:
        if not row:
            continue  # an empty row has no text to export
        writer.writerow([entry[0] for entry in row])
    cvarray = buffer.getvalue()
    logging.debug("cvarray:\n%s", cvarray)
    return cvarray + "\n"
| 230 | + |
| 231 | + |
def main():
    """Phase 3: hook everything together.

    Reads the input and output paths from the global GUI, scrapes the sign-in
    sheet into images, translates them to text, writes the CSV, saves the
    alias database if it changed and removes leftover OCR files from the
    temporary directory. Any error is shown in a popup and then re-raised.
    """
    global JSONChange

    try:
        signinsheet = mainDisplay.signinsheet
        outputCSV = mainDisplay.outputCSV
        imageDictionary = imageScraper(signinsheet)
        debugImageDictionary(imageDictionary)
        textDictionary = TranslateDictionary(imageDictionary, gui=True)
        csvString = "".join(arrayToCsv(sheet) for sheet in textDictionary)
        exportToFile(outputCSV, csvString)
        mainDisplay.errorLabel.configure(text="All finished.")
    except Exception:
        # Exception rather than BaseException: Ctrl+C or interpreter exit
        # should not be reported as a failure popup.
        import traceback
        PopupTag(mainDisplay, "Error", "Looks like something went wrong.\n" +
                 str(os.sys.exc_info())+"\n"+str(traceback.format_exc()), "#ff0000").run()
        raise
    PopupTag(mainDisplay, "Done",
             "Congrats! its all finished.\nLook at your csv and see if it looks alright.").run()
    if JSONChange:
        JSON["names"]["1"].sort()  # Sorting new libraries for optimization
        JSON["names"]["5"].sort()
        with open("aliases.json", "w") as JSONFile:
            json.dump(JSON, JSONFile, indent=4, separators=(
                ",", ": "), ensure_ascii=True, sort_keys=True)
        JSONChange = False  # saved; do not rewrite the file on the next run

    # Cleaning old ocr files (tess_*.hocr) from tmp, including subfolders,
    # without shelling out to the Windows-only "del" command
    for folder, _subfolders, names in os.walk(tempfile.gettempdir()):
        for name in names:
            lowered = name.lower()
            if lowered.startswith("tess_") and lowered.endswith(".hocr"):
                try:
                    os.remove(os.path.join(folder, name))
                except OSError:
                    # best effort: skip files that are locked or already gone
                    logging.debug("could not remove %s", name)
    return
| 266 | + |
| 267 | + |
# The GUI object is created at import time because main() and
# TranslateDictionary() reach it through the module-level mainDisplay name;
# only the call to run() is limited to direct execution of this file.
mainDisplay = mainGUI(main)
if __name__ == "__main__":
    mainDisplay.run()
0 commit comments