Skip to content
This repository was archived by the owner on Jun 8, 2022. It is now read-only.

Commit 2c60f8e

Browse files
authored
Merge pull request #19 from rad10/Python-Experimental
1.6.2 release
2 parents f99b069 + 35f16c7 commit 2c60f8e

File tree

8 files changed

+2085
-1135
lines changed

8 files changed

+2085
-1135
lines changed

htmlFileToCsv.py

Lines changed: 0 additions & 1135 deletions
This file was deleted.

main.py

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
import json
2+
import logging
3+
import os
4+
import re
5+
6+
from modules.corrections import JSON, connectDict, correctValue
7+
from modules.gui import InstallError, PopupTag, mainGUI
8+
from modules.imageScraper import imageScraper
9+
from modules.sanity import checkBlankRow, sanityName
10+
11+
# Third-party dependencies are installed on demand. When an import fails, pip
# is invoked (retrying with --user if the first attempt returns a non-zero
# status) and the import is attempted again so that the names are actually
# bound before the rest of the module uses them.

# if opencv isnt installed, it'll install it for you
try:
    import numpy as nm
    import cv2
except ImportError:
    if os.system("pip install opencv-python"):
        os.system("pip install --user opencv-python")
    # Retry the import; without this cv2 would stay undefined after installing.
    import numpy as nm
    import cv2

try:
    from PIL import Image, ImageTk
except ModuleNotFoundError:
    if os.system("pip install pillow"):
        os.system("pip install --user pillow")
    # Retry the import so Image/ImageTk are bound after the installation.
    from PIL import Image, ImageTk
except ImportError:
    # Fallback for installations that expose the modules at top level.
    import Image
    import ImageTk

# if tesseract isnt installed, itll install it for you
try:
    import pytesseract as tess
except ImportError:
    if os.system("pip install pytesseract"):
        os.system("pip install --user pytesseract")
    import pytesseract as tess

# installing pdf to image libraries
try:
    from pdf2image import convert_from_path
except ImportError:
    if os.system("pip install pdf2image"):
        os.system("pip install --user pdf2image")
    from pdf2image import convert_from_path
41+
42+
# Checking that external software is installed and ready to use.
# Each probe runs the tool through the shell; a non-zero status from
# os.system is treated as "tool not available on PATH".

# check if tesseract exists
if os.system("tesseract --help") != 0:
    if not os.path.exists("C:\\Program Files\\Tesseract-OCR\\tesseract.exe"):
        # Not on PATH and not in the default install folder: tell the user.
        InstallError(
            "Tesseract",
            "https://github.com/UB-Mannheim/tesseract/releases",
            "tesseract.exe",
        ).run()
    else:
        # Found in the default Windows install folder: point pytesseract at it.
        tess.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract'

# check if poppler exists
if os.system("pdfimages -help") != 0:
    InstallError(
        "Poppler",
        "https://poppler.freedesktop.org/",
        "pdfimages.exe",
    ).run()
54+
55+
56+
# Functions
57+
58+
59+
# Logging defaults to WARNING; passing "info" or "debug" on the command line
# raises the verbosity. Debug mode also prepares the debugOutput folders.
logging.getLogger().setLevel(logging.WARNING)
if "info" in os.sys.argv:
    logging.basicConfig(format="%(asctime)s: INFO %(message)s",
                        datefmt="%H:%M:%S", level=logging.INFO)
elif "debug" in os.sys.argv:
    logging.basicConfig(format="%(asctime)s: DEBUG %(message)s",
                        datefmt="%H:%M:%S", level=logging.DEBUG)
    if not os.path.exists("debugOutput/."):
        os.makedirs("debugOutput/dictionary", exist_ok=True)
        os.makedirs("debugOutput/scrapper", exist_ok=True)
    else:
        # Clear images left over from a previous debug run.
        # NOTE(review): `del` is a Windows shell command; this is a no-op
        # elsewhere.
        os.system("del /s debugOutput\\*.jpg")

# Load the alias database and hand it to the corrections module. The context
# manager guarantees the file is closed even if parsing fails.
with open("./aliases.json", "r") as JSONFile:
    connectDict(json.load(JSONFile))
JSONChange = False  # this is only used when the database is updated
mainDisplay = None
77+
78+
79+
def debug(label: str, content: list):
    """Log a label at DEBUG level and print each item of content.

    The items are only printed when the root logger level is DEBUG or lower.

    @param label: heading that is logged before the items.
    @param content: iterable whose items are printed one per line.
    """
    logging.debug("%s:", label)
    if logging.getLogger().level > logging.DEBUG:
        return
    for item in content:
        print(item)
84+
85+
86+
def debugImageDictionary(diction):
    """Dump statistics and cell images of an image dictionary for debugging.

    Nothing happens unless the root logger level is INFO or lower. Otherwise a
    size table is logged and written to debugOutput/dictionaryStats.txt, and
    every date image and table cell image is written as a JPEG under
    debugOutput/dictionary.

    @param diction: sequence of sheets. Each sheet is indexed as sheet[0]
        (iterable of date images) and sheet[1] (rows of cell images).
    """
    if logging.getLogger().level > logging.INFO:
        return

    # The output folders may not exist yet (e.g. when only INFO logging is
    # enabled); without them the stats file cannot be created.
    os.makedirs("debugOutput/dictionary", exist_ok=True)

    debugOutput = "Sheet | SheetLen | TableRow | TableCol\n"
    for sheetIndex, sheet in enumerate(diction):
        table = sheet[1]
        # A table without rows has no first row to measure; report 0 columns.
        columnCount = len(table[0]) if len(table) else 0
        debugOutput += "{ind: 5d} | {slen: 8d} | {trow: 8d} | {tcol: 8d}\n".format(
            ind=sheetIndex, slen=len(sheet), trow=len(table), tcol=columnCount)
    logging.info(debugOutput)
    # Written directly with a context manager so the handle is always closed.
    with open("debugOutput/dictionaryStats.txt", "w") as statsFile:
        statsFile.write(debugOutput)

    for sheetIndex, sheet in enumerate(diction):
        for dateIndex, dateImage in enumerate(sheet[0]):
            cv2.imwrite("debugOutput/dictionary/sheet{sheet}date{date}.jpg".format(
                sheet=sheetIndex, date=dateIndex), dateImage)
        for rowIndex, rowCells in enumerate(sheet[1]):
            for colIndex, cellImage in enumerate(rowCells):
                cv2.imwrite("debugOutput/dictionary/sheet{sheet}table{row}{col}.jpg".format(
                    sheet=sheetIndex, row=rowIndex, col=colIndex), cellImage)
102+
103+
104+
def exportToFile(dir, content):
    """Write content to the file at dir, replacing any existing contents.

    @param dir: path of the file to write (name kept for keyword callers even
        though it shadows the builtin).
    @param content: text to write.
    """
    # The context manager flushes and closes the handle deterministically.
    with open(dir, "w") as outFile:
        outFile.write(content)
106+
107+
108+
def appendToFile(dir, content):
    """Append content to the file at dir, creating the file if it is missing.

    @param dir: path of the file to extend (name kept for keyword callers even
        though it shadows the builtin).
    @param content: text to add at the end of the file.
    """
    # Append mode creates a missing file and never rewrites existing data, so
    # no read-back or catch-all exception handler is needed.
    with open(dir, "a") as outFile:
        outFile.write(content)
114+
115+
116+
def TranslateDictionary(sheetsDict, gui=False, outputDict=None):
    """ Phase two of plan. This function goes through the image dictionary passed
    to it and creates a matrix of the dictionary in text.\n
    Each sheet is read as sheet[0] (date images) and sheet[1] (table rows of
    cell images). The first row and first column of the table are skipped as
    dummies. Every produced row holds five cell slots followed by the dates
    read from that sheet. Cells flagged as invalid are sent to the user for
    correction through mainDisplay, which is required even when gui is False.\n
    @param sheetsDict: a matrix of images made from a table.\n
    @param gui: whether to switch on global gui manipulation for the progress bar.\n
    @param outputDict: a variable passed by reference instead of using return.
    When given, the result is stored in the module global of that name.\n
    @return a matrix of strings that represents the text in the image dictionary.
    """
    global JSONChange
    results = [[] for _ in sheetsDict]  # results the size of pages in dict
    sheetMax = len(sheetsDict)

    # Gui Texts
    textScan = "Scanning\tSheet: {sInd} of {sMax}\tRow: {rInd} of {rMax}"
    textSanitize = "Sanitizing\tSheet: {sInd} of {sMax}\tRow: {rInd} of {rMax}"

    # GUI widgets to manipulate while in middle of function
    if gui:
        # Getting max for progress bar
        progressMax = 1
        for sheet in sheetsDict:
            progressMax += len(sheet[1]) - 1
        mainDisplay.progressBar.configure(
            mode="determinate", maximum=progressMax)

    # Compiled once; used to move well-formed dates to the front of the list.
    dformat = re.compile(r'\d{1,2}\/\d{1,2}\/(\d{4}|\d{2})')
    # Number of date entries appended to the rows of each sheet, so the
    # analysis below can tell table cells apart from dates per sheet.
    dateCounts = []

    # Collecting data to database
    for sheet in range(len(sheetsDict)):
        sheetInd = sheet + 1
        rowMax = len(sheetsDict[sheet][1]) - 1

        # Collecting dates on page first
        dates = []
        for date in sheetsDict[sheet][0]:
            dstr = tess.image_to_string(date).replace(
                "\n", "").replace(" ", "")
            # Entries use the same 3-item shape that the analysis reads from
            # table cells: index 0 is the text, index 2 the validity flag.
            if dformat.match(dstr):
                dates.insert(0, (dstr, 1, True))
            else:
                dates.append((dstr, 1, True))
        dateCounts.append(len(dates))

        # | Full name | Time in | Time out | hours (possibly blank) | purpose | date | day (possibly blank) |
        # skips first row which is dummy
        for row in range(1, len(sheetsDict[sheet][1])):
            if gui:
                mainDisplay.progressBar.step()
                # The row number is relative to the current sheet so that it
                # matches rowMax.
                mainDisplay.sheetStatus.configure(
                    text=textScan.format(sInd=sheetInd, sMax=sheetMax, rInd=row, rMax=rowMax))
                mainDisplay.root.update_idletasks()
            results[sheet].append([None for _ in range(5)])  # array of 5 slots
            # skip first col which is dummy
            for col in range(1, len(sheetsDict[sheet][1][row])):
                logging.info("Sheet[%d]: [%d, %d]", sheetInd, row, col)
                results[sheet][row - 1][col - 1] = correctValue(
                    sheetsDict[sheet][1][row][col], col)
            results[sheet][-1].extend(dates)

    if logging.getLogger().level <= logging.DEBUG:
        for e in range(len(results)):
            debug("Results Sheet[" + str(e) + "]", results[e])

    # Checking names for repetitions
    results = sanityName(results)

    # Analysis
    for sheet in range(len(results)):
        # Iterating through results to see where errors occured
        for row in range(len(results[sheet])):
            # Only the table cells are checked; the trailing date entries of
            # this sheet are excluded (all cells when the sheet has no dates).
            cellCount = len(results[sheet][row]) - dateCounts[sheet]
            for col in range(cellCount):
                if gui:
                    mainDisplay.sheetStatus.configure(
                        text=textSanitize.format(sInd=sheet + 1, sMax=len(results), rInd=row + 1, rMax=len(results[sheet])))
                # Explicit comparison kept on purpose: only a False (or 0)
                # flag triggers a correction request.
                if results[sheet][row][col][2] == False:
                    results[sheet][row][col] = mainDisplay.requestCorrection(
                        sheetsDict[sheet][1][row + 1][col + 1], results[sheet][row][col][0])
                    if col + 1 in [1, 5]:
                        # if the name possibly entered in by the user doesnt
                        # exist in the database, add it
                        known = JSON["names"][str(col + 1)]
                        entered = results[sheet][row][col][0].lower()
                        if entered not in known:
                            JSONChange = True
                            known.append(entered)

        # Checking if any rows are blank (backwards so pops keep indexes valid)
        for row in range(len(results[sheet]) - 1, -1, -1):
            if checkBlankRow(results[sheet][row]):
                results[sheet].pop(row)

    if outputDict is None:
        return results
    globals()[outputDict] = results.copy()
    return
216+
217+
218+
def arrayToCsv(directory):
    """takes a matrix and returns a string in CSV format.
    var directory: a matrix of cells that contains the information of people at
    the center; the text of every cell is read from its index 0.
    returns: a string that contains all the information in CSV format, one line
    per row, followed by one extra blank line. Fields containing commas, quotes
    or newlines are quoted so they cannot break the column layout.
    """
    # Local imports keep this function self-contained.
    import csv
    import io

    buffer = io.StringIO()
    # "\n" line endings match the format this function has always produced.
    writer = csv.writer(buffer, lineterminator="\n")
    for row in directory:
        writer.writerow([cell[0] for cell in row])
    cvarray = buffer.getvalue()
    logging.debug("cvarray:\n%s", cvarray)
    return cvarray + "\n"
230+
231+
232+
def main():
    """Run the whole conversion: scrape images, read them and export the CSV.

    Reads the input sheet and output path from mainDisplay, reports failures in
    a popup before re-raising them, saves the alias database when it was
    changed during the run, and finally removes leftover OCR temp files.
    """
    ##########################################
    ## Phase 3: Hooking everything together ##
    ##########################################

    try:
        imageDictionary = imageScraper(mainDisplay.signinsheet)
        debugImageDictionary(imageDictionary)
        textDictionary = TranslateDictionary(imageDictionary, gui=True)
        csvString = "".join(arrayToCsv(sheet) for sheet in textDictionary)
        exportToFile(mainDisplay.outputCSV, csvString)
        mainDisplay.errorLabel.configure(text="All finished.")
    except Exception:
        # Show the failure to the user, then let it propagate. SystemExit and
        # KeyboardInterrupt are not errors and pass through without a popup.
        import traceback
        PopupTag(mainDisplay, "Error", "Looks like something went wrong.\n" +
                 str(os.sys.exc_info())+"\n"+str(traceback.format_exc()), "#ff0000").run()
        raise
    PopupTag(mainDisplay, "Done",
             "Congrats! its all finished.\nLook at your csv and see if it looks alright.").run()
    if JSONChange:
        JSON["names"]["1"].sort()  # Sorting new libraries for optimization
        JSON["names"]["5"].sort()
        # The context manager closes the file even if serialization fails.
        with open("aliases.json", "w") as JSONFile:
            json.dump(JSON, JSONFile, indent=4, separators=(
                ",", ": "), ensure_ascii=True, sort_keys=True)

    # Cleaning old ocr files from tmp
    # NOTE(review): `del` and %tmp% are Windows shell syntax; on other
    # platforms this command has no effect.
    os.system("del /s /q %tmp%\\tess_*.hocr")
    return
266+
267+
268+
# The GUI object is created at import time because the functions above reach
# it through the module-level name mainDisplay. `main` is handed to mainGUI,
# presumably as the callback that starts processing (confirm in modules.gui).
mainDisplay = mainGUI(main)
if __name__ == "__main__":
    # Only start the GUI when this file is executed as a script.
    mainDisplay.run()

0 commit comments

Comments
 (0)