Skip to content
Open
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8d3dd93
imports included
Jul 13, 2024
ee5aa3d
added tesseract exe.
Jul 13, 2024
ef5fe76
logic of image to text included
Jul 13, 2024
66ae761
update app.py
louisjoety Jul 13, 2024
9911983
update app.py
louisjoety Jul 13, 2024
dae6295
imports included
Jul 13, 2024
a4cabab
added tesseract exe.
Jul 13, 2024
cad2cce
logic of image to text included
Jul 13, 2024
29c997b
update app.py
louisjoety Jul 13, 2024
1e704af
update app.py
louisjoety Jul 13, 2024
18f0708
Merge branch 'feature/image-to-text-converter' of https://github.com/…
louisjoety Jul 15, 2024
dd89f24
imports included
Jul 13, 2024
a7510fe
added tesseract exe.
Jul 13, 2024
e017b8b
logic of image to text included
Jul 13, 2024
fe9bbee
update app.py
louisjoety Jul 13, 2024
22b4dd7
update app.py
louisjoety Jul 13, 2024
71330d7
update app.py
louisjoety Jul 13, 2024
72b794c
update app.py
louisjoety Jul 13, 2024
bcc5ae0
Merge branch 'feature/image-to-text-converter' of https://github.com/…
louisjoety Jul 15, 2024
5385b6c
refactor code into functions
louisjoety Jul 16, 2024
846e488
remove test file
louisjoety Jul 16, 2024
28386a2
Merge branch 'feature/image-to-text-converter' into feature/text-summ…
louisjoety Jul 30, 2024
554555a
add imports
louisjoety Jul 30, 2024
0dda431
download nltk files
louisjoety Jul 30, 2024
362fedb
summariser made
louisjoety Jul 30, 2024
4cb21f4
summary compilation added
louisjoety Jul 30, 2024
deed873
nltk data checkers added
louisjoety Jul 30, 2024
497f7b7
removed data files
louisjoety Jul 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion app.py
Original file line number Diff line number Diff line change
@@ -1 +1,85 @@
# NOTE(review): the hunk header reports one deleted line and 85 added lines;
# this statement is presumably that deleted placeholder — confirm it is gone
# from the final app.py.
print('hello world')
import cv2
import pytesseract
from PIL import Image
import nltk
from nltk.data import find
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from heapq import nlargest

def ensure_nltk_data():
    """Check that the NLTK resources used by this script are available.

    Each resource is looked up locally first; only a failed lookup
    (``LookupError``) triggers ``nltk.download`` for that package.
    """
    # Maps the lookup path handed to ``find`` to the package name handed to
    # ``nltk.download``; resources are checked in the order listed.
    required = {
        'tokenizers/punkt': 'punkt',
        'corpora/stopwords': 'stopwords',
        'corpora/wordnet': 'wordnet',
    }

    for location, package in required.items():
        try:
            find(location)
        except LookupError:
            nltk.download(package)

# Module-level call: runs whenever this file is imported or executed, so the
# NLTK resource check happens before any function below is used.
ensure_nltk_data()

def setup_tesseract(tesseract_cmd_path):
    """Tell pytesseract which Tesseract executable to use.

    Args:
        tesseract_cmd_path: Value stored in
            ``pytesseract.pytesseract.tesseract_cmd``.
    """
    ocr_backend = pytesseract.pytesseract
    ocr_backend.tesseract_cmd = tesseract_cmd_path

def convert_image_to_text(image_path):
    """Run OCR on the image at ``image_path`` and return the recognised text.

    This is best-effort: any exception raised while loading, converting or
    recognising the image is printed as ``Error: ...`` and an empty string
    is returned instead of propagating.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""``
        on any failure.
    """
    try:
        bgr_pixels = cv2.imread(image_path)
        # A ``None`` result from imread is reported as a missing image.
        if bgr_pixels is None:
            raise ValueError(f"Image not found at the path: {image_path}")

        # Swap the channel order from BGR to RGB before OCR.
        return pytesseract.image_to_string(
            cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
        )
    except Exception as err:
        print(f"Error: {err}")
    return ""

def summarize_text(text, n_sentences=3):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Words are lower-cased, reduced to alphanumeric non-stopword tokens and
    weighted by their frequency relative to the most frequent word. A
    sentence's score is the sum of the weights of the tokens it contains;
    sentences with no weighted token are never selected.

    Args:
        text: The text to summarise.
        n_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The selected sentences joined by single spaces, highest score first
        (not document order). An empty string is returned when ``text`` is
        empty or whitespace-only, or contains no scorable words.
    """
    # Blank input has nothing to score; return before any tokenising.
    if not text or not text.strip():
        return ""

    stop_words = set(stopwords.words('english'))
    keywords = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    # Only punctuation/stopwords left: max() below would raise ValueError
    # on an empty sequence, so there is nothing to summarise.
    if not keywords:
        return ""

    counts = FreqDist(keywords)
    top_count = max(counts.values())
    weights = {word: count / top_count for word, count in counts.items()}

    sentence_scores = {}
    for sentence in sent_tokenize(text):
        for token in word_tokenize(sentence.lower()):
            weight = weights.get(token)
            if weight is not None:
                sentence_scores[sentence] = (
                    sentence_scores.get(sentence, 0.0) + weight
                )

    best_sentences = nlargest(
        n_sentences, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(best_sentences)

def main(image_path, tesseract_cmd_path):
    """Run OCR on an image and print the extracted text and its summary.

    OCR output that is empty or consists solely of whitespace is treated as
    "no text": ``No text extracted.`` is printed and no summary is attempted.

    Args:
        image_path: Path of the image, passed to ``convert_image_to_text``.
        tesseract_cmd_path: Tesseract executable path, passed to
            ``setup_tesseract``.
    """
    setup_tesseract(tesseract_cmd_path)
    text = convert_image_to_text(image_path)

    # A bare truthiness test would accept strings such as "\n" or "\x0c",
    # which contain nothing worth printing or summarising.
    if not text or not text.strip():
        print("No text extracted.")
        return

    print("Extracted Text:")
    print(text)

    summary = summarize_text(text)
    print("\nSummary:")
    print(summary)

if __name__ == "__main__":
    import sys

    # Optional command-line overrides: app.py [IMAGE_PATH [TESSERACT_EXE]].
    # With no arguments the previous hard-coded values are used unchanged.
    cli_args = sys.argv[1:]

    image_path = cli_args[0] if cli_args else ''  # TODO: Add file path to image
    tesseract_cmd_path = (
        cli_args[1]
        if len(cli_args) > 1
        else r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    )
    main(image_path, tesseract_cmd_path)