diff --git a/yale_daily_news/requirements.txt b/yale_daily_news/requirements.txt
index 902fb7e..59ed1e8 100644
--- a/yale_daily_news/requirements.txt
+++ b/yale_daily_news/requirements.txt
@@ -1,3 +1,5 @@
 numpy==1.11.3
 scipy==0.18.1
-scikit-image==0.12.3
\ No newline at end of file
+scikit-image==0.12.3
+pytesseract==0.1.7
+Pillow==4.0.0
diff --git a/yale_daily_news/segment_ydn_images.py b/yale_daily_news/segment_ydn_images.py
index cc41cc7..51d66a7 100644
--- a/yale_daily_news/segment_ydn_images.py
+++ b/yale_daily_news/segment_ydn_images.py
@@ -6,6 +6,9 @@ from shutil import Error, move, rmtree
 import numpy as np
 import glob, os, codecs, sys, json
 
+from PIL import Image
+import pytesseract
+
 '''
 ## Processing notes
 
@@ -456,12 +459,21 @@ def store_article_titles():
         # store the mapping from article path to first image path
         first_image_name = str(first_image['rect_id']) + '.png'
         path_to_first_image = os.path.join(article_path, first_image_name)
-        article_to_title[article_path] = path_to_first_image
+        best_guess_headline = pytesseract.image_to_string(Image.open('segmented_images' + path_to_first_image), lang='eng')
+        if best_guess_headline != '':
+            best_guess_headline = best_guess_headline.replace('-\n', '') # Fix hyphenation at column end
+            best_guess_headline = best_guess_headline.replace('\n', ' ') # Put short lines together
+            best_guess_headline = best_guess_headline[:100] # Take only the first part of the string (could be whole article!)
+        else:
+            best_guess_headline = '[Untitled]' # If we got nothing, use placeholder.
+        print('Processing article: "' + best_guess_headline + '"')
+        article_to_title[article_path] = best_guess_headline
 
     with open('articles_to_titles.json', 'w') as out:
         json.dump(article_to_title, out)
 
+
 ##################
 # Segment Images #
 ##################
 
@@ -527,7 +539,14 @@ def segment_images(process_id):
             os.makedirs(out_path)
 
         io.imsave(out_path + str(rect_id) + '.png', cropped)
-
+        # This is an ugly hack to force 300dpi into the png metadata.
+        # Without explicit dpi, tesseract assumes low (70) dpi; this is sub-optimal.
+        try:
+            Image.open(out_path + str(rect_id) + '.png').save(out_path + str(rect_id) + '.png', dpi=(300, 300))
+        except Exception:
+            print("Couldn't write DPI to file " + out_path + str(rect_id) + ".png")
+
+
 
 def convert_coordinates(xml_coordinate_array, jp2_array, page):
     '''
@@ -656,7 +675,11 @@ def stack_segmented_images():
             os.makedirs(composite_path)
 
         io.imsave(os.path.join(composite_path, str(article_id) + '.png'), composite_image)
-
+        # Ugly hack: re-save with 300 dpi metadata, which tesseract will need.
+        try:
+            Image.open(os.path.join(composite_path, str(article_id) + '.png')).save(os.path.join(composite_path, str(article_id) + '.png'), dpi=(300, 300))
+        except Exception:
+            print("Couldn't write DPI to file " + os.path.join(composite_path, str(article_id) + ".png"))
 
 ##############
 # Main Block #
@@ -691,19 +714,19 @@ def stack_segmented_images():
         pass
 
     # Define the directory that contains subdirectories for each paper issue
-    root_data_directory = '/Users/doug/Desktop/ydn-sample/'
+    root_data_directory = '/media/dhlab/PG4T/ydn/fourissuesonly'
 
     # Define whether to run code in verbose mode
     verbosity_level = 1
 
     # Identify the maximum number of processors to use during analysis
-    n_processes = 4
+    n_processes = 8
 
     # Specify the maximum number of files to process
-    max_files_to_process = 20
+    max_files_to_process = 1000000
 
     # allow users to toggle multiprocessing on/off
-    multiprocess = False
+    multiprocess = True
 
     # specify how much padding to add to cropped images
     padding = 5
@@ -736,4 +759,4 @@ def stack_segmented_images():
     store_article_titles()
 
     # Combine the segmented images for each article into one composite image
-    stack_segmented_images()
\ No newline at end of file
+    stack_segmented_images()