@@ -55,6 +55,7 @@ def from_file(
5555 model : Optional [UnstructuredModel ] = None ,
5656 fixed_layouts : Optional [List [Optional [List [TextRegion ]]]] = None ,
5757 ocr_strategy : str = "auto" ,
58+ ocr_languages : str = "eng" ,
5859 extract_tables : bool = False ,
5960 ) -> DocumentLayout :
6061 """Creates a DocumentLayout from a pdf file."""
@@ -75,6 +76,7 @@ def from_file(
7576 model = model ,
7677 layout = layout ,
7778 ocr_strategy = ocr_strategy ,
79+ ocr_languages = ocr_languages ,
7880 fixed_layout = fixed_layout ,
7981 extract_tables = extract_tables ,
8082 )
@@ -87,6 +89,7 @@ def from_image_file(
8789 filename : str ,
8890 model : Optional [UnstructuredModel ] = None ,
8991 ocr_strategy : str = "auto" ,
92+ ocr_languages : str = "eng" ,
9093 fixed_layout : Optional [List [TextRegion ]] = None ,
9194 extract_tables : bool = False ,
9295 ) -> DocumentLayout :
@@ -104,6 +107,7 @@ def from_image_file(
104107 model = model ,
105108 layout = None ,
106109 ocr_strategy = ocr_strategy ,
110+ ocr_languages = ocr_languages ,
107111 fixed_layout = fixed_layout ,
108112 extract_tables = extract_tables ,
109113 )
@@ -120,6 +124,7 @@ def __init__(
120124 layout : Optional [List [TextRegion ]],
121125 model : Optional [UnstructuredModel ] = None ,
122126 ocr_strategy : str = "auto" ,
127+ ocr_languages : str = "eng" ,
123128 extract_tables : bool = False ,
124129 ):
125130 self .image = image
@@ -131,6 +136,7 @@ def __init__(
131136 if ocr_strategy not in VALID_OCR_STRATEGIES :
132137 raise ValueError (f"ocr_strategy must be one of { VALID_OCR_STRATEGIES } ." )
133138 self .ocr_strategy = ocr_strategy
139+ self .ocr_languages = ocr_languages
134140 self .extract_tables = extract_tables
135141
136142 def __str__ (self ) -> str :
@@ -159,7 +165,12 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme
159165 layout .sort (key = lambda element : element .y1 )
160166 elements = [
161167 get_element_from_block (
162- e , self .image , self .layout , self .ocr_strategy , self .extract_tables
168+ block = e ,
169+ image = self .image ,
170+ pdf_objects = self .layout ,
171+ ocr_strategy = self .ocr_strategy ,
172+ ocr_languages = self .ocr_languages ,
173+ extract_tables = self .extract_tables ,
163174 )
164175 for e in layout
165176 ]
@@ -178,6 +189,7 @@ def from_image(
178189 model : Optional [UnstructuredModel ] = None ,
179190 layout : Optional [List [TextRegion ]] = None ,
180191 ocr_strategy : str = "auto" ,
192+ ocr_languages : str = "eng" ,
181193 extract_tables : bool = False ,
182194 fixed_layout : Optional [List [TextRegion ]] = None ,
183195 ):
@@ -188,6 +200,7 @@ def from_image(
188200 layout = layout ,
189201 model = model ,
190202 ocr_strategy = ocr_strategy ,
203+ ocr_languages = ocr_languages ,
191204 extract_tables = extract_tables ,
192205 )
193206 if fixed_layout is None :
@@ -202,6 +215,7 @@ def process_data_with_model(
202215 model_name : Optional [str ],
203216 is_image : bool = False ,
204217 ocr_strategy : str = "auto" ,
218+ ocr_languages : str = "eng" ,
205219 fixed_layouts : Optional [List [Optional [List [TextRegion ]]]] = None ,
206220 extract_tables : bool = False ,
207221) -> DocumentLayout :
@@ -214,6 +228,7 @@ def process_data_with_model(
214228 model_name ,
215229 is_image = is_image ,
216230 ocr_strategy = ocr_strategy ,
231+ ocr_languages = ocr_languages ,
217232 fixed_layouts = fixed_layouts ,
218233 extract_tables = extract_tables ,
219234 )
@@ -226,6 +241,7 @@ def process_file_with_model(
226241 model_name : Optional [str ],
227242 is_image : bool = False ,
228243 ocr_strategy : str = "auto" ,
244+ ocr_languages : str = "eng" ,
229245 fixed_layouts : Optional [List [Optional [List [TextRegion ]]]] = None ,
230246 extract_tables : bool = False ,
231247) -> DocumentLayout :
@@ -234,13 +250,18 @@ def process_file_with_model(
234250 model = get_model (model_name )
235251 layout = (
236252 DocumentLayout .from_image_file (
237- filename , model = model , ocr_strategy = ocr_strategy , extract_tables = extract_tables
253+ filename ,
254+ model = model ,
255+ ocr_strategy = ocr_strategy ,
256+ ocr_languages = ocr_languages ,
257+ extract_tables = extract_tables ,
238258 )
239259 if is_image
240260 else DocumentLayout .from_file (
241261 filename ,
242262 model = model ,
243263 ocr_strategy = ocr_strategy ,
264+ ocr_languages = ocr_languages ,
244265 fixed_layouts = fixed_layouts ,
245266 extract_tables = extract_tables ,
246267 )
@@ -253,13 +274,18 @@ def get_element_from_block(
253274 image : Optional [Image .Image ] = None ,
254275 pdf_objects : Optional [List [TextRegion ]] = None ,
255276 ocr_strategy : str = "auto" ,
277+ ocr_languages : str = "eng" ,
256278 extract_tables : bool = False ,
257279) -> LayoutElement :
258280 """Creates a LayoutElement from a given layout or image by finding all the text that lies within
259281 a given block."""
260282 element = LayoutElement .from_region (block )
261283 element .text = block .extract_text (
262- objects = pdf_objects , image = image , extract_tables = extract_tables , ocr_strategy = ocr_strategy
284+ objects = pdf_objects ,
285+ image = image ,
286+ extract_tables = extract_tables ,
287+ ocr_strategy = ocr_strategy ,
288+ ocr_languages = ocr_languages ,
263289 )
264290 return element
265291
0 commit comments