Skip to content

Commit b738f0d

Browse files
author
Kirill Makankov
committed
How about hOCR? #75
1 parent 52761d5 commit b738f0d

File tree

5 files changed

+130
-0
lines changed

5 files changed

+130
-0
lines changed

TesseractOCR/G8Tesseract.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,12 @@
112112
*/
113113
@property (nonatomic, readonly) NSString *recognizedText;
114114

115+
/*
116+
* Make an HTML-formatted string with hOCR markup from the internal data structures.
117+
* pageNumber is 0-based but will appear in the output as 1-based.
118+
*/
119+
- (NSString *)recognizedHOCRForPageNumber:(int)pageNumber;
120+
115121
/**
116122
* The result of Tesseract's orientation analysis of the target image. See
117123
* `G8Orientation` in G8Constants.h for the possible orientations.

TesseractOCR/G8Tesseract.mm

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,19 @@ - (NSArray *)recognizedBlocksByIteratorLevel:(G8PageIteratorLevel)pageIteratorLe
598598
return [array copy];
599599
}
600600

601+
/**
 * Builds the hOCR (HTML-formatted) representation of the current recognition
 * result.
 *
 * @param pageNumber Page index, passed unchanged to Tesseract's GetHOCRText.
 * @return The hOCR markup as a string, or nil when Tesseract returns no buffer
 *         or the buffer cannot be decoded as UTF-8.
 */
- (NSString *)recognizedHOCRForPageNumber:(int)pageNumber {

    // The input name is set to an empty string before requesting hOCR output.
    // NOTE(review): presumably GetHOCRText reads the input name and needs it
    // to be non-NULL -- confirm against the bundled Tesseract version.
    _tesseract->SetInputName("");

    char *hocr = _tesseract->GetHOCRText(pageNumber);
    if (hocr == NULL) {
        return nil;
    }

    NSString *text = [NSString stringWithUTF8String:hocr];

    // Tesseract documents the returned buffer as allocated with new[], so it
    // must be released with delete[]; pairing it with free() is a mismatched
    // deallocation.
    delete[] hocr;

    return text;
}
613+
601614
- (UIImage *)imageWithBlocks:(NSArray *)blocks drawText:(BOOL)drawText thresholded:(BOOL)thresholded
602615
{
603616
UIImage *image = thresholded ? self.thresholdedImage : self.image;

TestsProject/TestsProject.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
736EFF271A5882730031B432 /* tessdata-rus in Resources */ = {isa = PBXBuildFile; fileRef = 736EFF231A5872CA0031B432 /* tessdata-rus */; };
3131
73BE4C091A598F47002C15F1 /* TesseractOCR.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 73C0A7BC1A59561F00D823D4 /* TesseractOCR.framework */; };
3232
73BE4C0A1A598F47002C15F1 /* TesseractOCR.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 73C0A7BC1A59561F00D823D4 /* TesseractOCR.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
33+
73BE4C311A5B404C002C15F1 /* well_scaned_page.hOCR in Resources */ = {isa = PBXBuildFile; fileRef = 73BE4C301A5B404C002C15F1 /* well_scaned_page.hOCR */; };
3334
8FA2F9CE23919BEC8C64A5EA /* libPods-TestsProjectTests.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 5CD3C116A45C293ADAC81D1B /* libPods-TestsProjectTests.a */; };
3435
/* End PBXBuildFile section */
3536

@@ -91,6 +92,7 @@
9192
732C54771A5288CC000322DA /* Defaults.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Defaults.h; sourceTree = "<group>"; };
9293
732C54781A5288CC000322DA /* Defaults.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Defaults.m; sourceTree = "<group>"; };
9394
736EFF231A5872CA0031B432 /* tessdata-rus */ = {isa = PBXFileReference; lastKnownFileType = folder; name = "tessdata-rus"; path = "TestsProjectTests/tessdata-rus"; sourceTree = SOURCE_ROOT; };
95+
73BE4C301A5B404C002C15F1 /* well_scaned_page.hOCR */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = well_scaned_page.hOCR; sourceTree = "<group>"; };
9496
73C0A7BC1A59561F00D823D4 /* TesseractOCR.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; path = TesseractOCR.framework; sourceTree = BUILT_PRODUCTS_DIR; };
9597
/* End PBXFileReference section */
9698

@@ -204,6 +206,7 @@
204206
41C68DB61A41854A00848AE1 /* Images */ = {
205207
isa = PBXGroup;
206208
children = (
209+
73BE4C301A5B404C002C15F1 /* well_scaned_page.hOCR */,
207210
412E9EAA1A44316C007DDAA5 /* well_scaned_page.png */,
208211
41C68DB21A41849100848AE1 /* image_sample.jpg */,
209212
41C68DB41A41854600848AE1 /* image_sample_tr.png */,
@@ -314,6 +317,7 @@
314317
41C68DB51A41854600848AE1 /* image_sample_tr.png in Resources */,
315318
41184B591A3EFD41007F5923 /* tessdata in Resources */,
316319
736EFF271A5882730031B432 /* tessdata-rus in Resources */,
320+
73BE4C311A5B404C002C15F1 /* well_scaned_page.hOCR in Resources */,
317321
412E9EAD1A45872A007DDAA5 /* image_sample_bl.png in Resources */,
318322
);
319323
runOnlyForDeploymentPostprocessing = 0;

0 commit comments

Comments
 (0)