Skip to content

Commit 72c39bb

Browse files
Merge pull request #45 from datalogics-josepha/ocr
Add new OCR sample.
2 parents 7201a36 + 1b0d86a commit 72c39bb

File tree

6 files changed

+199
-0
lines changed

6 files changed

+199
-0
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="utf-8" ?>
2+
<configuration>
3+
<startup>
4+
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7.2" />
5+
</startup>
6+
</configuration>
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Datalogics.PDFL;
4+
5+
/*
6+
* Runs OCR on the document, recognizing text found on its rasterized pages.
7+
*
8+
* Copyright (c) 2007-2025, Datalogics, Inc. All rights reserved.
9+
*
10+
*/
11+
12+
namespace OCRDocument
{
    /// <summary>
    /// Console sample that runs the OCR engine over every page of a PDF
    /// document and saves the processed document to a new file.
    /// </summary>
    class OCRDocument
    {
        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">
        /// Optional command-line arguments: <c>args[0]</c> overrides the input
        /// PDF path and <c>args[1]</c> overrides the output PDF path.
        /// </param>
        static void Main(string[] args)
        {
            Console.WriteLine("OCRDocument Sample:");

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                // Start from the sample defaults, then let the command line override them.
                String inputPath = Library.ResourceDirectory + "Sample_Input/scanned_images.pdf";
                String outputPath = "OCRDocument-out.pdf";

                if (args.Length >= 1)
                {
                    inputPath = args[0];
                }

                if (args.Length >= 2)
                {
                    outputPath = args[1];
                }

                Console.WriteLine("Input file: " + inputPath);
                Console.WriteLine("Writing output to: " + outputPath);

                OCRParams ocrParams = CreateOcrParams();

                // The engine and the document are both disposable; the document is
                // released first, then the engine, when the stacked usings exit.
                using (OCREngine engine = new OCREngine(ocrParams))
                using (Document document = new Document(inputPath))
                {
                    // Run recognition on each page in turn, disposing every page
                    // as soon as it has been processed.
                    for (int pageIndex = 0; pageIndex < document.NumPages; pageIndex++)
                    {
                        using (Page currentPage = document.GetPage(pageIndex))
                        {
                            currentPage.RecognizePageContents(document, engine);
                        }
                    }

                    document.Save(SaveFlags.Full, outputPath);
                }
            }
        }

        /// <summary>
        /// Builds the OCR parameters used by this sample.
        /// </summary>
        /// <returns>An <c>OCRParams</c> instance configured to detect English.</returns>
        private static OCRParams CreateOcrParams()
        {
            OCRParams parameters = new OCRParams();

            // OCRParams.Languages controls which languages the OCR engine attempts
            // to detect. By default the OCR engine searches for English.
            //
            // To detect additional languages, add more entries to the list, e.g.:
            //     new LanguageSetting(Language.Japanese, false)
            parameters.Languages = new List<LanguageSetting>
            {
                new LanguageSetting(Language.English, false)
            };

            // If the images in your document are not 300 dpi, specify their
            // resolution here. A correct resolution gives better OCR results,
            // especially with automatic image preprocessing.
            // parameters.Resolution = 600;

            return parameters;
        }
    }
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
4+
<PropertyGroup>
5+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
6+
<Platform Condition=" '$(Platform)' == '' ">x64</Platform>
7+
<ProjectGuid>{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}</ProjectGuid>
8+
<OutputType>Exe</OutputType>
9+
<RootNamespace>OCRDocument</RootNamespace>
10+
<AssemblyName>OCRDocument</AssemblyName>
11+
<TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
12+
<FileAlignment>512</FileAlignment>
13+
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
14+
<Deterministic>true</Deterministic>
15+
<NuGetPackageImportStamp>
16+
</NuGetPackageImportStamp>
17+
</PropertyGroup>
18+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
  <!-- Debug configuration: unoptimized build with full debug symbols. -->
  <PlatformTarget>x64</PlatformTarget>
  <DebugSymbols>true</DebugSymbols>
  <DebugType>full</DebugType>
  <Optimize>false</Optimize>
  <!-- Keep build output inside the project folder (as the Release
       configuration does with bin\Release\) rather than writing to a
       directory several levels above the sample. -->
  <OutputPath>bin\Debug\</OutputPath>
  <DefineConstants>DEBUG;TRACE</DefineConstants>
  <ErrorReport>prompt</ErrorReport>
  <WarningLevel>4</WarningLevel>
</PropertyGroup>
28+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
  <!-- Release configuration: pdb-only symbols with compiler optimizations
       enabled so release binaries are not built unoptimized. -->
  <PlatformTarget>x64</PlatformTarget>
  <DebugType>pdbonly</DebugType>
  <Optimize>true</Optimize>
  <OutputPath>bin\Release\</OutputPath>
  <DefineConstants>TRACE</DefineConstants>
  <ErrorReport>prompt</ErrorReport>
  <WarningLevel>4</WarningLevel>
</PropertyGroup>
37+
<ItemGroup>
38+
<PackageReference Include="Adobe.PDF.Library.LM.NETFramework">
39+
<Version>18.*</Version>
40+
</PackageReference>
41+
<Reference Include="System" />
42+
<Reference Include="System.Core" />
43+
<Reference Include="System.Xml.Linq" />
44+
<Reference Include="System.Data.DataSetExtensions" />
45+
<Reference Include="Microsoft.CSharp" />
46+
<Reference Include="System.Data" />
47+
<Reference Include="System.Net.Http" />
48+
<Reference Include="System.Xml" />
49+
</ItemGroup>
50+
<ItemGroup>
51+
<Compile Include="OCRDocument.cs" />
52+
<Compile Include="Properties\AssemblyInfo.cs" />
53+
</ItemGroup>
54+
<ItemGroup>
55+
<None Include="App.config" />
56+
</ItemGroup>
57+
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
58+
</Project>
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 16
4+
VisualStudioVersion = 16.0.33328.57
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCRDocument", "OCRDocument.csproj", "{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|x64 = Debug|x64
11+
Release|x64 = Release|x64
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.ActiveCfg = Debug|x64
15+
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.Build.0 = Debug|x64
16+
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.ActiveCfg = Release|x64
17+
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.Build.0 = Release|x64
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {CEA60573-4A7F-49A3-8EC5-6DCC54E2E30B}
24+
EndGlobalSection
25+
EndGlobal
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using System.Reflection;
2+
using System.Runtime.CompilerServices;
3+
using System.Runtime.InteropServices;
4+
5+
// General Information about an assembly is controlled through the following
6+
// set of attributes. Change these attribute values to modify the information
7+
// associated with an assembly.
8+
[assembly: AssemblyTitle("OCRDocument")]
9+
[assembly: AssemblyDescription("")]
10+
[assembly: AssemblyConfiguration("")]
11+
[assembly: AssemblyCompany("Datalogics, Inc.")]
12+
[assembly: AssemblyProduct("OCRDocument")]
13+
[assembly: AssemblyCopyright("Copyright © Datalogics 2019-2025")]
14+
[assembly: AssemblyTrademark("")]
15+
[assembly: AssemblyCulture("")]
16+
17+
// Setting ComVisible to false makes the types in this assembly not visible
18+
// to COM components. If you need to access a type in this assembly from
19+
// COM, set the ComVisible attribute to true on that type.
20+
[assembly: ComVisible(false)]
21+
22+
// The following GUID is for the ID of the typelib if this project is exposed to COM
23+
[assembly: Guid("a1a2f184-6250-4843-8d6b-3a72776dd27d")]
24+
25+
// Version information for an assembly consists of the following four values:
26+
//
27+
// Major Version
28+
// Minor Version
29+
// Build Number
30+
// Revision
31+
//
32+
[assembly: AssemblyVersion("1.0.0.0")]
33+
[assembly: AssemblyFileVersion("1.0.0.0")]

OpticalCharacterRecognition/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ Places recognized text behind the OCR images found on a PDF page.
33

44
## ***AddTextToImage***
55
Adds an image file to a PDF page, runs OCR on the image, and places the recognized text behind it.
6+
7+
## ***OCRDocument***
8+
Runs OCR on the document, recognizing text found on its rasterized pages.

0 commit comments

Comments
 (0)