Skip to content

Commit 6b0c0b7

Browse files
rchennafi and jenkins authored
Added updates for latest PDFTronGo lib update. (#38)
Co-authored-by: jenkins <build@apryse.com>
1 parent cd5a1d7 commit 6b0c0b7

File tree

10 files changed

+63
-0
lines changed

10 files changed

+63
-0
lines changed

samples/DataExtractionTest/DataExtraction_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,60 @@ func GenericKeyValueTest() (err error) {
302302
return nil
303303
}
304304

305+
//---------------------------------------------------------------------------------------
306+
// The following sample illustrates how to extract document classes from PDF documents.
307+
//---------------------------------------------------------------------------------------
308+
309+
// DocClassifierTest runs the document-classification sample: it classifies three
// input PDFs (Invoice.pdf, Scientific_Publication.pdf, Email.pdf) with the
// DataExtractionModuleE_DocClassification engine and reports where each JSON
// result was saved.
//
// It returns nil both on success and when the add-on module is unavailable (in
// which case only an explanatory message is printed). The named result err is
// handed to the deferred catch helper.
// NOTE(review): catch is defined outside this view; presumably it recovers a
// panic raised by the SDK wrapper and stores it in err - confirm.
func DocClassifierTest() (err error) {
310+
defer catch(&err)
311+
312+
// Test if the add-on is installed; if it is not, print installation guidance
// and return without treating the missing module as an error.
313+
if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification) {
314+
fmt.Println("")
315+
fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
316+
fmt.Println("-----------------------------------------------------------------------------")
317+
fmt.Println("The Data Extraction suite is an optional add-on, available for download")
318+
fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
319+
fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
320+
fmt.Println("using the PDFNetAddResourceSearchPath() function.")
321+
fmt.Println("")
322+
return nil
323+
}
324+
325+
// Simple example: classify pages as a JSON file.
// The output path is passed to the SDK call itself, so no explicit write
// happens in this function.
326+
fmt.Println("Classify pages as a JSON file")
327+
328+
inputFile := inputPath + "Invoice.pdf"
329+
outputFile := outputPath + "Invoice_Classified.json"
330+
DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification)
331+
332+
fmt.Println("Result saved in " + outputFile)
333+
334+
// Classify pages as a JSON string.
// This call form takes no output path; its result is type-asserted to string
// and then written out with WriteTextToFile.
// NOTE(review): the local name json would shadow an imported encoding/json
// package inside this function, if the file imports one - confirm.
335+
fmt.Println("Classify pages as a JSON string")
336+
337+
inputFile = inputPath + "Scientific_Publication.pdf"
338+
outputFile = outputPath + "Scientific_Publication_Classified.json"
339+
json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocClassification).(string)
340+
WriteTextToFile(outputFile, json)
341+
342+
fmt.Println("Result saved in " + outputFile)
343+
344+
// Example with customized options: same file-output call form as the first
// example, with a DataExtractionOptions value passed as an extra argument.
345+
fmt.Println("Classify pages with customized options")
346+
347+
inputFile = inputPath + "Email.pdf"
348+
outputFile = outputPath + "Email_Classified.json"
349+
options := NewDataExtractionOptions()
350+
// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
351+
options.SetMinimumConfidenceThreshold(0.7)
352+
DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification, options)
353+
354+
fmt.Println("Result saved in " + outputFile)
355+
356+
return nil
357+
}
358+
305359
//---------------------------------------------------------------------------------------
306360

307361
func TestDataExtraction(t *testing.T) {
@@ -335,13 +389,22 @@ func TestDataExtraction(t *testing.T) {
335389
fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
336390
}
337391

392+
//-----------------------------------------------------------------------------------
393+
338394
err = GenericKeyValueTest()
339395
if err != nil {
340396
fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
341397
}
342398

343399
//-----------------------------------------------------------------------------------
344400

401+
err = DocClassifierTest()
402+
if err != nil {
403+
fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
404+
}
405+
406+
//-----------------------------------------------------------------------------------
407+
345408
PDFNetTerminate()
346409
fmt.Println("Done.")
347410
}

0 commit comments

Comments
 (0)