@@ -302,6 +302,60 @@ func GenericKeyValueTest() (err error) {
302302 return nil
303303}
304304
305+ //---------------------------------------------------------------------------------------
306+ // The following sample illustrates how to extract document classes from PDF documents.
307+ //---------------------------------------------------------------------------------------
308+
309+ func DocClassifierTest () (err error ) {
310+ defer catch (& err )
311+
312+ // Test if the add-on is installed
313+ if ! DataExtractionModuleIsModuleAvailable (DataExtractionModuleE_DocClassification ) {
314+ fmt .Println ("" )
315+ fmt .Println ("Unable to run Data Extraction: PDFTron SDK Structured Output module not available." )
316+ fmt .Println ("-----------------------------------------------------------------------------" )
317+ fmt .Println ("The Data Extraction suite is an optional add-on, available for download" )
318+ fmt .Println ("at https://docs.apryse.com/documentation/core/info/modules/. If you have already" )
319+ fmt .Println ("downloaded this module, ensure that the SDK is able to find the required files" )
320+ fmt .Println ("using the PDFNetAddResourceSearchPath() function." )
321+ fmt .Println ("" )
322+ return nil
323+ }
324+
325+ // Simple example: classify pages as a JSON file
326+ fmt .Println ("Classify pages as a JSON file" )
327+
328+ inputFile := inputPath + "Invoice.pdf"
329+ outputFile := outputPath + "Invoice_Classified.json"
330+ DataExtractionModuleExtractData (inputFile , outputFile , DataExtractionModuleE_DocClassification )
331+
332+ fmt .Println ("Result saved in " + outputFile )
333+
334+ // Classify pages as a JSON string
335+ fmt .Println ("Classify pages as a JSON string" )
336+
337+ inputFile = inputPath + "Scientific_Publication.pdf"
338+ outputFile = outputPath + "Scientific_Publication_Classified.json"
339+ json := DataExtractionModuleExtractData (inputFile , DataExtractionModuleE_DocClassification ).(string )
340+ WriteTextToFile (outputFile , json )
341+
342+ fmt .Println ("Result saved in " + outputFile )
343+
344+ // Example with customized options:
345+ fmt .Println ("Classify pages with customized options" )
346+
347+ inputFile = inputPath + "Email.pdf"
348+ outputFile = outputPath + "Email_Classified.json"
349+ options := NewDataExtractionOptions ()
350+ // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
351+ options .SetMinimumConfidenceThreshold (0.7 )
352+ DataExtractionModuleExtractData (inputFile , outputFile , DataExtractionModuleE_DocClassification , options )
353+
354+ fmt .Println ("Result saved in " + outputFile )
355+
356+ return nil
357+ }
358+
305359//---------------------------------------------------------------------------------------
306360
307361func TestDataExtraction (t * testing.T ) {
@@ -335,13 +389,22 @@ func TestDataExtraction(t *testing.T) {
335389 fmt .Println (fmt .Errorf ("Unable to extract form fields data, error: %s" , err ))
336390 }
337391
392+ //-----------------------------------------------------------------------------------
393+
338394 err = GenericKeyValueTest ()
339395 if err != nil {
340396 fmt .Println (fmt .Errorf ("Unable to extract key-value pairs, error: %s" , err ))
341397 }
342398
343399 //-----------------------------------------------------------------------------------
344400
401+ err = DocClassifierTest ()
402+ if err != nil {
403+ fmt .Println (fmt .Errorf ("Unable to extract document classifications, error: %s" , err ))
404+ }
405+
406+ //-----------------------------------------------------------------------------------
407+
345408 PDFNetTerminate ()
346409 fmt .Println ("Done." )
347410}
0 commit comments