diff --git a/README.md b/README.md index 7bf9ab0c..3e78db5f 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,19 @@ a pull request. While the majority of examples are fully in pure Go, there are a few examples that demonstrate additional functionality that requires CGO and external dependencies. Those examples are clarified by filename suffix "_cgo.go". +## Disclaimer + +**IMPORTANT:** The code examples provided in this repository are for educational and demonstration purposes only. They are provided "as is" without warranty of any kind, either express or implied. These examples are intended to help developers understand how to use the UniPDF library and may not be suitable for production environments without additional security hardening, error handling, and testing. + +UniDoc (the maintainers of this repository) shall not be held responsible for any risks, damages, or issues arising from the use of these code examples in production or any other environment. Users are solely responsible for reviewing, testing, and adapting the code to meet their specific requirements and security standards before deploying to production systems. 
+ +It is strongly recommended that you: +- Conduct thorough security reviews and testing +- Implement proper input validation and sanitization +- Add appropriate error handling and logging +- Follow security best practices for your specific use case +- Consult with security professionals when handling sensitive data + ## License codes UniPDF requires license codes to operate, there are two options: - Metered License API keys: Free ones can be obtained at https://cloud.unidoc.io diff --git a/go.mod b/go.mod index e4fb3906..26173fdd 100644 --- a/go.mod +++ b/go.mod @@ -5,10 +5,12 @@ go 1.23.0 require ( cloud.google.com/go/kms v1.18.5 github.com/ThalesIgnite/crypto11 v1.2.5 + github.com/anthonynsimon/bild v0.14.0 github.com/aws/aws-sdk-go v1.55.6 github.com/bmatcuk/doublestar v1.3.4 github.com/boombuler/barcode v1.0.2 github.com/gabriel-vasile/mimetype v1.4.8 + github.com/stefanhengl/gohocr v0.0.0-20171024154250-dde96807b100 github.com/trimmer-io/go-xmp v1.0.0 github.com/unidoc/globalsign-dss v0.0.0-20220330092912-b69d85b63736 github.com/unidoc/pkcs7 v0.3.0 diff --git a/go.sum b/go.sum index 18b74f2d..8e9b5563 100644 --- a/go.sum +++ b/go.sum @@ -24,6 +24,8 @@ github.com/adrg/sysfont v0.1.2/go.mod h1:6d3l7/BSjX9VaeXWJt9fcrftFaD/t7l11xgSywC github.com/adrg/xdg v0.3.0/go.mod h1:7I2hH/IT30IsupOpKZ5ue7/qNi3CoKzD6tL3HwpaRMQ= github.com/adrg/xdg v0.5.3 h1:xRnxJXne7+oWDatRhR1JLnvuccuIeCoBu2rtuLqQB78= github.com/adrg/xdg v0.5.3/go.mod h1:nlTsY+NNiCBGCK2tpm09vRqfVzrc2fLmXGpBLF0zlTQ= +github.com/anthonynsimon/bild v0.14.0 h1:IFRkmKdNdqmexXHfEU7rPlAmdUZ8BDZEGtGHDnGWync= +github.com/anthonynsimon/bild v0.14.0/go.mod h1:hcvEAyBjTW69qkKJTfpcDQ83sSZHxwOunsseDfeQhUs= github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0= @@ -109,6 +111,8 @@ github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stefanhengl/gohocr v0.0.0-20171024154250-dde96807b100 h1:hF/ZvwhZFjvAXLTKinLJZwFf7ajPZp+LUyDc+qtoVzM= +github.com/stefanhengl/gohocr v0.0.0-20171024154250-dde96807b100/go.mod h1:cPiGn9y/mCPkH6dScOMVru1KnTdtzh/2DvvJrFDS7Sc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= diff --git a/ocr/README.md b/ocr/README.md new file mode 100644 index 00000000..efb7ea1b --- /dev/null +++ b/ocr/README.md @@ -0,0 +1,16 @@ +# PDF OCR Examples + +UniPDF supports integration with HTTP-based OCR (Optical Character Recognition) services to extract text from images and scanned PDF documents. These examples demonstrate how to configure and use OCR services to process images and reconstruct searchable PDFs from scanned documents. + +The OCR functionality works by sending images to a configured HTTP endpoint that performs text recognition and returns the results in various formats including plain text and HOCR (HTML-based OCR format). + +## Examples + +- [hocr_sample.go](hocr_sample.go) illustrates how to process HOCR formatted OCR output, parsing word-level information including bounding boxes and confidence scores. +- [ocr_batch.go](ocr_batch.go) shows how to perform batch OCR processing on multiple images concurrently, with error handling and summary reporting. +- [ocr_sample.go](ocr_sample.go) demonstrates basic OCR usage by sending a single image to an HTTP OCR service and extracting the text content. 
+- [reconstruct_pdf_from_hocr.go](reconstruct_pdf_from_hocr.go) demonstrates a complete workflow to extract images from a PDF, perform OCR with HOCR output, parse the structured results, and reconstruct a searchable PDF with properly positioned text.
+
+## Requirements
+
+These examples require an HTTP OCR service running on `http://localhost:8080/file`. The examples are created using [unidoc/ocrserver](https://github.com/unidoc/ocrserver) as the OCR service. However, UniPDF's OCR API is designed to be flexible and should support other OCR services that accept image uploads via multipart form data and return text or HOCR formatted results.
diff --git a/ocr/hocr_sample.go b/ocr/hocr_sample.go
new file mode 100644
index 00000000..5773a91b
--- /dev/null
+++ b/ocr/hocr_sample.go
@@ -0,0 +1,110 @@
+/**
+ * This is a sample Go program that demonstrates how to use the UniPDF library
+ * to perform OCR on an image using an HTTP OCR service that returns HOCR formatted
+ * output. The program parses the HOCR response and extracts word-level information
+ * including bounding boxes and confidence scores.
+ *
+ * This example uses https://github.com/unidoc/ocrserver as the OCR service.
+ * However, UniPDF's OCR API is designed to support other OCR services that accept
+ * image uploads via HTTP and return text or HOCR formatted results.
+ *
+ * Run as: go run hocr_sample.go input.jpg
+ */
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"strconv"
+
+	"github.com/stefanhengl/gohocr"
+	"github.com/unidoc/unipdf/v4/common/license"
+	"github.com/unidoc/unipdf/v4/ocr"
+)
+
+// init registers the metered license key taken from the UNIDOC_LICENSE_API_KEY
+// environment variable and panics if license.SetMeteredKey returns an error.
+func init() {
+	// Make sure to load your metered License API key prior to using the library.
+	// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io
+	err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`))
+	if err != nil {
+		panic(err)
+	}
+}
+
+// main sends the image named by the first command-line argument to the OCR
+// service with format=hocr, then prints each word gohocr parses from the reply.
+// Any failure is reported on stdout and ends the process with exit status 1.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Printf("Usage: go run hocr_sample.go input.jpg\n")
+		os.Exit(1)
+	}
+
+	f, err := os.Open(os.Args[1])
+	if err != nil {
+		fmt.Printf("Error opening file: %v\n", err)
+		os.Exit(1)
+	}
+	defer f.Close()
+
+	// Configure OCR service options.
+	opts := ocr.OCROptions{
+		Url:           "http://localhost:8080/file",
+		Method:        "POST",
+		FileFieldName: "file",
+		Headers: map[string]string{
+			"Accept": "application/json",
+		},
+		FormFields: map[string]string{
+			"format": "hocr",
+		},
+		TimeoutSeconds: 30,
+	}
+
+	// Create OCR client.
+	client := ocr.NewHTTPOCRService(opts)
+
+	result, err := client.ExtractText(context.Background(), f, "image.jpg")
+	if err != nil {
+		fmt.Printf("Error extracting text: %v\n", err)
+		os.Exit(1)
+	}
+
+	// Parse JSON response to extract the "result" field.
+	var jsonObj map[string]any
+	if err := json.Unmarshal(result, &jsonObj); err != nil {
+		fmt.Printf("Error parsing JSON response: %v\n", err)
+		os.Exit(1)
+	}
+
+	content, ok := jsonObj["result"].(string)
+	if !ok {
+		fmt.Printf("Error: result field is not a string\n")
+		os.Exit(1)
+	}
+	fmt.Printf("Extracted text: %s\n", content)
+
+	// NOTE(review): strconv.Unquote only accepts a quoted Go string literal, so
+	// this assumes the service wraps the hOCR markup in quotes — confirm.
+	content, err = strconv.Unquote(content)
+	if err != nil {
+		fmt.Printf("Error unquoting content: %v\n", err)
+		os.Exit(1)
+	}
+
+	data, err := gohocr.Parse([]byte(content))
+	if err != nil {
+		fmt.Printf("Error parsing HOCR data: %v\n", err)
+		os.Exit(1)
+	}
+
+	for _, v := range data.Words {
+		// %v prints Title whatever its type; the previous %f verb is only valid
+		// for floats and go vet rejects it for anything else.
+		fmt.Printf("Word: %s, Title: %v\n", v.Content, v.Title)
+	}
+}
diff --git a/ocr/ocr_batch.go b/ocr/ocr_batch.go
new file mode 100644
index 00000000..87deb620
--- /dev/null
+++ b/ocr/ocr_batch.go
@@ -0,0 +1,107 @@
+/**
+ * This is a sample Go program that demonstrates how to use the UniPDF library
+ * to perform batch OCR processing on multiple images using an HTTP OCR service.
+ * The program processes multiple image files concurrently and displays the extracted
+ * text results along with a summary of successful and failed operations.
+ *
+ * This example uses https://github.com/unidoc/ocrserver as the OCR service.
+ * However, UniPDF's OCR API is designed to support other OCR services that accept
+ * image uploads via HTTP and return text or HOCR formatted results.
+ *
+ * Run as: go run ocr_batch.go image1.jpg image2.png [image3.jpg ...]
+ */
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/unidoc/unipdf/v4/common/license"
+	"github.com/unidoc/unipdf/v4/ocr"
+)
+
+// init registers the metered license key taken from the UNIDOC_LICENSE_API_KEY
+// environment variable and panics if license.SetMeteredKey returns an error.
+func init() {
+	// Make sure to load your metered License API key prior to using the library.
+	// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io
+	err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`))
+	if err != nil {
+		panic(err)
+	}
+}
+
+// main runs OCR over every image path given on the command line in one batch
+// call, prints the text or error for each file, and ends with a summary.
+// Input paths that cannot be stat'ed abort the run with exit status 1.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Printf("Usage: go run ocr_batch.go image1.jpg image2.png [image3.jpg ...]\n")
+		os.Exit(1)
+	}
+
+	// Get list of image files from command line arguments
+	filePaths := os.Args[1:]
+
+	// Validate every path up front; any stat failure (not only "not exist") aborts.
+	for _, filePath := range filePaths {
+		if _, err := os.Stat(filePath); err != nil {
+			if os.IsNotExist(err) {
+				fmt.Printf("Error: File does not exist: %s\n", filePath)
+			} else {
+				fmt.Printf("Error: cannot access %s: %v\n", filePath, err)
+			}
+			os.Exit(1)
+		}
+	}
+
+	// Configure OCR service options.
+	opts := ocr.OCROptions{
+		Url:           "http://localhost:8080/file",
+		Method:        "POST",
+		FileFieldName: "file",
+		Headers: map[string]string{
+			"Accept": "application/json",
+		},
+		TimeoutSeconds: 30,
+	}
+
+	// Create OCR client.
+	client := ocr.NewOCRHTTPClient(opts)
+
+	fmt.Printf("Processing %d files...\n", len(filePaths))
+
+	// Batch process files.
+	results, errs := client.BatchProcessFiles(context.Background(), filePaths)
+
+	// Display results; results[i] and errs[i] are read as the outcome of filePaths[i].
+	for i, filePath := range filePaths {
+		filename := filepath.Base(filePath)
+		fmt.Printf("\n--- Results for %s ---\n", filename)
+
+		if errs[i] != nil {
+			fmt.Printf("Error processing %s: %s\n", filename, errs[i])
+			continue
+		}
+
+		fmt.Printf("Extracted text from %s:\n%s\n", filename, string(results[i]))
+	}
+
+	// Summary
+	successCount := 0
+	errorCount := 0
+	for _, err := range errs {
+		if err != nil {
+			errorCount++
+		} else {
+			successCount++
+		}
+	}
+
+	fmt.Printf("\n--- Summary ---\n")
+	fmt.Printf("Successfully processed: %d files\n", successCount)
+	fmt.Printf("Failed to process: %d files\n", errorCount)
+	fmt.Printf("Total files: %d\n", len(filePaths))
+}
diff --git a/ocr/ocr_sample.go b/ocr/ocr_sample.go
new file mode 100644
index 00000000..4a2aae69
--- /dev/null
+++ b/ocr/ocr_sample.go
@@ -0,0 +1,71 @@
+/**
+ * This is a sample Go program that demonstrates how to use the UniPDF library
+ * to perform OCR on an image using an HTTP OCR service. The program sends an image
+ * to the configured OCR endpoint and displays the extracted text.
+ *
+ * This example uses https://github.com/unidoc/ocrserver as the OCR service.
+ * However, UniPDF's OCR API is designed to support other OCR services that accept
+ * image uploads via HTTP and return text or HOCR formatted results.
+ *
+ * Run as: go run ocr_sample.go input.jpg
+ */
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/unidoc/unipdf/v4/common/license"
+	"github.com/unidoc/unipdf/v4/ocr"
+)
+
+// init registers the metered license key taken from the UNIDOC_LICENSE_API_KEY
+// environment variable and panics if license.SetMeteredKey returns an error.
+func init() {
+	// Make sure to load your metered License API key prior to using the library.
+	// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io
+	err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`))
+	if err != nil {
+		panic(err)
+	}
+}
+
+// main sends the image named by the first command-line argument to the OCR
+// service and prints the bytes that ExtractText returns as text.
+// Any failure is reported on stdout and ends the process with exit status 1.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Printf("Usage: go run ocr_sample.go input.jpg\n")
+		os.Exit(1)
+	}
+
+	f, err := os.Open(os.Args[1])
+	if err != nil {
+		fmt.Printf("Error opening file: %v\n", err)
+		os.Exit(1)
+	}
+	defer f.Close()
+
+	// Configure OCR service options.
+	opts := ocr.OCROptions{
+		Url:           "http://localhost:8080/file",
+		Method:        "POST",
+		FileFieldName: "file",
+		Headers: map[string]string{
+			"Accept": "application/json",
+		},
+		TimeoutSeconds: 30,
+	}
+
+	// Create OCR client.
+	client := ocr.NewHTTPOCRService(opts)
+
+	result, err := client.ExtractText(context.Background(), f, "image.jpg")
+	if err != nil {
+		fmt.Printf("Error extracting text: %v\n", err)
+		os.Exit(1)
+	}
+
+	fmt.Printf("Extracted text: %s\n", string(result))
+}
diff --git a/ocr/reconstruct_pdf_from_hocr.go b/ocr/reconstruct_pdf_from_hocr.go
new file mode 100644
index 00000000..89150bd8
--- /dev/null
+++ b/ocr/reconstruct_pdf_from_hocr.go
@@ -0,0 +1,468 @@
+/**
+ * This is a sample Go program that demonstrates how to use the UniPDF library
+ * to extract text from within images in a PDF using an OCR service that returns
+ * HOCR formatted output then writes the reconstructed text to a new PDF.
+ *
+ * This example uses https://github.com/unidoc/ocrserver as the OCR service.
+ * However, UniPDF's OCR API is designed to support other OCR services that accept
+ * image uploads via HTTP and return text or HOCR formatted results.
+ *
+ * Run as: go run reconstruct_pdf_from_hocr.go input.pdf
+ */
+package main
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"encoding/xml"
+	"fmt"
+	"image"
+	"image/jpeg"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/anthonynsimon/bild/transform"
+	"github.com/unidoc/unipdf/v4/common/license"
+	"github.com/unidoc/unipdf/v4/creator"
+	"github.com/unidoc/unipdf/v4/extractor"
+	"github.com/unidoc/unipdf/v4/model"
+	"github.com/unidoc/unipdf/v4/ocr"
+)
+
+// BBox holds the four integers of an hOCR `bbox x0 y0 x1 y1` title property.
+type BBox struct {
+	X0 int
+	Y0 int
+	X1 int
+	Y1 int
+}
+
+// Baseline holds the two numbers of an hOCR `baseline` title property.
+type Baseline struct {
+	Slope  float64
+	Offset float64
+}
+
+// TitleAttributes is a parsed hOCR title attribute; BBox and Baseline are nil when absent.
+type TitleAttributes struct {
+	BBox        *BBox
+	Baseline    *Baseline
+	XSize       float64
+	XDescenders float64
+	XAscenders  float64
+	XWConf      int    // Word confidence (0-100)
+	PageNo      int    // Page number
+	Image       string // Image path/name
+}
+
+// OCRWord represents a word element in hOCR.
+type OCRWord struct {
+	XMLName xml.Name `xml:"span"`
+	Class   string   `xml:"class,attr"`
+	ID      string   `xml:"id,attr"`
+	Title   string   `xml:"title,attr"`
+	Content string   `xml:",chardata"`
+}
+
+// OCRLine represents a line element in hOCR.
+type OCRLine struct {
+	XMLName xml.Name  `xml:"span"`
+	Class   string    `xml:"class,attr"`
+	ID      string    `xml:"id,attr"`
+	Title   string    `xml:"title,attr"`
+	Words   []OCRWord `xml:"span"`
+}
+
+// OCRPar represents a paragraph element in hOCR.
+type OCRPar struct {
+	XMLName xml.Name  `xml:"p"`
+	Class   string    `xml:"class,attr"`
+	ID      string    `xml:"id,attr"`
+	Lang    string    `xml:"lang,attr"`
+	Title   string    `xml:"title,attr"`
+	Lines   []OCRLine `xml:"span"`
+}
+
+// OCRCArea represents a column area element in hOCR.
+type OCRCArea struct {
+	XMLName xml.Name `xml:"div"`
+	Class   string   `xml:"class,attr"`
+	ID      string   `xml:"id,attr"`
+	Title   string   `xml:"title,attr"`
+	Pars    []OCRPar `xml:"p"`
+}
+
+// OCRPage represents a page element in hOCR.
+type OCRPage struct {
+	XMLName xml.Name   `xml:"div"`
+	Class   string     `xml:"class,attr"`
+	ID      string     `xml:"id,attr"`
+	Title   string     `xml:"title,attr"`
+	CAreas  []OCRCArea `xml:"div"`
+}
+
+// HOCRDocument represents the root hOCR document structure.
+type HOCRDocument struct {
+	XMLName xml.Name  `xml:"div"`
+	Pages   []OCRPage `xml:"div"`
+}
+
+// Compiled once at package scope: ParseTitleAttributes runs for every page,
+// area, line and word, so compiling these per call would be wasted work.
+var (
+	bboxRe        = regexp.MustCompile(`bbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)`)
+	baselineRe    = regexp.MustCompile(`baseline\s+([-\d.]+)\s+([-\d.]+)`)
+	xSizeRe       = regexp.MustCompile(`x_size\s+([\d.]+)`)
+	xDescendersRe = regexp.MustCompile(`x_descenders\s+([\d.]+)`)
+	xAscendersRe  = regexp.MustCompile(`x_ascenders\s+([\d.]+)`)
+	xWConfRe      = regexp.MustCompile(`x_wconf\s+(\d+)`)
+	pagenoRe      = regexp.MustCompile(`ppageno\s+(\d+)`)
+	imageRe       = regexp.MustCompile(`image\s+"([^"]*)"`)
+)
+
+// ParseTitleAttributes parses an hOCR title attribute string, for example
+// `bbox 36 92 580 122; x_wconf 95`, into structured data.
+//
+// Properties that are absent keep their zero value, so BBox and Baseline are
+// nil when the title has no bbox / baseline entry. Conversion errors from
+// strconv are ignored; a malformed match such as "1.2.3" yields 0.
+func ParseTitleAttributes(title string) *TitleAttributes {
+	attrs := &TitleAttributes{}
+
+	if m := bboxRe.FindStringSubmatch(title); m != nil {
+		x0, _ := strconv.Atoi(m[1])
+		y0, _ := strconv.Atoi(m[2])
+		x1, _ := strconv.Atoi(m[3])
+		y1, _ := strconv.Atoi(m[4])
+		attrs.BBox = &BBox{X0: x0, Y0: y0, X1: x1, Y1: y1}
+	}
+
+	if m := baselineRe.FindStringSubmatch(title); m != nil {
+		slope, _ := strconv.ParseFloat(m[1], 64)
+		offset, _ := strconv.ParseFloat(m[2], 64)
+		attrs.Baseline = &Baseline{Slope: slope, Offset: offset}
+	}
+
+	if m := xSizeRe.FindStringSubmatch(title); m != nil {
+		attrs.XSize, _ = strconv.ParseFloat(m[1], 64)
+	}
+
+	if m := xDescendersRe.FindStringSubmatch(title); m != nil {
+		attrs.XDescenders, _ = strconv.ParseFloat(m[1], 64)
+	}
+
+	if m := xAscendersRe.FindStringSubmatch(title); m != nil {
+		attrs.XAscenders, _ = strconv.ParseFloat(m[1], 64)
+	}
+
+	if m := xWConfRe.FindStringSubmatch(title); m != nil {
+		attrs.XWConf, _ = strconv.Atoi(m[1])
+	}
+
+	if m := pagenoRe.FindStringSubmatch(title); m != nil {
+		attrs.PageNo, _ = strconv.Atoi(m[1])
+	}
+
+	if m := imageRe.FindStringSubmatch(title); m != nil {
+		attrs.Image = m[1]
+	}
+
+	return attrs
+}
+
+// GetText returns the text content from a word, trimming whitespace.
+func (w *OCRWord) GetText() string {
+	return strings.TrimSpace(w.Content)
+}
+
+// GetAttributes returns parsed title attributes for the word.
+func (w *OCRWord) GetAttributes() *TitleAttributes {
+	return ParseTitleAttributes(w.Title)
+}
+
+// GetText returns the concatenated text content from all words in the line.
+func (l *OCRLine) GetText() string {
+	var text strings.Builder
+	for i, word := range l.Words {
+		if i > 0 {
+			text.WriteString(" ")
+		}
+		text.WriteString(word.GetText())
+	}
+	return text.String()
+}
+
+// GetAttributes returns parsed title attributes for the line.
+func (l *OCRLine) GetAttributes() *TitleAttributes {
+	return ParseTitleAttributes(l.Title)
+}
+
+// GetText returns the concatenated text content from all lines in the paragraph.
+func (p *OCRPar) GetText() string {
+	var text strings.Builder
+	for i, line := range p.Lines {
+		if i > 0 {
+			text.WriteString("\n")
+		}
+		text.WriteString(line.GetText())
+	}
+	return text.String()
+}
+
+// GetAttributes returns parsed title attributes for the paragraph.
+func (p *OCRPar) GetAttributes() *TitleAttributes {
+	return ParseTitleAttributes(p.Title)
+}
+
+// GetText returns the text of every paragraph in the area, joined by blank lines.
+func (c *OCRCArea) GetText() string {
+	parts := make([]string, 0, len(c.Pars))
+	for i := range c.Pars {
+		parts = append(parts, c.Pars[i].GetText())
+	}
+	return strings.Join(parts, "\n\n")
+}
+
+// GetAttributes returns parsed title attributes for the column area.
+func (c *OCRCArea) GetAttributes() *TitleAttributes {
+	return ParseTitleAttributes(c.Title)
+}
+
+// GetText returns the text of every column area on the page, joined by blank lines.
+func (p *OCRPage) GetText() string {
+	parts := make([]string, 0, len(p.CAreas))
+	for i := range p.CAreas {
+		parts = append(parts, p.CAreas[i].GetText())
+	}
+	return strings.Join(parts, "\n\n")
+}
+
+// GetAttributes returns parsed title attributes for the page.
+func (p *OCRPage) GetAttributes() *TitleAttributes {
+	return ParseTitleAttributes(p.Title)
+}
+
+// init registers the metered license key taken from the UNIDOC_LICENSE_API_KEY
+// environment variable and panics if license.SetMeteredKey returns an error.
+func init() {
+	// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io
+	if err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)); err != nil {
+		panic(err)
+	}
+
+	// common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
+}
+
+// main OCRs every image embedded in the PDF named by the first argument and
+// writes one reconstructed PDF per image into the "output" directory.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Printf("Usage: go run reconstruct_pdf_from_hocr.go input.pdf\n")
+		os.Exit(1)
+	}
+
+	images, err := loadImages(os.Args[1])
+	if err != nil {
+		fmt.Printf("Error loading images: %v\n", err)
+		os.Exit(1)
+	}
+
+	outDir := "output"
+	if err := os.MkdirAll(outDir, 0755); err != nil {
+		fmt.Printf("Error creating output directory: %v\n", err)
+		os.Exit(1)
+	}
+
+	for pageNum, imgList := range images {
+		fmt.Printf("Processing images on page %d\n", pageNum+1)
+		for imgNum, img := range imgList {
+			ocrPage, err := processImage(img)
+			if err != nil {
+				fmt.Printf("Error processing image on page %d: %s\n", pageNum+1, err)
+				continue
+			}
+
+			// The first image keeps the page_N.pdf name; later images on the
+			// same page get an index suffix so they no longer overwrite it.
+			outputPath := fmt.Sprintf("%s/page_%d.pdf", outDir, pageNum+1)
+			if imgNum > 0 {
+				outputPath = fmt.Sprintf("%s/page_%d_%d.pdf", outDir, pageNum+1, imgNum+1)
+			}
+			if err := writeContentAsPDF(ocrPage, outputPath); err != nil {
+				fmt.Printf("Error writing %s: %s\n", outputPath, err)
+			}
+		}
+	}
+}
+
+// loadImages extracts the images of every page of the PDF at inputPath.
+// Element i of the result holds the images of page i+1, each rotated by the
+// page's Rotate value; a page whose Rotate is nil is treated as rotation 0.
+func loadImages(inputPath string) ([][]image.Image, error) {
+	pdfReader, f, err := model.NewPdfReaderFromFile(inputPath, nil)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	numPages, err := pdfReader.GetNumPages()
+	if err != nil {
+		return nil, err
+	}
+
+	fmt.Print("Loading images from PDF document")
+	result := make([][]image.Image, 0, numPages)
+	totalImages := 0
+	for i := 0; i < numPages; i++ {
+		page, err := pdfReader.GetPage(i + 1)
+		if err != nil {
+			return nil, err
+		}
+
+		pextract, err := extractor.New(page)
+		if err != nil {
+			return nil, err
+		}
+
+		pimages, err := pextract.ExtractPageImages(nil)
+		if err != nil {
+			return nil, err
+		}
+
+		// page.Rotate is a pointer; dereferencing it unguarded panics when nil.
+		var rotation float64
+		if page.Rotate != nil {
+			rotation = float64(*page.Rotate)
+		}
+
+		pageImages := make([]image.Image, 0, len(pimages.Images))
+		for _, img := range pimages.Images {
+			goImg, err := img.Image.ToGoImage()
+			if err != nil {
+				return nil, err
+			}
+			pageImages = append(pageImages, transform.Rotate(goImg, rotation, nil))
+		}
+		result = append(result, pageImages)
+		totalImages += len(pimages.Images)
+		fmt.Print(".")
+	}
+
+	fmt.Println(" Done")
+	fmt.Printf("Total: %d images\n", totalImages)
+
+	return result, nil
+}
+
+// processImage JPEG-encodes img, uploads it to the OCR service with format=hocr
+// and decodes the hOCR markup found in the reply's "result" field.
+func processImage(img image.Image) (*OCRPage, error) {
+	var buf bytes.Buffer
+	if err := jpeg.Encode(&buf, img, nil); err != nil {
+		return nil, err
+	}
+
+	// Configure OCR service options.
+	opts := ocr.OCROptions{
+		Url:           "http://localhost:8080/file",
+		Method:        "POST",
+		FileFieldName: "file",
+		Headers: map[string]string{
+			"Accept": "application/json",
+		},
+		FormFields: map[string]string{
+			"format": "hocr",
+		},
+		TimeoutSeconds: 30,
+	}
+
+	client := ocr.NewHTTPOCRService(opts)
+	result, err := client.ExtractText(context.Background(), bytes.NewReader(buf.Bytes()), "image.jpg")
+	if err != nil {
+		return nil, fmt.Errorf("error extracting text: %w", err)
+	}
+
+	// Parse JSON response to extract the "result" field.
+	var jsonObj map[string]any
+	if err := json.Unmarshal(result, &jsonObj); err != nil {
+		return nil, fmt.Errorf("error parsing JSON response: %w", err)
+	}
+
+	content, ok := jsonObj["result"].(string)
+	if !ok {
+		return nil, fmt.Errorf("result field is not a string")
+	}
+
+	// Parse hOCR HTML content
+	var ocrPage OCRPage
+	if err := xml.Unmarshal([]byte(content), &ocrPage); err != nil {
+		return nil, fmt.Errorf("error unmarshalling HOCR data: %w", err)
+	}
+
+	return &ocrPage, nil
+}
+
+// writeContentAsPDF draws the words of p on one PDF page sized to the page's
+// bbox and saves it to outputPath. It writes nothing and returns nil when p has
+// no column areas, fails if the page has no bbox, and skips bbox-less areas/words.
+func writeContentAsPDF(p *OCRPage, outputPath string) error {
+	if len(p.CAreas) == 0 {
+		fmt.Println("CAreas empty")
+		return nil
+	}
+
+	pt := p.GetAttributes()
+	if pt.BBox == nil {
+		return fmt.Errorf("page title %q has no bbox", p.Title)
+	}
+
+	c := creator.New()
+	c.SetPageSize(creator.PageSize{float64(pt.BBox.X1), float64(pt.BBox.Y1)})
+	c.NewPage()
+
+	for _, a := range p.CAreas {
+		at := a.GetAttributes()
+		if at.BBox == nil {
+			continue
+		}
+
+		adiv := c.NewDivision()
+		adiv.SetMargins(float64(at.BBox.X0)-float64(pt.BBox.X0),
+			float64(pt.BBox.Y1)-float64(at.BBox.Y1),
+			float64(pt.BBox.X1)-float64(at.BBox.X1),
+			float64(at.BBox.Y0)-float64(pt.BBox.Y0),
+		)
+
+		for _, par := range a.Pars {
+			for _, l := range par.Lines {
+				lt := l.GetAttributes()
+				for _, w := range l.Words {
+					wt := w.GetAttributes()
+					if wt.BBox == nil {
+						continue
+					}
+					sp := c.NewStyledParagraph()
+					sp.SetPos(float64(wt.BBox.X0), float64(wt.BBox.Y1))
+					sp.SetMargins(0, float64(pt.BBox.X1)-float64(wt.BBox.X1), 0, float64(pt.BBox.Y1)-float64(wt.BBox.Y1))
+					sp.SetFontSize(lt.XSize)
+					sp.SetText(w.GetText())
+					adiv.Add(sp)
+				}
+			}
+		}
+
+		if err := c.Draw(adiv); err != nil {
+			return err
+		}
+	}
+
+	if err := c.WriteToFile(outputPath); err != nil {
+		return err
+	}
+
+	fmt.Println("Saved reconstructed PDF to", outputPath)
+	return nil
+}