Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ powered by the [UniPDF](https://github.com/unidoc/unipdf) PDF library.
- [Extract text from PDF files](#extract-text)
- [Extract images from PDF files](#extract-images)
- [Search text in PDF files](#search)
- [Replace text in PDF files](#replace)
- [Export PDF form fields as JSON](#form-export)
- [Fill PDF form fields from JSON file](#form-fill)
- [Fill PDF form fields from FDF file](#fdf-merge)
Expand All @@ -36,7 +37,7 @@ powered by the [UniPDF](https://github.com/unidoc/unipdf) PDF library.

## Installation

Minimum required Go version: 1.13. We officially support the 3 latest minor versions of Go, but it may work on earlier ones as well.
Minimum required Go version: 1.18. We officially support the 3 latest minor versions of Go, but it may work on earlier ones as well.

```
git clone git@github.com:unidoc/unipdf-cli.git
Expand Down Expand Up @@ -372,6 +373,26 @@ unipdf search input_file.pdf text_to_search
unipdf search -p pass input_file.pdf text_to_search
```

#### Replace

Replace text in PDF files.

```
unipdf replace [FLAG]... INPUT_FILE TEXT

Flags:
-o, --output-file string output file
-r, - replace-text string replacement text
-p, --password string PDF file password

Examples:
unipdf replace input_file.pdf text_to_search
unipdf replace -o output_file.pdf input_file.pdf text_to_search
unipdf replace -o output_file.pdf -r replacement_text input_file.pdf text_to_search
unipdf replace -o output_file.pdf -r replacement_text -p pass input_file.pdf text_to_search
```


#### Form Export

Export JSON representation of form fields.
Expand Down
74 changes: 74 additions & 0 deletions internal/cli/replace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/

package cli

import (
"errors"
"fmt"

"github.com/spf13/cobra"
"github.com/unidoc/unipdf-cli/pkg/pdf"
)

const replaceCmdDesc = `Replace text in PDF files`

var replaceCmdExample = fmt.Sprintf("%s\n%s\n%s\n%s\n",
fmt.Sprintf("%s replace input_file.pdf text_to_search", appName),
fmt.Sprintf("%s replace -o output_file input_file.pdf text_to_search", appName),
fmt.Sprintf("%s replace -o output_file -r new_text input_file.pdf text_to_search", appName),
fmt.Sprintf("%s replace -o output_file -r new_text -p pass input_file.pdf text_to_search", appName),
)

// replaceCmd represents the replace command.
var replaceCmd = &cobra.Command{
Use: "replace [FLAG]... INPUT_FILE TEXT",
Short: "Replace text in PDF files",
Long: replaceCmdDesc,
Example: replaceCmdExample,
DisableFlagsInUseLine: true,
Run: func(cmd *cobra.Command, args []string) {
// Parse input parameters.
inputPath := args[0]
text := args[1]
password, _ := cmd.Flags().GetString("password")

// Parse output file.
outputPath, _ := cmd.Flags().GetString("output-file")
if outputPath == "" {
outputPath = inputPath
}

// Parse replaceText.
replaceText, _ := cmd.Flags().GetString("replace-text")
if replaceText == "" {
replaceText = text
}

// Search text.
err := pdf.Replace(inputPath, outputPath, text, replaceText, password)
if err != nil {
printErr("Could not replace the specified text: %s\n", err)
}

fmt.Printf("Successfully replaced text %s with %s\n", text, replaceText)
fmt.Printf("Output file saved to %s\n", outputPath)
},
Args: func(_ *cobra.Command, args []string) error {
if len(args) < 2 {
return errors.New("must provide a PDF file and the text to search")
}

return nil
},
}

func init() {
rootCmd.AddCommand(replaceCmd)

replaceCmd.Flags().StringP("output-file", "o", "", "output file")
replaceCmd.Flags().StringP("replace-text", "r", "", "replacement text")
replaceCmd.Flags().StringP("password", "p", "", "input file password")
}
254 changes: 254 additions & 0 deletions pkg/pdf/replace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/

package pdf

import (
"strings"

"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
unipdf "github.com/unidoc/unipdf/v3/model"
)

type textChunk struct {
font *model.PdfFont
strObj *core.PdfObjectString
val string
idx int
}

func (tc *textChunk) encode() {
var encoded string
if font := tc.font; font != nil {
encodedBytes, numMisses := font.StringToCharcodeBytes(tc.val)
if numMisses != 0 {
common.Log.Debug("WARN: some runes could not be encoded.\n\t%s -> %v")
}
encoded = string(encodedBytes)
}

*tc.strObj = *core.MakeString(encoded)
}

type textChunks struct {
text string
chunks []*textChunk
}

func (tc *textChunks) replace(search, replacement string) {
text := tc.text
chunks := tc.chunks

// Steps:
// 1. Search for the first index of the search term in the text.
// 2. Use the found index to match the text chunk which contains
// (or partly contains) the search term.
// 3. Replace the search term in the found text chunk. The search term
// will not always start at the beginning of the text chunk. Also,
// the search term could be split in multiple text chunks. If that's
// the case, replace the portion of the search term in the found
// chunk and continue removing characters from the following chunks
// until the search term has been completely erased.
// 4. Offset the text chunks slice to the last processed text chunk from
// the previous step, if the text chunk was not completely erased, or
// to the next one otherwise. This is necessary so that the visited
// text chunks are skipped when searching for the next occurrence of the
// search term.
// 5. Discard the part of the text up to (and including) the index found
// in step one.
// 6. Move to step 1 in order to search for the search term in the remaining
// text.
var chunkOffset int
matchIdx := strings.Index(text, search)
for currMatchIdx := matchIdx; matchIdx != -1; {
for i, chunk := range chunks[chunkOffset:] {
idx, lenChunk := chunk.idx, len(chunk.val)
if currMatchIdx < idx || currMatchIdx > idx+lenChunk-1 {
continue
}
chunkOffset += i + 1

start := currMatchIdx - idx
remaining := len(search) - (lenChunk - start)

replaceVal := chunk.val[:start] + replacement
if remaining < 0 {
replaceVal += chunk.val[lenChunk+remaining:]
chunkOffset--
}

chunk.val = replaceVal
chunk.encode()

for j := chunkOffset; remaining > 0; j++ {
c := chunks[j]
l := len(c.val)

if l > remaining {
c.val = c.val[remaining:]
} else {
c.val = ""
chunkOffset++
}

c.encode()
remaining -= l
}

break
}

text = text[matchIdx+1:]
matchIdx = strings.Index(text, search)
currMatchIdx += matchIdx + 1
}

tc.text = strings.Replace(tc.text, search, replacement, -1)
}

// Replace searches the provided text in the PDF file specified by the inputPath
// parameter and replaces it by the newText. A password can be passed in for encrypted input files.
// The result is saved to outputPath.
func Replace(inputPath, outputPath, text, replaceText, password string) error {
// Read input file.
r, pages, _, _, err := readPDF(inputPath, password)
if err != nil {
return err
}

w := unipdf.NewPdfWriter()

// Search specified text.
for i := 0; i < pages; i++ {
// Get page.
numPage := i + 1

page, err := r.GetPage(numPage)
if err != nil {
return err
}

err = searchReplacePageText(page, text, replaceText)
if err != nil {
return err
}

err = w.AddPage(page)
if err != nil {
return err
}
}

// Write output file.
safe := inputPath == outputPath
return writePDF(outputPath, &w, safe)
}

func searchReplacePageText(page *model.PdfPage, searchText, replaceText string) error {
contents, err := page.GetAllContentStreams()
if err != nil {
return err
}

ops, err := contentstream.NewContentStreamParser(contents).Parse()
if err != nil {
return err
}

// Generate text chunks.
var currFont *model.PdfFont
tc := textChunks{}

textProcFunc := func(objptr *core.PdfObject) {
strObj, ok := core.GetString(*objptr)
if !ok {
common.Log.Debug("Invalid parameter, skipping")
return
}

str := strObj.String()
if currFont != nil {
decoded, _, numMisses := currFont.CharcodeBytesToUnicode(strObj.Bytes())
if numMisses != 0 {
common.Log.Debug("WARN: some charcodes could not be decoded.\n\t%v -> %s", strObj.Bytes(), decoded)
}
str = decoded
}

tc.chunks = append(tc.chunks, &textChunk{
font: currFont,
strObj: strObj,
val: str,
idx: len(tc.text),
})
tc.text += str
}

processor := contentstream.NewContentStreamProcessor(*ops)
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
func(op *contentstream.ContentStreamOperation, _ contentstream.GraphicsState, resources *model.PdfPageResources) error {
switch op.Operand {
case `Tj`, `'`:
if len(op.Params) != 1 {
common.Log.Debug("Invalid: Tj/' with invalid set of parameters - skip")
return nil
}
textProcFunc(&op.Params[0])
case `''`:
if len(op.Params) != 3 {
common.Log.Debug("Invalid: '' with invalid set of parameters - skip")
return nil
}
textProcFunc(&op.Params[3])
case `TJ`:
if len(op.Params) != 1 {
common.Log.Debug("Invalid: TJ with invalid set of parameters - skip")
return nil
}
arr, _ := core.GetArray(op.Params[0])
for i := range arr.Elements() {
obj := arr.Get(i)
textProcFunc(&obj)
arr.Set(i, obj)
}
case "Tf":
if len(op.Params) != 2 {
common.Log.Debug("Invalid: Tf with invalid set of parameters - skip")
return nil
}

fname, ok := core.GetName(op.Params[0])
if !ok || fname == nil {
common.Log.Debug("ERROR: could not get font name")
return nil
}

fObj, has := resources.GetFontByName(*fname)
if !has {
common.Log.Debug("ERROR: font %s not found", fname.String())
return nil
}

pdfFont, err := model.NewPdfFontFromPdfObject(fObj)
if err != nil {
common.Log.Debug("ERROR: loading font")
return nil
}
currFont = pdfFont
}

return nil
})

if err = processor.Process(page.Resources); err != nil {
return err
}

tc.replace(searchText, replaceText)
return page.SetContentStreams([]string{ops.String()}, core.NewFlateEncoder())
}