Skip to content

Commit 4a59203

Browse files
committed
Improve library
1 parent a6dfec7 commit 4a59203

File tree

9 files changed

+160
-40
lines changed

9 files changed

+160
-40
lines changed

.github/workflows/ci.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Continuous integration: download modules, run the tests, and build the
# packages on every push and pull request.
name: Go CI

on:
  push:
    # "**" matches every branch name, including names that contain "/".
    # A single "*" does not match the "/" character, so branches such as
    # "feature/x" would be skipped.
    branches: [ "**" ]
  pull_request:
    branches: [ "**" ]

jobs:
  build-and-test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'
          cache: true

      - name: Install dependencies
        run: go mod download

      - name: Run tests
        run: go test -v ./...

      - name: Build
        run: go build -v ./...

README.md

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ Features
1212

1313
`go get -u github.com/ledongthuc/pdf`
1414

15+
## Examples:
16+
17+
- See the `examples/` folder for runnable sample programs.
18+
1519

1620
## Read plain text
1721

@@ -27,60 +31,56 @@ import (
2731

2832
func main() {
2933
pdf.DebugOn = true
30-
content, err := readPdf("test.pdf") // Read local pdf file
34+
35+
f, r, err := pdf.Open("./pdf_test.pdf")
3136
if err != nil {
3237
panic(err)
3338
}
34-
fmt.Println(content)
35-
return
36-
}
39+
defer f.Close()
3740

38-
func readPdf(path string) (string, error) {
39-
f, r, err := pdf.Open(path)
40-
// remember close file
41-
defer f.Close()
41+
var buf bytes.Buffer
42+
b, err := r.GetPlainText()
4243
if err != nil {
43-
return "", err
44+
panic(err)
4445
}
45-
var buf bytes.Buffer
46-
b, err := r.GetPlainText()
47-
if err != nil {
48-
return "", err
49-
}
50-
buf.ReadFrom(b)
51-
return buf.String(), nil
46+
buf.ReadFrom(b)
47+
content := buf.String()
48+
fmt.Println(content)
5249
}
5350
```
5451

5552
## Read all text with styles from PDF
5653

5754
```golang
58-
func readPdf2(path string) (string, error) {
59-
f, r, err := pdf.Open(path)
60-
// remember close file
55+
package main
56+
57+
import (
58+
"fmt"
59+
60+
"github.com/ledongthuc/pdf"
61+
)
62+
63+
func main() {
64+
f, r, err := pdf.Open("./pdf_test.pdf")
65+
if err != nil {
66+
panic(err)
67+
}
6168
defer f.Close()
69+
70+
sentences, err := r.GetStyledTexts()
6271
if err != nil {
63-
return "", err
72+
panic(err)
6473
}
65-
totalPage := r.NumPage()
6674

67-
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
68-
p := r.Page(pageIndex)
69-
if p.V.IsNull() {
70-
continue
71-
}
72-
var lastTextStyle pdf.Text
73-
texts := p.Content().Text
74-
for _, text := range texts {
75-
if isSameSentence(text, lastTextStyle) {
76-
lastTextStyle.S = lastTextStyle.S + text.S
77-
} else {
78-
fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
79-
lastTextStyle = text
80-
}
81-
}
75+
// Print all sentences
76+
for _, sentence := range sentences {
77+
fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n",
78+
sentence.Font,
79+
sentence.FontSize,
80+
sentence.X,
81+
sentence.Y,
82+
sentence.S)
8283
}
83-
return "", nil
8484
}
8585
```
8686

examples/read_plain_text/main.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package main
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
7+
"github.com/ledongthuc/pdf"
8+
)
9+
10+
func main() {
11+
pdf.DebugOn = true
12+
13+
f, r, err := pdf.Open("./pdf_test.pdf")
14+
if err != nil {
15+
panic(err)
16+
}
17+
defer f.Close()
18+
19+
var buf bytes.Buffer
20+
b, err := r.GetPlainText()
21+
if err != nil {
22+
panic(err)
23+
}
24+
buf.ReadFrom(b)
25+
content := buf.String()
26+
fmt.Println(content)
27+
}
40.3 KB
Binary file not shown.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/ledongthuc/pdf"
7+
)
8+
9+
func main() {
10+
f, r, err := pdf.Open("./pdf_test.pdf")
11+
if err != nil {
12+
panic(err)
13+
}
14+
defer f.Close()
15+
16+
sentences, err := r.GetStyledTexts()
17+
if err != nil {
18+
panic(err)
19+
}
20+
21+
// Print all sentences
22+
for _, sentence := range sentences {
23+
fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n",
24+
sentence.Font,
25+
sentence.FontSize,
26+
sentence.X,
27+
sentence.Y,
28+
sentence.S)
29+
}
30+
}
40.3 KB
Binary file not shown.

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module github.com/ledongthuc/pdf
2+
3+
go 1.24.1

page.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,37 @@ func (r *Reader) GetPlainText() (reader io.Reader, err error) {
8282
return &buf, nil
8383
}
8484

85+
// GetStyledTexts returns list all sentences in an array, that are included styles
86+
func (r *Reader) GetStyledTexts() (sentences []Text, err error) {
87+
totalPage := r.NumPage()
88+
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
89+
p := r.Page(pageIndex)
90+
if p.V.IsNull() {
91+
continue
92+
}
93+
var lastTextStyle Text
94+
texts := p.Content().Text
95+
for _, text := range texts {
96+
if lastTextStyle == (Text{}) {
97+
lastTextStyle = text
98+
continue
99+
}
100+
101+
if IsSameSentence(lastTextStyle, text) {
102+
lastTextStyle.S = lastTextStyle.S + text.S
103+
} else {
104+
sentences = append(sentences, lastTextStyle)
105+
lastTextStyle = text
106+
}
107+
}
108+
if len(lastTextStyle.S) > 0 {
109+
sentences = append(sentences, lastTextStyle)
110+
}
111+
}
112+
113+
return sentences, err
114+
}
115+
85116
func (p Page) findInherited(key string) Value {
86117
for v := p.V; !v.IsNull(); v = v.Key("Parent") {
87118
if r := v.Key(key); !r.IsNull() {
@@ -524,6 +555,8 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) {
524555

525556
switch op {
526557
default:
558+
// Uncomment the line below to print operators that no case handles.
559+
// fmt.Println("<DEBUG><op>", op, "</op><args>", args, "</args>")
527560
return
528561
case "T*": // move to start of next line
529562
showText("\n")

text.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package pdf
66

77
import (
88
"math"
9-
"strings"
109
"unicode"
1110
"unicode/utf16"
1211
)
@@ -162,10 +161,9 @@ var macRomanEncoding = [256]rune{
162161
// IsSameSentence reports whether the current text segment likely belongs to the same sentence
163162
// as the last text segment: same font, font size within 0.1, Y position within 5, and a
164163
// non-empty last segment.
165-
func isSameSentence(last, current Text) bool {
164+
func IsSameSentence(last, current Text) bool {
166165
return last.Font == current.Font &&
167166
math.Abs(last.FontSize-current.FontSize) < 0.1 &&
168167
math.Abs(last.Y-current.Y) < 5 &&
169-
!strings.ContainsAny(last.S, ".!?") &&
170168
last.S != ""
171169
}

0 commit comments

Comments
 (0)