Skip to content

Commit bbb7d75

Browse files
committed
Switch to using external tool pdftotext
1 parent ce45744 commit bbb7d75

File tree

3 files changed

+36
-25
lines changed

3 files changed

+36
-25
lines changed

Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ RUN apt-get update && apt-get install -y chromium
2323
ENV CHROMIUM_BIN=/usr/bin/chromium
2424
ENV GOOGLE_CHROME_BIN=/usr/bin/chromium
2525

26+
# Install poppler-utils, which provides pdftotext, used for academic calendar parsing
27+
RUN apt-get install -y poppler-utils
28+
2629
# Copy build file from builder
2730
COPY --from=builder /app/api-tools /app/api-tools
2831

go.mod

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ require (
99
github.com/chromedp/chromedp v0.12.1
1010
github.com/google/go-cmp v0.7.0
1111
github.com/joho/godotenv v1.5.1
12-
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
1312
github.com/valyala/fastjson v1.6.4
1413
go.mongodb.org/mongo-driver v1.17.3
1514
golang.org/x/net v0.43.0

parser/academicCalendars.go

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
/*
2+
This code requires the pdftotext command-line tool to be installed: https://www.xpdfreader.com/pdftotext-man.html
3+
apt-get install -y poppler-utils
4+
I found that the Go libraries for PDF text extraction were all either paid, had a
5+
complicated installation process, or errored on one of the PDFs.
6+
*/
7+
18
package parser
29

310
import (
@@ -12,13 +19,14 @@ import (
1219
"log"
1320
"net/http"
1421
"os"
22+
"os/exec"
1523
"path/filepath"
1624
"strings"
1725
"sync"
26+
"time"
1827

1928
"github.com/UTDNebula/api-tools/utils"
2029
"github.com/UTDNebula/nebula-api/api/schema"
21-
"github.com/ledongthuc/pdf"
2230
"google.golang.org/genai"
2331
)
2432

@@ -93,7 +101,21 @@ func ParseAcademicCalendars(inDir string, outDir string) {
93101

94102
academicCalendar, err := parsePdf(path)
95103
if err != nil {
96-
panic(err)
104+
if strings.Contains(err.Error(), "429") {
105+
// Linearly increasing backoff (20s, 40s, 60s) for 429 rate limiting
106+
backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second}
107+
for _, delay := range backoffs {
108+
time.Sleep(delay)
109+
academicCalendar, err = parsePdf(path)
110+
if err == nil || !strings.Contains(err.Error(), "429") {
111+
break
112+
}
113+
}
114+
}
115+
116+
if err != nil {
117+
panic(err)
118+
}
97119
}
98120

99121
mu.Lock()
@@ -200,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
200222
}
201223

202224
// Read the text from the first page of a PDF
225+
// by running the external program pdftotext.
203226
func readPdf(path string) (string, error) {
204-
// Open the PDF
205-
f, r, err := pdf.Open(path)
206-
if err != nil {
207-
return "", err
208-
}
209-
defer f.Close()
227+
cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")
210228

211-
// Make sure at least one page exists
212-
if r.NumPage() < 1 {
213-
return "", fmt.Errorf("no pages in PDF")
214-
}
215-
216-
// Get the first page
217-
page := r.Page(1) // pages are 1-indexed
218-
if page.V.IsNull() {
219-
return "", fmt.Errorf("failed to read page 1")
220-
}
229+
var out bytes.Buffer
230+
var stderr bytes.Buffer
231+
cmd.Stdout = &out
232+
cmd.Stderr = &stderr
221233

222-
// Read text
223-
var buf bytes.Buffer
224-
text := page.Content().Text
225-
for _, t := range text {
226-
buf.WriteString(t.S) // S is the actual string
234+
if err := cmd.Run(); err != nil {
235+
return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String())
227236
}
228237

229-
return buf.String(), nil
238+
return out.String(), nil
230239
}
231240

232241
// Check cache for a response to the same prompt

0 commit comments

Comments
 (0)