Skip to content

Commit 713a855

Browse files
authored
Merge pull request #105 from UTDNebula/academic-cal-fixes
Academic cal fixes
2 parents 1ef1b8d + 771bcd8 commit 713a855

File tree

5 files changed

+46
-31
lines changed

5 files changed

+46
-31
lines changed

Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ RUN apt-get update && apt-get install -y chromium
2323
ENV CHROMIUM_BIN=/usr/bin/chromium
2424
ENV GOOGLE_CHROME_BIN=/usr/bin/chromium
2525

26+
# Install poppler-utils (provides pdftotext) for academic calendar parsing
27+
RUN apt-get install -y poppler-utils
28+
2629
# Copy build file from builder
2730
COPY --from=builder /app/api-tools /app/api-tools
2831

@@ -32,5 +35,5 @@ RUN chmod +x /app/runners/setup.sh
3235
ENTRYPOINT ["/app/runners/setup.sh"]
3336

3437
# Optional .env copy for development
35-
FROM base AS dev
38+
FROM base AS local
3639
COPY .env /app/.env

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ Docker is used for automated running on Google Cloud Platform. More info [here](
9898

9999
To build the container for local testing, first make sure all scripts in the `runners` folder have LF line endings, then run:
100100
```
101-
docker build --target dev -t my-runner:local .
102-
docker run --rm -e ENVIRONMENT=dev -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local
101+
docker build --target local -t my-runner:local .
102+
docker run --rm -e ENVIRONMENT=local -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local
103103
```
104104

105105
## Questions?

go.mod

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ require (
99
github.com/chromedp/chromedp v0.12.1
1010
github.com/google/go-cmp v0.7.0
1111
github.com/joho/godotenv v1.5.1
12-
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
1312
github.com/valyala/fastjson v1.6.4
1413
go.mongodb.org/mongo-driver v1.17.3
1514
golang.org/x/net v0.43.0

parser/academicCalendars.go

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
/*
2+
This code requires pdftotext to be installed: https://www.xpdfreader.com/pdftotext-man.html
3+
apt-get install -y poppler-utils
4+
I found that the Go libraries for PDF text extraction were all either paid, had a
5+
complicated installation process, or errored on one of the PDFs.
6+
*/
7+
18
package parser
29

310
import (
@@ -12,13 +19,14 @@ import (
1219
"log"
1320
"net/http"
1421
"os"
22+
"os/exec"
1523
"path/filepath"
1624
"strings"
1725
"sync"
26+
"time"
1827

1928
"github.com/UTDNebula/api-tools/utils"
2029
"github.com/UTDNebula/nebula-api/api/schema"
21-
"github.com/ledongthuc/pdf"
2230
"google.golang.org/genai"
2331
)
2432

@@ -73,14 +81,15 @@ PDF Content:
7381

7482
func ParseAcademicCalendars(inDir string, outDir string) {
7583
// Get sub folder from input folder
76-
outSubDir := filepath.Join(outDir, "academicCalendars")
84+
inSubDir := filepath.Join(inDir, "academicCalendars")
7785

7886
result := []schema.AcademicCalendar{}
7987

8088
// Parallel requests
8189
numWorkers := 10
8290
jobs := make(chan string)
8391
var wg sync.WaitGroup
92+
var mu sync.Mutex
8493

8594
// Start worker goroutines
8695
for range numWorkers {
@@ -92,16 +101,33 @@ func ParseAcademicCalendars(inDir string, outDir string) {
92101

93102
academicCalendar, err := parsePdf(path)
94103
if err != nil {
95-
panic(err)
104+
if strings.Contains(err.Error(), "429") {
105+
// Linearly increasing backoff (20s, 40s, 60s) for 429 rate limiting
106+
backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second}
107+
for _, delay := range backoffs {
108+
time.Sleep(delay)
109+
academicCalendar, err = parsePdf(path)
110+
if err == nil || !strings.Contains(err.Error(), "429") {
111+
break
112+
}
113+
}
114+
}
115+
116+
if err != nil {
117+
panic(err)
118+
}
96119
}
120+
121+
mu.Lock()
97122
result = append(result, academicCalendar)
123+
mu.Unlock()
98124

99125
log.Printf("Parsed %s!", filepath.Base(path))
100126
}
101127
}()
102128
}
103129

104-
err := filepath.WalkDir(outSubDir, func(path string, d fs.DirEntry, err error) error {
130+
err := filepath.WalkDir(inSubDir, func(path string, d fs.DirEntry, err error) error {
105131
if err != nil {
106132
return err
107133
}
@@ -196,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
196222
}
197223

198224
// Read the text from the first page of a PDF
225+
// Uses the external pdftotext program: "-l 1" stops after page 1, "-raw" keeps content stream order, "-" writes to stdout
199226
func readPdf(path string) (string, error) {
200-
// Open the PDF
201-
f, r, err := pdf.Open(path)
202-
if err != nil {
203-
return "", err
204-
}
205-
defer f.Close()
206-
207-
// Make sure at least one page exists
208-
if r.NumPage() < 1 {
209-
return "", fmt.Errorf("no pages in PDF")
210-
}
227+
cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")
211228

212-
// Get the first page
213-
page := r.Page(1) // pages are 1-indexed
214-
if page.V.IsNull() {
215-
return "", fmt.Errorf("failed to read page 1")
216-
}
229+
var out bytes.Buffer
230+
var stderr bytes.Buffer
231+
cmd.Stdout = &out
232+
cmd.Stderr = &stderr
217233

218-
// Read text
219-
var buf bytes.Buffer
220-
text := page.Content().Text
221-
for _, t := range text {
222-
buf.WriteString(t.S) // S is the actual string
234+
if err := cmd.Run(); err != nil {
235+
return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String())
223236
}
224237

225-
return buf.String(), nil
238+
return out.String(), nil
226239
}
227240

228241
// Check cache for a response to the same prompt

runners/setup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
22

3-
if [ "$ENVIRONMENT" = "prod" ]; then
3+
if [ "$ENVIRONMENT" = "gcp" ]; then
44
# auth with service account
55
gcloud secrets versions access latest --secret="$SERVICE_ACCOUNT_SECRET_NAME" > service_account.json
66
gcloud auth activate-service-account --key-file=service_account.json

0 commit comments

Comments
 (0)