1+ /*
2+ Code requires having pdftotext installed: https://www.xpdfreader.com/pdftotext-man.html
3+ apt-get install -y poppler-utils
4+ I found that the Go libraries for PDF text extraction were all either paid, had a
5+ complicated installation process, or errored on one of the PDFs.
6+ */
7+
18package parser
29
310import (
@@ -12,13 +19,14 @@ import (
1219 "log"
1320 "net/http"
1421 "os"
22+ "os/exec"
1523 "path/filepath"
1624 "strings"
1725 "sync"
26+ "time"
1827
1928 "github.com/UTDNebula/api-tools/utils"
2029 "github.com/UTDNebula/nebula-api/api/schema"
21- "github.com/ledongthuc/pdf"
2230 "google.golang.org/genai"
2331)
2432
@@ -93,7 +101,21 @@ func ParseAcademicCalendars(inDir string, outDir string) {
93101
94102 academicCalendar , err := parsePdf (path )
95103 if err != nil {
96- panic (err )
104+ if strings .Contains (err .Error (), "429" ) {
105+ // Exponential-ish backoff up to 60s for 429 rate limiting
106+ backoffs := []time.Duration {20 * time .Second , 40 * time .Second , 60 * time .Second }
107+ for _ , delay := range backoffs {
108+ time .Sleep (delay )
109+ academicCalendar , err = parsePdf (path )
110+ if err == nil || ! strings .Contains (err .Error (), "429" ) {
111+ break
112+ }
113+ }
114+ }
115+
116+ if err != nil {
117+ panic (err )
118+ }
97119 }
98120
99121 mu .Lock ()
@@ -200,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
200222}
201223
// readPdf returns the text of the first page of the PDF at path.
//
// Extraction is delegated to the external pdftotext program, which must be
// installed and reachable via PATH. It is invoked with "-l 1" so that page 1
// is the last page converted, "-raw" to request pdftotext's raw output mode,
// and "-" as the output file so the text is written to standard output.
//
// On failure the returned string is empty and the error wraps the underlying
// error from running the command (so callers can inspect it with errors.Is or
// errors.As) together with whatever pdftotext wrote to standard error.
func readPdf(path string) (string, error) {
	cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")

	// Capture both streams: stdout carries the extracted text, stderr the
	// diagnostics that are reported if the command fails.
	var out, stderr bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// %w keeps the exec error in the chain; trimming stderr drops the
		// trailing newline so the message stays on one line.
		return "", fmt.Errorf("failed to run pdftotext: %w (%s)", err, strings.TrimSpace(stderr.String()))
	}

	return out.String(), nil
}
231240
232241// Check cache for a response to the same prompt
0 commit comments