1+ /*
2+ Code requires having pdftotext installed: https://www.xpdfreader.com/pdftotext-man.html
3+ apt-get install -y poppler-utils
4+ I found that the Go libraries for PDF text extraction were all either paid, had a
5+ complicated installation process, or errored on at least one of the PDFs.
6+ */
7+
18package parser
29
310import (
@@ -12,13 +19,14 @@ import (
1219 "log"
1320 "net/http"
1421 "os"
22+ "os/exec"
1523 "path/filepath"
1624 "strings"
1725 "sync"
26+ "time"
1827
1928 "github.com/UTDNebula/api-tools/utils"
2029 "github.com/UTDNebula/nebula-api/api/schema"
21- "github.com/ledongthuc/pdf"
2230 "google.golang.org/genai"
2331)
2432
@@ -73,14 +81,15 @@ PDF Content:
7381
7482func ParseAcademicCalendars (inDir string , outDir string ) {
7583 // Get sub folder from output folder
76- outSubDir := filepath .Join (outDir , "academicCalendars" )
84+ inSubDir := filepath .Join (inDir , "academicCalendars" )
7785
7886 result := []schema.AcademicCalendar {}
7987
8088 // Parallel requests
8189 numWorkers := 10
8290 jobs := make (chan string )
8391 var wg sync.WaitGroup
92+ var mu sync.Mutex
8493
8594 // Start worker goroutines
8695 for range numWorkers {
@@ -92,16 +101,33 @@ func ParseAcademicCalendars(inDir string, outDir string) {
92101
93102 academicCalendar , err := parsePdf (path )
94103 if err != nil {
95- panic (err )
104+ if strings .Contains (err .Error (), "429" ) {
105+ // Exponential-ish backoff up to 60s for 429 rate limiting
106+ backoffs := []time.Duration {20 * time .Second , 40 * time .Second , 60 * time .Second }
107+ for _ , delay := range backoffs {
108+ time .Sleep (delay )
109+ academicCalendar , err = parsePdf (path )
110+ if err == nil || ! strings .Contains (err .Error (), "429" ) {
111+ break
112+ }
113+ }
114+ }
115+
116+ if err != nil {
117+ panic (err )
118+ }
96119 }
120+
121+ mu .Lock ()
97122 result = append (result , academicCalendar )
123+ mu .Unlock ()
98124
99125 log .Printf ("Parsed %s!" , filepath .Base (path ))
100126 }
101127 }()
102128 }
103129
104- err := filepath .WalkDir (outSubDir , func (path string , d fs.DirEntry , err error ) error {
130+ err := filepath .WalkDir (inSubDir , func (path string , d fs.DirEntry , err error ) error {
105131 if err != nil {
106132 return err
107133 }
@@ -196,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
196222}
197223
// readPdf returns the text of the first page of the PDF at path.
//
// Extraction is delegated to the external pdftotext program (poppler-utils),
// which must be installed and on PATH. On failure the returned text is empty
// and the error wraps the underlying exec error, so callers can use
// errors.Is / errors.As to tell a missing binary (*exec.Error) from a
// non-zero exit (*exec.ExitError). Anything pdftotext wrote to stderr is
// appended to the message.
func readPdf(path string) (string, error) {
	// -l 1 stops after the first page, -raw keeps the text in content-stream
	// order, and the trailing "-" sends the output to stdout instead of a file.
	cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Trim stderr so the message does not end in a stray newline.
		return "", fmt.Errorf("failed to run pdftotext: %w (%s)", err, strings.TrimSpace(stderr.String()))
	}

	return stdout.String(), nil
}
227240
228241// Check cache for a response to the same prompt
0 commit comments