Skip to content

Commit d05dbc7

Browse files
committed
More comments
1 parent 0e14c01 commit d05dbc7

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

parser/academicCalendars.go

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
var once sync.Once
2727
var geminiClient *genai.Client
2828

29+
// What gets sent to Gemini, with the PDF content added
2930
var prompt = `Parse this PDF content and generate the following JSON schema.
3031
3132
{
@@ -78,12 +79,11 @@ func ParseAcademicCalendars(inDir string, outDir string) {
7879

7980
// Parallel requests
8081
numWorkers := 10
81-
8282
jobs := make(chan string)
8383
var wg sync.WaitGroup
8484

8585
// Start worker goroutines
86-
for i := 0; i < numWorkers; i++ {
86+
for range numWorkers {
8787
wg.Add(1)
8888
go func() {
8989
defer wg.Done()
@@ -122,8 +122,9 @@ func ParseAcademicCalendars(inDir string, outDir string) {
122122
utils.WriteJSON(fmt.Sprintf("%s/academicCalendars.json", outDir), result)
123123
}
124124

125+
// Read a PDF and build a Gemini prompt from it; reuse the cached response for that prompt if one exists, otherwise ask Gemini and cache the answer
125126
func parsePdf(path string) (schema.AcademicCalendar, error) {
126-
// Fall 2025 to 25F
127+
// "Fall 2025" to "25F"
127128
filename := filepath.Base(path)
128129
filename = filename[0 : len(filename)-4]
129130
filenameParts := strings.Split(filename, "-")
@@ -147,20 +148,24 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
147148
promptFilled := fmt.Sprintf(prompt, name, timeline, content)
148149

149150
// Check cache
150-
hash := sha256.Sum256([]byte(promptFilled))
151-
key := hex.EncodeToString(hash[:]) + ".json"
152-
result, err := checkCache(key)
151+
hashByte := sha256.Sum256([]byte(promptFilled))
152+
hash := hex.EncodeToString(hashByte[:]) + ".json"
153+
result, err := checkCache(hash)
153154
if err != nil {
154155
return schema.AcademicCalendar{}, err
155156
}
157+
158+
// Skip AI if cache found
156159
if result != "" {
157160
log.Printf("Cache found for %s!", filename)
158161
} else {
162+
// Cache not found
159163
log.Printf("No cache for %s, asking Gemini.", filename)
164+
160165
// AI
161166
geminiClient := getGeminiClient()
162167

163-
// Send with default config
168+
// Send request with default config
164169
response, err := geminiClient.Models.GenerateContent(context.Background(),
165170
"gemini-2.5-pro",
166171
genai.Text(promptFilled),
@@ -170,11 +175,11 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
170175
return schema.AcademicCalendar{}, err
171176
}
172177

173-
// Get response, remove backtick formatting
178+
// Get response, remove backtick formatting if present
174179
result = strings.ReplaceAll(strings.ReplaceAll(response.Candidates[0].Content.Parts[0].Text, "```json", ""), "```", "")
175180

176-
// Set cache
177-
err = setCache(key, result)
181+
// Set cache for next time
182+
err = setCache(hash, result)
178183
if err != nil {
179184
return schema.AcademicCalendar{}, err
180185
}
@@ -190,6 +195,7 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
190195
return academicCalendar, nil
191196
}
192197

198+
// Read the text from the first page of a PDF
193199
func readPdf(path string) (string, error) {
194200
// Open the PDF
195201
f, r, err := pdf.Open(path)
@@ -219,6 +225,7 @@ func readPdf(path string) (string, error) {
219225
return buf.String(), nil
220226
}
221227

228+
// Check cache for a response to the same prompt
222229
func checkCache(hash string) (string, error) {
223230
apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
224231
if err != nil {
@@ -272,7 +279,8 @@ func checkCache(hash string) (string, error) {
272279
return string(body), nil
273280
}
274281

275-
func setCache(key string, result string) error {
282+
// Upload AI response to cache
283+
func setCache(hash string, result string) error {
276284
apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
277285
if err != nil {
278286
return err
@@ -281,7 +289,7 @@ func setCache(key string, result string) error {
281289
// Make request
282290
jsonStr := []byte(result)
283291
bodyReader := bytes.NewBuffer(jsonStr)
284-
req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+key, bodyReader)
292+
req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+hash, bodyReader)
285293
if err != nil {
286294
return err
287295
}
@@ -298,6 +306,7 @@ func setCache(key string, result string) error {
298306
return nil
299307
}
300308

309+
// Get the URL, bucket, and keys needed to access the Nebula API storage routes
301310
func getNebulaKeys() (string, string, string, string, error) {
302311
apiUrl, err := utils.GetEnv("NEBULA_API_URL")
303312
if err != nil {
@@ -320,7 +329,7 @@ func getNebulaKeys() (string, string, string, string, error) {
320329
}
321330

322331
// Create client only once
323-
// Auth is from GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS environment variables and service account JSON
332+
// Auth comes from the GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT, and GOOGLE_APPLICATION_CREDENTIALS environment variables, plus a service account JSON file that is created from GEMINI_SERVICE_ACCOUNT
324333
func getGeminiClient() *genai.Client {
325334
once.Do(func() {
326335
// Create JSON file

0 commit comments

Comments
 (0)