Skip to content

Commit 08b831f

Browse files
author
R. S. Doiel
committed
working proof of concept
1 parent e1aa0c9 commit 08b831f

File tree

7 files changed

+166
-41
lines changed

7 files changed

+166
-41
lines changed

articlefetch.1.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
%articlefetch(1) user manual | version 0.0.0 fd34fb1
1+
%articlefetch(1) user manual | version 0.0.0 e1aa0c9
22
% R. S. Doiel
33
% 2025-10-23
44

articlefetch.go

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ import (
44
"fmt"
55
"io"
66
"os"
7-
"strings"
7+
"path/filepath"
8+
//"strings"
89
"time"
910
)
1011

@@ -17,34 +18,56 @@ func Run(in io.Reader, out io.Writer, eout io.Writer, appName string, hostname s
1718
return 1
1819
}
1920
tot := len(rdmIds)
21+
t0 := time.Now()
22+
iTime := time.Now()
23+
reportProgress := false
2024
retrieved := 0
21-
fmt.Printf("retrieving %d records\n", tot)
22-
pdfToRetrieve := []string{}
25+
waitTime := 5 * time.Second
26+
fmt.Printf("processing %d records\n", tot)
2327
for i, id := range rdmIds {
28+
if i > 0 {
29+
if iTime, reportProgress = CheckWaitInterval(iTime, waitTime); reportProgress {
30+
fmt.Printf("%s | next id %q\n", ProgressETA(t0, i, tot), id)
31+
time.Sleep(waitTime)
32+
}
33+
}
2434
rdmUrl := RdmRecordURL(hostname, id)
25-
src, duration, err := RdmFetchJSON(rdmUrl)
35+
src, err := RdmFetchJSON(rdmUrl)
2636
if err != nil {
2737
fmt.Fprintf(os.Stderr, "%s\n", err)
2838
continue
2939
}
30-
time.Sleep(duration)
3140

3241
pdfUrls, err := RdmPdfURLs(src)
3342
if err != nil {
3443
fmt.Fprintf(os.Stderr, "failed to find a pdfUrls %q, %s\n", rdmUrl, err)
3544
continue
3645
}
3746
if len(pdfUrls) > 0 {
38-
fmt.Printf("DEBUG pdfUrls (%d)\n\t%+v\n", len(pdfUrls), strings.Join(pdfUrls, "\n\t"))
39-
pdfToRetrieve = append(pdfToRetrieve, pdfUrls...)
40-
time.Sleep(10 * time.Second)
47+
// Make a directory for {clpid}/{rdmid}
48+
saveDir := filepath.Join(clpid, id)
49+
if _, err := os.Stat(saveDir); err != nil {
50+
os.MkdirAll(saveDir, 0775)
51+
}
52+
// For each PDF, work out its filename, retrieve it and save it in the RDM record's directory
53+
for i, pdfUrl := range pdfUrls {
54+
fName, err := RdmGetFilenameFromContentURL(pdfUrl)
55+
if err != nil {
56+
fmt.Fprintf(os.Stderr, "failed to extract filename (file no. %d for %s) %s, %s\n", i, id, pdfUrl, err)
57+
}
58+
// Retrieve and write out the PDF to dir
59+
if src, err := RdmRetrieveFile(pdfUrl); err != nil {
60+
fmt.Fprintf(os.Stderr, "failed to retrieve file %s, %s\n", pdfUrl, err)
61+
} else {
62+
fName = filepath.Join(saveDir, fName)
63+
if err := os.WriteFile(fName, src, 0664); err != nil {
64+
fmt.Fprintf(os.Stderr, "failed to write %s, %s\n", fName, err)
65+
}
66+
}
67+
}
4168
}
4269
retrieved += 1
43-
if (i % 5) == 0{
44-
fmt.Printf("%d/%d processed\n", i+1, tot)
45-
}
4670
}
47-
fmt.Printf("%d/%d retrieved\n", retrieved, tot)
48-
fmt.Printf("Retrieve the following URL:\n\n%s\n\n", strings.Join(pdfToRetrieve, "\n\t"))
71+
fmt.Printf("%d of %d records processed\n", retrieved, tot)
4972
return 0
5073
}

installer.ps1

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env pwsh
2-
# generated with CMTools 0.0.0 fd34fb1
2+
# generated with CMTools 0.0.0 e1aa0c9
33

44
#
55
# Set the package name and version to install

installer.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/sh
2-
# generated with CMTools 0.0.0 fd34fb1
2+
# generated with CMTools 0.0.0 e1aa0c9
33

44
#
55
# Set the package name and version to install

progress.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
package articlefetch
2+
3+
import (
4+
"fmt"
5+
"time"
6+
)
7+
8+
// CheckWaitInterval reports whether at least wait has elapsed since iTime.
//
// When the interval has been met or exceeded it returns the current time
// (a fresh reference point for the next interval) and true. Otherwise it
// returns iTime unchanged and false. Callers feed the returned time back
// in on the next call:
//
//	tot := len(records) // total number of items to process
//	t0 := time.Now()
//	iTime := time.Now()
//	reportProgress := false
//
//	for i, key := range records {
//		// ... process key ...
//		if iTime, reportProgress = CheckWaitInterval(iTime, 30*time.Second); reportProgress {
//			log.Printf("%s", ProgressETA(t0, i, tot))
//		}
//	}
func CheckWaitInterval(iTime time.Time, wait time.Duration) (time.Time, bool) {
	if time.Since(iTime) >= wait {
		// Interval is over: hand back a new starting point.
		return time.Now(), true
	}
	return iTime, false
}
33+
34+
// ProgressETA returns a string with the percentage processed and the
// estimated time remaining.
//
// t0 is the time processing started, i is the number of items processed so
// far and tot is the total number of items. The estimate extrapolates the
// average time per processed item over the whole run, rounded to the second.
// When no estimate can be made (i or tot is zero or negative) the result is
// "0.00% ETA unknown".
//
//	tot := len(records) // total number of items to process
//	t0 := time.Now()
//	iTime := time.Now()
//	reportProgress := false
//
//	for i, key := range records {
//		// ... process key ...
//		if iTime, reportProgress = CheckWaitInterval(iTime, 30*time.Second); reportProgress {
//			log.Printf("%s", ProgressETA(t0, i, tot))
//		}
//	}
func ProgressETA(t0 time.Time, i int, tot int) string {
	// Guard both divisors: i is divided into the elapsed time and tot into i.
	if i <= 0 || tot <= 0 {
		return fmt.Sprintf("%.2f%% ETA unknown", 0.0)
	}

	percent := (float64(i) / float64(tot)) * 100.0
	elapsed := float64(time.Since(t0))
	// Projected total run time minus the time already spent.
	eta := time.Duration(float64(tot)*(elapsed/float64(i)) - elapsed)
	if eta < 0 {
		// i ran past tot; never report a negative time remaining.
		eta = 0
	}
	return fmt.Sprintf("%.2f%% ETA %v", percent, eta.Round(time.Second))
}
61+
62+
// ProgressIPS returns a string with the count, the elapsed time and the
// number of increments per second.
//
// t0 is the time processing started and i is the number of increments so
// far. timeUnit only controls how the elapsed time is rounded for display;
// the rate itself is always expressed per second. When no rate can be
// computed (i is zero or no time has elapsed) the rate is reported as
// unknown.
//
//	t0 := time.Now()
//	iTime := time.Now()
//	reportProgress := false
//
//	for i, key := range records {
//		// ... process key ...
//		if iTime, reportProgress = CheckWaitInterval(iTime, 30*time.Second); reportProgress || i == 0 {
//			log.Printf("%s", ProgressIPS(t0, i, time.Second))
//		}
//	}
func ProgressIPS(t0 time.Time, i int, timeUnit time.Duration) string {
	// Sample the clock once so the printed duration and the rate agree.
	elapsed := time.Since(t0)
	if i == 0 || elapsed <= 0 {
		return fmt.Sprintf("(%d/%s) IPS unknown", i, elapsed.Round(timeUnit))
	}
	ips := float64(i) / elapsed.Seconds()
	return fmt.Sprintf("(%d/%s) IPS %.2f i/sec.", i, elapsed.Round(timeUnit), ips)
}

query.go

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@ import (
55
"fmt"
66
"io"
77
"net/http"
8-
"os"
9-
"strconv"
8+
"net/url"
9+
"path"
1010
"strings"
11-
"time"
1211
)
1312

1413
// FeedsURL takes a set of query terms and returns the url with query
@@ -63,48 +62,67 @@ func RdmRecordURL(hostname string, id string) string {
6362
return fmt.Sprintf("https://%s/api/records/%s/files", hostname, id)
6463
}
6564

66-
// RdmFetchJSON performs an HTTP GET on u and returns the response body.
//
// It returns an error when the request fails, when the response status is
// not 200 OK, or when the body cannot be read. The status check keeps an
// error page (e.g. not found or rate limited) from being handed to callers
// as if it were record JSON.
func RdmFetchJSON(u string) ([]byte, error) {
	res, err := http.Get(u)
	if err != nil {
		return nil, err
	}
	// Close on every path, including a failed read.
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to retrieve %s, %s", u, res.Status)
	}
	body, err := io.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
8977

9078
func RdmPdfURLs(src []byte) ([]string, error) {
91-
//fmt.Printf("DEBUG obj retrieved -> %s\n", src)
9279
contentUrls := []string{}
9380
obj := map[string]interface{}{}
9481
if err := JSONUnmarshal(src, &obj); err != nil {
9582
return nil, fmt.Errorf("failed to unmarhsal object %s\n", err)
9683
}
97-
if entries, ok := obj["entries"].([]map[string]interface{}); ok {
98-
for _, entry := range entries {
84+
// DEBUG
85+
//src, _ = JSONMarshalIndent(obj, "", " ") // DEBUG
86+
//fmt.Printf("DEBUG obj retrieved -> %s\n", src) // DEBUG
87+
if entries, ok := obj["entries"].([]interface{}); ok {
88+
//fmt.Printf("DEBUG entries (%T): %+v\n", entries)
89+
for _, val := range entries {
90+
entry := val.(map[string]interface{})
91+
//src, _ = JSONMarshalIndent(entry, "", " ") // DEBUG
92+
//fmt.Printf("DEBUG obj entries -> %s\n", src) // DEBUG
9993
if mimetype, ok := entry["mimetype"].(string); ok && mimetype == "application/pdf" {
100-
if links, ok := entry["links"].(map[string]string); ok {
101-
if contentUrl, ok := links["content"]; ok {
102-
fmt.Printf("DEBUG contentUrl: %s\n", contentUrl)
94+
//fmt.Printf("DEBUG entry.mimetype -> %s\n", mimetype) // DEBUG
95+
if links, ok := entry["links"].(map[string]interface{}); ok {
96+
//src, _ = JSONMarshalIndent(links, "", " ") // DEBUG
97+
//fmt.Printf("DEBUG links %s\n", src) // DEBUG
98+
if contentUrl, ok := links["content"].(string); ok {
99+
//fmt.Printf("DEBUG contentUrl: %s\n", contentUrl)
103100
contentUrls = append(contentUrls, contentUrl)
104101
}
105102
}
106103
}
107104
}
108105
}
109106
return contentUrls, nil
107+
}
108+
109+
// RdmGetFilenameFromContentURL derives a filename from a file content URL.
//
// It parses s, strips a trailing "/content" from the URL path and returns
// the last remaining path element. An error is returned when s cannot be
// parsed or when no usable filename remains ("", ".", ".." or "/"); those
// values would make a caller that joins the result onto a directory write
// to the directory itself or to its parent.
func RdmGetFilenameFromContentURL(s string) (string, error) {
	u, err := url.Parse(s)
	if err != nil {
		return "", err
	}
	name := path.Base(strings.TrimSuffix(u.Path, "/content"))
	switch name {
	case "", ".", "..", "/":
		return "", fmt.Errorf("no usable filename in content URL %q", s)
	}
	return name, nil
}
116+
117+
// RdmRetrieveFile performs an HTTP GET on u and returns the response body.
//
// It returns an error when the request fails, when the response status is
// not 200 OK, or when the body cannot be read. Without the status check an
// error page would be returned to the caller and saved as if it were the
// requested file.
func RdmRetrieveFile(u string) ([]byte, error) {
	res, err := http.Get(u)
	if err != nil {
		return nil, err
	}
	// Close on every path, including a failed read.
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to retrieve %s, %s", u, res.Status)
	}
	body, err := io.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ const (
1212
ReleaseDate = "2025-10-23"
1313

1414
// ReleaseHash, the Git hash when version.go was generated
15-
ReleaseHash = "fd34fb1"
15+
ReleaseHash = "e1aa0c9"
1616
LicenseText = `
1717
1818
Copyright (c) 2025, Caltech

0 commit comments

Comments
 (0)