// Copyright 2025 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

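// This program queries GitHub code search for SQL files that look like
// pg_dump output and downloads up to downloadCount of them into the ../sql
// directory next to this source file, keeping at most one file per repository.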
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"time"
)

const (
	// query is the GitHub code search query: SQL files that mention pg_dump.
	query = `extension:sql pg_dump`
	// downloadCount is the maximum number of dump files to save locally.
	downloadCount = 110
)

// RepoName simply contains the name of the repository.
type RepoName struct {
	FullName string `json:"full_name"`
}

// Item is a SQL file (hopefully) containing a pg_dump.
type Item struct {
	Name        string   `json:"name"`
	Path        string   `json:"path"`
	HTMLURL     string   `json:"html_url"`
	ContentsURL string   `json:"url"`
	Repository  RepoName `json:"repository"`
}

// CodeSearchResult contains the result of a code search.
type CodeSearchResult struct {
	TotalCount        int    `json:"total_count"`
	IncompleteResults bool   `json:"incomplete_results"`
	Items             []Item `json:"items"`
	Message           string `json:"message"` // Only used when there's an error
}

// ContentFile is all of the information about a SQL file, including how to retrieve it.
type ContentFile struct {
	Type        string `json:"type"`
	Name        string `json:"name"`
	Path        string `json:"path"`
	SHA         string `json:"sha"`
	Size        int64  `json:"size"`
	HTMLURL     string `json:"html_url"`
	DownloadURL string `json:"download_url"`
}

func main() {
	ctx := context.Background()
	httpClient := &http.Client{Timeout: 30 * time.Second}
	token := os.Getenv("GITHUB_TOKEN")
	if len(token) == 0 {
		fmt.Println("Must provide a GITHUB_TOKEN as an environment variable")
		os.Exit(1)
	}

	// Resolve the output directory relative to this source file (rather than
	// the working directory) so the dumps always land in ../sql.
	_, currentFileLocation, _, ok := runtime.Caller(0)
	if !ok {
		fmt.Println("Unable to find the folder where this file is located")
		os.Exit(1)
	}
	dumpsFolder := filepath.Clean(filepath.Join(filepath.Dir(currentFileLocation), "../sql"))

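	// saved counts the dump files written so far; page walks GitHub's
	// paginated search results until downloadCount files have been saved or
	// the results run out.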
	var saved int
	page := 1

OuterLoop:
	for {
		remaining := downloadCount - saved
		// Request at most 50 results per page, and never more than we still need.
		items, err := SearchCode(ctx, httpClient, token, page, min(50, remaining))
		if err != nil {
			fmt.Println(err)
			os.Exit(1)
		}
		if len(items) == 0 {
			break
		}

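		// For each search hit, look up the file's metadata and then download
		// the raw contents, skipping anything that is not a plain file.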
		for _, item := range items {
			cf, err := GetContent(ctx, httpClient, token, item.ContentsURL)
			if err != nil {
				fmt.Printf("warn: %s/%s: %v\n", item.Repository.FullName, item.Path, err)
				continue
			}
			if cf.Type != "file" || cf.DownloadURL == "" {
				continue
			}

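			// Name the local file after the repository so at most one dump is
			// kept per repository; skip repositories we have already saved.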
			dest := filepath.Join(dumpsFolder, SanitizePath(item.Repository.FullName)+filepath.Ext(cf.Path))
			if _, err = os.Stat(dest); err == nil {
				continue
			}
			if err = DownloadFile(ctx, httpClient, item, cf.DownloadURL, dest); err != nil {
				fmt.Printf("download error: %s -> %v\n", dest, err)
				continue
			}
			fmt.Printf("saved: %s (%d bytes)\n", dest, cf.Size)

			saved++
			if saved >= downloadCount {
				break OuterLoop
			}
			time.Sleep(6500 * time.Millisecond) // Sleep between downloads to mitigate rate limits
		}
		page++
	}
}

// SearchCode executes the search query against the GitHub code search API,
// returning the items found on the requested page.
func SearchCode(ctx context.Context, hc *http.Client, token string, page int, perPage int) ([]Item, error) {
	params := url.Values{}
	params.Set("q", query)
	params.Set("page", strconv.Itoa(page))
	params.Set("per_page", strconv.Itoa(perPage))

	req, _ := http.NewRequestWithContext(ctx, "GET", "https://api.github.com/search/code?"+params.Encode(), nil)
	SetHeaders(req, token)
	resp, err := hc.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if HandleRate(resp) {
		// HandleRate has already slept, so retry the same page.
		return SearchCode(ctx, hc, token, page, perPage)
	}
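	// Decode before checking the status code so that GitHub's error message,
	// if any, can be included in the returned error.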
	var sr CodeSearchResult
	if err = json.NewDecoder(resp.Body).Decode(&sr); err != nil {
		return nil, err
	}
	if resp.StatusCode != 200 {
		if sr.Message != "" {
			return nil, fmt.Errorf("search error: %s (HTTP %d)", sr.Message, resp.StatusCode)
		}
		return nil, fmt.Errorf("search error: HTTP %d", resp.StatusCode)
	}
	return sr.Items, nil
}

// GetContent fetches the metadata (ContentFile) for a search hit from the
// given contents API URL.
func GetContent(ctx context.Context, hc *http.Client, token string, contentsURL string) (*ContentFile, error) {
	req, _ := http.NewRequestWithContext(ctx, "GET", contentsURL, nil)
	SetHeaders(req, token)
	resp, err := hc.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if HandleRate(resp) {
		return GetContent(ctx, hc, token, contentsURL)
	}
	if resp.StatusCode != 200 {
		b, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("contents error: HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(b)))
	}
	var cf ContentFile
	if err = json.NewDecoder(resp.Body).Decode(&cf); err != nil {
		return nil, err
	}
	return &cf, nil
}

// DownloadFile downloads the given SQL file to the destination, prefixing it
// with a comment that records where it came from.
func DownloadFile(ctx context.Context, hc *http.Client, item Item, rawURL string, dest string) error {
	req, _ := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
	req.Header.Set("User-Agent", "gh-pg-dump-finder/1.0")
	resp, err := hc.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		return fmt.Errorf("download HTTP %d", resp.StatusCode)
	}
	out, err := os.Create(dest)
	if err != nil {
		return err
	}
	defer out.Close()
	_, _ = fmt.Fprintf(out, "-- Downloaded from: %s\n", item.HTMLURL)
	_, err = io.Copy(out, resp.Body)
	return err
}

// SetHeaders sets the appropriate headers for a request.
func SetHeaders(req *http.Request, token string) {
	req.Header.Set("Accept", "application/vnd.github.v3+json")
	req.Header.Set("User-Agent", "gh-pg-dump-finder/1.0")
	req.Header.Set("Authorization", "Bearer "+token)
}

// HandleRate checks a response for rate limiting. If the request was rate
// limited, it sleeps until the limit should have reset and returns true so
// the caller can retry; otherwise it returns false.
func HandleRate(resp *http.Response) bool {
	if resp.StatusCode == 403 || resp.StatusCode == 429 {
		// Secondary rate limits tell us exactly how long to back off.
		if ra := resp.Header.Get("Retry-After"); ra != "" {
			if secs, _ := strconv.Atoi(ra); secs > 0 {
				sleepTime := time.Duration(secs) * time.Second
				fmt.Printf("rate limited (%s), retrying\n", sleepTime.String())
				time.Sleep(sleepTime)
				return true
			}
		}
		// Primary rate limits report when the current window resets.
		if reset := resp.Header.Get("X-RateLimit-Reset"); reset != "" {
			if ts, _ := strconv.ParseInt(reset, 10, 64); ts > 0 {
				wait := time.Until(time.Unix(ts+5, 0))
				if wait > 0 && wait < 5*time.Minute {
					fmt.Printf("rate limited (%s), retrying\n", wait.String())
					time.Sleep(wait)
					return true
				}
			}
		}
	}
	return false
}

// SanitizePath replaces characters that are invalid in file names on common
// file systems with underscores.
func SanitizePath(s string) string {
	illegal := `<>:"\|/?*`
	for _, r := range illegal {
		s = strings.ReplaceAll(s, string(r), "_")
	}
	return s
}