Skip to content

Commit 4d6ec3a

Browse files
authored
feat(stars): Add a sampling mode to fetch star history for large repositories (#248)
- Add GitHub Max sample pages configuration option - Implement Link header parsing to get the actual maximum number of pages - Add a sampling algorithm to evenly distribute the page fetching strategy - Support pagination sampling data fetching for large repositories - Optimize concurrent request limits and error handling mechanisms - Change the Y-axis data of the chart to use actual counts instead of index values
1 parent 3ae9911 commit 4d6ec3a

File tree

6 files changed

+284
-24
lines changed

6 files changed

+284
-24
lines changed

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@
66

77
Plot your repo stars over time!
88

9+
## Features
10+
11+
### Smart Sampling Mode (Large Repository Optimization)
12+
13+
For large repositories with massive amounts of stars, this project uses **Smart Sampling Mode** to efficiently fetch star history data and render trend charts.
14+
15+
**How it works:**
16+
17+
1. **Auto Detection**: Requests the first page of stargazers from the GitHub API and parses the `Link` header to get the total page count
18+
2. **Mode Switching**:
19+
- When total pages ≤ `maxSamplePages` (default 15 pages, ~1500 stars), fetches all data
20+
- When total pages > `maxSamplePages`, automatically switches to sampling mode
21+
3. **Uniform Sampling**: Evenly selects sample points across all pages to ensure coverage of the complete star growth timeline
22+
4. **Data Point Extraction**: Uses the first stargazer of each sampled page as a data point: its `starred_at` timestamp, paired with its position in the star list, computed as `(page - 1) × page size + 1`
23+
5. **Trend Completion**: Adds current time and total star count as the final data point to ensure the chart extends to the latest state
24+
925
## Usage
1026

1127
```console
@@ -14,6 +30,19 @@ go run main.go
1430

1531
Then browse http://localhost:3000/me/myrepo .
1632

17-
Example chart:
33+
## Configuration
34+
35+
Configure via environment variables:
36+
37+
| Variable | Default | Description |
38+
|----------|---------|-------------|
39+
| `REDIS_URL` | `redis://localhost:6379` | Redis cache URL |
40+
| `GITHUB_TOKENS` | - | GitHub API Token (supports multiple, comma-separated) |
41+
| `GITHUB_PAGE_SIZE` | `100` | Number of stars per page |
42+
| `GITHUB_MAX_SAMPLE_PAGES` | `15` | Maximum number of pages fetched in full; repositories with more pages switch to sampling mode, which requests at most this many sample pages |
43+
| `GITHUB_MAX_RATE_LIMIT_USAGE` | `80` | API Rate Limit usage threshold percentage |
44+
| `LISTEN` | `127.0.0.1:3000` | Server listen address |
45+
46+
## Example
1847

1948
[![starcharts stargazers over time](https://starchart.cc/caarlos0/starcharts.svg)](https://starchart.cc/caarlos0/starcharts)

config/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type Config struct {
1111
GitHubTokens []string `env:"GITHUB_TOKENS"`
1212
GitHubPageSize int `env:"GITHUB_PAGE_SIZE" envDefault:"100"`
1313
GitHubMaxRateUsagePct int `env:"GITHUB_MAX_RATE_LIMIT_USAGE" envDefault:"80"`
14+
GitHubMaxSamplePages int `env:"GITHUB_MAX_SAMPLE_PAGES" envDefault:"15"`
1415
Listen string `env:"LISTEN" envDefault:"127.0.0.1:3000"`
1516
}
1617

controller/chart.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,13 @@ func GetRepoChart(gh *github.GitHub, cache *cache.Redis) http.Handler {
6565
}
6666
for i, star := range stargazers {
6767
series.XValues = append(series.XValues, star.StarredAt)
68-
series.YValues = append(series.YValues, float64(i+1))
68+
// If star.Count > 0, use the actual count from sampling mode
69+
// Otherwise use index+1 (non-sampling mode, continuous data)
70+
if star.Count > 0 {
71+
series.YValues = append(series.YValues, float64(star.Count))
72+
} else {
73+
series.YValues = append(series.YValues, float64(i+1))
74+
}
6975
}
7076
if len(series.XValues) < 2 {
7177
log.Info("not enough results, adding some fake ones")

internal/github/github.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ var ErrGitHubAPI = errors.New("failed to talk with github api")
2424
type GitHub struct {
2525
tokens roundrobin.RoundRobiner
2626
pageSize int
27+
maxSamplePages int
2728
cache *cache.Redis
2829
maxRateUsagePct int
2930
}
@@ -66,9 +67,10 @@ func init() {
6667
func New(config config.Config, cache *cache.Redis) *GitHub {
6768
tokensCount.Set(float64(len(config.GitHubTokens)))
6869
return &GitHub{
69-
tokens: roundrobin.New(config.GitHubTokens),
70-
pageSize: config.GitHubPageSize,
71-
cache: cache,
70+
tokens: roundrobin.New(config.GitHubTokens),
71+
pageSize: config.GitHubPageSize,
72+
maxSamplePages: config.GitHubMaxSamplePages,
73+
cache: cache,
7274
}
7375
}
7476

internal/github/stars.go

Lines changed: 237 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ import (
66
"errors"
77
"fmt"
88
"io"
9+
"math"
910
"net/http"
11+
"regexp"
1012
"sort"
13+
"strconv"
1114
"sync"
1215
"time"
1316

@@ -16,28 +19,128 @@ import (
1619
)
1720

1821
// errNoMorePages signals that a requested page is past the end of the list.
var errNoMorePages = errors.New("no more pages to get")

// linkLastPageRegex captures the page number of the rel="last" entry of a
// GitHub Link response header, e.g. the 37 in
// <https://…/stargazers?page=37&per_page=100>; rel="last".
var linkLastPageRegex = regexp.MustCompile(`[&?]page=(\d+)[^>]*>;\s*rel="last"`)

// maxConcurrentRequests caps how many GitHub API requests run in parallel.
const maxConcurrentRequests = 5
29+
2330
// Stargazer is a single star event: the moment a user starred the repository.
type Stargazer struct {
	// StarredAt is when the star was given (JSON field "starred_at").
	StarredAt time.Time `json:"starred_at"`

	// Count is the position of this star in the repository's star list.
	// It is only set in sampling mode and is excluded from JSON; when it is
	// zero, consumers fall back to index+1 (non-sampling mode).
	Count int `json:"-"`
}
2737

2838
// Stargazers returns all the stargazers of a given repo.
39+
// If star count is too large, it uses sampling mode to fetch data points.
2940
func (gh *GitHub) Stargazers(ctx context.Context, repo Repository) (stars []Stargazer, err error) {
30-
if gh.totalPages(repo) > 400 {
31-
return stars, ErrTooManyStars
41+
// First request the first page to get the actual max page count (via Link header)
42+
firstPageStars, lastPage, err := gh.getFirstPageAndLastPage(ctx, repo)
43+
if err != nil {
44+
return nil, err
45+
}
46+
47+
log.WithField("repo", repo.FullName).
48+
WithField("lastPage", lastPage).
49+
WithField("starCount", repo.StargazersCount).
50+
Debug("got pagination info from API")
51+
52+
// If only one page or page count is less than max sample pages, fetch all pages
53+
if lastPage <= gh.maxSamplePages {
54+
return gh.getAllStargazersWithFirstPage(ctx, repo, firstPageStars, lastPage)
55+
}
56+
57+
// Otherwise use sampling mode
58+
return gh.getSampledStargazers(ctx, repo, firstPageStars, lastPage)
59+
}
60+
61+
// getFirstPageAndLastPage requests the first page and parses the Link header to get the max page count.
62+
func (gh *GitHub) getFirstPageAndLastPage(ctx context.Context, repo Repository) ([]Stargazer, int, error) {
63+
log := log.WithField("repo", repo.FullName)
64+
65+
resp, err := gh.makeStarPageRequest(ctx, repo, 1, "")
66+
if err != nil {
67+
return nil, 0, err
68+
}
69+
defer resp.Body.Close()
70+
71+
if resp.StatusCode == http.StatusForbidden {
72+
rateLimits.Inc()
73+
log.Warn("rate limit hit")
74+
return nil, 0, ErrRateLimit
75+
}
76+
77+
if resp.StatusCode != http.StatusOK {
78+
bts, _ := io.ReadAll(resp.Body)
79+
return nil, 0, fmt.Errorf("%w: %v", ErrGitHubAPI, string(bts))
80+
}
81+
82+
bts, err := io.ReadAll(resp.Body)
83+
if err != nil {
84+
return nil, 0, err
85+
}
86+
87+
var stars []Stargazer
88+
if err := json.Unmarshal(bts, &stars); err != nil {
89+
return nil, 0, err
90+
}
91+
92+
// Parse Link header to get the max page count
93+
linkHeader := resp.Header.Get("Link")
94+
lastPage := gh.parseLastPageFromLink(linkHeader)
95+
96+
// If no Link header or parsing failed, there is only one page
97+
if lastPage == 0 {
98+
lastPage = 1
99+
}
100+
101+
log.WithField("lastPage", lastPage).Debug("parsed last page from Link header")
102+
103+
return stars, lastPage, nil
104+
}
105+
106+
// parseLastPageFromLink parses the max page count from the Link header.
107+
// Link header format: <url>; rel="next", <url>; rel="last"
108+
func (gh *GitHub) parseLastPageFromLink(linkHeader string) int {
109+
if linkHeader == "" {
110+
return 0
111+
}
112+
113+
matches := linkLastPageRegex.FindStringSubmatch(linkHeader)
114+
if len(matches) < 2 {
115+
return 0
116+
}
117+
118+
lastPage, err := strconv.Atoi(matches[1])
119+
if err != nil {
120+
return 0
121+
}
122+
123+
return lastPage
124+
}
125+
126+
// getAllStargazersWithFirstPage fetches all stargazers (used for small repositories).
127+
// firstPageStars is the already fetched first page data.
128+
func (gh *GitHub) getAllStargazersWithFirstPage(ctx context.Context, repo Repository, firstPageStars []Stargazer, lastPage int) (stars []Stargazer, err error) {
129+
stars = append(stars, firstPageStars...)
130+
131+
// If only one page, return directly
132+
if lastPage <= 1 {
133+
return stars, nil
32134
}
33135

34136
var (
35137
wg errgroup.Group
36138
lock sync.Mutex
37139
)
38140

39-
wg.SetLimit(4)
40-
for page := 1; page <= gh.lastPage(repo); page++ {
141+
wg.SetLimit(maxConcurrentRequests)
142+
// Start fetching from page 2 (page 1 is already fetched)
143+
for page := 2; page <= lastPage; page++ {
41144
page := page
42145
wg.Go(func() error {
43146
result, err := gh.getStargazersPage(ctx, repo, page)
@@ -61,6 +164,134 @@ func (gh *GitHub) Stargazers(ctx context.Context, repo Repository) (stars []Star
61164
return
62165
}
63166

167+
// getSampledStargazers fetches stargazers using sampling mode (used for large repositories).
168+
// Inspired by star-history project's sampling logic.
169+
// firstPageStars is the already fetched first page data, lastPage is the actual max page count parsed from Link header.
170+
func (gh *GitHub) getSampledStargazers(ctx context.Context, repo Repository, firstPageStars []Stargazer, lastPage int) (stars []Stargazer, err error) {
171+
log.WithField("repo", repo.FullName).
172+
WithField("lastPage", lastPage).
173+
Info("using sampling mode for large repo")
174+
175+
// Calculate sample page numbers, evenly distributed across all pages
176+
samplePages := gh.calculateSamplePages(lastPage, gh.maxSamplePages)
177+
178+
type pageResult struct {
179+
page int
180+
star Stargazer
181+
starCount int // the actual count position of this star
182+
}
183+
184+
var (
185+
wg errgroup.Group
186+
lock sync.Mutex
187+
results []pageResult
188+
)
189+
190+
// First page is already fetched, add it to results directly
191+
if len(firstPageStars) > 0 {
192+
results = append(results, pageResult{
193+
page: 1,
194+
star: firstPageStars[0],
195+
starCount: 1,
196+
})
197+
}
198+
199+
wg.SetLimit(maxConcurrentRequests)
200+
for _, page := range samplePages {
201+
// Skip first page (already fetched)
202+
if page == 1 {
203+
continue
204+
}
205+
page := page
206+
wg.Go(func() error {
207+
result, err := gh.getStargazersPage(ctx, repo, page)
208+
if errors.Is(err, errNoMorePages) {
209+
return nil
210+
}
211+
if err != nil {
212+
return err
213+
}
214+
if len(result) == 0 {
215+
return nil
216+
}
217+
218+
// Calculate the actual position of the first star on this page (based on page number and page size)
219+
// The 1st star on page 1 is star #1
220+
// The 1st star on page N is star #(N-1)*pageSize + 1
221+
starCount := (page-1)*gh.pageSize + 1
222+
223+
lock.Lock()
224+
defer lock.Unlock()
225+
results = append(results, pageResult{
226+
page: page,
227+
star: result[0],
228+
starCount: starCount,
229+
})
230+
return nil
231+
})
232+
}
233+
234+
if err = wg.Wait(); err != nil {
235+
return nil, err
236+
}
237+
238+
// Sort results by page number
239+
sort.Slice(results, func(i, j int) bool {
240+
return results[i].page < results[j].page
241+
})
242+
243+
// Extract the first star from each sampled page as a data point and set Count
244+
for _, r := range results {
245+
star := r.star
246+
star.Count = r.starCount
247+
stars = append(stars, star)
248+
}
249+
250+
// Add the last data point (current time and total star count)
251+
// This ensures the chart extends to the current time point
252+
stars = append(stars, Stargazer{
253+
StarredAt: time.Now(),
254+
Count: repo.StargazersCount,
255+
})
256+
257+
return stars, nil
258+
}
259+
260+
// calculateSamplePages calculates the page numbers to sample.
261+
// Evenly distributed across all pages, ensuring the first page is included.
262+
func (gh *GitHub) calculateSamplePages(totalPages, maxSamples int) []int {
263+
pages := make([]int, 0, maxSamples)
264+
265+
for i := 1; i <= maxSamples; i++ {
266+
// Calculate evenly distributed page numbers
267+
page := int(math.Round(float64(i*totalPages) / float64(maxSamples)))
268+
if page < 1 {
269+
page = 1
270+
}
271+
if page > totalPages {
272+
page = totalPages
273+
}
274+
pages = append(pages, page)
275+
}
276+
277+
// Ensure first page is included (important for displaying start time)
278+
if len(pages) > 0 && pages[0] != 1 {
279+
pages[0] = 1
280+
}
281+
282+
// Deduplicate (may have duplicates in edge cases)
283+
seen := make(map[int]bool)
284+
uniquePages := make([]int, 0, len(pages))
285+
for _, p := range pages {
286+
if !seen[p] {
287+
seen[p] = true
288+
uniquePages = append(uniquePages, p)
289+
}
290+
}
291+
292+
return uniquePages
293+
}
294+
64295
// - get last modified from cache
65296
// - if exists, hit api with it
66297
// - if it returns 304, get from cache
@@ -139,14 +370,6 @@ func (gh *GitHub) getStargazersPage(ctx context.Context, repo Repository, page i
139370
}
140371
}
141372

142-
func (gh *GitHub) totalPages(repo Repository) int {
143-
return repo.StargazersCount / gh.pageSize
144-
}
145-
146-
func (gh *GitHub) lastPage(repo Repository) int {
147-
return gh.totalPages(repo) + 1
148-
}
149-
150373
func (gh *GitHub) makeStarPageRequest(ctx context.Context, repo Repository, page int, etag string) (*http.Response, error) {
151374
url := fmt.Sprintf(
152375
"https://api.github.com/repos/%s/stargazers?page=%d&per_page=%d",

0 commit comments

Comments
 (0)