diff --git a/.chloggen/gh-scrape-limits.yaml b/.chloggen/gh-scrape-limits.yaml
new file mode 100644
index 0000000000000..f312dd002c878
--- /dev/null
+++ b/.chloggen/gh-scrape-limits.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
+component: receiver/github
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add concurrency limit and pull request filtering to reduce rate limiting
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [43388]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []
diff --git a/receiver/githubreceiver/README.md b/receiver/githubreceiver/README.md
index 9c335bbed73b9..59673aad2575c 100644
--- a/receiver/githubreceiver/README.md
+++ b/receiver/githubreceiver/README.md
@@ -77,6 +77,8 @@ receivers:
         enabled: true
     github_org: <org_name>
     search_query: "org:<org_name> topic:<topic>" # Recommended optional query override, defaults to "{org,user}:<github_org>"
+    max_concurrent_requests: 100 # Optional, default: 100
+    pull_request_lookback_days: 30 # Optional, default: 30
     endpoint: "https://selfmanagedenterpriseserver.com" # Optional
     auth:
       authenticator: bearertokenauth/github
@@ -97,6 +99,10 @@ service:
 `search_query` (optional): A filter to narrow down repositories. Defaults to `org:<github_org>` (or `user:<github_org>`).
 For example, use `repo:<org>/<repo>` to target a specific repository. Any valid GitHub search syntax is allowed.
 
+`max_concurrent_requests` (optional, default: 100): Maximum number of concurrent API requests, capped to avoid exceeding GitHub's secondary rate limit of 100 concurrent requests. Set it lower if the token is shared with other tools, or higher (at your own risk) if you understand the implications.
+
+`pull_request_lookback_days` (optional, default: 30): Number of days to look back for merged pull requests. Set it to 0 for unlimited history. Open pull requests are always fetched regardless of this setting.
+
 `metrics` (optional): Enable or disable metrics scraping. See the [metrics documentation](./documentation.md) for details.
 
 ### Scraping
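As a quick illustration of how the two new options read in practice, here is a minimal sketch of a rate-limit-conscious configuration. The `myorg` value and the specific numbers are illustrative assumptions, not recommendations from this PR:

```yaml
receivers:
  github:
    scrapers:
      github:
        github_org: myorg               # hypothetical org name
        max_concurrent_requests: 50     # leave headroom if the token is shared
        pull_request_lookback_days: 7   # only recent merges are of interest
```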
diff --git a/receiver/githubreceiver/go.mod b/receiver/githubreceiver/go.mod
index 19155cbe1f1cd..0e808a1d1348c 100644
--- a/receiver/githubreceiver/go.mod
+++ b/receiver/githubreceiver/go.mod
@@ -31,6 +31,7 @@ require (
 	go.uber.org/goleak v1.3.0
 	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.27.1
+	golang.org/x/sync v0.18.0
 )
 
 require (
diff --git a/receiver/githubreceiver/go.sum b/receiver/githubreceiver/go.sum
index 6edb2b42d0e30..a9a51c7cdc58f 100644
--- a/receiver/githubreceiver/go.sum
+++ b/receiver/githubreceiver/go.sum
@@ -338,6 +338,8 @@ golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJ
 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
 golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
 golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
+golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/README.md b/receiver/githubreceiver/internal/scraper/githubscraper/README.md
index d960a1f2587ed..06ea5ec840d1e 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/README.md
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/README.md
@@ -42,23 +42,90 @@ to prevent abuse and maintain API availability.
 The following secondary limit is particularly relevant:
 
 - **Concurrent Requests Limit**: The API allows no more than 100 concurrent
-requests. This limit is shared across the REST and GraphQL APIs. Since the
-scraper creates a goroutine per repository, having more than 100 repositories
-returned by the `search_query` will result in exceeding this limit.
-It is recommended to use the `search_query` config option to limit the number of
-repositories that are scraped. We recommend one instance of the receiver per
-team (note: `team` is not a valid quantifier when searching repositories `topic`
-is). Reminder that each instance of the receiver should have its own
-corresponding token for authentication as this is what rate limits are tied to.
+requests. This limit is shared across the REST and GraphQL APIs. The scraper
+provides a `max_concurrent_requests` configuration option (default: 100) to
+control concurrency and reduce the likelihood of exceeding this limit.
 
-In summary, we recommend the following:
+## Configuration Options for Rate Limiting
+
+To reduce rate limiting, the GitHub scraper provides two configuration
+options:
+
+### Concurrency Control
+
+```yaml
+receivers:
+  github:
+    scrapers:
+      github:
+        max_concurrent_requests: 100 # Default: 100
+```
+
+The `max_concurrent_requests` option limits how many repositories are scraped
+concurrently. GitHub's API enforces a secondary rate limit of 100 concurrent
+requests (shared between the REST and GraphQL APIs). The default value of 100
+respects this limit.
+
+**When to adjust:**
+
+- Set it lower (e.g., 50) if other tools use GitHub's API with the same token.
+- Set it higher than 100 only if you accept the risk of secondary rate limit
+  errors.
+- The receiver logs a warning if this value exceeds 100.
+
+### Pull Request Time Filtering
+
+```yaml
+receivers:
+  github:
+    scrapers:
+      github:
+        pull_request_lookback_days: 30 # Default: 30
+```
+
+The `pull_request_lookback_days` option limits how far back to query for
+merged pull requests. Open pull requests are always fetched regardless of age.
+The scraper stops paginating through merged PRs once it encounters PRs merged
+before the lookback period, significantly reducing API calls for repositories
+with large PR histories.
+
+**When to adjust:**
+
+- Set it to 0 to fetch all historical pull requests (this may consume
+  significant API quota).
+- Increase it (e.g., 90) if your team's PR cycle time is longer.
+- Decrease it (e.g., 7) if you only care about recently merged PRs.
+
+**Note:** The implementation fetches open and merged PRs separately so that
+pagination over merged PRs can terminate early, minimizing unnecessary API
+calls.
+
+### Example Configuration
+
+```yaml
+receivers:
+  github:
+    collection_interval: 300s # 5 minutes
+    scrapers:
+      github:
+        github_org: myorg
+        search_query: "org:myorg topic:observability"
+        max_concurrent_requests: 50
+        pull_request_lookback_days: 30
+        endpoint: https://github.example.com # For GitHub Enterprise
+        auth:
+          authenticator: bearertokenauth/github
+```
+
+### Recommendations
+
+Based on the limitations above, we recommend:
 
 - One instance of the receiver per team
-- Each instance of the receiver should have its own token
-- Leverage `search_query` config option to limit repositories returned to 100 or
-less per instance
-- `collection_interval` should be long enough to avoid rate limiting (see above
-formula). A sensible default is `300s`.
+- Each instance should have its own token
+- Use `search_query` to limit repositories to a reasonable number
+- Set `collection_interval` to 300s (5 minutes) or higher to avoid primary
+  rate limits
+- Keep `max_concurrent_requests` at 100 (the default) or lower to prevent
+  secondary rate limit errors
+- Keep `pull_request_lookback_days` at 30 (the default) to limit historical
+  PR queries
 
 **Additional Resources:**
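A rough sense of scale behind the 300s recommendation, assuming (this is an illustrative estimate, not a figure from the PR) about four GraphQL requests per repository per scrape: 100 repositories × 4 requests ≈ 400 requests per scrape; at a 300s interval that is 12 scrapes per hour, or ≈ 4,800 requests per hour, which sits just under GitHub's documented 5,000-requests-per-hour primary quota for authenticated users.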
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/config.go b/receiver/githubreceiver/internal/scraper/githubscraper/config.go
index 554065f81bf18..2aa16bb686021 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/config.go
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/config.go
@@ -4,7 +4,10 @@
 package githubscraper // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal/scraper/githubscraper"
 
 import (
+	"errors"
+
 	"go.opentelemetry.io/collector/config/confighttp"
+	"go.uber.org/multierr"
 
 	"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal/metadata"
@@ -19,4 +22,27 @@ type Config struct {
 	GitHubOrg string `mapstructure:"github_org"`
 	// SearchQuery is the query to use when defining a custom search for repository data
 	SearchQuery string `mapstructure:"search_query"`
+	// MaxConcurrentRequests limits the number of concurrent API requests to prevent
+	// exceeding GitHub's secondary rate limit of 100 concurrent requests.
+	// Default: 100
+	MaxConcurrentRequests int `mapstructure:"max_concurrent_requests"`
+	// PullRequestLookbackDays limits how far back to query for merged pull requests.
+	// Open pull requests are always fetched regardless of age.
+	// Set to 0 to fetch all historical pull requests.
+	// Default: 30
+	PullRequestLookbackDays int `mapstructure:"pull_request_lookback_days"`
+}
+
+func (cfg *Config) Validate() error {
+	var errs error
+
+	if cfg.MaxConcurrentRequests <= 0 {
+		errs = multierr.Append(errs, errors.New("max_concurrent_requests must be greater than 0"))
+	}
+
+	if cfg.PullRequestLookbackDays < 0 {
+		errs = multierr.Append(errs, errors.New("pull_request_lookback_days cannot be negative"))
+	}
+
+	return errs
+}
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/config_test.go b/receiver/githubreceiver/internal/scraper/githubscraper/config_test.go
index 9df8b203709a8..d55b40e44b79b 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/config_test.go
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/config_test.go
@@ -23,9 +23,71 @@ func TestConfig(t *testing.T) {
 	clientConfig.Timeout = 15 * time.Second
 
 	expectedConfig := &Config{
-		MetricsBuilderConfig: metadata.DefaultMetricsBuilderConfig(),
-		ClientConfig:         clientConfig,
+		MetricsBuilderConfig:    metadata.DefaultMetricsBuilderConfig(),
+		ClientConfig:            clientConfig,
+		MaxConcurrentRequests:   100,
+		PullRequestLookbackDays: 30,
 	}
 
 	assert.Equal(t, expectedConfig, defaultConfig)
 }
+
+func TestConfigValidate(t *testing.T) {
+	tests := []struct {
+		name        string
+		config      *Config
+		expectedErr string
+	}{
+		{
+			name: "valid config with defaults",
+			config: &Config{
+				MaxConcurrentRequests:   100,
+				PullRequestLookbackDays: 30,
+			},
+			expectedErr: "",
+		},
+		{
+			name: "valid config with zero lookback (unlimited)",
+			config: &Config{
+				MaxConcurrentRequests:   50,
+				PullRequestLookbackDays: 0,
+			},
+			expectedErr: "",
+		},
+		{
+			name: "invalid negative max_concurrent_requests",
+			config: &Config{
+				MaxConcurrentRequests:   -1,
+				PullRequestLookbackDays: 30,
+			},
+			expectedErr: "max_concurrent_requests must be greater than 0",
+		},
+		{
+			name: "invalid zero max_concurrent_requests",
+			config: &Config{
+				MaxConcurrentRequests:   0,
+				PullRequestLookbackDays: 30,
+			},
+			expectedErr: "max_concurrent_requests must be greater than 0",
+		},
+		{
+			name: "invalid negative lookback days",
+			config: &Config{
+				MaxConcurrentRequests:   100,
+				PullRequestLookbackDays: -1,
+			},
+			expectedErr: "pull_request_lookback_days cannot be negative",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := tt.config.Validate()
+			if tt.expectedErr == "" {
+				assert.NoError(t, err)
+			} else {
+				assert.ErrorContains(t, err, tt.expectedErr)
+			}
+		})
+	}
+}
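For reviewers unfamiliar with `go.uber.org/multierr`: the `Validate` method above reports both problems at once rather than stopping at the first. A standalone sketch of that behavior (a hypothetical `main` with local types mirroring the PR's logic, not the actual `Config`):

```go
package main

import (
	"errors"
	"fmt"

	"go.uber.org/multierr"
)

type config struct {
	maxConcurrentRequests   int
	pullRequestLookbackDays int
}

// validate mirrors the PR's Validate: each failed check appends an error,
// and multierr combines them into a single aggregated error value.
func (c config) validate() error {
	var errs error
	if c.maxConcurrentRequests <= 0 {
		errs = multierr.Append(errs, errors.New("max_concurrent_requests must be greater than 0"))
	}
	if c.pullRequestLookbackDays < 0 {
		errs = multierr.Append(errs, errors.New("pull_request_lookback_days cannot be negative"))
	}
	return errs
}

func main() {
	// Both fields invalid: the returned error carries both messages.
	err := config{maxConcurrentRequests: 0, pullRequestLookbackDays: -1}.validate()
	fmt.Println(err)
	// max_concurrent_requests must be greater than 0; pull_request_lookback_days cannot be negative
}
```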
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/factory.go b/receiver/githubreceiver/internal/scraper/githubscraper/factory.go
index ff9d37e04235d..ae9bcd73bb47a 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/factory.go
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/factory.go
@@ -18,8 +18,10 @@ import (
 // This file implements factory for the GitHub Scraper as part of the GitHub Receiver
 
 const (
-	TypeStr            = "scraper"
-	defaultHTTPTimeout = 15 * time.Second
+	TypeStr                        = "scraper"
+	defaultHTTPTimeout             = 15 * time.Second
+	defaultMaxConcurrentRequests   = 100
+	defaultPullRequestLookbackDays = 30
 )
 
 type Factory struct{}
@@ -28,8 +30,10 @@ func (*Factory) CreateDefaultConfig() internal.Config {
 	clientConfig := confighttp.NewDefaultClientConfig()
 	clientConfig.Timeout = defaultHTTPTimeout
 
 	return &Config{
-		MetricsBuilderConfig: metadata.DefaultMetricsBuilderConfig(),
-		ClientConfig:         clientConfig,
+		MetricsBuilderConfig:    metadata.DefaultMetricsBuilderConfig(),
+		ClientConfig:            clientConfig,
+		MaxConcurrentRequests:   defaultMaxConcurrentRequests,
+		PullRequestLookbackDays: defaultPullRequestLookbackDays,
 	}
 }
"+ + "This may result in secondary rate limit errors.", + ghs.cfg.MaxConcurrentRequests, + ) + } + + // Initialize semaphore for concurrency control + ghs.sem = semaphore.NewWeighted(int64(ghs.cfg.MaxConcurrentRequests)) + // Get the ref (branch) count (future branch data) for each repo and record // the given metrics var wg sync.WaitGroup @@ -110,6 +134,13 @@ func (ghs *githubScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { go func() { defer wg.Done() + // Acquire semaphore before making API calls + if err := ghs.sem.Acquire(ctx, 1); err != nil { + ghs.logger.Sugar().Errorf("failed to acquire semaphore: %v", zap.Error(err)) + return + } + defer ghs.sem.Release(1) + branches, count, err := ghs.getBranches(ctx, genClient, name, trunk) if err != nil { ghs.logger.Sugar().Errorf("error getting branch count: %v", zap.Error(err)) @@ -164,7 +195,7 @@ func (ghs *githubScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { ghs.mb.RecordVcsContributorCountDataPoint(now, int64(contribs), url, name) // Get change (pull request) data - prs, err := ghs.getPullRequests(ctx, genClient, name) + prs, err := ghs.getPullRequests(ctx, genClient, name, ghs.cfg.PullRequestLookbackDays) if err != nil { ghs.logger.Sugar().Errorf("error getting pull requests: %v", zap.Error(err)) } diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go b/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go index ee9a894e199c9..162d0484a5afa 100644 --- a/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go +++ b/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go @@ -181,13 +181,53 @@ func (ghs *githubScraper) getContributorCount( } // Get the pull request data from the GraphQL API. +// For merged PRs, only returns those created within lookbackDays. +// For open PRs, returns all regardless of age. +// Set lookbackDays to 0 to fetch all historical PRs. func (ghs *githubScraper) getPullRequests( ctx context.Context, client graphql.Client, repoName string, + lookbackDays int, +) ([]PullRequestNode, error) { + var pullRequests []PullRequestNode + + // Calculate cutoff time for merged/closed PRs + var cutoffTime time.Time + if lookbackDays > 0 { + cutoffTime = time.Now().AddDate(0, 0, -lookbackDays) + } + + // Fetch open PRs (all of them, regardless of age) + openPRs, err := ghs.fetchPRsByState(ctx, client, repoName, []PullRequestState{"OPEN"}, time.Time{}) + if err != nil { + return nil, err + } + pullRequests = append(pullRequests, openPRs...) + + // Fetch merged PRs with optional time filtering + mergedPRs, err := ghs.fetchPRsByState(ctx, client, repoName, []PullRequestState{"MERGED"}, cutoffTime) + if err != nil { + return nil, err + } + pullRequests = append(pullRequests, mergedPRs...) + + return pullRequests, nil +} + +// fetchPRsByState fetches pull requests for a specific state with optional time filtering. +// If cutoffTime is zero, all PRs are fetched. Otherwise, pagination stops when encountering +// PRs older than the cutoff time. 
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go b/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go
index ee9a894e199c9..162d0484a5afa 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/helpers.go
@@ -181,13 +181,53 @@ func (ghs *githubScraper) getContributorCount(
 }
 
 // Get the pull request data from the GraphQL API.
+// For merged PRs, only those merged within lookbackDays are returned.
+// For open PRs, all are returned regardless of age.
+// Set lookbackDays to 0 to fetch all historical PRs.
 func (ghs *githubScraper) getPullRequests(
 	ctx context.Context,
 	client graphql.Client,
 	repoName string,
+	lookbackDays int,
+) ([]PullRequestNode, error) {
+	var pullRequests []PullRequestNode
+
+	// Calculate the cutoff time for merged PRs
+	var cutoffTime time.Time
+	if lookbackDays > 0 {
+		cutoffTime = time.Now().AddDate(0, 0, -lookbackDays)
+	}
+
+	// Fetch open PRs (all of them, regardless of age)
+	openPRs, err := ghs.fetchPRsByState(ctx, client, repoName, []PullRequestState{"OPEN"}, time.Time{})
+	if err != nil {
+		return nil, err
+	}
+	pullRequests = append(pullRequests, openPRs...)
+
+	// Fetch merged PRs with optional time filtering
+	mergedPRs, err := ghs.fetchPRsByState(ctx, client, repoName, []PullRequestState{"MERGED"}, cutoffTime)
+	if err != nil {
+		return nil, err
+	}
+	pullRequests = append(pullRequests, mergedPRs...)
+
+	return pullRequests, nil
+}
+
+// fetchPRsByState fetches pull requests for a specific state with optional time filtering.
+// If cutoffTime is zero, all PRs are fetched. Otherwise, pagination stops once a page
+// contains no PRs merged after the cutoff time.
+func (ghs *githubScraper) fetchPRsByState(
+	ctx context.Context,
+	client graphql.Client,
+	repoName string,
+	states []PullRequestState,
+	cutoffTime time.Time,
 ) ([]PullRequestNode, error) {
 	var cursor *string
 	var pullRequests []PullRequestNode
+	hasTimeCutoff := !cutoffTime.IsZero()
 
 	for hasNextPage := true; hasNextPage; {
 		prs, err := getPullRequestData(
@@ -197,15 +237,38 @@
 			ghs.cfg.GitHubOrg,
 			defaultReturnItems,
 			cursor,
-			[]PullRequestState{"OPEN", "MERGED"},
+			states,
 		)
 		if err != nil {
 			return nil, err
 		}
 
-		pullRequests = append(pullRequests, prs.Repository.PullRequests.Nodes...)
+		// Track whether this page contained any PRs within the time window
+		foundRecentPR := false
+
+		for i := range prs.Repository.PullRequests.Nodes {
+			pr := &prs.Repository.PullRequests.Nodes[i]
+
+			// If we have a time cutoff and this is a merged PR, check the merge time
+			if hasTimeCutoff && pr.Merged {
+				if pr.MergedAt.IsZero() || pr.MergedAt.Before(cutoffTime) {
+					// This PR is too old, skip it
+					continue
+				}
+				foundRecentPR = true
+			}
+
+			pullRequests = append(pullRequests, *pr)
+		}
+
 		cursor = &prs.Repository.PullRequests.PageInfo.EndCursor
 		hasNextPage = prs.Repository.PullRequests.PageInfo.HasNextPage
+
+		// Early termination: if we have a time cutoff and found no recent PRs in
+		// this page, stop paginating (assumes PRs are roughly ordered by time)
+		if hasTimeCutoff && !foundRecentPR && len(prs.Repository.PullRequests.Nodes) > 0 {
+			hasNextPage = false
+		}
 	}
 
 	return pullRequests, nil
diff --git a/receiver/githubreceiver/internal/scraper/githubscraper/helpers_test.go b/receiver/githubreceiver/internal/scraper/githubscraper/helpers_test.go
index b9cb72f8da789..74f54597e80c2 100644
--- a/receiver/githubreceiver/internal/scraper/githubscraper/helpers_test.go
+++ b/receiver/githubreceiver/internal/scraper/githubscraper/helpers_test.go
@@ -452,7 +452,7 @@ func TestGetPullRequests(t *testing.T) {
 			defer server.Close()
 			client := graphql.NewClient(server.URL, ghs.client)
 
-			prs, err := ghs.getPullRequests(t.Context(), client, "repo name")
+			prs, err := ghs.getPullRequests(t.Context(), client, "repo name", 0)
 
 			assert.Len(t, prs, tc.expectedPrCount)
 			if tc.expectedErr == nil {
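The early-termination logic in `fetchPRsByState` is easiest to see in isolation. Below is a simplified model (hypothetical `page`/`pr` types, not the receiver's generated GraphQL types) of walking roughly time-ordered pages and stopping once a page yields nothing inside the lookback window:

```go
package main

import (
	"fmt"
	"time"
)

type pr struct {
	title    string
	mergedAt time.Time
}

// page is a hypothetical stand-in for one GraphQL response page.
type page struct {
	items   []pr
	hasNext bool
}

// collectRecent keeps PRs merged after the cutoff and stops paginating once a
// page produces no recent items, mirroring the PR's early-termination check.
func collectRecent(pages []page, cutoff time.Time) []pr {
	var out []pr
	for _, p := range pages {
		foundRecent := false
		for _, item := range p.items {
			if item.mergedAt.Before(cutoff) {
				continue // too old, skip
			}
			foundRecent = true
			out = append(out, item)
		}
		// A non-empty page with no recent items means the rest of the
		// (roughly time-ordered) history is older still.
		if !foundRecent && len(p.items) > 0 {
			break
		}
		if !p.hasNext {
			break
		}
	}
	return out
}

func main() {
	cutoff := time.Now().AddDate(0, 0, -30) // 30-day lookback
	pages := []page{
		{items: []pr{{"recent fix", time.Now().AddDate(0, 0, -2)}}, hasNext: true},
		{items: []pr{{"old refactor", time.Now().AddDate(0, 0, -90)}}, hasNext: true},
		{items: []pr{{"ancient change", time.Now().AddDate(0, -6, 0)}}, hasNext: false},
	}
	fmt.Println(len(collectRecent(pages, cutoff))) // 1: pagination stops at page 2
}
```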