Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions backend/plugins/github/README_FILTERING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# GitHub Plugin - Bot Filtering

## Overview

The GitHub plugin supports filtering bot-generated PRs, reviews, and comments from data collection to prevent them from skewing metrics like lead time for changes and PR pickup time.

## Configuration

Set the `GITHUB_PR_EXCLUDELIST` environment variable with a comma-separated list of bot usernames to exclude:

```bash
export GITHUB_PR_EXCLUDELIST="renovate[bot],dependabot[bot],github-actions[bot]"
```

## Common Bot Usernames

- `renovate[bot]` - Renovate dependency updates
- `dependabot[bot]` - GitHub Dependabot
- `github-actions[bot]` - GitHub Actions automated PRs
- `sonarcloud[bot]` - SonarCloud code analysis
- `codecov[bot]` - Codecov coverage reports

## What Gets Filtered

When a username is in the exclusion list, the following entities are NOT collected:

1. **Pull Requests** - PRs authored by bots
2. **PR Reviews** - Reviews submitted by bots
3. **PR Review Comments** - Comments on PR reviews by bots
4. **Issue Comments** - Comments by bots on issues and on pull request conversations

## How It Works

- Filtering happens at the **extraction** layer
- Raw API responses are still saved (in `_raw_github_api_*` tables)
- Filtered entities never reach the tool layer tables
- Metrics queries only see non-bot entities

## Matching Rules

- **Case-insensitive**: `renovate[bot]` matches `Renovate[bot]` and `RENOVATE[BOT]`
- **Exact match**: Must match the full username
- **Whitespace trimmed**: Extra spaces in the config are ignored

## Examples

### Docker Compose

```yaml
services:
  devlake:
    environment:
      - GITHUB_PR_EXCLUDELIST=renovate[bot],dependabot[bot]
```

### Kubernetes

```yaml
env:
  - name: GITHUB_PR_EXCLUDELIST
    value: "renovate[bot],dependabot[bot],github-actions[bot]"
```

### Local Development

```bash
# .env file
GITHUB_PR_EXCLUDELIST=renovate[bot],dependabot[bot]
```

## Updating the Exclusion List

Changes to `GITHUB_PR_EXCLUDELIST` require a DevLake restart. After updating:

1. Restart DevLake
2. Trigger re-collection for affected repositories
3. Note that previously collected bot data remains in the database
4. New collections will respect the updated filter

## Verification

Check logs for filtering activity:

```
DEBUG: Skipping PR #123 from bot user: renovate[bot]
DEBUG: Skipping review #456 from bot user: dependabot[bot]
```

## Troubleshooting

**Bot PRs still appearing in metrics:**

1. Verify `GITHUB_PR_EXCLUDELIST` is set correctly
2. Check DevLake logs for "Skipping" messages
3. Ensure the configured entry is the full username, including the `[bot]` suffix (letter case is ignored)
4. Restart DevLake after config changes
5. Re-run collection for the repository

**How to find bot usernames:**

Check GitHub PR/comment authors in the web UI - bot usernames typically end with `[bot]`.
39 changes: 39 additions & 0 deletions backend/plugins/github/e2e/pr_review_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ limitations under the License.
package e2e

import (
"os"
"testing"

"github.com/apache/incubator-devlake/core/dal"
"github.com/apache/incubator-devlake/core/models/domainlayer/code"
"github.com/apache/incubator-devlake/helpers/e2ehelper"
"github.com/apache/incubator-devlake/plugins/github/impl"
Expand Down Expand Up @@ -99,3 +101,40 @@ func TestPrReviewDataFlow(t *testing.T) {
},
)
}

// TestPrReviewDataFlowWithBotFiltering verifies that the review extractor
// drops reviews authored by a username listed in GITHUB_PR_EXCLUDELIST and
// keeps reviews from every other user.
//
// The fixture contains two reviews: #5001 by "renovate[bot]" and #5002 by
// "human-reviewer"; only #5002 is expected in the tool-layer table.
func TestPrReviewDataFlowWithBotFiltering(t *testing.T) {
	var plugin impl.Github
	dataflowTester := e2ehelper.NewDataFlowTester(t, "github", plugin)

	// Set up bot filtering. Fail fast if the environment cannot be changed,
	// otherwise the assertions below would fail for a misleading reason.
	if err := os.Setenv("GITHUB_PR_EXCLUDELIST", "renovate[bot]"); err != nil {
		t.Fatalf("failed to set GITHUB_PR_EXCLUDELIST: %v", err)
	}
	defer os.Unsetenv("GITHUB_PR_EXCLUDELIST")

	taskData := &tasks.GithubTaskData{
		Options: &tasks.GithubOptions{
			ConnectionId: 1,
			Name:         "test/repo",
			GithubId:     123,
		},
	}

	// import raw data table with bot and human reviews
	dataflowTester.ImportCsvIntoRawTable("./raw_tables/_raw_github_api_pr_reviews_bot_filter.csv", "_raw_github_api_pull_request_reviews")

	// verify review extraction filters bot reviews
	dataflowTester.FlushTabler(&models.GithubPrReview{})
	dataflowTester.FlushTabler(&models.GithubReviewer{})
	dataflowTester.FlushTabler(&models.GithubRepoAccount{})
	dataflowTester.Subtask(tasks.ExtractApiPullRequestReviewsMeta, taskData)

	// Verify only human review was extracted. A query failure is reported as
	// such instead of surfacing as "got 0".
	var reviews []models.GithubPrReview
	if err := dataflowTester.Dal.All(&reviews, dal.Where("connection_id = ?", 1)); err != nil {
		t.Fatalf("failed to query extracted reviews: %v", err)
	}

	if len(reviews) != 1 {
		t.Errorf("Expected 1 review (human), got %d", len(reviews))
	}
	if len(reviews) > 0 && reviews[0].GithubId != 5002 {
		t.Errorf("Expected review #5002 (human), got #%d", reviews[0].GithubId)
	}
}
39 changes: 39 additions & 0 deletions backend/plugins/github/e2e/pr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ limitations under the License.
package e2e

import (
"os"
"testing"

"github.com/apache/incubator-devlake/core/dal"
"github.com/apache/incubator-devlake/core/models/domainlayer/code"
"github.com/apache/incubator-devlake/helpers/e2ehelper"
"github.com/apache/incubator-devlake/plugins/github/impl"
Expand Down Expand Up @@ -171,3 +173,40 @@ func TestPrDataFlow(t *testing.T) {
},
)
}

// TestPrDataFlowWithBotFiltering verifies that the pull request extractor
// drops PRs authored by a username listed in GITHUB_PR_EXCLUDELIST and keeps
// PRs from every other user.
//
// The fixture contains two PRs: #999 by "renovate[bot]" and #1000 by
// "human-dev"; only #1000 is expected in the tool-layer table.
func TestPrDataFlowWithBotFiltering(t *testing.T) {
	var plugin impl.Github
	dataflowTester := e2ehelper.NewDataFlowTester(t, "github", plugin)

	// Set up bot filtering. Fail fast if the environment cannot be changed,
	// otherwise the assertions below would fail for a misleading reason.
	if err := os.Setenv("GITHUB_PR_EXCLUDELIST", "renovate[bot]"); err != nil {
		t.Fatalf("failed to set GITHUB_PR_EXCLUDELIST: %v", err)
	}
	defer os.Unsetenv("GITHUB_PR_EXCLUDELIST")

	taskData := &tasks.GithubTaskData{
		Options: &tasks.GithubOptions{
			ConnectionId: 1,
			Name:         "test/repo",
			GithubId:     123,
			ScopeConfig:  &models.GithubScopeConfig{},
		},
	}

	// import raw data table with bot and human PRs
	dataflowTester.ImportCsvIntoRawTable("./raw_tables/_raw_github_api_pull_requests_bot_filter.csv", "_raw_github_api_pull_requests")

	// verify pr extraction filters bot PRs
	dataflowTester.FlushTabler(&models.GithubPullRequest{})
	dataflowTester.FlushTabler(&models.GithubRepoAccount{})
	dataflowTester.Subtask(tasks.ExtractApiPullRequestsMeta, taskData)

	// Verify only human PR was extracted. A query failure is reported as such
	// instead of surfacing as "got 0".
	var prs []models.GithubPullRequest
	if err := dataflowTester.Dal.All(&prs, dal.Where("connection_id = ?", 1)); err != nil {
		t.Fatalf("failed to query extracted pull requests: %v", err)
	}

	if len(prs) != 1 {
		t.Errorf("Expected 1 PR (human), got %d", len(prs))
	}
	if len(prs) > 0 && prs[0].Number != 1000 {
		t.Errorf("Expected PR #1000 (human), got #%d", prs[0].Number)
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,params,data,url,input,created_at
1,"{""ConnectionId"":1,""Name"":""test/repo""}","{""id"":5001,""user"":{""login"":""renovate[bot]"",""id"":29139614},""body"":""LGTM"",""state"":""APPROVED"",""commit_id"":""abc123"",""submitted_at"":""2024-01-01T00:00:00Z""}",https://api.github.com/repos/test/repo/pulls/1/reviews,"{""GithubId"":1,""Number"":1}",2024-01-01 00:00:00
2,"{""ConnectionId"":1,""Name"":""test/repo""}","{""id"":5002,""user"":{""login"":""human-reviewer"",""id"":12345},""body"":""Looks good"",""state"":""APPROVED"",""commit_id"":""abc123"",""submitted_at"":""2024-01-02T00:00:00Z""}",https://api.github.com/repos/test/repo/pulls/1/reviews,"{""GithubId"":1,""Number"":1}",2024-01-02 00:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,params,data,url,input,created_at
1,"{""ConnectionId"":1,""Name"":""test/repo""}","{""id"":999,""number"":999,""state"":""closed"",""title"":""Dependency Update"",""user"":{""login"":""renovate[bot]"",""id"":29139614},""body"":""Updates dependencies"",""created_at"":""2024-01-01T00:00:00Z"",""updated_at"":""2024-01-02T00:00:00Z"",""closed_at"":""2024-01-02T00:00:00Z"",""merged_at"":""2024-01-02T00:00:00Z"",""merge_commit_sha"":""abc123"",""merged"":true,""additions"":10,""deletions"":5,""changed_files"":2,""comments"":0,""review_comments"":0,""commits"":1,""draft"":false,""labels"":[],""head"":{""ref"":""renovate/deps"",""sha"":""head123"",""repo"":{""id"":123,""name"":""repo""}},""base"":{""ref"":""main"",""sha"":""base123"",""repo"":{""id"":123,""name"":""repo""}},""html_url"":""https://github.com/test/repo/pull/999""}",https://api.github.com/repos/test/repo/pulls,null,2024-01-01 00:00:00
2,"{""ConnectionId"":1,""Name"":""test/repo""}","{""id"":1000,""number"":1000,""state"":""open"",""title"":""Feature PR"",""user"":{""login"":""human-dev"",""id"":12345},""body"":""Adds feature"",""created_at"":""2024-01-03T00:00:00Z"",""updated_at"":""2024-01-03T00:00:00Z"",""closed_at"":null,""merged_at"":null,""merge_commit_sha"":"""",""merged"":false,""additions"":100,""deletions"":20,""changed_files"":5,""comments"":2,""review_comments"":3,""commits"":5,""draft"":false,""labels"":[],""head"":{""ref"":""feature/new"",""sha"":""head456"",""repo"":{""id"":123,""name"":""repo""}},""base"":{""ref"":""main"",""sha"":""base123"",""repo"":{""id"":123,""name"":""repo""}},""html_url"":""https://github.com/test/repo/pull/1000""}",https://api.github.com/repos/test/repo/pulls,null,2024-01-03 00:00:00
29 changes: 22 additions & 7 deletions backend/plugins/github/tasks/comment_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,19 @@ func ExtractApiComments(taskCtx plugin.SubTaskContext) errors.Error {
Type: "NORMAL",
}
if apiComment.User != nil {
// Filter bot comments by username
if shouldSkipByUsername(apiComment.User.Login) {
taskCtx.GetLogger().Debug("Skipping PR comment #%d from bot user: %s", apiComment.GithubId, apiComment.User.Login)
return nil, nil
}
githubPrComment.AuthorUsername = apiComment.User.Login
githubPrComment.AuthorUserId = apiComment.User.Id

githubAccount, err := convertAccount(apiComment.User, data.Options.GithubId, data.Options.ConnectionId)
if err != nil {
return nil, err
}
results = append(results, githubAccount)
}
results = append(results, githubPrComment)
} else {
Expand All @@ -121,18 +132,22 @@ func ExtractApiComments(taskCtx plugin.SubTaskContext) errors.Error {
GithubUpdatedAt: apiComment.GithubUpdatedAt.ToTime(),
}
if apiComment.User != nil {
// Filter bot comments by username
if shouldSkipByUsername(apiComment.User.Login) {
taskCtx.GetLogger().Debug("Skipping issue comment #%d from bot user: %s", apiComment.GithubId, apiComment.User.Login)
return nil, nil
}
githubIssueComment.AuthorUsername = apiComment.User.Login
githubIssueComment.AuthorUserId = apiComment.User.Id

githubAccount, err := convertAccount(apiComment.User, data.Options.GithubId, data.Options.ConnectionId)
if err != nil {
return nil, err
}
results = append(results, githubAccount)
}
results = append(results, githubIssueComment)
}
if apiComment.User != nil {
githubAccount, err := convertAccount(apiComment.User, data.Options.GithubId, data.Options.ConnectionId)
if err != nil {
return nil, err
}
results = append(results, githubAccount)
}
return results, nil
},
})
Expand Down
5 changes: 5 additions & 0 deletions backend/plugins/github/tasks/pr_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ func ExtractApiPullRequests(taskCtx plugin.SubTaskContext) errors.Error {
if rawL.GithubId == 0 {
return nil, nil
}
// Filter bot PRs by username
if rawL.User != nil && shouldSkipByUsername(rawL.User.Login) {
taskCtx.GetLogger().Debug("Skipping PR #%d from bot user: %s", rawL.Number, rawL.User.Login)
return nil, nil
}
//If this is a pr, ignore
githubPr, err := convertGithubPullRequest(rawL, data.Options.ConnectionId, data.Options.GithubId)
if err != nil {
Expand Down
6 changes: 6 additions & 0 deletions backend/plugins/github/tasks/pr_review_comment_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ func ExtractApiPrReviewComments(taskCtx plugin.SubTaskContext) errors.Error {
}

if prReviewComment.User != nil {
// Filter bot comments by username
if shouldSkipByUsername(prReviewComment.User.Login) {
taskCtx.GetLogger().Debug("Skipping PR review comment #%d from bot user: %s", prReviewComment.GithubId, prReviewComment.User.Login)
return nil, nil
}

githubPrComment.AuthorUserId = prReviewComment.User.Id
githubPrComment.AuthorUsername = prReviewComment.User.Login

Expand Down
5 changes: 5 additions & 0 deletions backend/plugins/github/tasks/pr_review_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ func ExtractApiPullRequestReviews(taskCtx plugin.SubTaskContext) errors.Error {
if apiPullRequestReview.State == "PENDING" || apiPullRequestReview.User == nil {
return nil, nil
}
// Filter bot reviews by username
if shouldSkipByUsername(apiPullRequestReview.User.Login) {
taskCtx.GetLogger().Debug("Skipping review #%d from bot user: %s", apiPullRequestReview.GithubId, apiPullRequestReview.User.Login)
return nil, nil
}
pull := &SimplePr{}
err = errors.Convert(json.Unmarshal(row.Input, pull))
if err != nil {
Expand Down
87 changes: 87 additions & 0 deletions backend/plugins/github/tasks/username_filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tasks

import (
"os"
"strings"
"sync"
)

// usernameExcludeListEnv names the environment variable that holds the
// comma-separated list of usernames whose entities are skipped.
const usernameExcludeListEnv = "GITHUB_PR_EXCLUDELIST"

var (
	// excludedUsernames is the parsed exclusion list: every entry is trimmed
	// and lower-cased, and empty entries are dropped. It is nil until the
	// first load and after a reset.
	excludedUsernames []string
	// excludedUsernamesRaw is the raw environment value excludedUsernames was
	// parsed from. It is compared with the live value to detect changes.
	excludedUsernamesRaw string
	// excludedUsernamesLoaded is false until the first load and after a reset.
	excludedUsernamesLoaded bool
	// excludedUsernamesMu guards the three variables above.
	excludedUsernamesMu sync.RWMutex
)

// initExcludedUsernames makes sure the cached exclusion list reflects the
// current value of GITHUB_PR_EXCLUDELIST.
//
// The list is parsed on first use and parsed again only when the environment
// value differs from the one the cache was built from. A one-shot sync.Once is
// deliberately not used: it would pin whatever value happened to be set at the
// first lookup for the rest of the process, and resetting a Once that another
// goroutine may be executing is a data race.
func initExcludedUsernames() {
	raw := os.Getenv(usernameExcludeListEnv)

	excludedUsernamesMu.RLock()
	upToDate := excludedUsernamesLoaded && excludedUsernamesRaw == raw
	excludedUsernamesMu.RUnlock()

	if !upToDate {
		loadExcludedUsernames()
	}
}

// loadExcludedUsernames unconditionally re-reads GITHUB_PR_EXCLUDELIST and
// replaces the cached list (called by initExcludedUsernames or tests).
func loadExcludedUsernames() {
	raw := os.Getenv(usernameExcludeListEnv)
	// Parse outside the lock; only the swap needs exclusive access.
	parsed := parseExcludedUsernames(raw)

	excludedUsernamesMu.Lock()
	defer excludedUsernamesMu.Unlock()
	excludedUsernames = parsed
	excludedUsernamesRaw = raw
	excludedUsernamesLoaded = true
}

// parseExcludedUsernames splits a comma-separated list into trimmed,
// lower-cased usernames, dropping empty entries. The result is always a
// non-nil slice, including for an empty input.
func parseExcludedUsernames(raw string) []string {
	parts := strings.Split(raw, ",")
	usernames := make([]string, 0, len(parts))
	for _, part := range parts {
		if name := strings.TrimSpace(part); name != "" {
			usernames = append(usernames, strings.ToLower(name))
		}
	}
	return usernames
}

// resetExcludedUsernamesForTest clears the cache for testing purposes, so the
// next lookup parses the environment again.
func resetExcludedUsernamesForTest() {
	excludedUsernamesMu.Lock()
	defer excludedUsernamesMu.Unlock()
	excludedUsernames = nil
	excludedUsernamesRaw = ""
	excludedUsernamesLoaded = false
}

// shouldSkipByUsername checks if the given username should be filtered out.
// It returns true if the username equals, ignoring letter case, any entry in
// GITHUB_PR_EXCLUDELIST. An empty username is never skipped.
func shouldSkipByUsername(username string) bool {
	initExcludedUsernames()

	if username == "" {
		return false
	}

	// Cached entries are already lower-cased, so lower-case the candidate once
	// and compare with plain equality.
	lowerUsername := strings.ToLower(username)

	excludedUsernamesMu.RLock()
	defer excludedUsernamesMu.RUnlock()

	for _, excluded := range excludedUsernames {
		if lowerUsername == excluded {
			return true
		}
	}
	return false
}
Loading