Skip to content

Commit 09bd1c9

Browse files
Merge pull request #4758 from linuxfoundation/dev
Update finding by github username prod (step 4)
2 parents ff4d99c + b2e1c66 commit 09bd1c9

File tree

8 files changed

+81
-6
lines changed

8 files changed

+81
-6
lines changed

CO_AUTHORS.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,17 @@ If it does, the backend will process the co-authors as follows, assume trailer v
157157

158158
- Third, we look up the email using the GitHub API. If the user is found, we use that user as co-author.
159159

160-
- Finally we use the name part for `name <email>` and lookup using GitHub API assuming that this name is GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.
160+
- Finally, we use the name part of `name <email>`. If the name matches the GitHub username pattern (alphanumeric characters or hyphens, 3–39 characters long, cannot start or end with a hyphen, and cannot contain consecutive hyphens), we look it up using the GitHub API, assuming that this name is a GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.
161161

162162
We use internal caching while doing all those lookups with cache key `name` and `email` and TTL 24 hours. We even cache by `(name, email)` when nothing is found because this is the most time consuming option. It will have a chance to be found in the future (up to 24 hours from lookup).
163+
164+
165+
# How to fix missing commit author message
166+
167+
Make sure that co-authors use one of the following formats in their commit message:
168+
169+
- `Co-authored-by: Any name <ID+username@users.noreply.github.com>` - exact GitHub user will be found by unique `ID` part.
170+
- `Co-authored-by: Any name <username@users.noreply.github.com>` - exact GitHub user will be found by unique `username` part.
171+
- `Co-authored-by: Any name <public-email>` - GitHub user will be found by `public-email` part - that must be made public on GitHub.
172+
- `Co-authored-by: github-login <any-email>` - GitHub user will be found by `github-login` part (must be at least 3 characters long).
173+

cla-backend-go/github/github_repository.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ var (
2727
ErrGitHubRepositoryNotFound = errors.New("github repository not found")
2828
NoreplyIDPattern = regexp.MustCompile(`^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
2929
NoreplyUserPattern = regexp.MustCompile(`^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
30+
GithubUsernameRegex = regexp.MustCompile(`^[A-Za-z0-9-]{3,39}$`)
3031
)
3132

3233
const (
@@ -359,6 +360,20 @@ func ExpandWithCoAuthors(
359360
}
360361
}
361362

363+
// IsValidGitHubUsername checks if the provided username is a valid GitHub username.
364+
func IsValidGitHubUsername(username string) bool {
365+
if !GithubUsernameRegex.MatchString(username) {
366+
return false
367+
}
368+
if strings.HasPrefix(username, "-") || strings.HasSuffix(username, "-") {
369+
return false
370+
}
371+
if strings.Contains(username, "--") {
372+
return false
373+
}
374+
return true
375+
}
376+
362377
func GetCoAuthorCommits(
363378
ctx context.Context,
364379
client *github.Client,
@@ -449,7 +464,7 @@ func GetCoAuthorCommits(
449464
}
450465

451466
// 4. Last resort - try to find by name=login
452-
if user == nil {
467+
if user == nil && IsValidGitHubUsername(name) {
453468
// Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
454469
user, err = GetGithubUserByLogin(ctx, client, name)
455470
if err != nil {

cla-backend/cla/models/github_models.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
EXCLUDE_GITHUB_EMAILS = ["noreply.github.com"]
3737
NOREPLY_ID_PATTERN = re.compile(r"^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
3838
NOREPLY_USER_PATTERN = re.compile(r"^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
39+
# GitHub usernames must be 3-39 characters long, can only contain alphanumeric characters or hyphens,
# cannot begin or end with a hyphen, and cannot contain consecutive hyphens.
# The pattern is anchored with \Z rather than $: in Python, $ also matches just before a
# trailing newline, which would let a value such as "login\n" pass the check.
GITHUB_USERNAME_REGEX = re.compile(r'^(?!-)(?!.*--)[A-Za-z0-9-]{3,39}(?<!-)\Z')
3942

4043
class TTLCache:
4144
def __init__(self, ttl_seconds=86400):
@@ -1882,6 +1885,8 @@ def get_pull_request_commit_authors(pull_request, installation_id, with_co_autho
18821885

18831886
return commit_authors
18841887

1888+
def is_valid_github_username(username: str) -> bool:
    """Return True when ``username`` matches GITHUB_USERNAME_REGEX, False otherwise."""
    return GITHUB_USERNAME_REGEX.match(username) is not None
18851890

18861891
def get_co_author_commits(co_author, commit, pr, installation_id):
18871892
fn = "cla.models.github_models.get_co_author_commits"
@@ -1965,7 +1970,7 @@ def get_co_author_commits(co_author, commit, pr, installation_id):
19651970
user = None
19661971

19671972
# 4. Last resort: try to find by name (login)
1968-
if user is None:
1973+
if user is None and is_valid_github_username(name):
19691974
try:
19701975
# Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
19711976
cla.log.debug(f"{fn} - Lookup via login=name: {name}")

tests/functional/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ CYPRESS_ENV=dev
8888
You can ask for example `.env` file over slack.
8989

9090
- Run `npx cypress install`
91-
- Run tests using cmd `npx cypress run`.
91+
- Run tests using cmd `npx cypress run`. Or `xvfb-run -a npx cypress run` when running over SSH.
9292
- Run tests using UI `npx cypress open`. Choose **E2E testing**, select **Chrome** browser.
9393
- View test reports in the `cypress-report` directory.
9494
- Explore source code files for detailed implementation.

utils/calculate_api_stats.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# Copyright The Linux Foundation and each contributor to CommunityBridge.
# SPDX-License-Identifier: MIT

# Collects 'LG:api-request-path' log entries from us-east-1 and us-east-2,
# starting at the time given as the 1st argument and ending '1 second ago',
# merges both result sets into api-logs-${STAGE}.json, then writes the
# per-API counts produced by ./utils/count_apis.sh to api-logs-${STAGE}.log
# and prints them.
#
# The helper scripts are referenced as ./utils/..., so run this from the
# repository root. STAGE defaults to prod when it is not set.
#
# Example: ./utils/calculate_api_stats.sh '2 hours ago'

# Print an error to stderr and abort.
fail() {
  echo "$0: $1" >&2
  exit 1
}

if [ -z "$STAGE" ]
then
  export STAGE=prod
fi
if [ -z "$1" ]
then
  fail "please provide time range from value as a 1st argument, for example '2 hours ago'"
fi
export DTFROM="${1}"

part1="api-logs-${STAGE}-1.json"
part2="api-logs-${STAGE}-2.json"
merged="api-logs-${STAGE}.json"
report="api-logs-${STAGE}.log"

# Stop at the first failing step so that stale or partial files left over from
# an earlier run are never merged or counted.
REGION=us-east-1 NO_ECHO=1 DTTO='1 second ago' OUT="${part1}" ./utils/search_aws_logs.sh 'LG:api-request-path' \
  || fail "log search failed for us-east-1"
REGION=us-east-2 NO_ECHO=1 DTTO='1 second ago' OUT="${part2}" ./utils/search_aws_logs.sh 'LG:api-request-path' \
  || fail "log search failed for us-east-2"

# 'add' concatenates the two per-region JSON arrays into one.
jq -s 'add' "${part1}" "${part2}" > "${merged}" \
  || fail "merging ${part1} and ${part2} failed"
rm -f "${part1}" "${part2}"

./utils/count_apis.sh "${merged}" > "${report}" \
  || fail "counting APIs in ${merged} failed"
cat "${report}"

utils/count_apis.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
# Copyright The Linux Foundation and each contributor to CommunityBridge.
# SPDX-License-Identifier: MIT

# Prints how often each API request path occurs in a JSON log export, most
# frequent first. The input is read as an array of objects with a "message"
# field; the path is the text following "LG:api-request-path:" in a message.
#
# Variable path parts are collapsed to placeholders before counting:
#   - any 36-character run of hex digits and hyphens  -> <uuid>
#   - purely numeric path segments                    -> <id>
#     (the substitution loops until nothing changes, because neighbouring
#      segments share the "/" that delimits them)
#   - segments starting with 00 or a0 followed by 13-16 alphanumerics -> <sfid>
#     (presumably Salesforce-style IDs - confirm)

logs_file="${1}"
if [ -z "${logs_file}" ]; then
  echo "Usage: $0 <path-to-api-logs>"
  echo "Example: $0 api-logs-prod.json"
  exit 1
fi

jq -r '
  .[].message
  | capture("LG:api-request-path:(?<p>[^\"[:space:]]+)")? # find the path
  | select(.) # drop non-matches
  | .p
' "${logs_file}" |
  sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' |
  sed -E ':a;s#/([0-9]{1,})(/|$)#/<id>\2#g;ta' |
  sed -E 's#/(00|a0)[A-Za-z0-9]{13,16}(/|$)#/<sfid>\2#g' |
  sort | uniq -c | sort -nr

utils/search_aws_log_group.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
22
# STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' ./utils/search_aws_log_group.sh 'cla-backend-dev-githubactivity' 'error'
3+
# REGION=us-east-2 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-go-api-v4-lambda' 'LG:api-request-path'
4+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-api-v3-lambda' 'LG:api-request-path'
5+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-apiv2' 'LG:api-request-path'
6+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-githubactivity' 'LG:api-request-path'
37

48
if [ -z "$STAGE" ]
59
then

utils/search_aws_logs.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
# SPDX-License-Identifier: MIT
55

66
# REGION=us-east-1|us-east-2 STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' OUT=logs.json ./utils/search_aws_logs.sh 'error'
7-
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-dev.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E 's/\b[0-9]{2,}\b/<id>/g' | sort | uniq -c | sort -nr
8-
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-prod.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E ':a;s#/([0-9]{1,})(/|$)#/<id>\2#g;ta' | sort | uniq -c | sort -nr
7+
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-dev.json
8+
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-prod.json
99
# To find distinct log groups: | jq -r 'map(.logGroupName) | unique | .[]'
1010
# in us-east-1 (mostly V1, V2 and V3):
1111
# To see specific log group: | jq 'map(select(.logGroupName == "/aws/lambda/cla-backend-dev-apiv1"))'

0 commit comments

Comments
 (0)