Skip to content

Commit 52dd0ed

Browse files
Update finding by github username (step 4)
Signed-off-by: Lukasz Gryglicki <[email protected]> Generated with [OpenAI](https://platform.openai.com/) Assisted by [OpenAI](https://platform.openai.com/)
1 parent e022325 commit 52dd0ed

File tree

6 files changed

+55
-5
lines changed

6 files changed

+55
-5
lines changed

CO_AUTHORS.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,17 @@ If it does, the backend will process the co-authors as follows, assume trailer v
157157

158158
- Third, we look up the email using the GitHub API. If the user is found, we use that user as co-author.
159159

160-
- Finally we use the name part for `name <email>` and lookup using GitHub API assuming that this name is GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.
160+
- Finally, we use the name part of `name <email>`. If the name matches the GitHub username pattern (alphanumeric characters or hyphens, 3–39 characters long, not starting or ending with a hyphen, and with no consecutive hyphens), we look it up using the GitHub API, assuming that this name is a GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.
161161

162162
All of these lookups are cached internally, with cache keys `name` and `email` and a TTL of 24 hours. We cache by `(name, email)` even when nothing is found, because a failed lookup is the most time-consuming case. Such a user can still be found later, once the cached entry expires (up to 24 hours after the lookup).
163+
164+
165+
# How to fix missing commit author message
166+
167+
Make sure that co-authors use one of the following formats in their commit message:
168+
169+
- `Co-authored-by: Any name <ID+username@users.noreply.github.com>` - exact GitHub user will be found by unique `ID` part.
170+
- `Co-authored-by: Any name <username@users.noreply.github.com>` - exact GitHub user will be found by unique `username` part.
171+
- `Co-authored-by: Any name <public-email>` - GitHub user will be found by `public-email` part - that must be made public on GitHub.
172+
- `Co-authored-by: github-login <any-email>` - GitHub user will be found by the `github-login` part (must be at least 3 characters long).
173+

cla-backend-go/github/github_repository.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ var (
2727
ErrGitHubRepositoryNotFound = errors.New("github repository not found")
2828
NoreplyIDPattern = regexp.MustCompile(`^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
2929
NoreplyUserPattern = regexp.MustCompile(`^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
30+
GithubUsernameRegex = regexp.MustCompile(`^[A-Za-z0-9-]{3,39}$`)
3031
)
3132

3233
const (
@@ -359,6 +360,20 @@ func ExpandWithCoAuthors(
359360
}
360361
}
361362

363+
// IsValidGitHubUsername checks if the provided username is a valid GitHub username.
364+
func IsValidGitHubUsername(username string) bool {
365+
if !GithubUsernameRegex.MatchString(username) {
366+
return false
367+
}
368+
if strings.HasPrefix(username, "-") || strings.HasSuffix(username, "-") {
369+
return false
370+
}
371+
if strings.Contains(username, "--") {
372+
return false
373+
}
374+
return true
375+
}
376+
362377
func GetCoAuthorCommits(
363378
ctx context.Context,
364379
client *github.Client,
@@ -449,7 +464,7 @@ func GetCoAuthorCommits(
449464
}
450465

451466
// 4. Last resort - try to find by name=login
452-
if user == nil {
467+
if user == nil && IsValidGitHubUsername(name) {
453468
// Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
454469
user, err = GetGithubUserByLogin(ctx, client, name)
455470
if err != nil {

cla-backend/cla/models/github_models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
EXCLUDE_GITHUB_EMAILS = ["noreply.github.com"]
3737
NOREPLY_ID_PATTERN = re.compile(r"^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
3838
NOREPLY_USER_PATTERN = re.compile(r"^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
39+
GITHUB_USERNAME_REGEX = re.compile(r'^(?!-)(?!.*--)[A-Za-z0-9-]{3,39}(?<!-)$')
3940

4041
class TTLCache:
4142
def __init__(self, ttl_seconds=86400):
@@ -1882,6 +1883,8 @@ def get_pull_request_commit_authors(pull_request, installation_id, with_co_autho
18821883

18831884
return commit_authors
18841885

1886+
def is_valid_github_username(username: str) -> bool:
    """Return True if ``username`` has the shape of a GitHub login.

    The accepted shape is whatever ``GITHUB_USERNAME_REGEX`` describes.

    :param username: candidate login, e.g. the name part of a
        ``Co-authored-by: name <email>`` trailer.
    :return: True when the whole string matches the pattern, False otherwise
        (including when ``username`` is not a string).
    """
    if not isinstance(username, str):
        # A missing or non-string name can never be a login; report that
        # instead of raising TypeError from the regex engine.
        return False
    # fullmatch() rather than match(): with match(), a trailing "$" also
    # matches just before a final newline, so "abc\n" would be accepted.
    return GITHUB_USERNAME_REGEX.fullmatch(username) is not None
18851888

18861889
def get_co_author_commits(co_author, commit, pr, installation_id):
18871890
fn = "cla.models.github_models.get_co_author_commits"
@@ -1965,7 +1968,7 @@ def get_co_author_commits(co_author, commit, pr, installation_id):
19651968
user = None
19661969

19671970
# 4. Last resort: try to find by name (login)
1968-
if user is None:
1971+
if user is None and is_valid_github_username(name):
19691972
try:
19701973
# Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
19711974
cla.log.debug(f"{fn} - Lookup via login=name: {name}")

utils/count_apis.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
# Summarize API request paths found in a JSON log export.
#
# Usage: count_apis.sh <path-to-api-logs>
#
# Reads the `message` field of every element of the top-level JSON array in the
# given file, extracts the text that follows `LG:api-request-path:` (up to the
# first double quote or whitespace), replaces 36-character hex/hyphen runs with
# `<uuid>` and all-digit path segments with `<id>`, then prints each distinct
# normalized path with its occurrence count, most frequent first.
2+
if [ -z "${1}" ]
3+
then
4+
echo "Usage: $0 <path-to-api-logs>"
5+
echo "Example: $0 api-logs-prod.json"
6+
exit 1
7+
fi
8+
9+
# The second sed below loops (`:a` ... `ta`) until no substitution happens:
# each match consumes the trailing slash, so a single global pass would miss
# the second of two adjacent numeric segments such as /1/2/.
jq -r '
10+
.[].message
11+
| capture("LG:api-request-path:(?<p>[^\"[:space:]]+)")? # find the path
12+
| select(.) # drop non-matches
13+
| .p
14+
' "${1}" \
15+
| sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' \
16+
| sed -E ':a;s#/([0-9]{1,})(/|$)#/<id>\2#g;ta' \
17+
| sort | uniq -c | sort -nr

utils/search_aws_log_group.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
22
# STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' ./utils/search_aws_log_group.sh 'cla-backend-dev-githubactivity' 'error'
3+
# REGION=us-east-2 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-go-api-v4-lambda' 'LG:api-request-path'
4+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-api-v3-lambda' 'LG:api-request-path'
5+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-apiv2' 'LG:api-request-path'
6+
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-githubactivity' 'LG:api-request-path'
37

48
if [ -z "$STAGE" ]
59
then

utils/search_aws_logs.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
# SPDX-License-Identifier: MIT
55

66
# REGION=us-east-1|us-east-2 STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' OUT=logs.json ./utils/search_aws_logs.sh 'error'
7-
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-dev.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E 's/\b[0-9]{2,}\b/<id>/g' | sort | uniq -c | sort -nr
8-
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-prod.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E ':a;s#/([0-9]{1,})(/|$)#/<id>\2#g;ta' | sort | uniq -c | sort -nr
7+
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-dev.json
8+
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-prod.json
99
# To find distinct log groups: | jq -r 'map(.logGroupName) | unique | .[]'
1010
# in us-east-1 (mostly V1, V2 and V3):
1111
# To see specific log group: | jq 'map(select(.logGroupName == "/aws/lambda/cla-backend-dev-apiv1"))'

0 commit comments

Comments
 (0)