Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion CO_AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,17 @@ If it does, the backend will process the co-authors as follows, assume trailer v

- Third, we look up the email using the GitHub API. If the user is found, we use that user as co-author.

- Finally we use the name part for `name <email>` and lookup using GitHub API assuming that this name is GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.
- Finally we use the name part for `name <email>`. If the name matches the GitHub username pattern (alphanumeric characters or hyphens, must be 3–39 characters long, cannot start or end with a hyphen, and cannot contain consecutive hyphens), then we lookup using the GitHub API assuming that this name is a GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author.

We use internal caching while doing all those lookups with cache key `name` and `email` and TTL 24 hours. We even cache by `(name, email)` when nothing is found because this is the most time consuming option. It will have a chance to be found in the future (up to 24 hours from lookup).


# How to fix missing commit author message

Make sure that co-authors use one of the following formats in their commit message:

- `Co-authored-by: Any name <ID+username@users.noreply.github.com>` - exact GitHub user will be found by unique `ID` part.
- `Co-authored-by: Any name <username@users.noreply.github.com>` - exact GitHub user will be found by unique `username` part.
- `Co-authored-by: Any name <public-email>` - GitHub user will be found by `public-email` part - that must be made public on GitHub.
- `Co-authored-by: github-login <any-email>` - GitHub user will be found by the `github-login` part, which must be a valid GitHub username (3–39 alphanumeric characters or hyphens, no leading, trailing, or consecutive hyphens).

17 changes: 16 additions & 1 deletion cla-backend-go/github/github_repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ var (
ErrGitHubRepositoryNotFound = errors.New("github repository not found")
NoreplyIDPattern = regexp.MustCompile(`^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
NoreplyUserPattern = regexp.MustCompile(`^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`)
GithubUsernameRegex = regexp.MustCompile(`^[A-Za-z0-9-]{3,39}$`)
)

const (
Expand Down Expand Up @@ -359,6 +360,20 @@ func ExpandWithCoAuthors(
}
}

// IsValidGitHubUsername reports whether username has the shape of a GitHub login.
//
// A name is accepted only when it matches GithubUsernameRegex (3 to 39 ASCII
// letters, digits or hyphens), neither starts nor ends with a hyphen, and does
// not contain two hyphens in a row. The hyphen rules are checked separately
// because Go's regexp syntax has no look-around assertions.
func IsValidGitHubUsername(username string) bool {
	switch {
	case !GithubUsernameRegex.MatchString(username):
		// Wrong length or a character outside [A-Za-z0-9-].
		return false
	case strings.HasPrefix(username, "-"), strings.HasSuffix(username, "-"):
		// A hyphen is not allowed at either edge.
		return false
	case strings.Contains(username, "--"):
		// Consecutive hyphens are not allowed.
		return false
	default:
		return true
	}
}

func GetCoAuthorCommits(
ctx context.Context,
client *github.Client,
Expand Down Expand Up @@ -449,7 +464,7 @@ func GetCoAuthorCommits(
}

// 4. Last resort - try to find by name=login
if user == nil {
if user == nil && IsValidGitHubUsername(name) {
// Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
user, err = GetGithubUserByLogin(ctx, client, name)
if err != nil {
Expand Down
7 changes: 6 additions & 1 deletion cla-backend/cla/models/github_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
EXCLUDE_GITHUB_EMAILS = ["noreply.github.com"]
NOREPLY_ID_PATTERN = re.compile(r"^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
NOREPLY_USER_PATTERN = re.compile(r"^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$")
# GitHub usernames must be 3-39 characters long, can only contain alphanumeric characters or hyphens,
# cannot begin or end with a hyphen, and cannot contain consecutive hyphens.
GITHUB_USERNAME_REGEX = re.compile(r'^(?!-)(?!.*--)[A-Za-z0-9-]{3,39}(?<!-)$')

class TTLCache:
def __init__(self, ttl_seconds=86400):
Expand Down Expand Up @@ -1882,6 +1885,8 @@ def get_pull_request_commit_authors(pull_request, installation_id, with_co_autho

return commit_authors

def is_valid_github_username(username: str) -> bool:
    """Return True when ``username`` has the shape of a GitHub login.

    The check is delegated to ``GITHUB_USERNAME_REGEX`` (3-39 alphanumeric
    characters or hyphens, no leading, trailing or consecutive hyphens).

    :param username: candidate login, e.g. the name part of a
        ``Co-authored-by: name <email>`` trailer.
    :return: True if the whole string satisfies the pattern, False otherwise
        (including when ``username`` is not a string, e.g. ``None``).
    """
    if not isinstance(username, str):
        return False
    # fullmatch() rather than match(): in Python a trailing ``$`` also matches
    # just before a final newline, so match() would accept "octocat\n".
    return GITHUB_USERNAME_REGEX.fullmatch(username) is not None

def get_co_author_commits(co_author, commit, pr, installation_id):
fn = "cla.models.github_models.get_co_author_commits"
Expand Down Expand Up @@ -1965,7 +1970,7 @@ def get_co_author_commits(co_author, commit, pr, installation_id):
user = None

# 4. Last resort: try to find by name (login)
if user is None:
if user is None and is_valid_github_username(name):
try:
# Note that Co-authored-by: name <email> is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile
cla.log.debug(f"{fn} - Lookup via login=name: {name}")
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ CYPRESS_ENV=dev
You can ask for example `.env` file over slack.

- Run `npx cypress install`
- Run tests using cmd `npx cypress run`.
- Run tests using cmd `npx cypress run`, or `xvfb-run -a npx cypress run` when running over SSH.
- Run tests using UI `npx cypress open`. Choose **E2E testing**, select **Chrome** browser.
- View test reports in the `cypress-report` directory.
- Explore source code files for detailed implementation.
Expand Down
19 changes: 19 additions & 0 deletions utils/calculate_api_stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Copyright The Linux Foundation and each contributor to CommunityBridge.
# SPDX-License-Identifier: MIT
#
# Searches the logs of us-east-1 and us-east-2 for 'LG:api-request-path'
# entries, merges both result files into api-logs-${STAGE}.json and prints the
# per-path counts produced by ./utils/count_apis.sh (also saved to
# api-logs-${STAGE}.log).
#
# Usage:   ./utils/calculate_api_stats.sh '<time range start>'
# Example: ./utils/calculate_api_stats.sh '2 hours ago'
# Paths are relative, so run it from the repository root.

# STAGE falls back to prod when unset or empty; exported so child processes inherit it.
export STAGE="${STAGE:-prod}"

if [[ -z "${1}" ]]; then
  echo "$0: please provide time range from value as a 1st argument, for example '2 hours ago'"
  exit 1
fi
export DTFROM="${1}"

part_one="api-logs-${STAGE}-1.json"
part_two="api-logs-${STAGE}-2.json"
merged="api-logs-${STAGE}.json"
report="api-logs-${STAGE}.log"

# search_region <aws-region> <output-file>: one log search ending '1 second ago'.
search_region() {
  REGION="${1}" NO_ECHO=1 DTTO='1 second ago' OUT="${2}" ./utils/search_aws_logs.sh 'LG:api-request-path'
}

search_region us-east-1 "${part_one}"
search_region us-east-2 "${part_two}"

# Concatenate both JSON arrays; the part files are removed only if the merge succeeded.
jq -s 'add' "${part_one}" "${part_two}" > "${merged}" && rm -f "${part_one}" "${part_two}"

# Store the statistics, then print them if counting succeeded.
./utils/count_apis.sh "${merged}" > "${report}" && cat "${report}"
21 changes: 21 additions & 0 deletions utils/count_apis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright The Linux Foundation and each contributor to CommunityBridge.
# SPDX-License-Identifier: MIT
#
# Prints how often each API path occurs in a JSON log file, most frequent first.
# The path is whatever follows 'LG:api-request-path:' in each entry's message.
# Variable path segments are collapsed to placeholders before counting so that
# calls to the same endpoint are grouped together.

logs_file="${1}"

if [[ -z "${logs_file}" ]]; then
  printf '%s\n' "Usage: $0 <path-to-api-logs>" "Example: $0 api-logs-prod.json"
  exit 1
fi

# jq program: emit the captured path for every message that contains one.
extract_paths='
.[].message
| capture("LG:api-request-path:(?<p>[^\"[:space:]]+)")? # find the path
| select(.) # drop non-matches
| .p
'

# sed script, applied to every path in this order:
#   1. any 36-character run of hex digits and hyphens becomes <uuid>
#   2. purely numeric segments become <id>; the substitution is repeated
#      (:a ... ta) because a match consumes the trailing '/', so the segment
#      right after a replaced one is only seen on the next pass
#   3. segments starting with 00 or a0 followed by 13-16 alphanumerics become
#      <sfid> (presumably Salesforce IDs - confirm)
jq -r "${extract_paths}" "${logs_file}" \
  | sed -E '
s/[0-9a-fA-F-]{36}/<uuid>/g
:a
s#/([0-9]{1,})(/|$)#/<id>\2#g
ta
s#/(00|a0)[A-Za-z0-9]{13,16}(/|$)#/<sfid>\2#g
' \
  | sort | uniq -c | sort -nr
4 changes: 4 additions & 0 deletions utils/search_aws_log_group.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/bin/bash
# STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' ./utils/search_aws_log_group.sh 'cla-backend-dev-githubactivity' 'error'
# REGION=us-east-2 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-go-api-v4-lambda' 'LG:api-request-path'
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-api-v3-lambda' 'LG:api-request-path'
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-apiv2' 'LG:api-request-path'
# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-githubactivity' 'LG:api-request-path'

if [ -z "$STAGE" ]
then
Expand Down
4 changes: 2 additions & 2 deletions utils/search_aws_logs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# SPDX-License-Identifier: MIT

# REGION=us-east-1|us-east-2 STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' OUT=logs.json ./utils/search_aws_logs.sh 'error'
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-dev.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E 's/\b[0-9]{2,}\b/<id>/g' | sort | uniq -c | sort -nr
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-prod.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}/<uuid>/g' | sed -E ':a;s#/([0-9]{1,})(/|$)#/<id>\2#g;ta' | sort | uniq -c | sort -nr
# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-dev.json
# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-prod.json
# To find distinct log groups: | jq -r 'map(.logGroupName) | unique | .[]'
# in us-east-1 (mostly V1, V2 and V3):
# To see specific log group: | jq 'map(select(.logGroupName == "/aws/lambda/cla-backend-dev-apiv1"))'
Expand Down
Loading