Skip to content

Commit 52a3fcf

Browse files
authored
Fix databricks labs list to include all projects (#3935)
## Changes This PR fixes the `databricks labs list` command so that all projects from Databricks Labs are listed: prior to this PR only the first 30 projects are listed. The root cause of the problem is that the GitHub API being used is paginated, and defaults to only 30 results per page, and pagination was not implemented. This PR resolves the issue by: - updating the implementation to use the pagination mechanism; and - increasing the number of results per page from 30 (default) to 100 (max allowed). Incidental changes include: - Some trace-level logging of the requests that are being made to the GitHub API, consistent with elsewhere in the codebase. - Some spelling fixes. Out of scope for now: - Filtering the list of projects to only cover those that can be installed. ## Why The `databricks labs list` command was returning an incomplete list; projects that could be installed (eg. `ucx`) were not included in the list. ## Tests - Additional test case to cover pagination of the GitHub API. - Manual testing.
1 parent e68b551 commit 52a3fcf

File tree

5 files changed

+214
-12
lines changed

5 files changed

+214
-12
lines changed

cmd/labs/github/github.go

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,20 @@ func WithUserContentOverride(ctx context.Context, override string) context.Conte
3333

3434
var ErrNotFound = errors.New("not found")
3535

36+
type pagedResponse struct {
37+
Body []byte
38+
NextLink string
39+
}
40+
3641
func getBytes(ctx context.Context, method, url string, body io.Reader) ([]byte, error) {
42+
resp, err := getPagedBytes(ctx, method, url, body)
43+
if err != nil {
44+
return nil, err
45+
}
46+
return resp.Body, nil
47+
}
48+
49+
func getPagedBytes(ctx context.Context, method, url string, body io.Reader) (*pagedResponse, error) {
3750
ao, ok := ctx.Value(&apiOverride).(string)
3851
if ok {
3952
url = strings.Replace(url, gitHubAPI, ao, 1)
@@ -57,14 +70,47 @@ func getBytes(ctx context.Context, method, url string, body io.Reader) ([]byte,
5770
if res.StatusCode >= 400 {
5871
return nil, fmt.Errorf("github request failed: %s", res.Status)
5972
}
73+
nextLink := parseNextLink(res.Header.Get("link"))
6074
defer res.Body.Close()
61-
return io.ReadAll(res.Body)
75+
bodyBytes, err := io.ReadAll(res.Body)
76+
if err != nil {
77+
return nil, err
78+
}
79+
return &pagedResponse{
80+
Body: bodyBytes,
81+
NextLink: nextLink,
82+
}, nil
83+
}
84+
85+
func parseNextLink(linkHeader string) string {
86+
if linkHeader == "" {
87+
return ""
88+
}
89+
// Pagination and link headers are documented here:
90+
// https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28#using-link-headers
91+
// An example link header to handle:
92+
// link: <https://api.github.com/repositories/1300192/issues?page=2>; rel="prev", <https://api.github.com/repositories/1300192/issues?page=4>; rel="next", <https://api.github.com/repositories/1300192/issues?page=515>; rel="last", <https://api.github.com/repositories/1300192/issues?page=1>; rel="first"
93+
links := strings.Split(linkHeader, ",")
94+
for _, link := range links {
95+
parts := strings.Split(strings.TrimSpace(link), ";")
96+
if len(parts) != 2 {
97+
continue
98+
}
99+
if strings.Contains(parts[1], `rel="next"`) {
100+
urlField := strings.TrimSpace(parts[0])
101+
if strings.HasPrefix(urlField, "<") && strings.HasSuffix(urlField, ">") {
102+
url := urlField[1 : len(urlField)-1]
103+
return url
104+
}
105+
}
106+
}
107+
return ""
62108
}
63109

64-
func httpGetAndUnmarshal(ctx context.Context, url string, response any) error {
65-
raw, err := getBytes(ctx, "GET", url, nil)
110+
func httpGetAndUnmarshal(ctx context.Context, url string, response any) (string, error) {
111+
raw, err := getPagedBytes(ctx, "GET", url, nil)
66112
if err != nil {
67-
return err
113+
return "", err
68114
}
69-
return json.Unmarshal(raw, response)
115+
return raw.NextLink, json.Unmarshal(raw.Body, response)
70116
}

cmd/labs/github/github_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package github
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestParseNextLink(t *testing.T) {
10+
tests := []struct {
11+
name string
12+
linkHeader string
13+
expected string
14+
}{
15+
// First and foremost, the well-formed cases that we can expect from real GitHub API.
16+
{
17+
name: "no header",
18+
linkHeader: "",
19+
expected: "",
20+
},
21+
{
22+
name: "documentation example",
23+
linkHeader: `<https://api.github.com/repositories/1300192/issues?page=2>; rel="prev", <https://api.github.com/repositories/1300192/issues?page=4>; rel="next", <https://api.github.com/repositories/1300192/issues?page=515>; rel="last", <https://api.github.com/repositories/1300192/issues?page=1>; rel="first"`,
24+
expected: "https://api.github.com/repositories/1300192/issues?page=4",
25+
},
26+
{
27+
name: "with next only",
28+
linkHeader: `<https://api.github.com/repos/databricks/cli/issues?page=2>; rel="next"`,
29+
expected: "https://api.github.com/repos/databricks/cli/issues?page=2",
30+
},
31+
{
32+
name: "without next",
33+
linkHeader: `<https://api.github.com/repositories/1300192/issues?page=1>; rel="prev", <https://api.github.com/repositories/1300192/issues?page=1>; rel="first", <https://api.github.com/repositories/1300192/issues?page=515>; rel="last"`,
34+
expected: "",
35+
},
36+
{
37+
name: "next at beginning",
38+
linkHeader: `<https://api.github.com/repos/test/test?page=5>; rel="next", <https://api.github.com/repos/test/test?page=10>; rel="last"`,
39+
expected: "https://api.github.com/repos/test/test?page=5",
40+
},
41+
{
42+
name: "next at end",
43+
linkHeader: `<https://api.github.com/repos/test/test?page=10>; rel="last", <https://api.github.com/repos/test/test?page=5>; rel="next"`,
44+
expected: "https://api.github.com/repos/test/test?page=5",
45+
},
46+
// Malformed cases to ensure robustness. (These should not occur in practice, but are here to demonstrate resilience.)
47+
{
48+
name: "malformed no semicolon",
49+
linkHeader: `<https://api.github.com/repos/test/test?page=2> rel="next"`,
50+
expected: "",
51+
},
52+
{
53+
name: "malformed no angle-brackets",
54+
linkHeader: `https://api.github.com/repos/test/test?page=2; rel="next"`,
55+
expected: "",
56+
},
57+
{
58+
name: "malformed multiple parts",
59+
linkHeader: `<https://api.github.com/repos/test/test?page=2>; rel="next"; extra="value"`,
60+
expected: "",
61+
},
62+
{
63+
name: "malformed no url",
64+
linkHeader: `<>; rel="next"`,
65+
expected: "",
66+
},
67+
{
68+
name: "malformed empty link",
69+
linkHeader: `, <https://api.github.com/repos/test/test?page=5>; rel="next"`,
70+
expected: "https://api.github.com/repos/test/test?page=5",
71+
},
72+
// Borderline case: some tolerance of whitespace.
73+
{
74+
name: "tolerate whitespace",
75+
linkHeader: ` <https://api.github.com/repos/test/test?page=2> ; rel="next" `,
76+
expected: "https://api.github.com/repos/test/test?page=2",
77+
},
78+
}
79+
80+
for _, tt := range tests {
81+
t.Run(tt.name, func(t *testing.T) {
82+
result := parseNextLink(tt.linkHeader)
83+
assert.Equal(t, tt.expected, result)
84+
})
85+
}
86+
}

cmd/labs/github/releases.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ func getVersions(ctx context.Context, org, repo string) (Versions, error) {
4848
var releases Versions
4949
log.Debugf(ctx, "Fetching latest releases for %s/%s from GitHub API", org, repo)
5050
url := fmt.Sprintf("%s/repos/%s/%s/releases", gitHubAPI, org, repo)
51-
err := httpGetAndUnmarshal(ctx, url, &releases)
51+
_, err := httpGetAndUnmarshal(ctx, url, &releases)
5252
return releases, err
5353
}
5454

cmd/labs/github/repositories.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,30 @@ func (r *repositoryCache) Load(ctx context.Context) (Repositories, error) {
3030
})
3131
}
3232

33-
// getRepositories is considered to be privata API, as we want the usage to go through a cache
33+
// getRepositories is considered to be private API, as we want the usage to go through a cache
3434
func getRepositories(ctx context.Context, org string) (Repositories, error) {
35-
var repos Repositories
35+
var allRepos Repositories
3636
log.Debugf(ctx, "Loading repositories for %s from GitHub API", org)
37-
url := fmt.Sprintf("%s/users/%s/repos", gitHubAPI, org)
38-
err := httpGetAndUnmarshal(ctx, url, &repos)
39-
return repos, err
37+
url := fmt.Sprintf("%s/users/%s/repos?per_page=100", gitHubAPI, org)
38+
39+
for url != "" {
40+
var repos Repositories
41+
nextUrl, err := httpGetAndUnmarshal(ctx, url, &repos)
42+
if err != nil {
43+
return nil, err
44+
}
45+
allRepos = append(allRepos, repos...)
46+
url = nextUrl
47+
}
48+
return allRepos, nil
4049
}
4150

4251
type Repositories []ghRepo
4352

4453
type ghRepo struct {
4554
Name string `json:"name"`
4655
Description string `json:"description"`
47-
Langauge string `json:"language"`
56+
Language string `json:"language"`
4857
DefaultBranch string `json:"default_branch"`
4958
Stars int `json:"stargazers_count"`
5059
IsFork bool `json:"fork"`

cmd/labs/github/repositories_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ package github
22

33
import (
44
"context"
5+
"fmt"
6+
"maps"
57
"net/http"
68
"net/http/httptest"
9+
"strconv"
710
"testing"
811

912
"github.com/stretchr/testify/assert"
@@ -29,3 +32,61 @@ func TestRepositories(t *testing.T) {
2932
assert.NoError(t, err)
3033
assert.NotEmpty(t, all)
3134
}
35+
36+
func TestRepositoriesPagination(t *testing.T) {
37+
var requestedURLs []string
38+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
39+
requestedURLs = append(requestedURLs, r.URL.String())
40+
switch r.URL.Path {
41+
case "/users/databrickslabs/repos":
42+
requestedQueryValues := r.URL.Query()
43+
linkTo := func(page int, rel string) string {
44+
nextQueryValues := maps.Clone(requestedQueryValues)
45+
nextQueryValues.Set("page", strconv.Itoa(page))
46+
builder := *r.URL
47+
builder.Scheme = "http"
48+
builder.Host = r.Host
49+
builder.RawQuery = nextQueryValues.Encode()
50+
return fmt.Sprintf(`<%s>; rel="%s"`, builder.String(), rel)
51+
}
52+
page := requestedQueryValues.Get("page")
53+
var link string
54+
var body string
55+
switch page {
56+
// Pagination logic with next and prev links for 3 pages of results.
57+
case "", "1":
58+
link = linkTo(2, "next")
59+
body = `[{"name": "repo1"}, {"name": "repo2"}]`
60+
case "2":
61+
link = linkTo(1, "prev") + ", " + linkTo(3, "next")
62+
body = `[{"name": "repo3"}, {"name": "repo4"}]`
63+
case "3":
64+
link = linkTo(2, "prev")
65+
body = `[{"name": "repo5"}]`
66+
}
67+
w.Header().Set("link", link)
68+
_, err := w.Write([]byte(body))
69+
assert.NoError(t, err)
70+
return
71+
}
72+
t.Logf("Requested: %s", r.URL.String())
73+
panic("stub required")
74+
}))
75+
defer server.Close()
76+
77+
ctx := context.Background()
78+
ctx = WithApiOverride(ctx, server.URL)
79+
80+
repos, err := getRepositories(ctx, "databrickslabs")
81+
assert.NoError(t, err)
82+
var names []string
83+
for _, repo := range repos {
84+
names = append(names, repo.Name)
85+
}
86+
assert.Equal(t, []string{
87+
"/users/databrickslabs/repos?per_page=100",
88+
"/users/databrickslabs/repos?page=2&per_page=100",
89+
"/users/databrickslabs/repos?page=3&per_page=100",
90+
}, requestedURLs)
91+
assert.Equal(t, []string{"repo1", "repo2", "repo3", "repo4", "repo5"}, names)
92+
}

0 commit comments

Comments
 (0)