Skip to content

Commit aa6736d

Browse files
authored
feat: HTTP archive fallback for orphaned git commits (#9)
* feat: add HTTP archive fallback for orphaned git commits

  When git fetch fails for a commit ID (e.g. orphaned commits unreachable from any ref), fall back to downloading a tarball archive from the hosting platform's HTTP API. Supports GitHub, GitLab, and Bitbucket. Credentials are resolved from URL userinfo, netrc, or environment variables (GH_TOKEN, GITHUB_TOKEN, GITLAB_TOKEN, GL_TOKEN, BITBUCKET_TOKEN). SSH-only users will need HTTP credentials configured for private repositories.
* fix: check resp.Body.Close error to satisfy errcheck linter
* chore: remove redundant +build lines in favor of go:build directives
* add integration tests
* unpack archive at correct level
1 parent c22b28e commit aa6736d

File tree

7 files changed

+754
-4
lines changed

7 files changed

+754
-4
lines changed

detect_file_unix_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// SPDX-License-Identifier: MPL-2.0
33

44
//go:build test || unix
5-
// +build test unix
65

76
package getter
87

get_file_unix.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// SPDX-License-Identifier: MPL-2.0
33

44
//go:build !windows
5-
// +build !windows
65

76
package getter
87

get_file_windows.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// SPDX-License-Identifier: MPL-2.0
33

44
//go:build windows
5-
// +build windows
65

76
package getter
87

get_git.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"bytes"
88
"context"
99
"encoding/base64"
10+
"errors"
1011
"fmt"
1112
"net/url"
1213
"os"
@@ -126,6 +127,16 @@ func (g *GitGetter) Get(ctx context.Context, dst string, u *url.URL) error {
126127
err = g.clone(ctx, dst, sshKeyFile, u, ref, depth, subdir)
127128
}
128129
if err != nil {
130+
// If git operations failed for a commit ID, try downloading via
131+
// the hosting platform's HTTP archive endpoint. This handles
132+
// orphaned commits that are unreachable via the git protocol.
133+
if gitCommitIDRegex.MatchString(ref) {
134+
if archiveErr := fetchArchive(ctx, dst, u, ref, subdir); archiveErr == nil {
135+
return nil
136+
} else {
137+
return errors.Join(err, archiveErr)
138+
}
139+
}
129140
return err
130141
}
131142

get_git_archive.go

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
package getter
2+
3+
import (
4+
"archive/tar"
5+
"compress/gzip"
6+
"context"
7+
"fmt"
8+
"io"
9+
"net/http"
10+
"net/url"
11+
"os"
12+
"path/filepath"
13+
"strings"
14+
)
15+
16+
// archiveURLOverride, when non-empty, replaces the URL that fetchArchive
// downloads from; fetchArchive then skips archiveURL entirely. This exists so
// that tests can point at a local httptest server without needing a real
// hosting platform.
//
// NOTE(review): this is a plain package-level variable with no
// synchronisation, so presumably tests that set it must restore it afterwards
// and must not run in parallel with other fetchArchive callers — confirm
// against the test suite.
var archiveURLOverride string
20+
21+
// fetchArchive downloads a tarball archive of the given commit from the
22+
// hosting platform's HTTP API and extracts it to dst. This is used as a
23+
// fallback when git-fetch cannot retrieve a commit (e.g. orphaned commits
24+
// that are unreachable from any ref).
25+
//
26+
// Authentication is resolved from URL userinfo, netrc, or environment
27+
// variables (GH_TOKEN, GITLAB_TOKEN, etc.). SSH keys cannot be used here
28+
// since this is an HTTP download — if the user only has SSH credentials
29+
// configured, this fallback will fail for private repositories.
30+
//
31+
// If subdir is non-empty, only files under that subdirectory are placed in
32+
// dst. The resulting directory is NOT a git repository.
33+
func fetchArchive(ctx context.Context, dst string, u *url.URL, ref string, subdir string) error {
34+
aURL := archiveURLOverride
35+
if aURL == "" {
36+
var err error
37+
aURL, err = archiveURL(u, ref)
38+
if err != nil {
39+
return err
40+
}
41+
}
42+
43+
// Parse the archive URL so we can attach credentials.
44+
archiveParsed, err := url.Parse(aURL)
45+
if err != nil {
46+
return err
47+
}
48+
49+
// Carry over credentials from the original git URL if present,
50+
// otherwise fall back to the user's netrc file. Skip the common SSH
51+
// placeholder user "git" since it isn't a real credential.
52+
if u.User != nil && u.User.Username() != "" && u.User.Username() != "git" {
53+
archiveParsed.User = u.User
54+
} else if err := addAuthFromNetrc(archiveParsed); err != nil {
55+
return err
56+
}
57+
58+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, archiveParsed.String(), nil)
59+
if err != nil {
60+
return err
61+
}
62+
63+
if archiveParsed.User != nil {
64+
password, _ := archiveParsed.User.Password()
65+
req.SetBasicAuth(archiveParsed.User.Username(), password)
66+
} else if token := tokenFromEnv(u.Host); token != "" {
67+
req.Header.Set("Authorization", "Bearer "+token)
68+
}
69+
70+
resp, err := http.DefaultClient.Do(req)
71+
if err != nil {
72+
return fmt.Errorf("failed to download archive from %s: %w", aURL, err)
73+
}
74+
defer func() { _ = resp.Body.Close() }()
75+
76+
if resp.StatusCode != http.StatusOK {
77+
return fmt.Errorf("failed to download archive (%s): HTTP %d", aURL, resp.StatusCode)
78+
}
79+
80+
gzipR, err := gzip.NewReader(resp.Body)
81+
if err != nil {
82+
return fmt.Errorf("failed to decompress archive: %w", err)
83+
}
84+
defer func() { _ = gzipR.Close() }()
85+
86+
if err := extractArchive(gzipR, dst, subdir); err != nil {
87+
return fmt.Errorf("failed to extract archive: %w", err)
88+
}
89+
90+
return nil
91+
}
92+
93+
// extractArchive reads a tar stream and extracts its contents into dst. The
94+
// archive is expected to contain a single top-level directory (e.g.
95+
// "repo-sha/") which is stripped from all paths. If subdir is non-empty, only
96+
// entries under that subdirectory are extracted, and the subdir path is
97+
// preserved relative to dst.
98+
//
99+
// This does not reuse the shared untar helper because hosting-platform
100+
// archives require stripping the top-level directory and filtering by subdir,
101+
// neither of which untar supports. Adding those concerns to untar would
102+
// complicate a function shared by all tar-based decompressors.
103+
func extractArchive(r io.Reader, dst string, subdir string) error {
104+
tarR := tar.NewReader(r)
105+
topDir := ""
106+
found := false
107+
108+
for {
109+
hdr, err := tarR.Next()
110+
if err == io.EOF {
111+
break
112+
}
113+
if err != nil {
114+
return err
115+
}
116+
117+
if hdr.Typeflag == tar.TypeXGlobalHeader || hdr.Typeflag == tar.TypeXHeader {
118+
continue
119+
}
120+
121+
// Disallow parent traversal.
122+
if containsDotDot(hdr.Name) {
123+
return fmt.Errorf("entry contains '..': %s", hdr.Name)
124+
}
125+
126+
// Discover and strip the top-level directory.
127+
if topDir == "" {
128+
topDir = strings.SplitN(hdr.Name, "/", 2)[0] + "/"
129+
}
130+
rel := strings.TrimPrefix(hdr.Name, topDir)
131+
if rel == "" {
132+
// This is the top-level directory entry itself; skip it.
133+
continue
134+
}
135+
136+
// If a subdir filter is set, skip entries outside it.
137+
if subdir != "" {
138+
subdirPrefix := strings.TrimRight(subdir, "/") + "/"
139+
if !strings.HasPrefix(rel, subdirPrefix) {
140+
continue
141+
}
142+
}
143+
144+
found = true
145+
outPath := filepath.Join(dst, filepath.FromSlash(rel))
146+
147+
if hdr.FileInfo().IsDir() {
148+
if err := os.MkdirAll(outPath, 0755); err != nil {
149+
return err
150+
}
151+
continue
152+
}
153+
154+
// Ensure parent directory exists.
155+
if err := os.MkdirAll(filepath.Dir(outPath), 0755); err != nil {
156+
return err
157+
}
158+
159+
if err := copyReader(outPath, tarR, hdr.FileInfo().Mode(), 0, 0); err != nil {
160+
return err
161+
}
162+
}
163+
164+
if subdir != "" && !found {
165+
return fmt.Errorf("path %q not found in archive", subdir)
166+
}
167+
168+
return nil
169+
}
170+
171+
// archiveURL constructs a tarball download URL for the given ref based on the
172+
// hosting platform detected from u's hostname. The API endpoints are used
173+
// rather than the web URLs because they resolve short commit SHAs.
174+
func archiveURL(u *url.URL, ref string) (string, error) {
175+
owner, repo, err := parseOwnerRepo(u.Path)
176+
if err != nil {
177+
return "", err
178+
}
179+
180+
host := strings.ToLower(u.Host)
181+
// Strip port if present (e.g. "github.com:443" → "github.com").
182+
if i := strings.LastIndex(host, ":"); i != -1 {
183+
host = host[:i]
184+
}
185+
186+
switch {
187+
case host == "github.com" || strings.HasSuffix(host, ".github.com"):
188+
return fmt.Sprintf("https://api.github.com/repos/%s/%s/tarball/%s", owner, repo, ref), nil
189+
case host == "gitlab.com" || strings.HasSuffix(host, ".gitlab.com"):
190+
return fmt.Sprintf("https://gitlab.com/api/v4/projects/%s%%2F%s/repository/archive.tar.gz?sha=%s", owner, repo, ref), nil
191+
case host == "bitbucket.org" || strings.HasSuffix(host, ".bitbucket.org"):
192+
return fmt.Sprintf("https://bitbucket.org/%s/%s/get/%s.tar.gz", owner, repo, ref), nil
193+
default:
194+
return "", fmt.Errorf("unsupported git hosting platform %q for archive fallback", u.Host)
195+
}
196+
}
197+
198+
// tokenFromEnv looks up an API token for host in well-known environment
// variables and returns the first non-empty value, or "" when none is set or
// the host is not a recognised platform.
//
// host is matched case-insensitively, with anything after its last ':'
// removed, against github.com, gitlab.com and bitbucket.org and their
// subdomains. Variables are consulted in this order:
//
//	github.com:    GH_TOKEN, GITHUB_TOKEN
//	gitlab.com:    GITLAB_TOKEN, GL_TOKEN
//	bitbucket.org: BITBUCKET_TOKEN
func tokenFromEnv(host string) string {
	name := strings.ToLower(host)
	// Strip port if present.
	if colon := strings.LastIndex(name, ":"); colon >= 0 {
		name = name[:colon]
	}

	// onPlatform reports whether name is domain itself or one of its subdomains.
	onPlatform := func(domain string) bool {
		return name == domain || strings.HasSuffix(name, "."+domain)
	}

	var candidates []string
	switch {
	case onPlatform("github.com"):
		// GH_TOKEN is the newer GitHub CLI convention; GITHUB_TOKEN is the
		// widely-used CI/Actions variable.
		candidates = []string{"GH_TOKEN", "GITHUB_TOKEN"}
	case onPlatform("gitlab.com"):
		candidates = []string{"GITLAB_TOKEN", "GL_TOKEN"}
	case onPlatform("bitbucket.org"):
		candidates = []string{"BITBUCKET_TOKEN"}
	}

	for _, key := range candidates {
		if value := os.Getenv(key); value != "" {
			return value
		}
	}
	return ""
}
226+
227+
// parseOwnerRepo extracts the owner and repository name from a URL path
// like "/owner/repo.git" or "/owner/repo".
//
// One leading slash and any trailing slashes are ignored. The first two path
// segments are returned as owner and repo, and any further segments are
// discarded. A ".git" suffix is removed from the repository segment itself,
// so "/owner/repo.git/" and "/owner/repo.git/extra" both yield "repo".
//
// An error is returned when either segment is missing or empty.
func parseOwnerRepo(rawPath string) (owner, repo string, err error) {
	trimmed := strings.TrimRight(strings.TrimPrefix(rawPath, "/"), "/")
	parts := strings.SplitN(trimmed, "/", 3)
	if len(parts) >= 2 {
		owner = parts[0]
		// Strip the suffix from the repo segment only; trimming the whole
		// path would miss it whenever anything follows the repository name.
		repo = strings.TrimSuffix(parts[1], ".git")
	}
	if owner == "" || repo == "" {
		return "", "", fmt.Errorf("cannot parse owner/repo from path %q", rawPath)
	}
	return owner, repo, nil
}

0 commit comments

Comments
 (0)