Skip to content

Commit c085e2d

Browse files
author
Thomas Gilgenast
committed
rework github implementation to support lfs
1 parent 6b85a47 commit c085e2d

File tree

2 files changed

+69
-14
lines changed

2 files changed

+69
-14
lines changed

fsspec/implementations/github.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import requests
1+
import base64
22

3-
import fsspec
3+
import requests
44

55
from ..spec import AbstractFileSystem
66
from ..utils import infer_storage_options
@@ -36,7 +36,7 @@ class GithubFileSystem(AbstractFileSystem):
3636
"""
3737

3838
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
39-
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
39+
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
4040
protocol = "github"
4141
timeout = (60, 60) # connect, read timeouts
4242

@@ -219,21 +219,35 @@ def _open(
219219
):
220220
if mode != "rb":
221221
raise NotImplementedError
222-
url = self.rurl.format(
222+
223+
# construct a url to hit the GitHub API's repo contents API
224+
url = self.content_url.format(
223225
org=self.org, repo=self.repo, path=path, sha=sha or self.root
224226
)
227+
228+
# make a request to this API, and parse the response as JSON
225229
r = requests.get(url, timeout=self.timeout, **self.kw)
226230
if r.status_code == 404:
227231
raise FileNotFoundError(path)
228232
r.raise_for_status()
233+
content_json = r.json()
234+
235+
# if the response's content key is not empty, try to parse it as base64
236+
if content_json["content"]:
237+
content = base64.b64decode(content_json["content"])
238+
239+
# as long as the content does not start with the string
240+
# "version https://git-lfs.github.com/"
241+
# then it is probably not a git-lfs pointer and we can just return
242+
# the content directly
243+
if not content.startswith(b"version https://git-lfs.github.com/"):
244+
return MemoryFile(None, None, content)
245+
246+
# we land here if the content was missing from the first response
247+
# (regular file over 1MB) or if it was only a git-lfs pointer (git-lfs tracked file)
248+
# in this case, we get the content from the download_url
249+
r = requests.get(content_json["download_url"], timeout=self.timeout, **self.kw)
250+
if r.status_code == 404:
251+
raise FileNotFoundError(path)
252+
r.raise_for_status()
229253
return MemoryFile(None, None, r.content)
230-
231-
def cat(self, path, recursive=False, on_error="raise", **kwargs):
232-
paths = self.expand_path(path, recursive=recursive)
233-
urls = [
234-
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
235-
for u, sh in paths
236-
]
237-
fs = fsspec.filesystem("http")
238-
data = fs.cat(urls, on_error="return")
239-
return {u: v for ((k, v), u) in zip(data.items(), urls)}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import fsspec
2+
3+
4+
def test_github_open_small_file():
5+
# test opening a small file <1 MB
6+
with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f:
7+
assert f.readline().startswith(b"species,island")
8+
9+
10+
def test_github_open_large_file():
11+
# test opening a large file >1 MB
12+
with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f:
13+
assert f.readline().startswith(b"network,1,1,2,2")
14+
15+
16+
def test_github_open_lfs_file():
17+
# test opening a git-lfs tracked file
18+
with fsspec.open(
19+
"github://cBioPortal:datahub@55cd360"
20+
"/public/acc_2019/data_gene_panel_matrix.txt",
21+
) as f:
22+
assert f.readline().startswith(b"SAMPLE_ID\tmutations")
23+
24+
25+
def test_github_cat():
26+
# test using cat to fetch the content of multiple files
27+
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
28+
paths = ["penguins.csv", "mpg.csv"]
29+
cat_result = fs.cat(paths)
30+
assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"}
31+
assert cat_result["penguins.csv"].startswith(b"species,island")
32+
assert cat_result["mpg.csv"].startswith(b"mpg,cylinders")
33+
34+
35+
def test_github_ls():
36+
# test using ls to list the files in a repository
37+
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
38+
ls_result = set(fs.ls(""))
39+
expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
40+
# check that every expected file is present in the listing (expected is a subset of the result)
41+
assert expected.issubset(ls_result)

0 commit comments

Comments
 (0)