rework github implementation to support lfs

Thomas Gilgenast · Thomas Gilgenast · commit c085e2d3b7b2 · 2025-03-14T14:29:13.000-04:00
diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py
@@ -1,6 +1,6 @@
-import requests
+import base64
 
-import fsspec
+import requests
 
 from ..spec import AbstractFileSystem
 from ..utils import infer_storage_options
@@ -36,7 +36,7 @@ class GithubFileSystem(AbstractFileSystem):
     """
 
     url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
-    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
+    content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
     protocol = "github"
     timeout = (60, 60)  # connect, read timeouts
 
@@ -219,21 +219,35 @@ def _open(
     ):
         if mode != "rb":
             raise NotImplementedError
-        url = self.rurl.format(
+
+        # construct a url to hit the GitHub API's repo contents API
+        url = self.content_url.format(
             org=self.org, repo=self.repo, path=path, sha=sha or self.root
         )
+
+        # make a request to this API, and parse the response as JSON
         r = requests.get(url, timeout=self.timeout, **self.kw)
         if r.status_code == 404:
             raise FileNotFoundError(path)
         r.raise_for_status()
+        content_json = r.json()
+
+        # if the response's content key is not empty, try to parse it as base64
+        if content_json["content"]:
+            content = base64.b64decode(content_json["content"])
+
+            # as long as the content does not start with the string
+            # "version https://git-lfs.github.com/"
+            # then it is probably not a git-lfs pointer and we can just return
+            # the content directly
+            if not content.startswith(b"version https://git-lfs.github.com/"):
+                return MemoryFile(None, None, content)
+
+        # we land here if the first response carried no inline content
+        # (regular file over 1MB) or the content was only a git-lfs pointer;
+        # in either case, we get the real content from the download_url
+        r = requests.get(content_json["download_url"], timeout=self.timeout, **self.kw)
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
         return MemoryFile(None, None, r.content)
-
-    def cat(self, path, recursive=False, on_error="raise", **kwargs):
-        paths = self.expand_path(path, recursive=recursive)
-        urls = [
-            self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
-            for u, sh in paths
-        ]
-        fs = fsspec.filesystem("http")
-        data = fs.cat(urls, on_error="return")
-        return {u: v for ((k, v), u) in zip(data.items(), urls)}
diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py
@@ -0,0 +1,41 @@
+import fsspec
+
+
+def test_github_open_small_file():
+    # test opening a small file <1 MB
+    with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f:
+        assert f.readline().startswith(b"species,island")
+
+
+def test_github_open_large_file():
+    # test opening a large file >1 MB
+    with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f:
+        assert f.readline().startswith(b"network,1,1,2,2")
+
+
+def test_github_open_lfs_file():
+    # test opening a git-lfs tracked file
+    with fsspec.open(
+        "github://cBioPortal:datahub@55cd360"
+        "/public/acc_2019/data_gene_panel_matrix.txt",
+    ) as f:
+        assert f.readline().startswith(b"SAMPLE_ID\tmutations")
+
+
+def test_github_cat():
+    # test using cat to fetch the content of multiple files
+    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
+    paths = ["penguins.csv", "mpg.csv"]
+    cat_result = fs.cat(paths)
+    assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"}
+    assert cat_result["penguins.csv"].startswith(b"species,island")
+    assert cat_result["mpg.csv"].startswith(b"mpg,cylinders")
+
+
+def test_github_ls():
+    # test using ls to list the files in a repository
+    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
+    ls_result = set(fs.ls(""))
+    expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
+    # check that the expected files are a subset of the listing result
+    assert expected.issubset(ls_result)