-
Notifications
You must be signed in to change notification settings - Fork 409
rework github _open() implementation to support LFS #1810
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
c085e2d
a751042
f505aff
7d88086
75a6f4e
8e7fb56
083392b
65e09cf
38aadce
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import fsspec | ||
|
||
|
||
def test_github_open_small_file(): | ||
# test opening a small file <1 MB | ||
with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f: | ||
assert f.readline().startswith(b"species,island") | ||
|
||
|
||
def test_github_open_large_file(): | ||
# test opening a large file >1 MB | ||
# use block_size=0 to get a streaming interface to the file, ensuring that | ||
# we fetch only the parts we need instead of downloading the full file all | ||
# at once | ||
with fsspec.open( | ||
"github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0 | ||
) as f: | ||
# read only the first 20 bytes of the file | ||
assert f.read(20) == b"network,1,1,2,2,3,3," | ||
|
||
|
||
def test_github_open_lfs_file(): | ||
# test opening a git-lfs tracked file | ||
with fsspec.open( | ||
"github://cBioPortal:datahub@55cd360" | ||
"/public/acc_2019/data_gene_panel_matrix.txt", | ||
block_size=0, | ||
) as f: | ||
assert f.read(19) == b"SAMPLE_ID\tmutations" | ||
|
||
|
||
def test_github_cat(): | ||
# test using cat to fetch the content of multiple files | ||
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") | ||
paths = ["penguins.csv", "mpg.csv"] | ||
cat_result = fs.cat(paths) | ||
assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"} | ||
assert cat_result["penguins.csv"].startswith(b"species,island") | ||
assert cat_result["mpg.csv"].startswith(b"mpg,cylinders") | ||
|
||
|
||
def test_github_ls(): | ||
# test using ls to list the files in a repository | ||
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") | ||
ls_result = set(fs.ls("")) | ||
expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"} | ||
# check that the expected files are a subset of the listing result | ||
assert expected.issubset(ls_result) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,7 +56,7 @@ full = [ | |
fuse = ["fusepy"] | ||
gcs = ["gcsfs"] | ||
git = ["pygit2"] | ||
github = ["requests"] | ||
github = ["fsspec[http]", "requests"] | ||
|
||
gs = ["gcsfs"] | ||
gui = ["panel"] | ||
hdfs = ["pyarrow >= 1"] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there might be an alternative here: instead, just do something like [suggested code snippet not captured in this extract],
which avoids some of the complexity of creating the HTTPFile directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've gone ahead and made this change in 75a6f4e but would be happy to revert if it's going in the wrong direction.