Skip to content

Commit 977525a

Browse files
authored
Merge pull request #259 from scverse/fix_psbulk_gene_order
- `pp.adjmat` now returns the same features as used as input instead of the subset of `net` - `pp.pseudobulk` now returns the features in the same order as used as input instead of shuffling them - Added a dedicated header and 5 attempts to `_download` to mitigate 429 Client Error from Zenodo downloads
2 parents 555e244 + 74371a3 commit 977525a

File tree

6 files changed

+48
-13
lines changed

6 files changed

+48
-13
lines changed

CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ and this project adheres to [Semantic Versioning][].
1111
## 2.1.3
1212

1313
### Changes
14-
- `pp.adjmat` now returns the same features as used as input instead of the subset of `net`.
15-
14+
- `pp.adjmat` now returns the same features as used as input instead of the subset of `net`
15+
- `pp.pseudobulk` now returns the same order features as used as input instead of shuffling them
16+
- Added a dedicated header and 5 attempts to `_download` to mitigate 429 Client Error from Zenodo downloads
1617

1718
## 2.1.2
1819

src/decoupler/_download.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import io
2+
import time
3+
from importlib.metadata import version
24

35
import pandas as pd
46
import requests
@@ -10,16 +12,16 @@
1012
URL_INT = "https://omnipathdb.org/interactions/?genesymbols=1&"
1113

1214

13-
def _download(
15+
def _download_chunks(
1416
url: str,
1517
verbose: bool = False,
1618
) -> io.BytesIO:
1719
assert isinstance(url, str), "url must be str"
1820
# Download with progress bar
19-
m = f"Downloading {url}"
20-
_log(m, level="info", verbose=verbose)
2121
chunks = []
22-
with requests.get(url, stream=True) as r:
22+
__version__ = version("decoupler")
23+
headers = {"User-Agent": f"decoupler/{__version__} (https://github.com/scverse/decoupler)"}
24+
with requests.get(url, stream=True, headers=headers) as r:
2325
r.raise_for_status()
2426
with tqdm(unit="B", unit_scale=True, desc="Progress", disable=not verbose) as pbar:
2527
for chunk in r.iter_content(chunk_size=8192):
@@ -28,6 +30,33 @@ def _download(
2830
pbar.update(len(chunk))
2931
# Read into bytes
3032
data = io.BytesIO(b"".join(chunks))
33+
return data
34+
35+
36+
def _download(
    url: str,
    verbose: bool = False,
    retries: int = 5,
    wait_time: int = 20,
) -> io.BytesIO:
    """Download ``url`` into memory, retrying on HTTP 429 rate limiting.

    Parameters
    ----------
    url
        Address to download from.
    verbose
        Whether to log progress messages.
    retries
        Maximum number of download attempts.
    wait_time
        Seconds to wait between attempts when the server does not
        supply a usable ``Retry-After`` header.

    Returns
    -------
    In-memory buffer with the downloaded content.

    Raises
    ------
    requests.exceptions.HTTPError
        If a non-429 error occurs, or if 429 persists after all retries.
    """
    m = f"Downloading {url}"
    _log(m, level="info", verbose=verbose)
    data = None
    for attempt in range(1, retries + 1):
        try:
            data = _download_chunks(url, verbose=False)
            break
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 429 and attempt < retries:
                # Prefer the server-provided Retry-After (delta-seconds form);
                # fall back to the fixed wait_time when absent or unparseable.
                retry_after = e.response.headers.get("Retry-After") if e.response is not None else None
                try:
                    delay = int(retry_after) if retry_after is not None else wait_time
                except ValueError:
                    delay = wait_time
                _log(
                    f"429 Too Many Requests for {url}. Retrying in {delay}s (attempt {attempt + 1}/{retries})",
                    level="warn",
                    verbose=verbose,
                )
                time.sleep(delay)
                continue
            raise  # Not a 429 or no retries left: re-raise
    m = "Download finished"
    _log(m, level="info", verbose=verbose)
    return data

src/decoupler/mt/_run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _run(
5555
) -> tuple[pd.DataFrame, pd.DataFrame] | AnnData | None:
5656
_log(f"{name} - Running {name}", level="info", verbose=verbose)
5757
# Process data
58-
mat, obs, var = extract(data, layer=layer, raw=raw, empty=empty, verbose=verbose, bsize=bsize)
58+
mat, obs, var = extract(data, layer=layer, raw=raw, empty=empty, shuffle=True, verbose=verbose, bsize=bsize)
5959
issparse = sps.issparse(mat)
6060
isbacked = isinstance(mat, tuple)
6161
# Process net

src/decoupler/pp/anndata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ def pseudobulk(
366366
assert isinstance(groups_col, str | list) or groups_col is None, "groups_col must be str or None"
367367
assert isinstance(mode, str | dict) or callable(mode), "mode must be str, dict or callable"
368368
# Extract data
369-
X, obs, var = extract(adata, layer=layer, raw=raw, empty=empty, bsize=bsize, verbose=verbose)
369+
X, obs, var = extract(adata, layer=layer, raw=raw, empty=empty, bsize=bsize, shuffle=False, verbose=verbose)
370370
assert len(set(obs)) == len(obs), (
371371
"Repeated elements in adata.obs_names, to make them unique run adata.obs_names_make_unique()"
372372
)
@@ -597,7 +597,7 @@ def filter_by_expr(
597597
assert isinstance(large_n, int | float) and large_n >= 0, "large_n must be numeric and > 0"
598598
assert isinstance(min_prop, int | float) and 1 >= min_prop >= 0, "min_prop must be numeric and between 0 and 1"
599599
# Extract inputs
600-
X, _, var_names = extract(adata, empty=False)
600+
X, _, var_names = extract(adata, empty=False, shuffle=False)
601601
isbacked = isinstance(X, tuple)
602602
assert not isbacked, "adata is in backed mode, reload adata without backed='r'"
603603
obs = adata.obs

src/decoupler/pp/data.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def extract(
132132
layer: str | None = None,
133133
raw: bool = False,
134134
empty: bool = True,
135+
shuffle: bool = True,
135136
verbose: bool = False,
136137
bsize: int = 250_000,
137138
) -> tuple[np.ndarray, np.ndarray, np.ndarray] | tuple[tuple[np.ndarray, np.ndarray], np.ndarray, np.ndarray]:
@@ -144,6 +145,8 @@ def extract(
144145
%(layer)s
145146
%(raw)s
146147
%(empty)s
148+
shuffle
149+
Whether to shuffle features to ensure ties are broken.
147150
%(verbose)s
148151
149152
Returns
@@ -169,7 +172,8 @@ def extract(
169172
if not isbacked:
170173
mat, row, col = _validate_mat(mat=mat, row=row, col=col, empty=empty, verbose=verbose)
171174
# Randomly sort features
172-
mat, col = _break_ties(mat=mat, features=col)
175+
if shuffle:
176+
mat, col = _break_ties(mat=mat, features=col)
173177
mat_tuple = (mat, row, col)
174178
else:
175179
msk_col = _validate_backed(mat=mat, row=row, col=col, empty=empty, verbose=verbose, bsize=bsize)

tests/pp/test_anndata.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_pseudobulk(
6868
empty,
6969
rng,
7070
):
71-
adata, _ = dc.ds.toy(nobs=10000, nvar=500, bval=2, seed=42, verbose=False)
71+
adata, _ = dc.ds.toy(nobs=10000, nvar=900, bval=2, seed=42, verbose=False)
7272
adata.layers["counts"] = adata.X.round()
7373
adata.obs["sample"] = adata.obs["sample"]
7474
adata.obs["dose"] = rng.choice(["low", "medium", "high"], size=adata.n_obs, replace=True)
@@ -105,14 +105,15 @@ def _run_psbulk():
105105
)
106106
return pdata
107107

108-
l_mem_usage, pdata = memory_usage(_run_psbulk, retval=True, interval=0.01)
108+
l_mem_usage, pdata = memory_usage(_run_psbulk, retval=True, interval=0.001)
109109
l_mem_usage = max(l_mem_usage) - min(l_mem_usage)
110110
assert isinstance(pdata, ad.AnnData)
111111
assert pdata.shape[0] < adata.shape[0]
112112
if empty:
113113
assert pdata.shape[1] < adata.shape[1]
114114
else:
115115
assert pdata.shape[1] == adata.shape[1]
116+
assert all(pdata.var_names == adata.var_names[adata.var_names.isin(pdata.var_names)])
116117
assert not pdata.obs["sample"].str.contains("_").any()
117118
obs_cols = {"psbulk_cells", "psbulk_counts"}
118119
assert obs_cols.issubset(pdata.obs.columns)
@@ -139,7 +140,7 @@ def _run_psbulk_backed_data():
139140
)
140141
return pbdata
141142

142-
b_mem_usage, pbdata = memory_usage(_run_psbulk_backed_data, retval=True, interval=0.01)
143+
b_mem_usage, pbdata = memory_usage(_run_psbulk_backed_data, retval=True, interval=0.001)
143144
b_mem_usage = max(b_mem_usage) - min(b_mem_usage)
144145
assert b_mem_usage < l_mem_usage
145146
msk = pbdata.X.sum(1) != 0

0 commit comments

Comments
 (0)