|
7 | 7 | import logging |
8 | 8 | import os |
9 | 9 | import subprocess |
| 10 | +import tarfile |
| 11 | +import tempfile |
10 | 12 | import warnings |
11 | 13 | from getpass import getpass |
12 | | -from io import StringIO |
13 | | -from typing import Optional, List, Callable |
| 14 | +from io import StringIO, BytesIO |
| 15 | +from os.path import join |
| 16 | +from typing import Optional, List, Callable, Any |
14 | 17 |
|
15 | 18 | import anndata as ad |
16 | 19 | import numpy as np |
17 | 20 | import pandas as pd |
| 21 | +import scanpy |
18 | 22 | from anndata import AnnData |
19 | 23 | from pandas import DataFrame |
20 | 24 |
|
@@ -1667,12 +1671,51 @@ def make_anndata(pack): |
1667 | 1671 | pass |
1668 | 1672 | return out |
1669 | 1673 |
|
1670 | | - def get_differential_expression_values(self, |
1671 | | - dataset:Optional[str|int] = None, |
1672 | | - keep_non_specific:bool = False, |
1673 | | - result_sets:Optional[List[str|int]] = None, |
1674 | | - readable_contrasts:bool = False, |
1675 | | - **kwargs)->List[DataFrame]: |
def get_single_cell_dataset_object(self, dataset: str | int,
                                   download_dir=None) -> AnnData:
    """
    Retrieve the single-cell expression data of a dataset as an AnnData.

    The data is obtained as a TAR archive from
    ``self.raw.get_dataset_single_cell_expression``, extracted to a
    temporary directory, and every top-level entry of the archive is read
    with ``scanpy.read_10x_mtx``. The per-sample objects are then combined
    with ``scanpy.concat``.

    :param dataset: Identifier of the dataset, given as a string or an
      integer.
    :param download_dir: Directory where datasets can be downloaded, or else
      the data will be retrieved in-memory. An existing ``<dataset>.tar``
      found in that directory is reused instead of being downloaded again.
    :return: An AnnData object combining all the samples found in the
      archive.
    """
    # Identifiers may be integers; normalise once so that building the
    # cache file name cannot fail with a TypeError.
    dataset_name = str(dataset)

    def resolve():
        """Return a binary file object holding the dataset TAR archive."""
        if not download_dir:
            logger.info("Downloading single-cell data data for %s...",
                        dataset_name)
            return BytesIO(
                self.raw.get_dataset_single_cell_expression(dataset))

        dest = join(download_dir, dataset_name + '.tar')
        if not os.path.exists(dest):
            logger.info('Downloading single-cell data for %s to %s...',
                        dataset, download_dir)
            # Download before touching the filesystem, then publish the
            # file atomically: a failed or interrupted download must never
            # leave a truncated archive that later calls would mistake for
            # a valid cached copy.
            payload = self.raw.get_dataset_single_cell_expression(dataset)
            partial = dest + '.part'
            try:
                with open(partial, 'wb') as out:
                    out.write(payload)
                os.replace(partial, dest)
            finally:
                # No-op after a successful os.replace(); removes the
                # leftover partial file otherwise.
                if os.path.exists(partial):
                    os.remove(partial)
        return open(dest, 'rb')

    with (resolve() as f, tarfile.open(fileobj=f) as tf,
          tempfile.TemporaryDirectory() as tmpdir):
        logger.info('Extracting TAR file for %s to %s...', dataset_name,
                    tmpdir)
        # The archive comes from a remote service: when the interpreter
        # supports extraction filters, use the 'data' filter so members
        # cannot escape tmpdir (it also avoids the DeprecationWarning
        # raised by filter-less extractall() on Python 3.12/3.13).
        if hasattr(tarfile, 'data_filter'):
            tf.extractall(tmpdir, filter='data')
        else:
            tf.extractall(tmpdir)

        samples = []
        # os.listdir() returns entries in arbitrary order; sort them so the
        # layout of the concatenated object is reproducible.
        for sample_dir in sorted(os.listdir(tmpdir)):
            logger.info('Reading MEX data for %s...', sample_dir)
            # Gemma already guarantees unicity of cell identifiers and
            # scanpy cannot deal with numeric gene identifiers when
            # make_unique is True, so we skip that part
            samples.append(scanpy.read_10x_mtx(join(tmpdir, sample_dir),
                                               make_unique=False))
        # NOTE(review): read_10x_mtx presumably yields cells as observations
        # and genes as variables; confirm that concatenating the samples
        # along "var" (rather than "obs") is what is intended here.
        return scanpy.concat(samples, axis="var")
| 1712 | + |
| 1713 | + def get_differential_expression_values(self, |
| 1714 | + dataset: Optional[str | int] = None, |
| 1715 | + keep_non_specific: bool = False, |
| 1716 | + result_sets: Optional[List[str | int]] = None, |
| 1717 | + readable_contrasts: bool = False, |
| 1718 | + **kwargs) -> List[DataFrame]: |
1676 | 1719 | """ |
1677 | 1720 | Retrieves the differential expression resultSet(s) associated with the dataset. |
1678 | 1721 | If there is more than one resultSet, use get_result_sets() to see the options |
|
0 commit comments