Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ var/
*.pex
artifacts/
wheelhouse*/
condaenv*

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
8 changes: 8 additions & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: mutation_indexer
dependencies:
- python~=3.10.0
- pip
- pip:
- --index-url https://nexus.osdc.io/repository/pypi-all/simple
- -r requirements.txt
- "."
22 changes: 12 additions & 10 deletions master-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --extra=master --index-url=https://nexus.osdc.io/repository/pypi-all/simple --output-file=master-requirements.txt
# pip-compile --extra=master --index-url=https://nexus.osdc.io/repository/pypi-all/simple --output-file=master-requirements.txt --strip-extras
#
--index-url https://nexus.osdc.io/repository/pypi-all/simple

aiohttp==3.9.3
# via elasticsearch
aiohttp==3.9.5
# via
# elasticsearch
# gdc-mutation-indexer (setup.py)
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.3
Expand All @@ -26,19 +28,19 @@ colorama==0.4.6
# log-symbols
deepdiff==6.7.1
# via gdcmodels
elasticsearch[async]==7.17.9
elasticsearch==7.17.9
# via
# gdc-mutation-indexer (setup.py)
# gdcmodels
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
gdcmodels==5.0.0
gdcmodels==5.1.0
# via gdc-mutation-indexer (setup.py)
halo==0.0.31
# via gdc-mutation-indexer (setup.py)
idna==3.6
idna==3.7
# via
# requests
# yarl
Expand All @@ -48,11 +50,11 @@ indexclient==2.4.0
# via gdc-mutation-indexer (setup.py)
log-symbols==0.0.14
# via halo
marshmallow==3.21.1
marshmallow==3.21.2
# via
# marshmallow-dataclass
# marshmallow-enum
marshmallow-dataclass==8.6.0
marshmallow-dataclass==8.6.1
# via gdc-mutation-indexer (setup.py)
marshmallow-enum==1.5.1
# via gdc-mutation-indexer (setup.py)
Expand All @@ -74,7 +76,7 @@ ordered-set==4.1.0
# via deepdiff
packaging==24.0
# via marshmallow
pex==2.2.2
pex==2.3.1
# via gdc-mutation-indexer (setup.py)
py4j==0.10.9.5
# via pyspark
Expand All @@ -94,7 +96,7 @@ termcolor==2.4.0
# via halo
toml==0.10.2
# via gdc-mutation-indexer (setup.py)
typing-extensions==4.10.0
typing-extensions==4.11.0
# via
# gdc-mutation-indexer (setup.py)
# gdcmodels
Expand Down
16 changes: 7 additions & 9 deletions master.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import os
import tempfile
import uuid
from collections.abc import Iterable, Iterator, Mapping
from os import path
from typing import Any, Iterable, Iterator, Mapping, Optional, Tuple
from typing import Any

import elasticsearch
import halo
Expand Down Expand Up @@ -72,7 +73,9 @@ def load_config_data(
Returns:
the final configuration data as a mapping.
"""
default_config = toml.loads(resources.read_text(mutation_indexer, "configuration.toml"))
default_config = toml.loads(
resources.read_text(mutation_indexer, "configuration.toml")
)
default_config["build"]["config_file"] = final_config_file
user_config = toml.load(user_config_file)

Expand Down Expand Up @@ -145,20 +148,15 @@ def get_config(
write_manifest(config)


def get_file_args(config: configuration.Configuration) -> Iterable[Tuple[str, str]]:
def get_file_args(config: configuration.Configuration) -> Iterable[tuple[str, str]]:
"""
Sets the spark-submit config values as well as jars params.

Yields:
a tuple of argument flag and value.
"""
build = config.build
files = ",".join(
(
f"{config.build.config_file}#configuration.toml",
path.join(ROOT_DIR, "mutation-indexer.pex#mutation-indexer.pex"),
)
)
files = ",".join((f"{config.build.config_file}#configuration.toml",))

yield (
"--conf",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ testpaths = [
]

[tool.mypy]
enable_incomplete_feature = ["Unpack"]
plugins = ["marshmallow_dataclass.mypy"]
8 changes: 8 additions & 0 deletions release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/bash
#
# Build the conda environment described by environment.yaml, pack it into a
# tarball with conda-pack, and copy that tarball to the remote host.

# Stop at the first failing command, unset variable, or failed pipeline stage
# so that a broken or missing archive is never copied to the remote host.
set -euo pipefail

# NOTE(review): presumably set to a placeholder path so that packages from the
# invoking user's own site-packages do not leak into the build -- confirm.
export PYTHONUSERBASE=intentionally-disabled

# Remove the temporary environment and archive on every exit path, including
# failures; the glob is expanded when the trap fires, not when it is set.
trap 'rm -fr /tmp/mutation_indexer*' EXIT

conda env create -f environment.yaml -p /tmp/mutation_indexer
conda-pack -p /tmp/mutation_indexer -o /tmp/mutation_indexer.tar.gz
scp /tmp/mutation_indexer.tar.gz micky@172.23.8.228:/home/micky
20 changes: 11 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --index-url=https://nexus.osdc.io/repository/pypi-all/simple
# pip-compile --index-url=https://nexus.osdc.io/repository/pypi-all/simple --strip-extras
#
--index-url https://nexus.osdc.io/repository/pypi-all/simple

aiohttp==3.9.3
# via elasticsearch
aiohttp==3.9.5
# via
# elasticsearch
# gdc-mutation-indexer (setup.py)
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.3
Expand All @@ -22,29 +24,29 @@ charset-normalizer==3.3.2
# via requests
deepdiff==6.7.1
# via gdcmodels
elasticsearch[async]==7.17.9
elasticsearch==7.17.9
# via
# gdc-mutation-indexer (setup.py)
# gdcmodels
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
gdcmodels==5.0.0
gdcmodels==5.1.0
# via gdc-mutation-indexer (setup.py)
idna==3.6
idna==3.7
# via
# requests
# yarl
importlib-resources==3.3.1
# via gdc-mutation-indexer (setup.py)
indexclient==2.4.0
# via gdc-mutation-indexer (setup.py)
marshmallow==3.21.1
marshmallow==3.21.2
# via
# marshmallow-dataclass
# marshmallow-enum
marshmallow-dataclass==8.6.0
marshmallow-dataclass==8.6.1
# via gdc-mutation-indexer (setup.py)
marshmallow-enum==1.5.1
# via gdc-mutation-indexer (setup.py)
Expand Down Expand Up @@ -78,7 +80,7 @@ requests==2.31.0
# via indexclient
toml==0.10.2
# via gdc-mutation-indexer (setup.py)
typing-extensions==4.10.0
typing-extensions==4.11.0
# via
# gdc-mutation-indexer (setup.py)
# gdcmodels
Expand Down
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
],
license="Apache",
include_package_data=True,
install_requires=[
install_requires=(
"aiohttp",
"elasticsearch[async]~=7.6",
"importlib-resources~=3.2",
"marshmallow-dataclass~=8.5",
Expand All @@ -35,15 +36,17 @@
"typing-extensions~=4.1",
"indexclient",
"gdcmodels",
"exceptiongroup",
"mutation-indexer-resource @ git+ssh://git@github.com/NCI-GDC/mutation-indexer-resource.git@1.0.0",
],
),
extras_require={
"dev": (
"click~=8.1",
"coverage[toml]~=7.0",
"deepdiff~=6.0",
"pre-commit==1.21.0",
"pytest~=7.0",
"pytest-asyncio",
"pytest-cov~=4.0",
),
"master": ("halo~=0.0", "pex~=2.1"),
Expand Down
22 changes: 22 additions & 0 deletions src/mutation_indexer/aioutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import asyncio
import functools
from collections.abc import Awaitable, Callable
from typing import TypeVar

from typing_extensions import ParamSpec

__all__ = ("to_thread",)


TParams = ParamSpec("TParams")
TReturn = TypeVar("TReturn")


def to_thread(
func: Callable[TParams, TReturn]
) -> Callable[TParams, Awaitable[TReturn]]:
@functools.wraps(func)
def wrapper(*args: TParams.args, **kwargs: TParams.kwargs) -> Awaitable[TReturn]:
return asyncio.to_thread(func, *args, **kwargs)

return wrapper
8 changes: 4 additions & 4 deletions src/mutation_indexer/builders/ascat.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ def __init__(

self._document_dataframe_util = document_dataframe_util

def _build_document_df(self, doc_ids: Iterable[str]) -> sql.DataFrame:
document_df = self._document_dataframe_util.get_dataframe(
async def _build_document_df(self, doc_ids: Iterable[str]) -> sql.DataFrame:
document_df = await self._document_dataframe_util.get_dataframe(
doc_ids, schema=schemas.load_schema("builders/ascat/ascat_document.yaml")
)

Expand All @@ -202,7 +202,7 @@ def _build_document_df(self, doc_ids: Iterable[str]) -> sql.DataFrame:

return _add_cnv_change(document_df)

def _build_from_scratch(self, input_dfs: ASCATInputs) -> sql.DataFrame:
async def _build_from_scratch(self, input_dfs: ASCATInputs) -> sql.DataFrame:
"""Builds the ASCAT dataframe

ascat {}
Expand Down Expand Up @@ -279,7 +279,7 @@ def _build_from_scratch(self, input_dfs: ASCATInputs) -> sql.DataFrame:
.where(utils.is_protein_coding())
.where(utils.is_between_chr1_and_chr22())
)
document_df = self._build_document_df(
document_df = await self._build_document_df(
r.file_id for r in ascat_metadata_df.select("file_id").toLocalIterator()
)
ascat_df = document_df.join(ascat_metadata_df, on=["file_id"]).join(
Expand Down
9 changes: 5 additions & 4 deletions src/mutation_indexer/builders/ascat_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def __init__(
output=build.DataFrame.ASCAT_METADATA,
)

def _build_from_scratch(self, input_dfs: ASCATMetadataInputs) -> sql.DataFrame:
async def _build_from_scratch(
self, input_dfs: ASCATMetadataInputs
) -> sql.DataFrame:
filters = [
{
"bool": {
Expand Down Expand Up @@ -70,7 +72,6 @@ def _build_from_scratch(self, input_dfs: ASCATMetadataInputs) -> sql.DataFrame:
}
}
]
df = await self._get_primary_aliquot_df(filters, entities=self.CASE_ONLY)

return self._get_primary_aliquot_df(
filters, entities=frozenset(("case",))
).select("aliquot_id", "case_id", "file_id")
return df.select("aliquot_id", "case_id", "file_id")
Loading