diff --git a/.gitignore b/.gitignore index b6e4761..bd6ad26 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..e892c22 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-yaml + - id: check-json + - id: debug-statements + - id: detect-private-key + - id: end-of-file-fixer + files: \.py$ + - id: pretty-format-json + - id: trailing-whitespace + files: \.py$ +- repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3 diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..1ba065f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +language: python +matrix: + include: + - python: "3.6" + - python: "3.7" + - python: "3.8" + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true + allow_failures: + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true +install: + - pip3 install . +before_script: + - pip3 install coverage + - pip3 install coveralls + - if ! $NO_MYPY; then pip3 install mypy; fi +script: + - coverage run --source mavecore -m unittest + - if ! 
$NO_MYPY; then mypy mavecore tests; fi +after_success: + - coveralls \ No newline at end of file diff --git a/README.md b/README.md index 738a35b..7d984d5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,17 @@ # MaveCore Shared MaveDB and MaveTools functionality + +## Contributing + +To contribute to MaveCore development, please install the additional requirements: +``` +pip install -r requirements-dev.txt +``` + +To run the tests and generate an HTML coverage report use: +``` +coverage run -m unittest && coverage html +``` + +By default, the coverage report will be located at `htmlcov/index.html`. +Open this file in your browser to identify lines that have not been adequately covered by the test suite. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..92dd33a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..061f32f --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5926551 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,68 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + + +# -- Project information ----------------------------------------------------- + +project = "MaveCore" +copyright = "2022, Alan F Rubin" +author = "Alan F Rubin" + +# The full version, including alpha/beta/rc tags +release = "0.0.1" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.autosectionlabel", +] +nbsphinx_allow_errors = True + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Intersphinx information for documentation from other packages +intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pyramid" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["static"] + + +def setup(app): + app.add_css_file("styles.css") diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst new file mode 100644 index 0000000..7b39877 --- /dev/null +++ b/docs/source/dataset_validators.rst @@ -0,0 +1,8 @@ +dataset validation +================== + +Dataset validation contains validation code for datasets. + + +.. automodule:: mavecore.validation.dataset_validators + :members: diff --git a/docs/source/genome_validators.rst b/docs/source/genome_validators.rst new file mode 100644 index 0000000..e2745ca --- /dev/null +++ b/docs/source/genome_validators.rst @@ -0,0 +1,8 @@ +genome validation +================= + +Genome validation contains validation functions relating to wild type sequences, +reference genomes, target genes, reference maps, and genomic intervals. + +.. automodule:: mavecore.validation.genome_validators + :members: diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..64b7bb9 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,31 @@ +.. 
MaveCore documentation master file, created by + sphinx-quickstart on Tue Mar 1 14:20:15 2022. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +MaveCore +======== +MaveCore is a pure Python Module for bioinformatics and computational biology. +It features all the shared functionality of MaveDB and MaveTools. + +Install MaveCore using pip:: + + pip3 install MaveCore + +Building a local copy of the documentation requires the following additional packages:: + + pip3 install sphinx + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + validation + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/metadata_validators.rst b/docs/source/metadata_validators.rst new file mode 100644 index 0000000..11422c2 --- /dev/null +++ b/docs/source/metadata_validators.rst @@ -0,0 +1,8 @@ +metadata validation +=================== + +Metadata validation contains functions designed to check the validity of metadata relating to +sra, keyword, pubmed, doi, ensembl, uniprot, refseq and genome identifiers and lists. + +.. 
automodule:: mavecore.validation.metadata_validators + :members: diff --git a/docs/source/static/lato-fonts.css b/docs/source/static/lato-fonts.css new file mode 100644 index 0000000..f21b9cc --- /dev/null +++ b/docs/source/static/lato-fonts.css @@ -0,0 +1,108 @@ +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI9w2_FQft1dw.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI9w2_Gwft.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u8w4BMUTPHjxsAUi-qJCY.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u8w4BMUTPHjxsAXC-q.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI5wq_FQft1dw.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, 
U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI5wq_Gwft.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh7USSwaPGR_p.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh7USSwiPGQ.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6uyw4BMUTPHjx4wXg.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 700; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} diff --git a/docs/source/static/raleway-fonts.css b/docs/source/static/raleway-fonts.css new file mode 100644 index 0000000..1918a50 --- /dev/null +++ b/docs/source/static/raleway-fonts.css @@ -0,0 +1,360 @@ +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, 
U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, 
U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face 
{ + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCMPrEHJA.woff2) format('woff2'); + unicode-range: 
U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} diff --git a/docs/source/static/styles.css b/docs/source/static/styles.css new file mode 100644 index 0000000..4e7affd --- /dev/null +++ b/docs/source/static/styles.css @@ -0,0 +1,41 @@ +@import url("lato-fonts.css"); +@import url("raleway-fonts.css"); + +body { + font-family: "Lato", sans-serif; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: "Raleway", sans-serif; +} + +div.sphinxsidebar { + font-size: 0.85em; +} + +div.related { + font-size: 0.85em; +} + +div.sphinxsidebar h1, +div.sphinxsidebar h2, +div.sphinxsidebar h3, +div.sphinxsidebar h4 { + font-family: "Raleway", sans-serif; +} + +table caption { + margin-bottom: 10px; + font-family: "Raleway", sans-serif; +} + +figcaption span.caption-text, figcaption span.caption-number { + font-family: 
"Raleway", sans-serif; + font-style: normal; + font-weight: bold; +} diff --git a/docs/source/urn_validators.rst b/docs/source/urn_validators.rst new file mode 100644 index 0000000..26d5bb9 --- /dev/null +++ b/docs/source/urn_validators.rst @@ -0,0 +1,7 @@ +urn validation +============== + +Urn validation validates MaveDB urn values. + +.. automodule:: mavecore.validation.urn_validators + :members: diff --git a/docs/source/validation.rst b/docs/source/validation.rst new file mode 100644 index 0000000..0b0545b --- /dev/null +++ b/docs/source/validation.rst @@ -0,0 +1,13 @@ +Validation +========== +Validation features MAVE dataset validator functions applied in MaveTools and MaveDB. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + dataset_validators + genome_validators + metadata_validators + urn_validators + variant_validators diff --git a/docs/source/variant_validators.rst b/docs/source/variant_validators.rst new file mode 100644 index 0000000..433b064 --- /dev/null +++ b/docs/source/variant_validators.rst @@ -0,0 +1,7 @@ +variant validation +================== + +Variant validation contains functions to validate variants within a dataset. + +.. 
automodule:: mavecore.validation.variant_validators + :members: diff --git a/mavecore/__init__.py b/mavecore/__init__.py index e69de29..8b13789 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -0,0 +1 @@ + diff --git a/mavecore/validators/__init__.py b/mavecore/models/__init__.py similarity index 100% rename from mavecore/validators/__init__.py rename to mavecore/models/__init__.py diff --git a/mavecore/models/data.py b/mavecore/models/data.py new file mode 100644 index 0000000..b0d1800 --- /dev/null +++ b/mavecore/models/data.py @@ -0,0 +1,51 @@ +from pydantic import BaseModel, validator +from typing import List, Dict, Optional + +from .identifier import DoiIdentifier, PubmedIdentifier +from .target import TargetGene + +from mavecore.validation import keywords, urn +from mavecore.validation.utilities import to_camel + + +class DataSet(BaseModel): + title: str + short_description: str + abstract_text: str + method_text: str + extra_metadata: Optional[Dict] + keywords: Optional[List[str]] + + class Config: + alias_generator = to_camel + + @validator('keywords') + def validate_keywords(cls, v): + keywords.validate_keywords(v) + + +class Experiment(DataSet): + doi_identifiers: Optional[List[DoiIdentifier]] + pubmed_identifiers: Optional[List[PubmedIdentifier]] + + +class ScoreSet(DataSet): + data_usage_policy: str + licence_id: int + experiment_urn: str + superseded_scoreset_urn: Optional[str] + meta_analysis_source_scoreset_urns: Optional[List[str]] + doi_identifiers: Optional[List[DoiIdentifier]] + pubmed_identifiers: Optional[List[PubmedIdentifier]] + target_gene: TargetGene + + @validator('superseded_scoreset_urn', 'meta_analysis_source_scoreset_urns') + def validate_scoreset_urn(cls, v): + if type(v) == str: + urn.validate_mavedb_urn_scoreset(v) + else: + [urn.validate_mavedb_urn_scoreset(s) for s in v] + + @validator('experiment_urn') + def validate_experiment_urn(cls, v): + urn.validate_mavedb_urn_experiment(v) diff --git 
a/mavecore/models/identifier.py b/mavecore/models/identifier.py new file mode 100644 index 0000000..13c748e --- /dev/null +++ b/mavecore/models/identifier.py @@ -0,0 +1,64 @@ +from pydantic import BaseModel, validator, root_validator +from typing import Optional + +from mavecore.validation import identifier as id +from mavecore.validation.utilities import to_camel + + +class Identifier(BaseModel): + identifier: str + + class Config: + alias_generator = to_camel + + +class DoiIdentifier(Identifier): + + @validator('identifier') + def must_be_valid_doi(cls, v): + id.validate_doi_identifier(v) + + +class PubmedIdentifier(Identifier): + + @validator('identifier') + def must_be_valid_pubmed(cls, v): + id.validate_pubmed_identifier(v) + + +'''class ExternalIdentifierId(BaseModel): + dbname: str + identifier: str + + @root_validator(pre=True) + def check_passwords_match(cls, values): + print(values.get("dbname")) + # TODO resolve errors when using root_validator + #TODO confirm what valid dbname(s) are + #dbname, dbid = values.get('dbname'), values.get('identifier') + #print(dbname) + #print(dbid) + #if dbname == "sra": + # identifier.validate_sra_identifier(dbid) + #elif dbname == "ensembl": + # identifier.validate_ensembl_identifier(dbid) + #elif dbname == "uniprot": + # identifier.validate_uniprot_identifier(dbid) + #elif dbname == "refseq": + # identifier.validate_refseq_identifier(dbid) + #elif dbname == "genome": + # identifier.validate_genome_identifier(dbid) + #else: + # raise ValidationError("dbname must be valid dbname from this list: ")''' + + +class ExternalIdentifier(BaseModel): + identifier: dict + offset: Optional[int] + + class Config: + alias_generator = to_camel + + @validator('identifier') + def validate_identifier(cls, v): + id.validate_external_identifier(v) diff --git a/mavecore/models/map.py b/mavecore/models/map.py new file mode 100644 index 0000000..bbad61a --- /dev/null +++ b/mavecore/models/map.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel 
+ +from mavecore.validation.utilities import to_camel + + +class ReferenceMap(BaseModel): + genome_id: int + target_id: int + + class Config: + alias_generator = to_camel diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py new file mode 100644 index 0000000..7c2399e --- /dev/null +++ b/mavecore/models/sequence.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, validator + +from mavecore.validation import target +from mavecore.validation.utilities import to_camel + + +class WildType(BaseModel): + sequence_type: str + sequence: str + + class Config: + alias_generator = to_camel + + @validator('sequence_type') + def validate_category(cls, v): + target.validate_sequence_category(v) + + @validator('sequence') + def validate_sequence(cls, v): + target.validate_target_sequence(v) diff --git a/mavecore/models/target.py b/mavecore/models/target.py new file mode 100644 index 0000000..d246337 --- /dev/null +++ b/mavecore/models/target.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel, validator +from typing import List, Optional + +from .map import ReferenceMap +from .sequence import WildType + +from mavecore.validation import target +from mavecore.models.identifier import ExternalIdentifier +from mavecore.validation.utilities import to_camel + + +class TargetGene(BaseModel): + name: str + category: str + external_identifiers: List[ExternalIdentifier] + reference_maps: List[ReferenceMap] + wt_sequence: WildType + + class Config: + alias_generator = to_camel + + @validator('category') + def validate_category(cls, v): + target.validate_target_category(v) diff --git a/tests/test_validators/__init__.py b/mavecore/validation/__init__.py similarity index 100% rename from tests/test_validators/__init__.py rename to mavecore/validation/__init__.py diff --git a/tests/test_validators/test_variant_validators/__init__.py b/mavecore/validation/constants/__init__.py similarity index 100% rename from tests/test_validators/test_variant_validators/__init__.py rename to 
mavecore/validation/constants/__init__.py diff --git a/mavecore/validation/constants/conversion.py b/mavecore/validation/constants/conversion.py new file mode 100644 index 0000000..37bc20f --- /dev/null +++ b/mavecore/validation/constants/conversion.py @@ -0,0 +1,42 @@ +aa_dict_key_1 = { + "A": "Ala", "C": "Cys", "D": "Asp", "E": "Glu", "F": "Phe", + "G": "Gly", "H": "His", "I": "Ile", "K": "Lys", "L": "Leu", + "M": "Met", "N": "Asn", "P": "Pro", "Q": "Gln", "R": "Arg", + "S": "Ser", "T": "Thr", "V": "Val", "W": "Trp", "Y": "Tyr", + "*": "Ter" +} # what is Z? "X":"Ter", "WTSYN":""} + +aa_dict_key_3 = { + "Ala": "A", "Cys": "C", "Asp": "D", "Glu": "E", "Phe": "F", + "Gly": "G", "His": "H", "Ile": "I", "Lys": "K", "Leu": "L", + "Met": "M", "Asn": "N", "Pro": "P", "Gln": "Q", "Arg": "R", + "Ser": "S", "Thr": "T", "Val": "V", "Trp": "W", "Tyr": "Y", + "Ter": "*" +} + +codon_dict_DNA = { + # T + 'TTT': 'Phe', 'TCT': 'Ser', 'TAT': 'Tyr', 'TGT': 'Cys', # TxT + 'TTC': 'Phe', 'TCC': 'Ser', 'TAC': 'Tyr', 'TGC': 'Cys', # TxC + 'TTA': 'Leu', 'TCA': 'Ser', 'TAA': 'Ter', 'TGA': 'Ter', # TxA + 'TTG': 'Leu', 'TCG': 'Ser', 'TAG': 'Ter', 'TGG': 'Trp', # TxG + + # C + 'CTT': 'Leu', 'CCT': 'Pro', 'CAT': 'His', 'CGT': 'Arg', # CxT + 'CTC': 'Leu', 'CCC': 'Pro', 'CAC': 'His', 'CGC': 'Arg', # CxC + 'CTA': 'Leu', 'CCA': 'Pro', 'CAA': 'Gln', 'CGA': 'Arg', # CxA + 'CTG': 'Leu', 'CCG': 'Pro', 'CAG': 'Gln', 'CGG': 'Arg', # CxG + + # A + 'ATT': 'Ile', 'ACT': 'Thr', 'AAT': 'Asn', 'AGT': 'Ser', # AxT + 'ATC': 'Ile', 'ACC': 'Thr', 'AAC': 'Asn', 'AGC': 'Ser', # AxC + 'ATA': 'Ile', 'ACA': 'Thr', 'AAA': 'Lys', 'AGA': 'Arg', # AxA + 'ATG': 'Met', 'ACG': 'Thr', 'AAG': 'Lys', 'AGG': 'Arg', # AxG + + # G + 'GTT': 'Val', 'GCT': 'Ala', 'GAT': 'Asp', 'GGT': 'Gly', # GxT + 'GTC': 'Val', 'GCC': 'Ala', 'GAC': 'Asp', 'GGC': 'Gly', # GxC + 'GTA': 'Val', 'GCA': 'Ala', 'GAA': 'Glu', 'GGA': 'Gly', # GxA + 'GTG': 'Val', 'GCG': 'Ala', 'GAG': 'Glu', 'GGG': 'Gly' # GxG +} + diff --git 
a/mavecore/validation/constants/general.py b/mavecore/validation/constants/general.py new file mode 100644 index 0000000..3b0abe0 --- /dev/null +++ b/mavecore/validation/constants/general.py @@ -0,0 +1,53 @@ +# valid data usage policies + +import re + +""" +Null Constant definitions +""" +NA_STRING = "NA" +null_values_list = ( + "nan", + "na", + "none", + "", + "undefined", + "n/a", + "null", + "nil", + "-", + None, +) +# enforce the assumption that these are all lowercase values +null_values_list = [s.lower() for s in null_values_list if s is not None] +# add the NA_STRING only if it's not already in the list +if NA_STRING.lower() not in null_values_list: + null_values_list.append(NA_STRING.lower()) +null_values_list.sort() + +null_values_re = re.compile( + r"^\s+$|" + "|".join(f"^{s}$" for s in null_values_list if len(s)), + flags=re.IGNORECASE, +) + +readable_null_values_list = [f"{s}" for s in null_values_list] + ["whitespace"] + +hgvs_nt_column = "hgvs_nt" +hgvs_splice_column = "hgvs_splice" +hgvs_pro_column = "hgvs_pro" +hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) +meta_data = "meta_data" +score_columns = "score_columns" +count_columns = "count_columns" +variant_score_data = "score_data" +variant_count_data = "count_data" +required_score_column = "score" + +valid_dataset_columns = [score_columns, count_columns] +valid_variant_columns = [variant_score_data, variant_count_data] + +variant_to_scoreset_column = { + variant_score_data: score_columns, + variant_count_data: count_columns, +} +scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} diff --git a/mavecore/validation/constants/identifier.py b/mavecore/validation/constants/identifier.py new file mode 100644 index 0000000..100dedc --- /dev/null +++ b/mavecore/validation/constants/identifier.py @@ -0,0 +1 @@ +valid_dbnames = ["UniProt", "RefSeq", "Ensembl"] diff --git a/mavecore/validation/constants/keywords.py 
b/mavecore/validation/constants/keywords.py new file mode 100644 index 0000000..4abaab2 --- /dev/null +++ b/mavecore/validation/constants/keywords.py @@ -0,0 +1 @@ +# valid keywords \ No newline at end of file diff --git a/mavecore/validation/constants/target.py b/mavecore/validation/constants/target.py new file mode 100644 index 0000000..6183e2a --- /dev/null +++ b/mavecore/validation/constants/target.py @@ -0,0 +1,2 @@ +valid_categories = ["Protein coding", "Regulatory", "Other noncoding"] +valid_sequence_types = ["Infer", "DNA", "Protein"] diff --git a/mavecore/validation/constants/urn.py b/mavecore/validation/constants/urn.py new file mode 100644 index 0000000..99c816a --- /dev/null +++ b/mavecore/validation/constants/urn.py @@ -0,0 +1,55 @@ +import re + +MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 +#MAVEDB_TMP_URN_DIGITS = 16 +MAVEDB_URN_MAX_LENGTH = 64 +MAVEDB_URN_NAMESPACE = "mavedb" + + +# Temp URN patterns +# --------------------------------------------------------------------------- # +#TODO get tmp pattern from UUID4 regex +#MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( +# width=MAVEDB_TMP_URN_DIGITS +#) +#MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) + + +# Experimentset Pattern/Compiled RE +MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( + namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS +) +MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) + +# Experiment Pattern/Compiled RE +MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( + pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] +) +MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) + +# Scoreset Pattern/Compiled RE +MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( + pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] +) +MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) + +# Variant Pattern/Compiled RE +MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( + 
pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] +) +MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) + +# Any Pattern/Compiled RE +MAVEDB_ANY_URN_PATTERN = "|".join( + [ + r"({pattern})".format(pattern=p) + for p in ( + MAVEDB_EXPERIMENTSET_URN_PATTERN, + MAVEDB_EXPERIMENT_URN_PATTERN, + MAVEDB_SCORESET_URN_PATTERN, + MAVEDB_VARIANT_URN_PATTERN, + #MAVEDB_TMP_URN_PATTERN, + ) + ] +) +MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py new file mode 100644 index 0000000..c6fe168 --- /dev/null +++ b/mavecore/validation/dataframe.py @@ -0,0 +1,379 @@ +from numpy.testing import assert_array_equal +from pandas.testing import assert_frame_equal +from mavehgvs.variant import Variant +import numpy as np + +from mavecore.validation.constants.general import ( + readable_null_values_list, + hgvs_nt_column, + hgvs_pro_column, + hgvs_splice_column, + required_score_column +) +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro, is_null +from mavecore.validation.target import validate_target_sequence + +# handle with pandas all null strings +# provide a csv or a pandas dataframe +# take dataframe, output as csv to temp directory, use standard library + + +def validate_dataframes(target_seq: str, scores, counts=None): + """ + Validates scores and counts dataframes for MaveDB upload. This function performs + comprehensive validation. + + Parameters + __________ + scores : pandas.DataFrame + The scores data as a pandas dataframe. + counts : pandas.DataFrame + The counts data as a pandas dataframe. + + Raises + ______ + ValidationError + If any of the validation fails. 
+ """ + validate_no_null_columns_or_rows(scores) + scores = validate_column_names(scores) + validate_values_by_column(scores, target_seq) + if counts is not None: + validate_no_null_columns_or_rows(counts) + counts = validate_column_names(counts, scores=False) + validate_values_by_column(counts, target_seq) + validate_dataframes_define_same_variants(scores, counts) + + +def validate_no_null_columns_or_rows(dataframe): + # TODO: we may not need this - current datasets exist where all values are None + """ + Checks that there are no null columns or rows in the dataframe. Note that a null + column may still have a valid column name. + + Parameters + __________ + dataframe : pandas.DataFrame + The scores or counts dataframe being validated + + Raises + ______ + ValidationError + If there are null columns or rows in the dataframe + """ + # first drop any columns where null columns are allowed + if hgvs_nt_column: + dataframe = dataframe.drop([hgvs_nt_column], axis=1) + if hgvs_pro_column: + dataframe = dataframe.drop([hgvs_pro_column], axis=1) + if hgvs_splice_column: + dataframe = dataframe.drop([hgvs_splice_column], axis=1) + df = dataframe.dropna(axis=0, how='all') + df = df.dropna(axis=1, how='all') + try: + assert_frame_equal(df, dataframe) + except AssertionError: + raise ValidationError("Dataset should not contain null columns or rows.") + + +def validate_column_names(dataframe, scores=True): + # TODO: return errors to user regarding column name ordering + """ + This function validates the columns in a dataframe. The first columns should be + an hgvs column such as hgvs_nt, hgvs_pro, and hgvs_splice. There should be at least + one column beyond the hgvs columns. A scores dataframe should have a score column and + a counts dataframe should have a counts column. There should not be any null columns. + The column names will also be validated against unusual file conversions that could + corrupt the column names. 
+ + Parameters + __________ + dataframe : pandas.DataFrame + The scores or counts dataframe to be validated. + + Raises + ______ + ValidationError + If the column names are not formatted correctly. + """ + # get columns from dataframe + columns = dataframe.columns + + # check that there are no duplicate column names + if len(columns) != len(set(columns)): + raise ValidationError("There cannot be duplicate column names.") + + # count instances of hgvs columns + count = 0 + # note presence of different columns + hgvs_nt = False + hgvs_pro = False + hgvs_splice = False + score_column = False + for i in range(len(columns)): + # there should not be any null columns + # check for empty strings, np.nan, and None + # if is_null(columns[i]) or columns[i] is None: + if not isinstance(columns[i], str) or columns[i] == "" or columns[i].isspace(): + # above condition will check that value is not None or np.nan also + raise ValidationError("Column names must not be null.") # in readable_null_values_list: + if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: + count += 1 + # mark what type of column the current column is + if columns[i] == hgvs_nt_column: + hgvs_nt = True + elif columns[i] == hgvs_pro_column: + hgvs_pro = True + elif columns[i] == hgvs_splice_column: + hgvs_splice = True + elif columns[i] == required_score_column: + score_column = True + # check for uppercase and raise error + elif (columns[i] == hgvs_nt_column.upper() or + columns[i] == hgvs_pro_column.upper() or + columns[i] == hgvs_splice_column.upper() or + columns[i] == required_score_column.upper()): + raise ValidationError("hgvs columns and score column should be lowercase.") + + # there should be at least one of hgvs_nt or hgvs_pro column + # if count == 0: + if not hgvs_nt and not hgvs_pro: + raise ValidationError("Must include hgvs_nt or hgvs_pro column.") # or hgvs_splice column.") + + # splice should not be defined in nt is not + if hgvs_splice and not hgvs_nt: + raise 
ValidationError("Must define hgvs_nt column if defining hgvs_splice column.") + + # first columns should be hgvs columns, reorder columns to meet this requirement + if score_column: + score = dataframe.pop(required_score_column) + dataframe.insert(0, required_score_column, score) + if hgvs_splice: + splice_column = dataframe.pop(hgvs_splice_column) + dataframe.insert(0, hgvs_splice_column, splice_column) + if hgvs_pro: + pro_column = dataframe.pop(hgvs_pro_column) + dataframe.insert(0, hgvs_pro_column, pro_column) + if hgvs_nt: + nt_column = dataframe.pop(hgvs_nt_column) + dataframe.insert(0, hgvs_nt_column, nt_column) + + # there should be at least one additional column beyond the hgvs columns + if len(columns) == count: + raise ValidationError("There must be at least one additional column beyond the hgvs columns.") + + # if dataframe is a scores df make sure it has a score column + # also make sure counts df has a counts column and not a score column + if scores and not score_column: + raise ValidationError("A scores dataframe must include a `score` column.") + if not scores and score_column: + raise ValidationError("A counts dataframe should not include a `score` column, include `score` " + "column in a scores dataframe.") + + return dataframe + + +def validate_values_by_column(dataset, target_seq: str): + """ + Validates that the values in each column labeled `hgvs_nt`, `hgvs_pro`, `hgvs_splice`, and `score` make sense + with regards to their column name. It also validates via a helper function that if both an `hgvs_nt` column and + an `hgvs_pro` column exist, they are consistent with one another. + + Parameters + __________ + dataset : pandas.DataFrame + A scores or counts dataframe. + target_seq: str + The hgvs column name from which the variants parameter originates. + + Raises + ______ + ValidationError + If the target sequence does not contain solely the bases ACTG. 
+ ValidationError + If any variant fails validation or if the variants are not consistent with one another. + """ + # first check that dataframe is not empty + if dataset.empty: + raise ValidationError("Dataset must not be empty.") + + # validate target sequence + validate_target_sequence(target_seq) + + # first check the column names, establish the order or the hgvs and score columns + hgvs_nt = False + hgvs_pro = False + hgvs_splice = False + score = False + for column in dataset.columns: + if column == hgvs_nt_column: + hgvs_nt = True + elif column == hgvs_pro_column: + hgvs_pro = True + elif column == hgvs_splice_column: + hgvs_splice = True + elif column == required_score_column: + score = True + else: + raise ValidationError("Missing required hgvs and/or score columns.") + + # check that the first column, hgvs_nt or hgvs_pro, is valid + if hgvs_nt: + validate_index_column(dataset["hgvs_nt"], hgvs="nt") + elif hgvs_pro: + validate_index_column(dataset["hgvs_pro"], hgvs="pro") + else: + raise ValidationError("Must include either hgvs_nt or hgvs_pro column.") + + # check that prefixes all match and are consistent with one another + hgvs_nt_prefix = None + + # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another + for i in range(len(dataset)): + if hgvs_nt and dataset.loc[i, hgvs_nt_column] is not None: + validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], + column="nt", + targetseq=target_seq, + splice_present=hgvs_splice) + if hgvs_nt_prefix: + if Variant(dataset.loc[i, hgvs_nt_column]).prefix != hgvs_nt_prefix: + raise ValidationError("All prefixes within the hgvs_nt column must be the same.") + # if prefix is non-coding, there should not be an hgvs_pro value + if hgvs_nt_prefix == "n": + if hgvs_pro_column and dataset.loc[i, hgvs_pro_column] is not None: + raise ValidationError("Cannot have hgvs_pro value with non-coding hgvs_nt value.") + else: # assign the prefix value since it has not yet been assigned 
+ hgvs_nt_prefix = Variant(dataset.loc[i, hgvs_nt_column]).prefix + if hgvs_pro and dataset.loc[i, hgvs_pro_column] is not None: + validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], + column="p", + targetseq=target_seq, + splice_present=hgvs_splice) + if hgvs_splice and dataset.loc[i, hgvs_splice_column] is not None: + validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], + column="splice", + targetseq=target_seq, + splice_present=hgvs_splice) + if hgvs_nt_prefix != 'g': + raise ValidationError("hgvs_nt prefix must be genomic when splice present.") + if score: + s = validate_score(dataset.loc[i, required_score_column]) + dataset.loc[i, required_score_column] = s + if hgvs_nt and hgvs_pro and dataset.loc[i, hgvs_nt_column] is not None and dataset.loc[i, hgvs_pro_column] is not None: + # TODO: ensure this function is implemented correctly before applying, complete unit testing + if not Variant(dataset.loc[i, hgvs_pro_column]).is_multi_variant(): # can only convert to single hgvs_pro variants + '''validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, + nt=dataset.loc[i, hgvs_nt_column], + pro=dataset.loc[i, hgvs_pro_column], + row=i)''' + + # check that primary column, whether hgvs_nt or hgvs_pro, does not contain None values + # make sure target seq is the right type + # no protein target with just nt variants + + +def validate_index_column(column, hgvs: str): + """ + Validates the first column in a dataframe, should be hgvs_nt or hgvs_pro. All values in the column should be + unique and there should be no missing values. + + Parameters + __________ + column : list + The column that will be validated. + hgvs : str + Indicates whether or not the column is an hgvs_nt or hgvs_pro column. Can have value "nt" or "pro". + + Raises + ______ + ValidationError + If there are duplicate values in the column. + ValidationError + If there are missing values in the column. 
+ """ + col_set = set(column) + if len(col_set) != len(column): + raise ValidationError( + "Each value in hgvs_'{}' column must be unique.".format(hgvs) + ) + if np.nan in col_set: + raise ValidationError( + "Primary column (hgvs_'{}') must not contain missing values.".format(hgvs) + ) + + +def validate_score(score): + if isinstance(score, float) or isinstance(score, int): + score = float(score) + else: + raise ValidationError("Each value in score column must by a float. " + "'{}' has the type '{}'.".format(score, type(score).__name__)) + return score + + +def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str, pro: str, row: int): + """ + Checks that, when both an `hgvs_nt` and an `hgvs_pro` exist, the variant strings within + those columns are representing the same change. + + Parameters + __________ + target_seq : str + The target sequence associated withe variants. + nt : str + The hgvs_nt string. + pro : str + The hgvs_pro string. + row : int + The row that the current hgvs strings being evaluated are in. + + Raises + ______ + ValidationError + If the variants do not represent the same change. + """ + # TODO think about how double quoted variants are handled here (e.g., "c.[123A>G;124A>G]") + nt_converted = convert_hgvs_nt_to_hgvs_pro(nt, target_seq) + # compare nt_converted with pro + if nt_converted != pro: + raise ValidationError("The hgvs_nt variant {} and the hgvs_pro variant {} on row {} do not represent the " + "same change.".format(nt, pro, row)) + + +def validate_dataframes_define_same_variants(scores, counts): + """ + Checks if two `pd.DataFrame` objects parsed from uploaded files + define the same variants. + + Parameters + ---------- + scores : pandas.DataFrame + Scores dataframe parsed from an uploaded scores file. + counts : pandas.DataFrame + Scores dataframe parsed from an uploaded counts file. + + Raises + ______ + ValidationError + If score and counts files do not define the same variants. 
+ """ + try: + assert_array_equal( + scores[hgvs_nt_column].sort_values().values, + counts[hgvs_nt_column].sort_values().values, + ) + assert_array_equal( + scores[hgvs_splice_column].sort_values().values, + counts[hgvs_splice_column].sort_values().values, + ) + assert_array_equal( + scores[hgvs_pro_column].sort_values().values, + counts[hgvs_pro_column].sort_values().values, + ) + except AssertionError: + raise ValidationError( + "Your score and counts files do not define the same variants. " + "Check that the hgvs columns in both files match." + ) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py new file mode 100644 index 0000000..716ae59 --- /dev/null +++ b/mavecore/validation/dataset.py @@ -0,0 +1,48 @@ +import json + +from mavecore.models.data import Experiment, ScoreSet + + +def validate_experiment(experiment: dict): + """ + Validates an experiment represented as a dictionary. Validation is handled via pydantic. A valid dictionary is + returned upon validation. If extra or duplicate keys are included, those fields are excluded from the returned + dictionary. If required keys are missing or any keys contain incorrect values, an error is raised. + + Parameters + __________ + experiment : dict + The experiment dictionary that will be validated. + + Raises + ______ + ValueError + If required keys are missing or any keys contain incorrect values. + """ + try: + return json.loads(Experiment.parse_obj(experiment).json()) + except ValueError as e: + print(e) + + +def validate_scoreset(scoreset: dict): + """ + Validates a scoreset represented as a dictionary (Note: this does not validate dataframes, look to dataframe.py + for that validation code). Validation is handled via pydantic. A valid dictionary is returned upon validation. + If extra or duplicate keys are included, those fields are excluded from the returned dictionary. If required keys + are missing or any keys contain incorrect values, an error is raised. 
+ + Parameters + __________ + experiment : dict + The scoreset dictionary that will be validated. + + Raises + ______ + ValueError + If required keys are missing or any keys contain incorrect values. + """ + try: + return json.loads(ScoreSet.parse_obj(scoreset).json()) + except ValueError as e: + print(e) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py new file mode 100644 index 0000000..0d18a8b --- /dev/null +++ b/mavecore/validation/exceptions.py @@ -0,0 +1,5 @@ +NON_FIELD_ERRORS = "__all__" + + +class ValidationError(ValueError, AssertionError): + None diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py new file mode 100644 index 0000000..9e6c693 --- /dev/null +++ b/mavecore/validation/identifier.py @@ -0,0 +1,307 @@ +import idutils + +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.utilities import is_null +from mavecore.validation.constants.identifier import valid_dbnames + + +def validate_external_identifier(identifier: dict): + """ + Validates an external identifier represented as a dictionary. The dictionary should have a length of 2 + and have the keys `dbname` and `identifier`, both with str values. The valid values for these keys are + stored in lists within the identifier file in constants directory. + + Parameters + __________ + identifier : dict + The identifier to be validated. + + Raises + ______ + ValidationError + If the length of the dictionary is not 2. + ValidationError + If the keys do not have the correct name. + ValidationError + If the `dbname` value is not valid. + ValidationError + If the `identifier` value is not correct as it relates to the `dbname` value. 
+ """ + # check that identifier dict only has two keys + if len(identifier) != 2: + raise ValidationError("The identifier attribute of the external identifier should have two keys, `dbname` " + "and `identifier`.") + + # check that the keys are the right name + if "dbname" not in identifier: + raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " + "and `identifier`.") + if "identifier" not in identifier: + raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " + "and `identifier`.") + + # check that dbname is valid + if identifier.get("dbname") not in valid_dbnames: + raise ValidationError(f"The `dbname` key within the identifier attribute of the external identifier should " + f"take one of the following values: {valid_dbnames}.") + + # validate identifier based on dbname: could be one of UniProt, RefSeq, or Ensembl + if identifier.get("dbname") == "UniProt": + validate_uniprot_identifier(identifier.get("identifier")) + elif identifier.get("dbname") == "RefSeq": + validate_refseq_identifier(identifier.get("identifier")) + elif identifier.get("dbname") == "Ensembl": + validate_ensembl_identifier(identifier.get("identifier")) + + +def validate_sra_identifier(identifier: str): + """ + Validates whether the identifier is a valid SRA identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid SRA identifier. + """ + if not ( + idutils.is_sra(identifier) + or idutils.is_bioproject(identifier) + or idutils.is_geo(identifier) + or idutils.is_arrayexpress_array(identifier) + or idutils.is_arrayexpress_experiment(identifier) + ): + raise ValidationError( + f"'{identifier} is not a valid SRA, GEO, ArrayExpress or BioProject " + "accession." 
+ ) + + +def validate_pubmed_identifier(identifier: str): + """ + Validates whether the identifier is a valid PubMed identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid PubMed identifier. + """ + if not idutils.is_pmid(identifier): + #raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") + raise ValidationError("{} is not a valid PubMed identifier.".format(identifier)) + + +def validate_doi_identifier(identifier: str): + """ + Validates whether the identifier is a valid DOI identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid DOI identifier. + """ + if not idutils.is_doi(identifier): + #raise ValidationError(f"'{identifier}' is not a valid DOI.") + raise ValidationError("{} is not a valid DOI identifier.".format(identifier)) + + +def validate_ensembl_identifier(identifier: str): + """ + Validates whether the identifier is a valid Ensembl identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid Ensembl identifier. + """ + if not idutils.is_ensembl(identifier): + raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") + + +def validate_uniprot_identifier(identifier: str): + """ + Validates whether the identifier is a valid UniProt identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid UniProt identifier. + """ + if not idutils.is_uniprot(identifier): + raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") + + +def validate_refseq_identifier(identifier: str): + """ + Validates whether the identifier is a valid RefSeq identifier. 
+ + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid RefSeq identifier. + """ + if not idutils.is_refseq(identifier): + raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") + + +def validate_genome_identifier(identifier: str): + """ + Validates whether the identifier is a valid genome identifier. + + Parameters + __________ + identifier : str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid genome identifier. + """ + if not idutils.is_genome(identifier): + raise ValidationError( + f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." + ) + + +def validate_pubmed_list(values: list[str]): + """ + Validates whether each identifier in a list of identifiers (values) is a valid PubMed identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid PubMed identifier. + """ + for value in values: + if not is_null(value): + validate_pubmed_identifier(value) + + +def validate_sra_list(values: list[str]): + """ + Validates whether each identifier in a list of identifiers (values) is a valid SRA identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid SRA identifier. + """ + for value in values: + if not is_null(value): + validate_sra_identifier(value) + + +def validate_doi_list(values: list[str]): + """ + Validates whether each identifier in a list of identifiers (values) is a valid DOI identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid DOI identifier. 
+ """ + for value in values: + if not is_null(value): + validate_doi_identifier(value) + + +def validate_ensembl_list(values: list[str]): + """ + Validates whether each identifier in a list of identifiers (values) is a valid Ensembl identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid Ensemble identifier. + """ + for value in values: + if not is_null(value): + validate_ensembl_identifier(value) + + +def validate_refseq_list(values: list[str]): + """ + Validates whether each identifier in a list of identifiers (values) is a valid RefSeq identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid RefSeq identifier. + """ + for value in values: + if not is_null(value): + validate_refseq_identifier(value) + + +def validate_uniprot_list(values: list[str]): + """ + Validates whether each identifer in a list of identifiers (values) is a valid UniProt identifier. + + Parameters + __________ + identifier : list[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid UniProt identifier. + """ + for value in values: + if not is_null(value): + validate_uniprot_identifier(value) diff --git a/mavecore/validation/keywords.py b/mavecore/validation/keywords.py new file mode 100644 index 0000000..2d8cd8f --- /dev/null +++ b/mavecore/validation/keywords.py @@ -0,0 +1,46 @@ +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.utilities import is_null + + +def validate_keywords(keywords: list[str]): + """ + Validates a list of keywords. + + Parameters + __________ + keywords: list[str] + A list of keywords. 
def validate_keyword(keyword: str):
    """
    This function validates whether or not the keyword parameter is valid by
    checking that it is a string that is not null. If keyword is null
    or is not a string, an error is raised.

    Parameters
    __________
    keyword : str
        The keyword to be validated.

    Raises
    ______
    ValidationError
        If the keyword argument is not a valid non-null string.
    """
    if is_null(keyword) or not isinstance(keyword, str):
        raise ValidationError("{} not a valid keyword. Keywords must be non null strings.".format(keyword))
def validate_target_sequence(target_seq: str):
    """
    Validates a target sequence. The sequence must consist only of the bases
    ACTG and its length must be a multiple of 3.

    Parameters
    __________
    target_seq : str
        The target sequence that will be validated.

    Raises
    ______
    ValidationError
        If the target sequence contains characters other than ACTG or if the
        length of the sequence is not a multiple of 3.
    """
    # Reject any character outside the four DNA bases (case-sensitive,
    # matching the original membership test against "ACTG").
    if not all(base in "ACTG" for base in target_seq):
        raise ValidationError("target_seq is invalid, must be composed only of bases ACTG.")
    # A coding target sequence must be an exact number of codons.
    if len(target_seq) % 3 != 0:
        raise ValidationError("target_seq is invalid, length must be a multiple of three.")
def validate_mavedb_urn_experiment(urn: str):
    """
    This function validates an Experiment urn and raises an error if it is not valid.

    Temporary urns have the form ``tmp:<uuid>`` and are validated by parsing
    the portion after the prefix as a UUID; all other urns must match
    MAVEDB_EXPERIMENT_URN_RE.

    Parameters
    __________
    urn : str
        The Experiment urn to be validated.

    Raises
    ______
    ValidationError
        If the Experiment urn is not valid.
    """
    if urn.startswith("tmp:"):
        try:
            # Everything after the 4-character "tmp:" prefix must parse as a UUID.
            uuid.UUID(urn[4:])
        except ValueError:
            # Fixed: the original message said "Experiment Set urn" here,
            # copy-pasted from the experiment-set validator.
            raise ValidationError(
                "{}'s is not a valid Experiment urn.".format(urn)
            )
    else:
        if not MAVEDB_EXPERIMENT_URN_RE.match(urn):
            raise ValidationError(
                "{}'s is not a valid Experiment urn.".format(urn)
            )
def to_camel(string: str) -> str:
    """
    Convert a snake_case string to camelCase.

    Parameters
    __________
    string : str
        The snake_case string to be converted.

    Returns
    _______
    str
        The camelCase equivalent of the input; an empty input is returned
        unchanged.
    """
    camel = "".join(word.capitalize() for word in string.split("_"))
    # Fixed: the original unconditionally indexed camel[0], which raises
    # IndexError when the input is the empty string.
    if not camel:
        return camel
    return camel[0].lower() + camel[1:]
def generate_hgvs(prefix: str = "c") -> str:
    """
    Generates a random hgvs string from a small sample.
    """
    position = choice(range(1, 100))
    if prefix == "p":
        # Subset of 3-letter amino acid codes, chosen at random.
        amino_acids = [
            "Ala",
            "Leu",
            "Gly",
            "Val",
            "Tyr",
            "Met",
            "Cys",
            "His",
            "Glu",
            "Phe",
        ]
        wild_type = choice(amino_acids)
        mutant = choice(amino_acids)
        return f"{prefix}.{wild_type}{position}{mutant}"
    # Nucleotide-level variant: a single base substitution.
    bases = "ATCG"
    reference = choice(bases)
    alternate = choice(bases)
    return f"{prefix}.{position}{reference}>{alternate}"
def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str):
    # TODO note that this only works for codon changes and single mutants
    """
    This function takes a hgvs_nt variant string and its associated target
    sequence and returns a validated hgvs_pro equivalent.

    The conversion locates the codon containing the nucleotide change,
    rebuilds the variant codon from the substituted bases, translates both
    the reference and variant codons to 3-letter amino acid codes, and
    delegates to construct_hgvs_pro for the final string.

    Parameters
    __________
    hgvs_nt : str
        The hgvs_nt string that will be converted.
    target_seq : str
        The target sequence associated with the hgvs_nt variant. Assumed to
        be an in-frame coding sequence (codon boundaries at multiples of 3)
        — TODO confirm with callers.

    Returns
    _______
    str
        The hgvs_pro string produced by construct_hgvs_pro.

    Raises
    ______
    TypeError
        If target_seq is not string.
    ValueError
        If target_seq is not made solely of characters ACTG.
    """
    # check that the hgvs_nt variant is valid with regards to the target sequence
    # NOTE(review): validation via validate_hgvs_string is currently disabled.
    #validate_hgvs_string(value=hgvs_nt,
    #                     column="nt",
    #                     targetseq=target_seq)

    # check for TypeError
    # if target_seq is not string
    if not isinstance(target_seq, str):
        raise TypeError("target_seq must be string.")

    # check for ValueError
    # if target_seq is not made solely of characters ACTG
    check_chars = [letter in "ACTG" for letter in target_seq]
    if False in check_chars:
        raise ValueError("target_seq is invalid, must be composed only of bases ACTG.")

    # identify variant_position and get codon_number associated with it

    if _is_wild_type(hgvs_nt):  # variant_codon is wild-type
        codon_number = None
        target_codon = None
    else:  # any other variant change
        # instantiate Variant object
        variant = Variant(hgvs_nt)
        # get variant position and convert to int
        if type(variant.positions) == list:  # multiple positions values exist
            variant_position = int(str(variant.positions[0]))
        elif type(variant.positions) == tuple:
            variant_position = int(str(variant.positions[0]))
        else:  # only one value for positions
            variant_position = int(str(variant.positions))
        # now that we have the variant_position, get codon_number
        # (1-based; round(x + 0.5) is a ceiling on the codon index)
        codon_number = round((variant_position / 3) + 0.5)
        # use codon_number to get target_codon from target_seq
        target_codon = target_seq[(codon_number - 1) * 3: codon_number * 3]

    # declare variables for codon data
    # keep track of the number and location of the changes within the codon
    sub_one = None
    sub_two = None
    sub_three = None
    # keep track of the number and value of the changes within the codon
    sub_one_nuc = None
    sub_two_nuc = None
    sub_three_nuc = None
    # keep track of the full codon changes
    variant_codon = None

    # determine sequence of variant_codon

    if _is_wild_type(hgvs_nt):  # variant_codon is wild-type
        variant_codon = target_codon
        sub_one = None  # no nucleotide substitutions
    elif _is_deletion(hgvs_nt):  # target_codon was deleted
        variant_codon = None
        sub_one = None  # no nucleotide substitutions
    elif _is_substitution_one_base(
        hgvs_nt
    ):  # variant_codon has one nucleotide substitution
        # instantiate Variant object
        variant = Variant(hgvs_nt)
        # get index of nucleotide substitution
        # (position % 3 - 1 maps codon offsets to 0, 1, -1; -1 indexes the
        # third base and is handled explicitly when the codon is rebuilt)
        sub_one = int(str(variant.positions)) % 3 - 1
        # get nucleotide of substitution
        sub_one_nuc = variant.sequence[1]
        # set other possible indices for codon substitution to None
        sub_two = None
        sub_three = None
    elif _is_substitution_two_bases_nonadjacent(
        hgvs_nt
    ):  # variant has two nucleotide substitutions, non-adjacent
        # instantiate Variant object
        variant = Variant(hgvs_nt)
        # get indices of nucleotide substitutions
        sub_one = int(str(variant.positions[0])) % 3 - 1
        sub_two = int(str(variant.positions[1])) % 3 - 1
        # get nucleotides of substitutions
        sub_one_nuc = variant.sequence[0][1]
        sub_two_nuc = variant.sequence[1][1]
        # set other possible indices for codon substitution to None
        sub_three = None
    else:  # variant_codon has two or three adjacent nucleotide substitutions
        # instantiate Variant object
        variant = Variant(hgvs_nt)
        variant_codon = variant.sequence
        # get index of first codon substitution
        sub_one = int(str(variant.positions[0])) % 3 - 1
        # get string of substituted nucleotides
        sub_nucs = variant.sequence
        if (
            len(sub_nucs) == 2
        ):  # variant codon has two adjacent nucleotide substitutions
            # assign additional nucleotide substitution indices
            sub_two = sub_one + 1
            # get nucleotides of substitutions
            sub_one_nuc = sub_nucs[0]
            sub_two_nuc = sub_nucs[1]
            # set other possible indices for codon substitution to None
            sub_three = None
        else:  # variant has three adjacent nucleotide substitutions
            # assign additional nucleotide substitution indices
            sub_two = sub_one + 1
            sub_three = sub_two + 1
            # get nucleotides of substitutions
            sub_one_nuc = sub_nucs[0]
            sub_two_nuc = sub_nucs[1]
            sub_three_nuc = sub_nucs[2]

    # using data generated above (substituted nucleotides and indices in codon), construct variant_codon

    # only assign variant_codon if nucleotide substitution occurred
    if sub_one is not None:
        # declare and initialize variant_codon
        variant_codon = ""
        # set first nucleotide of variant_codon
        if sub_one == 0:
            variant_codon = variant_codon + sub_one_nuc
        else:
            variant_codon = variant_codon + target_codon[0]
        # set second nucleotide of variant_codon
        if sub_one == 1:
            variant_codon = variant_codon + sub_one_nuc
        elif sub_two == 1:
            variant_codon = variant_codon + sub_two_nuc
        else:
            variant_codon = variant_codon + target_codon[1]
        # set third nucleotide of variant_codon
        # (index -1 arises from the position % 3 - 1 arithmetic above and
        # also denotes the third base of the codon)
        if sub_one == -1 or sub_one == 2:
            variant_codon = variant_codon + sub_one_nuc
        elif sub_two == -1 or sub_two == 2:
            variant_codon = variant_codon + sub_two_nuc
        elif sub_three == -1 or sub_three == 2:
            variant_codon = variant_codon + sub_three_nuc
        else:
            variant_codon = variant_codon + target_codon[2]

    # convert to 3 letter amino acid code
    target_aa = codon_dict_DNA[target_codon]
    if variant_codon:
        variant_aa = codon_dict_DNA[variant_codon]
    else:
        variant_aa = None

    return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number, target_seq=target_seq)
def validate(dataset: dict, dataset_type: str, scores=None, counts=None):
    """
    This function validates data to be uploaded to MaveDB. Descriptive errors
    will be raised if any of the validation fails. Scores and counts are
    optional as this function accepts both experiments and scoresets.

    Parameters
    __________
    dataset : dict
        The scoreset or experiment to be uploaded. This will be cast into a pydantic object.
    dataset_type : str
        The type of dataset that the first argument is, either "experiments" or "scoresets".
    scores : Pandas.DataFrame
        The scores dataframe as a Pandas DataFrame.
    counts : Pandas.DataFrame
        The counts dataframe as a Pandas DataFrame.

    Raises
    ______
    ValidationError
        If the dataset fails model validation.
    ValueError
        If the dataset_type attribute is not a string that reads `experiments` or `scoresets`.
    """
    if dataset_type == "experiments":
        try:
            Experiment.parse_obj(dataset)
        except ValidationError as e:
            # Print the detailed report, then re-raise so the caller sees the
            # failure (the original swallowed the error after printing).
            print(e.json())
            raise
    elif dataset_type == "scoresets":
        try:
            ScoreSet.parse_obj(dataset)
        except ValidationError as e:
            print(e.json())
            raise
        # Only reached when the score set model is valid, so the nested keys
        # below are safe to index.
        target_seq = dataset["targetGene"]["wtSequence"]["sequence"]
        validate_dataframes(target_seq=target_seq, scores=scores, counts=counts)
    else:
        raise ValueError("The dataset_type must be a string that reads `experiments` or `scoresets`.")
+ + Parameters + __________ + value : Union[str, bytes] + column : Optional[str] = None + splice_present : bool = False + targetseq : Optional[str] = None + relaxed_ordering : bool = False + + Returns + _______ + Optional[str] + + Raises + ______ + ValidationError + If variant HGVS input values are not strings. + ValidationError + If value is _sy or _wt, which are no longer supported. + ValidationError + If + ValidationError + If value is not a genomic variant (prefix 'g.'). Nucleotide variants must + be genomic if transcript variants are also defined. + ValidationError + If value is not a transcript variant. The accepted transcript variant + prefixes are 'c.', 'n.'. + ValidationError + If value is not a protein variant. The accepted protein variant prefix is 'p.'. + ValueError + If there exists an unknown column. Function expects nt, splice or p." + """ if is_null(value): return None @@ -88,6 +116,6 @@ def validate_hgvs_string( return str(variant) -validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) -validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) -validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) +#validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) +#validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) +#validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) diff --git a/mavecore/validators/constants.py b/mavecore/validators/constants.py deleted file mode 100644 index 9a6b8fd..0000000 --- a/mavecore/validators/constants.py +++ /dev/null @@ -1,90 +0,0 @@ -import re - -""" -Null Constant definitions -""" -NA_value = "NA" -null_values_list = ( - "nan", - "na", - "none", - "", - "undefined", - "n/a", - "null", - "nil", - NA_value, -) - -null_values_re = re.compile( - r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_value), flags=re.IGNORECASE -) - -readable_null_values = [ - "'{}'".format(v) for v in set([v.lower() for v 
in null_values_list]) if v.strip() -] + ["whitespace"] - -""" -Sequence constants -""" -AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" -DNA_LETTERS = "ATCG" - -DNA_SEQ_PATTERN = fr"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = fr"[{AA_LETTERS}]+" - - -""" -Constant definitions for application `experiment`. -""" -from mavecore.validators.urn_validators import ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, -) - -hgvs_nt_column = "hgvs_nt" -hgvs_splice_column = "hgvs_splice" -hgvs_pro_column = "hgvs_pro" -hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) -meta_data = "meta_data" -score_columns = "score_columns" -count_columns = "count_columns" -variant_score_data = "score_data" -variant_count_data = "count_data" -required_score_column = "score" - -experimentset_url_pattern = "|".join( - [MAVEDB_EXPERIMENTSET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -experiment_url_pattern = "|".join( - [MAVEDB_EXPERIMENT_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -scoreset_url_pattern = "|".join( - [MAVEDB_SCORESET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) - -any_url_pattern = "|".join( - [experimentset_url_pattern, experiment_url_pattern, scoreset_url_pattern] -) - - -valid_dataset_columns = [score_columns, count_columns] -valid_variant_columns = [variant_score_data, variant_count_data] - -variant_to_scoreset_column = { - variant_score_data: score_columns, - variant_count_data: count_columns, -} -scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} - -# Celery dataset status -processing = "processing" -failed = "failed" -success = "success" - -# User roles -administrator = "administrator" -editor = "editor" -viewer = "viewer" diff --git a/mavecore/validators/dataset_validators.py b/mavecore/validators/dataset_validators.py deleted file mode 100644 index 5071a2f..0000000 --- a/mavecore/validators/dataset_validators.py +++ /dev/null @@ 
-1,223 +0,0 @@ -import io -import csv -import re - -from numpy.testing import assert_array_equal - -from mavecore.validators import constants - - -def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -class WordLimitValidator: - message = "This field is limited to {} words." - code = "invalid" - counter = re.compile(r"\w+\b", flags=re.IGNORECASE) - - def __init__(self, word_limit, message=None, code=None): - if message is not None: - self.message = message - if code is not None: - self.code = code - self.word_limit = int(word_limit) - - def __call__(self, value): - if not value: - return - if len(self.counter.findall(value)) > self.word_limit: - raise ValueError(self.message.format(self.word_limit)) - - -def read_header_from_io(file, label=None, msg=None): - if label is None: - label = "uploaded" - - try: - header_line = file.readline() - if isinstance(header_line, bytes): - header_line = header_line.decode() - file.seek(0) - f = io.StringIO(header_line.strip()) - return [h.strip() for h in csv.DictReader(f, delimiter=",").fieldnames] - except Exception: - if not msg: - msg = ( - "A header could not be parsed from your {} file. Make sure" - "Columns are comma delimited. Column names with commas must be" - "escaped by enclosing them in double quotes.".format(label) - ) - raise ValueError(msg) - - -def validate_has_hgvs_in_header(header, label=None, msg=None): - if label is None: - label = "Uploaded" - params = {} - if msg is None: - msg = ( - "Your %(label)s file must define either a nucleotide hgvs column " - "'%(col_nt)s' or a protein hgvs column '%(col_p)s'. " - "Columns are case-sensitive and must be comma delimited." 
- ) - params = { - "label": label, - "col_nt": constants.hgvs_nt_column, - "col_p": constants.hgvs_pro_column, - } - if not set(header) & set(constants.hgvs_columns): - raise ValueError(msg) - - -def validate_at_least_one_additional_column(header, label=None, msg=None): - if label is None: - label = "Uploaded" - params = {} - if not any(v not in constants.hgvs_columns for v in header): - if msg is None: - msg = ( - "Your %(label)s file must define at " - "least one additional column different " - "from '{}', '{}' and '{}'.".format( - constants.hgvs_nt_column, - constants.hgvs_splice_column, - constants.hgvs_pro_column, - ) - ) - params = {"label": label} - raise ValueError(msg) - - -def validate_header_contains_no_null_columns(header, label=None, msg=None): - if label is None: - label = "File" - any_null = any([is_null(v) for v in header]) - if any_null: - if msg is None: - msg = ( - "%(label)s file header cannot contain blank/empty/whitespace " - "only columns or the following case-insensitive null " - "values: {}.".format(label, ", ".join(constants.readable_null_values)) - ) - raise ValueError(msg) - - -def validate_datasets_define_same_variants(scores, counts): - """ - Checks if two `pd.DataFrame` objects parsed from uploaded files - define the same variants. - - Parameters - ---------- - scores : `pd.DataFrame` - Scores dataframe parsed from an uploaded scores file. - counts : `pd.DataFrame` - Scores dataframe parsed from an uploaded counts file. 
- """ - try: - assert_array_equal( - scores[constants.hgvs_nt_column].sort_values().values, - counts[constants.hgvs_nt_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_splice_column].sort_values().values, - counts[constants.hgvs_splice_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_pro_column].sort_values().values, - counts[constants.hgvs_pro_column].sort_values().values, - ) - except AssertionError: - raise ValueError( - "Your score and counts files do not define the same variants. " - "Check that the hgvs columns in both files match." - ) - - -def validate_scoreset_score_data_input(file): - """ - Validator function for checking that the scores file input contains - at least the column 'hgvs' and 'score'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - An open file handle in read mode. - """ - file.seek(0) - header = read_header_from_io(file, label="Score") - validate_header_contains_no_null_columns(header, label="Score") - validate_has_hgvs_in_header(header, label="Score") - validate_at_least_one_additional_column(header, label="Score") - - if constants.required_score_column not in header: - raise ValueError( - "Score data file is missing the required column " - + constants.required_score_column - + "." - + "Columns are case-sensitive and must be comma delimited." - ) - - -def validate_scoreset_count_data_input(file): - """ - Validator function for checking that the counts file input contains - at least the column 'hgvs'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - File parsed by a `django` form. 
- """ - file.seek(0) - header = read_header_from_io(file, label="Count") - validate_header_contains_no_null_columns(header, label="Count") - validate_has_hgvs_in_header(header, label="Count") - validate_at_least_one_additional_column(header, label="Count") - - -def validate_scoreset_json(dict_): - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `dataset_columns` attribute in a :class:`ScoreSet` instance. - - Parameters - ---------- - dict_ : dict - Dictionary of keys mapping to a list. - """ - required_columns = [constants.score_columns, constants.count_columns] - - for key in required_columns: - if key not in dict_.keys(): - raise ValueError("Scoreset data is missing the required key " + key) - - columns = dict_[key] - if not all([isinstance(c, str) for c in columns]): - raise ValueError("Header values must be strings.") - - if not isinstance(columns, list): - type_ = type(columns).__name__ - raise ValueError( - "Value for " + key.replace("_", " ") + " must be a list not " + type_ - ) - - # Check score columns is not-empty and at least contains hgvs and score - if key == constants.score_columns: - if constants.required_score_column not in columns: - raise ValueError( - "Missing required column constants.required_score_column " - "for score dataset." - ) - - # Check there are not unexpected columns supplied to the scoreset json - # field. 
- extras = [k for k in dict_.keys() if k not in set(required_columns)] - if len(extras) > 0: - extras = [k for k in dict_.keys() if k not in required_columns] - raise ValueError("Encountered unexpected keys extras") diff --git a/mavecore/validators/exceptions.py b/mavecore/validators/exceptions.py deleted file mode 100644 index 2851fa7..0000000 --- a/mavecore/validators/exceptions.py +++ /dev/null @@ -1,2 +0,0 @@ -class ValidationError(ValueError): - pass diff --git a/mavecore/validators/genome_validators.py b/mavecore/validators/genome_validators.py deleted file mode 100644 index 0065d5e..0000000 --- a/mavecore/validators/genome_validators.py +++ /dev/null @@ -1,272 +0,0 @@ -""" -Validator functions for the fields of the following classes: - WildTypeSequence - ReferenceGenome - TargetGene - ReferenceMap - GenomicInterval - -Most validators should validate one specific field, unless fields need -to be validated against each other. -""" -import re -from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validators.exceptions import ValidationError - -from mavecore.validators import constants - - -def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -# min_start_validator = MinValueValidator( -# 1, message=_("Start coordinate must be a positive integer.") -# ) -# min_end_validator = MinValueValidator( -# 1, message=_("End coordinate must be a positive integer.") -# ) - - -class WildTypeSequence: - """ - Basic model specifying a wild-type sequence. - - Parameters - ---------- - sequence : `models.CharField` - The wild type DNA sequence that is related to the `target`. Will - be converted to upper-case upon instantiation. 
- - sequence_type : `models.CharField` - Protein sequence (amino acids) or DNA (nucleotides) - """ - - class SequenceType: - DNA = "dna" - PROTEIN = "protein" - INFER = "infer" - - @classmethod - def detect_sequence_type(cls, sequence): - if sequence_is_dna(sequence): - return cls.DNA - elif sequence_is_protein(sequence): - return cls.PROTEIN - else: - raise ValueError( - f"Unknown sequence '{sequence}'. It is not protein or DNA." - ) - - @classmethod - def is_protein(cls, value): - return value == cls.PROTEIN - - @classmethod - def is_dna(cls, value): - return value == cls.DNA - - @classmethod - def choices(cls): - return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - - class Meta: - verbose_name = "Reference sequence" - verbose_name_plural = "Reference sequences" - - def __str__(self): - return self.get_sequence() - - # sequence = models.TextField( - # default=None, - # blank=False, - # null=False, - # verbose_name="Reference sequence", - # validators=[validate_wildtype_sequence], - # ) - # sequence_type = models.CharField( - # blank=True, - # null=False, - # default=SequenceType.INFER, - # verbose_name="Reference sequence type", - # max_length=32, - # choices=SequenceType.choices(), - # ) - - @property - def is_dna(self): - return self.__class__.SequenceType.is_dna(self.sequence_type) - - @property - def is_protein(self): - return self.__class__.SequenceType.is_protein(self.sequence_type) - - def save(self, *args, **kwargs): - if self.sequence is not None: - self.sequence = self.sequence.upper() - self.sequence_type = ( - (self.__class__.SequenceType.detect_sequence_type(self.sequence)) - if self.__class__.SequenceType.INFER - else self.sequence_type - ) - - return super().save(*args, **kwargs) - - def get_sequence(self): - return self.sequence.upper() - - def is_attached(self): - return getattr(self, "target", None) is not None - - -# GenomicInterval -# ------------------------------------------------------------------------- # -def 
validate_interval_start_lteq_end(start, end): - # Intervals may be underspecified, but will be ignored so skip validation. - if start is None or end is None: - return - if start > end: - raise ValidationError( - ( - "An interval's starting coordinate cannot be greater than the " - "ending coordinate." - ) - ) - - -def validate_strand(value): - if value not in ("+", "-"): - raise ValidationError("GenomicInterval strand must be either '+' or '-'") - - -def validate_chromosome(value): - # Intervals may be underspecified, but will be ignored so skip validation. - if value is None: - return - if is_null(value): - raise ValidationError("Chromosome identifier must not be null.") - - -def validate_unique_intervals(intervals): - for interval1 in intervals: - for interval2 in intervals: - if ( - (interval1.pk is not None) - and (interval2.pk is not None) - and (interval1.pk == interval2.pk) - ): - continue - elif interval1 is interval2: - continue - elif interval1.equals(interval2): - raise ValidationError("You can not specify the same interval twice.") - - -# WildTypeSequence -# ------------------------------------------------------------------------- # -def validate_wildtype_sequence(seq, as_type="any"): - # from .models import WildTypeSequence - - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - raise ValidationError( - "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} - ) - - seq = seq.upper() - is_dna = dna_bases_validator(seq) is not None - is_aa = amino_acids_validator(seq) is not None - - if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: - raise ValidationError( - "'%(seq)s' is not a valid DNA reference sequence." # , - # params={"seq": seq}, - ) - elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: - raise ValidationError( - "'%(seq)s' is not a valid protein reference sequence." 
# , - # params={"seq": seq}, - ) - elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( - is_dna or is_aa - ): - raise ValidationError( - "'%(seq)s' is not a valid DNA or protein reference sequence." # , - # params={"seq": seq}, - ) - - -def sequence_is_dna(seq): - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - return dna_bases_validator(seq) is not None - - -def sequence_is_protein(seq): - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - if dna_bases_validator(seq) is not None: - return False # Very likely a DNA sequence if only ATG - return amino_acids_validator(seq) is not None - - -# ReferenceGenome -# ------------------------------------------------------------------------- # -def validate_organism_name(value): - if is_null(value): - raise ValidationError("Species name must not be null.") - - -def validate_reference_genome_has_one_external_identifier(referencegenome): - if not referencegenome.genome_id: - raise ValidationError( - "Only one external identifier can be specified for a reference" "genome." - ) - - -def validate_genome_short_name(value): - if is_null(value): - raise ValidationError("Genome short name must not be null.") - - -# ReferenceMap -# ------------------------------------------------------------------------- # -def validate_map_has_unique_reference_genome(annotations): - genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) - if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) - - -def validate_map_has_at_least_one_interval(reference_map): - if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." 
- ) - - -def validate_at_least_one_map(reference_maps): - if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) - - -def validate_one_primary_map(reference_maps): - primary_count = sum(a.is_primary_reference_map() for a in reference_maps) - if primary_count > 1 or primary_count < 1: - raise ValidationError("A target must have one primary reference map.") - - -# TargetGene -# ------------------------------------------------------------------------- # -def validate_gene_name(value): - if is_null(value): - raise ValidationError("Gene name must not be null.") diff --git a/mavecore/validators/metadata_validators.py b/mavecore/validators/metadata_validators.py deleted file mode 100644 index d65981c..0000000 --- a/mavecore/validators/metadata_validators.py +++ /dev/null @@ -1,106 +0,0 @@ -import re -import idutils - -from mavecore.validators.exceptions import ValidationError -from mavecore.validators.constants import null_values_re - - -def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -def validate_sra_identifier(identifier): - if not ( - idutils.is_sra(identifier) - or idutils.is_bioproject(identifier) - or idutils.is_geo(identifier) - or idutils.is_arrayexpress_array(identifier) - or idutils.is_arrayexpress_experiment(identifier) - ): - raise ValidationError( - f"'{identifier} is not a valid SRA, GEO, ArrayExpress or BioProject " - "accession." - ) - - -def validate_keyword(kw): - if is_null(kw) or not isinstance(kw, str): - raise ValidationError( - f"'{kw}' not a valid keyword. Keywords must be valid strings." 
- ) - - -def validate_pubmed_identifier(identifier): - if not idutils.is_pmid(identifier): - raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") - - -def validate_doi_identifier(identifier): - if not idutils.is_doi(identifier): - raise ValidationError(f"'{identifier}' is not a valid DOI.") - - -def validate_ensembl_identifier(identifier): - if not idutils.is_ensembl(identifier): - raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") - - -def validate_uniprot_identifier(identifier): - if not idutils.is_uniprot(identifier): - raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") - - -def validate_refseq_identifier(identifier): - if not idutils.is_refseq(identifier): - raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") - - -def validate_genome_identifier(identifier): - if not idutils.is_genome(identifier): - raise ValidationError( - f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." - ) - - -def validate_keyword_list(values): - for value in values: - if not is_null(value): - validate_keyword(value) - - -def validate_pubmed_list(values): - for value in values: - if not is_null(value): - validate_pubmed_identifier(value) - - -def validate_sra_list(values): - for value in values: - if not is_null(value): - validate_sra_identifier(value) - - -def validate_doi_list(values): - for value in values: - if not is_null(value): - validate_doi_identifier(value) - - -def validate_ensembl_list(values): - for value in values: - if not is_null(value): - validate_ensembl_identifier(value) - - -def validate_refseq_list(values): - for value in values: - if not is_null(value): - validate_refseq_identifier(value) - - -def validate_uniprot_list(values): - for value in values: - if not is_null(value): - validate_uniprot_identifier(value) diff --git a/mavecore/validators/urn_validators.py b/mavecore/validators/urn_validators.py deleted file mode 100644 index 823c537..0000000 --- 
a/mavecore/validators/urn_validators.py +++ /dev/null @@ -1,95 +0,0 @@ -import re -from mavecore.validators.exceptions import ValidationError - -MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 -MAVEDB_TMP_URN_DIGITS = 16 -MAVEDB_URN_MAX_LENGTH = 64 -MAVEDB_URN_NAMESPACE = "mavedb" - - -# Temp URN patterns -# --------------------------------------------------------------------------- # -MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( - width=MAVEDB_TMP_URN_DIGITS -) -MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) - - -# Experimentset Pattern/Compiled RE -MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( - namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS -) -MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) - -# Experiment Pattern/Compiled RE -MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( - pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] -) -MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) - -# Scoreset Pattern/Compiled RE -MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( - pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] -) -MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) - -# Variant Pattern/Compiled RE -MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( - pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] -) -MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) - -# Any Pattern/Compiled RE -MAVEDB_ANY_URN_PATTERN = "|".join( - [ - r"({pattern})".format(pattern=p) - for p in ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_VARIANT_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, - ) - ] -) -MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) - - -def validate_mavedb_urn(urn): - if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_experimentset(urn): - if not 
(MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_experiment(urn): - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_scoreset(urn): - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid score set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_variant(urn): - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid Variant urn.", params={"urn": urn} - ) diff --git a/mavecore/validators/validate.py b/mavecore/validators/validate.py deleted file mode 100644 index cd822b6..0000000 --- a/mavecore/validators/validate.py +++ /dev/null @@ -1,56 +0,0 @@ -from mavecore.validators import dataset_validators - - -def validate_all(countfile=None, scorefile=None, scorejson=None): - """ - By calling other helper functions, this function runs all of the validation code - """ - validate_dataset(countfile, scorefile, scorejson) - - -def validate_dataset(countfile=None, scorefile=None, scorejson=None): - """ - This function calls all of the validation functions within - mavetools/mavetools/validators/dataset_validation.py - - Returns - ------- - - """ - - # how to incorporate word limit validator? 
- - if scorefile is not None: - # open scorefile - open(scorefile) - # this one returns header - scoreheader = dataset_validators.read_header_from_io(file=scorefile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=scoreheader) - dataset_validators.validate_at_least_one_additional_column(header=scoreheader) - dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) - - dataset_validators.validate_scoreset_score_data_input(file=scorefile) - - if scorejson is not None: - # open scorejson - open(scorejson) - dataset_validators.validate_scoreset_json(dict_=scorejson) - - if countfile is not None: - # open countfile - open(countfile) - countheader = dataset_validators.read_header_from_io(file=countfile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=countheader) - dataset_validators.validate_at_least_one_additional_column(header=countheader) - dataset_validators.validate_header_contains_no_null_columns(header=countheader) - - dataset_validators.validate_scoreset_count_data_input(file=countfile) - - if scorefile is not None and countfile is not None: - dataset_validators.validate_datasets_define_same_variants( - scores=scorefile, counts=countfile - ) diff --git a/mavecore/validators/variant_validators/__init__.py b/mavecore/validators/variant_validators/__init__.py deleted file mode 100644 index 1f7aca1..0000000 --- a/mavecore/validators/variant_validators/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset - -from .hgvs import ( - validate_nt_variant, - validate_pro_variant, - validate_splice_variant, - validate_hgvs_string, -) - -from .variant import validate_columns_match, validate_variant_json - -__all__ = [ - "dataset", - "variant", - "hgvs", - "validate_nt_variant", - "validate_splice_variant", - "validate_pro_variant", - "validate_hgvs_string", - 
"validate_columns_match", - "validate_variant_json", - "MaveCountsDataset", - "MaveScoresDataset", - "MaveDataset", -] diff --git a/mavecore/validators/variant_validators/dataset.py b/mavecore/validators/variant_validators/dataset.py deleted file mode 100644 index 0764dcc..0000000 --- a/mavecore/validators/variant_validators/dataset.py +++ /dev/null @@ -1,638 +0,0 @@ -import re -from collections import defaultdict -from io import StringIO -from itertools import groupby -from operator import itemgetter -from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict - -import pandas as pd -import numpy as np -from mavehgvs import MaveHgvsParseError, Variant -from fqfa.util.translate import translate_dna -from fqfa.util.infer import infer_sequence_type - -from mavecore.validators.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, - required_score_column - # constants - , -) - -from mavecore.validators.constants import NA_value, null_values_list, null_values_re, readable_null_values - - -def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -class MaveDataset: - class DatasetType: - SCORES = "scores" - COUNTS = "counts" - - class HGVSColumns: - NUCLEOTIDE: str = hgvs_nt_column - TRANSCRIPT: str = hgvs_splice_column - PROTEIN: str = hgvs_pro_column - - @classmethod - def options(cls) -> List[str]: - return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] - - class AdditionalColumns: - @classmethod - def options(cls) -> List[str]: - return [] - - # ---------------------- Construction------------------------------------ # - @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) - - @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - return 
cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) - - @classmethod - def _for_type( - cls, file: Union[str, TextIO, BinaryIO], dataset_type: str - ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - - if isinstance(file, str): - handle = file - elif hasattr(file, "read"): - file_contents = file.read() - if hasattr(file_contents, "decode"): - file_contents = file_contents.decode("utf-8") - file_contents = file_contents.strip() - handle = StringIO(file_contents) - else: - raise TypeError( - f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" - ) - - extra_na_values = set( - list(null_values_list) - + [str(x).lower() for x in null_values_list] - + [str(x).upper() for x in null_values_list] - + [str(x).capitalize() for x in null_values_list] - ) - - df = pd.read_csv( - filepath_or_buffer=handle, - sep=",", - encoding="utf-8", - quotechar='"', - comment="#", - na_values=extra_na_values, - keep_default_na=True, - dtype={ - **{c: str for c in cls.HGVSColumns.options()}, - MaveScoresDataset.AdditionalColumns.SCORES: float, - }, - ).replace(null_values_re, np.NaN) - - if dataset_type == cls.DatasetType.SCORES: - return MaveScoresDataset(df) - elif dataset_type == cls.DatasetType.COUNTS: - return MaveCountsDataset(df) - else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") - - # ---------------------- Public ----------------------------------------- # - @property - def label(self) -> str: - return "dataset" - - @property - def is_valid(self) -> Optional[bool]: - if self._errors is None: - return None - return len(self._errors) == 0 - - @property - def n_errors(self) -> Optional[int]: - if self._errors is None: - return None - return len(self._errors) - - @property - def errors(self) -> Optional[List[str]]: - return self._errors - - @property - def is_empty(self) -> bool: - return self._df.empty - - @property - def columns(self) -> List[str]: - return list(self._df.columns) - - @property - def hgvs_columns(self) -> 
List[str]: - return [c for c in self.columns if c in self.HGVSColumns.options()] - - @property - def non_hgvs_columns(self) -> List[str]: - return [c for c in self.columns if c not in self.HGVSColumns.options()] - - @property - def n_rows(self) -> int: - return len(self._df) - - @property - def n_columns(self) -> int: - return len(self.columns) - - @property - def index_column(self) -> Optional[str]: - if self._errors: - return None - return self._index_column - - @property - def index(self) -> Optional[pd.Index]: - if self._errors: - return None - return self._df.index.copy(deep=True) - - def data(self, serializable=False) -> pd.DataFrame: - """ - Return underlying dataframe object. - - Parameters - ---------- - serializable: bool - Replaces `np.NaN` with `None` for JSON compatibility. - """ - if serializable: - # need to force "object" type to allow None values - return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) - return return_df - return self._df.copy(deep=True) - - def match_other(self, other: "MaveDataset") -> Optional[bool]: - """ - Check that each dataset defined the same variants in each column. - - Parameters - ---------- - other: MaveDataset - Validator instance to match against. - - Returns - ------- - A boolean indicating index match, otherwise `None` if either instance - is not valid. - """ - if (not self.is_valid) or (not other.is_valid): - return None - - if self.index_column != other.index_column: - return False - - return all( - self._df[column].equals(other._df[column]) - for column in self.HGVSColumns.options() - ) - - def to_dict(self) -> Dict[str, Dict]: - """ - Returns underlying dataframe as dictionary in 'records' orientation. - Keys will be index values and values will be an inner dictionary mapping - column names to row values for said index. - """ - # Convert np.NaN values to None for consistency across all columns and - # for compatibility in PostgresSQL queries. 
Replaces all values which - # are considered null by pandas with None by masking pd.notnull cells. - - return self.data(serializable=True).to_dict(orient="index") - - def validate( - self, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - allow_index_duplicates: bool = False, - ) -> "MaveDataset": - - self._errors = [] - self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) - self._index_column = None - - self._validate_columns() - # Only attempt to validate variants if columns are valid - if not self._errors: - ( - self._normalize_data() - ._validate_genomic_variants(targetseq, relaxed_ordering) - ._validate_transcript_variants(targetseq, relaxed_ordering) - ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) - ) - - if self.is_empty: - self._errors.append( - f"No variants could be parsed from your {self.label} file. " - f"Please upload a non-empty file." - ) - return self - - if not self._errors: - # Set index last as original index is used when indicating duplicate - # hgvs string row numbers in the column name used as the index ( - # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). 
- self._df.index = pd.Index(self._df[self.index_column]) - - return self - - # ---------------------- Private ---------------------------------------- # - def __init__( - self, - df: Optional[pd.DataFrame] = None, - index_column: Optional[str] = None, - errors: Optional[List[str]] = None, - ): - self._df: pd.DataFrame = pd.DataFrame() if df is None else df - self._index_column = index_column or None - self._errors = None if errors is None else list(errors) - - def __repr__(self): - return ( - f"<" - f"{self.__class__.__name__} " - f"columns={self.columns} " - f"index={self.index_column} " - f"valid={self.is_valid}" - f">" - ) - - @property - def _column_order(self) -> Dict[str, int]: - return defaultdict( - lambda: 100, - { - self.HGVSColumns.NUCLEOTIDE: 0, - self.HGVSColumns.TRANSCRIPT: 1, - self.HGVSColumns.PROTEIN: 2, - **{ - c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) - }, - }, - ) - - def _validate_columns(self) -> "MaveDataset": - if self._errors: - return self - - # Pandas will automatically name blank columns using the pattern below - unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) - columns = self.columns - if any(is_null(h) or unnamed.match(h) for h in columns): - self._errors.append( - f"Column names in your {self.label} file cannot values " - f"considered null such as the following: " - f"{', '.join(readable_null_values)}" - ) - - columns = [c for c in columns if not is_null(c)] - if len(columns) < 1: - self._errors.append( - f"No columns could not be parsed from your {self.label} file. " - "Make sure columns are comma delimited. 
Column names with " - "commas must be escaped by enclosing them in double quotes" - ) - - required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} - if not (set(columns) & required): - self._errors.append( - f"Your {self.label} file must define either a nucleotide " - f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " - f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " - f"Columns are case-sensitive and must be comma delimited" - ) - - if not (set(columns) - set(self.HGVSColumns.options())): - self._errors.append( - f"Your {self.label} file must define at least one additional " - f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " - f"'{self.HGVSColumns.TRANSCRIPT}' and " - f"'{self.HGVSColumns.PROTEIN}'" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - if self._errors: - return self - - # Initialize missing hgvs columns as empty. - for c in self.HGVSColumns.options(): - if c not in self.columns: - self._df[c] = np.NaN - - column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) - - self._df = self._df[sorted_columns] - return self - - def _validate_genomic_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): - return self - - defines_transcript_variants = not self._column_is_null( - self.HGVSColumns.TRANSCRIPT - ) - validated_variants, prefixes, errors = self._validate_variants( - column=self.HGVSColumns.NUCLEOTIDE, - splice_defined=defines_transcript_variants, - targetseq=targetseq, - relaxed_ordering=relaxed_ordering, - ) - - if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: - self._errors.append( - f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " - f"(prefix 'g.') cannot be mixed with transcript variants " - f"(prefix 'c.' 
or 'n.')" - ) - - if prefixes == {"g"} and not defines_transcript_variants: - self._errors.append( - f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " - f"are required when specifying genomic variants " - f"(prefix 'g.' in the 'hgvs_nt' column)" - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants - - self._index_column = self.HGVSColumns.NUCLEOTIDE - return self - - def _validate_transcript_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_tx and (not defines_nt): - self._errors.append( - f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " - f"must be defined when specifying transcript " - f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" - ) - - if not defines_tx: - return self - - # Don't validate transcript variants against sequence. Might come - # back to this later with research into implementing gene models. 
- validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.TRANSCRIPT, - targetseq=None, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants - - return self - - def _validate_protein_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - if self._column_is_null(self.HGVSColumns.PROTEIN): - return self - - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_splice: - protein_seq = None - else: - protein_seq = targetseq - if targetseq and "dna" in infer_sequence_type(targetseq).lower(): - protein_seq, remainder = translate_dna(targetseq) - if remainder: - self._errors.insert( - 0, - "Protein variants could not be validated because the " - "length of your target sequence is not a multiple of 3", - ) - - validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.PROTEIN, - targetseq=protein_seq, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.PROTEIN] = validated_variants - - if not defines_nt: - self._index_column = self.HGVSColumns.PROTEIN - - return self - - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - if self._errors: - return self - - if self._index_column is None: - self._index_column = self.HGVSColumns.NUCLEOTIDE - - if self._column_is_partially_null(self._index_column): - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"cannot contain any null values from " - f"{', '.join(readable_null_values)} (case-insensitive)" - ) - - if not allow_duplicates: - dupes = self._df[self._index_column].duplicated(keep=False) - if np.any(dupes): - dup_list = zip( - self._df.loc[dupes, self._index_column], dupes.index[dupes] - ) - 
dupes_str = ", ".join( - f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers - for (v, groups) in groupby(dup_list, key=itemgetter(0)) - ) - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"contains duplicate HGVS variants: {dupes_str}" - ) - - return self - - def _validate_variants( - self, - column: str, - splice_defined: Optional[bool] = None, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - ) -> Tuple[pd.Series, Set[str], List[str]]: - - prefixes = set() - errors = [] - - def validate_variant(variant: str): - # TODO: logic mirrors that in validate_hgvs_string, which is kept - # as a standalone function for backwards compatibility with - # django's model validator field. Merge at some point. - - if is_null(variant): - return np.NaN - else: - try: - if variant.lower() == "_sy": - errors.append( - "'_sy' is no longer supported and should be " - "replaced by 'p.(=)'" - ) - return variant - elif variant.lower() == "_wt": - errors.append( - "'_wt' is no longer supported and should be " - "replaced by one of 'g.=', 'c.=' or 'n.='" - ) - return variant - - validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - prefix = validated.prefix.lower() - prefixes.add(prefix) - - prefix_error = self._validate_variant_prefix_for_column( - variant=validated, - prefix=validated.prefix, - column=column, - splice_defined=splice_defined, - ) - if prefix_error: - errors.append(prefix_error) - - return str(validated) - - except MaveHgvsParseError as error: - errors.append(f"{variant}: {str(error)}") - return np.NaN - - validated_variants = self._df[column].apply(validate_variant) - - return validated_variants, prefixes, errors - - def _column_is_null(self, column) -> bool: - return len(self._df[self._df[column].isna()]) == len(self._df) - - def _column_is_partially_null(self, column) -> bool: - return 0 < len(self._df[self._df[column].isna()]) < len(self._df) - - def 
_column_is_fully_specified(self, column) -> bool: - return len(self._df[self._df[column].isna()]) == 0 - - def _validate_variant_prefix_for_column( - self, variant: Variant, prefix: str, column: str, splice_defined: bool - ) -> Optional[str]: - prefix = prefix.lower() - - if column == self.HGVSColumns.NUCLEOTIDE: - if splice_defined: - if prefix not in "g": - return ( - f"{column}: " - f"'{variant}' is not a genomic variant " - f"(prefix 'g.'). Nucleotide variants must " - f"be genomic if transcript variants are " - f"also present" - ) - else: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. " - f"The accepted transcript variant prefixes " - f"are 'c.' or 'n.'" - ) - elif column == self.HGVSColumns.TRANSCRIPT: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. The " - f"accepted transcript variant prefixes are " - f"'c.' or 'n.'" - ) - elif column == self.HGVSColumns.PROTEIN: - if prefix not in "p": - return ( - f"{column}: " - f"'{variant}' is not a protein variant. " - f"The accepted protein variant prefix is 'p.'" - ) - else: - raise ValueError( - f"Unknown column '{column}'. Expected one " - f"of {', '.join(self.HGVSColumns.options())}" - ) - - return None - - -class MaveScoresDataset(MaveDataset): - class AdditionalColumns: - SCORES = required_score_column - - @classmethod - def options(cls) -> List[str]: - return [cls.SCORES] - - @property - def label(self) -> str: - return "scores" - - def _validate_columns(self) -> "MaveDataset": - super()._validate_columns() - - if self.AdditionalColumns.SCORES not in self.columns: - self._errors.append( - f"Your scores dataset is missing the " - f"'{self.AdditionalColumns.SCORES}' column. 
" - f"Columns are case-sensitive and must be comma delimited" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - super()._normalize_data() - - should_be_numeric = [self.AdditionalColumns.SCORES] - for c in should_be_numeric: - if c in self.columns: - try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") - except ValueError as e: - self._errors.append(f"{c}: {str(e)}") - - return self - - -class MaveCountsDataset(MaveDataset): - @property - def label(self) -> str: - return "counts" diff --git a/mavecore/validators/variant_validators/variant.py b/mavecore/validators/variant_validators/variant.py deleted file mode 100644 index 155426a..0000000 --- a/mavecore/validators/variant_validators/variant.py +++ /dev/null @@ -1,60 +0,0 @@ -from typing import Dict - -from mavecore.validators.constants import ( - variant_score_data, - variant_count_data, - required_score_column, -) -from mavecore.validators.exceptions import ValidationError - - -def validate_columns_match(variant, scoreset) -> None: - """ - Validate that a child matches parents defined columns to keep - data in sync. - """ - try: - if variant.score_columns != scoreset.score_columns: - raise ValidationError( - f"Variant defines score columns '{variant.score_columns}' " - f"but parent defines columns '{scoreset.score_columns}. " - ) - if variant.count_columns != scoreset.count_columns: - raise ValidationError( - f"Variant defines count columns '{variant.count_columns}' " - f"but parent defines columns '{scoreset.count_columns}. " - ) - except KeyError as error: - raise ValidationError(f"Missing key {str(error)}") - - -def validate_variant_json(data: Dict[str, Dict]) -> None: - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `data` attribute in a :class:`Variant` instance. - - Parameters - ---------- - data : dict - Dictionary of keys mapping to a list. 
- """ - expected_keys = [variant_score_data, variant_count_data] - for key in expected_keys: - if key not in data.keys(): - raise ValidationError(f"Missing the required key {key}") - - if required_score_column not in data[variant_score_data]: - raise ValidationError( - f"Missing required column '{required_score_column}' in variant's score data." - ) - - extras = [k for k in data.keys() if k not in set(expected_keys)] - if len(extras) > 0: - extras = [k for k in data.keys() if k not in expected_keys] - raise ValidationError("Encountered unexpected keys {extras}") - - # Check the correct data types are given. - for key in expected_keys: - if not isinstance(data[key], dict): - type_ = type(data[key]).__name__ - raise ValidationError(f"Value for '{key}' must be a dict not {type_}.") diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ae0e580 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +pre-commit +coverage +pydantic diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2187562 --- /dev/null +++ b/setup.py @@ -0,0 +1,34 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +requirements = [ + "fqfa>=1.2.1", + "mavehgvs>=0.4.0", + "idutils>=1.1.0", + "pandas>=1.1.0", +] + +setuptools.setup( + name="mavecore", + version="0.1.5", + author="MaveDB Developers", + author_email="alan.rubin@wehi.edu.au", + description=("MaveCore implements shared functionality for MaveTools and MaveDB."), + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/VariantEffect/MaveCore/tree/release/0.1", + packages=setuptools.find_packages(), + classifiers=[ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + 
python_requires=">=3.6", + install_requires=requirements, + test_suite="tests", +) diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/models/data.py b/tests/models/data.py new file mode 100644 index 0000000..e361be0 --- /dev/null +++ b/tests/models/data.py @@ -0,0 +1,97 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.data import DataSet, Experiment, ScoreSet + + +class TestDataSet(TestCase): + def setUp(self): + self.dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + } + + def test_valid_all_fields(self): + DataSet.parse_obj(self.dataset) + + def test_valid_exclude_optional(self): + self.dataset.pop("extraMetadata") + self.dataset.pop("keywords") + DataSet.parse_obj(self.dataset) + + def test_invalid_keywords(self): + self.dataset["keywords"] = ["null"] + with self.assertRaises(ValidationError): + Experiment.parse_obj(self.dataset) + + +class TestExperiment(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + self.experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + } + + def test_valid_all_fields(self): + Experiment.parse_obj(self.experiment) + + def test_valid_exclude_optional(self): + self.experiment.pop("extraMetadata") + self.experiment.pop("keywords") + self.experiment.pop("doiIdentifiers") + self.experiment.pop("pubmedIdentifiers") + Experiment.parse_obj(self.experiment) + + +class TestScoreSet(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = 
{"identifier": "29785012"} + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "DNA", "sequence": "ATC"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} + target = {"name": "name", + "category": "Protein coding", + "externalIdentifiers": [external_identifier], + "referenceMaps": [reference_map], + "wtSequence": sequence} + self.scoreset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "dataUsagePolicy": "policy", + "licenceId": 0, + "keywords": ["string"], + "experimentUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "supersededScoresetUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "metaAnalysisSourceScoresetUrns": ["tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "targetGene": target, + } + + def test_valid_all_fields(self): + ScoreSet.parse_obj(self.scoreset) + + def test_valid_exclude_optional(self): + self.scoreset.pop("extraMetadata") + self.scoreset.pop("keywords") + self.scoreset.pop("doiIdentifiers") + self.scoreset.pop("pubmedIdentifiers") + self.scoreset.pop("supersededScoresetUrn") + self.scoreset.pop("metaAnalysisSourceScoresetUrns") + ScoreSet.parse_obj(self.scoreset) diff --git a/tests/models/identifier.py b/tests/models/identifier.py new file mode 100644 index 0000000..006630b --- /dev/null +++ b/tests/models/identifier.py @@ -0,0 +1,60 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.identifier import (Identifier, + DoiIdentifier, + PubmedIdentifier, + #ExternalIdentifierId, + ExternalIdentifier) + + +class TestIdentifier(TestCase): + def setUp(self): + self.identifier = { + "identifier": "10.1038/s41588-018-0122-z", + } + + def test_valid_all_fields(self): + 
Identifier.parse_obj(self.identifier) + + +class TestDoiIdentifier(TestCase): + def setUp(self): + self.doi_identifier = { + "identifier": "10.1038/s41588-018-0122-z", + } + + def test_valid_all_fields(self): + DoiIdentifier.parse_obj(self.doi_identifier) + + def test_invalid_type_of_identifier(self): + self.doi_identifier["identifier"] = "29785012" + with self.assertRaises(ValidationError): + DoiIdentifier.parse_obj(self.doi_identifier) + + +class TestPubmedIdentifier(TestCase): + def setUp(self): + self.pubmed_identifier = { + "identifier": "29785012", + } + + def test_valid_all_fields(self): + PubmedIdentifier.parse_obj(self.pubmed_identifier) + + def test_invalid_type_of_identifier(self): + self.pubmed_identifier["identifier"] = "10.1038/s41588-018-0122-z" + with self.assertRaises(ValidationError): + PubmedIdentifier.parse_obj(self.pubmed_identifier) + + +class TestExternalIdentifier(TestCase): + def setUp(self): + self.external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} + self.external_identifier = {"identifier": self.external_identifier_id, "offset": 0} + + """def test_valid_external_identifier_id(self): + print(hasattr(self.external_identifier_id, "dbname")) + ExternalIdentifierId.parse_obj(self.external_identifier_id)""" + + def test_valid_external_identifier(self): + ExternalIdentifier.parse_obj(self.external_identifier) \ No newline at end of file diff --git a/tests/models/map.py b/tests/models/map.py new file mode 100644 index 0000000..33c1f60 --- /dev/null +++ b/tests/models/map.py @@ -0,0 +1,13 @@ +from unittest import TestCase +from mavecore.models.map import ReferenceMap + + +class TestReferenceMap(TestCase): + def setUp(self): + self.reference_map = { + "genomeId": 0, + "targetId": 0, + } + + def test_valid_all_fields(self): + ReferenceMap.parse_obj(self.reference_map) diff --git a/tests/models/sequence.py b/tests/models/sequence.py new file mode 100644 index 0000000..6314cb1 --- /dev/null +++ b/tests/models/sequence.py @@ -0,0 
+1,20 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.sequence import WildType + + +class Test(TestCase): + def test_valid_all_fields(self): + sequence = { + "sequenceType": "Protein", + "sequence": "ATC", + } + WildType.parse_obj(sequence) + + def test_invalid_sequence_type(self): + sequence = { + "sequenceType": "RNA", + "sequence": "ATC", + } + with self.assertRaises(ValidationError): + WildType.parse_obj(sequence) diff --git a/tests/models/target.py b/tests/models/target.py new file mode 100644 index 0000000..62ace8b --- /dev/null +++ b/tests/models/target.py @@ -0,0 +1,29 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.target import TargetGene + + +class TestTargetGene(TestCase): + def setUp(self): + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "Protein", "sequence": "ATCGAA"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} + self.target = {"name": "name", + "category": "Protein coding", + "externalIdentifiers": [external_identifier], + "referenceMaps": [reference_map], + "wtSequence": sequence} + + def test_valid_all_fields(self): + TargetGene.parse_obj(self.target) + + def test_invalid_category(self): + self.target["category"] = "Protein" + with self.assertRaises(ValidationError): + TargetGene.parse_obj(self.target) + + def test_invalid_missing_required_field(self): + self.target.pop("wtSequence") + with self.assertRaises(ValidationError): + TargetGene.parse_obj(self.target) diff --git a/tests/test_validators/test_dataset_validators.py b/tests/test_validators/test_dataset_validators.py deleted file mode 100644 index b15e16b..0000000 --- a/tests/test_validators/test_dataset_validators.py +++ /dev/null @@ -1,326 +0,0 @@ -from io import BytesIO, StringIO -from unittest import TestCase - - -import pandas as pd - -from mavecore.validators 
import constants - -from mavecore.validators.dataset_validators import ( - validate_scoreset_count_data_input, - validate_scoreset_score_data_input, - validate_at_least_one_additional_column, - validate_has_hgvs_in_header, - validate_header_contains_no_null_columns, - read_header_from_io, - validate_scoreset_json, - validate_datasets_define_same_variants, - WordLimitValidator, -) - - -class TestWordLimitValidator(TestCase): - def test_validation_error_more_than_word_limit(self): - with self.assertRaises(ValueError): - n = 5 - WordLimitValidator(n)("Word " * (n + 1)) - - def test_passes_equal_to_word_limit(self): - n = 5 - WordLimitValidator(n)("Word " * n) - - def test_passes_less_than_word_limit(self): - n = 5 - WordLimitValidator(n)("Word " * (n - 1)) - - -class TestHeaderFromIO(TestCase): - """ - Tests to ensure that a file in bytes or string mode can be read and then - returned to the start so there are no side effects for later reading the - files. - """ - - def test_can_read_header_from_bytes(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, header) - - def test_removes_quotes_from_header(self): - file = BytesIO( - '"{}","score","count,nt"\n'.format(constants.hgvs_nt_column).encode() - ) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count,nt"] - self.assertEqual(expected, header) - - def test_can_read_header_from_string(self): - file = StringIO("{},score,count\n".format(constants.hgvs_nt_column)) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, header) - - def test_strips_whitespace(self): - file = StringIO(" {} , score , count\n".format(constants.hgvs_nt_column)) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, 
header) - - def test_returns_file_position_to_begining(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - read_header_from_io(file) - self.assertEqual( - file.read(), "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) - - -class TestNoNullInColumnsValidator(TestCase): - """ - Tests to ensure that an input file contains no null values in the header - such as '', None, null etc. - """ - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header) - - def test_does_not_raise_valuerror_when_non_null_values_in_column(self,): - file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header) # Should pass - - -class TestAtLeastOneNumericColumnValidator(TestCase): - """ - Tests to ensure that an input file contains at least two columns. - """ - - def test_raises_valuerror_when_less_than_2_values_in_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) - - def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass - - file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass - - -class TestHgvsInHeaderValidator(TestCase): - """ - Tests that case-sensitive 'hgvs' is in the header of a file. 
- """ - - def test_raises_valuerror_when_neither_hgvs_col_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) - - def test_hgvs_must_be_lowercase(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) - - def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass - - file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass - - -class TestValidateScoreCountsDefineSameVariants(TestCase): - """ - Tests that an uploaded score/counts files define the same variants - in both the _nt column and _pro column. 
- """ - - def test_ve_counts_defines_different_nt_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_ve_counts_defines_different_splice_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_ve_counts_defines_different_pro_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu5Glu"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu75Glu"], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_passes_when_same_variants_defined(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - validate_datasets_define_same_variants(scores, counts) - - -class TestValidateScoreSetCountDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted count data - 
input file is supplied. - """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - def test_raises_valuerror_no_numeric_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - -class TestValidateScoreSetScoreDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted score data - input file is supplied. - """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_raises_valuerror_no_numeric_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_validatation_error_score_not_in_header(self): - file = BytesIO("{},count\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - -class TestValidateScoreSetJsonValidator(TestCase): - """ - Test to ensure that a scoreset json field is properly formatted. 
- """ - - def test_valueerror_unexptected_columns(self): - field = { - "extra_column": [], - constants.score_columns: ["score"], - constants.count_columns: [], - } - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_values_not_lists(self): - field = {constants.score_columns: ["score"], constants.count_columns: {}} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_list_values_not_strings(self): - field = {constants.score_columns: [b"score"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_empty_score_columns(self): - field = {constants.score_columns: [], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_missing_dict_columns(self): - # constants.score_columns missing - field = {constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - # constants.count_columns missing - field = {constants.score_columns: ["score"]} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_missing_header_columns(self): - # constants.score_columns columns missing 'score' - field = {constants.score_columns: ["hgvs"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) diff --git a/tests/test_validators/test_genome_validators.py b/tests/test_validators/test_genome_validators.py deleted file mode 100644 index b4b06a9..0000000 --- a/tests/test_validators/test_genome_validators.py +++ /dev/null @@ -1,137 +0,0 @@ -from unittest import TestCase - -from mavecore.validators.genome_validators import WildTypeSequence - -# from mavetools.validators.genome_factories import ( -# ReferenceMapFactory, -# ReferenceGenomeFactory, -# GenomicIntervalFactory, -# ) - - -from mavecore.validators.genome_validators import ( - validate_wildtype_sequence, - 
validate_gene_name, - validate_genome_short_name, - validate_organism_name, - sequence_is_protein, - sequence_is_dna, -) -from mavecore.validators.exceptions import ValidationError - -from mavecore.validators.constants import null_values_list - - -class TestWildTypeSequenceValidators(TestCase): - """ - Tests validators associated with :class:`WildTypeSequence`. Tests: - - - validate_wildtype_sequence - """ - - def test_ve_not_a_sequence_of_nucleotides_or_aa(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence("2823d") - - def test_ve_null(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_wildtype_sequence(v) - - def test_passes_lowercase_nucleotides(self): - validate_wildtype_sequence("atcg") - - def test_passes_uppercase_nucleotides(self): - validate_wildtype_sequence("ATCG") - - def test_passes_lowercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE") - - def test_passes_uppercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE".lower()) - - def test_pass_validate_dna_sequence(self): - validate_wildtype_sequence("ATCG", as_type=WildTypeSequence.SequenceType.DNA) - - def test_pass_validate_protein_sequence(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - def test_fails_validate_as_type_dna_but_seq_is_protein(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.DNA - ) - - def test_fail_validate_as_type_protein_when_sequence_is_invalid(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "ABC", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - -class TestIsProteinSequence(TestCase): - def test_false_null(self): - for v in null_values_list: - self.assertFalse(sequence_is_protein(v)) - - def test_false_dna_sequence(self): - # Favor dna sequences 
when only ATCG - self.assertFalse(sequence_is_protein("ATCG")) - self.assertFalse(sequence_is_protein("atc")) - - def test_true_aa_sequence(self): - self.assertTrue(sequence_is_protein("MDLSALRVEEATC")) - self.assertTrue(sequence_is_protein("MDLSALRVEEATC".lower())) - - -class TestIsDNASequence(TestCase): - def test_false_null(self): - for v in null_values_list: - self.assertFalse(sequence_is_protein(v)) - - def test_true_dna_sequence(self): - self.assertTrue(sequence_is_dna("ATCG")) - self.assertTrue(sequence_is_dna("atc")) - - def test_false_aa_sequence(self): - self.assertFalse(sequence_is_dna("MDLSALRVEEATC")) - self.assertFalse(sequence_is_dna("MDLSALRVEEATC".lower())) - - -class TestReferenceGenomeValidators(TestCase): - """ - Tests validators associated with :class:`ReferenceGenome`: - - - validate_reference_genome_has_one_external_identifier - - validate_organism_name - - validate_genome_short_name - """ - - def test_ve_null_organism_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_organism_name(v) - - def test_ve_null_genome_short_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_genome_short_name(v) - - -class TestTargetGeneValidators(TestCase): - """ - Tests validators asscociated with :class:`TargetGene`: - - - validate_gene_name - - validate_target_has_one_primary_reference_map - """ - - def test_ve_null_gene_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_gene_name(v) diff --git a/tests/test_validators/test_variant_validators/test_hgvs_validators.py b/tests/test_validators/test_variant_validators/test_hgvs_validators.py deleted file mode 100644 index 9a72ffa..0000000 --- a/tests/test_validators/test_variant_validators/test_hgvs_validators.py +++ /dev/null @@ -1,49 +0,0 @@ -# from core.utilities import null_values_list -from unittest import TestCase - -from mavecore.validators.variant_validators import hgvs -from 
mavecore.validators.exceptions import ValidationError -from mavecore.validators.constants import null_values_list - - -class TestValidateHgvsString(TestCase): - def test_passes_on_null(self): - for v in null_values_list: - hgvs.validate_hgvs_string(v) - - def test_error_not_str(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string(1.0) - - def test_error_unknown_column(self): - with self.assertRaises(ValueError): - hgvs.validate_hgvs_string("c.1A>G", column="random") - - def test_error_does_not_match_splice(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("g.G4L", column="splice") - - def test_error_nt_is_not_g_when_splice_present(self): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=False) - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=True) - - def test_error_does_not_match_nt(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("p.G4L", column="nt") - - def test_error_does_not_match_pro(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("c.1A>G", column="p") - - def test_raises_on_enrich_special_types(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("_wt") - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("_sy") - - def test_validates_valid_hgvs(self): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=False) - hgvs.validate_hgvs_string("g.1A>G", column="nt", splice_present=True) - hgvs.validate_hgvs_string("c.1A>G", column="splice") - hgvs.validate_hgvs_string("p.(=)", column="p") diff --git a/tests/test_validators/test_variant_validators/test_validators.py b/tests/test_validators/test_variant_validators/test_validators.py deleted file mode 100644 index 404b558..0000000 --- a/tests/test_validators/test_variant_validators/test_validators.py +++ /dev/null @@ -1,737 +0,0 @@ -from io import StringIO -import unittest 
-from unittest import TestCase -from random import choice - -import pandas as pd -from pandas.testing import assert_index_equal - -# from dataset import constants -from mavecore.validators import constants -from mavecore.validators.exceptions import ValidationError - -# from ..factories import generate_hgvs, VariantFactory -from mavecore.validators.variant_validators import ( - MaveDataset, - validate_variant_json, - validate_hgvs_string, -) - - -def generate_hgvs(prefix: str = "c") -> str: - """Generates a random hgvs string from a small sample.""" - if prefix == "p": - # Subset of 3-letter codes, chosen at random. - amino_acids = [ - "Ala", - "Leu", - "Gly", - "Val", - "Tyr", - "Met", - "Cys", - "His", - "Glu", - "Phe", - ] - ref = choice(amino_acids) - alt = choice(amino_acids) - return f"{prefix}.{ref}{choice(range(1, 100))}{alt}" - else: - alt = choice("ATCG") - ref = choice("ATCG") - return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}" - - -class TestHGVSValidator(TestCase): - """ - Tests the function :func:`validate_hgvs_string` to see if it is able - to validate strings which do not comply with the HGVS standard for - coding, non-coding and nucleotide variants and multi-variants. 
- """ - - def test_validation_error_not_str_or_bytes(self): - with self.assertRaises(ValidationError): - validate_hgvs_string([]) - - def test_does_not_pass_enrich_wt_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("_wt") - - def test_does_not_pass_enrich_sy_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("_sy") - - def test_passes_multi(self): - validate_hgvs_string("p.[Lys4Gly;Lys5Phe]", column="p") - validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="nt") - validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="splice") - - def test_error_invalid_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("c.ad", column="nt") - - def test_error_invalid_nt_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="nt") - - with self.assertRaises(ValidationError): - validate_hgvs_string("c.1A>G", column="nt", splice_present=True) - - def test_error_invalid_splice_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="splice") - - def test_error_invalid_pro_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="p") - - def test_converts_bytes_to_string_before_validation(self): - validate_hgvs_string(b"c.427A>G", column="splice") - - def test_return_none_for_null(self): - for c in constants.null_values_list: - self.assertIsNone(validate_hgvs_string(c, column="nt")) - - -class TestVariantJsonValidator(TestCase): - """ - Tests the validator :func:`validate_variant_json` to check if the correct - errors are thrown if an incorrectly formatted `dictionary` is set - as a the `data` `JSONField` attribute of a :class:`..models.Variant` - instance. 
- """ - - def test_validation_error_missing_score_data_key(self): - data = {constants.variant_count_data: {}} - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_missing_count_data_key(self): - data = {constants.variant_score_data: {}} - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_contains_unexpected_keys(self): - data = { - "extra": {}, - constants.variant_score_data: {}, - constants.variant_count_data: {}, - } - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_values_not_dict(self): - data = {constants.variant_score_data: {}, constants.variant_count_data: {}} - for key in data.keys(): - data[key] = [] - with self.assertRaises(ValidationError): - validate_variant_json(data) - data[key] = {} - - -class TestMaveDataset(TestCase): - """ - Tests the validator :func:`validate_variant_rows` to check if the correct - errors are thrown when invalid rows are encountered in a - scores/counts/meta data input file. Checks for: - - Invalid HGVS string in a row - - Row HGVS is defined in more than one row - - Row values are not int/float for a count/score file - - Tests also check to see if the correct header and hgvs data information - is parsed and returned. 
- """ - - SCORE_COL = constants.required_score_column - HGVS_NT_COL = constants.hgvs_nt_column - HGVS_SPLICE_COL = constants.hgvs_splice_column - HGVS_PRO_COL = constants.hgvs_pro_column - - @staticmethod - def mock_return_value(data, index=None): - df = pd.read_csv(StringIO(data), sep=",", na_values=["None", None]) - if index: - df.index = pd.Index(df[index]) - return df - - def test_invalid_row_hgvs_is_not_a_string(self): - data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_hgvs_columns(self): - data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs()) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_replaces_null_with_none_in_secondary_hgvs_column(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{},{}\n{},{},1.0 ".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.HGVS_PRO_COL]), [None] - ) - - def test_replaces_null_with_none_in_numeric_columns(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.SCORE_COL]), [None] - ) - - def test_invalid_null_values_in_header(self): - for 
value in constants.null_values_list: - with self.subTest(msg=f"'{value}'"): - data = "{},{},{}\n{},1.0,1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, value, generate_hgvs() - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_no_additional_columns_outside_hgvs_ones(self): - data = "{},{},{}\n{},{},{}".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_scores_missing_scores_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, "scores_rna", generate_hgvs(prefix="g"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_either_required_hgvs_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_SPLICE_COL, self.SCORE_COL, generate_hgvs(prefix="c"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_empty_no_variants_parsed(self): - data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_empty) - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_non_numeric_values_in_score_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - "I am not a number", - ) - - with 
self.assertRaises(ValueError): - MaveDataset.for_scores(StringIO(data)) - - def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): - hgvs = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_data_method_converts_null_values_to_None(self): - hgvs = generate_hgvs() - for value in constants.null_values_list: - with self.subTest(msg=value): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, value - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - - df = dataset.data(serializable=True) - self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) - self.assertIsNone(df[self.SCORE_COL].values[0]) - - def test_sorts_header(self): - hgvs_nt = generate_hgvs(prefix="g") - hgvs_pro = generate_hgvs(prefix="p") - hgvs_splice = generate_hgvs(prefix="c") - data = "{},{},{},{},{}\n{},{},{},{},{}".format( - self.HGVS_PRO_COL, - self.HGVS_NT_COL, - "colA", - self.SCORE_COL, - self.HGVS_SPLICE_COL, - hgvs_pro, - hgvs_nt, - "hello", - 1.0, - hgvs_splice, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - dataset.columns, - [ - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - "colA", - ], - ) - - def test_does_not_allow_wt_and_sy(self): - wt = "_wt" - sy = "_sy" - 
data = "{},{},{},{}\n{},{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - wt, - wt, - sy, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 3) - print(dataset.errors) - - def test_parses_numeric_column_values_into_float(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - value = dataset.data()[self.SCORE_COL].values[0] - self.assertIsInstance(value, float) - - def test_does_not_split_double_quoted_variants(self): - hgvs = "c.[123A>G;124A>G]" - data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) - - # def test_invalid_non_double_quoted_multi_variant_row(self): - # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) - # data = "{},{}\n'{}',1.0".format( - # constants.hgvs_nt_column, required_score_column, hgvs - # ) - # with self.assertRaises(ValidationError): - # _ = validate_variant_rows(BytesIO(data.encode())) - - def test_primary_column_is_pro_when_nt_is_not_defined(self): - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0".format(self.HGVS_PRO_COL, self.SCORE_COL, hgvs_pro) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_PRO_COL) - - def test_primary_column_is_nt_by_default(self): - hgvs_nt = generate_hgvs(prefix="c") - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, hgvs_pro - ) - - dataset = 
MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_NT_COL) - - def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = ( - "{},{},{}\n" - "{},{},1.0\n" - "{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - v, - generate_hgvs(prefix="p"), - ) - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_missing_value_in_pro_column_when_pro_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_df_indexed_by_primary_column(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - assert_index_equal(dataset.data().index, dataset.index) - - def test_invalid_duplicates_in_index(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{},{}\n{},{},1.0\n{},{},2.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - hgvs, - generate_hgvs(prefix="p"), - hgvs, - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def 
test_invalid_hgvs_in_column(self): - tests = [ - (self.HGVS_PRO_COL, generate_hgvs(prefix="c")), - (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")), - (self.HGVS_NT_COL, generate_hgvs(prefix="p")), - ] - for (column, variant) in tests: - with self.subTest(msg=f"{column}: {variant}"): - if column == self.HGVS_SPLICE_COL: - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - column, - self.SCORE_COL, - generate_hgvs(prefix="g"), - variant, - ) - else: - data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): - data = "{},{}\n{},1.0\n{},2.0".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_nt_not_genomic_when_splice_present(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_defined_when_nt_is_not(self): - data = "{},{},{}\n,{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_not_defined_when_nt_is_genomic(self): - data = "{},{}\n{},1.0".format( - self.HGVS_NT_COL, 
self.SCORE_COL, generate_hgvs(prefix="g") - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 0) - - def test_invalid_close_to_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15) - - def test_defines_same_variants(self): - tests = [ - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - True, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.2A>G,0.0".format(self.HGVS_NT_COL), - False, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - True, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - False, - ), - # Check returns None if either dataset invalid - ( - "wrong_columns,{}\nc.1A>G,0.0".format(self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - None, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "wrong_column,count\nc.1A>G,0.0".format(), - None, - ), - ] - - for (scores, 
counts, expected) in tests: - with self.subTest(msg=(scores, counts, expected)): - scores_dataset = MaveDataset.for_scores(StringIO(scores)) - scores_dataset.validate() - - counts_dataset = MaveDataset.for_counts(StringIO(counts)) - counts_dataset.validate() - - self.assertEqual(scores_dataset.match_other(counts_dataset), expected) - - def test_to_dict(self): - hgvs_1 = generate_hgvs(prefix="c") - hgvs_2 = generate_hgvs(prefix="c") - data = "{},{},{},{}\n{},,,\n{},,,1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - hgvs_1, - hgvs_2, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertDictEqual( - dataset.to_dict(), - { - hgvs_1: { - self.HGVS_NT_COL: hgvs_1, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: None, - }, - hgvs_2: { - self.HGVS_NT_COL: hgvs_2, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: 1.0, - }, - }, - ) - - def test_valid_targetseq_validation_fails(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertTrue(dataset.is_valid) - - def test_invalid_targetseq_validation_fails(self): - data = "{},{},{}\nc.1A>G,p.Val1Phe,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("p.Val1Phe", dataset.errors[0]) - - def test_invalid_target_sequence_not_a_multiple_of_3(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATCG") - - 
self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("multiple of 3", dataset.errors[0]) - - @unittest.expectedFailure - def test_invalid_relaxed_ordering_check_fails(self): - self.fail("Test is pending") diff --git a/tests/validation/__init__.py b/tests/validation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py new file mode 100644 index 0000000..7d0117a --- /dev/null +++ b/tests/validation/dataframe.py @@ -0,0 +1,431 @@ +from unittest import TestCase +import numpy as np +import pandas as pd +from io import StringIO + +from mavecore.validation.exceptions import ValidationError + +from mavecore.validation.constants.general import ( + hgvs_nt_column, + hgvs_pro_column, + hgvs_splice_column, + required_score_column +) + +from mavecore.validation.dataframe import ( + validate_no_null_columns_or_rows, + validate_column_names, + validate_values_by_column, + validate_score, + validate_dataframes_define_same_variants, + validate_index_column, + validate_hgvs_nt_and_hgvs_pro_represent_same_change, +) +from mavecore.validation.constants.general import null_values_list +# let pandas handle the types of null values to allow + + +class TestValidateNoNullColumnsOrRows(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + required_score_column: 1.0, + } + ) + + def test_valid(self): + validate_no_null_columns_or_rows(self.dataframe) + + def test_null_row(self): + self.dataframe.loc[1] = [np.nan, np.nan, np.nan, np.nan] + with self.assertRaises(AssertionError): + validate_no_null_columns_or_rows(self.dataframe) + + def test_null_column(self): + self.dataframe[required_score_column][0] = np.nan + with self.assertRaises(AssertionError): + validate_no_null_columns_or_rows(self.dataframe) + + +class 
TestValidateColumnNames(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + required_score_column: [1.000], + } + ) + + def test_valid_scores_column_names(self): + validate_column_names(self.dataframe) + + def test_valid_counts_column_names(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + self.dataframe["count"] = [5] + validate_column_names(self.dataframe, scores=False) + + def test_valid_just_hgvs_nt_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) + validate_column_names(self.dataframe) + + def test_valid_just_hgvs_pro_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + validate_column_names(self.dataframe) + + def test_primary_column_is_pro_when_nt_is_not_defined(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column, required_score_column], axis=1) + self.dataframe.insert(0, required_score_column, [1.000], True) + self.dataframe = validate_column_names(self.dataframe) + self.assertTrue(self.dataframe.columns[0] == hgvs_pro_column) + + def test_missing_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + + def test_hgvs_in_wrong_location(self): + self.dataframe = self.dataframe[[hgvs_nt_column, required_score_column, hgvs_pro_column, hgvs_splice_column]] + validate_column_names(self.dataframe) # validation fixes problem, should pass + + def test_no_additional_columns_beyond_hgvs_scores_df(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + + def 
test_no_additional_columns_beyond_hgvs_counts_df(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + + def test_hgvs_columns_must_be_lowercase(self): + self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}, inplace=True) + with self.assertRaises(ValueError): + validate_column_names(self.dataframe) + + def test_duplicate_column_names(self): + self.dataframe.rename(columns={hgvs_pro_column: hgvs_nt_column}, inplace=True) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + + def test_null_column_name(self): + null_values = [None, np.nan, "", 1, " "] + for value in null_values: + self.dataframe.rename(columns={hgvs_splice_column: value}, inplace=True) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + + def test_no_score_column_with_scores_df(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + self.dataframe["count"] = [1] + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + + def test_no_additional_column_with_counts_df(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + + def test_invalid_missing_either_required_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_nt_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + + def test_invalid_splice_column_defined_when_nt_column_is_not(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + + def test_sort_column_names(self): + self.dataframe = pd.DataFrame( + { + "other": 5, + 
required_score_column: [1.000], + hgvs_splice_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_nt_column: ["g.1A>G"], + } + ) + dataset = validate_column_names(self.dataframe) + self.assertTrue(dataset.columns[0] == hgvs_nt_column) + self.assertTrue(dataset.columns[1] == hgvs_pro_column) + self.assertTrue(dataset.columns[2] == hgvs_splice_column) + self.assertTrue(dataset.columns[3] == required_score_column) + + +class TestValidateValuesByColumn(TestCase): + def setUp(self): + self.target_seq = "ATGACA" + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_pro_column: ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="], + hgvs_splice_column: ["c.4A>G", "c.5C>G", "c.6A>G"], + required_score_column: [1.000, 0.5, 1.5], + } + ) + + def test_valid(self): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_non_numeric_values_in_score_column(self): + self.dataframe.loc[0, [required_score_column]] = "not a float" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_row_hgvs_is_not_a_string(self): + self.dataframe.loc[0, [hgvs_nt_column]] = 1.0 + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_empty_no_variants_parsed(self): + self.dataframe = self.dataframe.drop(axis='rows', index=[0, 1, 2]) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_hgvs_nt_in_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "p.Thr1Ala" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_hgvs_pro_in_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + 
self.dataframe.loc[0, [hgvs_pro_column]] = "c.1A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_hgvs_splice_in_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column], axis=1) + self.dataframe.loc[0, [hgvs_splice_column]] = "g.1A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_variants_do_not_represent_same_change(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "c.3A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_does_not_allow_wt(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "_wt" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_does_not_allow_sy(self): + self.dataframe.loc[0, [hgvs_pro_column]] = "_sy" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_parses_numeric_column_values_into_float(self): + self.dataframe.loc[0, [required_score_column]] = "1.1" + self.assertTrue(type(self.dataframe[required_score_column][0]) == str) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.assertFalse(type(self.dataframe[required_score_column][0]) == float) + self.dataframe.loc[0, [required_score_column]] = 1 + self.assertTrue(type(self.dataframe[required_score_column][0]) == int) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.assertTrue(type(self.dataframe[required_score_column][0]) == float) + + # TODO: validate hgvs string should check this + def test_does_not_split_double_quoted_variants(self): + '''hgvs = "c.[123A>G;124A>G]" + data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = 
MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) + + # def test_invalid_non_double_quoted_multi_variant_row(self): + # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) + # data = "{},{}\n'{}',1.0".format( + # constants.hgvs_nt_column, required_score_column, hgvs + # ) + # with self.assertRaises(ValidationError): + # _ = validate_variant_rows(BytesIO(data.encode()))''' + + def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_nt_not_genomic_when_splice_present(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" + self.dataframe.loc[1, [hgvs_nt_column]] = "c.5C>G" + self.dataframe.loc[2, [hgvs_nt_column]] = "c.6A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_noncoding_hgvs_nt_should_not_have_hgvs_pro_columns(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "n.4A>G" + self.dataframe.loc[1, [hgvs_nt_column]] = "n.5C>G" + self.dataframe.loc[2, [hgvs_nt_column]] = "n.6A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.dataframe.loc[0, [hgvs_pro_column]] = None + self.dataframe.loc[1, [hgvs_pro_column]] = None + self.dataframe.loc[2, [hgvs_pro_column]] = None + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_coding_hgvs_nt_may_have_hgvs_pro_columns(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" + self.dataframe.loc[1, [hgvs_nt_column]] = "c.5C>G" 
+ self.dataframe.loc[2, [hgvs_nt_column]] = "c.6A>G" + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.dataframe = self.dataframe.drop([hgvs_pro_column], axis=1) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_splice_not_defined_when_nt_is_genomic(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_zero_is_not_parsed_as_none(self): + self.dataframe.loc[0, [required_score_column]] = 0.0 + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + hgvs = "c.4A>G" + data = "{},{}\n{},0.0".format(hgvs_nt_column, required_score_column, hgvs) + df = pd.read_csv(StringIO(data), sep=",") + validate_values_by_column(df, target_seq=self.target_seq) + self.assertEqual(df[required_score_column].values[0], 0) + + def test_invalid_close_to_zero_is_not_parsed_as_none(self): + self.dataframe.loc[0, [required_score_column]] = 5.6e-15 + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + hgvs = "c.4A>G" + data = "{},{}\n{},5.6e-15".format(hgvs_nt_column, required_score_column, hgvs) + df = pd.read_csv(StringIO(data), sep=",") + validate_values_by_column(df, target_seq=self.target_seq) + self.assertEqual(df[required_score_column].values[0], 5.6e-15) + + def test_mismatched_variants_and_column_names(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="], + hgvs_pro_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_splice_column: ["c.4A>G", "c.5C>G", "c.6A>G"], + required_score_column: [1.000, 0.5, 1.5], + } + ) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + +class TestValidateIndexColumn(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G", "c.2C>G", "c.3A>G"], + 
hgvs_pro_column: ["p.Thr1Ala", "p.Thr1Arg", "p.="], + required_score_column: [1.0, 0.5, 1.5], + } + ) + + def test_valid(self): + validate_index_column(self.dataframe["hgvs_nt"], "nt") + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + validate_index_column(self.dataframe["hgvs_pro"], "pro") + + def test_invalid_same_hgvs_nt_defined_in_two_rows(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "c.2C>G" + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_nt"], "nt") + + def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro_when_pro_is_primary_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + self.dataframe.loc[0, [hgvs_pro_column]] = "p.Thr1Arg" + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_pro"], "pro") + + def test_error_missing_value_in_nt_column_when_nt_is_primary(self): + self.dataframe.loc[0, [hgvs_nt_column]] = np.nan + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_nt"], "nt") + + def test_error_missing_value_in_pro_column_when_pro_is_primary(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + self.dataframe.loc[0, [hgvs_pro_column]] = np.nan + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_pro"], "pro") + + +class TestValidateScore(TestCase): + def test_valid_score(self): + validate_score(1.1) + + def test_invalid_score(self): + with self.assertRaises(ValidationError): + validate_score("a") + + +class TestHgvsColumnsDefineSameVariants(TestCase): + def setUp(self): + self.target_seq = "ATGACA" + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_pro_column: ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="], + required_score_column: [1.000, 0.5, 1.5], + } + ) + + def test_valid(self): + for i in range(3): + validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=self.target_seq, + 
nt=self.dataframe[hgvs_nt_column][i], + pro=self.dataframe[hgvs_pro_column][i], + row=i) + + def test_invalid_nt_and_pro_do_not_represent_same_change(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "g.2C>G" + with self.assertRaises(ValidationError): + for i in range(3): + validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=self.target_seq, + nt=self.dataframe[hgvs_nt_column][i], + pro=self.dataframe[hgvs_pro_column][i], + row=i) + + +class TestDataframesDefineSameVariants(TestCase): + def setUp(self): + self.scores = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + } + ) + self.counts = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + } + ) + + def test_valid(self): + validate_dataframes_define_same_variants(self.scores, self.counts) + + def test_counts_defines_different_nt_variants(self): + self.counts[hgvs_nt_column][0] = "c.2A>G" + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(self.scores, self.counts) + + def test_counts_defines_different_splice_variants(self): + self.counts[hgvs_splice_column][0] = "c.2A>G" + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(self.scores, self.counts) + + def test_counts_defines_different_pro_variants(self): + self.counts[hgvs_pro_column][0] = "p.Leu75Glu" + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(self.scores, self.counts) diff --git a/tests/validation/dataset.py b/tests/validation/dataset.py new file mode 100644 index 0000000..ccea630 --- /dev/null +++ b/tests/validation/dataset.py @@ -0,0 +1,79 @@ +from unittest import TestCase +from mavecore.validation.dataset import validate_experiment, validate_scoreset + + +class TestValidateExperiment(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = 
{"identifier": "29785012"} + self.experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + } + + def test_valid_all_fields(self): + validate_experiment(self.experiment) + '''try: + print(type(json.loads(Experiment.parse_obj(self.experiment).json()))) + #print(a.json()) + + #b = dict() + #print(b.json()) + except ValueError as e: + print(e)''' + + def test_valid_exclude_optional(self): + self.experiment.pop("extraMetadata") + self.experiment.pop("keywords") + self.experiment.pop("doiIdentifiers") + self.experiment.pop("pubmedIdentifiers") + validate_experiment(self.experiment) + + +class TestValidateScoreSet(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "DNA", "sequence": "ATC"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} + target = {"name": "name", + "category": "Protein coding", + "externalIdentifiers": [external_identifier], + "referenceMaps": [reference_map], + "wtSequence": sequence} + self.scoreset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "dataUsagePolicy": "policy", + "licenceId": 0, + "keywords": ["string"], + "experimentUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "supersededScoresetUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "metaAnalysisSourceScoresetUrns": ["tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "targetGene": target, + } + + def test_valid_all_fields(self): + 
validate_scoreset(self.scoreset) + + def test_valid_exclude_optional(self): + self.scoreset.pop("extraMetadata") + self.scoreset.pop("keywords") + self.scoreset.pop("doiIdentifiers") + self.scoreset.pop("pubmedIdentifiers") + self.scoreset.pop("supersededScoresetUrn") + self.scoreset.pop("metaAnalysisSourceScoresetUrns") + validate_scoreset(self.scoreset) diff --git a/tests/test_validators/test_metadata_validators.py b/tests/validation/identifier.py similarity index 84% rename from tests/test_validators/test_metadata_validators.py rename to tests/validation/identifier.py index ef32868..ff00510 100644 --- a/tests/test_validators/test_metadata_validators.py +++ b/tests/validation/identifier.py @@ -1,22 +1,21 @@ from unittest import TestCase -from mavecore.validators.metadata_validators import ( +from mavecore.validation.identifier import ( validate_doi_identifier, validate_doi_list, - validate_keyword, - validate_keyword_list, validate_pubmed_identifier, validate_pubmed_list, validate_sra_identifier, validate_sra_list, validate_uniprot_identifier, + validate_uniprot_list, validate_refseq_identifier, validate_refseq_list, validate_genome_identifier, validate_ensembl_identifier, validate_ensembl_list, ) -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError class TestDOIValidators(TestCase): @@ -131,24 +130,6 @@ def test_ve_invalid_uniprot_id(self): with self.assertRaises(ValidationError): validate_uniprot_identifier("P123") - def test_ve_invalid_uniprot_list(self): - with self.assertRaises(ValidationError): - validate_keyword_list(["protein", 555]) - def test_passes_valid_uniprot_id(self): validate_uniprot_identifier("P01133") - -class TestKeywordValidators(TestCase): - """ - Tests that each validator throws the appropriate :class:`ValidationError` - when passed invalid input. 
- """ - - def test_ve_invalid_keyword(self): - with self.assertRaises(ValidationError): - validate_keyword(555) - - def test_ve_invalid_keyword_in_list(self): - with self.assertRaises(ValidationError): - validate_keyword_list(["protein", 555]) diff --git a/tests/validation/keywords.py b/tests/validation/keywords.py new file mode 100644 index 0000000..bdd7cad --- /dev/null +++ b/tests/validation/keywords.py @@ -0,0 +1,19 @@ +from unittest import TestCase + +from mavecore.validation.keywords import * +from mavecore.validation.exceptions import ValidationError + + +class TestKeywordValidators(TestCase): + """ + Tests that each validator throws the appropriate :class:`ValidationError` + when passed invalid input. + """ + + def test_ve_invalid_keyword(self): + with self.assertRaises(ValidationError): + validate_keyword(555) + + def test_ve_invalid_keyword_in_list(self): + with self.assertRaises(ValidationError): + validate_keywords(["protein", 555]) diff --git a/tests/validation/target.py b/tests/validation/target.py new file mode 100644 index 0000000..dc1500a --- /dev/null +++ b/tests/validation/target.py @@ -0,0 +1,55 @@ +from unittest import TestCase + +from mavecore.validation.target import * +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants.target import valid_categories, valid_sequence_types + + +class TestValidateTargetCategory(TestCase): + def test_valid(self): + for category in valid_categories: + validate_target_category(category) + + def test_invalid_category(self): + with self.assertRaises(ValidationError): + validate_target_category("Protein") + + def test_invalid_case(self): + with self.assertRaises(ValidationError): + validate_target_category("protein coding") + + +class TestValidateSequenceCategory(TestCase): + def test_valid(self): + for sequence_type in valid_sequence_types: + validate_sequence_category(sequence_type) + + def test_invalid_category(self): + with self.assertRaises(ValidationError): + 
validate_sequence_category("RNA") + + def test_invalid_case(self): + with self.assertRaises(ValidationError): + validate_sequence_category("dna") + + +class TestValidateTargetSequence(TestCase): + def setUp(self): + self.target_seq = "ATGACCAAACAT" + + def test_valid(self): + validate_target_sequence(self.target_seq) + + def test_invalid_characters(self): + self.target_seq = "AUGACCAAACAU" + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq) + + def test_invalid_case(self): + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq.lower()) + + def test_invalid_length(self): + self.target_seq = self.target_seq + "A" + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq) diff --git a/tests/validation/urn.py b/tests/validation/urn.py new file mode 100644 index 0000000..6f86eb1 --- /dev/null +++ b/tests/validation/urn.py @@ -0,0 +1,79 @@ +from unittest import TestCase + +from mavecore.validation.urn import * + + +class TestValidateUrn(TestCase): + def test_valid_mavedb_urn(self): + validate_mavedb_urn("urn:mavedb:00000002-a-1") + + def test_invalid_mavedb_urn(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn("urn:mavedb:00000002-a-1-z") + + def test_valid_mavedb_urn_experimentset(self): + validate_mavedb_urn_experimentset("urn:mavedb:00000001") + + def test_invalid_mavedb_urn_experimentset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experimentset("") + + def test_valid_mavedb_urn_experiment(self): + validate_mavedb_urn_experiment("urn:mavedb:00000001-a") + + def test_invalid_mavedb_urn_experiment(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experiment("") + + def test_valid_mavedb_urn_scoreset(self): + validate_mavedb_urn_scoreset("urn:mavedb:00000001-a-1") + + def test_invalid_mavedb_urn_scoreset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_scoreset("") + + def 
test_valid_mavedb_urn_variant(self): + # TODO find a valid variant urn + pass + #validate_mavedb_urn_variant("") + + def test_invalid_mavedb_urn_variant(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_variant("urn:mavedb:00000002-a-1") # this is a scoreset urn + + +class TestValidateTmpUrn(TestCase): + def test_valid_tmp_mavedb_urn(self): + validate_mavedb_urn("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") + + def test_invalid_tmp_mavedb_urn(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn("urn:mavedb:00000002-a-1-z") + + def test_valid_tmp_mavedb_urn_experimentset(self): + validate_mavedb_urn_experimentset("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") + + def test_invalid_tmp_mavedb_urn_experimentset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experimentset("") + + def test_valid_tmp_mavedb_urn_experiment(self): + validate_mavedb_urn_experiment("urn:mavedb:00000001-a") + + def test_invalid_tmp_mavedb_urn_experiment(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experiment("") + + def test_valid_tmp_mavedb_urn_scoreset(self): + validate_mavedb_urn_scoreset("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") + + def test_invalid_tmp_mavedb_urn_scoreset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_scoreset("") + + def test_valid_tmp_mavedb_urn_variant(self): + validate_mavedb_urn_variant("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") + + def test_invalid_tmp_mavedb_urn_variant(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_variant("urn:mavedb:00000002-a-1") # this is a scoreset urn diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py new file mode 100644 index 0000000..c5e0618 --- /dev/null +++ b/tests/validation/utilities.py @@ -0,0 +1,93 @@ +from unittest import TestCase + +from mavecore.validation.constants.general import null_values_list +from mavecore.validation.variant import validate_hgvs_string 
#validate_pro_variant, validate_nt_variant + +from mavecore.validation.utilities import ( + is_null, + generate_hgvs, + construct_hgvs_pro, + convert_hgvs_nt_to_hgvs_pro, + _is_wild_type, + _is_deletion, + _is_substitution_one_base, + _is_substitution_two_bases_nonadjacent +) + + +class TestIsNull(TestCase): + def test_valid_null_values(self): + for value in null_values_list: + self.assertTrue(is_null(value)) + + def test_invalid_null_values(self): + self.assertFalse(is_null(1)) + self.assertFalse(is_null("1")) + + +class TestGenerateHgvsPro(TestCase): + def test_pro(self): + pro = generate_hgvs("p") + validate_hgvs_string(pro) + + def test_nt(self): + nt = generate_hgvs() + validate_hgvs_string(nt) + + +class TestConstructHgvsPro(TestCase): + def test_valid_arguments(self): + construct_hgvs_pro(wt="Ala", mutant="Gly", position=3) + + def test_invalid_wt_aa(self): + with self.assertRaises(ValueError): + construct_hgvs_pro(wt="Alr", mutant="Gly", position=3) + + def test_invalid_mut_aa(self): + with self.assertRaises(ValueError): + construct_hgvs_pro(wt="Ala", mutant="Gla", position=3) + + def test_invalid_position(self): + # TODO what are the invalid positions we should consider? 
+ self.assertFalse(False) + + +class TestConvertHgvsNtToHgvsPro(TestCase): + def setUp(self): + self.target_seq = "ATGACA" + self.hgvs_nt_values = ["g.4A>G", "g.5C>G", "g.6A>G"] + self.hgvs_pro_values = ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="] + + def test_wt_hgvs_nt(self): + #convert_hgvs_nt_to_hgvs_pro(hgvs_nt="g.4A>G", ) + pass + + def test_wt_hgvs_pro(self): + pass + + def test_deletion_hgvs_nt(self): + pass + + def test_one_base_change_codon_variant(self): + pass + + def test_two_base_change_codon_variant(self): + pass + + def test_three_base_change_codon_variant(self): + pass + + +class TestVariantTypeHelperFunctions(TestCase): + + def test_test_is_wild_type(self): + pass + + def test_is_deletion(self): + pass + + def test_test_is_substitution_one_base(self): + pass + + def test_test_is_substitution_two_bases_nonadjacent(self): + pass diff --git a/tests/validation/variant.py b/tests/validation/variant.py new file mode 100644 index 0000000..1b258d8 --- /dev/null +++ b/tests/validation/variant.py @@ -0,0 +1,100 @@ +from unittest import TestCase + +from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants.general import null_values_list + + +class TestValidateHgvsString(TestCase): + def test_passes_on_null(self): + for v in null_values_list: + validate_hgvs_string(v) + + def test_error_not_str(self): + with self.assertRaises(ValidationError): + validate_hgvs_string(1.0) + + def test_error_unknown_column(self): + with self.assertRaises(ValueError): + validate_hgvs_string("c.1A>G", column="random") + + def test_error_does_not_match_splice(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("g.G4L", column="splice") + + def test_error_nt_is_not_g_when_splice_present(self): + validate_hgvs_string("c.1A>G", column="nt", splice_present=False) + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="nt", splice_present=True) + 
+ def test_error_does_not_match_nt(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("p.G4L", column="nt") + + def test_error_does_not_match_pro(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="p") + + def test_raises_on_enrich_special_types(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_wt") + with self.assertRaises(ValidationError): + validate_hgvs_string("_sy") + + def test_validates_valid_hgvs(self): + validate_hgvs_string("c.1A>G", column="nt", splice_present=False) + validate_hgvs_string("g.1A>G", column="nt", splice_present=True) + validate_hgvs_string("c.1A>G", column="splice") + validate_hgvs_string("p.(=)", column="p") + + +class TestHGVSValidator(TestCase): + """ + Tests the function :func:`validate_hgvs_string` to see if it is able + to validate strings which do not comply with the HGVS standard for + coding, non-coding and nucleotide variants and multi-variants. + """ + + def test_validation_error_not_str_or_bytes(self): + with self.assertRaises(ValidationError): + validate_hgvs_string([]) + + def test_does_not_pass_enrich_wt_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_wt") + + def test_does_not_pass_enrich_sy_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_sy") + + def test_passes_multi(self): + validate_hgvs_string("p.[Lys4Gly;Lys5Phe]", column="p") + validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="nt") + validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="splice") + + def test_error_invalid_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("c.ad", column="nt") + + def test_error_invalid_nt_prefix(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="nt") + + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="nt", splice_present=True) + + def test_error_invalid_splice_prefix(self): 
+ with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="splice") + + def test_error_invalid_pro_prefix(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="p") + + def test_converts_bytes_to_string_before_validation(self): + validate_hgvs_string(b"c.427A>G", column="splice") + + def test_return_none_for_null(self): + for c in null_values_list: + self.assertIsNone(validate_hgvs_string(c, column="nt")) +